Skip to content

Commit

Permalink
Merge pull request #15 from allenai/soldni/tests
Browse files Browse the repository at this point in the history
Added tests for local/remote bindings for deduper/mixer; tests in CI
  • Loading branch information
soldni authored Jul 16, 2023
2 parents 4228002 + 4d4fa29 commit 9a04d9d
Show file tree
Hide file tree
Showing 35 changed files with 567 additions and 383 deletions.
78 changes: 78 additions & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,84 @@ permissions:
contents: read

jobs:

tests:
runs-on: ubuntu-latest
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}
strategy:
fail-fast: false
matrix:
python: [3.8]
task:
- name: Check Python style
run: |
isort --check .
black --check .
- name: Check Rust style
run: |
rustfmt --edition 2021 src/*.rs --check
- name: Lint
run: |
flake8 .
- name: Type check
run: |
mypy .
- name: Test
run: |
pytest -v --color=yes tests/python/
steps:
- name: Checkout repository
uses: actions/checkout@v1

- name: Setup system libraries
run: |
sudo apt-get update
sudo apt-get install --yes --upgrade build-essential cmake protobuf-compiler libssl-dev glibc-source
- name: Install Rust
uses: actions-rs/toolchain@v1
with:
toolchain: stable
components: rustfmt

- name: Install Python
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python }}
architecture: "x64"
sccache: true

- name: Create a new Python environment & install maturin
run: |
python -m venv .venv
source .venv/bin/activate
pip install -U pip
pip install maturin
- name: Install dolma wheels
run: |
source .venv/bin/activate
maturin develop --extras=dev
- name: ${{ matrix.task.name }}
run: |
source .venv/bin/activate
${{ matrix.task.run }}
- name: Clean up
if: always()
run: |
source .venv/bin/activate
pip uninstall -y dolma
build-linux:
runs-on: ubuntu-latest
env:
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "dolma"
version = "0.6.3"
version = "0.6.4"
edition = "2021"
license = "Apache-2.0"

Expand Down
13 changes: 1 addition & 12 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,24 +28,13 @@ setup:
publish:
maturin publish

test: setup develop setup-test test-python test-rust

test-python:
pytest -vs tests/python

test-rust-clean:
rm -rf tests/work/*
aws s3 rm --recursive s3://ai2-llm/pretraining-data/tests/mixer/

test-rust-setup:
aws s3 cp tests/data/documents.json.gz s3://ai2-llm/pretraining-data/tests/mixer/inputs/v0/documents/head/0000.json.gz
aws s3 cp tests/data/pii-attributes.json.gz s3://ai2-llm/pretraining-data/tests/mixer/inputs/v0/attributes/pii/head/0000.json.gz
aws s3 cp tests/data/toxicity-attributes.json.gz s3://ai2-llm/pretraining-data/tests/mixer/inputs/v0/attributes/toxicity/head/0000.json.gz
aws s3 cp tests/data/sample-attributes.json.gz s3://ai2-llm/pretraining-data/tests/mixer/inputs/v0/attributes/sample/head/0000.json.gz
aws s3 cp tests/data/duplicate-paragraphs.json.gz s3://ai2-llm/pretraining-data/tests/mixer/inputs/v0/attributes/duplicate_paragraphs/head/0000.json.gz

test-rust: test-rust-clean test-rust-setup
cargo test -- --nocapture
rm -rf tests/work/*

develop:
maturin develop --extras=dev
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
*Data to feed OLMo's Appetite*


<img alt="DOLMa logo. It's a watercolor of grape leaves with the word DOLMa in the top left." src="https://github.com/allenai/dolma/blob/main/res/logo.png?raw=true" width="256"></img>
<img alt="DOLMa logo. It's a watercolor of grape leaves with the word DOLMa in the top left." src="https://github.com/allenai/dolma/blob/main/res/logo.png?raw=true" width="256">

Data and tools for generating and inspecting OLMo pre-training data.

Expand Down
32 changes: 16 additions & 16 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,32 +1,33 @@
[project]
name = "dolma"
version = "0.6.3"
version = "0.6.4"
description = "Data filters"
license = {text = "Apache-2.0"}
readme = "README.md"
requires-python = ">=3.8"
dependencies = [
"requests",
"tqdm",
"anyascii>=0.3.2",
"blingfire==0.1.8",
"boto3",
"cached-path==1.3.4",
"detect-secrets==1.4.0",
# "fasttext==0.9.2", # broken with new version of setuptools; using fasttext-wheel instead
"fasttext-wheel==0.9.2",
"fsspec",
"msgspec>=0.14.2",
"nltk==3.8.1",
"omegaconf>=2.3.0",
"presidio_analyzer==2.2.32",
"pycld2==0.41",
# "pycld3==0.22",
"fasttext>=0.9.2",
# "pycld3==0.22", # does not install correctly
"pyyaml",
"requests",
"rich",
"s3fs",
"smart-open",
"tokenizers>=0.13.3,<1.0.0",
"omegaconf>=2.3.0",
"anyascii>=0.3.2",
"tqdm",
"uniseg",
"pyyaml",
"blingfire==0.1.8",
"detect-secrets==1.4.0",
"rich>=10.12.0",
"smart-open>=6.3.0",
"nltk==3.8.1",
"fsspec>=2021.10.0",
"s3fs>=2021.10.0",
]
classifiers = [
"Development Status :: 3 - Alpha",
Expand Down Expand Up @@ -103,7 +104,6 @@ dev = [
"ipdb>=0.13.0",
"flake8-pyi>=22.8.1",
"Flake8-pyproject>=1.1.0",
"awscli>=1.16.0",
]
[build-system]
requires = [
Expand Down
13 changes: 10 additions & 3 deletions python/dolma/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,19 @@
# must import taggers to register them
# we import the rust extension here and wrap it in a python module
from . import dolma as _dolma # type: ignore # noqa: E402
from . import taggers # noqa: E402
from .core.errors import DolmaRustPipelineError # noqa: E402
from .taggers import * # noqa: E402


def deduper(config: dict):
return _dolma.deduper_entrypoint(json.dumps(config))
try:
_dolma.deduper_entrypoint(json.dumps(config))
except RuntimeError as e:
raise DolmaRustPipelineError(f"Error running deduper: {e}") from e


def mixer(config: dict):
return _dolma.mixer_entrypoint(json.dumps(config))
try:
_dolma.mixer_entrypoint(json.dumps(config))
except RuntimeError as e:
raise DolmaRustPipelineError(f"Error running mixer: {e}") from e
12 changes: 8 additions & 4 deletions python/dolma/core/errors.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
class DolmaFilterError(Exception):
class DolmaError(Exception):
"""Base class for all errors"""


class DolmaFatalError(DolmaFilterError):
class DolmaFatalError(DolmaError):
"""Fatal error. Abort the entire process"""


class DolmaShardError(DolmaFilterError):
class DolmaShardError(DolmaError):
"""Fail the shard and continue"""


class DolmaRetryableFailure(DolmaFilterError):
class DolmaRetryableFailure(DolmaError):
"""Retry if a shard throws this error"""


class DolmaRustPipelineError(DolmaError):
"""Error raised by the rust pipeline"""
4 changes: 2 additions & 2 deletions python/dolma/core/parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import smart_open
import tqdm

from .errors import DolmaFilterError, DolmaRetryableFailure
from .errors import DolmaError, DolmaRetryableFailure
from .paths import add_suffix, glob_path, make_relative, mkdir_p, sub_prefix

METADATA_SUFFIX = ".done.txt"
Expand Down Expand Up @@ -162,7 +162,7 @@ def _process_single_and_save_status(
except DolmaRetryableFailure as e:
retries_on_error -= 1
if retries_on_error == 0:
raise DolmaFilterError from e
raise DolmaError from e

with smart_open.open(metadata_path, "wt") as f:
f.write(datetime.now().isoformat())
Expand Down
3 changes: 3 additions & 0 deletions python/dolma/core/vizualizer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# flake8: noqa
# type: ignore

import argparse
import json
import os
Expand Down
12 changes: 11 additions & 1 deletion python/dolma/taggers/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@
"""
from typing import Iterable, List, Tuple

import cld3
try:
import cld3

CLD3_AVAILABLE = True
except ImportError:
CLD3_AVAILABLE = False

import pycld2 as cld2
import regex
from anyascii import anyascii
Expand All @@ -21,6 +27,10 @@

@TaggerRegistry.add("cld3_en_doc_v2")
class Cld3LanguageTagger(BaseTagger):
def __init__(self) -> None:
if not CLD3_AVAILABLE:
raise ImportError(f"cld3 is not install, cannot instantiate {self.__class__.__name__}")

def _predict_text(self, text: str) -> Tuple[str, float]:
pred = cld3.get_language(text) # pyright: ignore
score = pred.probability if pred.language == "en" else 0.0
Expand Down
2 changes: 1 addition & 1 deletion python/dolma/taggers/length.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def predict(self, doc: Document) -> DocResult:
@TaggerRegistry.add("olmo_pretokenizer_v1")
class OlmoPreTokenizerV1(BaseTagger):
def __init__(self) -> None:
self.pre_tokenizer = pre_tokenizers.Sequence( # type: ignore
self.pre_tokenizer = pre_tokenizers.Sequence(
[
# Split on all punctuation.
pre_tokenizers.Split(
Expand Down
Loading

0 comments on commit 9a04d9d

Please sign in to comment.