Skip to content

Commit

Permalink
Merge branch 'main' into reddit
Browse files Browse the repository at this point in the history
  • Loading branch information
soldni authored Nov 27, 2023
2 parents 7cf0650 + 38fa168 commit c5af4a7
Show file tree
Hide file tree
Showing 38 changed files with 3,455 additions and 363 deletions.
103 changes: 66 additions & 37 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ permissions:
env:
DOLMA_TESTS_SKIP_AWS: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'true' || 'false' }}
DOLMA_TEST_S3_PREFIX: s3://dolma-tests
RUST_CHANNEL: stable


jobs:
Expand All @@ -38,17 +39,69 @@ jobs:
echo "PR base repo: ${{ github.event.pull_request.base.repo.full_name }}/tree/${{ github.event.pull_request.base.ref }}"
echo "PR head repo: ${{ github.event.pull_request.head.repo.full_name }}/tree/${{ github.event.pull_request.head.ref }}"
prepare-venv:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v3

- name: Cache Virtual Env
uses: actions/cache@v3
# name for referring later
id: cache-venv
with:
# what we cache: the virtualenv
path: ./.venv/
# The cache key depends on pyproject.toml and Cargo.toml
key: ${{ runner.os }}-venv-${{ hashFiles('**/pyproject.toml', '**/Cargo.toml, **/Cargo.lock') }}--${{ hashFiles('python/**', 'src/**') }}

- name: Setup system libraries
if: steps.cache-venv.outputs.cache-hit != 'true'
run: |
sudo apt-get update
sudo apt-get install --yes --upgrade build-essential cmake protobuf-compiler libssl-dev glibc-source
- name: Install Rust toolchain
if: steps.cache-venv.outputs.cache-hit != 'true'
run: |
rustup update ${{ env.RUST_CHANNEL }}
rustup component add --toolchain ${{ env.RUST_CHANNEL }} rustfmt rust-src
rustup default ${{ env.RUST_CHANNEL }}
- name: Install Python
if: steps.cache-venv.outputs.cache-hit != 'true'
uses: actions/setup-python@v4
with:
python-version: '3.8'
architecture: "x64"
cache: 'pip'

- name: Create a new Python environment & install maturin
if: steps.cache-venv.outputs.cache-hit != 'true'
run: |
python -m venv .venv
source .venv/bin/activate
pip install -U pip
pip install maturin
- name: Install dolma wheels
if: steps.cache-venv.outputs.cache-hit != 'true'
run: |
source .venv/bin/activate
maturin build --release -i $(which python) --out dist
wheel_path=$(ls dist/*.whl)
pip install "${wheel_path}[all]"
tests:
runs-on: ubuntu-latest
needs: prepare-venv
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}
strategy:
fail-fast: true
matrix:
python: [3.8]
task:
- name: Check Python style
run: |
Expand All @@ -73,50 +126,23 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v1
uses: actions/checkout@v3

- name: Setup system libraries
run: |
sudo apt-get update
sudo apt-get install --yes --upgrade build-essential cmake protobuf-compiler libssl-dev glibc-source
- name: Install Rust
uses: actions-rs/toolchain@v1
- name: Cache Virtual Env
uses: actions/cache@v3
# name for referring later
id: cache-venv
with:
toolchain: stable
components: rustfmt

- name: Install Python
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python }}
architecture: "x64"
sccache: true

- name: Create a new Python environment & install maturin
run: |
python -m venv .venv
source .venv/bin/activate
pip install -U pip
pip install maturin
- name: Install dolma wheels
run: |
source .venv/bin/activate
maturin develop --extras=dev
# what we cache: the virtualenv
path: ./.venv/
# The cache key depends on pyproject.toml and Cargo.toml
key: ${{ runner.os }}-venv-${{ hashFiles('**/pyproject.toml', '**/Cargo.toml, **/Cargo.lock') }}--${{ hashFiles('python/**', 'src/**') }}

- name: ${{ matrix.task.name }}
run: |
source .venv/bin/activate
${{ matrix.task.run }}
- name: Clean up
if: always()
run: |
source .venv/bin/activate
pip uninstall -y dolma
build-linux:
if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/') }}
Expand All @@ -132,6 +158,7 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: '3.10'
cache: 'pip'
- name: Setup environment
run: |
sudo apt-get update
Expand Down Expand Up @@ -165,6 +192,7 @@ jobs:
with:
python-version: '3.10'
architecture: ${{ matrix.target }}
cache: 'pip'
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
Expand All @@ -188,6 +216,7 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: '3.10'
cache: 'pip'
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,7 @@ target/

# ignore vscode directory
.vscode

# ignore temporary directories
/tmp/
/temp/
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ test-rust:
rm -rf tests/work/*

develop:
maturin develop --extras=dev
maturin develop --extras=all

style:
rustfmt --edition 2021 src/*.rs
Expand Down
53 changes: 36 additions & 17 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,20 @@ requires-python = ">=3.8"
dependencies = [
"anyascii>=0.3.2",
"blingfire==0.1.8",
"boto3",
"boto3>=1.28",
"cached-path==1.3.4",
"detect-secrets==1.4.0",
# "fasttext==0.9.2", # broken with new version of setuptools; using fasttext-wheel instead
"fasttext-wheel==0.9.2",
"fsspec",
"fsspec>=2023.6.0",
"msgspec>=0.14.2",
"nltk==3.8.1",
"omegaconf>=2.3.0",
"presidio_analyzer==2.2.32",
"pycld2==0.41",
# "pycld3==0.22", # does not install correctly
"pyyaml",
"requests",
"rich",
"s3fs",
"s3fs>=2023.6.0",
"smart-open",
"tokenizers>=0.13.3,<1.0.0",
"tqdm",
Expand Down Expand Up @@ -108,18 +106,39 @@ dev = [
"flake8-pyi>=22.8.1",
"Flake8-pyproject>=1.1.0",
]
warc = [
"warcio>=1.7.4",
"trafilatura>=1.6.1",
"justext>=3.0.0",
"goose3>=3.1.17",

# following are all for speeding up trafilatura
"brotli",
"cchardet >= 2.1.7; python_version < '3.11'", # build issue
"faust-cchardet >= 2.1.18; python_version >= '3.11'", # fix for build
"htmldate[speed] >= 1.4.3",
"py3langid >= 0.2.2",
# extension to process code
code = [
"detect-secrets==1.4.0",
"beautifulsoup4>=4",
"pygments",
"regex"
]
# extension to detect PIIs using presidio
pii = [
"presidio_analyzer==2.2.32",
"regex"
]
# # extension to parse warc files
# warc = [
# "warcio>=1.7.4",
# "trafilatura>=1.6.1",
# "justext>=3.0.0",
# "goose3>=3.1.17",

# # following are all for speeding up trafilatura
# "brotli",
# "cchardet >= 2.1.7; python_version < '3.11'", # build issue
# "faust-cchardet >= 2.1.18; python_version >= '3.11'", # fix for build
# "htmldate[speed] >= 1.4.3",
# "py3langid >= 0.2.2",
# ]

# all extensions
all = [
"dolma[dev]",
"dolma[code]",
"dolma[pii]",
# "dolma[warc]",
]

[build-system]
Expand Down
3 changes: 2 additions & 1 deletion python/dolma/core/analyzer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import math
import multiprocessing
import re
import shutil
Expand Down Expand Up @@ -27,7 +28,7 @@ def _make_tracker(type_: str = "fixed", **kwargs: int) -> BaseBucketApi:
if type_ == "infer":
return InferBucketsValTracker(**{"n": NUM_BINS, "b": BUFF_SIZE, **kwargs})
elif type_ == "fixed":
return FixedBucketsValTracker(**{"n": NUM_BINS, **kwargs})
return FixedBucketsValTracker(**{"n": int(math.log10(NUM_BINS)), **kwargs})
else:
raise ValueError(f"Unknown tracker type {type_}")

Expand Down
17 changes: 14 additions & 3 deletions python/dolma/core/binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,14 +235,17 @@ def summarize(self, n: int, density: bool = False) -> SummaryTuple:

class FixedBucketsValTracker(BaseBucketApi):
def __init__(self, n: int = 2):
# we use n to determine the precision of the bins; for convenience we store it as a power of 10.
# 10**n will be the maximum number of bins for each power of 2.
# Too large numbers will cause numeric problems and can cause a lot of memory use.
assert n >= 0
# we use n to determine the precision of the bins; for convenience we store it as a power of 10
assert n <= 100
self.n = 10**n
self._bins: Dict[Tuple[int, int], int] = {}

def add(self, value: Union[int, float], count: int = 1):
m, e = math.frexp(value)
k = int(m * self.n), e
k = math.floor(m * self.n), e

if k not in self._bins:
self._bins[k] = 0
Expand All @@ -255,12 +258,20 @@ def __len__(self) -> int:
def full(self) -> bool:
return False

def get_bin_upper_bound(self, val: float) -> float:
"""Return the upper bound of the bin containing val"""
m, e = math.frexp(val)
k = math.floor(m * self.n) + 1 # Add one to obtain the next bin
return k / self.n * 2**e

def summarize(self, n: int, density: bool = False) -> SummaryTuple:
bins, counts = zip(*sorted((m / self.n * 2**e, c) for (m, e), c in self._bins.items()))

if len(self) <= n:
# if there are fewer than n buckets, return the buckets as is
return SummaryTuple(counts=[int(c) for c in counts], bins=[float(b) for b in bins])
# To be consistent we also add the limit of the last bin, so the bins denote bin edges
upper_bin = self.get_bin_upper_bound(max(float(b) for b in bins))
return SummaryTuple(counts=[int(c) for c in counts], bins=[float(b) for b in bins] + [upper_bin])

# computing the weighted histograms
new_counts, new_values = np.histogram(a=bins, bins=n, weights=counts, density=density)
Expand Down
Loading

0 comments on commit c5af4a7

Please sign in to comment.