diff --git a/.devcontainer/postInstall.sh b/.devcontainer/postInstall.sh
index cf3761a9..f2b12ea5 100755
--- a/.devcontainer/postInstall.sh
+++ b/.devcontainer/postInstall.sh
@@ -2,4 +2,4 @@
 
 PATH=/home/vscode/.cargo/bin:$PATH
 cd dolma
-source /home/vscode/miniforge3/bin/activate && pip install cmake "maturin[patchelf]>=1.1,<2.0"
+source /home/vscode/miniforge3/bin/activate && pip install cmake "maturin>=1.5,<2.0"
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 838abff0..1f007813 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -19,6 +19,7 @@ permissions:
 env:
   DOLMA_TESTS_SKIP_AWS: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'true' || 'false' }}
   DOLMA_TEST_S3_PREFIX: s3://dolma-tests
+  DOLMA_TEST_SKIP_LARGE_MODELS: "true"
   RUST_CHANNEL: stable
 
 jobs:
diff --git a/Cargo.lock b/Cargo.lock
index 1457f246..2beea7e9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -987,7 +987,7 @@ dependencies = [
 
 [[package]]
 name = "dolma"
-version = "1.0.14"
+version = "1.1.0"
 dependencies = [
  "adblock",
  "ahash",
diff --git a/classifiers/README.md b/classifiers/README.md
new file mode 100644
index 00000000..b71253f3
--- /dev/null
+++ b/classifiers/README.md
@@ -0,0 +1,30 @@
+# Dolma Classifiers
+
+
+## Getting Started
+
+From the repository root, install the package in editable mode:
+
+```bash
+pip install -e classifiers
+```
+
+## Examples
+
+Run the [Hugging Face FineWeb-Edu classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) on S3 data:
+
+```bash
+python -m dolma_classifiers.inference \
+    -s 's3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/20b-01/*zstd' \
+    -m HuggingFaceFW/fineweb-edu-classifier
+```
+
+Run [NVIDIA's DeBERTa quality classifier](https://huggingface.co/nvidia/quality-classifier-deberta) on S3 data with model compilation enabled:
+
+```bash
+python -m dolma_classifiers.inference \
+    -s 's3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/*/*zstd' \
+    -m nvidia/quality-classifier-deberta \
+    --model-compile \
+    --max-length 1024
+```
diff --git a/classifiers/pyproject.toml b/classifiers/pyproject.toml
new file mode 100755
index 00000000..6ee1dd1b
--- /dev/null
+++ b/classifiers/pyproject.toml
@@ -0,0 +1,107 @@
+[build-system]
+build-backend = "setuptools.build_meta"
+requires = [
+    "setuptools >= 61.0.0",
+    "wheel",
+]
+
+[project]
+name = "dolma-classifiers"
+version = "0.1.0"
+description = "Toolkit for easy classification of data in Dolma format."
+authors = [
+    {name = "Luca Soldaini", email = "lucas@allenai.org" }
+]
+license = {text = "Apache-2.0"}
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "msgspec",
+    "fsspec[s3]",
+    "smart_open[s3]>=7.0.4",
+    "tqdm",
+    "torch",
+    "transformers",
+    "wandb",
+    "jq",
+]
+
+[project.urls]
+Homepage = "https://github.com/allenai/dolma"
+Repository = "https://github.com/allenai/dolma"
+"Bug Tracker" = "https://github.com/allenai/dolma/issues"
+
+[project.optional-dependencies]
+dev = [
+    "black>=22.6.0",
+    "isort>=5.10.1",
+    "mypy>=0.971",
+    "pytest>=5.2",
+    "ipython>=8.4.0",
+    "autopep8>=1.7.0",
+    "flake8>=5.0",
+    "ipdb>=0.13.0",
+    "flake8-pyi>=22.8.1",
+    "Flake8-pyproject>=1.1.0",
+    "pytest-asyncio>=0.15.1",
+    "pytest-cov>=2.12.1",
+    "aioresponses>=0.7.2",
+]
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-data]
+dolma_classifiers = ["py.typed", "*.pyi"]
+
+[tool.black]
+line-length = 115
+include = '\.pyi?$'
+exclude = '''
+(
+      __pycache__
+    | \.git
+    | \.mypy_cache
+    | \.pytest_cache
+    | \.vscode
+    | \.venv
+    | \bdist\b
+    | \bdoc\b
+)
+'''
+
+[tool.isort]
+profile = "black"
+line_length = 115
+multi_line_output = 3
+
+[tool.autopep8]
+max_line_length = 115
+in-place = true
+recursive = true
+aggressive = 3
+
+[tool.mypy]
+python_version = "3.10"
+ignore_missing_imports = true
+no_site_packages = true
+allow_redefinition = false
+warn_unused_configs = true
+warn_unused_ignores = true
+warn_no_return = true
+warn_return_any = false
+warn_unreachable = true
+show_error_codes = true
+pretty = true
+
+# mypy reads per-module settings from [[tool.mypy.overrides]] only; a [tool.mypy-tests] table is ignored.
+[[tool.mypy.overrides]]
+module = "tests.*"
+strict_optional = false
+
+[tool.flake8]
+per-file-ignores = [
+    '__init__.py:F401',
+    '*.pyi:E302,E305',
+    '*.py:E203',
+]
diff --git a/classifiers/scripts/fineweb_100b.sh b/classifiers/scripts/fineweb_100b.sh
new file mode 100644
index 00000000..45298889
--- /dev/null
+++ b/classifiers/scripts/fineweb_100b.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Launch a gantry job that scores DOCUMENTS with MODEL_NAME using dolma_classifiers.inference under torchrun.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100*/*.jsonl.zstd'
+
+NUM_NODES=2
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=1024
+PRIORITY="high"
+
+# Derive the run name from the first 8 hex digits of md5(model name + document glob).
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Export the run name as BEAKER_EXPERIMENT_NAME so child processes (gantry included) inherit it.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# In the bash -c string, \" and \$ defer quoting/expansion to the job's shell; single quotes keep the S3 glob literal there.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
diff --git a/classifiers/scripts/fineweb_40b.sh b/classifiers/scripts/fineweb_40b.sh
new file mode 100644
index 00000000..69c43247
--- /dev/null
+++ b/classifiers/scripts/fineweb_40b.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Launch a gantry job that scores DOCUMENTS with MODEL_NAME using dolma_classifiers.inference under torchrun.
+
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/*/*zstd'
+NUM_NODES=1
+BATCH_SIZE=1024
+CLUSTER="ai2/neptune*"
+PRIORITY="high"
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+
+# Derive the run name from the first 8 hex digits of md5(model name + document glob).
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Export the run name as BEAKER_EXPERIMENT_NAME so child processes (gantry included) inherit it.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# In the bash -c string, \" and \$ defer quoting/expansion to the job's shell; single quotes keep the S3 glob literal there.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 4 --prefetch-factor 8"
diff --git a/classifiers/scripts/fineweb_50b_extra.sh b/classifiers/scripts/fineweb_50b_extra.sh
new file mode 100644
index 00000000..d80909c8
--- /dev/null
+++ b/classifiers/scripts/fineweb_50b_extra.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Launch a gantry job that scores DOCUMENTS with MODEL_NAME using dolma_classifiers.inference under torchrun.
+
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/20240909-50b/*zstd'
+NUM_NODES=1
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=1024
+PRIORITY="high"
+
+# Derive the run name from the first 8 hex digits of md5(model name + document glob).
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Export the run name as BEAKER_EXPERIMENT_NAME so child processes (gantry included) inherit it.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# In the bash -c string, \" and \$ defer quoting/expansion to the job's shell; single quotes keep the S3 glob literal there.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
diff --git a/classifiers/scripts/fineweb_automath_arxiv.sh b/classifiers/scripts/fineweb_automath_arxiv.sh
new file mode 100644
index 00000000..ed9afe84
--- /dev/null
+++ b/classifiers/scripts/fineweb_automath_arxiv.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Launch a gantry job that scores DOCUMENTS with MODEL_NAME using dolma_classifiers.inference under torchrun.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/arxiv/*/*.gz'
+
+NUM_NODES=1
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=1024
+PRIORITY="urgent"
+
+# Derive the run name from the first 8 hex digits of md5(model name + document glob).
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Export the run name as BEAKER_EXPERIMENT_NAME so child processes (gantry included) inherit it.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# In the bash -c string, \" and \$ defer quoting/expansion to the job's shell; single quotes keep the S3 glob literal there.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
diff --git a/classifiers/scripts/fineweb_automath_code.sh b/classifiers/scripts/fineweb_automath_code.sh
new file mode 100644
index 00000000..a9769496
--- /dev/null
+++ b/classifiers/scripts/fineweb_automath_code.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Launch a gantry job that scores DOCUMENTS with MODEL_NAME using dolma_classifiers.inference under torchrun.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/code/*/*.gz'
+
+NUM_NODES=1
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=1024
+PRIORITY="urgent"
+
+# Derive the run name from the first 8 hex digits of md5(model name + document glob).
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Export the run name as BEAKER_EXPERIMENT_NAME so child processes (gantry included) inherit it.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# In the bash -c string, \" and \$ defer quoting/expansion to the job's shell; single quotes keep the S3 glob literal there.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
diff --git a/classifiers/scripts/fineweb_automath_web.sh b/classifiers/scripts/fineweb_automath_web.sh
new file mode 100644
index 00000000..2994999a
--- /dev/null
+++ b/classifiers/scripts/fineweb_automath_web.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Launch a gantry job that scores DOCUMENTS with MODEL_NAME using dolma_classifiers.inference under torchrun.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/web/*.gz'
+
+NUM_NODES=1
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=1024
+PRIORITY="urgent"
+
+# Derive the run name from the first 8 hex digits of md5(model name + document glob).
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Export the run name as BEAKER_EXPERIMENT_NAME so child processes (gantry included) inherit it.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# In the bash -c string, \" and \$ defer quoting/expansion to the job's shell; single quotes keep the S3 glob literal there.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
diff --git a/classifiers/scripts/fineweb_dclm07.sh b/classifiers/scripts/fineweb_dclm07.sh
new file mode 100644
index 00000000..bb41d32b
--- /dev/null
+++ b/classifiers/scripts/fineweb_dclm07.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Launch a gantry job that scores DOCUMENTS with MODEL_NAME using dolma_classifiers.inference under torchrun.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile/documents/*zst'
+
+NUM_NODES=4
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=1024
+PRIORITY="urgent"
+
+# Derive the run name from the first 8 hex digits of md5(model name + document glob).
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Export the run name as BEAKER_EXPERIMENT_NAME so child processes (gantry included) inherit it.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# In the bash -c string, \" and \$ defer quoting/expansion to the job's shell; single quotes keep the S3 glob literal there.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 4"
diff --git a/classifiers/scripts/fineweb_flan.sh b/classifiers/scripts/fineweb_flan.sh
new file mode 100644
index 00000000..4def83f0
--- /dev/null
+++ b/classifiers/scripts/fineweb_flan.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Launch a gantry job that scores DOCUMENTS with MODEL_NAME using dolma_classifiers.inference under torchrun.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/documents/*.gz'
+
+NUM_NODES=1
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=1024
+PRIORITY="urgent"
+
+# Derive the run name from the first 8 hex digits of md5(model name + document glob).
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Export the run name as BEAKER_EXPERIMENT_NAME so child processes (gantry included) inherit it.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# In the bash -c string, \" and \$ defer quoting/expansion to the job's shell; single quotes keep the S3 glob literal there.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
diff --git a/classifiers/scripts/fineweb_full.sh b/classifiers/scripts/fineweb_full.sh
new file mode 100644
index 00000000..a74d07f7
--- /dev/null
+++ b/classifiers/scripts/fineweb_full.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Launch a gantry job that scores DOCUMENTS with MODEL_NAME using dolma_classifiers.inference under torchrun.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/full/*.jsonl.zstd'
+
+NUM_NODES=4
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=1024
+PRIORITY="urgent"
+
+# Derive the run name from the first 8 hex digits of md5(model name + document glob).
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Export the run name as BEAKER_EXPERIMENT_NAME so child processes (gantry included) inherit it.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# In the bash -c string, \" and \$ defer quoting/expansion to the job's shell; single quotes keep the S3 glob literal there.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
diff --git a/classifiers/scripts/fineweb_owm.sh b/classifiers/scripts/fineweb_owm.sh
new file mode 100644
index 00000000..29d872d1
--- /dev/null
+++ b/classifiers/scripts/fineweb_owm.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Launch a gantry job that scores DOCUMENTS with MODEL_NAME using dolma_classifiers.inference under torchrun.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/proof-pile-2/v0_decontaminated/documents/*/*/*.gz'
+
+NUM_NODES=1
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=1024
+PRIORITY="urgent"
+
+# Derive the run name from the first 8 hex digits of md5(model name + document glob).
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Export the run name as BEAKER_EXPERIMENT_NAME so child processes (gantry included) inherit it.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# In the bash -c string, \" and \$ defer quoting/expansion to the job's shell; single quotes keep the S3 glob literal there.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
diff --git a/classifiers/scripts/fineweb_pes2o.sh b/classifiers/scripts/fineweb_pes2o.sh
new file mode 100644
index 00000000..38386ac7
--- /dev/null
+++ b/classifiers/scripts/fineweb_pes2o.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Launch a gantry job that scores DOCUMENTS with MODEL_NAME using dolma_classifiers.inference under torchrun.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/*/*/*/*.gz'
+
+NUM_NODES=1
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=1024
+PRIORITY="urgent"
+
+# Derive the run name from the first 8 hex digits of md5(model name + document glob).
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Export the run name as BEAKER_EXPERIMENT_NAME so child processes (gantry included) inherit it.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# In the bash -c string, \" and \$ defer quoting/expansion to the job's shell; single quotes keep the S3 glob literal there.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
diff --git a/classifiers/scripts/fineweb_se.sh b/classifiers/scripts/fineweb_se.sh
new file mode 100644
index 00000000..7fe3e43b
--- /dev/null
+++ b/classifiers/scripts/fineweb_se.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Launch a gantry job that scores DOCUMENTS with MODEL_NAME using dolma_classifiers.inference under torchrun.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/stackexchange/v0/documents/20240930/*.zst'
+
+NUM_NODES=1
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=1024
+PRIORITY="urgent"
+
+# Derive the run name from the first 8 hex digits of md5(model name + document glob).
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Export the run name as BEAKER_EXPERIMENT_NAME so child processes (gantry included) inherit it.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# In the bash -c string, \" and \$ defer quoting/expansion to the job's shell; single quotes keep the S3 glob literal there.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
diff --git a/classifiers/scripts/nvidia-deberta-100b.sh b/classifiers/scripts/nvidia-deberta-100b.sh
new file mode 100644
index 00000000..5c29e669
--- /dev/null
+++ b/classifiers/scripts/nvidia-deberta-100b.sh
@@ -0,0 +1,57 @@
+#! /bin/bash
+# Score the DCLM 100b document shards with nvidia/quality-classifier-deberta through a gantry job.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100*/*.jsonl.zstd'
+
+# Alternative per-shard inputs: uncomment one to override DOCUMENTS with that subset.
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0000*.jsonl.zstd'
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0001*.jsonl.zstd'
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0002*.jsonl.zstd'
+
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0000*.jsonl.zstd'
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0001*.jsonl.zstd'
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0002*.jsonl.zstd'
+
+# Job configuration.
+NUM_NODES=4
+# NUM_NODES=1
+MODEL_NAME="nvidia/quality-classifier-deberta"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=512
+PRIORITY="high"
+# PRIORITY="urgent"
+
+# Derive a deterministic run name: first 8 hex chars of the MD5 of model name + document glob.
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"
+
+# Export the run name so child processes see it as BEAKER_EXPERIMENT_NAME.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# The payload is one string for the job's bash: \$-escaped variables expand inside the job, and the glob is single-quoted so that shell does not expand it.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
diff --git a/classifiers/scripts/nvidia-deberta-40b.sh b/classifiers/scripts/nvidia-deberta-40b.sh
new file mode 100644
index 00000000..b3a57c14
--- /dev/null
+++ b/classifiers/scripts/nvidia-deberta-40b.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Score the DCLM 40b-split documents with nvidia/quality-classifier-deberta through a gantry job.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/*/*zstd'
+
+NUM_NODES=2
+MODEL_NAME="nvidia/quality-classifier-deberta"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=512
+PRIORITY="high"
+
+# Derive a deterministic run name: first 8 hex chars of the MD5 of model name + document glob.
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"
+
+# Export the run name so child processes see it as BEAKER_EXPERIMENT_NAME.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# The payload is one string for the job's bash: \$-escaped variables expand inside the job, and the glob is single-quoted so that shell does not expand it.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
diff --git a/classifiers/scripts/nvidia-deberta-50_extra.sh b/classifiers/scripts/nvidia-deberta-50_extra.sh
new file mode 100644
index 00000000..cdd1f7b5
--- /dev/null
+++ b/classifiers/scripts/nvidia-deberta-50_extra.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+# Score the DCLM 20240909-50b documents with nvidia/quality-classifier-deberta through a gantry job.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/20240909-50b/*zstd'
+
+NUM_NODES=2
+MODEL_NAME="nvidia/quality-classifier-deberta"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=512
+PRIORITY="high"
+
+# Derive a deterministic run name: first 8 hex chars of the MD5 of model name + document glob.
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"
+
+# Export the run name so child processes see it as BEAKER_EXPERIMENT_NAME.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# The payload is one string for the job's bash: \$-escaped variables expand inside the job, and the glob is single-quoted so that shell does not expand it.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
diff --git a/classifiers/scripts/nvidia-deberta-automath-arxiv.sh b/classifiers/scripts/nvidia-deberta-automath-arxiv.sh
new file mode 100644
index 00000000..52898bbd
--- /dev/null
+++ b/classifiers/scripts/nvidia-deberta-automath-arxiv.sh
@@ -0,0 +1,46 @@
+#! /bin/bash
+# Score the AutoMathText arxiv documents with nvidia/quality-classifier-deberta through a gantry job.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/arxiv/*/*.gz'
+
+# Job configuration.
+NUM_NODES=1
+MODEL_NAME="nvidia/quality-classifier-deberta"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=512
+PRIORITY="urgent"
+
+# Derive a deterministic run name: first 8 hex chars of the MD5 of model name + document glob.
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"
+
+# Export the run name so child processes see it as BEAKER_EXPERIMENT_NAME.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# The payload is one string for the job's bash: \$-escaped variables expand inside the job, and the glob is single-quoted so that shell does not expand it.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
diff --git a/classifiers/scripts/nvidia-deberta-automath-code.sh b/classifiers/scripts/nvidia-deberta-automath-code.sh
new file mode 100644
index 00000000..958264a3
--- /dev/null
+++ b/classifiers/scripts/nvidia-deberta-automath-code.sh
@@ -0,0 +1,46 @@
+#! /bin/bash
+# Score the AutoMathText code documents with nvidia/quality-classifier-deberta through a gantry job.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/code/*/*.gz'
+
+# Job configuration.
+NUM_NODES=1
+MODEL_NAME="nvidia/quality-classifier-deberta"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=512
+PRIORITY="urgent"
+
+# Derive a deterministic run name: first 8 hex chars of the MD5 of model name + document glob.
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"
+
+# Export the run name so child processes see it as BEAKER_EXPERIMENT_NAME.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# The payload is one string for the job's bash: \$-escaped variables expand inside the job, and the glob is single-quoted so that shell does not expand it.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
diff --git a/classifiers/scripts/nvidia-deberta-automath-web.sh b/classifiers/scripts/nvidia-deberta-automath-web.sh
new file mode 100644
index 00000000..631d5b06
--- /dev/null
+++ b/classifiers/scripts/nvidia-deberta-automath-web.sh
@@ -0,0 +1,46 @@
+#! /bin/bash
+# Score the AutoMathText web documents with nvidia/quality-classifier-deberta through a gantry job.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/web/*.gz'
+
+# Job configuration.
+NUM_NODES=1
+MODEL_NAME="nvidia/quality-classifier-deberta"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=512
+PRIORITY="urgent"
+
+# Derive a deterministic run name: first 8 hex chars of the MD5 of model name + document glob.
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"
+
+# Export the run name so child processes see it as BEAKER_EXPERIMENT_NAME.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# The payload is one string for the job's bash: \$-escaped variables expand inside the job, and the glob is single-quoted so that shell does not expand it.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
diff --git a/classifiers/scripts/nvidia-deberta-flan.sh b/classifiers/scripts/nvidia-deberta-flan.sh
new file mode 100644
index 00000000..da0be027
--- /dev/null
+++ b/classifiers/scripts/nvidia-deberta-flan.sh
@@ -0,0 +1,46 @@
+#! /bin/bash
+# Score the tulu_flan documents with nvidia/quality-classifier-deberta through a gantry job.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/documents/*.gz'
+
+# Job configuration.
+NUM_NODES=1
+MODEL_NAME="nvidia/quality-classifier-deberta"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=512
+PRIORITY="urgent"
+
+# Derive a deterministic run name: first 8 hex chars of the MD5 of model name + document glob.
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"
+
+# Export the run name so child processes see it as BEAKER_EXPERIMENT_NAME.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# The payload is one string for the job's bash: \$-escaped variables expand inside the job, and the glob is single-quoted so that shell does not expand it.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
diff --git a/classifiers/scripts/nvidia-deberta-full.sh b/classifiers/scripts/nvidia-deberta-full.sh
new file mode 100644
index 00000000..cddacc11
--- /dev/null
+++ b/classifiers/scripts/nvidia-deberta-full.sh
@@ -0,0 +1,57 @@
+#! /bin/bash
+# Score the full DCLM document set with nvidia/quality-classifier-deberta through a gantry job.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/full/*.jsonl.zstd'
+
+# Alternative per-shard inputs: uncomment one to override DOCUMENTS with that subset.
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0000*.jsonl.zstd'
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0001*.jsonl.zstd'
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0002*.jsonl.zstd'
+
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0000*.jsonl.zstd'
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0001*.jsonl.zstd'
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0002*.jsonl.zstd'
+
+# Job configuration.
+NUM_NODES=8
+# NUM_NODES=1
+MODEL_NAME="nvidia/quality-classifier-deberta"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=512
+PRIORITY="urgent"
+# PRIORITY="high"
+
+# Derive a deterministic run name: first 8 hex chars of the MD5 of model name + document glob.
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"
+
+# Export the run name so child processes see it as BEAKER_EXPERIMENT_NAME.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# The payload is one string for the job's bash: \$-escaped variables expand inside the job, and the glob is single-quoted so that shell does not expand it.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
diff --git a/classifiers/scripts/nvidia-deberta-owm.sh b/classifiers/scripts/nvidia-deberta-owm.sh
new file mode 100644
index 00000000..7b779edb
--- /dev/null
+++ b/classifiers/scripts/nvidia-deberta-owm.sh
@@ -0,0 +1,46 @@
+#! /bin/bash
+# Score the proof-pile-2 (v0_decontaminated) documents with nvidia/quality-classifier-deberta through a gantry job.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/proof-pile-2/v0_decontaminated/documents/*/*/*.gz'
+
+# Job configuration.
+NUM_NODES=1
+MODEL_NAME="nvidia/quality-classifier-deberta"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=512
+PRIORITY="urgent"
+
+# Derive a deterministic run name: first 8 hex chars of the MD5 of model name + document glob.
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"
+
+# Export the run name so child processes see it as BEAKER_EXPERIMENT_NAME.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# The payload is one string for the job's bash: \$-escaped variables expand inside the job, and the glob is single-quoted so that shell does not expand it.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
diff --git a/classifiers/scripts/nvidia-deberta-pes2o.sh b/classifiers/scripts/nvidia-deberta-pes2o.sh
new file mode 100644
index 00000000..22423f94
--- /dev/null
+++ b/classifiers/scripts/nvidia-deberta-pes2o.sh
@@ -0,0 +1,46 @@
+#! /bin/bash
+# Score the s2/v3-fos documents with nvidia/quality-classifier-deberta through a gantry job.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/*/*/*/*.gz'
+
+# Job configuration.
+NUM_NODES=1
+MODEL_NAME="nvidia/quality-classifier-deberta"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=512
+PRIORITY="urgent"
+
+# Derive a deterministic run name: first 8 hex chars of the MD5 of model name + document glob.
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"
+
+# Export the run name so child processes see it as BEAKER_EXPERIMENT_NAME.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# The payload is one string for the job's bash: \$-escaped variables expand inside the job, and the glob is single-quoted so that shell does not expand it.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
diff --git a/classifiers/scripts/nvidia-deberta-se.sh b/classifiers/scripts/nvidia-deberta-se.sh
new file mode 100644
index 00000000..fe24bbb0
--- /dev/null
+++ b/classifiers/scripts/nvidia-deberta-se.sh
@@ -0,0 +1,46 @@
+#! /bin/bash
+# Score the stackexchange 20240930 documents with nvidia/quality-classifier-deberta through a gantry job.
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/stackexchange/v0/documents/20240930/*.zst'
+
+# Job configuration.
+NUM_NODES=1
+MODEL_NAME="nvidia/quality-classifier-deberta"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=512
+PRIORITY="urgent"
+
+# Derive a deterministic run name: first 8 hex chars of the MD5 of model name + document glob.
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"
+
+# Export the run name so child processes see it as BEAKER_EXPERIMENT_NAME.
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# The payload is one string for the job's bash: \$-escaped variables expand inside the job, and the glob is single-quoted so that shell does not expand it.
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env "BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr)" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
diff --git a/classifiers/src/dolma_classifiers/__init__.py b/classifiers/src/dolma_classifiers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/classifiers/src/dolma_classifiers/inference/__init__.py b/classifiers/src/dolma_classifiers/inference/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/classifiers/src/dolma_classifiers/inference/__main__.py b/classifiers/src/dolma_classifiers/inference/__main__.py
new file mode 100644
index 00000000..92515713
--- /dev/null
+++ b/classifiers/src/dolma_classifiers/inference/__main__.py
@@ -0,0 +1,5 @@
+from .inference import main, parse_args
+
+if __name__ == "__main__":
+    # Parse the command-line arguments and pass them straight to the inference entry point.
+    main(parse_args())
diff --git a/classifiers/src/dolma_classifiers/inference/inference.py b/classifiers/src/dolma_classifiers/inference/inference.py
new file mode 100644
index 00000000..b488a84a
--- /dev/null
+++ b/classifiers/src/dolma_classifiers/inference/inference.py
@@ -0,0 +1,445 @@
+import argparse
+import time
+from collections import defaultdict
+from functools import partial
+from itertools import zip_longest
+from multiprocessing import Event, Process
+from queue import Empty
+from queue import Queue as QueueType
+from threading import Event as EventType
+from typing import Any, Generator, NamedTuple
+from urllib.parse import urlparse
+
+import fsspec
+import jq
+import msgspec
+import smart_open
+import torch
+import torch.multiprocessing as mp
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import (  # pyright: ignore
+    DataLoader,
+    IterableDataset,
+    get_worker_info,
+)
+from transformers import BatchEncoding, PreTrainedTokenizer
+
+from .loggers import ProgressLogger, WandbLogger, get_logger
+from .models import Registry
+from .utils import cleanup, get_local_gpu_rank, sanitize_model_name, setup
+
+
+class Batch(NamedTuple):  # tokenized documents plus per-document metadata
+    encoding: BatchEncoding | dict[str, torch.Tensor]  # tokenizer output, or a dict of padded tensors after collation
+    ids: list[str]  # document ids, one per document
+    lengths: list[int]  # character length of each document's text
+    sources: list[str]  # path of the file each document was read from
+
+    def __len__(self):  # number of documents in the batch
+        return len(self.ids)
+
+
+class OutputPath(NamedTuple):  # emitted by the dataset once a source file has been read to the end
+    source: str  # path of the source file
+    count: int  # number of documents read from that file
+
+
+class DocumentsIterableDataset(IterableDataset[Batch]):
+    """Streams documents from the files listed in a shared queue, yielding one tokenized document at a time."""
+
+    def __init__(
+        self,
+        input_paths_queue: QueueType[str],
+        output_paths_queue: QueueType[OutputPath],
+        tokenizer: PreTrainedTokenizer,
+        max_length: int | None,
+        text_selector: str = '.text',
+        id_selector: str = ".id",
+    ):
+        self.input_paths_queue = input_paths_queue
+        self.output_paths_queue = output_paths_queue
+
+        self.text_selector = text_selector
+        self.id_selector = id_selector
+        self.tokenizer = tokenizer
+        self.logger = get_logger(self.__class__.__name__)
+        self.max_length = max_length or int(tokenizer.model_max_length)
+
+    @property
+    def worker_info(self):
+        """Return (worker id, number of workers) inside a DataLoader worker, or (0, 1) outside of one."""
+        if (worker_info := get_worker_info()):
+            return worker_info.id, worker_info.num_workers
+        return 0, 1
+
+    def __iter__(self) -> Generator[Batch, None, None]:
+        """Pull paths until the queue is empty; yield one single-document Batch per line, then report the count."""
+        decoder = msgspec.json.Decoder()
+        text_selector = jq.compile(self.text_selector)
+        id_selector = jq.compile(self.id_selector)
+
+        while True:
+            try:
+                # never block: another worker may take the last path between a size check and get()
+                path = self.input_paths_queue.get_nowait()
+            except Empty:
+                break
+            self.logger.info(f"Reading {path}")
+            count = 0
+            with smart_open.open(path, "rt") as source_file:
+                for line in source_file:
+                    doc = decoder.decode(line)
+                    text = str(text_selector.input(doc).first())
+                    id_ = str(id_selector.input(doc).first())
+                    encoding = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=self.max_length)
+                    yield Batch(encoding=encoding, ids=[id_], lengths=[len(text)], sources=[path])
+                    count += 1
+
+            self.logger.info(f"Read {count:,} documents from {path}")
+            self.output_paths_queue.put(OutputPath(source=path, count=count))
+
+
+def collate_batch(batch: list[Batch], pad_token_id: int) -> Batch:
+    """Merge single-document batches into one right-padded batch; ids, lengths and sources are concatenated."""
+    padded_encodings = {
+        key: pad_sequence(
+            # assuming first dimension is batch size; only token ids use the pad token, masks/type ids pad with 0
+            [b.encoding[key][-1, :] for b in batch],   # pyright: ignore
+            batch_first=True, padding_value=(pad_token_id or 0) if key == "input_ids" else 0,
+        )
+        for key in batch[0].encoding.keys()
+    }
+    return Batch(
+        encoding=padded_encodings,
+        ids=[id_ for elem in batch for id_ in elem.ids],
+        lengths=[length for elem in batch for length in elem.lengths],
+        sources=[source for elem in batch for source in elem.sources],
+    )
+
+
+class AttributeRow(NamedTuple):  # scores for one batch, handed from the scoring loop to the writer process
+    sources: list[str]  # source path of each document, aligned with `attributes`
+    attributes: list[dict[str, Any]]  # one {"id": ..., "attributes": {...}} record per document
+
+
+def writer_worker(
+    error_event: EventType,
+    scores_queue: QueueType[AttributeRow | None],
+    output_paths_queue: QueueType[OutputPath],
+    source_destination_mapping: dict[str, str],
+    log_every: int = 10_000,
+):
+    """Consume scored batches from `scores_queue` and append them as JSON lines to each source's destination.
+
+    A `None` element ends the loop. Any exception is logged and signalled by setting `error_event`; every
+    file that is still open is closed on exit.
+    """
+    progress_logger = ProgressLogger(log_every=log_every, wandb_logger=WandbLogger())
+    console_logger = get_logger("writer_worker")
+
+    files_writers = {}
+    try:
+        encoder = msgspec.json.Encoder()
+        counts = defaultdict(int)
+        total_count = 0
+
+        while True:
+            element = scores_queue.get()  # blocks until a batch or the `None` sentinel arrives
+            if element is None:
+                break
+
+            group_by_source = defaultdict(list)
+            for source, attribute in zip(element.sources, element.attributes):
+                group_by_source[source].append(attribute)
+                if source not in files_writers:
+                    destination_path = source_destination_mapping[source]
+                    files_writers[source] = smart_open.open(destination_path, "wt", encoding="utf-8")
+                    console_logger.info(f"Opened {destination_path} for writing")
+
+            for source, attributes in group_by_source.items():
+                files_writers[source].write(
+                    encoder.encode_lines(attributes).decode("utf-8")
+                )
+                progress_logger.increment(docs=len(attributes))
+                counts[source] += len(attributes)
+                total_count += len(attributes)
+
+            if total_count > log_every:
+                # we at most close one file per log_every documents
+                try:
+                    # get the paths from the output queue (these have been fully processed)
+                    path = output_paths_queue.get_nowait()
+                except Empty:
+                    path = None
+
+                if path is not None and path.count == counts[path.source]:
+                    # this source is complete; close its file (an empty source never had one opened)
+                    if (f := files_writers.pop(path.source, None)) is not None:
+                        f.close()
+                    console_logger.info(f"Closed {source_destination_mapping[path.source]}")
+                    progress_logger.increment(files=1)
+                elif path is not None and counts[path.source] > path.count:
+                    raise RuntimeError(
+                        f"More documents ({counts[path.source]}) than expected ({path.count}) " +
+                        f"for source {path.source}. This should not happen!"
+                    )
+                elif path is not None:
+                    console_logger.info(
+                        f"Tried to close {source_destination_mapping[path.source]}, " +
+                        f"but only seen {counts[path.source]}/{path.count} documents"
+                    )
+                    # more documents still to be written for this source; put it back
+                    output_paths_queue.put(path)
+                total_count = 0
+    except Exception as e:
+        console_logger.error(f"Writer process encountered an error: {e}")
+        error_event.set()
+    finally:
+        for f in files_writers.values():
+            f.close()
+
+
+def process_documents(
+    source_paths: list[str],
+    destination_paths: list[str],
+    batch_size: int,
+    model_name: str,
+    model_dtype: str,
+    model_compile: bool,
+    log_every: int,
+    max_length: int | None = None,
+    text_selector: str = ".text",
+    id_selector: str = ".id",
+    num_workers: int = 1,
+    prefetch_factor: int = 2,
+    suffix: str | None = None
+):
+    """Score every document in `source_paths` and write the results to the matching `destination_paths`.
+
+    Documents are tokenized by DataLoader workers, scored by the classifier on this process's GPU and handed
+    to a separate writer process; sources whose destination already exists are skipped. `suffix` is accepted
+    but currently unused. Raises RuntimeError if the writer process reports an error.
+    """
+
+    classifier = Registry.get(
+        model_name=model_name,
+        device=f'cuda:{get_local_gpu_rank()}',
+        dtype=model_dtype,
+        compile=model_compile,
+    )
+
+    # get filesystem for first source path (we assume is the same for all source paths); we will use this
+    # to check if destination path exists (file already processed)
+    fs = fsspec.get_filesystem_class(urlparse(source_paths[0]).scheme)()
+
+    source_destination_mapping = {
+        source_path: destination_path
+        for source_path, destination_path in zip(source_paths, destination_paths)
+        if not fs.exists(destination_path)
+    }
+
+    with torch.no_grad(), mp.Manager() as manager:
+        input_paths_queue: QueueType[str] = manager.Queue()
+        output_paths_queue: QueueType[OutputPath] = manager.Queue()
+        scores_queue: QueueType[AttributeRow | None] = manager.Queue()
+        for source_path in source_destination_mapping:
+            input_paths_queue.put(source_path)
+
+        writer_process_error = Event()
+        writer_process = Process(
+            target=writer_worker,
+            kwargs=dict(
+                scores_queue=scores_queue,
+                output_paths_queue=output_paths_queue,
+                source_destination_mapping=source_destination_mapping,
+                log_every=log_every,
+                error_event=writer_process_error,
+            ),
+        )
+        writer_process.start()
+
+        try:
+            source_dataset = DocumentsIterableDataset(
+                input_paths_queue=input_paths_queue,
+                output_paths_queue=output_paths_queue,
+                tokenizer=classifier.tokenizer,
+                max_length=max_length,
+                text_selector=text_selector,
+                id_selector=id_selector,
+            )
+
+            data_loader = DataLoader(
+                source_dataset,
+                batch_size=batch_size,
+                shuffle=False,
+                num_workers=num_workers,
+                # DataLoader only accepts a prefetch factor when worker processes are used
+                prefetch_factor=prefetch_factor if num_workers > 0 else None,
+                collate_fn=partial(collate_batch, pad_token_id=getattr(classifier.tokenizer, "pad_token_id", 0)),
+            )
+
+            for batch in data_loader:
+                if writer_process_error.is_set():
+                    raise RuntimeError("Writer process encountered an error")
+
+                inputs = {k: v.to(classifier.device) for k, v in batch.encoding.items()}
+                scores = classifier.score(**inputs)
+
+                attributes = [
+                    {"id": doc_id, "attributes": {pred.label: [[0, doc_length, pred.score]] for pred in doc_preds}}
+                    for doc_preds, doc_id, doc_length in zip(scores, batch.ids, batch.lengths)
+                ]
+                scores_queue.put_nowait(AttributeRow(sources=batch.sources, attributes=attributes))
+        finally:
+            # always send the sentinel: without it the writer never exits and join() would wait forever
+            scores_queue.put(None)
+            writer_process.join()
+            if writer_process_error.is_set():
+                raise RuntimeError("Writer process encountered an error")
+
+    cleanup()
+
+
+def longest_common_sequence(paths: list[str]) -> str:
+    """Return the longest run of leading "/"-separated components shared by every path.
+
+    Components are compared position by position; the scan stops at the first position where
+    the paths disagree or where at least one path has run out of components.
+    """
+    shared_prefix: list[str] = []
+    columns = zip_longest(*(path.split("/") for path in paths), fillvalue=None)
+    for column in columns:
+        head, *rest = column
+        if any(part != head for part in rest):
+            # mismatch (or a shorter path padded with None): the common prefix ends here
+            break
+        shared_prefix.append(head)
+    return "/".join(shared_prefix)
+
+
+def main(args: argparse.Namespace) -> None:
+    """Resolve source and destination files, skip existing destinations unless --override, and process this rank's share."""
+    console_logger = get_logger("main")
+
+    # initialize distributed processing
+    rank, world_size = setup()
+
+    # initialize wandb logging (if enabled)
+    WandbLogger()
+
+    # check for available GPUs
+    if not torch.cuda.is_available():
+        raise RuntimeError("No GPUs available, but the script is designed to use multiple GPUs.")
+
+    # if necessary, unglob source prefix
+    fs = fsspec.get_filesystem_class((scheme := urlparse(args.source_prefix).scheme))()
+    source_paths = [(f"{scheme}://{p}" if scheme else p) for p in fs.glob(args.source_prefix)]
+
+    assert len(source_paths) > 0, f"No files found in {args.source_prefix}"
+
+    if all("/documents/" in p for p in source_paths):
+        source_prefix = longest_common_sequence([p.split("/documents/", 1)[0] for p in source_paths])
+        source_prefix = f"{source_prefix}/documents/"
+    else:
+        source_prefix = longest_common_sequence(source_paths)
+
+    destination_paths = [
+        f'{args.output_prefix.rstrip("/")}/{p.replace(source_prefix, "").lstrip("/")}' for p in source_paths
+    ]
+
+    console_logger.info(f"Processing up to {len(source_paths)} files from {args.source_prefix} to {args.output_prefix}")
+
+    # Filter out existing files unless --override is set
+    if not args.override:
+
+        # possible existing destinations might contain more files than destination_paths because we glob at the
+        # attribute name level; the scheme is re-attached only when there is one, exactly as for source_paths.
+        possible_existing_destinations = {(f"{scheme}://{p}" if scheme else p) for p in fs.glob(f'{args.output_prefix.rstrip("/")}/**')}
+        existing_destinations = {p for p in destination_paths if p in possible_existing_destinations}
+
+        console_logger.info(f"Found {len(existing_destinations)} existing files in {args.output_prefix}")
+
+        if len(existing_destinations) >= len(source_paths):
+            console_logger.info("No files left to process, exiting")
+            return
+
+        source_paths, destination_paths = map(
+            lambda t: list(t),
+            zip(*[(p, d) for p, d in zip(source_paths, destination_paths) if d not in existing_destinations]),
+        )
+
+    console_logger.info(f"After filtering, tagging {len(source_paths)} files")
+
+    # Distribute files across processes
+    files_per_process = len(source_paths) / world_size
+    start_idx = int(rank * files_per_process)
+    end_idx = int((rank + 1) * files_per_process) if rank < world_size - 1 else len(source_paths)
+    partition_source_paths = source_paths[start_idx:end_idx]
+    partition_destination_paths = destination_paths[start_idx:end_idx]
+
+    console_logger.info(f"Partitioned into {world_size} workers of with avg {files_per_process:.2f} files.")
+    console_logger.info(f"Processing GPU {rank}/{world_size}: {len(partition_source_paths)} files")
+
+    process_documents(
+        model_name=args.model_name,
+        model_dtype=args.model_dtype,
+        log_every=args.log_every,
+        source_paths=partition_source_paths,
+        destination_paths=partition_destination_paths,
+        batch_size=args.batch_size,
+        num_workers=args.num_workers,
+        max_length=args.max_length,
+        text_selector=args.text_key,
+        id_selector=args.id_key,
+        suffix=args.attribute_suffix,
+        model_compile=args.model_compile,
+        prefetch_factor=args.prefetch_factor,
+    )
+
+
+def parse_args() -> argparse.Namespace:  # builds the CLI, fills in a derived output prefix, and configures WandbLogger
+    parser = argparse.ArgumentParser(
+        description="Classify text from JSONL files on S3 using a Hugging Face model."
+    )
+    parser.add_argument(
+        "-s",
+        "--source-prefix",
+        type=str,
+        required=True,
+        help="S3 glob pattern for input files (e.g., s3://path/to/docs/*/*.jsonl.gz)",
+    )
+    parser.add_argument("--output-prefix", type=str, default=None, help="S3 prefix to save the results")
+    parser.add_argument("-b", "--batch-size", type=int, default=32, help="Batch size for processing (default: 32)")
+    parser.add_argument("-m", "--model-name", type=str, required=True, help="Hugging Face model name")
+    parser.add_argument(
+        "--max-length", type=int, default=None, help="Maximum sequence length for tokenization (default: None)"
+    )
+    parser.add_argument("--model-compile", action="store_true", help="Compile the model using torch.compile")
+    parser.add_argument("--use-wandb", action="store_true", help="Use Weights & Biases for logging")
+    parser.add_argument("--wandb-project", type=str, default=None, help="Weights & Biases project name")
+    parser.add_argument("--wandb-entity", type=str, default=None, help="Weights & Biases entity name")
+    parser.add_argument("--wandb-name", type=str, default=None, help="Gantry task name")
+    parser.add_argument("--override", action="store_true", help="Override existing files")
+    parser.add_argument("--text-key", type=str, default=".text", help="JQ key to extract text from documents")
+    parser.add_argument("--id-key", type=str, default=".id", help="JQ key to extract id from documents")
+    parser.add_argument("--num-workers", type=int, default=1, help="Number of workers for processing")
+    parser.add_argument("--log-every", type=int, default=10000, help="Log every n documents")
+    parser.add_argument("--model-dtype", type=str, default="float16", help="Data type for model")
+    parser.add_argument("--attribute-suffix", type=str, default=None, help="Optional suffix for attribute keys")
+    parser.add_argument("--prefetch-factor", type=int, default=2, help="Prefetch factor for DataLoader")
+    opts = parser.parse_args()
+
+    if opts.output_prefix is None:  # derive <base>/attributes/<sanitized model name> from a .../documents/... source
+        if "/documents/" not in opts.source_prefix:
+            raise ValueError("Output prefix is required unless source prefix contains 'documents'")
+        base, _ = opts.source_prefix.split("/documents/", 1)
+        opts.output_prefix = f"{base}/attributes/{sanitize_model_name(opts.model_name)}"
+
+    if opts.use_wandb:  # configuration is stored on the WandbLogger class, so later instances pick it up
+        WandbLogger.use_wandb = True
+        WandbLogger.project = opts.wandb_project or WandbLogger.project
+        WandbLogger.entity = opts.wandb_entity or WandbLogger.entity
+        # use name provided by user, or name of run in wandb, or sanitize model name
+        WandbLogger.name = opts.wandb_name or WandbLogger.name or sanitize_model_name(opts.model_name, opts.__dict__)
+
+    return opts
diff --git a/classifiers/src/dolma_classifiers/inference/loggers.py b/classifiers/src/dolma_classifiers/inference/loggers.py
new file mode 100644
index 00000000..a09437b3
--- /dev/null
+++ b/classifiers/src/dolma_classifiers/inference/loggers.py
@@ -0,0 +1,107 @@
+import logging
+import os
+import time
+
+import wandb
+
+from .utils import get_rank_and_world_size
+
+
+def get_logger(logger_name: str):
+    """Return the INFO-level console logger `dolma_classifiers.<logger_name>`; records carry rank and world size."""
+    rank, world_size = get_rank_and_world_size()
+
+    # Formatter that exposes the rank and world size captured above to the format string
+    class RankFormatter(logging.Formatter):
+        def format(self, record):
+            record.rank = rank
+            record.world_size = world_size
+            return super().format(record)
+
+    logger = logging.getLogger(f'dolma_classifiers.{logger_name}')
+    logger.setLevel(logging.INFO)
+
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+
+    # Create and set the custom formatter
+    formatter = RankFormatter(
+        '%(asctime)s [%(rank)d/%(world_size)d] %(levelname)s: %(message)s',
+        datefmt='%Y-%m-%d %H:%M:%S'
+    )
+    console_handler.setFormatter(formatter)
+
+    # getLogger returns the same object for the same name: replace old handlers instead of stacking duplicates
+    logger.handlers.clear()
+    logger.addHandler(console_handler)
+
+    return logger
+
+
+class WandbLogger:  # process-wide Weights & Biases settings (class attributes) plus a rank-0-only `log`
+    is_initialized = False  # set once wandb.init has been called in this process
+    use_wandb = False  # switched on by parse_args when --use-wandb is passed
+    project = os.environ.get("WANDB_PROJECT", "")
+    entity = os.environ.get("WANDB_ENTITY", "")
+    name = os.environ.get("GANTRY_TASK_NAME", "")  # default run name
+
+    def __new__(cls, *args, **kwargs):
+        rank, _ = get_rank_and_world_size()
+        if not cls.is_initialized and cls.use_wandb and rank == 0:  # start the run once, on rank 0 only
+            assert cls.project, "W&B project name is not set"
+            assert cls.entity, "W&B entity name is not set"
+            assert cls.name, "W&B run name is not set"
+            wandb.init(project=cls.project, entity=cls.entity, name=cls.name)
+            cls.is_initialized = True
+        return super().__new__(cls, *args, **kwargs)
+
+    def __init__(self):
+        self.rank, self.world_size = get_rank_and_world_size()
+
+    def log(self, **kwargs):  # forwards keyword metrics to wandb.log; a no-op unless rank 0 and use_wandb
+        if (self.rank == 0) and (self.use_wandb):
+            if step := kwargs.pop("step", None):  # a missing or falsy step (e.g. 0) is logged without an explicit step
+                wandb.log(kwargs, step=step)
+            else:
+                wandb.log(kwargs)
+
+
+class ProgressLogger:  # counts documents/files and periodically reports throughput to the console and to W&B
+    def __init__(self, log_every: int = 10_000, wandb_logger: WandbLogger | None = None):
+        self.log_every = log_every  # report once at least this many documents accumulated since the last report
+        self.logger = get_logger(self.__class__.__name__)
+        self.start_time = self.prev_time = time.time()  # creation time / time of the last report
+        self.total_docs = 0
+        self.current_docs = 0  # documents since the last report
+        self.current_files = 0  # files since the last report
+        self.total_files = 0
+        self.wandb_logger = wandb_logger
+
+    def increment(self, docs: int = 0, files: int = 0):  # add to the counters and report when due
+        self.current_docs += docs
+        self.current_files += files
+        self.total_docs += docs
+        self.total_files += files
+
+        if self.current_docs >= self.log_every or files > 0:  # also report whenever a file is counted
+            current_time = time.time()
+            docs_throughput = self.current_docs / (current_time - self.prev_time)  # NOTE(review): raises if no time elapsed
+            files_throughput = self.current_files / (current_time - self.prev_time)
+
+            self.logger.info(
+                f"Throughput: {docs_throughput:.2f} docs/s, {files_throughput:.2f} files/s " +
+                f" ({self.total_docs:.1e} docs; {self.total_files:,} files)"
+            )
+            if self.wandb_logger is not None:
+                self.wandb_logger.log(
+                    step=self.total_docs,
+                    instant_doc_throughput=docs_throughput,
+                    total_doc_throughput=self.total_docs / (current_time - self.start_time),
+                    instant_file_throughput=files_throughput,
+                    total_file_throughput=self.total_files / (current_time - self.start_time),
+                    total_files=self.total_files,
+                )
+
+            self.prev_time = current_time  # start a new reporting window
+            self.current_docs = 0
+            self.current_files = 0
diff --git a/classifiers/src/dolma_classifiers/inference/models.py b/classifiers/src/dolma_classifiers/inference/models.py
new file mode 100644
index 00000000..ef1a0dd1
--- /dev/null
+++ b/classifiers/src/dolma_classifiers/inference/models.py
@@ -0,0 +1,163 @@
+from typing import NamedTuple, Type
+
+import torch
+from huggingface_hub import PyTorchModelHubMixin
+from torch import nn
+from torch.nn import functional as F
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+)
+from transformers.modeling_outputs import SequenceClassifierOutput
+
+from .loggers import get_logger
+from .utils import sanitize_model_name
+
+
+class Prediction(NamedTuple):  # one (label, score) pair produced for a document
+    label: str  # output attribute name taken from the classifier's labels_map
+    score: float  # score the model assigned to that label
+
+
+class BaseQualityClassifier:  # loads a sequence-classification model and its tokenizer, and scores batches
+    model: PreTrainedModel
+    tokenizer: PreTrainedTokenizer
+
+    def __init__(
+        self,
+        model_name: str,
+        device: str,
+        dtype: str,
+        compile: bool = False,
+        trust_remote_code: bool = False,
+    ):
+        self.model = self._make_model(
+            model_name=model_name,
+            device=device,
+            dtype=dtype,
+            compile=compile,
+            trust_remote_code=trust_remote_code,
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)  # pyright: ignore
+
+        if len(self.model.config.id2label) > 1:  # several labels: key is "<sanitized model>_<sanitized label>"
+            def label_name_fn(label: str):
+                return f"{sanitize_model_name(model_name)}_{sanitize_model_name(label)}"
+        else:  # single label: the sanitized model name alone is the key
+            def label_name_fn(label: str):
+                return sanitize_model_name(model_name)
+
+        self.labels_map = {  # label id -> output attribute name
+            id_: label_name_fn(label)
+            for id_, label in self.model.config.id2label.items()
+        }
+
+    def _make_model(  # hook that subclasses override to load models in a different way
+        self,
+        model_name: str,
+        device: str,
+        dtype: str,
+        compile: bool,
+        trust_remote_code: bool,
+    ) -> PreTrainedModel:
+        model = AutoModelForSequenceClassification.from_pretrained(
+            pretrained_model_name_or_path=model_name,
+            torch_dtype=getattr(torch, dtype),  # dtype is the name of a torch attribute, e.g. "float16"
+            trust_remote_code=trust_remote_code,
+        )
+        model = model.to(torch.device(device))
+
+        if compile:
+            model = torch.compile(model)  # pyright: ignore
+
+        model.eval()  # pyright: ignore
+
+        return model  # pyright: ignore
+
+    @property
+    def device(self) -> torch.device:
+        return self.model.device
+
+    def score(self, **batch: torch.Tensor) -> list[list[Prediction]]:  # one list of Predictions per input row
+        outputs = self.model(**batch)
+        scores = (  # softmax over the labels, except for single-logit models whose raw logit is kept
+            F.softmax(outputs.logits, dim=-1) if outputs.logits.size(-1) != 1 else outputs.logits
+        )
+        return [
+            [Prediction(label=self.labels_map[i], score=float(score)) for i, score in enumerate(row)]
+            for row in scores.float().cpu().numpy()
+        ]
+
+
+class Registry:
+    """Maps model names to classifier classes; unknown names fall back to BaseQualityClassifier."""
+    _registry: dict[str, Type[BaseQualityClassifier]] = {}
+    _logger = get_logger("ModelRegistry")
+
+    def __new__(cls, *args, **kwargs):
+        return cls
+
+    @classmethod
+    def add(cls, classifier_name: str):
+        def _add(classifier: Type[BaseQualityClassifier]):
+            cls._registry[classifier_name] = classifier
+            return classifier  # hand the class back so the decorated name is not rebound to None
+        return _add
+
+    @classmethod
+    def get(cls, model_name: str, **kwargs) -> BaseQualityClassifier:
+        if model_name not in cls._registry:
+            cls._logger.warning(f"Classifier {model_name} not found in registry; using default classifier")
+        return cls._registry.get(model_name, BaseQualityClassifier)(model_name=model_name, **kwargs)
+
+
+@Registry.add("HuggingFaceFW/fineweb-edu-classifier")
+class FineWebEduClassifier(BaseQualityClassifier):
+    pass  # inherits loading and scoring from BaseQualityClassifier unchanged
+
+
+class QualityModel(nn.Module, PyTorchModelHubMixin):  # base encoder followed by dropout and a linear head
+    def __init__(self, config):
+        super(QualityModel, self).__init__()
+        self.model = AutoModel.from_pretrained(config["base_model"])
+        self.dropout = nn.Dropout(config["fc_dropout"])
+        self.fc = nn.Linear(self.model.config.hidden_size, len(config["id2label"]))  # one output per label
+
+    @property
+    def device(self):
+        return self.model.device
+
+    def forward(self, input_ids, attention_mask, **kwargs):  # extra keyword inputs are accepted and ignored
+        features = self.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
+        dropped = self.dropout(features)
+        outputs = self.fc(dropped)
+        return SequenceClassifierOutput(logits=outputs[:, 0, :])  # keeps index 0 of the second axis (presumably the first token; confirm)
+
+
+@Registry.add("nvidia/quality-classifier-deberta")
+class DebertaQualityClassifier(BaseQualityClassifier):  # loads the checkpoint through the custom QualityModel
+    def _make_model(
+        self,
+        model_name: str,
+        device: str,
+        dtype: str,
+        compile: bool,
+        trust_remote_code: bool,  # accepted for signature compatibility; not used in this override
+    ) -> PreTrainedModel:
+        model = QualityModel.from_pretrained(model_name)
+        model = model.to(getattr(torch, dtype))  # cast the weights to the requested dtype
+        model = model.to(torch.device(device))
+
+        if compile:
+            model = torch.compile(model)  # pyright: ignore
+
+        model.eval()  # pyright: ignore
+
+        # for some reason the config is not loaded automatically; need to set it manually
+        model.config = AutoConfig.from_pretrained(model_name)  # pyright: ignore
+
+        return model  # pyright: ignore
diff --git a/classifiers/src/dolma_classifiers/inference/utils.py b/classifiers/src/dolma_classifiers/inference/utils.py
new file mode 100644
index 00000000..b95f1c0b
--- /dev/null
+++ b/classifiers/src/dolma_classifiers/inference/utils.py
@@ -0,0 +1,59 @@
+import os
+import re
+from hashlib import md5
+from typing import Any
+
+import msgspec
+import torch
+import torch.distributed as dist
+from smart_open.compression import (
+    _handle_zstd,
+    get_supported_compression_types,
+    register_compressor,
+)
+
+
+def get_rank_and_world_size():  # returns (rank, world_size) of the current process
+    if dist.is_initialized():
+        return dist.get_rank(), dist.get_world_size()
+    else:
+        return 0, 1  # no process group: behave as a single process
+
+
+def get_local_gpu_rank() -> int:
+    """Returns the local GPU rank for the current process using torch.distributed."""
+    if dist.is_initialized():
+        return dist.get_rank() % torch.cuda.device_count()  # global rank modulo the number of visible GPUs
+    else:
+        return 0  # no process group: use index 0
+
+
+def setup() -> tuple[int, int]:  # optionally initializes the process group, then returns (rank, world_size)
+    if (rank := os.environ.get("RANK")) and (world_size := os.environ.get("WORLD_SIZE")):  # both set and non-empty
+        dist.init_process_group("nccl", rank=int(rank), world_size=int(world_size))
+
+    os.environ["CUDA_VISIBLE_DEVICES"] = str(get_local_gpu_rank())  # NOTE(review): presumably pins this process to one GPU; confirm
+
+    return get_rank_and_world_size()
+
+
+def cleanup():  # tears down the process group if one was initialized; otherwise does nothing
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
+def sanitize_model_name(model_name: str, suffix_data: Any = None) -> str:
+    """Turn `model_name` into an identifier made of letters, digits and single underscores.
+
+    When `suffix_data` is truthy, "_" plus the first 6 hex digits of the md5 of its JSON encoding is appended.
+    """
+    collapsed = re.sub("_{2,}", "_", re.sub("[^a-zA-Z0-9_]", "_", model_name))
+    sanitized = collapsed.strip("_")
+    if not suffix_data:
+        return sanitized
+    encoded = msgspec.json.Encoder().encode(suffix_data)
+    return sanitized + f"_{md5(encoded).hexdigest()[:6]}"
+
+
+if ".zstd" not in get_supported_compression_types():  # import-time side effect: teach smart_open the ".zstd" extension
+    register_compressor(".zstd", _handle_zstd)  # reuses smart_open's private zstd handler
diff --git a/classifiers/src/dolma_classifiers/label/__init__.py b/classifiers/src/dolma_classifiers/label/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/classifiers/src/dolma_classifiers/label/__main__.py b/classifiers/src/dolma_classifiers/label/__main__.py
new file mode 100644
index 00000000..d3f339be
--- /dev/null
+++ b/classifiers/src/dolma_classifiers/label/__main__.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+
+import argparse
+import glob
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Any, Dict, List
+from urllib.parse import urlparse
+
+import grequests
+import jinja2
+import urllib3
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+OPENAI_API_ENDPOINT = "https://api.openai.com/v1/chat/completions"
+
+
+class DocumentProcessor:
+    """Label JSONL documents by sending each one to the OpenAI chat completions API.
+
+    Every input file is read as JSON Lines and split into batches of ``batch_size``
+    documents. Each document is rendered through a Jinja2 prompt template, the requests
+    of a batch are sent concurrently with ``grequests``, and the results are appended
+    (one JSON object per line) to ``processed_<input file name>`` inside ``destination``.
+    """
+
+    def __init__(
+        self,
+        documents_path: str,
+        destination: str,
+        prompt_template: str,
+        api_key: str,
+        batch_size: int = 5,
+        max_retries: int = 3,
+        retry_delay: int = 1
+    ):
+        """
+        Args:
+            documents_path: Glob pattern for local JSONL files, or an http(s) URL whose
+                body lists one file URL per line.
+            destination: Directory where the output files are written.
+            prompt_template: Jinja2 template source; rendered with each document's
+                fields as template variables.
+            api_key: OpenAI API key, sent as a bearer token.
+            batch_size: Number of documents requested concurrently.
+            max_retries: Stored on the instance; no method of this class reads it yet.
+            retry_delay: Stored on the instance; no method of this class reads it yet.
+        """
+        self.documents_path = documents_path
+        self.destination = destination
+        self.prompt_template = prompt_template
+        self.api_key = api_key
+        self.batch_size = batch_size
+        self.max_retries = max_retries
+        self.retry_delay = retry_delay
+        self.template = jinja2.Template(prompt_template)
+
+    def _create_request(
+        self, document: Dict[str, Any]
+    ) -> "tuple[grequests.AsyncRequest, Dict[str, Any]] | None":
+        """Build the (unsent) chat-completion request for one document.
+
+        Returns:
+            ``(request, document)`` so the response can later be matched back to its
+            document, or ``None`` when the request could not be built (for example the
+            template failed to render); the error is logged in that case.
+        """
+        try:
+            # Render the prompt template with document fields
+            prompt = self.template.render(**document)
+
+            payload = {
+                "model": "gpt-4",
+                "messages": [
+                    {"role": "system", "content": "You are a helpful assistant that processes documents."},
+                    {"role": "user", "content": prompt}
+                ]
+            }
+
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.api_key}"
+            }
+
+            request = grequests.post(
+                OPENAI_API_ENDPOINT,
+                json=payload,
+                headers=headers,
+                timeout=30
+            )
+            return request, document
+
+        except Exception as e:
+            logger.error(f"Error creating request: {e}")
+            return None
+
+    def _process_response(self, response, document: Dict[str, Any]) -> Dict[str, Any]:
+        """Store the outcome of one API call on its document and return the document.
+
+        On HTTP 200 the model's reply is stored under ``gpt4_response``; otherwise, or if
+        anything goes wrong while reading the response (including ``response`` being
+        ``None``), a description is stored under ``error``. The document is modified in
+        place.
+        """
+        try:
+            if response.status_code == 200:
+                result = response.json()
+                document['gpt4_response'] = result['choices'][0]['message']['content']
+            else:
+                document['error'] = f"API Error: {response.status_code} - {response.text}"
+        except Exception as e:
+            document['error'] = f"Processing Error: {str(e)}"
+
+        return document
+
+    def _process_batch(self, batch: List[Dict[str, Any]], output_file: str):
+        """Send one batch of documents concurrently and append the results to ``output_file``.
+
+        Documents whose request could not be built are dropped from the batch.
+        """
+        request_pairs = [pair for pair in map(self._create_request, batch) if pair is not None]
+        if not request_pairs:
+            # zip(*[]) cannot be unpacked into two names; without this guard a batch in
+            # which every request failed to build would abort the rest of the file.
+            logger.warning("No request could be created for this batch; skipping it")
+            return
+        requests, documents = zip(*request_pairs)
+
+        # Make async requests
+        responses = grequests.map(requests, size=len(requests))
+
+        # Process responses and write to file
+        with open(output_file, 'a') as f:
+            for response, document in zip(responses, documents):
+                result = self._process_response(response, document)
+                f.write(json.dumps(result) + '\n')
+
+    def _download_file(self, url: str, local_path: str) -> str:
+        """Download ``url`` as UTF-8 text into ``local_path`` and return ``local_path``.
+
+        Raises:
+            Exception: If the server does not answer with status 200.
+        """
+        with urllib3.PoolManager() as http:
+            response = http.request('GET', url)
+            if response.status != 200:
+                raise Exception(f"Failed to download file: {response.status}")
+            with open(local_path, 'w') as f:
+                f.write(response.data.decode('utf-8'))
+            return local_path
+
+    def _get_file_paths(self) -> List[str]:
+        """Return the local paths of the files to process.
+
+        For an http(s) ``documents_path`` the URL is fetched, each non-blank line of its
+        body is treated as a file URL, and the files are downloaded into
+        ``temp_downloads``. Any other ``documents_path`` is expanded as a local glob.
+        """
+        if urlparse(self.documents_path).scheme not in ('http', 'https'):
+            # Handle local files
+            return glob.glob(self.documents_path)
+
+        # Handle remote files
+        temp_dir = Path('temp_downloads')
+        temp_dir.mkdir(exist_ok=True)
+
+        local_paths = []
+        with urllib3.PoolManager() as http:
+            response = http.request('GET', self.documents_path)
+            if response.status != 200:
+                logger.error(f"Failed to download file list {self.documents_path}: {response.status}")
+                return local_paths
+            for url in response.data.decode('utf-8').splitlines():
+                url = url.strip()
+                if not url:
+                    # A blank line (e.g. a trailing newline) is not a URL.
+                    continue
+                local_path = temp_dir / Path(urlparse(url).path).name
+                self._download_file(url, str(local_path))
+                local_paths.append(str(local_path))
+        return local_paths
+
+    def process_files(self):
+        """Process every input file batch by batch.
+
+        An error while handling one file is logged and the remaining files are still
+        processed.
+        """
+        # Create destination directory if it doesn't exist
+        os.makedirs(self.destination, exist_ok=True)
+
+        file_paths = self._get_file_paths()
+        logger.info(f"Found {len(file_paths)} files to process")
+
+        for file_path in file_paths:
+            try:
+                # Blank lines are not JSON documents; skip them instead of failing the file.
+                with open(file_path, 'r') as f:
+                    documents = [json.loads(line) for line in f if line.strip()]
+
+                output_file = os.path.join(
+                    self.destination,
+                    f"processed_{os.path.basename(file_path)}"
+                )
+
+                # Process documents in batches
+                for i in range(0, len(documents), self.batch_size):
+                    batch = documents[i:i + self.batch_size]
+                    self._process_batch(batch, output_file)
+                    logger.info(f"Processed batch {i//self.batch_size + 1} of file {file_path}")
+
+            except Exception as e:
+                logger.error(f"Error processing file {file_path}: {e}")
+
+def main():
+    """CLI entry point: parse arguments, load the prompt template and run the processor.
+
+    ``--prompt`` is either the template text itself or the path of a file containing it.
+    ``--api-key`` may be omitted when the ``OPENAI_API_KEY`` environment variable is set,
+    which keeps the key off the command line.
+    """
+    parser = argparse.ArgumentParser(description='Process documents with GPT-4')
+    parser.add_argument('--documents', required=True, help='Glob pattern for input documents')
+    parser.add_argument('--destination', required=True, help='Output directory')
+    parser.add_argument('--prompt', required=True, help='Prompt template')
+    parser.add_argument(
+        '--api-key',
+        default=os.environ.get('OPENAI_API_KEY'),
+        help='OpenAI API key (defaults to the OPENAI_API_KEY environment variable)'
+    )
+    parser.add_argument('--batch-size', type=int, default=5, help='Batch size for processing')
+
+    args = parser.parse_args()
+    if not args.api_key:
+        parser.error('an API key is required: pass --api-key or set OPENAI_API_KEY')
+
+    # Read prompt template from file if it's a file path
+    prompt_template = args.prompt
+    if os.path.isfile(args.prompt):
+        with open(args.prompt, 'r') as f:
+            prompt_template = f.read()
+
+    processor = DocumentProcessor(
+        documents_path=args.documents,
+        destination=args.destination,
+        prompt_template=prompt_template,
+        api_key=args.api_key,
+        batch_size=args.batch_size
+    )
+
+    # Run the processor
+    processor.process_files()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/classifiers/src/dolma_classifiers/label/api.py b/classifiers/src/dolma_classifiers/label/api.py
new file mode 100644
index 00000000..f3976c9d
--- /dev/null
+++ b/classifiers/src/dolma_classifiers/label/api.py
@@ -0,0 +1,50 @@
+import os
+from dataclasses import dataclass, field
+
+import aiohttp
+
+
+@dataclass(frozen=True)
+class Message:
+    """One chat message: the speaker's ``role`` and the message ``content``."""
+
+    role: str
+    content: str
+
+    def to_dict(self):
+        """Return the message as a plain dict with ``role`` and ``content`` keys."""
+        return dict(role=self.role, content=self.content)
+
+
+@dataclass(frozen=True)
+class BaseApiRequest:
+    """A chat-style API request: POSTs ``messages`` plus extra ``parameters`` as JSON to ``endpoint``."""
+
+    endpoint: str
+    messages: list[Message]
+    # Extra top-level keys of the JSON payload; a "messages" key here is overridden.
+    parameters: dict = field(default_factory=dict)
+    # HTTP headers sent with the request.
+    headers: dict = field(default_factory=dict)
+
+    async def make(self):
+        """Send the request and return the decoded JSON body of the response.
+
+        Raises:
+            aiohttp.ClientResponseError: If the server answers with an error status (>= 400).
+        """
+        payload = {**self.parameters, "messages": [message.to_dict() for message in self.messages]}
+        async with aiohttp.ClientSession() as session:
+            async with session.post(self.endpoint, json=payload, headers=self.headers) as response:
+                # aiohttp does not raise on HTTP error statuses by default; without this an
+                # error body would be returned to the caller as if the call had succeeded.
+                response.raise_for_status()
+                return await response.json()
+
+
+@dataclass(frozen=True)
+class Gpt4oRequest(BaseApiRequest):
+    """A ``BaseApiRequest`` whose payload always carries ``model``, ``temperature`` and ``top_p``.
+
+    The values of these three fields take precedence over same-named keys passed in
+    ``parameters``.
+    """
+
+    model: str = "gpt-4o"
+    temperature: float = 1.0
+    top_p: float = 1.0
+    # Built when the instance is created; if OPENAI_API_KEY is unset, os.getenv returns
+    # None and the header value becomes the literal string "Bearer None".
+    headers: dict = field(
+        default_factory=lambda: {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"
+        }
+    )
+
+    def __post_init__(self):
+        # Build a new dict instead of updating in place, so a `parameters` dict supplied
+        # (and possibly reused) by the caller is never modified. object.__setattr__ is
+        # required because the dataclass is frozen.
+        merged = {
+            **self.parameters,
+            "model": self.model,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+        }
+        object.__setattr__(self, "parameters", merged)
diff --git a/classifiers/src/dolma_classifiers/label/templates.py b/classifiers/src/dolma_classifiers/label/templates.py
new file mode 100644
index 00000000..15c5749b
--- /dev/null
+++ b/classifiers/src/dolma_classifiers/label/templates.py
@@ -0,0 +1,107 @@
+from typing import Any, Dict, Optional
+
+import jq
+
+
+class JqTemplate:
+    """
+    A template engine that processes strings containing JQ expressions in {expression} syntax.
+    Supports escaping curly braces with {{ and }}.
+
+    An expression ends at the first "}" after its "{", so JQ expressions that themselves
+    contain braces (for example object construction) cannot be used in a template.
+    """
+
+    def __init__(self, template_string: str):
+        """
+        Initialize the template with a template string.
+
+        Args:
+            template_string: The template string containing JQ expressions in {expression} syntax
+
+        Raises:
+            ValueError: If an opening brace is never closed or a JQ expression does not compile
+        """
+        self.template_string = template_string
+        self._compiled = self._compile_template(template_string)
+
+    @staticmethod
+    def _compile_template(template_string: str) -> list[tuple[str, Optional[jq.jq]]]:
+        """
+        Compile the template string into a list of (text, expression) tuples.
+
+        Literal text is stored as (text, None); each expression is stored as
+        ("", compiled_jq_expression). A "}" with no preceding "{" is kept as literal text.
+
+        Args:
+            template_string: The template string to compile
+
+        Returns:
+            List of tuples containing (text, compiled_jq_expression)
+
+        Raises:
+            ValueError: If an opening brace has no closing brace, or a JQ expression is invalid
+        """
+        # Escaped braces are swapped for sentinels so the scan below only sees the braces
+        # that delimit expressions; the sentinels are turned back into braces in the text.
+        left_sentinel, right_sentinel = "\0LEFT_BRACE\0", "\0RIGHT_BRACE\0"
+
+        def restore_braces(text: str) -> str:
+            return text.replace(left_sentinel, "{").replace(right_sentinel, "}")
+
+        template_string = template_string.replace("{{", left_sentinel).replace("}}", right_sentinel)
+
+        parts = []
+        current_pos = 0
+        while current_pos < len(template_string):
+            # Find next unescaped opening brace
+            start = template_string.find("{", current_pos)
+
+            if start == -1:
+                # No more expressions, add remaining text
+                parts.append((restore_braces(template_string[current_pos:]), None))
+                break
+
+            # Add text before the expression
+            if start > current_pos:
+                parts.append((restore_braces(template_string[current_pos:start]), None))
+
+            # Find matching closing brace
+            end = template_string.find("}", start)
+            if end == -1:
+                # The reported position refers to the string after sentinel substitution.
+                raise ValueError(f"Unmatched opening brace at position {start}")
+
+            # Extract and compile JQ expression
+            expr = template_string[start + 1:end].strip()
+            try:
+                compiled_expr = jq.compile(expr)
+            except ValueError as e:
+                raise ValueError(f"Invalid JQ expression '{expr}': {str(e)}") from e
+
+            parts.append(("", compiled_expr))
+            current_pos = end + 1
+
+        return parts
+
+    def render(self, data: Dict[str, Any]) -> str:
+        """
+        Render the template by evaluating all JQ expressions against the provided data.
+
+        Each expression contributes ``str()`` of its first result. An expression that
+        yields no result, or whose first result is null, contributes an empty string;
+        other falsy results such as 0 or false are rendered like any other value.
+
+        Args:
+            data: Dictionary containing the data to evaluate expressions against
+
+        Returns:
+            The rendered template string
+
+        Raises:
+            ValueError: If any JQ expression fails to evaluate
+        """
+        result = []
+
+        for text, expr in self._compiled:
+            result.append(text)
+            if expr is None:
+                continue
+
+            try:
+                # Evaluate expression and get first result
+                evaluated = expr.input(data).first()
+            except StopIteration:
+                # No results from JQ expression
+                continue
+            except Exception as e:
+                raise ValueError(f"Error evaluating expression: {str(e)}") from e
+
+            # Only null renders as empty; `evaluated or ""` would also blank out 0 and false.
+            if evaluated is not None:
+                result.append(str(evaluated))
+
+        return "".join(result)
diff --git a/classifiers/src/dolma_classifiers/train.py b/classifiers/src/dolma_classifiers/train.py
new file mode 100644
index 00000000..23747b34
--- /dev/null
+++ b/classifiers/src/dolma_classifiers/train.py
@@ -0,0 +1,94 @@
+import multiprocessing
+from dataclasses import dataclass
+from functools import partial
+from typing import Callable
+from urllib.parse import urlparse
+
+import fsspec
+import jq
+import smart_open
+from msgspec.json import Decoder
+from torch.utils.data import Dataset
+from tqdm import tqdm
+
+
+@dataclass(frozen=True)
+class Document:
+    """An immutable training example: a document's ``text`` and its ``label`` as a string."""
+
+    text: str
+    label: str
+
+
+def _label_selector_fn(row: dict, selector: Callable | None, label: str | None) -> str:
+    """Return the label of ``row`` as a string.
+
+    Args:
+        row: The decoded document.
+        selector: Either a compiled jq program (evaluated through its ``input`` method,
+            the same way ``label/templates.py`` evaluates programs) or a plain callable
+            taking the row; in both cases the ``first()`` result is used. Takes
+            precedence over ``label``.
+        label: Fixed label used when no selector is given.
+
+    Raises:
+        ValueError: If neither ``selector`` nor ``label`` is provided.
+    """
+    if selector is not None:
+        # Prefer the program's explicit `input` method and fall back to calling the
+        # object, so plain callables keep working.
+        run = getattr(selector, "input", selector)
+        return str(run(row).first())
+    if label is not None:
+        return str(label)
+    raise ValueError("Either `label` or `selector` must be provided")
+
+
+def read_file(path: str, label: str | None = None, selector: str | None = None) -> list[Document]:
+    """Read a JSON Lines file into a list of ``Document`` objects.
+
+    Args:
+        path: File to read; opened with ``smart_open``.
+        label: Fixed label given to every document when ``selector`` is not set.
+        selector: jq expression used to extract the label from each row.
+
+    Returns:
+        One ``Document`` per line, built from the row's ``"text"`` field and its label.
+    """
+    compiled_selector = jq.compile(selector) if selector else None
+    decoder = Decoder()
+
+    with smart_open.open(path) as stream:
+        rows = (decoder.decode(line) for line in stream)
+        return [
+            Document(
+                text=row["text"],
+                label=_label_selector_fn(row, label=label, selector=compiled_selector),
+            )
+            for row in rows
+        ]
+
+
+@dataclass(frozen=True)
+class DataConfig:
+    """Where to read documents from and how to label them.
+
+    ``label`` is a fixed label for every document; ``selector`` is a jq expression that
+    extracts the label from each row.
+    """
+
+    path: str
+    label: str | None = None
+    selector: str | None = None
+
+    @staticmethod
+    def expand(data_config: "DataConfig", fs: fsspec.AbstractFileSystem | None = None) -> list["DataConfig"]:
+        """Expand a config whose path contains ``*`` into one config per matching file.
+
+        Args:
+            data_config: The config to expand.
+            fs: Filesystem used for globbing; when omitted it is derived from the URL
+                scheme of the path.
+
+        Returns:
+            One config per matched path, or a single config with the original path when
+            it contains no ``*``. Label and selector are copied unchanged.
+        """
+        filesystem = fs or fsspec.get_filesystem_class(urlparse(data_config.path).scheme)()
+        assert filesystem is not None, f"Could not determine filesystem for {data_config.path}"
+
+        if "*" in data_config.path:
+            # NOTE(review): fsspec glob results may omit the URL scheme for remote
+            # filesystems - confirm the paths are still readable by read_file.
+            matched_paths = [str(p) for p in filesystem.glob(data_config.path)]
+        else:
+            matched_paths = [data_config.path]
+
+        return [
+            DataConfig(path=matched, label=data_config.label, selector=data_config.selector)
+            for matched in matched_paths
+        ]
+
+
+def _read_data_config(config: DataConfig) -> list[Document]:
+    """Read the documents of one expanded config.
+
+    Defined at module level because ``multiprocessing`` has to pickle the function it
+    sends to worker processes, and a lambda cannot be pickled.
+    """
+    return read_file(path=config.path, label=config.label, selector=config.selector)
+
+
+class ClassifierDataset(Dataset):
+    """In-memory dataset of labelled documents loaded from one or more ``DataConfig``s.
+
+    Glob paths are expanded first, then all files are read in a process pool.
+    ``self.documents`` is a flat list of ``Document`` objects, so ``len(dataset)`` is the
+    number of documents and ``dataset[i]`` is a single document. Because files are read
+    with ``imap_unordered``, the order of documents across files is not deterministic.
+    """
+
+    def __init__(
+        self,
+        configs: list[DataConfig],
+        workers: int = 1,
+    ):
+        """
+        Args:
+            configs: Data sources to load; paths containing ``*`` are expanded.
+            workers: Number of worker processes used for expanding and reading.
+        """
+        with multiprocessing.Pool(workers) as pool:
+            expanded_configs: list[DataConfig] = [
+                data_config
+                for data_configs in tqdm(
+                    pool.imap_unordered(DataConfig.expand, configs),
+                    total=len(configs),
+                    desc="Expanding configs",
+                )
+                for data_config in data_configs
+            ]
+
+            # Each worker returns the list of documents of one file; flatten them so the
+            # dataset is indexed by document rather than by file.
+            self.documents: list[Document] = [
+                document
+                for file_documents in tqdm(
+                    pool.imap_unordered(_read_data_config, expanded_configs),
+                    total=len(expanded_configs),
+                    desc="Reading files",
+                )
+                for document in file_documents
+            ]
+
+        print(f"Read {len(self.documents)} documents from {len(expanded_configs)} configs")
+
+    def __len__(self):
+        """Return the number of documents."""
+        return len(self.documents)
+
+    def __getitem__(self, idx):
+        """Return the document at position ``idx``."""
+        return self.documents[idx]
diff --git a/classifiers/tests/__init__.py b/classifiers/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/classifiers/tests/test_api.py b/classifiers/tests/test_api.py
new file mode 100644
index 00000000..b0e34b95
--- /dev/null
+++ b/classifiers/tests/test_api.py
@@ -0,0 +1,88 @@
+import aiohttp
+import pytest
+from aioresponses import aioresponses
+from dolma_classifiers.label.api import (
+    BaseApiRequest,
+    Message,
+)
+
+
+@pytest.fixture
+def mock_api():
+    """Yield an ``aioresponses`` mock that stays active for the duration of a test."""
+    with aioresponses() as mocked:
+        yield mocked
+
+
+@pytest.mark.asyncio
+async def test_successful_api_request(mock_api):
+    """A 200 response is returned to the caller as the decoded JSON body."""
+    url = "https://api.example.com/v1/chat"
+    body = {"response": "Hello, world!", "status": "success"}
+    mock_api.post(url, status=200, payload=body)
+
+    api_request = BaseApiRequest(
+        endpoint=url,
+        messages=[Message(role="user", content="Hello!")],
+        headers={"Authorization": "Bearer test-token"},
+    )
+
+    assert await api_request.make() == body
+
+
+@pytest.mark.asyncio
+async def test_api_request_with_error(mock_api):
+    """A 401 response must surface as ``aiohttp.ClientResponseError`` with status 401."""
+    url = "https://api.example.com/v1/chat"
+    mock_api.post(url, status=401, payload={"error": "Invalid token", "status": "error"})
+
+    api_request = BaseApiRequest(
+        endpoint=url,
+        messages=[Message(role="user", content="Hello!")],
+        headers={"Authorization": "Bearer invalid-token"},
+    )
+
+    with pytest.raises(aiohttp.ClientResponseError) as raised:
+        await api_request.make()
+    assert raised.value.status == 401
+
+
+@pytest.mark.asyncio
+async def test_api_request_payload(mock_api):
+    """The JSON body must contain the extra parameters merged with the serialized messages."""
+    # Arrange
+    endpoint = "https://api.example.com/v1/chat"
+    messages = [Message(role="user", content="Hello!")]
+    parameters = {"temperature": 0.7}
+
+    expected_payload = {"messages": [{"role": "user", "content": "Hello!"}], "temperature": 0.7}
+
+    def match_payload(url, **kwargs):
+        # Deliberately returns None: aioresponses uses a non-None callback result as the
+        # description of the response to build, so returning True would break the mocked
+        # reply instead of signalling a match.
+        assert kwargs["json"] == expected_payload
+
+    mock_api.post(endpoint, status=200, callback=match_payload)
+
+    request = BaseApiRequest(endpoint=endpoint, messages=messages, parameters=parameters)
+
+    # Act
+    await request.make()  # If no assertion error is raised, the payload matched
+
+
+@pytest.mark.asyncio
+async def test_network_error(mock_api):
+    """A connection failure propagates to the caller as ``aiohttp.ClientConnectionError``."""
+    url = "https://api.example.com/v1/chat"
+    mock_api.post(url, exception=aiohttp.ClientConnectionError())
+
+    api_request = BaseApiRequest(endpoint=url, messages=[Message(role="user", content="Hello!")])
+
+    with pytest.raises(aiohttp.ClientConnectionError):
+        await api_request.make()
diff --git a/classifiers/tests/test_templates.py b/classifiers/tests/test_templates.py
new file mode 100644
index 00000000..183a790d
--- /dev/null
+++ b/classifiers/tests/test_templates.py
@@ -0,0 +1,81 @@
+import unittest
+
+from dolma_classifiers.label.templates import JqTemplate
+
+
+class TestJqTemplate(unittest.TestCase):
+    """Rendering and compile-time error behaviour of ``JqTemplate``."""
+
+    def setUp(self):
+        """Create the sample record every test renders against."""
+        self.test_data = {
+            "name": "John",
+            "age": 30,
+            "address": {"street": "123 Main St", "city": "Springfield"},
+            "hobbies": ["reading", "hiking", "coding"],
+        }
+
+    def _assert_renders(self, template_string, expected):
+        """Compile ``template_string``, render it with the sample record, compare with ``expected``."""
+        rendered = JqTemplate(template_string).render(self.test_data)
+        self.assertEqual(rendered, expected)
+
+    def test_basic_expression(self):
+        """A single top-level field is substituted."""
+        self._assert_renders("Hello, {.name}!", "Hello, John!")
+
+    def test_nested_object_access(self):
+        """Fields of a nested object can be reached."""
+        self._assert_renders("Address: {.address.street}, {.address.city}", "Address: 123 Main St, Springfield")
+
+    def test_array_access(self):
+        """Array elements can be indexed."""
+        self._assert_renders("First hobby: {.hobbies[0]}", "First hobby: reading")
+
+    def test_complex_jq_expression(self):
+        """Piped JQ expressions are evaluated."""
+        self._assert_renders('Hobbies: {.hobbies | join(", ")}', "Hobbies: reading, hiking, coding")
+
+    def test_escaped_braces(self):
+        """Doubled braces come out as literal single braces."""
+        self._assert_renders("User {{.name}} is {.age} years old", "User {.name} is 30 years old")
+
+    def test_multiple_expressions(self):
+        """Several expressions in one template are all substituted."""
+        self._assert_renders("{.name} lives at {.address.street}", "John lives at 123 Main St")
+
+    def test_missing_field(self):
+        """A field that does not exist renders as an empty string."""
+        self._assert_renders("Name: {.missing_field}", "Name: ")
+
+    def test_unmatched_brace(self):
+        """An opening brace without a closing one is rejected at construction."""
+        with self.assertRaises(ValueError):
+            JqTemplate("Hello {.name")
+
+    def test_invalid_jq_expression(self):
+        """An expression that is not valid JQ is rejected at construction."""
+        with self.assertRaises(ValueError):
+            JqTemplate("Hello {invalid!}")
+
+    def test_empty_template(self):
+        """An empty template renders to an empty string."""
+        self._assert_renders("", "")
+
+    def test_template_without_expressions(self):
+        """Text without expressions is returned unchanged."""
+        self._assert_renders("Hello, world!", "Hello, world!")
+
+    def test_adjacent_expressions(self):
+        """Expressions with no text between them are concatenated."""
+        self._assert_renders("{.name}{.age}", "John30")
+
+    def test_whitespace_handling(self):
+        """Whitespace around an expression inside the braces is ignored."""
+        self._assert_renders("Hello, {  .name   }!", "Hello, John!")
diff --git a/configs/cc-news/dedupe_by_lang.sh b/configs/cc-news/dedupe_by_lang.sh
new file mode 100644
index 00000000..35706739
--- /dev/null
+++ b/configs/cc-news/dedupe_by_lang.sh
@@ -0,0 +1,75 @@
+#! /usr/bin/env bash
+#
+# Paragraph-level fuzzy deduplication (13-gram Bloom filter) of cc-news, run
+# separately for every language directory under ${base_dir}. For each language a
+# temporary dolma config is generated and `dolma dedupe` is run with it.
+# Relies on GNU tools (`stat -c`, `sort -z`, `nproc`).
+
+base_dir="${HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents"
+
+# Language directory names, largest directory first.
+langs=($(du -sh "${base_dir}"/* 2>/dev/null | sort -hr | awk '{print $2}' | xargs -n1 basename))
+
+for lang in "${langs[@]}"; do
+    # Collect the document files sorted by name, plus their total size in bytes.
+    # NUL-delimited all the way so paths containing whitespace stay intact.
+    documents=()
+    size=0
+    while IFS= read -r -d '' file; do
+      documents+=("$file")
+      size=$(( size + $(stat -c %s "$file") ))
+    done < <(find "${base_dir}/${lang}" -type f \( -name "*.zst" -o -name "*.gz" -o -name "*.gzip" -o -name "*.json" -o -name "*.jsonl" \) -print0 | sort -z)
+
+    if (( ${#documents[@]} == 0 )); then
+        echo "No documents found for ${lang}; skipping"
+        continue
+    fi
+
+    # run deduplication
+    echo "Running fuzzy dedupe for ${lang} with ${size} bytes Bloom filter (files: ${#documents[@]})"
+
+    # Build the YAML `documents:` list with real newlines, so the config can be written
+    # verbatim instead of being passed through printf as a format string.
+    document_linearized="documents:"$'\n'
+    for doc in "${documents[@]}"; do
+        document_linearized+="  - ${doc}"$'\n'
+    done
+
+    # NOTE(review): estimated_doc_count is set to the total input size in bytes -
+    # presumably a deliberate over-estimate; confirm.
+    config_yaml=$(cat <<EOF
+${document_linearized}
+dedupe:
+  name: dedupe_by_lang
+  paragraphs:
+    attribute_name: dedupe_ngrams_13_1
+    by_ngram:
+      ngram_length: 13
+      stride: 1
+      overlap_threshold: 0.5
+      skip_short_paragraphs: true
+  skip_empty: true
+
+bloom_filter:
+  file: /tmp/cc_news_${lang}_dedupe_ngram.bin
+  read_only: false
+  estimated_doc_count: ${size}
+  desired_false_positive_rate: 0.1
+
+work_dir:
+  input: /tmp/cc_news_${lang}_dedupe_ngrams_13_1/input
+  output: /tmp/cc_news_${lang}_dedupe_ngrams_13_1/output
+EOF
+)
+    # Set the number of processes to the minimum of the number of documents
+    # and <number of available processors - 4> to leave some room for other processes,
+    # but never below 1 (machines with 4 or fewer processors).
+    processes=$(( $(nproc) - 4 ))
+    if (( processes > ${#documents[@]} )); then
+        processes=${#documents[@]}
+    fi
+    if (( processes < 1 )); then
+        processes=1
+    fi
+
+    # Create a temporary file for the YAML config
+    temp_config_file=$(mktemp)
+
+    # Write the YAML config to the temporary file
+    printf '%s\n' "$config_yaml" > "$temp_config_file"
+
+
+    set -ex
+    # Run dolma with the temporary config file
+    dolma -c "$temp_config_file" dedupe --processes "${processes}"
+    # cat "$temp_config_file"
+    set +ex
+
+
+    # Remove the temporary file, the Bloom filter and the work directories.
+    # The glob has to stay outside the quotes, otherwise it is never expanded.
+    rm "$temp_config_file"
+    rm -rf "/tmp/cc_news_${lang}_dedupe_"*
+
+done
diff --git a/configs/cc-news/dedupe_by_year.sh b/configs/cc-news/dedupe_by_year.sh
new file mode 100644
index 00000000..321af0b4
--- /dev/null
+++ b/configs/cc-news/dedupe_by_year.sh
@@ -0,0 +1,69 @@
+#! /usr/bin/env bash
+#
+# Paragraph-level fuzzy deduplication (20-gram Bloom filter) of cc-news, run
+# separately for every year directory under ${base_dir}. For each year a temporary
+# dolma config is generated and `dolma dedupe` is run with it.
+# Relies on GNU tools (`stat -c`, `nproc`).
+
+base_dir="${HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents"
+
+# run years between 2016 and 2024
+for year in {2016..2024}; do
+    # Initialize an empty array to store document paths and a variable for total size
+    documents=()
+    size=0
+    while IFS= read -r -d '' file; do
+      documents+=("$file")
+      size=$(( size + $(stat -c %s "$file") ))
+    done < <(find "${base_dir}/${year}" -type f \( -name "*.zst" -o -name "*.gz" -o -name "*.gzip" -o -name "*.json" -o -name "*.jsonl" \) -print0)
+
+    if (( ${#documents[@]} == 0 )); then
+        echo "No documents found for ${year}; skipping"
+        continue
+    fi
+
+    # run deduplication
+    echo "Running fuzzy dedupe for ${year} with ${size} bytes Bloom filter (files: ${#documents[@]})"
+
+    # Build the YAML `documents:` list with real newlines, so the config can be written
+    # verbatim instead of being passed through printf as a format string.
+    document_linearized="documents:"$'\n'
+    for doc in "${documents[@]}"; do
+        document_linearized+="  - ${doc}"$'\n'
+    done
+
+    # NOTE(review): estimated_doc_count is set to the total input size in bytes -
+    # presumably a deliberate over-estimate; confirm.
+    config_yaml=$(cat <<EOF
+${document_linearized}
+dedupe:
+  name: dedupe_by_year
+  paragraphs:
+    attribute_name: dedupe_ngrams_20_1
+    by_ngram:
+      ngram_length: 20
+      stride: 1
+      overlap_threshold: 0.5
+      skip_short_paragraphs: true
+  skip_empty: true
+
+bloom_filter:
+  file: /tmp/cc_news_${year}_dedupe_ngram.bin
+  read_only: false
+  estimated_doc_count: ${size}
+  desired_false_positive_rate: 0.1
+
+work_dir:
+  input: /tmp/cc_news_${year}_dedupe_ngrams_20_1/input
+  output: /tmp/cc_news_${year}_dedupe_ngrams_20_1/output
+EOF
+)
+
+    # Leave 4 processors for other work, but never go below 1 process
+    # (machines with 4 or fewer processors).
+    processes=$(( $(nproc) - 4 ))
+    if (( processes < 1 )); then
+        processes=1
+    fi
+
+    # Create a temporary file for the YAML config
+    temp_config_file=$(mktemp)
+
+    # Write the YAML config to the temporary file
+    printf '%s\n' "$config_yaml" > "$temp_config_file"
+
+
+    set -ex
+    # Run dolma with the temporary config file
+    dolma -c "$temp_config_file" dedupe --processes "${processes}"
+    set +ex
+
+
+    # Remove the temporary file, the Bloom filter and the work directories.
+    # The glob has to stay outside the quotes, otherwise it is never expanded.
+    rm "$temp_config_file"
+    rm -rf "/tmp/cc_news_${year}_dedupe_"*
+
+done
diff --git a/configs/cc-news/find_broken.py b/configs/cc-news/find_broken.py
new file mode 100644
index 00000000..4b6d9002
--- /dev/null
+++ b/configs/cc-news/find_broken.py
@@ -0,0 +1,77 @@
+from argparse import ArgumentParser
+from queue import Queue
+from tempfile import TemporaryDirectory
+from typing import Any, Tuple, Union
+
+import smart_open
+from dolma.core.parallel import BaseParallelProcessor
+
+
+class FindBrokenFilesProcessor(BaseParallelProcessor):
+    """Read every source file from start to end and report the ones that fail.
+
+    Nothing is written to the destination: the processor only counts lines and prints a
+    message for each file whose read raises an exception.
+    """
+
+    @classmethod
+    def increment_progressbar(
+        cls,
+        queue: "Queue[Union[Tuple[int, ...], None]]",
+        /,
+        files: int = 0,
+        docs: int = 0,
+    ):
+        """Report progress: ``files`` files finished and ``docs`` lines read."""
+        return super().increment_progressbar(queue, files=files, docs=docs)
+
+    @classmethod
+    def process_single(
+        cls,
+        source_path: str,
+        destination_path: str,
+        queue: Queue,
+        **kwargs: Any,
+    ):
+        """
+        This method is called for each file. It reads the file line by line as UTF-8
+        text, reporting progress every 1000 lines. If opening or reading fails, the
+        error and the file path are printed and processing of that file stops.
+        ``destination_path`` is not used.
+        """
+
+        # Initialised before the ``try`` so it is defined even when opening the file fails.
+        cnt = 0
+        try:
+            with smart_open.open(source_path, mode="rt", encoding="utf-8") as f:
+                for _ in f:
+                    cnt += 1
+                    if cnt >= 1000:
+                        cls.increment_progressbar(queue, docs=cnt)
+                        cnt = 0
+        except Exception as e:  # pylint: disable=broad-except
+            print(f"Error {e} in file {source_path}")
+
+        # Always mark the file as done - also when no lines are left to flush (empty or
+        # unreadable file, or a line count that is a multiple of 1000) - so the file
+        # counter reaches the total.
+        cls.increment_progressbar(queue, docs=cnt, files=1)
+
+
+def parse_args():
+    """Parse the command line: source prefix (required), process count, debug flag, temp dir."""
+    parser = ArgumentParser()
+    options = (
+        (("-s", "--source-prefix"), dict(type=str, required=True)),
+        (("-n", "--num-processes"), dict(type=int, default=1)),
+        (("-u", "--debug"), dict(action="store_true")),
+        (("-t", "--temp-dir"), dict(type=str, default=None)),
+    )
+    for flags, settings in options:
+        parser.add_argument(*flags, **settings)
+    return parser.parse_args()
+
+
+def main():
+    """Run ``FindBrokenFilesProcessor`` over the source prefix given on the command line.
+
+    A temporary directory (created under ``--temp-dir`` when given) serves as both the
+    destination and the metadata prefix, and is removed when the run finishes.
+    """
+    cli = parse_args()
+
+    with TemporaryDirectory(dir=cli.temp_dir) as scratch_dir:
+        find_broken = FindBrokenFilesProcessor(
+            source_prefix=cli.source_prefix,
+            destination_prefix=scratch_dir,
+            metadata_prefix=scratch_dir,
+            num_processes=cli.num_processes,
+            debug=cli.debug,
+        )
+        find_broken()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/configs/cc-news/mix-deupe-by-year.yaml b/configs/cc-news/mix-deupe-by-year.yaml
new file mode 100644
index 00000000..5038bf21
--- /dev/null
+++ b/configs/cc-news/mix-deupe-by-year.yaml
@@ -0,0 +1,78 @@
+streams:
+    # The &attributes, &output and &filter anchors defined on this first stream are
+    # reused by every other stream in this file via *attributes, *output and *filter.
+    - name: cc-news_2016
+      documents:
+          - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2016/*.json.zst
+      attributes: &attributes
+          - dedupe_by_year
+      output: &output
+          max_size_in_bytes: 3_814_697_265
+          path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup/documents
+      filter: &filter
+        # Keep a document when it has no dedupe_ngrams_20_1 spans, or when the sum over its
+        # spans of .[2] * (.[1] - .[0]), divided by the text length, is at most 0.3.
+        # NOTE(review): presumably each span is [start, end, score] - confirm against the
+        # deduper's attribute output.
+        include:
+            - >-
+              (.attributes.dedupe_ngrams_20_1 | length == 0) or
+              ((.attributes.dedupe_ngrams_20_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) <= 0.3)
+        syntax: jq
+
+    - name: cc-news_2017
+      documents:
+          - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2017/*.json.zst
+      attributes: *attributes
+      output: *output
+      filter: *filter
+
+    - name: cc-news_2018
+      documents:
+          - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2018/*.json.zst
+      attributes: *attributes
+      output: *output
+      filter: *filter
+
+    - name: cc-news_2019
+      documents:
+          - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2019/*.json.zst
+      attributes: *attributes
+      output: *output
+      filter: *filter
+
+    - name: cc-news_2020
+      documents:
+          - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2020/*.json.zst
+      attributes: *attributes
+      output: *output
+      filter: *filter
+
+    - name: cc-news_2021
+      documents:
+          - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2021/*.json.zst
+      attributes: *attributes
+      output: *output
+      filter: *filter
+
+    - name: cc-news_2022
+      documents:
+          - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2022/*.json.zst
+      attributes: *attributes
+      output: *output
+      filter: *filter
+
+    - name: cc-news_2023
+      documents:
+          - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2023/*.json.zst
+      attributes: *attributes
+      output: *output
+      filter: *filter
+
+    - name: cc-news_2024
+      documents:
+          - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2024/*.json.zst
+      attributes: *attributes
+      output: *output
+      filter: *filter
+
+
+work_dir:
+  input: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v1-resiliparse-year/input
+  output: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v1-resiliparse-year/output
+
+processes: 188
diff --git a/configs/cc-news/partition_by_lang.py b/configs/cc-news/partition_by_lang.py
new file mode 100644
index 00000000..f39a9279
--- /dev/null
+++ b/configs/cc-news/partition_by_lang.py
@@ -0,0 +1,145 @@
+from argparse import ArgumentParser
+from contextlib import ExitStack
+import os
+from queue import Queue
+from tempfile import TemporaryDirectory
+from typing import Any, Tuple, Union
+
+import msgspec
+import smart_open
+from dolma.core.parallel import BaseParallelProcessor
+from dolma.core.data_types import InputSpecWithMetadataAndAttributes, OutputSpec
+
+
+class PartitionByLangProcessor(BaseParallelProcessor):
+    @classmethod
+    def increment_progressbar(
+        cls,
+        queue: "Queue[Union[Tuple[int, ...], None]]",
+        /,
+        files: int = 0,
+        skipped: int = 0,
+        written: int = 0,
+    ):
+        return super().increment_progressbar(queue, files=files, skipped=skipped, written=written)
+
+    @classmethod
+    def process_single(
+        cls,
+        source_path: str,
+        destination_path: str,
+        queue: Queue,
+        **kwargs: Any,
+    ):
+        """
+        This method is called for each file. It reads the file
+        line by line, and writes to the destination file only
+        if the document is not empty.
+        """
+
+        attribute_prefix = kwargs.get("attribute_prefix", None)
+        attribute_name = kwargs.get("attribute_name", None)
+        lang_min_score = float(kwargs.get("lang_min_score", -1))
+
+        document_parser = msgspec.json.Decoder(InputSpecWithMetadataAndAttributes)
+        attribute_parser = msgspec.json.Decoder(OutputSpec)
+        encoder = msgspec.json.Encoder()
+
+        assert attribute_prefix is not None, "Attribute prefix is required"
+        assert attribute_name is not None, "Attribute name is required"
+        assert 0 <= lang_min_score <= 1, "Language min score must be between 0 and 1"
+
+        dest_dir, dest_file = os.path.split(destination_path)
+
+        written = skipped = 0
+
+        with ExitStack() as stack:
+            source_file = stack.enter_context(smart_open.open(source_path, mode="rt", encoding="utf-8"))
+            language_attribute_path = source_path.replace("/documents/", f"/attributes/{attribute_name}/")
+            language_attribute_file = stack.enter_context(
+                smart_open.open(language_attribute_path, mode="rt", encoding="utf-8")
+            )
+            dst_files = {}
+
+            while True:
+                raw_doc = source_file.readline()
+                raw_attr = language_attribute_file.readline()
+
+                if not raw_doc or not raw_attr:
+                    # end of file
+                    break
+
+                attr = attribute_parser.decode(raw_attr)
+
+                all_langs = {
+                    k.replace(attribute_prefix, ""): v[0][-1]
+                    for k, v in attr.attributes.items()
+                    if k.startswith(attribute_prefix)
+                }
+
+                if all_langs:
+                    top_lang, top_score = max(all_langs.items(), key=lambda x: x[1])
+                else:
+                    top_lang = "unk"
+                    top_score = 0
+
+                if top_score < lang_min_score:
+                    top_lang = "unk"
+                    skipped += 1
+
+                doc = document_parser.decode(raw_doc)
+                doc.attributes = {**(doc.attributes or {}), **attr.attributes}
+
+                if top_lang not in dst_files:
+                    dir_path = os.path.join(dest_dir, top_lang)
+                    os.makedirs(dir_path, exist_ok=True)
+                    dst_files[top_lang] = stack.enter_context(
+                        smart_open.open(os.path.join(dir_path, dest_file), mode="wt", encoding="utf-8")
+                    )
+
+                dst_files[top_lang].write(encoder.encode(doc).decode('utf-8') + "\n")
+                written += 1
+
+                if (written + skipped) > 1000:
+                    cls.increment_progressbar(queue, written=written, skipped=skipped)
+                    written = skipped = 0
+
+        cls.increment_progressbar(queue, written=written, skipped=skipped, files=1)
+
+
+def parse_args():
+    ag = ArgumentParser()
+    ag.add_argument("-s", "--source-prefix", type=str, required=True)
+    ag.add_argument("-d", "--destination-prefix", type=str, required=True)
+    ag.add_argument("-n", "--num-processes", type=int, default=1)
+    ag.add_argument("-u", "--debug", action="store_true")
+    ag.add_argument("--temp-dir", type=str, default=None)
+    ag.add_argument("--attribute-name", type=str, default="glotlid_doc_v3_1e2")
+    ag.add_argument("--attribute-prefix", type=str, default="glotlid_doc_v3_1e2__glotlid_doc_v3_1e2__")
+    ag.add_argument("--lang-min-score", type=float, default=0.5)
+    return ag.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    with TemporaryDirectory(dir=args.temp_dir) as tmpdir:
+        # create the processor
+        processor = PartitionByLangProcessor(
+            source_prefix=args.source_prefix,
+            destination_prefix=args.destination_prefix,
+            metadata_prefix=tmpdir,
+            num_processes=args.num_processes,
+            debug=args.debug,
+        )
+
+        # run the processor
+        processor(
+            attribute_name=args.attribute_name,
+            attribute_prefix=args.attribute_prefix,
+            lang_min_score=args.lang_min_score,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/configs/cc-news/tag_v2.yaml b/configs/cc-news/tag_v2.yaml
new file mode 100644
index 00000000..4c534099
--- /dev/null
+++ b/configs/cc-news/tag_v2.yaml
@@ -0,0 +1,8 @@
+taggers:
+  - glotlid_doc_v3_1e2  # same name as the default --attribute-name of partition_by_lang.py
+  - whitespace_tokenizer_v1
+
+processes: 188
+
+documents:  # ${oc.env:HOME} looks like an OmegaConf-style lookup of the HOME environment variable; confirm
+  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup/documents/*.json.gz
diff --git a/configs/cc-news/v3_mix_lang.yaml b/configs/cc-news/v3_mix_lang.yaml
new file mode 100644
index 00000000..c53f328c
--- /dev/null
+++ b/configs/cc-news/v3_mix_lang.yaml
@@ -0,0 +1,6312 @@
+streams:
+  - name: cc-news_abk_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/abk_Cyrl/*.json.gz
+    attributes: &attributes
+        - dedupe_by_lang
+    output: &output
+        path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/abk_Cyrl
+        max_size_in_bytes: 3_814_697_265
+    filter: &filter
+      include:
+          - >-
+            (.attributes.dedupe_ngrams_13_1 | length == 0) or
+            ((.attributes.dedupe_ngrams_13_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) <= 0.3)
+      syntax: jq
+
+  - name: cc-news_abs_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/abs_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/abs_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_abz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/abz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/abz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ace_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ace_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ace_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ace_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ace_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ace_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_acf_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/acf_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/acf_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_acm_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/acm_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/acm_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_acn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/acn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/acn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ade_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ade_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ade_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ady_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ady_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ady_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_aeb_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/aeb_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/aeb_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_afr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/afr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/afr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_agx_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/agx_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/agx_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_aii_Syrc
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/aii_Syrc/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/aii_Syrc
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ajp_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ajp_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ajp_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ajz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ajz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ajz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_akb_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/akb_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/akb_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_aln_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/aln_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/aln_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_alq_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/alq_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/alq_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_als_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/als_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/als_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_alt_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/alt_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/alt_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_alz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/alz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/alz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_amh_Ethi
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/amh_Ethi/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/amh_Ethi
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ami_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ami_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ami_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_amp_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/amp_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/amp_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ang_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ang_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ang_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_anp_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/anp_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/anp_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_apc_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/apc_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/apc_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_arb_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/arb_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/arb_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_arb_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/arb_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/arb_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_arg_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/arg_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/arg_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_arn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/arn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/arn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_arr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/arr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/arr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ars_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ars_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ars_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ary_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ary_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ary_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_arz_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/arz_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/arz_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_asm_Beng
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/asm_Beng/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/asm_Beng
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_asm_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/asm_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/asm_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ast_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ast_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ast_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ata_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ata_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ata_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_atj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/atj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/atj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_avk_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/avk_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/avk_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_awa_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/awa_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/awa_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ayp_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ayp_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ayp_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ayr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ayr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ayr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_azb_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/azb_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/azb_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_azj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/azj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/azj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_azz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/azz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/azz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bak_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bak_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bak_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bam_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bam_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bam_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ban_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ban_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ban_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bar_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bar_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bar_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bbc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bbc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bbc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bcc_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bcc_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bcc_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bcl_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bcl_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bcl_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bel_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bel_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bel_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bem_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bem_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bem_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ben_Beng
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ben_Beng/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ben_Beng
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ben_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ben_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ben_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bew_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bew_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bew_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bho_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bho_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bho_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bhp_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bhp_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bhp_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bim_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bim_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bim_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bis_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bis_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bis_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bjn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bjn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bjn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bla_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bla_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bla_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_blk_Mymr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/blk_Mymr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/blk_Mymr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bod_Tibt
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bod_Tibt/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bod_Tibt
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bos_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bos_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bos_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bpr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bpr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bpr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bpy_Beng
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bpy_Beng/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bpy_Beng
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bqj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bqj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bqj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bre_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bre_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bre_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_brh_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/brh_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/brh_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_brx_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/brx_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/brx_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bsq_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bsq_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bsq_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bts_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bts_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bts_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_btx_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/btx_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/btx_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bug_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bug_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bug_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bul_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bul_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bul_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bum_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bum_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bum_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bwu_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bwu_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bwu_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bxr_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bxr_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bxr_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_byv_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/byv_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/byv_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bzd_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bzd_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bzd_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_bzj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bzj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bzj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_caa_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/caa_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/caa_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cat_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cat_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cat_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cbk_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cbk_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cbk_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ccp_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ccp_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ccp_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cdf_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cdf_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cdf_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ceb_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ceb_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ceb_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ces_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ces_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ces_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cgc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cgc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cgc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cha_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cha_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cha_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_che_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/che_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/che_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_chk_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/chk_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/chk_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_chr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/chr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/chr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_chv_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/chv_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/chv_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cjk_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cjk_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cjk_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ckb_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ckb_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ckb_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ckm_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ckm_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ckm_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cmn_Hani
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cmn_Hani/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cmn_Hani
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cmr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cmr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cmr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cnh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cnh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cnh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cnr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cnr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cnr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cof_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cof_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cof_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cos_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cos_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cos_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cot_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cot_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cot_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cou_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cou_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cou_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cpu_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cpu_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cpu_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_crh_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/crh_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/crh_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_crh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/crh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/crh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cri_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cri_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cri_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_crk_Cans
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/crk_Cans/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/crk_Cans
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_crs_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/crs_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/crs_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_crx_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/crx_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/crx_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_csw_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/csw_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/csw_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cto_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cto_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cto_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cuc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cuc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cuc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cuk_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cuk_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cuk_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_cym_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cym_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cym_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_dag_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dag_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dag_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_dan_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dan_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dan_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_dar_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dar_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dar_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ded_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ded_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ded_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_deu_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/deu_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/deu_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_dgr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dgr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dgr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_dgz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dgz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dgz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_dhv_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dhv_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dhv_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_dik_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dik_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dik_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_diq_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/diq_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/diq_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_div_Thaa
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/div_Thaa/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/div_Thaa
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_doi_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/doi_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/doi_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_dsb_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dsb_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dsb_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_dsh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dsh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dsh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_dwr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dwr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dwr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_dzo_Tibt
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dzo_Tibt/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dzo_Tibt
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_efi_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/efi_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/efi_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ekk_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ekk_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ekk_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ell_Grek
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ell_Grek/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ell_Grek
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_eml_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/eml_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/eml_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_eng_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/eng_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/eng_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_enl_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/enl_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/enl_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_enm_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/enm_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/enm_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_epo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/epo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/epo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ese_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ese_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ese_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_esi_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/esi_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/esi_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_esk_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/esk_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/esk_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_esu_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/esu_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/esu_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_eus_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/eus_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/eus_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ewe_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ewe_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ewe_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ewo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ewo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ewo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ext_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ext_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ext_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fad_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fad_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fad_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fao_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fao_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fao_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fas_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fas_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fas_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fat_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fat_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fat_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ffm_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ffm_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ffm_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fij_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fij_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fij_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fil_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fil_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fil_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fin_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fin_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fin_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fit_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fit_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fit_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fkv_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fkv_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fkv_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fra_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fra_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fra_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fro_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fro_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fro_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_frp_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/frp_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/frp_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_frr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/frr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/frr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fry_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fry_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fry_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fuf_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fuf_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fuf_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fuq_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fuq_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fuq_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fur_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fur_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fur_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_fuv_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fuv_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fuv_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gaa_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gaa_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gaa_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gag_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gag_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gag_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gaz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gaz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gaz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gcf_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gcf_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gcf_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gcr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gcr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gcr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ghs_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ghs_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ghs_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gid_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gid_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gid_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gil_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gil_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gil_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gla_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gla_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gla_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gle_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gle_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gle_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_glg_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/glg_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/glg_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_glk_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/glk_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/glk_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_glv_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/glv_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/glv_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gmh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gmh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gmh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gmv_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gmv_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gmv_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_goh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/goh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/goh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gom_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gom_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gom_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gom_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gom_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gom_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gor_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gor_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gor_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gos_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gos_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gos_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_grc_Grek
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/grc_Grek/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/grc_Grek
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_grt_Beng
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/grt_Beng/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/grt_Beng
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gsw_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gsw_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gsw_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_guc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/guc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/guc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gug_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gug_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gug_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gui_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gui_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gui_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_guj_Gujr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/guj_Gujr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/guj_Gujr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_guj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/guj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/guj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_guk_Ethi
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/guk_Ethi/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/guk_Ethi
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gux_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gux_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gux_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_guz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/guz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/guz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gwi_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gwi_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gwi_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_gym_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gym_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gym_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hac_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hac_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hac_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hae_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hae_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hae_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hak_Hani
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hak_Hani/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hak_Hani
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hak_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hak_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hak_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hat_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hat_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hat_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hau_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hau_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hau_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_haw_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/haw_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/haw_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hbo_Hebr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hbo_Hebr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hbo_Hebr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hch_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hch_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hch_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_heb_Hebr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/heb_Hebr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/heb_Hebr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_her_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/her_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/her_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hif_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hif_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hif_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hil_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hil_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hil_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hin_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hin_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hin_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hin_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hin_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hin_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hmo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hmo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hmo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hne_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hne_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hne_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hns_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hns_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hns_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hrv_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hrv_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hrv_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hrx_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hrx_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hrx_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hsb_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hsb_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hsb_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hun_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hun_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hun_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hus_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hus_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hus_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hwc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hwc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hwc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hye_Armn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hye_Armn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hye_Armn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_hyw_Armn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hyw_Armn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hyw_Armn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_iba_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/iba_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/iba_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ibg_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ibg_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ibg_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ibo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ibo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ibo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_icr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/icr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/icr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ido_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ido_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ido_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_idu_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/idu_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/idu_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ike_Cans
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ike_Cans/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ike_Cans
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ile_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ile_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ile_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ilo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ilo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ilo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ina_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ina_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ina_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ind_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ind_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ind_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_inh_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/inh_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/inh_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_isl_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/isl_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/isl_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ita_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ita_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ita_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_itv_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/itv_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/itv_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_jam_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/jam_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/jam_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_jav_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/jav_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/jav_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_jpn_Jpan
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/jpn_Jpan/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/jpn_Jpan
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kaa_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kaa_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kaa_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kaa_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kaa_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kaa_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kab_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kab_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kab_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kak_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kak_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kak_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kal_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kal_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kal_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kam_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kam_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kam_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kan_Knda
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kan_Knda/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kan_Knda
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kan_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kan_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kan_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kao_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kao_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kao_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kas_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kas_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kas_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kas_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kas_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kas_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kas_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kas_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kas_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kat_Geor
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kat_Geor/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kat_Geor
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kaz_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kaz_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kaz_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kbd_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kbd_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kbd_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kbp_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kbp_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kbp_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kca_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kca_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kca_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kck_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kck_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kck_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kdr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kdr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kdr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kea_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kea_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kea_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kff_Telu
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kff_Telu/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kff_Telu
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kha_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kha_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kha_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_khk_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/khk_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/khk_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_khm_Khmr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/khm_Khmr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/khm_Khmr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_khz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/khz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/khz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kik_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kik_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kik_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kin_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kin_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kin_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kir_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kir_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kir_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kiu_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kiu_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kiu_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kjh_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kjh_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kjh_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kmb_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kmb_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kmb_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kmg_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kmg_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kmg_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kmr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kmr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kmr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kmy_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kmy_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kmy_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_knc_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/knc_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/knc_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_knc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/knc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/knc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kne_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kne_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kne_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kog_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kog_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kog_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_koi_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/koi_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/koi_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kor_Hang
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kor_Hang/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kor_Hang
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kos_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kos_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kos_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kpv_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kpv_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kpv_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_krc_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/krc_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/krc_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kri_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kri_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kri_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_krj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/krj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/krj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_krl_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/krl_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/krl_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ksd_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ksd_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ksd_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ksh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ksh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ksh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ksw_Mymr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ksw_Mymr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ksw_Mymr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ktb_Ethi
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ktb_Ethi/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ktb_Ethi
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kua_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kua_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kua_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kum_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kum_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kum_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kup_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kup_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kup_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kus_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kus_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kus_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kwn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kwn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kwn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kwy_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kwy_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kwy_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_kxm_Thai
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kxm_Thai/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kxm_Thai
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lad_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lad_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lad_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_laj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/laj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/laj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lam_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lam_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lam_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lao_Laoo
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lao_Laoo/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lao_Laoo
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lat_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lat_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lat_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lez_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lez_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lez_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lij_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lij_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lij_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lim_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lim_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lim_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lin_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lin_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lin_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lip_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lip_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lip_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lit_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lit_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lit_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lki_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lki_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lki_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lld_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lld_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lld_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lmk_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lmk_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lmk_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lmo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lmo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lmo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_loz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/loz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/loz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lrc_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lrc_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lrc_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ltg_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ltg_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ltg_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ltz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ltz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ltz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lub_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lub_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lub_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lue_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lue_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lue_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lug_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lug_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lug_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lun_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lun_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lun_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_luo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/luo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/luo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lus_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lus_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lus_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lvs_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lvs_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lvs_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lwg_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lwg_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lwg_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_lzh_Hani
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lzh_Hani/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lzh_Hani
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mad_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mad_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mad_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mag_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mag_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mag_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mah_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mah_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mah_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mai_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mai_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mai_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mak_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mak_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mak_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mal_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mal_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mal_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mal_Mlym
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mal_Mlym/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mal_Mlym
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mar_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mar_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mar_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mar_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mar_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mar_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mas_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mas_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mas_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mbc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mbc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mbc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mcp_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mcp_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mcp_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mdf_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mdf_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mdf_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mer_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mer_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mer_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mfe_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mfe_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mfe_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mfy_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mfy_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mfy_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mhr_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mhr_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mhr_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mhx_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mhx_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mhx_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_min_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/min_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/min_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_min_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/min_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/min_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mkd_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mkd_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mkd_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mkn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mkn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mkn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mlt_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mlt_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mlt_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mnb_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mnb_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mnb_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mni_Beng
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mni_Beng/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mni_Beng
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mni_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mni_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mni_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mnk_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mnk_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mnk_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_moh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/moh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/moh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mop_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mop_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mop_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mos_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mos_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mos_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mqy_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mqy_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mqy_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mri_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mri_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mri_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mrj_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mrj_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mrj_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mrw_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mrw_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mrw_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_msb_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/msb_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/msb_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_msm_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/msm_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/msm_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mui_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mui_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mui_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mup_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mup_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mup_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mwl_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mwl_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mwl_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mww_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mww_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mww_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mya_Mymr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mya_Mymr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mya_Mymr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_myv_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/myv_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/myv_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_myx_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/myx_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/myx_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_mzn_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mzn_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mzn_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nah_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nah_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nah_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nan_Hani
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nan_Hani/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nan_Hani
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nan_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nan_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nan_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nap_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nap_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nap_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_naq_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/naq_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/naq_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nav_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nav_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nav_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nbl_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nbl_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nbl_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nbu_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nbu_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nbu_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ncj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ncj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ncj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ncx_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ncx_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ncx_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ndc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ndc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ndc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nde_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nde_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nde_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ndj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ndj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ndj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ndo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ndo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ndo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nds_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nds_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nds_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_new_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/new_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/new_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nhd_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nhd_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nhd_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nhe_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nhe_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nhe_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nia_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nia_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nia_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_njz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/njz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/njz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nki_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nki_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nki_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nld_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nld_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nld_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nmz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nmz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nmz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nnb_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nnb_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nnb_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nno_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nno_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nno_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nob_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nob_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nob_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nod_Thai
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nod_Thai/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nod_Thai
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_non_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/non_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/non_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nov_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nov_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nov_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_npi_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/npi_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/npi_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_npi_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/npi_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/npi_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_npl_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/npl_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/npl_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nrf_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nrf_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nrf_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nrm_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nrm_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nrm_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nso_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nso_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nso_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nsu_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nsu_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nsu_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nuj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nuj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nuj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nus_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nus_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nus_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nya_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nya_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nya_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nyf_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nyf_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nyf_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nyn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nyn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nyn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nyu_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nyu_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nyu_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_nzi_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nzi_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nzi_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_oci_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/oci_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/oci_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ojb_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ojb_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ojb_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_oke_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/oke_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/oke_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_olo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/olo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/olo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_orv_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/orv_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/orv_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ory_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ory_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ory_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ory_Orya
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ory_Orya/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ory_Orya
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_oss_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/oss_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/oss_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ote_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ote_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ote_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ots_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ots_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ots_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_otw_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/otw_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/otw_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pag_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pag_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pag_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pam_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pam_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pam_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pan_Guru
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pan_Guru/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pan_Guru
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pan_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pan_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pan_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pap_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pap_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pap_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pau_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pau_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pau_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pbt_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pbt_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pbt_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pcd_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pcd_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pcd_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pcm_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pcm_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pcm_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pdc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pdc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pdc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pem_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pem_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pem_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pfl_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pfl_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pfl_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pis_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pis_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pis_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pkb_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pkb_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pkb_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pls_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pls_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pls_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_plt_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/plt_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/plt_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pms_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pms_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pms_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pnb_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pnb_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pnb_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pnt_Grek
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pnt_Grek/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pnt_Grek
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pol_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pol_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pol_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pon_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pon_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pon_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_por_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/por_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/por_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pui_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pui_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pui_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_pwn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pwn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pwn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_qub_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qub_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qub_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_quc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/quc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/quc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_quf_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/quf_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/quf_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_quy_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/quy_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/quy_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_quz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/quz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/quz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_qve_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qve_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qve_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_qvh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qvh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qvh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_qvi_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qvi_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qvi_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_qvo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qvo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qvo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_qvz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qvz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qvz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_qwh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qwh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qwh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_qxn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qxn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qxn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_qxo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qxo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qxo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_qxr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qxr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qxr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rap_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rap_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rap_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rar_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rar_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rar_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_raw_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/raw_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/raw_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rcf_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rcf_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rcf_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rhg_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rhg_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rhg_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rmc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rme_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rme_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rme_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rml_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rml_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rml_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rmn_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmn_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmn_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rmn_Grek
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmn_Grek/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmn_Grek
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rmn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rmo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rmq_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmq_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmq_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rmy_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmy_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmy_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rmy_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmy_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmy_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rnd_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rnd_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rnd_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_roh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/roh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/roh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ron_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ron_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ron_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ron_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ron_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ron_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rop_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rop_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rop_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rue_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rue_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rue_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_run_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/run_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/run_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_rus_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rus_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rus_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sab_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sab_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sab_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sag_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sag_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sag_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sah_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sah_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sah_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_san_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/san_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/san_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_san_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/san_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/san_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sas_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sas_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sas_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sat_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sat_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sat_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_scn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/scn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/scn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sco_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sco_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sco_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sdc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sdc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sdc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sdh_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sdh_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sdh_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_seh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/seh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/seh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sgc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sgc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sgc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sgs_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sgs_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sgs_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_shi_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/shi_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/shi_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_shn_Mymr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/shn_Mymr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/shn_Mymr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_shu_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/shu_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/shu_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sin_Sinh
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sin_Sinh/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sin_Sinh
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sju_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sju_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sju_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_skg_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/skg_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/skg_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_skr_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/skr_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/skr_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_slk_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/slk_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/slk_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_slv_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/slv_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/slv_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sma_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sma_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sma_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sme_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sme_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sme_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_smj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/smj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/smj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_smn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/smn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/smn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_smo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/smo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/smo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sms_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sms_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sms_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sna_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sna_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sna_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_snd_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/snd_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/snd_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_snd_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/snd_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/snd_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_som_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/som_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/som_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sot_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sot_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sot_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_spa_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/spa_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/spa_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_srd_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/srd_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/srd_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_srn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/srn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/srn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_srp_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/srp_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/srp_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_srp_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/srp_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/srp_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ssw_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ssw_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ssw_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_stq_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/stq_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/stq_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_sun_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sun_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sun_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_swc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/swc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/swc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_swe_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/swe_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/swe_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_swg_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/swg_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/swg_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_swh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/swh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/swh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_syc_Syrc
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/syc_Syrc/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/syc_Syrc
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_syl_Beng
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/syl_Beng/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/syl_Beng
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_syl_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/syl_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/syl_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_szl_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/szl_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/szl_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tah_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tah_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tah_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tam_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tam_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tam_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tam_Taml
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tam_Taml/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tam_Taml
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_taq_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/taq_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/taq_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tat_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tat_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tat_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tat_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tat_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tat_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tay_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tay_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tay_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tcy_Knda
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tcy_Knda/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tcy_Knda
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tcz_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tcz_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tcz_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tdt_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tdt_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tdt_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tdx_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tdx_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tdx_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tel_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tel_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tel_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tel_Telu
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tel_Telu/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tel_Telu
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_teo_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/teo_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/teo_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tfr_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tfr_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tfr_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tgk_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tgk_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tgk_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tha_Thai
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tha_Thai/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tha_Thai
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_thk_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/thk_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/thk_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_thl_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/thl_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/thl_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tig_Ethi
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tig_Ethi/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tig_Ethi
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tih_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tih_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tih_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tir_Ethi
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tir_Ethi/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tir_Ethi
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tiv_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tiv_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tiv_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tlh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tlh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tlh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tll_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tll_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tll_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tly_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tly_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tly_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tmc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tmc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tmc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tob_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tob_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tob_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_toi_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/toi_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/toi_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_toj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/toj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/toj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ton_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ton_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ton_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tpi_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tpi_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tpi_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_trv_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/trv_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/trv_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tsg_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tsg_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tsg_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tsn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tsn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tsn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tso_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tso_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tso_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tuc_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tuc_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tuc_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tuk_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tuk_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tuk_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tuk_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tuk_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tuk_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tum_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tum_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tum_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tur_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tur_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tur_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_twb_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/twb_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/twb_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_twi_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/twi_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/twi_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_twx_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/twx_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/twx_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tyv_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tyv_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tyv_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tzh_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tzh_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tzh_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tzj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tzj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tzj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_tzm_Tfng
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tzm_Tfng/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tzm_Tfng
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ubu_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ubu_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ubu_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_udm_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/udm_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/udm_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_uig_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/uig_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/uig_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_uig_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/uig_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/uig_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ukr_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ukr_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ukr_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_umb_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/umb_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/umb_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Ahom
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Ahom/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Ahom
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Armn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Armn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Armn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Bamu
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Bamu/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Bamu
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Beng
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Beng/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Beng
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Brah
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Brah/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Brah
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Brai
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Brai/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Brai
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Cakm
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Cakm/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Cakm
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Cans
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Cans/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Cans
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Copt
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Copt/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Copt
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Cpmn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Cpmn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Cpmn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Diak
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Diak/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Diak
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Dupl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Dupl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Dupl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Egyp
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Egyp/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Egyp
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Ethi
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Ethi/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Ethi
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Glag
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Glag/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Glag
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Grek
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Grek/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Grek
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Hebr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Hebr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Hebr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Hira
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Hira/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Hira
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Hluw
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Hluw/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Hluw
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Hmng
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Hmng/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Hmng
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Hung
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Hung/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Hung
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Java
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Java/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Java
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Kana
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Kana/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Kana
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Khmr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Khmr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Khmr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Khoj
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Khoj/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Khoj
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Kits
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Kits/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Kits
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Laoo
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Laoo/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Laoo
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Limb
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Limb/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Limb
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Lina
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Lina/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Lina
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Linb
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Linb/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Linb
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Lisu
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Lisu/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Lisu
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Marc
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Marc/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Marc
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Mult
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Mult/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Mult
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Mymr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Mymr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Mymr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Nshu
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Nshu/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Nshu
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Orya
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Orya/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Orya
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Rohg
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Rohg/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Rohg
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Runr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Runr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Runr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Saur
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Saur/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Saur
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Sgnw
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Sgnw/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Sgnw
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Sinh
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Sinh/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Sinh
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Takr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Takr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Takr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Tang
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Tang/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Tang
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Thai
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Thai/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Thai
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Tibt
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Tibt/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Tibt
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Tnsa
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Tnsa/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Tnsa
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Vaii
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Vaii/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Vaii
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Vith
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Vith/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Vith
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Xsux
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Xsux/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Xsux
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_und_Yiii
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Yiii/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Yiii
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_urd_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/urd_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/urd_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_urd_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/urd_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/urd_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_uri_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/uri_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/uri_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_uzn_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/uzn_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/uzn_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_uzn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/uzn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/uzn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_uzs_Arab
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/uzs_Arab/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/uzs_Arab
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_vec_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vec_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vec_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ven_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ven_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ven_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_vep_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vep_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vep_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_vid_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vid_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vid_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_vie_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vie_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vie_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_vls_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vls_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vls_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_vmw_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vmw_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vmw_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_vro_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vro_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vro_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_war_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/war_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/war_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_wbm_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wbm_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wbm_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_wbp_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wbp_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wbp_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_wed_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wed_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wed_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_wes_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wes_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wes_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_wln_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wln_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wln_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_wls_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wls_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wls_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_wol_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wol_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wol_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_wrs_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wrs_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wrs_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_wsg_Telu
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wsg_Telu/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wsg_Telu
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_wuu_Hani
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wuu_Hani/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wuu_Hani
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_wuv_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wuv_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wuv_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_xav_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xav_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xav_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_xho_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xho_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xho_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_xla_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xla_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xla_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_xmf_Geor
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xmf_Geor/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xmf_Geor
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_xmm_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xmm_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xmm_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_xnn_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xnn_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xnn_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_xsr_Deva
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xsr_Deva/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xsr_Deva
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_xum_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xum_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xum_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_ydd_Hebr
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ydd_Hebr/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ydd_Hebr
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_yml_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/yml_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/yml_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_yor_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/yor_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/yor_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_yrk_Cyrl
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/yrk_Cyrl/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/yrk_Cyrl
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_yua_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/yua_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/yua_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_yue_Hani
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/yue_Hani/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/yue_Hani
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_zab_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zab_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zab_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_zai_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zai_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zai_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_zas_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zas_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zas_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_zdj_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zdj_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zdj_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_zea_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zea_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zea_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_zgh_Tfng
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zgh_Tfng/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zgh_Tfng
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_zne_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zne_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zne_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_zom_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zom_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zom_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_zsm_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zsm_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zsm_Latn
+      <<: *output
+    filter: *filter
+
+  - name: cc-news_zul_Latn
+    documents:
+        - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zul_Latn/*.json.gz
+    attributes: *attributes
+    output:
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zul_Latn
+      <<: *output
+    filter: *filter
+
+work_dir:
+  input: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v3-resiliparse-lang_dedup/input
+  output: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v3-resiliparse-lang_dedup/output
+
+processes: 188
diff --git a/configs/peteish-anneal/README.md b/configs/peteish-anneal/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/configs/peteish-anneal/digits.sh b/configs/peteish-anneal/digits.sh
new file mode 100644
index 00000000..bd020169
--- /dev/null
+++ b/configs/peteish-anneal/digits.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+#
+# Tokenize every dolmino-mix-1124 document collection listed below with the
+# allenai/dolma2-tokenizer-sigdig tokenizer by calling `dolma tokens`.
+#
+# Each glob in `collections` is tokenized into
+#   ${HOME}/ai2-llm/preprocessed/dolmino-mix-1124/<tokenizer>/<name>
+# where <name> is the part of the glob between "/documents/" and the first "*".
+
+collections=(
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/dclm/*/*.json.zst"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/flan/*.json.gz"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/codesearchnet-owmfilter/*/*.jsonl.gz"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/basic_math/*TRAIN.jsonl"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/gsm8k-synth/resample_v1_6x/*.jsonl"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/gsm_mind/*/*.jsonl"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/gsm8k/*/train/*.jsonl.zst"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/mathcoder2-synthmath/ajibawa-2023/*.jsonl"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/mathcoder2-synthmath/m-a-p_Matrix/*/*.jsonl"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/metamath-owmfilter/*.jsonl.gz"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/tinyGSM-MIND/*/*.jsonl.gz"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/tulu_math/*/*.jsonl"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/pes2o/*.json.gz"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/stackexchange/*.json.gz"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/wiki/*.json.gz"
+)
+tokenizer="allenai/dolma2-tokenizer-sigdig"
+
+for path in "${collections[@]}"; do
+    # Collection name: the path after "/documents/" up to the first wildcard,
+    # with any leading/trailing slash trimmed (e.g. "math/gsm8k").
+    name=$(echo "${path}" | sed -E 's|.*/documents/([^*]+).*|\1|' | sed 's:^/::; s:/$::')
+    destination="${HOME}/ai2-llm/preprocessed/dolmino-mix-1124/${tokenizer}/${name}"
+
+    echo "Tokenizing $path to $destination"
+    # $path is deliberately left unquoted here so the shell expands the glob.
+    echo "Number of files: $(ls -1 $path 2>/dev/null | wc -l)"
+
+    if [[ "$name" == *"dclm"* ]]; then
+        # Leave four cores free, but never request fewer than one worker:
+        # nproc - 4 is zero or negative on machines with four or fewer cores.
+        processes=$(($(nproc) - 4))
+        if ((processes < 1)); then
+            processes=1
+        fi
+    else
+        processes=20
+    fi
+
+    # Trace the command and abort the whole script if tokenization fails.
+    set -ex
+    dolma tokens \
+        --documents "${path}" \
+        --destination "${destination}" \
+        --tokenizer.name_or_path "${tokenizer}" \
+        --tokenizer.eos_token_id 100257 \
+        --tokenizer.pad_token_id 100277 \
+        --no-tokenizer.segment_before_tokenization \
+        --tokenizer.encode_special_tokens \
+        --processes "${processes}" \
+        --seed 3920 \
+        --max_size 1073741824 \
+        --sample_ring_prop \
+        --dtype uint32
+    set +ex
+done
diff --git a/configs/peteish-anneal/digits_mix.yaml b/configs/peteish-anneal/digits_mix.yaml
new file mode 100644
index 00000000..2512549f
--- /dev/null
+++ b/configs/peteish-anneal/digits_mix.yaml
@@ -0,0 +1,44 @@
+target_size: 200G  # NOTE(review): presumably the overall size of the produced mix; what the "G" unit counts is not visible here — confirm
+
+sources:  # every entry is a glob over .npy files under the allenai/dolma2-tokenizer-sigdig prefix
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/dclm/*.npy
+    mix_percent: 0.5  # the three mix_percent entries in this file (dclm, pes2o, flan) sum to 0.7245
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/pes2o/*.npy
+    mix_percent: 0.0585
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/flan/*.npy
+    mix_percent: 0.1660
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/codesearchnet-owmfilter/*.npy
+    sample_percent: 1.0  # NOTE(review): all remaining sources use sample_percent 1.0 — presumably "use the whole source"; confirm against the mixer
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/dolmino_math_synth/basic_math/*.npy
+    sample_percent: 1.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/dolmino_math_synth/gsm_mind/*.npy
+    sample_percent: 1.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/dolmino_math_synth/gsm8k-synth/resample_v1_6x/*.npy
+    sample_percent: 1.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/gsm8k/*.npy
+    sample_percent: 1.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/mathcoder2-synthmath/*/*.npy
+    sample_percent: 1.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/metamath-owmfilter/*.npy
+    sample_percent: 1.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/tinyGSM-MIND/*.npy
+    sample_percent: 1.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/tulu_math/*.npy
+    sample_percent: 1.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/stackexchange/*.npy
+    sample_percent: 1.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/wiki/*.npy
+    sample_percent: 1.0
diff --git a/configs/peteish-anneal/fw2-dedupe/mix.yaml b/configs/peteish-anneal/fw2-dedupe/mix.yaml
new file mode 100644
index 00000000..e6e30f17
--- /dev/null
+++ b/configs/peteish-anneal/fw2-dedupe/mix.yaml
@@ -0,0 +1,268 @@
+streams:
+  - name: dclm
+    documents:
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0000/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0001/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0002/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0003/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0004/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0005/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0006/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0007/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0008/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0009/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0010/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0011/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0012/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0013/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0014/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0015/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0016/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0017/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0018/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0019/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0020/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0021/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0022/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0023/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0024/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0025/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0026/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0027/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0028/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0029/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0030/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0031/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0032/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0033/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0034/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0035/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0036/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0037/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0038/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0039/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0040/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0041/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0042/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0043/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0044/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0045/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0046/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0047/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0048/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0049/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0050/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0051/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0052/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0053/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0054/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0055/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0056/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0057/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0058/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0059/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0060/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0061/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0062/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0063/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0064/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0065/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0066/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0067/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0068/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0069/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0070/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0071/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0072/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0073/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0074/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0075/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0076/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0077/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0078/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0079/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0080/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0081/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0082/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0083/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0084/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0085/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0086/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0087/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0088/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0089/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0090/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0091/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0092/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0093/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0094/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0095/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0096/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0097/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0098/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0099/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0100/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0101/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0102/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0103/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0104/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0105/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0106/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0107/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0108/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0109/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0110/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0111/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0112/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0113/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0114/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0115/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0116/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0117/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0118/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0119/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0120/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0121/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0122/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0123/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0124/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0125/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0126/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0127/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0128/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0129/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0130/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0131/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0132/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0133/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0134/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0135/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0136/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0137/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0138/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0139/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0140/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0141/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0142/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0143/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0144/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0145/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0146/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0147/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0148/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0149/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0150/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0151/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0152/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0153/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0154/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0155/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0156/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0157/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0158/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0159/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0160/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0161/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0162/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0163/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0164/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0165/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0166/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0167/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0168/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0169/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0170/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0171/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0172/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0173/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0174/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0175/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0176/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0177/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0178/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0179/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0180/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0181/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0182/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0183/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0184/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0185/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0186/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0187/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0188/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0189/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0190/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0191/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0192/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0193/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0194/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0195/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0196/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0197/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0198/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0199/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0200/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0201/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0202/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0203/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0204/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0205/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0206/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0207/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0208/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0209/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0210/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0211/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0212/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0213/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0214/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0215/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0216/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0217/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0218/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0219/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0220/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0221/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0222/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0223/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0224/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0225/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0226/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0227/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0228/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0229/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0230/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0231/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0232/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0233/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0234/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0235/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0236/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0237/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0238/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0239/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0240/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0241/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0242/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0243/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0244/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0245/*zst
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0246/*zst
+    attributes:
+      - dedupe_para_ngrams_13_1
+    output:
+      max_size_in_bytes: 3_814_697_265
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2_dedupe/documents
+    filter:
+      include:
+        - >-
+          (.attributes.dedupe_para_ngrams_13_1 | length == 0) or
+          ((.attributes.dedupe_para_ngrams_13_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) <= 0.3)
+
+      syntax: jq
+
+work_dir:
+  input: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v0_rep32_ft7percentile_fw2_dedupe/input
+  output: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v0_rep32_ft7percentile_fw2_dedupe/output
+
+processes: 188
diff --git a/configs/peteish-anneal/fw2-dedupe/part1.yaml b/configs/peteish-anneal/fw2-dedupe/part1.yaml
new file mode 100644
index 00000000..72b86afb
--- /dev/null
+++ b/configs/peteish-anneal/fw2-dedupe/part1.yaml
@@ -0,0 +1,145 @@
+documents:
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0000/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0001/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0002/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0003/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0004/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0005/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0006/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0007/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0008/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0009/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0010/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0011/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0012/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0013/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0014/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0015/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0016/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0017/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0018/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0019/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0020/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0021/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0022/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0023/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0024/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0025/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0026/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0027/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0028/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0029/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0030/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0031/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0032/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0033/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0034/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0035/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0036/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0037/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0038/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0039/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0040/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0041/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0042/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0043/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0044/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0045/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0046/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0047/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0048/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0049/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0050/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0051/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0052/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0053/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0054/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0055/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0056/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0057/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0058/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0059/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0060/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0061/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0062/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0063/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0064/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0065/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0066/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0067/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0068/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0069/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0070/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0071/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0072/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0073/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0074/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0075/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0076/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0077/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0078/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0079/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0080/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0081/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0082/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0083/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0084/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0085/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0086/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0087/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0088/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0089/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0090/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0091/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0092/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0093/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0094/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0095/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0096/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0097/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0098/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0099/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0100/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0101/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0102/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0103/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0104/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0105/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0106/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0107/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0108/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0109/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0110/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0111/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0112/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0113/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0114/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0115/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0116/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0117/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0118/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0119/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0120/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0121/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0122/*zst
+
+
+dedupe:
+  name: dedupe_para_ngrams_13_1
+  paragraphs:
+    attribute_name: dedupe_para_ngrams_13_1
+    by_ngram:
+      ngram_length: 13
+      stride: 1
+      overlap_threshold: 0.5
+      skip_short_paragraphs: true
+  skip_empty: true
+
+bloom_filter:
+  file: ${oc.env:HOME}/bloomp/fw2-part1.bin
+  read_only: false
+  # set to the number of words (rather than documents)
+  estimated_doc_count: 300_711_504_079
+  desired_false_positive_rate: 0.1
+
+processes: 16
diff --git a/configs/peteish-anneal/fw2-dedupe/part2.yaml b/configs/peteish-anneal/fw2-dedupe/part2.yaml
new file mode 100644
index 00000000..3b393b77
--- /dev/null
+++ b/configs/peteish-anneal/fw2-dedupe/part2.yaml
@@ -0,0 +1,146 @@
+documents:
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0123/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0124/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0125/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0126/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0127/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0128/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0129/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0130/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0131/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0132/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0133/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0134/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0135/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0136/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0137/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0138/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0139/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0140/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0141/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0142/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0143/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0144/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0145/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0146/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0147/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0148/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0149/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0150/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0151/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0152/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0153/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0154/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0155/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0156/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0157/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0158/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0159/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0160/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0161/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0162/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0163/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0164/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0165/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0166/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0167/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0168/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0169/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0170/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0171/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0172/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0173/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0174/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0175/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0176/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0177/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0178/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0179/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0180/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0181/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0182/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0183/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0184/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0185/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0186/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0187/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0188/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0189/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0190/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0191/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0192/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0193/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0194/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0195/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0196/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0197/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0198/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0199/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0200/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0201/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0202/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0203/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0204/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0205/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0206/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0207/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0208/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0209/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0210/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0211/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0212/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0213/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0214/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0215/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0216/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0217/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0218/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0219/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0220/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0221/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0222/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0223/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0224/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0225/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0226/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0227/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0228/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0229/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0230/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0231/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0232/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0233/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0234/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0235/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0236/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0237/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0238/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0239/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0240/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0241/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0242/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0243/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0244/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0245/*zst
+  - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0246/*zst
+
+
+dedupe:
+  name: dedupe_para_ngrams_13_1
+  paragraphs:
+    attribute_name: dedupe_para_ngrams_13_1
+    by_ngram:
+      ngram_length: 13
+      stride: 1
+      overlap_threshold: 0.5
+      skip_short_paragraphs: true
+  skip_empty: true
+
+bloom_filter:
+  file: ${oc.env:HOME}/bloomp/fw2-part2.bin
+  read_only: false
+  # set to the number of words (rather than documents)
+  estimated_doc_count: 300_711_504_079
+  desired_false_positive_rate: 0.1
+
+processes: 16
diff --git a/configs/peteish-anneal/mix-fw25.yaml b/configs/peteish-anneal/mix-fw25.yaml
new file mode 100644
index 00000000..a46bbd17
--- /dev/null
+++ b/configs/peteish-anneal/mix-fw25.yaml
@@ -0,0 +1,47 @@
+# HuggingFaceFW_fineweb_edu_classifier/score
+# ┏━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┓
+# ┃ value         ┃ dist     ┃ count       ┃
+# ┡━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━┩
+# │ [-1.0, 0.5)   │ 0.0556   │ 6,120,032   │
+# │ [0.5, 0.8)    │ 0.1115   │ 12,283,459  │
+# │ [0.8, 1.0)    │ 0.1117   │ 12,305,723  │
+# │ [1.0, 1.2)    │ 0.1123   │ 12,367,897  │
+# │ [1.2, 1.4)    │ 0.1110   │ 12,220,008  │
+# │ [1.4, 1.7)    │ 0.1098   │ 12,094,336  │
+# │ [1.7, 2.0)    │ 0.1106   │ 12,180,628  │
+# │ [2.0, 2.4)    │ 0.1109   │ 12,216,375  │
+# │ [2.4, 3.1)    │ 0.1113   │ 12,262,622  │
+# │ [3.1, 5.4]    │ 0.0553   │ 6,088,265   │
+# └───────────────┴──────────┴─────────────┘
+
+streams:
+  - name: dclm
+    documents:
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/full/*.zstd
+    attributes:
+      - random_number_v1
+      - HuggingFaceFW_fineweb_edu_classifier
+    output:
+      max_size_in_bytes: 3_814_697_265
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v1_fwEdu25/documents/full
+      discard_fields:
+        - attributes
+
+    compression:
+      input: zst
+      output: zst
+
+    filter:
+      include:
+        # Keep docs where 0.8 * FineWeb-Edu score + 0.2 * (5 * random number) >= 2
+        - >-
+          (.attributes.HuggingFaceFW_fineweb_edu_classifier[0][-1] * 0.8) +
+          (.attributes.random_number_v1__random_number_v1__random[0][-1] * 5 * 0.2) >= 2
+      syntax: jq
+    span_replacement: []
+
+work_dir:
+  input: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v1_fwEdu25/input
+  output: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v1_fwEdu25/output
+
+processes: 188
diff --git a/configs/peteish-anneal/mix-nvidia25.yaml b/configs/peteish-anneal/mix-nvidia25.yaml
new file mode 100644
index 00000000..88be94a6
--- /dev/null
+++ b/configs/peteish-anneal/mix-nvidia25.yaml
@@ -0,0 +1,83 @@
+# nvidia_quality_classifier_deberta_Low/score
+# ┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┓
+# ┃ value            ┃ dist    ┃ count      ┃
+# ┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━┩
+# │ [0.000, 0.001)   │ 0.0935  │ 10,302,917 │
+# │ [0.001, 0.002)   │ 0.2006  │ 22,092,046 │
+# │ [0.002, 0.003)   │ 0.1299  │ 14,307,537 │
+# │ [0.003, 0.004)   │ 0.0756  │ 8,326,514  │
+# │ [0.004, 0.006)   │ 0.0847  │ 9,329,927  │
+# │ [0.006, 0.011)   │ 0.0887  │ 9,771,702  │
+# │ [0.011, 0.031)   │ 0.0860  │ 9,471,766  │
+# │ [0.031, 0.215)   │ 0.0955  │ 10,520,370 │
+# │ [0.215, 0.938)   │ 0.0956  │ 10,530,852 │
+# │ [0.938, 0.993]   │ 0.0498  │ 5,485,714  │
+# └──────────────────┴─────────┴────────────┘
+# nvidia_quality_classifier_deberta_Medium/score
+# ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┓
+# ┃ value             ┃ dist     ┃ count       ┃
+# ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━┩
+# │ [0.004, 0.069)    │ 0.0558   │ 6,141,162   │
+# │ [0.069, 0.244)    │ 0.1110   │ 12,222,713  │
+# │ [0.244, 0.462)    │ 0.1113   │ 12,254,640  │
+# │ [0.462, 0.700)    │ 0.1110   │ 12,229,105  │
+# │ [0.700, 0.857)    │ 0.1112   │ 12,247,913  │
+# │ [0.857, 0.932)    │ 0.1120   │ 12,331,155  │
+# │ [0.932, 0.966)    │ 0.1127   │ 12,411,860  │
+# │ [0.966, 0.982)    │ 0.1104   │ 12,155,159  │
+# │ [0.982, 0.991)    │ 0.1194   │ 13,147,942  │
+# │ [0.991, 0.994]    │ 0.0454   │ 4,997,696   │
+# └───────────────────┴──────────┴─────────────┘
+# nvidia_quality_classifier_deberta_High/score
+# ┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━┓
+# ┃ value            ┃ dist    ┃ count       ┃
+# ┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━┩
+# │ [0.000, 0.001)   │ 0.1535  │ 16,910,071  │
+# │ [0.001, 0.002)   │ 0.0819  │ 9,021,261   │
+# │ [0.002, 0.004)   │ 0.0869  │ 9,570,058   │
+# │ [0.004, 0.010)   │ 0.0959  │ 10,562,952  │
+# │ [0.010, 0.031)   │ 0.1079  │ 11,883,866  │
+# │ [0.031, 0.095)   │ 0.1038  │ 11,429,509  │
+# │ [0.095, 0.274)   │ 0.1057  │ 11,646,421  │
+# │ [0.274, 0.572)   │ 0.1060  │ 11,670,182  │
+# │ [0.572, 0.813)   │ 0.1058  │ 11,652,356  │
+# │ [0.813, 0.975]   │ 0.0526  │ 5,792,669   │
+# └──────────────────┴─────────┴─────────────┘
+
+streams:
+  - name: dclm
+    documents:
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/full/*.zstd
+    attributes:
+      - random_number_v1
+      - nvidia_quality_classifier_deberta
+    output:
+      max_size_in_bytes: 3_814_697_265
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v1_nvidia25/documents/full
+      discard_fields:
+        - attributes
+
+    compression:
+      input: zst
+      output: zst
+
+    filter:
+      include:
+        # Blend each nvidia quality score (High/Medium/Low) with a random number and threshold it
+        - >-
+          (.attributes.nvidia_quality_classifier_deberta_High[0][-1] * 0.9) +
+          (.attributes.random_number_v1__random_number_v1__random[0][-1] * 0.1) >= 0.274
+        - >-
+          (.attributes.nvidia_quality_classifier_deberta_Medium[0][-1] * 0.5) +
+          (.attributes.random_number_v1__random_number_v1__random[0][-1] * 0.5) >= 0.932
+        - >-
+          (.attributes.nvidia_quality_classifier_deberta_Low[0][-1] * 0.2) +
+          (.attributes.random_number_v1__random_number_v1__random[0][-1] * 0.8) >= 0.938
+      syntax: jq
+    span_replacement: []
+
+work_dir:
+  input: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v1_nvidia25/input
+  output: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v1_nvidia25/output
+
+processes: 188
diff --git a/configs/peteish-anneal/mmlu-web/decontaminate.sh b/configs/peteish-anneal/mmlu-web/decontaminate.sh
new file mode 100644
index 00000000..9e067ddb
--- /dev/null
+++ b/configs/peteish-anneal/mmlu-web/decontaminate.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPT_PATH=$(realpath "$0")
+
+bloom_filter_file=/tmp/oe-eval-data-dedupe_ngrams_8_1-train_dev_test.bin
+remote_bloom_filter_file=s3://ai2-llm/bloom-filters/oe-eval-data-dedupe_ngrams_8_1-20241018-train_dev_test.bin
+
+aws s3 cp $remote_bloom_filter_file $bloom_filter_file
+size=331605257
+
+dolma dedupe \
+    --documents \
+        "${HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_minhash_dedup/documents/*.json.zst" \
+    --dedupe.name dedupe_ngrams_8_1_all_train \
+    --dedupe.paragraphs.attribute_name dedupe_ngrams_8_1_all_train \
+    --dedupe.paragraphs.by_ngram.ngram_length 8 \
+    --dedupe.paragraphs.by_ngram.skip_short_paragraphs \
+    --dedupe.paragraphs.by_ngram.stride 1 \
+    --dedupe.paragraphs.by_ngram.overlap_threshold 0 \
+    --dedupe.skip_empty \
+    --bloom_filter.file $bloom_filter_file \
+    --bloom_filter.read_only \
+    --bloom_filter.estimated_doc_count $size \
+    --bloom_filter.desired_false_positive_rate 0.001 \
+    --processes "$(expr $(nproc) - 4)"
+
+
+dolma -c "$(dirname ${SCRIPT_PATH})/remove_all_train.yaml" mix --processes $(expr $(nproc) - 4)
diff --git a/configs/peteish-anneal/mmlu-web/dedupe.yaml b/configs/peteish-anneal/mmlu-web/dedupe.yaml
new file mode 100644
index 00000000..c67902e9
--- /dev/null
+++ b/configs/peteish-anneal/mmlu-web/dedupe.yaml
@@ -0,0 +1,22 @@
+documents:
+  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web/documents/full/*.zst
+
+dedupe:
+  name: dedupe_para_ngrams_13_1
+  paragraphs:
+    attribute_name: dedupe_para_ngrams_13_1
+    by_ngram:
+      ngram_length: 13
+      stride: 1
+      overlap_threshold: 0.5
+      skip_short_paragraphs: true
+  skip_empty: true
+
+bloom_filter:
+  file: ${oc.env:HOME}/bloom/mmmlu_web_dedupe_para_ngrams_13_1.bloom
+  read_only: false
+  # set to the number of words
+  estimated_doc_count: 64_000_000_000
+  desired_false_positive_rate: 0.1
+
+processes: 188
diff --git a/configs/peteish-anneal/mmlu-web/make.yaml b/configs/peteish-anneal/mmlu-web/make.yaml
new file mode 100644
index 00000000..9df09403
--- /dev/null
+++ b/configs/peteish-anneal/mmlu-web/make.yaml
@@ -0,0 +1,28 @@
+streams:
+  - name: dclm
+    documents:
+      - s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/full/*.zstd
+    attributes:
+      - flashcards_domains_v1
+    output:
+      max_size_in_bytes: 3_814_697_265
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web/documents/full
+      discard_fields:
+        - attributes
+
+    compression:
+      input: zst
+      output: zst
+
+    filter:
+      include:
+        # Only include documents whose domains have flashcards content
+        - .attributes.flashcards_domains_v1__flashcards_domains_v1__url != null
+      syntax: jq
+    span_replacement: []
+
+work_dir:
+  input: ${oc.env:HOME}/ai2-llm/work_dir/dclm/mmlu_web/input
+  output: ${oc.env:HOME}/ai2-llm/work_dir/dclm/mmlu_web/output
+
+processes: 188
diff --git a/configs/peteish-anneal/mmlu-web/mix.yaml b/configs/peteish-anneal/mmlu-web/mix.yaml
new file mode 100644
index 00000000..afc89853
--- /dev/null
+++ b/configs/peteish-anneal/mmlu-web/mix.yaml
@@ -0,0 +1,23 @@
+streams:
+  - name: dclm
+    documents:
+       - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web/documents/full/*.zst
+    attributes:
+      - dedupe_para_ngrams_13_1
+
+    output:
+      max_size_in_bytes: 1_073_741_824
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_deduped/documents/
+    filter:
+      include:
+        - >-
+          (.attributes.dedupe_para_ngrams_13_1 | length == 0) or
+          ((.attributes.dedupe_para_ngrams_13_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) <= 0.3)
+
+      syntax: jq
+
+work_dir:
+  input: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v0_mmlu_web_deduped/input
+  output: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v0_mmlu_web_deduped/output
+
+processes: 188
diff --git a/configs/peteish-anneal/mmlu-web/remove_all_train.yaml b/configs/peteish-anneal/mmlu-web/remove_all_train.yaml
new file mode 100644
index 00000000..b4bc0d56
--- /dev/null
+++ b/configs/peteish-anneal/mmlu-web/remove_all_train.yaml
@@ -0,0 +1,13 @@
+streams:
+  - name: dclm
+    documents: &documents
+      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_minhash_dedup/documents/*.json.zst
+    attributes: &attributes
+      - dedupe_ngrams_8_1_all_train
+    output:
+      max_size_in_bytes: 200_000_000
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_minhash_dedup_decontam/documents
+    filter:
+      exclude:
+        - ([.attributes.dedupe_ngrams_8_1_all_train[] | select(.[2] >= 0.1)] | length != 0)
+      syntax: jq
diff --git a/configs/peteish-anneal/mmlu-web/tokenize.yaml b/configs/peteish-anneal/mmlu-web/tokenize.yaml
new file mode 100644
index 00000000..7cdc88f3
--- /dev/null
+++ b/configs/peteish-anneal/mmlu-web/tokenize.yaml
@@ -0,0 +1,16 @@
+destination: ${oc.env:HOME}/ai2-llm/preprocessed/dclm/v0_mmlu_web_minhash_dedup_decontam/allenai/dolma2-tokenizer
+documents:
+  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_minhash_dedup_decontam/documents/*
+
+processes: 128
+seed: 3920
+max_size: 4_294_967_296
+dtype: uint32
+
+tokenizer:
+  name_or_path: allenai/dolma2-tokenizer
+  bos_token_id: null
+  eos_token_id: 100257
+  pad_token_id: 100277
+  segment_before_tokenization: false
+  encode_special_tokens: true
diff --git a/configs/peteish-anneal/olmoe.sh b/configs/peteish-anneal/olmoe.sh
new file mode 100644
index 00000000..64f6620b
--- /dev/null
+++ b/configs/peteish-anneal/olmoe.sh
@@ -0,0 +1,47 @@
+collections=(
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/dclm/*/*.json.zst"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/flan/*.json.gz"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/codesearchnet-owmfilter/*/*.jsonl.gz"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/basic_math/*TRAIN.jsonl"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/gsm8k-synth/resample_v1_6x/*.jsonl"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/gsm_mind/*/*.jsonl"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/gsm8k/*/train/*.jsonl.zst"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/mathcoder2-synthmath/ajibawa-2023/*.jsonl"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/mathcoder2-synthmath/m-a-p_Matrix/*/*.jsonl"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/metamath-owmfilter/*.jsonl.gz"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/tinyGSM-MIND/*/*.jsonl.gz"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/tulu_math/*/*.jsonl"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/pes2o/*.json.gz"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/stackexchange/*.json.gz"
+    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/wiki/*.json.gz"
+)
+
+for path in "${collections[@]}"; do
+    name=$(echo "${path}" | sed -E 's|.*/documents/([^*]+).*|\1|' | sed 's:^/::; s:/$::')
+    destination="${HOME}/ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/${name}"
+
+    echo "Tokenizing $path to $destination"
+    echo "Number of files: $(ls -1 $path 2>/dev/null | wc -l)"
+
+    if [[ "$name" == *"dclm"* ]]; then
+        processes=$(($(nproc) - 4))
+    else
+        processes=20
+    fi
+
+    set -ex
+    dolma tokens \
+        --documents "${path}" \
+        --destination $destination \
+        --no-tokenizer.segment_before_tokenization \
+        --tokenizer.name_or_path "allenai/gpt-neox-olmo-dolma-v1_5" \
+        --tokenizer.eos_token_id 50279 \
+        --tokenizer.pad_token_id 1 \
+	--tokenizer.encode_special_tokens \
+        --processes ${processes} \
+        --seed 3920 \
+        --max_size 1073741824 \
+        --sample_ring_prop \
+        --dtype uint16
+    set +ex
+done
diff --git a/configs/peteish-anneal/olmoe_mix.yaml b/configs/peteish-anneal/olmoe_mix.yaml
new file mode 100644
index 00000000..4280b988
--- /dev/null
+++ b/configs/peteish-anneal/olmoe_mix.yaml
@@ -0,0 +1,44 @@
+target_size: 200G
+
+sources:
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/dclm/*.npy
+    mix_percent: 0.4922
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/pes2o/*.npy
+    mix_percent: 0.0652
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/flan/*.npy
+    mix_percent: 0.1667
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/codesearchnet-owmfilter/*.npy
+    sample_percent: 2.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/dolmino_math_synth/basic_math/*.npy
+    sample_percent: 2.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/dolmino_math_synth/gsm_mind/*.npy
+    sample_percent: 2.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/dolmino_math_synth/gsm8k-synth/resample_v1_6x/*.npy
+    sample_percent: 2.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/gsm8k/*.npy
+    sample_percent: 2.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/mathcoder2-synthmath/*/*.npy
+    sample_percent: 2.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/metamath-owmfilter/*.npy
+    sample_percent: 2.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/tinyGSM-MIND/*.npy
+    sample_percent: 2.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/tulu_math/*.npy
+    sample_percent: 2.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/stackexchange/*.npy
+    sample_percent: 2.0
+
+  - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/wiki/*.npy
+    sample_percent: 2.0
diff --git a/configs/peteish-anneal/stackexchange/fuzzy-dedupe.yaml b/configs/peteish-anneal/stackexchange/fuzzy-dedupe.yaml
new file mode 100644
index 00000000..2cfdfca8
--- /dev/null
+++ b/configs/peteish-anneal/stackexchange/fuzzy-dedupe.yaml
@@ -0,0 +1,22 @@
+documents:
+  - s3://ai2-llm/pretraining-data/sources/stackexchange/v0/documents/20240930/*zst
+
+dedupe:
+  name: dedupe_para_ngrams_13_1
+  paragraphs:
+    attribute_name: dedupe_para_ngrams_13_1
+    by_ngram:
+      ngram_length: 13
+      stride: 1
+      overlap_threshold: 0.5
+      skip_short_paragraphs: true
+  skip_empty: true
+
+bloom_filter:
+  file: ${oc.env:HOME}/stackexchange.bin
+  read_only: false
+  # set to the number of words
+  estimated_doc_count: 10_000_000_000
+  desired_false_positive_rate: 0.01
+
+processes: 16
diff --git a/configs/peteish-anneal/stackexchange/mix-base.yaml b/configs/peteish-anneal/stackexchange/mix-base.yaml
new file mode 100644
index 00000000..390af697
--- /dev/null
+++ b/configs/peteish-anneal/stackexchange/mix-base.yaml
@@ -0,0 +1,26 @@
+streams:
+  - name: stackexchange
+    documents:
+      - s3://ai2-llm/pretraining-data/sources/stackexchange/v0/documents/20240930/*zst
+    attributes:
+      - dedupe_para_ngrams_13_1
+    output:
+      max_size_in_bytes: 1_073_741_824
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/stackexchange/v1_dedupe/documents
+    filter:
+      include:
+        - >-
+          (.attributes.dedupe_para_ngrams_13_1 | length == 0) or
+          ((.attributes.dedupe_para_ngrams_13_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) <= 0.3)
+      exclude:
+        - >-
+          .metadata.question_score < 3
+        - >-
+          .metadata.answer_score < 5
+      syntax: jq
+
+work_dir:
+  input: ${oc.env:HOME}/ai2-llm/work_dir/stackexchange/v1_dedupe_para_ngrams_13_1/input
+  output: ${oc.env:HOME}/ai2-llm/work_dir/stackexchange/v1_dedupe_para_ngrams_13_1/output
+
+processes: 188
diff --git a/configs/peteish-anneal/stackexchange/tokens.yaml b/configs/peteish-anneal/stackexchange/tokens.yaml
new file mode 100644
index 00000000..9b2ec938
--- /dev/null
+++ b/configs/peteish-anneal/stackexchange/tokens.yaml
@@ -0,0 +1,16 @@
+destination: ${oc.env:HOME}/ai2-llm/preprocessed/stackexchange/v1_dedupe/documents/allenai/dolma2-tokenizer
+documents:
+  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/stackexchange/v1_dedupe/documents/*
+
+processes: 16
+seed: 3920
+max_size: 4_294_967_296
+dtype: uint32
+
+tokenizer:
+  name_or_path: allenai/dolma2-tokenizer
+  bos_token_id: null
+  eos_token_id: 100257
+  pad_token_id: 100277
+  segment_before_tokenization: false
+  encode_special_tokens: true
diff --git a/configs/peteish-anneal/tokens-fw25.yaml b/configs/peteish-anneal/tokens-fw25.yaml
new file mode 100644
index 00000000..58406235
--- /dev/null
+++ b/configs/peteish-anneal/tokens-fw25.yaml
@@ -0,0 +1,16 @@
+destination: ${oc.env:HOME}/ai2-llm/preprocessed/sources/dclm/v1_fwEdu25/documents/full/allenai/dolma2-tokenizer
+documents:
+  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v1_fwEdu25/documents/full/*
+
+processes: 128
+seed: 3920
+max_size: 4_294_967_296
+dtype: uint32
+
+tokenizer:
+  name_or_path: allenai/dolma2-tokenizer
+  bos_token_id: null
+  eos_token_id: 100257
+  pad_token_id: 100277
+  segment_before_tokenization: false
+  encode_special_tokens: true
diff --git a/configs/peteish-anneal/tokens-nvidia25.yaml b/configs/peteish-anneal/tokens-nvidia25.yaml
new file mode 100644
index 00000000..162ebdac
--- /dev/null
+++ b/configs/peteish-anneal/tokens-nvidia25.yaml
@@ -0,0 +1,16 @@
+destination: ${oc.env:HOME}/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/documents/full/allenai/dolma2-tokenizer
+documents:
+  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v1_nvidia25/documents/full/*
+
+processes: 128
+seed: 3920
+max_size: 4_294_967_296
+dtype: uint32
+
+tokenizer:
+  name_or_path: allenai/dolma2-tokenizer
+  bos_token_id: null
+  eos_token_id: 100257
+  pad_token_id: 100277
+  segment_before_tokenization: false
+  encode_special_tokens: true
diff --git a/pyproject.toml b/pyproject.toml
index a4957551..cf67d64d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -112,6 +112,8 @@ dev = [
     "isort>=5.10.1",
     "mypy>=0.971",
     "pytest>=5.2",
+    "types-PyYAML",
+    "types-dateparser"
 ]
 # extension to process code
 code = ["detect-secrets==1.4.0", "beautifulsoup4>=4", "pygments", "regex"]
diff --git a/python/dolma/cli/deduper.py b/python/dolma/cli/deduper.py
index de6a43d5..d263d4ca 100644
--- a/python/dolma/cli/deduper.py
+++ b/python/dolma/cli/deduper.py
@@ -192,7 +192,6 @@ def run(cls, parsed_config: DeduperConfig):
             # perform some path validation to make sure we don't call the mixer with invalid config
             total_matching_documents = 0
             for document in parsed_config.documents:
-
                 if not any(
                     fnmatch.fnmatch(dict_config["dedupe"]["document_dir"], part) for part in document.split(os.sep)
                 ):
diff --git a/python/dolma/cli/mixer.py b/python/dolma/cli/mixer.py
index 943d7f74..2d39d149 100644
--- a/python/dolma/cli/mixer.py
+++ b/python/dolma/cli/mixer.py
@@ -144,7 +144,6 @@ def run(cls, parsed_config: MixerConfig):
                 # perform some path validation to make sure we don't call the mixer with invalid config
                 total_matching_documents = 0
                 for document in stream_config.documents:
-
                     current_matching_documents = sum(1 for _ in glob_path(document))
                     if current_matching_documents == 0:
                         # only raise a warning if no documents are found for a single path
diff --git a/python/dolma/core/runtime.py b/python/dolma/core/runtime.py
index ac5e2a23..d14a4cc0 100644
--- a/python/dolma/core/runtime.py
+++ b/python/dolma/core/runtime.py
@@ -27,8 +27,17 @@
     TaggerOutputDictType,
 )
 from .errors import DolmaFatalError, DolmaRetryableFailure, DolmaShardError
+from .loggers import get_logger
 from .parallel import BaseParallelProcessor, QueueType
-from .paths import delete_dir, join_path, make_relative, mkdir_p, split_glob, split_path
+from .paths import (
+    delete_dir,
+    exists,
+    join_path,
+    make_relative,
+    mkdir_p,
+    split_glob,
+    split_path,
+)
 from .registry import TaggerRegistry
 from .utils import import_modules, make_variable_name
 
@@ -178,10 +187,10 @@ def _make_output_streams(
                 mkdir_p(parent)
 
                 # open a new file and create a new encoder
-                io = stack.enter_context(smart_open.open(loc.path, **open_kwargs))
+                io_ = stack.enter_context(smart_open.open(loc.path, **open_kwargs))
                 encoder = msgspec.json.Encoder()
                 opened[loc.path] = TaggerOutputIO(
-                    exp=loc.exp, taggers=set(), path=loc.path, io=io, encoder=encoder
+                    exp=loc.exp, taggers=set(), path=loc.path, io=io_, encoder=encoder
                 )
 
             # keep track of which taggers are writing to this paths
@@ -223,7 +232,7 @@ def _write_sample_to_streams(
 
 class TaggerProcessor(BaseParallelProcessor):
     @classmethod
-    def increment_progressbar(  # type: ignore
+    def increment_progressbar(  # type: ignore  # pylint: disable=arguments-differ
         cls,
         queue: QueueType,  # queue must be the first argument, and it should be a positional-only argument
         /,
@@ -245,6 +254,10 @@ def process_single(
         **kwargs,
     ):
         """Lets count run the taggers! We will use the destination path to save each tagger output."""
+
+        # get a logger
+        logger = get_logger(cls.__name__)
+
         # import tagger modules
         taggers_modules = kwargs.get("taggers_modules", None)
         if taggers_modules is not None:
@@ -264,7 +277,9 @@ def process_single(
 
         # this is the dictionary that will hold the output of each tagger
         taggers_paths = _determine_output_paths_for_taggers(
-            experiment_name=experiment_name, destination=destination_path, taggers=taggers
+            experiment_name=experiment_name,
+            destination=destination_path,
+            taggers=taggers,
         )
 
         # skip on failure
@@ -283,6 +298,27 @@ def process_single(
         # total number of documents processed
         total_docs_cnt = 0
 
+        if not kwargs.get("ignore_existing", False):
+            # we group taggers by their path (this is for cases when two taggers are going to the same file)
+            # and then remove all taggers whose path already exists (ignore_existing is False here)
+            _taggers_by_path: Dict[str, list[str]] = {}
+            for tagger_name, tagger_path in taggers_paths.items():
+                _taggers_by_path.setdefault(tagger_path.path, []).append(tagger_name)
+
+            # actually take care of removal here
+            for tagger_path, tagger_names in _taggers_by_path.items():
+                if exists(tagger_path):
+                    for tagger_name in tagger_names:
+                        logger.info("Skipping %s because %s already exists.", tagger_name, tagger_path)
+                        taggers.pop(tagger_name)
+                        taggers_paths.pop(tagger_name)
+
+            if not taggers:
+                # if all taggers have been removed, we return early
+                cls.increment_progressbar(queue, files=1)
+                logger.info("All taggers for %s have been skipped.", source_path)
+                return
+
         # creating dedicated decoder speeds up the process
         # if any of the taggers require metadata, we use a decoder that can handle it
         # otherwise, we use a decoder that does not parse metadata, which is faster
@@ -327,7 +363,7 @@ def process_single(
                             # double the update interval if the queue is full
                             update_interval *= 2
 
-            except Exception as exp:
+            except Exception as exp:  # pylint: disable=broad-except
                 # handle any exception that might have occurred
                 msg = f"Failed to process {source_path} due to {exp.__class__.__name__}: {' '.join(exp.args)}"
                 if exp.__class__.__name__ == "IncompleteReadError":
diff --git a/python/dolma/taggers/language.py b/python/dolma/taggers/language.py
index 121fd5c6..91fdedfe 100644
--- a/python/dolma/taggers/language.py
+++ b/python/dolma/taggers/language.py
@@ -4,14 +4,14 @@
 @kylel, @soldni
 """
 
-from typing import TYPE_CHECKING, List, Tuple
+from typing import TYPE_CHECKING, Iterable, List, Tuple
 
 import necessary
 import regex
 from anyascii import anyascii
 
 from ..core.data_types import DocResult, Document, Span
-from ..core.ft_tagger import BaseFastTextTagger
+from ..core.ft_tagger import BaseFastTextTagger, Prediction, TextSlice
 from ..core.registry import TaggerRegistry
 from ..core.taggers import BaseTagger
 from ..core.utils import split_paragraphs
@@ -32,14 +32,17 @@
 
 with necessary.necessary("lingua", soft=True) as LINGUA_AVAILABLE:
     if LINGUA_AVAILABLE or TYPE_CHECKING:
-        from lingua import Language, LanguageDetectorBuilder
+        from lingua import (  # pylint: disable=import-error # pyright: ignore
+            Language,
+            LanguageDetectorBuilder,
+        )
 
 
 class BaseLanguageTagger(BaseTagger):
     INCLUDE_NEGATIVE = True
     PREDICT_ON_PARAGRAPHS = False
 
-    def predict_text(self, text: str) -> List[Tuple[str, float]]:
+    def predict_text(self, text: str) -> List[Tuple[str, float]]:  # pylint: disable=unused-argument
         return []
 
     def make_negative(self, spans: List[Span]) -> List[Span]:
@@ -79,7 +82,7 @@ def __init__(self) -> None:
             raise ImportError(f"cld3 is not installed, cannot instantiate {self.__class__.__name__}")
 
     def predict_text(self, text: str) -> List[Tuple[str, float]]:
-        pred = cld3.get_language(text)  # pyright: ignore
+        pred = cld3.get_language(text)  # pyright: ignore # pylint: disable=possibly-used-before-assignment
         score = pred.probability if pred.language == "en" else 0.0
         return [("en", score)]
 
@@ -114,7 +117,7 @@ def predict_text(self, text: str) -> List[Tuple[str, float]]:
         is_reliable = False
         for fn in (self._identity_fn, self._to_ascii_input, self._sanitize_input):
             try:
-                is_reliable, _, details = cld2.detect(fn(text))
+                is_reliable, _, details = cld2.detect(fn(text))  # pylint: disable=possibly-used-before-assignment
                 break
             except cld2.error:
                 ...
@@ -146,13 +149,16 @@ class Cld2EnglishLanguageParagraphTagger(Cld2EnglishLanguageTagger):
 
 @TaggerRegistry.add("ft_lang_id_doc_v1")
 class FastTextAllLanguagesDocumentTagger(BaseLanguageTagger, BaseFastTextTagger):
-    MODEL_PATH = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
+    MODEL_PATH = "https://dolma-artifacts.org/lang_id_models/fbai/lid.176.bin"
     INCLUDE_NEGATIVE = False
     PREDICT_ON_PARAGRAPHS = False
 
     def __init__(self):
         BaseFastTextTagger.__init__(self, model_path=self.MODEL_PATH, model_mode=self.DOCUMENT_LEVEL_TAGGER)
 
+    def predict_slice(self, text_slice: TextSlice) -> Iterable[Prediction]:
+        raise RuntimeError("This method should not be called; please report this issue.")
+
     def predict_text(self, text: str) -> List[Tuple[str, float]]:
         preds = self.classifier.predict(text.lower().replace("\n", " ").strip(), k=-1)
         return [(label.replace("__label__", ""), float(score)) for label, score in zip(*preds)]
@@ -165,6 +171,16 @@ def predict_text(self, text: str) -> List[Tuple[str, float]]:
         return [(lang, round(score, 2)) for lang, score in out if score > 0.01]
 
 
+@TaggerRegistry.add("glotlid_doc_v3")
+class FastTextAllLanguagesDocumentGlotV3Tagger(FastTextAllLanguagesDocumentTagger):
+    MODEL_PATH = "https://dolma-artifacts.org/lang_id_models/cis-lmu/glotlid/model_v3.bin"
+
+
+@TaggerRegistry.add("glotlid_doc_v3_1e2")
+class FastTextAllLanguagesDocumentGlotV3MinScoreTagger(FastTextAllLanguagesDocumentMinScoreTagger):
+    MODEL_PATH = "https://dolma-artifacts.org/lang_id_models/cis-lmu/glotlid/model_v3.bin"
+
+
 @TaggerRegistry.add("ft_lang_id_paragraph_v1")
 class FastTextAllLanguageParagraphTagger(FastTextAllLanguagesDocumentTagger):
     INCLUDE_NEGATIVE = False
@@ -203,7 +219,8 @@ def __init__(self) -> None:
         if not LANGDETECT_AVAILABLE:
             raise ImportError("langdetect is not installed, please run `pip install dolma[lang]`.")
 
-        (factory := DetectorFactory()).load_profile(PROFILES_DIRECTORY)
+        factory = DetectorFactory()  # pylint: disable=possibly-used-before-assignment
+        factory.load_profile(PROFILES_DIRECTORY)  # pylint: disable=possibly-used-before-assignment
         factory.set_seed(0)
         self.detector = factory.create()
         super().__init__()
@@ -213,7 +230,7 @@ def predict_text(self, text: str) -> List[Tuple[str, float]]:
             self.detector.append(text)
             langs = self.detector.get_probabilities()
             output = [(str(r.lang.strip().lower()), float(r.prob)) for r in langs]
-        except LangDetectException:
+        except LangDetectException:  # pylint: disable=possibly-used-before-assignment
             output = []
         finally:
             self.detector.text = ""
@@ -253,7 +270,11 @@ def __init__(self) -> None:
         super().__init__()
         if not LINGUA_AVAILABLE:
             raise ImportError("langdetect is not installed, please run `pip install dolma[lang]`.")
-        self.detector = LanguageDetectorBuilder.from_languages(*Language.all()).build()
+
+        all_languages = Language.all()  # pylint: disable=possibly-used-before-assignment
+        self.detector = LanguageDetectorBuilder.from_languages(  # pylint: disable=possibly-used-before-assignment
+            *all_languages
+        ).build()
 
     def predict_text(self, text: str) -> List[Tuple[str, float]]:
         langs_conf = self.detector.compute_language_confidence_values(text) or []
diff --git a/python/dolma/taggers/url.py b/python/dolma/taggers/url.py
index 374a94df..285d828a 100644
--- a/python/dolma/taggers/url.py
+++ b/python/dolma/taggers/url.py
@@ -116,8 +116,11 @@ def clean_url(cls, url: str) -> Generator[str, None, None]:
         if url is None or not url.strip():
             return
 
-        parsed = urllib3.util.parse_url(url)
-        yield f"{parsed.host}{(f':{parsed.port}') if parsed.port else ''}{parsed.path or ''}".rstrip("/").lower()
+        try:
+            p_url = urllib3.util.parse_url(url)
+            yield f"{p_url.host}{(f':{p_url.port}') if p_url.port else ''}{p_url.path or ''}".rstrip("/").lower()
+        except Exception:
+            LOGGER.info(f"Failed to parse URL: {url}")
 
     def check_url(self, url: str) -> bool:
         return url in self.blocklist
@@ -215,6 +218,11 @@ class BlocklistProjectNsfwTagger(BaseDomainTagger):
     BLOCKLIST_PATHS = ["https://dolma-artifacts.org/blocklist_project/blocklist_project-20240207/porn.txt"]
 
 
+@TaggerRegistry.add("flashcards_domains_v1")
+class FlashcardsDomainsTagger(BaseDomainTagger):
+    BLOCKLIST_PATHS = ["https://dolma-artifacts.org/flashcard_domains/flashcard_domains-20241113/domains.txt"]
+
+
 @TaggerRegistry.add("blocklist_project_social_v1")
 class BlocklistProjectSocialTagger(BaseDomainTagger):
     BLOCKLIST_PATHS = [
diff --git a/scripts/inspect_tokenized.py b/scripts/inspect_tokenized.py
new file mode 100644
index 00000000..42439f9e
--- /dev/null
+++ b/scripts/inspect_tokenized.py
@@ -0,0 +1,47 @@
+import os
+import click
+from dolma.core.paths import cached_path
+import numpy as np
+from transformers import AutoTokenizer
+
+
+@click.command()
+@click.argument("tokenized_file")
+@click.option("--tokenizer-name-or-path", default="allenai/gpt-neox-olmo-dolma-v1_5")
+@click.option("--chunk-size", default=1024**2, type=int)
+def inspect_tokenized(tokenized_file: str, tokenizer_name_or_path: str, chunk_size: int):
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+
+    print('Vocab size:', tokenizer.vocab_size)
+    print('BOS token:', tokenizer.bos_token_id)
+    print('EOS token:', tokenizer.eos_token_id)
+    print('PAD token:', tokenizer.pad_token_id)
+    print('UNK token:', tokenizer.unk_token_id)
+
+    path = cached_path(tokenized_file)
+    size = os.path.getsize(path)
+    data = np.memmap(path, dtype='uint16', mode='r', shape=(size // 2,))
+
+    collection = []
+    i = 0
+    while i < len(data):
+        chunk = data[i : i + chunk_size]
+        i += chunk_size
+
+        while (chunk == tokenizer.eos_token_id).any():
+            # split chunk into before and after eos
+            eos_idx = np.where(chunk == tokenizer.eos_token_id)[0][0] + 1
+            collection.extend(chunk[:eos_idx].tolist())
+            output = tokenizer.decode(collection)
+            print('#' * os.get_terminal_size().columns)
+            print(output)
+            input("#" * os.get_terminal_size().columns)
+            # reset collection
+            collection = []
+            chunk = chunk[eos_idx:]
+
+        collection.extend(chunk.tolist())
+
+
+if __name__ == "__main__":
+    inspect_tokenized()
diff --git a/scripts/make_npy_mix.py b/scripts/make_npy_mix.py
new file mode 100644
index 00000000..3f2e819f
--- /dev/null
+++ b/scripts/make_npy_mix.py
@@ -0,0 +1,218 @@
+#!/usr/bin/env python3
+
+import argparse
+import fnmatch
+import logging
+import math
+import random
+import sys
+from dataclasses import dataclass
+from typing import Callable, Generator
+from urllib.parse import urlparse
+
+import boto3
+import yaml
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+units_map = {
+    "B": 9,
+    "T": 12,
+    "G": 9,
+    "M": 6,
+    "K": 3,
+}
+
+
+@dataclass(frozen=True)
+class SourceConfig:
+    source: str
+    mix_percent: float | None = None
+    sample_percent: float = 1.0
+
+    def __post_init__(self):
+        if self.mix_percent is not None and (self.mix_percent < 0 or self.mix_percent > 1):
+            raise ValueError("mix_percent must be between 0 and 1")
+
+    @property
+    def bucket(self) -> str:
+        return urlparse(self.source).netloc
+
+    @property
+    def prefix(self) -> str:
+        path = urlparse(self.source).path.lstrip("/")
+        for i, char in enumerate(path):
+            if char in ["*", "?", "["]:
+                return path[:i]
+        return path
+
+    def sample(self, total_size: int) -> tuple[list[str], int]:
+        """Randomly pick files until the byte target is met; returns (paths, total selected bytes)."""
+        formatter = make_formatter(total_size)
+
+        try:
+            all_paths, all_sizes = map(list, zip(*self.glob))
+        except ValueError:
+            raise ValueError(f"No files found for source {self.source}")
+
+        source_size = sum(all_sizes)
+        if self.mix_percent is not None:
+            target_size = int(round(total_size * self.mix_percent))
+        else:
+            target_size = int(round(source_size * self.sample_percent))
+
+        # Zero-byte sources never advance running_size, so the loop below would spin forever.
+        if target_size > 0 and source_size <= 0:
+            raise ValueError(f"Source {self.source} has no data to sample from")
+
+        logger.info(
+            f"Sampling {formatter(target_size)} bytes from {formatter(source_size)} "
+            f"from {self.source} ({target_size / total_size:.2%})"
+        )
+
+        running_size = 0
+        selected = []
+
+        # double while loop to allow for sampling over 100% if needed
+        while running_size < target_size:
+            all_paths_copy, all_sizes_copy = all_paths[:], all_sizes[:]
+            while len(all_paths_copy) > 0:
+                idx = random.randint(0, len(all_paths_copy) - 1)
+                selected.append(all_paths_copy.pop(idx))
+                running_size += all_sizes_copy.pop(idx)
+                if running_size >= target_size:
+                    break
+
+        return selected, running_size
+
+    @property
+    def glob(self) -> Generator[tuple[str, int], None, None]:
+        """Yield (s3 path, size) for each `.npy` object listed under `prefix` that matches `source`."""
+        client = boto3.client("s3")
+
+        # Use paginator to handle cases with many objects
+        paginator = client.get_paginator("list_objects_v2")
+        page_iterator = paginator.paginate(Bucket=self.bucket, Prefix=self.prefix)
+
+        # `prefix` stops at "*", "?" and "["; testing only "*" here let "?" / "[...]" patterns
+        # skip fnmatch and accept every key under the prefix.
+        has_wildcard = any(char in self.source for char in "*?[")
+
+        for page in page_iterator:
+            for obj in page.get("Contents", []):
+                path = f"s3://{self.bucket}/{obj['Key']}"
+                if not path.endswith(".npy"):
+                    continue
+                if not has_wildcard or fnmatch.fnmatch(path, self.source):
+                    yield path, obj["Size"]
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "SourceConfig":
+        """Build a source from a config mapping; a missing or null "sample_percent" means 1.0."""
+        # Compare against None: `or 1.0` silently turned an explicit 0 into 100%.
+        percent = data.get("sample_percent")
+        percent = 1.0 if percent is None else percent
+        return cls(source=data["source"], mix_percent=data.get("mix_percent"), sample_percent=percent)
+
+
+@dataclass(frozen=True)
+class SamplingConfig:
+    target_size: float | int | str
+    sources: list[SourceConfig]
+    output: str | None = None
+    seed: int = 42
+
+    def __post_init__(self):
+        if isinstance(self.target_size, str):
+            # check if string is in the format "xxxS" where S is a suffix for size (e.g. G, M, K)
+            try:
+                self.size
+            except ValueError as e:
+                raise ValueError("Invalid target size format") from e
+
+        if len(self.sources) == 0:
+            raise ValueError("Must specify at least one source")
+
+        random.seed(self.seed)
+
+    @property
+    def size(self) -> int:
+        """Target size as an integer, expanding suffixed strings such as "1.5G" via `units_map`."""
+        if isinstance(self.target_size, (float, int)):
+            return int(self.target_size)
+        # Slicing (not indexing) the suffix keeps "" from raising IndexError, so malformed
+        # input always surfaces as the ValueError that `__post_init__` translates.
+        number, suffix = self.target_size[:-1], self.target_size[-1:].upper()
+        try:
+            size = float(number)
+        except ValueError:
+            raise ValueError("Invalid target size format")
+        if (digits := units_map.get(suffix)) is None:
+            raise ValueError("Invalid target size suffix")
+        return int(size * 10 ** digits)
+
+    @classmethod
+    def from_yaml(cls, path: str) -> "SamplingConfig":
+        with open(path) as f:
+            data = yaml.safe_load(f)
+        return cls.from_dict(data)
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "SamplingConfig":
+        """Build a config from a parsed YAML mapping; "output" and "seed" are optional."""
+        sources = [SourceConfig.from_dict(source) for source in data.get("sources", [])]
+        # "seed" used to be ignored here, leaving the dataclass default as the only possible value.
+        seed = data.get("seed", 42)
+        return cls(target_size=data["target_size"], sources=sources, output=data.get("output"), seed=seed)
+
+
+def make_formatter(total_size: int) -> Callable[[int], str]:
+    num_digits = (math.floor(math.log10(total_size))) // 3 * 3
+    suffix = {v: k for k, v in units_map.items()}.get(num_digits, f"e{num_digits}")
+
+    def formatter(size: int, _num_digits: int = num_digits, _suffix: str = suffix) -> str:
+        value = size / 10 ** _num_digits
+        return f"{value:.1f}{_suffix}"
+
+    return formatter
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Sample files from S3 datasets")
+    parser.add_argument("config", type=str, help="Path to config YAML file")
+    args = parser.parse_args()
+
+    if args.config == "-":
+        config = SamplingConfig.from_dict(yaml.safe_load(sys.stdin))
+    else:
+        config = SamplingConfig.from_yaml(args.config)
+
+    formatter = make_formatter(config.size)
+
+    total = 0
+    rows = ["data:", "  paths:"]
+    for source in config.sources:
+        paths, size = source.sample(config.size)
+        logger.info(
+            f"Selected {len(paths)} files from {source.source} "
+            f"({formatter(size)} bytes; {size / config.size:.2%})"
+        )
+        rows.append(f"\n    # {source.source} ({formatter(size)};{size / config.size:.2%})")
+        rows.extend([f"    - {path}" for path in sorted(paths)])
+        total += size
+
+    logger.info(f"Total size: {formatter(config.size)} bytes requested, "
+                f"{formatter(total)} bytes selected ({total / config.size:.2%})")
+
+    output_text = "\n".join(rows)
+    if config.output:
+        with open(config.output, "w") as f:
+            f.write(output_text)
+    else:
+        print(output_text)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/tokenize_sft_dataset.py b/scripts/tokenize_sft_dataset.py
new file mode 100644
index 00000000..7b93e65f
--- /dev/null
+++ b/scripts/tokenize_sft_dataset.py
@@ -0,0 +1,216 @@
+"""
+Script for preparing the Tulu data for fine-tuning an OLMo model.
+
+python scripts/tokenize_sft_dataset.py \
+    --tokenizer.name_or_path allenai/dolma2-tokenizer \
+    --tokenizer.bos_token_id 100257 \
+    --tokenizer.eos_token_id 100257 \
+    --tokenizer.pad_token_id 100277 \
+    --dataset.path allenai/tulu-v3.9-tmp
+
+"""
+
+from argparse import ArgumentParser
+from dataclasses import dataclass
+from functools import partial
+from pathlib import Path
+
+import datasets as ds
+import numpy as np
+
+from dolma.tokenizer.tokenizer import Tokenizer
+from dolma.cli.tokenizer import TokenizerConfig
+from dolma.cli import field, BaseCli
+
+
+@dataclass
+class DatasetConfig:
+    path: str | None = field(default=None, help="Path or name of the dataset. Required.")
+    name: str | None = field(default=None, help="Defining the name of the dataset configuration.")
+    split: str | None = field(default='train', help="Name of the split to load.")
+
+
+@dataclass
+class TokenizationConfig:
+    tokenizer: TokenizerConfig = field(default=TokenizerConfig(), help="Configuration for the tokenizer.")
+    dataset : DatasetConfig = field(default=DatasetConfig(), help="Configuration for the dataset.")
+    processes: int = field(default=1, help="Number of parallel processes to use.")
+    output_dir: str = field(help="Output directory to save the tokenized data.")
+    max_seq_len: int = field(default=4096, help="Maximum sequence length.")
+    max_label_len: int | None = field(default=None, help="Maximum label length.")
+    dtype: None | str = field(default=None, help="Data type for the tokenized data.")
+    max_tokens_per_file: int = field(default=2 ** 32, help="Maximum number of tokens per file.")
+
+
+def run_tokenizer(opts: TokenizationConfig) -> None:
+    assert opts.tokenizer is not None, "Tokenizer configuration is missing."
+    assert opts.tokenizer.name_or_path is not None, "Tokenizer name or path must be provided."
+    assert getattr(opts, "output_dir", None) is not None, "Output directory is missing."
+
+    opts.max_label_len = opts.max_label_len or opts.max_seq_len
+
+    tokenizer_config = {}
+    if opts.tokenizer.bos_token_id is not None:
+        tokenizer_config["bos_token_id"] = opts.tokenizer.bos_token_id
+    if opts.tokenizer.eos_token_id is not None:
+        tokenizer_config["eos_token_id"] = opts.tokenizer.eos_token_id
+    if opts.tokenizer.pad_token_id is not None:
+        tokenizer_config["pad_token_id"] = opts.tokenizer.pad_token_id
+
+    if Path(opts.tokenizer.name_or_path).is_file():
+        tokenizer = Tokenizer.from_file(opts.tokenizer.name_or_path, **tokenizer_config)
+    else:
+        tokenizer = Tokenizer.from_pretrained(opts.tokenizer.name_or_path, **tokenizer_config)
+
+    expected_bits = int(np.ceil(np.log2(tokenizer.vocab_size) / 16)) * 16
+    expected_dtype = f"uint{expected_bits}"
+
+    if opts.dtype is not None and opts.dtype != expected_dtype:
+        raise ValueError(f"Invalid data type, expected: {expected_dtype}, got: {opts.dtype}")
+    elif opts.dtype is None:
+        np_dtype = getattr(np, expected_dtype)
+    else:
+        np_dtype = getattr(np, opts.dtype)
+
+    assert opts.dataset is not None, "Dataset configuration is missing."
+    assert opts.dataset.path is not None, "Dataset path is missing."
+
+    dataset_config = {}
+    if opts.dataset.name is not None:
+        dataset_config["name"] = opts.dataset.name
+    if opts.dataset.split is not None:
+        dataset_config["split"] = opts.dataset.split
+
+    dataset = ds.load_dataset(opts.dataset.path, **dataset_config)
+
+    # # sample 10k
+    # dataset = dataset.shuffle(seed=42).select(range(10000))
+
+    print("Tokenizing dataset...")
+    dataset = dataset.map(
+        partial(preprocess, tokenizer=tokenizer, max_seq_len=opts.max_seq_len),
+        batched=False,
+        remove_columns=dataset.column_names,  # type: ignore
+        num_proc=opts.processes,    # type: ignore
+        desc="Tokenizing dataset",  # type: ignore
+    )
+
+    print("Filtering dataset...")
+    n = len(dataset)  # type: ignore
+    dataset = dataset.filter(
+        partial(filter_long_sequences, max_label_len=opts.max_label_len, max_seq_len=opts.max_seq_len),
+        batched=False,
+        num_proc=opts.processes,
+        desc="Filtering sequences that are too long",
+    )
+    print(f"Filtered out {n - len(dataset):,d} examples")
+
+    print(f"Saving results to '{opts.output_dir}'...")
+    output_dir = Path(opts.output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    total_tokens = len(dataset) * opts.max_seq_len
+    batch_size = int(np.floor((opts.max_tokens_per_file / total_tokens) * len(dataset)))
+    print(f"Saving {len(dataset):,d} examples to {output_dir} in batches of {batch_size:,d} examples")
+
+    dataset.map(
+        partial(save_memmap, output_dir=output_dir, batch_size=batch_size, dtype=np_dtype),
+        batched=True,
+        batch_size=batch_size,
+        num_proc=opts.processes,
+        desc="Saving memmaps",
+        remove_columns=dataset.column_names,  # type: ignore
+        with_indices=True,
+    )
+
+
+def save_memmap(
+    data: dict[str, list],
+    idx: list[int],
+    output_dir: Path,
+    batch_size: int,
+    dtype: np.dtype
+) -> dict[str, list]:
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    pos = idx[0] // batch_size
+    size = sum(len(input_ids) for input_ids in data["input_ids"])
+    input_ids_mm = np.memmap(output_dir / f"input_ids_{pos:06d}.npy", dtype=dtype, mode="w+", shape=(size,))
+    label_mask_mm = np.memmap(output_dir / f"label_mask_{pos:06d}.npy", dtype=np.bool_, mode="w+", shape=(size,))
+
+    offset = 0
+    for input_ids, label_mask in zip(data["input_ids"], data["label_mask"]):
+        n = len(input_ids)
+        input_ids_mm[offset : offset + n] = input_ids
+        label_mask_mm[offset : offset + n] = label_mask
+        offset += n
+
+    input_ids_mm.flush()
+    label_mask_mm.flush()
+
+    return {}
+
+
+def filter_long_sequences(example: dict, max_label_len: int = 2 ** 30, max_seq_len: int = 2 ** 30) -> bool:
+    return (
+        example["n_labels"] > 0
+        and example["n_labels"] <= max_label_len
+        and example["n_total"] <= max_seq_len
+    )
+
+
+def preprocess(example: dict, tokenizer: Tokenizer, max_seq_len: int) -> dict:
+    """Tokenize a chat example into `max_seq_len` padded ids plus a mask that is True on assistant tokens."""
+    eos_token = tokenizer.base_tokenizer.id_to_token(tokenizer.eos_token_id)
+
+    input_ids = [tokenizer.bos_token_id]
+    label_mask = [False]
+
+    for msg in example["messages"]:
+        role_tokens = tokenizer.encode(f"<|{msg['role']}|>\n", add_special_tokens=False)
+        label_mask += [False] * len(role_tokens)
+        input_ids += role_tokens
+
+        if msg["role"] == "assistant":
+            content_tokens = tokenizer.encode(
+                msg["content"].strip() + eos_token + "\n", add_special_tokens=False
+            )
+            label_mask += [True] * len(content_tokens)
+            # mask out the last '\n'
+            assert content_tokens[-2] == tokenizer.eos_token_id
+            label_mask[-1] = False
+        else:
+            content_tokens = tokenizer.encode(msg["content"].strip() + "\n", add_special_tokens=False)
+            label_mask += [False] * len(content_tokens)
+        input_ids += content_tokens
+
+    # Truncate once up front, so the former `elif len(input_ids) > max_seq_len` branch was unreachable.
+    # NOTE(review): n_total is measured after truncation and so never exceeds max_seq_len - confirm intended.
+    input_ids = input_ids[:max_seq_len]
+    label_mask = label_mask[:max_seq_len]
+
+    n_total = len(input_ids)
+
+    if len(input_ids) < max_seq_len:
+        pad_len = max_seq_len - len(input_ids)
+        input_ids += [tokenizer.pad_token_id] * pad_len
+        label_mask += [False] * pad_len
+
+    assert len(input_ids) == len(label_mask)
+    n_labels = sum(label_mask)
+
+    return {"input_ids": input_ids, "label_mask": label_mask, "n_labels": n_labels, "n_total": n_total}
+
+
+class SftTokenizerCli(BaseCli):
+    CONFIG = TokenizationConfig
+    DESCRIPTION = "Tokenize the Tulu V2 SFT dataset."
+
+    @classmethod
+    def run(cls, parsed_config: TokenizationConfig):
+        run_tokenizer(parsed_config)
+
+
+if __name__ == "__main__":
+    parser = SftTokenizerCli.make_parser(ArgumentParser())
+    SftTokenizerCli.run_from_args(parser.parse_args())
diff --git a/search/README.md b/search/README.md
new file mode 100644
index 00000000..eda8873a
--- /dev/null
+++ b/search/README.md
@@ -0,0 +1,80 @@
+# Dolma Search
+
+Dolma Search is a toolkit for easy indexing and searching of data in Dolma format. It provides functionality to create, manage, and query indexes using the Tantivy search engine.
+
+## Features
+
+- Create and manage Tantivy indexes
+- Index documents from various sources, including local files and S3 buckets
+- Perform searches on indexed data with customizable queries
+- Display search results in different formats (JSON, table, or snippet view)
+
+## Installation
+
+You can install Dolma Search using pip:
+
+```shell
+git clone https://github.com/allenai/dolma.git
+pip install ./dolma/search
+```
+
+## Usage
+
+### Indexing
+
+To index documents, use the `dolma-search index` command (backed by the `dolma_search.index` module). Here's an example of how to use it:
+
+```shell
+dolma-search index \
+    -i /path/to/index \
+    -d "s3://ai2-llm/pretraining-data/sources/path/to/documents/*.gz"
+```
+
+The following command line options are available:
+
+| Option | Short | Description | Default |
+|--------|-------|-------------|---------|
+| `--documents` | `-d` | The documents to index. Can be any glob pattern supported by smart-open library. | Required |
+| `--index-path` | `-i` | The path to the index. If not provided, an in-memory index will be used. | None |
+| `--force` | `-f` | If the index already exists, delete it and create a new one. | False |
+| `--num-readers` | `-n` | The number of readers to use. | 1 |
+| `--num-indexers` | `-N` | The number of indexers to use. | 1 |
+| `--reader-batch-size` | `-b` | The batch size for readers. | 1000 |
+| `--indexer-batch-size` | `-B` | The batch size for indexers. | 1000 |
+| `--heap-size` | `-H` | The heap size for the index writer. | 1GB |
+| `--queue-size-per-thread` | `-q` | The size of the queue to use for storing documents. | 125 |
+
+
+
+### Searching
+
+To search the indexed documents, use the `dolma-search query` command (backed by the `dolma_search.query` module). Here's an example of how to use it:
+
+
+```shell
+dolma-search query \
+    -i /data/flan_index \
+    -q "What is the capital of France?"
+```
+
+You can also pass search queries from stdin by setting the query to `-`:
+
+```shell
+cat queries.txt | dolma-search query -i /data/flan_index -q -
+```
+
+You can choose which format to display the results in. Valid options are:
+
+- `json`: Print the results in JSON format with no coloring; perfect for piping to another program that can parse JSONL output.
+- `table`: Print the results in a table format with coloring.
+- `snippet`: Print the results in a table format with coloring; snippets containing matches, rather than the full document, are displayed.
+
+Other options for the `query` command include:
+
+| Option | Short | Description | Default |
+|--------|-------|-------------|---------|
+| `--index-path` | `-i` | The path to the index. | Required |
+| `--query` | `-q` | The query to search for. If not provided, enters interactive mode. If set to "-", reads queries from stdin. | None |
+| `--num-hits` | `-n` | The number of hits to return. | 10 |
+| `--display-format` | `-f` | The format to display the search results in. Options: table, json, snippet. | json |
+| `--selector` | `-s` | The selector used to process the queries. Uses jq syntax. | None |
diff --git a/search/pyproject.toml b/search/pyproject.toml
new file mode 100755
index 00000000..ad36f1a3
--- /dev/null
+++ b/search/pyproject.toml
@@ -0,0 +1,107 @@
+[project]
+name = "dolma-search"
+version = "0.1.0"
+description = "Toolkit for easy indexing of data in Dolma format."
+authors = [
+    {name = "Luca Soldaini", email = "lucas@allenai.org" }
+]
+license = {text = "Apache-2.0"}
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "msgspec>=0.18.6",
+    "jq>=1.8.0,<2.0.0",
+    "fsspec[http]<=2024.6.1,>=2023.1.0",
+    "tantivy>=0.18.0",
+    "smart-open>=7.0.4,<8.0.0",
+    "rich>=13.5.0,<14.0.0",
+    "markdownify>=0.13.1,<0.14.0"
+]
+
+[project.urls]
+"Homepage" = "https://github.com/allenai/dolma"
+"Repository" = "https://github.com/allenai/dolma"
+"Bug Tracker" = "https://github.com/allenai/dolma/issues"
+
+
+[tool.setuptools.packages.find]
+where = ["python"]
+
+[tool.setuptools.package-data]
+dolma_search = ["py.typed", "*.pyi"]
+
+
+[project.scripts]
+dolma-search = "dolma_search.__main__:main"
+
+
+[build-system]
+build-backend = "setuptools.build_meta"
+requires = [
+    "setuptools >= 61.0.0",
+    "wheel"
+]
+
+[project.optional-dependencies]
+dev = [
+    "black>=22.6.0",
+    "isort>=5.10.1",
+    "mypy>=0.971",
+    "pytest>=5.2",
+    "ipython>=8.4.0",
+    "autopep8>=1.7.0",
+    "flake8>=5.0",
+    "ipdb>=0.13.0",
+    "flake8-pyi>=22.8.1",
+    "Flake8-pyproject>=1.1.0"
+]
+
+[tool.black]
+line-length = 115
+include = '\.pyi?$'
+exclude = '''
+(
+      __pycache__
+    | \.git
+    | \.mypy_cache
+    | \.pytest_cache
+    | \.vscode
+    | \.venv
+    | \bdist\b
+    | \bdoc\b
+)
+'''
+
+[tool.isort]
+profile = "black"
+line_length = 115
+multi_line_output = 3
+
+[tool.autopep8]
+max_line_length = 115
+in-place = true
+recursive = true
+aggressive = 3
+
+[tool.mypy]
+python_version = "3.10"
+ignore_missing_imports = true
+no_site_packages = true
+allow_redefinition = false
+warn_unused_configs = true
+warn_unused_ignores = true
+warn_no_return = true
+warn_return_any = false
+warn_unreachable = true
+show_error_codes = true
+pretty = true
+
+[tool.mypy-tests]
+strict_optional = false
+
+[tool.flake8]
+per-file-ignores = [
+    '__init__.py:F401',
+    '*.pyi:E302,E305',
+    '*.py:E203'
+]
diff --git a/search/python/dolma_search/__init__.py b/search/python/dolma_search/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/search/python/dolma_search/__main__.py b/search/python/dolma_search/__main__.py
new file mode 100644
index 00000000..3f970e38
--- /dev/null
+++ b/search/python/dolma_search/__main__.py
@@ -0,0 +1,33 @@
+import argparse
+import sys
+
+from . import index, query
+
+CLI_DESCRIPTION = "Dolma Search CLI"
+
+
+def main():
+    """Entry point for `dolma-search`: dispatch to the `index` or `query` sub-command."""
+    # ArgumentParser's first positional parameter is `prog`; pass the text as `description`.
+    parser = argparse.ArgumentParser(description=CLI_DESCRIPTION)
+    subparsers = parser.add_subparsers(dest="command", help="Available commands")
+
+    index_parser = subparsers.add_parser("index", help=index.INDEX_DESCRIPTION)
+    index.make_index_parser(index_parser)
+
+    query_parser = subparsers.add_parser("query", help=query.QUERY_DESCRIPTION)
+    query.make_search_parser(query_parser)
+
+    args = parser.parse_args()
+
+    if args.command == "index":
+        index.index_data(args)
+    elif args.command == "query":
+        query.search_data(args)
+    else:
+        parser.print_help()
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/search/python/dolma_search/common.py b/search/python/dolma_search/common.py
new file mode 100644
index 00000000..6ebec22b
--- /dev/null
+++ b/search/python/dolma_search/common.py
@@ -0,0 +1,31 @@
+import shutil
+from enum import Enum
+from pathlib import Path
+
+from tantivy import Index, SchemaBuilder
+
+
+class IndexFields(Enum):
+    TEXT = "text"
+    ID = "id"
+    SOURCE = "source"
+
+
+def create_index(path: str | Path | None = None, reuse: bool = False) -> Index:
+    """Build the text/id/source schema and return an Index at `path`; unless `reuse`, an existing path is removed."""
+    schema_builder = SchemaBuilder()
+    schema_builder.add_text_field(IndexFields.TEXT.value, stored=True)
+    schema_builder.add_text_field(IndexFields.ID.value, stored=True)
+    schema_builder.add_text_field(IndexFields.SOURCE.value, stored=True)
+    schema = schema_builder.build()
+
+    if path:
+        path = Path(path)
+        if not reuse and path.exists():
+            shutil.rmtree(path)
+        path.mkdir(parents=True, exist_ok=True)
+
+    # Forward None untouched when no path was given: str(None) is the literal "None",
+    # which would be treated as a directory name instead of requesting an in-memory index.
+    index = Index(schema, path=str(path) if path else None, reuse=reuse)
+    return index
diff --git a/search/python/dolma_search/index.py b/search/python/dolma_search/index.py
new file mode 100644
index 00000000..df7235d3
--- /dev/null
+++ b/search/python/dolma_search/index.py
@@ -0,0 +1,204 @@
+"""
+python -m dolma_search.index \
+    -i /data/flan_index \
+    -d "s3://ai2-llm/pretraining-data/sources/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/documents/train/*.gz" \
+    -n 4 \
+    -N 12 \
+    -b 1000 \
+    -B 50000 \
+    -f
+"""
+
+import argparse
+import json
+import logging
+import shutil
+import time
+from contextlib import ExitStack
+from functools import partial
+from multiprocessing import Manager, Pool, set_start_method
+from pathlib import Path
+from queue import Queue
+from urllib.parse import urlparse
+
+import fsspec
+import smart_open
+import tqdm
+from tantivy import Document, Index, SchemaBuilder
+
+from .common import IndexFields, create_index
+
+INDEX_DESCRIPTION = "Index documents into a tantivy index"
+
+
+QueueType = Queue[Document | None]
+
+
+def get_fs(uri: str) -> fsspec.AbstractFileSystem:
+    return fsspec.filesystem(urlparse(uri).scheme or "file")
+
+
+def list_path(pattern: str) -> list[str]:
+    fs = get_fs(pattern)
+    protocol = urlparse(pattern).scheme
+    paths = []
+    for path in fs.glob(pattern):
+        if protocol:
+            paths.append(f"{protocol}://{path}")
+        else:
+            paths.append(str(path))
+    del fs
+    return paths
+
+
+def list_paths(glob_patterns: list[str], num_workers: int = 1) -> list[str]:
+    with Pool(processes=num_workers) as pool:
+        return [p for ps in pool.map(list_path, glob_patterns) for p in ps]
+
+
+def read_file_for_indexing(file_path: str, docs_queue: Queue[list[Document]], batch_size: int = 1_000):
+    batch: list[Document] = []
+    with smart_open.open(file_path, "rt", encoding="utf-8") as stream:
+        for line in stream:
+            row = json.loads(line)
+            doc = Document(**{f.value: (row[f.value] or "") for f in IndexFields})
+            batch.append(doc)
+
+            if len(batch) >= batch_size:
+                docs_queue.put(batch)
+                batch = []
+
+    if batch:
+        docs_queue.put(batch)
+
+
+def read_many_and_index(
+    index: Index,
+    paths: list[str],
+    num_readers: int = 1,
+    num_indexers: int = 1,
+    indexer_batch_size: int = 1_000,
+    reader_batch_size: int = 1_000,
+    heap_size: int = 1024 * 1024 * 1024,
+    queue_size: int = 1000,
+):
+    """Read `paths` in a pool of `num_readers` processes and add the queued documents to `index`."""
+    with ExitStack() as stack:
+        reader_pool = stack.enter_context(Pool(processes=num_readers))
+
+        files_pbar = stack.enter_context(
+            tqdm.tqdm(desc="Reading files", unit=" files", unit_scale=True, total=len(paths))
+        )
+        docs_pbar = stack.enter_context(tqdm.tqdm(desc="Indexing documents", unit=" docs", unit_scale=True))
+
+        writer_fn = partial(index.writer, num_threads=num_indexers, heap_size=heap_size)
+        writer = writer_fn()
+
+        docs_queue: Queue[list[Document]] = (manager := Manager()).Queue(queue_size)
+
+        fn = partial(read_file_for_indexing, docs_queue=docs_queue, batch_size=reader_batch_size)
+        async_results = [reader_pool.apply_async(fn, [p], callback=lambda _: files_pbar.update(1)) for p in paths]
+
+        indexed_count = 0
+        while any(not r.ready() for r in async_results) or not docs_queue.empty():
+            # check if there are any documents to index
+            if docs_queue.empty():
+                time.sleep(0.1)
+            else:
+                batch = docs_queue.get()
+                for doc in batch:
+                    writer.add_document(doc)
+                    indexed_count += 1
+
+                if indexed_count >= indexer_batch_size:
+                    docs_pbar.update(indexed_count)
+                    indexed_count = 0
+                    writer.commit()
+
+        # get() re-raises an exception from a reader process; wait() would silently drop it.
+        for r in async_results:
+            r.get()
+
+        if indexed_count:
+            docs_pbar.update(indexed_count)
+            writer.commit()
+        writer.wait_merging_threads()
+
+
+def make_index_parser(parser: argparse.ArgumentParser | None = None):
+    """Register the indexing options on `parser` (a new parser is created when none is given)."""
+    parser = parser or argparse.ArgumentParser(description=INDEX_DESCRIPTION)
+    parser.add_argument(
+        "-d",
+        "--documents",
+        type=str,
+        required=True,
+        nargs="+",
+        help="The documents to index. Can be any glob pattern supported by smart-open library.",
+    )
+    parser.add_argument(
+        "-i", "--index-path",
+        type=str,
+        help="The path to the index. If not provided, an in-memory index will be used.",
+    )
+    parser.add_argument(
+        "-f", "--force", action="store_true", help="If the index already exists, delete it and create a new one."
+    )
+    parser.add_argument("-n", "--num-readers", type=int, default=1, help="The number of readers to use.")
+    parser.add_argument("-N", "--num-indexers", type=int, default=1, help="The number of indexers to use.")
+    parser.add_argument(
+        "-b", "--reader-batch-size",
+        type=int,
+        default=1_000,
+        help="The batch size for readers.",
+    )
+    parser.add_argument(
+        "-B", "--indexer-batch-size",
+        type=int,
+        default=1_000,
+        help="The batch size for indexers.",
+    )
+    parser.add_argument(
+        "-H", "--heap-size",
+        type=int,
+        default=1024 * 1024 * 1024,
+        help="The heap size for the index writer.",
+    )
+    parser.add_argument(
+        "-q",
+        "--queue-size-per-thread",
+        type=int,
+        default=125,
+        help="The size of the queue to use for storing documents.",
+    )
+    return parser
+
+
+def index_data(args: argparse.Namespace):
+    set_start_method("spawn")
+
+    logging.basicConfig(level=logging.INFO)
+    logger = logging.getLogger(__name__)
+
+    index = create_index(args.index_path, reuse=not args.force)
+    logger.info("Created index" + (f" stored at {args.index_path}" if args.index_path else " in memory"))
+
+    files = list_paths(args.documents, num_workers=args.num_readers)
+    logger.info(f"Found {len(files)} files to index")
+
+    # add_paths_to_index(args.index_path, files, num_workers=args.num_workers, batch_size=args.batch_size)
+    read_many_and_index(
+        index,
+        paths=files,
+        num_readers=args.num_readers,
+        num_indexers=args.num_indexers,
+        indexer_batch_size=args.indexer_batch_size,
+        reader_batch_size=args.reader_batch_size,
+        heap_size=args.heap_size,
+        queue_size=args.queue_size_per_thread * args.num_readers,
+    )
+    logger.info("Indexed all documents")
+
+
+if __name__ == "__main__":
+    index_data(make_index_parser().parse_args())
diff --git a/search/python/dolma_search/py.typed b/search/python/dolma_search/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/search/python/dolma_search/query.py b/search/python/dolma_search/query.py
new file mode 100644
index 00000000..8533f067
--- /dev/null
+++ b/search/python/dolma_search/query.py
@@ -0,0 +1,156 @@
+import argparse
+import json
+import sys
+from enum import Enum
+from typing import Any, Generator, NamedTuple, Type
+
+import jq
+from markdownify import markdownify as md
+from rich.console import Console
+from rich.markdown import Markdown
+from rich.table import Table
+from rich.text import Text
+from tantivy import Document, Query, Schema, Searcher, SnippetGenerator
+
+from .common import IndexFields, create_index
+
+QUERY_DESCRIPTION = "Interactive search tool on a tantivy index"
+
+
+class DisplayFormat(Enum):
+    TABLE = "table"
+    JSON = "json"
+    SNIPPET = "snippet"
+
+
+def make_search_parser(parser: argparse.ArgumentParser | None = None):
+    parser = parser or argparse.ArgumentParser(QUERY_DESCRIPTION)
+    parser.add_argument("-i", "--index-path", type=str, required=True, help="The path to the index.")
+    parser.add_argument("-q", "--query", type=str, default=None, help="The query to search for.")
+    parser.add_argument("-n", "--num-hits", type=int, default=10, help="The number of hits to return.")
+    parser.add_argument(
+        "-f",
+        "--display-format",
+        type=DisplayFormat,
+        default=DisplayFormat.JSON,
+        choices=list(DisplayFormat),
+        help="The format to display the search results in.",
+    )
+    parser.add_argument(
+        "-s",
+        "--selector",
+        type=str,
+        default=None,
+        help="The selector used to process the queries. Uses jq syntax.",
+    )
+    return parser
+
+
+def query_iterator(query: str | None) -> Generator[str, None, None]:
+    if query is None:
+        while True:
+            try:
+                query = input("Enter a query: ")
+                yield query
+            except KeyboardInterrupt:
+                print("\nExiting...")
+                break
+    elif query == "-":
+        for line in sys.stdin:
+            yield line.strip()
+    else:
+        yield str(query)
+
+
+def apply_selector(queries: Generator[str, None, None], selector: str | None):
+    selector = jq.compile(selector) if selector else None
+    fn = lambda query: (str(e) for e in selector.input(json.loads(query)).all()) if selector else (str(query),)
+    for query in queries:
+        yield from fn(query)
+
+
+class HitsTuple(NamedTuple):
+    score: float
+    doc: dict[str, list[Any]]
+    rank: int
+
+    def get(self, field: str) -> str:
+        return str(self.doc[field][0])
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "document": {f.value: self.get(f.value) for f in IndexFields},
+            "score": self.score,
+            "rank": self.rank,
+        }
+
+    @classmethod
+    def from_hits(cls: Type["HitsTuple"], hits: list[tuple[float, int]], searcher: Searcher) -> list["HitsTuple"]:
+        return [
+            cls(score=hit_score, doc=searcher.doc(hit_doc_address), rank=rank)  # pyright: ignore
+            for rank, (hit_score, hit_doc_address) in enumerate(hits, start=1)
+        ]
+
+
+def print_hits_table(
+    hits: list[HitsTuple],
+    searcher: Searcher,
+    schema: Schema,
+    query: Query,
+    show_snippets: bool = False,
+    console: Console | None = None,
+):
+    console = console or Console()
+
+    table = Table(title="Search Results", show_header=True, header_style="bold", show_lines=True)
+    table.add_column("Score", justify="right", style="green")
+    table.add_column(IndexFields.ID.value.upper(), style="magenta")
+    table.add_column(IndexFields.SOURCE.value.capitalize(), style="cyan")
+    table.add_column(IndexFields.TEXT.value.capitalize(), style="blue")
+
+    for hit in hits:
+        if show_snippets:
+            snippet_generator = SnippetGenerator.create(
+                searcher=searcher, query=query, schema=schema, field_name=IndexFields.TEXT.value
+            )
+            snippet = snippet_generator.snippet_from_doc(hit.doc)  # pyright: ignore
+            hit_text = Markdown(md(snippet.to_html()).strip())
+        else:
+            hit_text = Text(hit.get(IndexFields.TEXT.value).strip().replace("\n", "\\n"))
+
+        table.add_row(f"{hit.score:.2f}", hit.get("id"), hit.get("source"), str(hit_text))
+
+    console.print(table)
+
+
+def search_data(args: argparse.Namespace):
+    index = create_index(args.index_path, reuse=True)
+    searcher = index.searcher()
+
+    console = Console()
+
+    for query in apply_selector(query_iterator(args.query), args.selector):
+        try:
+            parsed_query = index.parse_query(query)
+        except ValueError as e:
+            raise ValueError(f"Error parsing query `{query}`: {e}")
+
+        hits = searcher.search(parsed_query, limit=args.num_hits).hits
+        parsed_hits = HitsTuple.from_hits(hits, searcher)  # pyright: ignore
+
+        if args.display_format == DisplayFormat.JSON:
+            for row in parsed_hits:
+                print(json.dumps(row.to_dict(), sort_keys=True))
+        else:
+            print_hits_table(
+                hits=parsed_hits,
+                searcher=searcher,
+                schema=index.schema,
+                query=parsed_query,
+                show_snippets=(args.display_format == DisplayFormat.SNIPPET),
+                console=console,
+            )
+
+
+if __name__ == "__main__":
+    search_data(make_search_parser().parse_args())
diff --git a/search/tests/python/__init__.py b/search/tests/python/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/sources/AutoMathText/tokens.py b/sources/AutoMathText/tokens.py
new file mode 100644
index 00000000..e02de1d6
--- /dev/null
+++ b/sources/AutoMathText/tokens.py
@@ -0,0 +1,36 @@
+from copy import deepcopy
+from dolma.cli.tokenizer import TokenizationConfig, TokenizerConfig, TokenizerCli
+from multiprocessing import cpu_count
+import numpy as np
+import os
+
+
+def main():
+    base_config = TokenizationConfig(
+        documents=[],
+        destination=f"{os.environ['HOME'].rstrip('/')}/ai2-llm/preprocessed/math-ai_AutoMathText/v0",
+        tokenizer=TokenizerConfig(
+            name_or_path="allenai/dolma2-tokenizer",
+            bos_token_id=None,
+            eos_token_id=100257,
+            pad_token_id=100277,
+            segment_before_tokenization=False,
+            encode_special_tokens=True,
+        ),
+        processes=cpu_count(),
+        max_size=100_000_000,
+        dtype='uint32',
+        sample_ring_prop=True,
+    )
+
+
+    for subset in ["arxiv/*", "code/*", "web"]:
+        config = deepcopy(base_config)
+        config.documents = [
+            f"/data/math-ai_AutoMathText/v0/documents/{subset}/*.jsonl.gz"
+        ]
+        config.destination = f"{config.destination}/{subset.rstrip('/*')}/{config.tokenizer.name_or_path}"
+        TokenizerCli.run(config)
+
+if __name__ == "__main__":
+    main()
diff --git a/sources/AutoMathText/tokens.sh b/sources/AutoMathText/tokens.sh
new file mode 100644
index 00000000..8aedff38
--- /dev/null
+++ b/sources/AutoMathText/tokens.sh
@@ -0,0 +1,44 @@
+#! /usr/bin/env bash
+
+set -ex
+
+
+dolma tokens \
+    --documents '/data/math-ai_AutoMathText/v0/documents/arxiv/*/*.jsonl.gz' \
+    --destination "${HOME}/ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer" \
+    --tokenizer.name_or_path 'allenai/dolma2-tokenizer' \
+    --tokenizer.eos_token_id 100257 \
+    --tokenizer.pad_token_id 100277 \
+    --no-tokenizer.segment_before_tokenization \
+    --tokenizer.encode_special_tokens \
+    --processes 16 \
+    --max_size 100_000_000 \
+    --dtype 'uint32' \
+    --sample_ring_prop
+
+dolma tokens \
+    --documents '/data/math-ai_AutoMathText/v0/documents/code/*/*.jsonl.gz' \
+    --destination "${HOME}/ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer" \
+    --tokenizer.name_or_path 'allenai/dolma2-tokenizer' \
+    --tokenizer.eos_token_id 100257 \
+    --tokenizer.pad_token_id 100277 \
+    --no-tokenizer.segment_before_tokenization \
+    --tokenizer.encode_special_tokens \
+    --processes 16 \
+    --max_size 100_000_000 \
+    --dtype 'uint32' \
+    --sample_ring_prop
+
+
+dolma tokens \
+    --documents '/data/math-ai_AutoMathText/v0/documents/web/*.jsonl.gz' \
+    --destination "${HOME}/ai2-llm/preprocessed/math-ai_AutoMathText/v0/web/allenai/dolma2-tokenizer" \
+    --tokenizer.name_or_path 'allenai/dolma2-tokenizer' \
+    --tokenizer.eos_token_id 100257 \
+    --tokenizer.pad_token_id 100277 \
+    --no-tokenizer.segment_before_tokenization \
+    --tokenizer.encode_special_tokens \
+    --processes 16 \
+    --max_size 100_000_000 \
+    --dtype 'uint32' \
+    --sample_ring_prop
diff --git a/sources/AutoMathText/v0.py b/sources/AutoMathText/v0.py
new file mode 100644
index 00000000..634437ce
--- /dev/null
+++ b/sources/AutoMathText/v0.py
@@ -0,0 +1,221 @@
+
+import os
+import glob
+
+from contextlib import ExitStack
+from hashlib import md5
+from tempfile import TemporaryDirectory
+from typing import Any, Optional
+import datetime
+from queue import Queue
+import json
+from multiprocessing import cpu_count
+
+import smart_open
+from dolma.core.parallel import BaseParallelProcessor
+
+
+def format_to_dolma_timestamp(timestamp: Optional[datetime.datetime] = None) -> str:
+    """Format a timestamp as a string using near ISO-8601 format."""
+    if timestamp is None:
+        timestamp = datetime.datetime.now()
+    return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f")[:23] + "Z"
+
+
+def parse_date_web(date_str):
+    try:
+        return datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S")
+    except ValueError:
+        return datetime.datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
+
+
+def parse_date_arxiv(date_str):
+    try:
+        return datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S")
+    except ValueError:
+        return datetime.datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
+
+
+def parse_code_date(date_str):
+    try:
+        return datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%fZ")
+    except ValueError:
+        # If milliseconds are not present, try without them
+        return datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")
+
+
+class AutoWebMathProcessor(BaseParallelProcessor):
+    @classmethod
+    def increment_progressbar(
+        cls,
+        queue: Queue,
+        /,
+        files: int = 0,
+        docs: int = 0,
+        words: int = 0,
+    ):
+        """
+        This method is to update the progress bar. We keep
+        track of three things:
+        - files: the number of files processed
+        - read_docs: the number of documents read in
+        - written_docs: the number of documents written out
+            (i.e., the number of documents that are not empty)
+        """
+        super().increment_progressbar(
+            queue,
+            files=files,
+            docs=docs,
+            words=words,
+        )
+
+    @classmethod
+    def process_single(
+        cls,
+        source_path: str,
+        destination_path: str,
+        queue: Queue,
+        **kwargs: Any,
+    ):
+        """
+        This method is called for each file. It reads the file
+        line by line, and writes to the destination file only
+        if the document is not empty.
+        """
+
+        update_every_n_lines = 10_000
+        docs = 0
+        words = 0
+
+        with ExitStack() as stack:
+            # open source and destination files
+            source_file = stack.enter_context(
+                smart_open.open(source_path, "rt")
+            )
+            if destination_path.endswith(".jsonl"):
+                destination_path += ".gz"
+
+            destination_file = stack.enter_context(
+                smart_open.open(destination_path, "wt")
+            )
+
+            # Set a fixed creation date
+            created = datetime.datetime(2024, 1, 23)
+
+            *_, source, subset, _ = source_path.split("/")
+            for ln in source_file:
+                # we first load the json document
+                document = json.loads(ln)
+                docs += 1
+                docid = md5((ln + source + subset).encode('utf-8')).hexdigest()
+
+                metadata = document.pop("meta")
+
+                if "title" in document and "abstract" in document and "text" in document:
+                    # arxiv subset
+                    text = f"{document['title']}\n\n{document['abstract']}\n\n{document['text']}"
+                    metadata["subset"] = subset
+                    metadata["path"] = source_path
+                    metadata["url"] = document.pop("url")
+                    created = parse_date_arxiv(metadata["timestamp"])
+
+                elif "url" in document and "date" in document:
+                    created = parse_date_web(document["date"])
+                    # this is web content
+                    metadata["date"] = document["date"]
+                    metadata["url"] = document["url"]
+                    metadata["path"] = source_path
+                    text = document["text"]
+                elif "text" in document:
+                    if metadata.get("max_stars_repo_stars_event_min_datetime", None) is not None:
+                        created = min(
+                            parse_code_date(metadata["max_stars_repo_stars_event_min_datetime"]),
+                            created
+                        )
+                    if metadata.get("max_forks_repo_forks_event_min_datetime", None) is not None:
+                        created = min(
+                            parse_code_date(metadata["max_forks_repo_forks_event_min_datetime"]),
+                            created
+                        )
+                    text = document["text"]
+                    # this is a code document
+                else:
+                    raise ValueError(f"Unknown document type: {document}")
+
+                output = {
+                    "text": text.strip(),
+                    "source": f"{source}_{subset}",
+                    "added": format_to_dolma_timestamp(),
+                    "created": format_to_dolma_timestamp(created),
+                    "id": docid,
+                    "metadata": metadata
+                }
+
+                words += len(text.split())
+
+                # if the document is not empty,
+                # we write it to output
+                destination_file.write(json.dumps(output) + "\n")
+
+                # we update the progress bar every
+                # update_every_n_lines
+                if docs > update_every_n_lines:
+                    cls.increment_progressbar(queue, docs=docs, words=words)
+                    docs = 0
+                    words = 0
+
+            # we update the progress bar one last time
+            cls.increment_progressbar(
+                queue,
+                files=1,
+                docs=docs,
+                words=words,
+            )
+
+
+def main():
+
+    base_source_prefix = '/data/math-ai_AutoMathText/raw/data'
+    base_destination_prefix = '/data/math-ai_AutoMathText/v0/documents'
+
+
+    jsonl_files = []
+    for root, dirs, files in os.walk(base_source_prefix):
+        for file in files:
+            if file.endswith('.jsonl'):
+                jsonl_files.append(os.path.join(root, file))
+    print(f"Found {len(jsonl_files)} JSONL files.")
+
+
+    with TemporaryDirectory() as tmpdir:
+
+        # Create destination paths by combining end filepath after base_source_prefix with base_destination_prefix
+        destinations = []
+        temp_files = []
+        for jsonl_file in jsonl_files:
+            relative_path = os.path.relpath(jsonl_file, base_source_prefix)
+            destination = os.path.join(base_destination_prefix, relative_path)
+            destination_dir = os.path.dirname(destination)
+            os.makedirs(destination_dir, exist_ok=True)
+            destinations.append(os.path.dirname(destination))
+            temp_file = os.path.join(tmpdir, os.path.dirname(destination))
+            os.makedirs(temp_file, exist_ok=True)
+            temp_files.append(temp_file)
+
+        print(f"Created {len(destinations)} destination paths.")
+
+        # create the processor
+        processor = AutoWebMathProcessor(
+            source_prefix=jsonl_files,
+            destination_prefix=destinations,
+            metadata_prefix=temp_files,
+            num_processes=cpu_count() - 1,
+            debug=False,
+        )
+
+        # run the processor
+        processor()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sources/CodeSearchNet/tokens.sh b/sources/CodeSearchNet/tokens.sh
new file mode 100644
index 00000000..c6b99c48
--- /dev/null
+++ b/sources/CodeSearchNet/tokens.sh
@@ -0,0 +1,17 @@
+#! /usr/bin/env bash
+
+set -ex
+
+
+dolma tokens \
+    --documents 's3://ai2-llm/pretraining-data/sources/code_search_net/v0/documents/train/*/*.jsonl.gz' \
+    --destination "${HOME}/ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer" \
+    --tokenizer.name_or_path 'allenai/dolma2-tokenizer' \
+    --tokenizer.eos_token_id 100257 \
+    --tokenizer.pad_token_id 100277 \
+    --no-tokenizer.segment_before_tokenization \
+    --tokenizer.encode_special_tokens \
+    --processes 16 \
+    --max_size 100_000_000 \
+    --dtype 'uint32' \
+    --sample_ring_prop
diff --git a/sources/CodeSearchNet/v0.py b/sources/CodeSearchNet/v0.py
new file mode 100644
index 00000000..124f8342
--- /dev/null
+++ b/sources/CodeSearchNet/v0.py
@@ -0,0 +1,60 @@
+from hashlib import md5
+import datasets
+import smart_open
+import datetime
+from typing import Optional
+import json
+import tqdm
+from contextlib import ExitStack
+
+dataset_name = "code-search-net/code_search_net"
+version = "v0"
+destination = f"s3://ai2-llm/pretraining-data/sources/{dataset_name.split("/")[1]}/{version}/documents"
+max_docs_per_file = 100_000
+
+def format_to_dolma_timestamp(timestamp: Optional[datetime.datetime] = None) -> str:
+    """Format a timestamp as a string using near ISO-8601 format."""
+    if timestamp is None:
+        timestamp = datetime.datetime.now()
+    return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f")[:23] + "Z"
+
+
+def main():
+    created = format_to_dolma_timestamp(datetime.datetime(2019, 9, 20))
+
+    with tqdm.tqdm(unit=" docs", unit_scale=True) as pbar, ExitStack() as stack:
+        for language in ["python", "java", "javascript", "go", "ruby", "php"]:
+            for split in ["train", "validation", "test"]:
+                pbar.set_description(f"Processing {language}/{split}")
+                fn = 0
+                cnt = 0
+                path = f"{destination}/{split}/{language}/{fn:04d}.jsonl.gz"
+                print(f"\nCreating new output file {path}")
+                f = stack.enter_context(smart_open.open(path, "wt"))
+                dataset = datasets.load_dataset(dataset_name, language, split=split)
+                for row in dataset:
+                    doc = {
+                        "id": md5(row["func_code_url"].encode("utf-8")).hexdigest(),
+                        "text": row.pop("whole_func_string"),
+                        "source": f"{dataset_name}_{language}_{split}",
+                        "added": format_to_dolma_timestamp(),
+                        "created": created,
+                        "metadata": row
+                    }
+                    f.write(json.dumps(doc) + "\n")
+
+                    pbar.update(1)
+                    cnt += 1
+                    if cnt >= max_docs_per_file:
+                        fn += 1
+                        cnt = 0
+                        stack.pop_all().close()
+                        path = f"{destination}/{split}/{language}/{fn:04d}.jsonl.gz"
+                        print(f"\nCreating new output file {path}")
+                        f = stack.enter_context(smart_open.open(path, "wt"))
+                stack.pop_all().close()
+
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sources/MetaMathQA/tokens.py b/sources/MetaMathQA/tokens.py
new file mode 100644
index 00000000..56a66a02
--- /dev/null
+++ b/sources/MetaMathQA/tokens.py
@@ -0,0 +1,33 @@
+from copy import deepcopy
+from dolma.cli.tokenizer import TokenizationConfig, TokenizerConfig, TokenizerCli
+from multiprocessing import cpu_count
+import numpy as np
+import os
+
+
+def main():
+    tokenizer = "allenai/dolma2-tokenizer"
+    base_source = "s3://ai2-llm/pretraining-data/sources"
+    base_destination = f"{os.environ['HOME'].rstrip('/')}/ai2-llm/preprocessed"
+
+    config = TokenizationConfig(
+        documents=[f"{base_source}/meta-math_MetaMathQA/v0/documents/train/*"],
+        destination=f"{base_destination}/meta-math_MetaMathQA/v0/tokens/{tokenizer}",
+        tokenizer=TokenizerConfig(
+            name_or_path=tokenizer,
+            bos_token_id=None,
+            eos_token_id=100257,
+            pad_token_id=100277,
+            segment_before_tokenization=False,
+            encode_special_tokens=True,
+        ),
+        processes=cpu_count(),
+        max_size=100_000_000,
+        dtype='uint32',
+        sample_ring_prop=True,
+        seed=42,
+    )
+    TokenizerCli.run(config)
+
+if __name__ == "__main__":
+    main()
diff --git a/sources/MetaMathQA/v0.py b/sources/MetaMathQA/v0.py
new file mode 100644
index 00000000..1ba48a69
--- /dev/null
+++ b/sources/MetaMathQA/v0.py
@@ -0,0 +1,50 @@
+from hashlib import md5
+import datasets
+import smart_open
+import datetime
+from typing import Optional
+import json
+import tqdm
+
+
+dataset_name = "meta-math/MetaMathQA"
+version = "v0"
+split = "train"
+destination = (
+    f"s3://ai2-llm/pretraining-data/sources/{dataset_name.replace("/", "_")}/"
+    f"{version}/documents/{split}/0000.jsonl.gz"
+)
+
+def format_to_dolma_timestamp(timestamp: Optional[datetime.datetime] = None) -> str:
+    """Format a timestamp as a string using near ISO-8601 format."""
+    if timestamp is None:
+        timestamp = datetime.datetime.now()
+    return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f")[:23] + "Z"
+
+
+def main():
+    dataset = datasets.load_dataset(dataset_name, split=split)
+
+    d = datetime.datetime(2023, 10, 7)
+
+    with smart_open.open(destination, "wt") as f:
+        for row in tqdm.tqdm(dataset, desc="Processing dataset"):
+            doc_id = md5(json.dumps(row).encode("utf-8")).hexdigest()
+            text = row["query"] + "\n" + row["response"]
+            source = f"{dataset_name}_{row['type']}_{split}"
+            added = format_to_dolma_timestamp()
+            created = format_to_dolma_timestamp(d)
+
+            output = {
+                "text": text,
+                "id": doc_id,
+                "source": source,
+                "added": added,
+                "created": created,
+                "version": version,
+                "meta": {**row}
+            }
+            f.write(json.dumps(output) + "\n")
+
+if __name__ == "__main__":
+    main()
diff --git a/sources/books/openstax.py b/sources/books/openstax.py
new file mode 100644
index 00000000..e69de29b
diff --git a/sources/eli5/v0.py b/sources/eli5/v0.py
new file mode 100644
index 00000000..2c191b09
--- /dev/null
+++ b/sources/eli5/v0.py
@@ -0,0 +1,183 @@
+import pandas as pd
+from pathlib import Path
+import json
+import smart_open
+from ftfy import fix_text
+import re
+from contextlib import ExitStack
+import datetime
+
+import tqdm
+
+# S3 prefix under which all output files of this script are written.
+DESTINATION_S3 = "s3://ai2-llm/pretraining-data/sources/max-hoffman_eli5/v0/documents"
+# NOTE(review): not referenced anywhere else in this script - presumably a
+# minimum submission score; confirm whether it should be applied or removed.
+DCLM_SUBMISSION_SCORE = 3
+# Minimum score an answer needs to be written to the filtered outputs.
+DCLM_COMMENT_SCORE = 5
+# Minimum number of answers a question needs for its "dclm" document to be written.
+DCLM_MIN_ANSWERS = 3
+# Value of the "created" field of every document this script writes.
+ELI5_CREATED_AT = datetime.datetime(2019, 7, 22)
+
+
+def format_to_dolma_timestamp(timestamp: datetime.datetime | None = None) -> str:
+    """Format a timestamp as a string using near ISO-8601 format."""
+    if timestamp is None:
+        timestamp = datetime.datetime.now()
+    return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f")[:23] + "Z"
+
+
+
+def safe_json_loads(s: str) -> dict | None:  # pyright: ignore
+    try:
+        return json.loads(s)
+    except json.JSONDecodeError:
+        return None
+
+
+def replace_urls(row: pd.Series) -> str:
+    text, urls = tuple(row)
+    if not text:
+        return text
+    for i, url in enumerate(urls['url']):
+        text = text.replace(f"_URL_{i}_", url)
+    return text
+
+
+def read_eli5_data(split: str, data_dir: str = "/Users/lucas/code/eli5/data"):
+    data_path = Path(data_dir)
+    df = pd.read_parquet(data_path / f"eli5_{split}.parquet")
+
+    df["title_urls"] = df["title_urls"].apply(json.loads)
+    df["selftext_urls"] = df["selftext_urls"].apply(json.loads)
+    df["answers_urls"] = df["answers_urls"].apply(json.loads)
+
+    # # replace urls in title, selftext, and answers
+    df["title_with_urls"] = df[["title", "title_urls"]].apply(replace_urls, axis=1)
+    df["selftext_with_urls"] = df[["selftext", "selftext_urls"]].apply(replace_urls, axis=1)
+    df["answers_with_urls"] = df[["answers", "answers_urls"]].apply(replace_urls, axis=1)
+
+    # this is the one that might fail
+    df["answers_with_urls"] = df["answers_with_urls"].apply(safe_json_loads)
+    df["answers"] = df["answers"].apply(safe_json_loads)
+    # Count and remove rows where JSON parsing failed
+
+    initial_count = len(df)
+    df = df.dropna(subset=['answers_with_urls', 'answers'])
+    final_count = len(df)
+    failures = initial_count - final_count
+
+    print(f"Number of rows dropped in {split} due to JSON parsing failures: {failures}")
+
+    # replace all NaNs with empty strings
+    df = df.fillna("")
+
+    return df
+
+
+def main():
+
+    for split in ["test", "validation", "train"]:
+        df = read_eli5_data(split)
+        eli5_created_at = format_to_dolma_timestamp(ELI5_CREATED_AT)
+
+        with ExitStack() as stack:
+            full_file = stack.enter_context(smart_open.open(f"{DESTINATION_S3}/conversation/{split}.jsonl.gz", "w"))
+            dclm_file = stack.enter_context(smart_open.open(f"{DESTINATION_S3}/dclm/{split}.jsonl.gz", "w"))
+            format_file = stack.enter_context(smart_open.open(f"{DESTINATION_S3}/individual/{split}.jsonl.gz", "w"))
+            screen_file = stack.enter_context(smart_open.open(f"{DESTINATION_S3}/individual_filtered/{split}.jsonl.gz", "w"))
+
+            for i, row in tqdm.tqdm(df.iterrows(), total=len(df), desc=f"Processing {split}"):
+                all_text_and_answers = (
+                    str(row["title_with_urls"]),
+                    str(row["selftext_with_urls"]),
+                    *[str(text) for text in row['answers_with_urls']['text']]
+                )
+
+                # use two newlines as separator or maximum number of newlines in the text, plus one
+                spacing = max(
+                    [len(span) for text in all_text_and_answers for span in re.findall(r'\n+', text)] + [1]
+                )
+
+                # separate the text with one newline
+                full_text = ("\n" * (spacing + 1)).join(all_text_and_answers)
+
+                answer_urls = {
+                    f"_URL_{i}_": url for i, url in enumerate(row['answers_urls']['url'])
+                }
+
+                metadata = {
+                    "q_id": str(row["q_id"]),
+                    "title": {
+                        "text": str(row["title"]),
+                        "urls": [str(url) for url in row["title_urls"]["url"]]
+                    },
+                    "selftext": {
+                        "text": str(row["selftext"]),
+                        "urls": [str(url) for url in row["selftext_urls"]["url"]]
+                    },
+                    "answers": [
+                        {
+                            "a_id": str(a_id),
+                            "text": str(text),
+                            "score": int(score),
+                            'urls': [str(url) for u_id, url in answer_urls.items() if u_id in text]
+                        }
+                        for a_id, text, score in
+                        zip(row['answers']['a_id'], row['answers']['text'], row['answers']['score'])
+                    ]
+                }
+
+                full_document = {
+                    "text": full_text,
+                    "id": str(row["q_id"]),
+                    "source": "eli5",
+                    "version": "v0_conversation",
+                    "created": eli5_created_at,
+                    "added": format_to_dolma_timestamp(),
+                    "metadata": metadata
+                }
+
+                full_file.write(json.dumps(full_document) + "\n")
+
+                dclm_answer = None
+
+                title = fix_text(str(row["title"]))
+
+                for score, a_id, answer in sorted(
+                    zip(row['answers']['score'], row['answers']['a_id'], row['answers_with_urls']['text']),
+                    key=lambda x: float(f"{x[0]}.{len(x[2])}")
+                ):
+                    # use two newlines as separator or maximum number of newlines in the text, plus one
+                    spacing = max(
+                        [len(span) for span in re.findall(r'\n+', title)] +
+                        [len(span) for span in re.findall(r'\n+', answer)] +
+                        [1]
+                    )
+                    text = ("\n" * (spacing + 1)).join([title, fix_text(answer)])
+                    answer_metadata = {
+                        **{k: v for k, v in metadata.items() if k != "answers"},
+                        **[answer for answer in metadata["answers"] if answer["a_id"] == a_id][0]  # pyright: ignore
+                    }
+                    answer_document = {
+                        "text": text,
+                        "id": f"{row['q_id']}_{a_id}",
+                        "source": "eli5",
+                        "version": "v0_individual",
+                        "created": eli5_created_at,
+                        "added": format_to_dolma_timestamp(),
+                        "metadata": answer_metadata
+                    }
+
+                    format_file.write(json.dumps(answer_document) + "\n")
+
+                    if score >= DCLM_COMMENT_SCORE and len(row['answers']['a_id']) >= DCLM_MIN_ANSWERS:
+                        dclm_answer = {**answer_document, "version": "v0_dclm"}
+
+                    if score >= DCLM_COMMENT_SCORE:
+                        screen_document = {**answer_document, "version": "v0_screen"}
+                        screen_file.write(json.dumps(screen_document) + "\n")
+
+                if dclm_answer:
+                    dclm_file.write(json.dumps(dclm_answer) + "\n")
+
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sources/mathpile/tokens.py b/sources/mathpile/tokens.py
new file mode 100644
index 00000000..dec43b2b
--- /dev/null
+++ b/sources/mathpile/tokens.py
@@ -0,0 +1,37 @@
+from copy import deepcopy
+from dolma.cli.tokenizer import TokenizationConfig, TokenizerConfig, TokenizerCli
+from multiprocessing import cpu_count
+import numpy as np
+import os
+
+
+def main():
+    base_config = TokenizationConfig(
+        documents=[],
+        destination=f"{os.environ['HOME'].rstrip('/')}/ai2-llm/preprocessed/mathpile",
+        tokenizer=TokenizerConfig(
+            name_or_path="allenai/dolma2-tokenizer",
+            bos_token_id=None,
+            eos_token_id=100257,
+            pad_token_id=100277,
+            segment_before_tokenization=False,
+            encode_special_tokens=True,
+        ),
+        processes=cpu_count(),
+        max_size=100_000_000,
+        dtype='uint32',
+        sample_ring_prop=True,
+    )
+
+    for name in ["MathPile", "MathPile_Commercial"]:
+        for split in ["train", "validation"]:
+            for subset in ["arXiv", "commoncrawl", "proofwiki", "stackexchange", "textbooks", "wikipedia"]:
+                config = deepcopy(base_config)
+                config.documents = [
+                    f"/data/mathpile/v0/documents/{name}/{split}/{subset}/*"
+                ]
+                config.destination = f"{config.destination}/{name}/{split}/{subset}/{config.tokenizer.name_or_path}"
+                TokenizerCli.run(config)
+
+if __name__ == "__main__":
+    main()
diff --git a/sources/mathpile/v0.py b/sources/mathpile/v0.py
new file mode 100644
index 00000000..1c765714
--- /dev/null
+++ b/sources/mathpile/v0.py
@@ -0,0 +1,149 @@
+from contextlib import ExitStack
+from hashlib import md5
+from tempfile import TemporaryDirectory
+from typing import Any, Optional
+import datetime
+from queue import Queue
+import json
+from multiprocessing import cpu_count
+
+import smart_open
+from dolma.core.parallel import BaseParallelProcessor
+
+
+def format_to_dolma_timestamp(timestamp: Optional[datetime.datetime] = None) -> str:
+    """Format a timestamp as near ISO-8601 with a "Z" suffix; defaults to the current UTC time."""
+    if timestamp is None:
+        timestamp = datetime.datetime.now(datetime.timezone.utc)
+    return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f")[:23] + "Z"
+
+
+
+class MathpileProcessor(BaseParallelProcessor):
+    """Parallel processor that rewrites raw MathPile JSON-lines files as Dolma documents."""
+    @classmethod
+    def increment_progressbar(
+        cls,
+        queue: Queue,
+        /,
+        files: int = 0,
+        docs: int = 0,
+        words: int = 0,
+    ):
+        """
+        Update the progress bar by forwarding the counters to the
+        parent class. We keep track of three things:
+        - files: the number of files processed
+        - docs: the number of documents read and converted
+        - words: the number of whitespace-separated words in
+            the converted documents
+        """
+        super().increment_progressbar(
+            queue,
+            files=files,
+            docs=docs,
+            words=words,
+        )
+
+    @classmethod
+    def process_single(
+        cls,
+        source_path: str,
+        destination_path: str,
+        queue: Queue,
+        **kwargs: Any,
+    ):
+        """
+        This method is called for each file. It reads the file
+        line by line, converts each JSON document to the Dolma
+        format, and writes it to the destination file.
+        """
+        update_every_n_lines = 10_000
+        docs = 0
+        words = 0
+        with ExitStack() as stack:
+            # open source and destination files
+            source_file = stack.enter_context(
+                smart_open.open(source_path, "rt")
+            )
+            destination_file = stack.enter_context(
+                smart_open.open(destination_path, "wt")
+            )
+
+            # Set a fixed creation date
+            created = datetime.datetime(2023, 12, 29)
+
+            # path-derived subset is kept under its own name: the loop below must not overwrite it
+            *_, source, split, path_subset, _fn = source_path.split("/")
+            for ln in source_file:
+                # we first load the json document
+                document = json.loads(ln)
+                docs += 1
+                # the id hashes the raw line together with the path components only
+                docid = md5((ln + source + split + path_subset).encode('utf-8')).hexdigest()
+
+                metadata = {}
+
+                if "text" in document:
+                    text = document.pop("text")
+                elif "question" in document and "answers" in document:
+                    question = document.pop("question")
+                    answers = document.pop("answers")
+
+                    text = f"{question.pop('Title').strip()}\n{question.pop('Body').strip()}\n\n"
+                    metadata.update({f"question_{k}": v for k, v in question.items()})
+                    # NOTE(review): answer_* keys are overwritten by each answer; only the last one survives
+                    for answer in answers:
+                        text += f"{answer.pop('Body').strip()}\n\n"
+                        metadata.update({f"answer_{k}": v for k, v in answer.items()})
+                else:
+                    raise ValueError(f"Unknown document type: {document}")
+
+                subset = document.pop("subset")
+
+                output = {
+                    "text": text.strip(),
+                    "source": f"{source}_{subset}_{split}",
+                    "added": format_to_dolma_timestamp(),
+                    "created": format_to_dolma_timestamp(created),
+                    "id": docid,
+                    "metadata": {**document, **metadata, "subset": subset, "split": split, "source": source}
+                }
+
+                words += len(text.split())
+
+                # every document is written to output,
+                # even when its text is empty
+                destination_file.write(json.dumps(output) + "\n")
+
+                # we update the progress bar every
+                # update_every_n_lines
+                if docs >= update_every_n_lines:
+                    cls.increment_progressbar(queue, docs=docs, words=words)
+                    docs = 0
+                    words = 0
+
+            # we update the progress bar one last time
+            cls.increment_progressbar(
+                queue,
+                files=1,
+                docs=docs,
+                words=words,
+            )
+
+
+def main():
+    """Convert the raw MathPile files matched by the source glob into documents under /data/mathpile/v0."""
+    with TemporaryDirectory() as tmpdir:
+        processor = MathpileProcessor(
+            source_prefix="/data/mathpile/raw/*/*/*/*.gz",
+            destination_prefix="/data/mathpile/v0",
+            metadata_prefix=tmpdir,
+            # leave two cores free, but never request fewer than one worker
+            num_processes=max(1, cpu_count() - 2),
+        )
+        # run the processor
+        processor()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sources/openhermes/v1.py b/sources/openhermes/v1.py
new file mode 100644
index 00000000..7d2cdbcc
--- /dev/null
+++ b/sources/openhermes/v1.py
@@ -0,0 +1,52 @@
+import smart_open
+import json
+from dolma.core.paths import cached_path
+import pandas as pd
+from datasets import load_dataset
+import hashlib
+from tqdm import tqdm
+import datetime
+import re
+# Single output file that receives every converted document.
+DESTINATION_S3 = "s3://ai2-llm/pretraining-data/sources/teknium_OpenHermes-2.5/v1/documents/oh2_5.jsonl.gz"
+# Fixed date used for the `created` field of every document.
+OPENHERMES_DATE = datetime.datetime(2023, 11, 12)
+
+# The dataset is loaded at import time, before any output is opened.
+dataset = load_dataset("teknium/OpenHermes-2.5", split="train")
+
+
+def format_to_dolma_timestamp(timestamp: datetime.datetime | None = None) -> str:
+    """Format a timestamp as near ISO-8601 with a "Z" suffix; defaults to the current UTC time."""
+    if timestamp is None:
+        timestamp = datetime.datetime.now(datetime.timezone.utc)
+    return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f")[:23] + "Z"
+
+
+with smart_open.open(DESTINATION_S3, 'w') as output_file:
+    for row in tqdm(dataset):
+        turns = [turn['value'] for turn in row["conversations"]]
+
+        # Separate turns with one more newline than the longest newline run
+        # found inside any turn (a run length of at least 1 is assumed).
+        newline_runs = [len(run) for value in turns for run in re.findall(r'\n+', value)]
+        separator = "\n" * (max(newline_runs + [1]) + 1)
+        text = separator.join(turns)
+
+        # Rows without an id fall back to the MD5 of their JSON serialization.
+        row_id = row['id'] or hashlib.md5(json.dumps(row).encode('utf-8')).hexdigest()
+
+        # Append the upstream source name when the row carries one.
+        source = 'openhermes-2.5' + (f'-{row["source"]}' if row['source'] else '')
+
+        document = {
+            'id': row_id,
+            'source': source,
+            'version': 'v1',
+            'text': text,
+            'added': format_to_dolma_timestamp(),
+            'created': format_to_dolma_timestamp(OPENHERMES_DATE),
+            'metadata': row,
+        }
+
+        output_file.write(json.dumps(document) + '\n')
diff --git a/sources/stackexchange/README.md b/sources/stackexchange/README.md
new file mode 100644
index 00000000..6449c4ff
--- /dev/null
+++ b/sources/stackexchange/README.md
@@ -0,0 +1,261 @@
+# Stack Exchange
+
+## Instructions
+
+1. Download the Stack Exchange data from the Internet Archive using the [`download_from_ia.sh`](download_from_ia.sh) script.
+2. Convert data to parquet using the [`v0.py`](v0.py) script.
+3. Load the data into Athena as follows:
+
+Create comments table:
+
+```sql
+CREATE EXTERNAL TABLE IF NOT EXISTS `lucas`.`se_comments_20240930` (
+    Id BIGINT,
+    PostId BIGINT,
+    Score BIGINT,
+    Text STRING,
+    CreationDate STRING,
+    UserId BIGINT,
+    ContentLicense STRING
+)
+PARTITIONED BY (forum STRING)
+STORED AS PARQUET
+LOCATION 's3://ai2-llm/pretraining-data/sources/stackexchange/raw/20240930_parquet/comments/'
+TBLPROPERTIES ('parquet.compression'='SNAPPY')
+```
+
+Then run the following to load the partitions:
+
+```sql
+MSCK REPAIR TABLE lucas.se_comments_20240930;
+```
+
+Create posts table:
+
+```sql
+CREATE EXTERNAL TABLE IF NOT EXISTS `lucas`.`se_posts_20240930` (
+    AcceptedAnswerId BIGINT,
+    AnswerCount BIGINT,
+    Body STRING,
+    ClosedDate STRING,
+    CommentCount BIGINT,
+    ContentLicense STRING,
+    CreationDate STRING,
+    Id BIGINT,
+    LastActivityDate STRING,
+    LastEditDate STRING,
+    LastEditorDisplayName STRING,
+    LastEditorUserId BIGINT,
+    OwnerDisplayName STRING,
+    OwnerUserId BIGINT,
+    ParentId BIGINT,
+    PostTypeId STRING,
+    Score BIGINT,
+    Tags STRING,
+    Title STRING,
+    ViewCount BIGINT
+)
+PARTITIONED BY (forum STRING)
+STORED AS PARQUET
+LOCATION 's3://ai2-llm/pretraining-data/sources/stackexchange/raw/20240930_parquet/posts/'
+TBLPROPERTIES ('parquet.compression'='SNAPPY')
+```
+
+Then run the following to load the partitions:
+
+```sql
+MSCK REPAIR TABLE lucas.se_posts_20240930;
+```
+
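+## Selecting QA pairs
+
+The query below joins each question with its accepted answer and unloads the result as ZSTD-compressed JSON documents: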
+```sql
+UNLOAD (
+    WITH valid_questions AS (
+        SELECT
+            posts.Body,
+            posts.Id,
+            posts.CommentCount,
+            posts.ContentLicense,
+            posts.CreationDate,
+            posts.LastActivityDate,
+            posts.LastEditDate,
+            posts.LastEditorDisplayName,
+            posts.LastEditorUserId,
+            posts.OwnerUserId,
+            posts.OwnerDisplayName,
+            posts.Score,
+            posts.Tags,
+            posts.ViewCount,
+            posts.Title,
+            posts.Forum,
+            posts.AcceptedAnswerid
+        FROM "lucas"."se_posts_20240930" as posts
+        WHERE
+            posttypeid = 'Question'
+            AND posts.AnswerCount > 0
+            AND posts.acceptedanswerid >= 0
+
+    ),
+    valid_answers AS  (
+        SELECT
+            posts.Body,
+            posts.Id,
+            posts.CommentCount,
+            posts.ContentLicense,
+            posts.CreationDate,
+            posts.LastActivityDate,
+            posts.LastEditDate,
+            posts.LastEditorDisplayName,
+            posts.LastEditorUserId,
+            posts.OwnerUserId,
+            posts.OwnerDisplayName,
+            posts.Score,
+            posts.ViewCount,
+            posts.Forum
+        FROM "lucas"."se_posts_20240930" as posts
+        WHERE posttypeid = 'Answer'
+    ),
+    joined_questions_answers AS (
+        SELECT
+            valid_answers.Body AS answer_body,
+            valid_answers.Id AS answer_id,
+            valid_answers.CommentCount AS answer_comment_count,
+            valid_answers.ContentLicense AS answer_content_license,
+            valid_answers.CreationDate AS answer_creation_date,
+            valid_answers.LastActivityDate AS answer_last_activity_date,
+            valid_answers.LastEditDate AS answer_last_edit_date,
+            valid_answers.LastEditorDisplayName AS answer_last_editor_display_name,
+            valid_answers.LastEditorUserId AS answer_last_editor_user_id,
+            valid_answers.OwnerUserId AS answer_owner_user_id,
+            valid_answers.OwnerDisplayName AS answer_owner_display_name,
+            valid_answers.Score AS answer_score,
+            valid_answers.ViewCount AS answer_view_count,
+            valid_answers.Forum AS answer_forum,
+            valid_questions.Title AS question_title,
+            valid_questions.Body AS question_body,
+            valid_questions.Id AS question_id,
+            valid_questions.CommentCount AS question_comment_count,
+            valid_questions.ContentLicense AS question_content_license,
+            valid_questions.CreationDate AS question_creation_date,
+            valid_questions.LastActivityDate AS question_last_activity_date,
+            valid_questions.LastEditDate AS question_last_edit_date,
+            valid_questions.LastEditorDisplayName AS question_last_editor_display_name,
+            valid_questions.LastEditorUserId AS question_last_editor_user_id,
+            valid_questions.OwnerUserId AS question_owner_user_id,
+            valid_questions.OwnerDisplayName AS question_owner_display_name,
+            valid_questions.Score AS question_score,
+            valid_questions.Tags AS question_tags,
+            valid_questions.ViewCount AS question_view_count,
+            valid_questions.Forum AS question_forum,
+            CAST (
+                ARRAY_MAX(
+                    TRANSFORM(
+                        regexp_extract_all(valid_questions.body, '\n+'),
+                        x -> LENGTH(x)
+                    )
+                    || ARRAY [1]
+                ) AS INTEGER
+            ) as question_max_newline,  -- longest newline run in the question body, at least 1
+            CAST (
+                ARRAY_MAX(
+                    TRANSFORM(
+                        regexp_extract_all(valid_answers.body, '\n+'),
+                        x -> LENGTH(x)
+                    )
+                    || ARRAY [1]
+                ) AS INTEGER
+            ) as answer_max_newline  -- longest newline run in the answer body, at least 1
+        FROM valid_answers
+        INNER JOIN valid_questions
+            ON valid_questions.forum = valid_answers.forum
+            AND valid_questions.acceptedanswerid = valid_answers.id
+    )
+    SELECT
+        (
+            question_forum
+            || '-'
+            || CAST(question_id AS VARCHAR)
+            || '-'
+            || CAST(answer_id AS VARCHAR)
+        ) as id,
+        (
+            TRIM(question_title)
+            || ARRAY_JOIN(
+                REPEAT(
+                    CHR(10),
+                    question_max_newline + 1
+                ),
+                ''
+            )
+            || TRIM(question_body)
+            || ARRAY_JOIN(
+                REPEAT(
+                    CHR(10),
+                    IF(
+                        question_max_newline > answer_max_newline,
+                        question_max_newline + 1,
+                        answer_max_newline + 1
+                    )
+                ),
+                ''
+            )
+            || TRIM(answer_body)
+        ) as text,
+        question_creation_date AS created,
+        answer_last_activity_date AS added,
+        'stackexchange' AS source,
+        '20240930' as version,
+        CAST(
+            ROW(
+                question_forum,
+                question_id,
+                answer_id,
+                question_owner_user_id,
+                answer_owner_user_id,
+                question_last_editor_user_id,
+                answer_last_editor_user_id,
+                question_last_edit_date,
+                answer_last_edit_date,
+                question_last_activity_date,
+                answer_last_activity_date,
+                question_content_license,
+                answer_content_license,
+                question_score,
+                answer_score,
+                question_view_count,
+                answer_view_count,
+                question_comment_count,
+                answer_comment_count
+            ) AS
+            ROW(
+                forum VARCHAR,
+                question_id BIGINT,
+                answer_id BIGINT,
+                question_owner_user_id BIGINT,
+                answer_owner_user_id BIGINT,
+                question_last_editor_user_id BIGINT,
+                answer_last_editor_user_id BIGINT,
+                question_last_edit_date VARCHAR,
+                answer_last_edit_date VARCHAR,
+                question_last_activity_date VARCHAR,
+                answer_last_activity_date VARCHAR,
+                question_content_license VARCHAR,
+                answer_content_license VARCHAR,
+                question_score BIGINT,
+                answer_score BIGINT,
+                question_view_count BIGINT,
+                answer_view_count BIGINT,
+                question_comment_count BIGINT,
+                answer_comment_count BIGINT
+            )
+        ) AS metadata
+    FROM joined_questions_answers
+)
+TO 's3://ai2-llm/pretraining-data/sources/stackexchange/v0/documents/20240930/'
+WITH (
+    format='JSON',
+    compression='ZSTD'
+)
+```
diff --git a/sources/stackexchange/download_from_ia.sh b/sources/stackexchange/download_from_ia.sh
new file mode 100644
index 00000000..e51accf6
--- /dev/null
+++ b/sources/stackexchange/download_from_ia.sh
@@ -0,0 +1,137 @@
+#!/bin/bash
+#
+# Download every .7z file of an Internet Archive collection using aria2c.
+
+# Make a failing curl fail the whole `curl | jq` pipeline below
+set -o pipefail
+
+# Function to display usage information; exits with the given status (default: 1)
+usage() {
+    echo "Usage: $0 -c|--collection-id <collection_id> -d|--destination <destination> [-n|--num-processes <num_processes>] [-k|--num-chunks <num_chunks>]"
+    echo "  -c, --collection-id   : The ID of the Internet Archive collection (required)"
+    echo "  -d, --destination     : Location where to save each file from the collection (required)"
+    echo "  -n, --num-processes   : Number of parallel downloads to use (default: 1)"
+    echo "  -k, --num-chunks      : Number of chunks to split the collection into (default: 1)"
+    exit "${1:-1}"
+}
+
+# Abort when an option is the last argument and therefore has no value.
+# Without this check `shift 2` fails without consuming anything and the
+# parsing loop below never terminates.
+require_value() {
+    if [ "$2" -lt 2 ]; then
+        echo "Error: $1 requires a value"
+        usage
+    fi
+}
+
+# Initialize variables
+collection_id=""
+destination=""
+num_processes=1
+num_chunks=1
+
+# Parse command-line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -c|--collection-id)
+            require_value "$1" "$#"
+            collection_id="$2"
+            shift 2
+            ;;
+        -d|--destination)
+            require_value "$1" "$#"
+            destination="$2"
+            shift 2
+            ;;
+        -n|--num-processes)
+            require_value "$1" "$#"
+            num_processes="$2"
+            shift 2
+            ;;
+        -k|--num-chunks)
+            require_value "$1" "$#"
+            num_chunks="$2"
+            shift 2
+            ;;
+        -h|--help)
+            usage 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            usage
+            ;;
+    esac
+done
+
+# Check if required arguments are provided
+if [ -z "$collection_id" ]; then
+    echo "Error: Collection ID is required"
+    usage
+fi
+
+if [ -z "$destination" ]; then
+    echo "Error: Destination is required"
+    usage
+fi
+
+# Ensure num_processes is an integer greater than or equal to 1
+if ! [[ "$num_processes" =~ ^[0-9]+$ ]] || [ "$num_processes" -lt 1 ]; then
+    echo "Error: num_processes must be an integer greater than or equal to 1"
+    usage
+fi
+
+# Ensure num_chunks is an integer greater than or equal to 1
+if ! [[ "$num_chunks" =~ ^[0-9]+$ ]] || [ "$num_chunks" -lt 1 ]; then
+    echo "Error: num_chunks must be an integer greater than or equal to 1"
+    usage
+fi
+
+# Check if aria2c is available
+if ! command -v aria2c &> /dev/null; then
+    echo "Error: aria2c is not installed or not in the system PATH"
+    exit 1
+fi
+
+# check if jq is available
+if ! command -v jq &> /dev/null; then
+    echo "Error: jq is not installed or not in the system PATH"
+    exit 1
+fi
+
+# Create a temporary file to store the download urls; the trap removes it on
+# every exit path, including the early exits below
+temp_file=$(mktemp)
+trap 'rm -f "$temp_file"' EXIT
+
+# Write items to the temporary file
+if ! curl -fsS "https://archive.org/metadata/$collection_id" | jq -r '{collection_id: .metadata.identifier, name: .files[].name} | select(.name | endswith(".7z")) | "https://archive.org/download/\(.collection_id)/\(.name)"' > "$temp_file"; then
+    echo "Error: could not list the files of collection $collection_id"
+    exit 1
+fi
+
+# make destination directory if it doesn't exist
+mkdir -p "$destination"
+
+# Print the number of files to be downloaded
+num_files=$(wc -l < "$temp_file")
+echo "Downloading $num_files files"
+
+if [ "$num_files" -eq 0 ]; then
+    echo "No files to download"
+    exit 1
+fi
+
+# Download each file in parallel; being the last command, its exit status
+# becomes the exit status of the script
+aria2c \
+    --continue \
+    --split "${num_chunks}" \
+    --max-connection-per-server "${num_chunks}" \
+    -k 1M \
+    -j "${num_processes}" \
+    -i "$temp_file" \
+    -d "$destination" \
+    --show-console-readout=true \
+    --summary-interval=5 \
+    --console-log-level=notice
diff --git a/sources/stackexchange/requirements.txt b/sources/stackexchange/requirements.txt
new file mode 100644
index 00000000..924dd11b
--- /dev/null
+++ b/sources/stackexchange/requirements.txt
@@ -0,0 +1,8 @@
+smart-open>=7.0.4
+py7zr
+libarchive-c
+lxml
+pyarrow
+tqdm
+markdownify
+resiliparse
diff --git a/sources/stackexchange/v0.py b/sources/stackexchange/v0.py
new file mode 100644
index 00000000..f715a276
--- /dev/null
+++ b/sources/stackexchange/v0.py
@@ -0,0 +1,236 @@
+import argparse
+import os
+import re
+from contextlib import ExitStack
+from typing import Any, Callable, Iterator
+
+import libarchive  # pyright: ignore
+import py7zr  # pyright: ignore
+from resiliparse.extract.html2text import extract_plain_text  # pyright: ignore
+import pyarrow as pa
+import pyarrow.parquet as pq
+from libarchive.entry import ArchiveEntry  # pyright: ignore
+from lxml import etree  # pyright: ignore
+from tqdm import tqdm
+
+os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace"
+
+
+post_types = {
+    "1": "Question",
+    "2": "Answer",
+    "3": "Orphaned tag wiki",
+    "4": "Tag wiki excerpt",
+    "5": "Tag wiki",
+    "6": "Moderator nomination",
+    "7": "Wiki placeholder",
+    "8": "Privilege wiki",
+    "9": "Article",
+    "10": "HelpArticle",
+    "11": "Unknown",
+    "12": "Collection",
+    "13": "ModeratorQuestionnaireResponse",
+    "14": "Announcement",
+    "15": "CollectiveDiscussion",
+    "16": "CollectiveCollection",
+}
+
+POSTS_MAP: dict[str, Callable[[str | None], Any]] = {
+    "AcceptedAnswerId": lambda x: int(x or 0),
+    "AnswerCount": lambda x: int(x or 0),
+    "Body": lambda x: extract_plain_text(x or "").strip(),
+    "ClosedDate": lambda x: str(x or ""),
+    "CommentCount": lambda x: int(x or 0),
+    "CommunityOwnedDate": lambda x: str(x or ""),
+    "ContentLicense": lambda x: str(x or ""),
+    "CreationDate": lambda x: str(x or ""),
+    "Id": lambda x: int(x or 0),
+    "LastActivityDate": lambda x: str(x or ""),
+    "LastEditDate": lambda x: str(x or ""),
+    "LastEditorDisplayName": lambda x: str(x or ""),
+    "LastEditorUserId": lambda x: int(x or 0),
+    "OwnerDisplayName": lambda x: str(x or ""),
+    "OwnerUserId": lambda x: int(x or 0),
+    "ParentId": lambda x: int(x or 0),
+    "PostTypeId": lambda x: post_types.get(x or "11", "Unknown"),
+    "Score": lambda x: int(x or 0),
+    "Tags": lambda x: str(x or ""),
+    "Title": lambda x: str(x or ""),
+    "ViewCount": lambda x: int(x or 0),
+}
+
+COMMENTS_MAP: dict[str, Callable[[str | None], Any]] = {
+    "ContentLicense": lambda x: str(x or ""),
+    "CreationDate": lambda x: str(x or ""),
+    "Id": lambda x: int(x or 0),
+    "PostId": lambda x: int(x or 0),
+    "Score": lambda x: int(x or 0),
+    "Text": lambda x: str(x or ""),
+    "UserDisplayName": lambda x: str(x or ""),
+    "UserId": lambda x: int(x or 0),
+}
+
+USERS_MAP: dict[str, Callable[[str | None], Any]] = {
+    "Id": lambda x: int(x or 0),
+    "Reputation": lambda x: int(x or 0),
+    "CreationDate": lambda x: str(x or ""),
+    "DisplayName": lambda x: str(x or ""),
+    "LastAccessDate": lambda x: str(x or ""),
+    "WebsiteUrl": lambda x: str(x or ""),
+    "Location": lambda x: str(x or ""),
+    "AboutMe": lambda x: str(x or ""),
+    "Views": lambda x: int(x or 0),
+    "UpVotes": lambda x: int(x or 0),
+    "DownVotes": lambda x: int(x or 0),
+    "ProfileImageUrl": lambda x: str(x or ""),
+    "EmailHash": lambda x: str(x or ""),
+    "AccountId": lambda x: int(x or 0),
+}
+
+
+def get_7z_uncompressed_size(sz_path, entry_name):
+    """Return the `uncompressed` size py7zr lists for `entry_name`; raise FileNotFoundError if it is not listed."""
+    with py7zr.SevenZipFile(sz_path, mode="r") as archive:
+        for size in (info.uncompressed for info in archive.list() if info.filename == entry_name):
+            return size
+    raise FileNotFoundError(f"File {entry_name} not found in archive {sz_path}")
+
+
+def stream_xml_from_7z(
+    archive_path: str, filename: str, target_xpath: str = "//*", block_size: int = 8192
+) -> Iterator[bytes]:
+    """
+    Stream the raw lines of a file stored inside a 7z archive, reading it lazily.
+
+    The lines are NOT parsed here: each one is yielded as bytes, without its line
+    terminator, and callers are expected to parse the lines they care about.
+
+    Args:
+        archive_path (str): Path to the 7z archive
+        filename (str): Name of the XML file within the archive
+        target_xpath (str, optional): Unused; kept for backward compatibility.
+        block_size (int, optional): Size of blocks to read. Defaults to 8192.
+
+    Yields:
+        bytes: One line of the file at a time. Nothing is yielded when `filename`
+            is not an entry of the archive.
+    """
+    with ExitStack() as stack:
+        archive = stack.enter_context(libarchive.file_reader(archive_path))
+        # Find the target file in the archive
+        for entry in archive:
+            if entry.pathname != filename:
+                continue
+
+            archive_name = os.path.basename(archive_path)
+            with tqdm(
+                total=get_7z_uncompressed_size(archive_path, filename),
+                desc=f"Bytes {archive_name}::{filename}",
+                unit="B",
+                unit_scale=True,
+            ) as pbar:
+                prev_line = b""
+                for chunk in entry.get_blocks(block_size):
+                    pbar.update(len(chunk))
+                    first_seg, *segments = re.split(b"\r*\n|\r", chunk)
+                    if segments:
+                        # at least one line break in the chunk: the pending line is complete
+                        yield prev_line + first_seg
+                        yield from segments[:-1]
+                        prev_line = segments[-1]
+                    else:
+                        # no line breaks in the chunk, so we need to accumulate it
+                        prev_line += chunk
+                if prev_line:
+                    # the file did not end with a line break: flush the last line
+                    yield prev_line
+
+
+def process_file(
+    archive_path: str,
+    output_dir: str,
+    entry_name: str,
+    entry_map: dict[str, Callable[[str| None], Any]],
+    batch_size: int = 100_000,
+    block_size: int = 8192,
+):
+    """
+    Convert the `<row ...>` lines of one XML entry of a 7z archive into Parquet files.
+
+    `entry_map` maps each attribute name to a converter (called with None when the
+    attribute is absent); a file is written to `output_dir` every `batch_size` rows.
+    """
+    entry_prefix, _ = os.path.basename(entry_name.lower()).split(".", 1)
+    archive_name = os.path.basename(archive_path)
+
+    os.makedirs(output_dir, exist_ok=True)
+    data = []
+    schema = None
+
+    with ExitStack() as stack:  # closes both progress bars on exit
+        xml_lines = stream_xml_from_7z(archive_path, entry_name, block_size=block_size)
+        files_pbar = stack.enter_context(tqdm(desc=f"Files {archive_name}::{entry_name}"))
+        rows_pbar = stack.enter_context(tqdm(xml_lines, desc=f"Rows {archive_name}::{entry_name}"))
+
+        def flush():
+            """Write the buffered rows to the next numbered Parquet file and empty the buffer."""
+            table = pa.Table.from_pylist(data, schema=schema)
+            output_path = os.path.join(output_dir, f"{entry_prefix}_{files_pbar.n:06d}.parquet")
+            pq.write_table(table, output_path)
+            data.clear()
+            files_pbar.update(1)
+
+        for line in rows_pbar:
+            # only lines that start with `<row` carry records; everything else is skipped
+            if not line.strip().startswith(b"<row"):
+                continue
+            row = etree.fromstring(line)
+            if not row.attrib:
+                continue
+            data.append({k: v(row.attrib.get(k, None)) for k, v in entry_map.items()})
+            if schema is None:
+                # the first converted row fixes the schema used for every output file
+                schema = pa.Table.from_pylist(data).schema
+            if len(data) >= batch_size:
+                flush()
+
+        # Write any remaining data
+        if data:
+            flush()
+
+
+def main():
+    """Command-line entry point: convert Posts, Comments and Users of each archive to Parquet."""
+    parser = argparse.ArgumentParser(description="Convert Stack Exchange 7z XML dumps to Parquet format")
+    parser.add_argument("archive_path", help="Path to the 7z archive")
+    parser.add_argument("output_dir", help="Directory where Parquet files will be saved")
+    parser.add_argument(
+        "--batch-size", type=int, default=100000, help="Number of rows to process at once (default: 100000)"
+    )
+    parser.add_argument("--block-size", type=int, default=8192, help="Size of blocks to read (default: 8192)")
+    args = parser.parse_args()
+
+    archive_paths = [args.archive_path]
+    if os.path.isdir(args.archive_path):
+        archive_paths = [
+            os.path.join(args.archive_path, p) for p in os.listdir(args.archive_path) if p.endswith("7z")
+        ]
+
+    entries = [("Posts.xml", POSTS_MAP), ("Comments.xml", COMMENTS_MAP), ("Users.xml", USERS_MAP)]
+    for archive_path in tqdm(archive_paths, desc="Archives"):
+        clean_forum_name = archive_path.split("/")[-1].rsplit(".", 1)[0].lower().replace(".", "_")
+        for entry_name, entry_map in entries:
+            clean_entry_name = entry_name.split(".", 1)[0].lower()
+            output_path = os.path.join(args.output_dir, clean_entry_name, f"forum={clean_forum_name}")
+            process_file(
+                archive_path=archive_path,
+                output_dir=output_path,
+                entry_name=entry_name,
+                entry_map=entry_map,  # pyright: ignore
+                batch_size=args.batch_size,
+                block_size=args.block_size,
+            )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sources/tulu_flan/search.py b/sources/tulu_flan/search.py
new file mode 100644
index 00000000..01f151df
--- /dev/null
+++ b/sources/tulu_flan/search.py
@@ -0,0 +1,114 @@
+import argparse
+from contextlib import ExitStack
+from pathlib import Path
+import re
+import json
+import os
+
+import smart_open
+import tqdm
+from dolma_decontamination.search.common import create_index
+from dolma_decontamination.search.index import list_paths
+from dolma_decontamination.search.query import HitsTuple
+
+
+def make_search_parser():
+    """
+    Build the argument parser for the search CLI; --index-path, --documents and
+    --output are required, --num-hits defaults to 10.
+    """
+    parser = argparse.ArgumentParser("Interactive search tool on a tantivy index")
+    parser.add_argument(
+        "-i", "--index-path",
+        type=str,
+        required=True,
+        help="The path to the index."
+    )
+    parser.add_argument(
+        "-d", "--documents",
+        type=str,
+        required=True,
+        nargs="+",
+        help="The paths to documents to use as queries."
+    )
+    parser.add_argument(
+        "-n", "--num-hits",
+        type=int,
+        default=10,
+        help="The number of hits to return."
+    )
+    parser.add_argument(
+        "-o", "--output",
+        type=str,
+        required=True,  # search_data always writes here, so a missing value must fail at parse time
+        help="A directory to write the output to."
+    )
+    return parser
+
+
+class TextNormalizer:
+    def __init__(self):
+        self.whitespace_re = re.compile(r"\s+")  # runs of whitespace
+        self.non_alnum_re = re.compile(r"[^a-zA-Z0-9\s]+")  # anything but ASCII letters, digits, whitespace
+
+    def __call__(self, text: str) -> str:
+        text = self.whitespace_re.sub(" ", self.non_alnum_re.sub("", text.strip()))  # drop symbols, then squeeze whitespace
+        return text.replace("AND", "and").replace("OR", "or").replace("NOT", "not").replace("IN", "in")  # also matches inside words
+
+
+def search_data(args: argparse.Namespace):
+    """Query the index with every `dedupe_ngrams_8_1` span of the input documents; write hits as JSON lines."""
+    index = create_index(args.index_path, reuse=True)
+    searcher = index.searcher()
+
+    paths = list_paths(args.documents)
+    norm = TextNormalizer()
+
+    Path(args.output).mkdir(parents=True, exist_ok=True)
+
+    with ExitStack() as stack:
+        files_pbar = stack.enter_context(tqdm.tqdm(paths, unit="files", unit_scale=True))
+        docs_pbar = stack.enter_context(tqdm.tqdm(unit=" docs", unit_scale=True))
+        queries_pbar = stack.enter_context(tqdm.tqdm(unit=" queries", unit_scale=True))
+
+        output_id = 0
+        output_path = f"{args.output}/{output_id:06d}.jsonl.zst"
+        output_file = stack.enter_context(smart_open.open(output_path, "wt", encoding="utf-8"))
+
+        for path in files_pbar:
+            # iterating `files_pbar` already advances it, so no explicit update is needed;
+            # each input is closed as soon as it has been read instead of at the very end
+            with smart_open.open(path, "rt", encoding="utf-8") as f:
+                for line in f:
+                    document = json.loads(line)
+
+                    for start, end, score in document.get("attributes", {}).get("dedupe_ngrams_8_1", []):
+                        text = document["text"][start:end]
+                        normalized_text = norm(text)
+
+                        parsed_query = index.parse_query(normalized_text)
+                        hits = searcher.search(parsed_query, limit=args.num_hits).hits
+                        parsed_hits = HitsTuple.from_hits(hits, searcher)
+
+                        output = {
+                            "query": normalized_text,
+                            "hits": [h.to_dict() for h in parsed_hits],
+                            "document": document,
+                            "span_score": score
+                        }
+                        queries_pbar.update(1)
+                        output_file.write(json.dumps(output) + "\n")
+
+                        # start a new output file every 50,000 queries
+                        if queries_pbar.n % 50_000 == 0:
+                            output_file.close()
+                            output_id += 1
+                            output_path = f"{args.output}/{output_id:06d}.jsonl.zst"
+                            output_file = stack.enter_context(
+                                smart_open.open(output_path, "wt", encoding="utf-8")
+                            )
+                    docs_pbar.update(1)
+
+
+if __name__ == "__main__":
+    search_data(make_search_parser().parse_args())
diff --git a/sources/tulu_flan/tokens.sh b/sources/tulu_flan/tokens.sh
new file mode 100644
index 00000000..ae8ec63a
--- /dev/null
+++ b/sources/tulu_flan/tokens.sh
@@ -0,0 +1,18 @@
+#! /usr/bin/env bash
+
+set -ex
+
+# Tokenize the tulu_flan documents on S3 with allenai/dolma2-tokenizer into a directory under $HOME.
+dolma tokens \
+    --documents 's3://ai2-llm/pretraining-data/sources/tulu_flan/v1-FULLDECON-60M-shots_all-upweight_1-dialog_false-sep_rulebased/documents/*.json.gz' \
+    --destination "${HOME}/ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer" \
+    --tokenizer.name_or_path 'allenai/dolma2-tokenizer' \
+    --tokenizer.eos_token_id 100257 \
+    --tokenizer.pad_token_id 100277 \
+    --no-tokenizer.segment_before_tokenization \
+    --tokenizer.encode_special_tokens \
+    --ring_size 8 \
+    --processes 92 \
+    --max_size 4_000_000_000 \
+    --sample_ring_prop \
+    --dtype 'uint32'
diff --git a/tests/python/test_paths.py b/tests/python/test_paths.py
index e920af74..df758e22 100644
--- a/tests/python/test_paths.py
+++ b/tests/python/test_paths.py
@@ -295,7 +295,6 @@ def test_split_glob(self):
 
 class TestSplitExt(TestCase):
     def test_file(self):
-
         prot, parts, ext = split_ext("file.txt")
 
         self.assertEqual(prot, "")
@@ -318,7 +317,6 @@ def test_file(self):
         self.assertEqual(ext, ".")
 
     def test_path(self):
-
         prot, parts, ext = split_ext("path/to/file.txt")
 
         self.assertEqual(prot, "")