diff --git a/.devcontainer/postInstall.sh b/.devcontainer/postInstall.sh index cf3761a9..f2b12ea5 100755 --- a/.devcontainer/postInstall.sh +++ b/.devcontainer/postInstall.sh @@ -2,4 +2,4 @@ PATH=/home/vscode/.cargo/bin:$PATH cd dolma -source /home/vscode/miniforge3/bin/activate && pip install cmake "maturin[patchelf]>=1.1,<2.0" +source /home/vscode/miniforge3/bin/activate && pip install cmake "maturin>=1.5,<2.0" diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 838abff0..1f007813 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -19,6 +19,7 @@ permissions: env: DOLMA_TESTS_SKIP_AWS: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'true' || 'false' }} DOLMA_TEST_S3_PREFIX: s3://dolma-tests + DOLMA_TEST_SKIP_LARGE_MODELS: "true" RUST_CHANNEL: stable jobs: diff --git a/Cargo.lock b/Cargo.lock index 1457f246..2beea7e9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -987,7 +987,7 @@ dependencies = [ [[package]] name = "dolma" -version = "1.0.14" +version = "1.1.0" dependencies = [ "adblock", "ahash", diff --git a/classifiers/README.md b/classifiers/README.md new file mode 100644 index 00000000..b71253f3 --- /dev/null +++ b/classifiers/README.md @@ -0,0 +1,30 @@ +# Dolma Classifiers + + +## Getting Started + +From root directory, install the package: + +```bash +pip install -e classifiers +``` + +## Examples + +Run [Huggingface FineWeb classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) on S3 data: + +```bash +python -m dolma_classifiers.inference \ + -s 's3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/20b-01/*zstd' \ + -m HuggingFaceFW/fineweb-edu-classifier +``` + +Run [NVIDIA's Deberta quality classifier](https://huggingface.co/nvidia/quality-classifier-deberta) on S3 data with model compilation: + +```bash +python -m dolma_classifiers.inference \ + -s 's3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/*/*zstd' \ + -m nvidia/quality-classifier-deberta \ + --model-compile \ + 
--max-length 1024 +``` diff --git a/classifiers/pyproject.toml b/classifiers/pyproject.toml new file mode 100755 index 00000000..6ee1dd1b --- /dev/null +++ b/classifiers/pyproject.toml @@ -0,0 +1,107 @@ +[project] +name = "dolma-classifiers" +version = "0.1.0" +description = "Toolkit for easy classification of data in Dolma format." +authors = [ + {name = "Luca Soldaini", email = "lucas@allenai.org" } +] +license = {text = "Apache-2.0"} +readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + "msgspec", + "fsspec[s3]", + "smart_open[s3]>=7.0.4", + "tqdm", + "torch", + "transformers", + "wandb", + "jq" +] + +[project.urls] +"Homepage" = "https://github.com/allenai/dolma" +"Repository" = "https://github.com/allenai/dolma" +"Bug Tracker" = "https://github.com/allenai/dolma/issues" + + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +dolma_classifiers = ["py.typed", "*.pyi"] + + +[build-system] +build-backend = "setuptools.build_meta" +requires = [ + "setuptools >= 61.0.0", + "wheel" +] + +[project.optional-dependencies] +dev = [ + "black>=22.6.0", + "isort>=5.10.1", + "mypy>=0.971", + "pytest>=5.2", + "ipython>=8.4.0", + "autopep8>=1.7.0", + "flake8>=5.0", + "ipdb>=0.13.0", + "flake8-pyi>=22.8.1", + "Flake8-pyproject>=1.1.0", + "pytest-asyncio>=0.15.1", + "pytest-cov>=2.12.1", + "aioresponses>=0.7.2", +] + +[tool.black] +line-length = 115 +include = '\.pyi?$' +exclude = ''' +( + __pycache__ + | \.git + | \.mypy_cache + | \.pytest_cache + | \.vscode + | \.venv + | \bdist\b + | \bdoc\b +) +''' + +[tool.isort] +profile = "black" +line_length = 115 +multi_line_output = 3 + +[tool.autopep8] +max_line_length = 115 +in-place = true +recursive = true +aggressive = 3 + +[tool.mypy] +python_version = "3.10" +ignore_missing_imports = true +no_site_packages = true +allow_redefinition = false +warn_unused_configs = true +warn_unused_ignores = true +warn_no_return = true +warn_return_any = false +warn_unreachable = true 
+show_error_codes = true +pretty = true + +[tool.mypy-tests] +strict_optional = false + +[tool.flake8] +per-file-ignores = [ + '__init__.py:F401', + '*.pyi:E302,E305', + '*.py:E203' +] diff --git a/classifiers/scripts/fineweb_100b.sh b/classifiers/scripts/fineweb_100b.sh new file mode 100644 index 00000000..45298889 --- /dev/null +++ b/classifiers/scripts/fineweb_100b.sh @@ -0,0 +1,45 @@ +#! /bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100*/*.jsonl.zstd' + +NUM_NODES=2 +MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier" +CLUSTER="ai2/jupiter*" +BATCH_SIZE=1024 +PRIORITY="high" + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}" + +# Set the run name as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \ + --shared-memory 10GiB \ + --install "pip install -e classifiers/" \ + --yes \ + -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" 
--node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8" diff --git a/classifiers/scripts/fineweb_40b.sh b/classifiers/scripts/fineweb_40b.sh new file mode 100644 index 00000000..69c43247 --- /dev/null +++ b/classifiers/scripts/fineweb_40b.sh @@ -0,0 +1,45 @@ +#! /bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/*/*zstd' +NUM_NODES=1 +BATCH_SIZE=1024 +CLUSTER="ai2/neptune*" +PRIORITY="high" +MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier" + + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}" + +# Set the run name as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \ + --shared-memory 10GiB \ + --install "pip install -e classifiers/" \ + --yes \ + -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 
8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --prefetch-factor 8" diff --git a/classifiers/scripts/fineweb_50b_extra.sh b/classifiers/scripts/fineweb_50b_extra.sh new file mode 100644 index 00000000..d80909c8 --- /dev/null +++ b/classifiers/scripts/fineweb_50b_extra.sh @@ -0,0 +1,45 @@ +#! /bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/20240909-50b/*zstd' +NUM_NODES=1 +MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier" +CLUSTER="ai2/jupiter*" +BATCH_SIZE=1024 +PRIORITY="high" + + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}" + +# Set the run name as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \ + --shared-memory 10GiB \ + --install "pip install -e classifiers/" \ + --yes \ + -- 
/bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8" diff --git a/classifiers/scripts/fineweb_automath_arxiv.sh b/classifiers/scripts/fineweb_automath_arxiv.sh new file mode 100644 index 00000000..ed9afe84 --- /dev/null +++ b/classifiers/scripts/fineweb_automath_arxiv.sh @@ -0,0 +1,45 @@ +#! /bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/arxiv/*/*.gz' + +NUM_NODES=1 +MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier" +CLUSTER="ai2/jupiter*" +BATCH_SIZE=1024 +PRIORITY="urgent" + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}" + +# Set the run name as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret 
AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \ + --shared-memory 10GiB \ + --install "pip install -e classifiers/" \ + --yes \ + -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8" diff --git a/classifiers/scripts/fineweb_automath_code.sh b/classifiers/scripts/fineweb_automath_code.sh new file mode 100644 index 00000000..a9769496 --- /dev/null +++ b/classifiers/scripts/fineweb_automath_code.sh @@ -0,0 +1,45 @@ +#! /bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/code/*/*.gz' + +NUM_NODES=1 +MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier" +CLUSTER="ai2/jupiter*" +BATCH_SIZE=1024 +PRIORITY="urgent" + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}" + +# Set the run name as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env 
BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \ + --shared-memory 10GiB \ + --install "pip install -e classifiers/" \ + --yes \ + -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8" diff --git a/classifiers/scripts/fineweb_automath_web.sh b/classifiers/scripts/fineweb_automath_web.sh new file mode 100644 index 00000000..2994999a --- /dev/null +++ b/classifiers/scripts/fineweb_automath_web.sh @@ -0,0 +1,45 @@ +#! 
/bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/web/*.gz' + +NUM_NODES=1 +MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier" +CLUSTER="ai2/jupiter*" +BATCH_SIZE=1024 +PRIORITY="urgent" + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}" + +# Set the run name as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \ + --shared-memory 10GiB \ + --install "pip install -e classifiers/" \ + --yes \ + -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8" diff --git a/classifiers/scripts/fineweb_dclm07.sh b/classifiers/scripts/fineweb_dclm07.sh new 
file mode 100644 index 00000000..bb41d32b --- /dev/null +++ b/classifiers/scripts/fineweb_dclm07.sh @@ -0,0 +1,45 @@ +#! /bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile/documents/*zst' + +NUM_NODES=4 +MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier" +CLUSTER="ai2/jupiter*" +BATCH_SIZE=1024 +PRIORITY="urgent" + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}" + +# Set the run name as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \ + --shared-memory 10GiB \ + --install "pip install -e classifiers/" \ + --yes \ + -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 
4" diff --git a/classifiers/scripts/fineweb_flan.sh b/classifiers/scripts/fineweb_flan.sh new file mode 100644 index 00000000..4def83f0 --- /dev/null +++ b/classifiers/scripts/fineweb_flan.sh @@ -0,0 +1,45 @@ +#! /bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/documents/*.gz' + +NUM_NODES=1 +MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier" +CLUSTER="ai2/jupiter*" +BATCH_SIZE=1024 +PRIORITY="urgent" + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}" + +# Set the run name as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \ + --shared-memory 10GiB \ + --install "pip install -e classifiers/" \ + --yes \ + -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix 
${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8" diff --git a/classifiers/scripts/fineweb_full.sh b/classifiers/scripts/fineweb_full.sh new file mode 100644 index 00000000..a74d07f7 --- /dev/null +++ b/classifiers/scripts/fineweb_full.sh @@ -0,0 +1,45 @@ +#! /bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/full/*.jsonl.zstd' + +NUM_NODES=4 +MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier" +CLUSTER="ai2/jupiter*" +BATCH_SIZE=1024 +PRIORITY="urgent" + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}" + +# Set the run name as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \ + --shared-memory 10GiB \ + --install "pip install -e classifiers/" \ + --yes \ + -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank 
"\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8" diff --git a/classifiers/scripts/fineweb_owm.sh b/classifiers/scripts/fineweb_owm.sh new file mode 100644 index 00000000..29d872d1 --- /dev/null +++ b/classifiers/scripts/fineweb_owm.sh @@ -0,0 +1,45 @@ +#! /bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/proof-pile-2/v0_decontaminated/documents/*/*/*.gz' + +NUM_NODES=1 +MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier" +CLUSTER="ai2/jupiter*" +BATCH_SIZE=1024 +PRIORITY="urgent" + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}" + +# Set the run name as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \ + --shared-memory 10GiB \ + --install "pip install -e classifiers/" \ + --yes \ + -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" 
--nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8" diff --git a/classifiers/scripts/fineweb_pes2o.sh b/classifiers/scripts/fineweb_pes2o.sh new file mode 100644 index 00000000..38386ac7 --- /dev/null +++ b/classifiers/scripts/fineweb_pes2o.sh @@ -0,0 +1,45 @@ +#! /bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/*/*/*/*.gz' + +NUM_NODES=1 +MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier" +CLUSTER="ai2/jupiter*" +BATCH_SIZE=1024 +PRIORITY="urgent" + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}" + +# Set the run name as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \ + --shared-memory 10GiB \ + --install "pip install -e classifiers/" \ + --yes \ + -- 
/bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8" diff --git a/classifiers/scripts/fineweb_se.sh b/classifiers/scripts/fineweb_se.sh new file mode 100644 index 00000000..7fe3e43b --- /dev/null +++ b/classifiers/scripts/fineweb_se.sh @@ -0,0 +1,45 @@ +#! /bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/stackexchange/v0/documents/20240930/*.zst' + +NUM_NODES=1 +MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier" +CLUSTER="ai2/jupiter*" +BATCH_SIZE=1024 +PRIORITY="urgent" + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}" + +# Set the run name as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret 
WANDB_API_KEY=lucas-WANDB_API_KEY \ + --shared-memory 10GiB \ + --install "pip install -e classifiers/" \ + --yes \ + -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8" diff --git a/classifiers/scripts/nvidia-deberta-100b.sh b/classifiers/scripts/nvidia-deberta-100b.sh new file mode 100644 index 00000000..5c29e669 --- /dev/null +++ b/classifiers/scripts/nvidia-deberta-100b.sh @@ -0,0 +1,57 @@ +#! /bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100*/*.jsonl.zstd' + + +# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0000*.jsonl.zstd' +# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0001*.jsonl.zstd' +# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0002*.jsonl.zstd' + +# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0000*.jsonl.zstd' +# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0001*.jsonl.zstd' +# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0002*.jsonl.zstd' + + +NUM_NODES=4 +# NUM_NODES=1 +MODEL_NAME="nvidia/quality-classifier-deberta" +CLUSTER="ai2/jupiter*" +BATCH_SIZE=512 +PRIORITY="high" +# PRIORITY="urgent" + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}" + +# Set the run name 
as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \ + --shared-memory 10GiB \ + --install "pip install -e classifiers/" \ + --yes \ + -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024" diff --git a/classifiers/scripts/nvidia-deberta-40b.sh b/classifiers/scripts/nvidia-deberta-40b.sh new file mode 100644 index 00000000..b3a57c14 --- /dev/null +++ b/classifiers/scripts/nvidia-deberta-40b.sh @@ -0,0 +1,45 @@ +#! 
#!/bin/bash
#
# Submit a 2-node gantry/Beaker job that scores the DCLM "20240909-50b" extra
# documents with NVIDIA's DeBERTa quality classifier (dolma_classifiers.inference).
#
# Requires on PATH: gantry, beaker, jq, md5sum, awk.

set -euo pipefail

# Input glob. Single-quoted so the local shell never expands the wildcards.
DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/20240909-50b/*zstd'

NUM_NODES=2
MODEL_NAME="nvidia/quality-classifier-deberta"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=512
PRIORITY="high"

# Short deterministic run name: first 8 hex chars of md5(model name + documents).
RUN_HASH=$(printf '%s' "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"

# Set the run name as an environment variable
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"

# Resolve the submitting user before launching: under `set -e` a failed lookup
# aborts here instead of submitting a job with an empty BEAKER_USER_ID.
BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')

# Command run by /bin/bash on every replica. The \${BEAKER_*} references stay
# escaped so they are expanded on the replica, not here. The documents glob is
# wrapped in single quotes so the replica's shell passes it to Python verbatim
# instead of attempting pathname expansion on it.
REMOTE_COMMAND="huggingface-cli download ${MODEL_NAME} && torchrun \
--nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 \
--rdzv_id 12347 --rdzv_backend static \
--rdzv_endpoint \${BEAKER_LEADER_REPLICA_HOSTNAME}:29400 \
--node_rank \${BEAKER_REPLICA_RANK} --rdzv_conf 'read_timeout=3600' \
-m dolma_classifiers.inference \
--source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} \
--use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm \
--model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"

gantry run \
  --task-name "${RUN_NAME}" \
  --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
  --allow-dirty \
  --workspace ai2/davidw-oe-annealing \
  --beaker-image 'petew/olmo-torch23-gantry' \
  --timeout -1 \
  --show-logs \
  --host-networking \
  --venv 'base' \
  --priority "${PRIORITY}" \
  --leader-selection \
  --gpus 8 \
  --replicas "${NUM_NODES}" \
  --preemptible \
  --cluster "${CLUSTER}" \
  --budget ai2/oe-data \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env "BEAKER_USER_ID=${BEAKER_USER_ID}" \
  --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
  --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
  --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
  --shared-memory 10GiB \
  --install "pip install -e classifiers/" \
  --yes \
  -- /bin/bash -c "${REMOTE_COMMAND}"
#!/bin/bash
#
# Submit a single-node gantry/Beaker job that scores the AutoMathText "code"
# documents with NVIDIA's DeBERTa quality classifier (dolma_classifiers.inference).
#
# Requires on PATH: gantry, beaker, jq, md5sum, awk.

set -euo pipefail

# Input glob. Single-quoted so the local shell never expands the wildcards.
DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/code/*/*.gz'

NUM_NODES=1
MODEL_NAME="nvidia/quality-classifier-deberta"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=512
PRIORITY="urgent"

# Short deterministic run name: first 8 hex chars of md5(model name + documents).
RUN_HASH=$(printf '%s' "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"

# Set the run name as an environment variable
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"

# Resolve the submitting user before launching: under `set -e` a failed lookup
# aborts here instead of submitting a job with an empty BEAKER_USER_ID.
BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')

# Command run by /bin/bash on every replica. The \${BEAKER_*} references stay
# escaped so they are expanded on the replica, not here. The documents glob is
# wrapped in single quotes so the replica's shell passes it to Python verbatim
# instead of attempting pathname expansion on it.
REMOTE_COMMAND="huggingface-cli download ${MODEL_NAME} && torchrun \
--nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 \
--rdzv_id 12347 --rdzv_backend static \
--rdzv_endpoint \${BEAKER_LEADER_REPLICA_HOSTNAME}:29400 \
--node_rank \${BEAKER_REPLICA_RANK} --rdzv_conf 'read_timeout=3600' \
-m dolma_classifiers.inference \
--source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} \
--use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm \
--model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"

gantry run \
  --task-name "${RUN_NAME}" \
  --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
  --allow-dirty \
  --workspace ai2/davidw-oe-annealing \
  --beaker-image 'petew/olmo-torch23-gantry' \
  --timeout -1 \
  --show-logs \
  --host-networking \
  --venv 'base' \
  --priority "${PRIORITY}" \
  --leader-selection \
  --gpus 8 \
  --replicas "${NUM_NODES}" \
  --preemptible \
  --cluster "${CLUSTER}" \
  --budget ai2/oe-data \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env "BEAKER_USER_ID=${BEAKER_USER_ID}" \
  --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
  --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
  --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
  --shared-memory 10GiB \
  --install "pip install -e classifiers/" \
  --yes \
  -- /bin/bash -c "${REMOTE_COMMAND}"
#!/bin/bash
#
# Submit a single-node gantry/Beaker job that scores the Tulu FLAN documents
# with NVIDIA's DeBERTa quality classifier (dolma_classifiers.inference).
#
# Requires on PATH: gantry, beaker, jq, md5sum, awk.

set -euo pipefail

# Input glob. Single-quoted so the local shell never expands the wildcards.
DOCUMENTS='s3://ai2-llm/pretraining-data/sources/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/documents/*.gz'

NUM_NODES=1
MODEL_NAME="nvidia/quality-classifier-deberta"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=512
PRIORITY="urgent"

# Short deterministic run name: first 8 hex chars of md5(model name + documents).
RUN_HASH=$(printf '%s' "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"

# Set the run name as an environment variable
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"

# Resolve the submitting user before launching: under `set -e` a failed lookup
# aborts here instead of submitting a job with an empty BEAKER_USER_ID.
BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')

# Command run by /bin/bash on every replica. The \${BEAKER_*} references stay
# escaped so they are expanded on the replica, not here. The documents glob is
# wrapped in single quotes so the replica's shell passes it to Python verbatim
# instead of attempting pathname expansion on it.
REMOTE_COMMAND="huggingface-cli download ${MODEL_NAME} && torchrun \
--nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 \
--rdzv_id 12347 --rdzv_backend static \
--rdzv_endpoint \${BEAKER_LEADER_REPLICA_HOSTNAME}:29400 \
--node_rank \${BEAKER_REPLICA_RANK} --rdzv_conf 'read_timeout=3600' \
-m dolma_classifiers.inference \
--source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} \
--use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm \
--model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"

gantry run \
  --task-name "${RUN_NAME}" \
  --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
  --allow-dirty \
  --workspace ai2/davidw-oe-annealing \
  --beaker-image 'petew/olmo-torch23-gantry' \
  --timeout -1 \
  --show-logs \
  --host-networking \
  --venv 'base' \
  --priority "${PRIORITY}" \
  --leader-selection \
  --gpus 8 \
  --replicas "${NUM_NODES}" \
  --preemptible \
  --cluster "${CLUSTER}" \
  --budget ai2/oe-data \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env "BEAKER_USER_ID=${BEAKER_USER_ID}" \
  --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
  --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
  --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
  --shared-memory 10GiB \
  --install "pip install -e classifiers/" \
  --yes \
  -- /bin/bash -c "${REMOTE_COMMAND}"
/bin/bash + +DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/full/*.jsonl.zstd' + + +# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0000*.jsonl.zstd' +# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0001*.jsonl.zstd' +# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0002*.jsonl.zstd' + +# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0000*.jsonl.zstd' +# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0001*.jsonl.zstd' +# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0002*.jsonl.zstd' + + +NUM_NODES=8 +# NUM_NODES=1 +MODEL_NAME="nvidia/quality-classifier-deberta" +CLUSTER="ai2/jupiter*" +BATCH_SIZE=512 +PRIORITY="urgent" +# PRIORITY="urgent" + +# Generate a hash for the run name by combining model name and documents +RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}') +RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}" + +# Set the run name as an environment variable +export BEAKER_EXPERIMENT_NAME="${RUN_NAME}" + + +gantry run \ + --task-name "${RUN_NAME}" \ + --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \ + --allow-dirty \ + --workspace ai2/davidw-oe-annealing \ + --beaker-image 'petew/olmo-torch23-gantry' \ + --timeout -1 \ + --show-logs \ + --host-networking \ + --venv 'base' \ + --priority "${PRIORITY}" \ + --leader-selection \ + --gpus 8 \ + --replicas ${NUM_NODES} \ + --preemptible \ + --cluster "${CLUSTER}" \ + --budget ai2/oe-data \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ + --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \ + 
#!/bin/bash
#
# Submit a single-node gantry/Beaker job that scores the decontaminated
# proof-pile-2 documents with NVIDIA's DeBERTa quality classifier
# (dolma_classifiers.inference).
#
# Requires on PATH: gantry, beaker, jq, md5sum, awk.

set -euo pipefail

# Input glob. Single-quoted so the local shell never expands the wildcards.
DOCUMENTS='s3://ai2-llm/pretraining-data/sources/proof-pile-2/v0_decontaminated/documents/*/*/*.gz'

NUM_NODES=1
MODEL_NAME="nvidia/quality-classifier-deberta"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=512
PRIORITY="urgent"

# Short deterministic run name: first 8 hex chars of md5(model name + documents).
RUN_HASH=$(printf '%s' "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"

# Set the run name as an environment variable
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"

# Resolve the submitting user before launching: under `set -e` a failed lookup
# aborts here instead of submitting a job with an empty BEAKER_USER_ID.
BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')

# Command run by /bin/bash on every replica. The \${BEAKER_*} references stay
# escaped so they are expanded on the replica, not here. The documents glob is
# wrapped in single quotes so the replica's shell passes it to Python verbatim
# instead of attempting pathname expansion on it.
REMOTE_COMMAND="huggingface-cli download ${MODEL_NAME} && torchrun \
--nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 \
--rdzv_id 12347 --rdzv_backend static \
--rdzv_endpoint \${BEAKER_LEADER_REPLICA_HOSTNAME}:29400 \
--node_rank \${BEAKER_REPLICA_RANK} --rdzv_conf 'read_timeout=3600' \
-m dolma_classifiers.inference \
--source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} \
--use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm \
--model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"

gantry run \
  --task-name "${RUN_NAME}" \
  --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
  --allow-dirty \
  --workspace ai2/davidw-oe-annealing \
  --beaker-image 'petew/olmo-torch23-gantry' \
  --timeout -1 \
  --show-logs \
  --host-networking \
  --venv 'base' \
  --priority "${PRIORITY}" \
  --leader-selection \
  --gpus 8 \
  --replicas "${NUM_NODES}" \
  --preemptible \
  --cluster "${CLUSTER}" \
  --budget ai2/oe-data \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env "BEAKER_USER_ID=${BEAKER_USER_ID}" \
  --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
  --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
  --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
  --shared-memory 10GiB \
  --install "pip install -e classifiers/" \
  --yes \
  -- /bin/bash -c "${REMOTE_COMMAND}"
#!/bin/bash
#
# Submit a single-node gantry/Beaker job that scores the StackExchange
# (20240930) documents with NVIDIA's DeBERTa quality classifier
# (dolma_classifiers.inference).
#
# Requires on PATH: gantry, beaker, jq, md5sum, awk.

set -euo pipefail

# Input glob. Single-quoted so the local shell never expands the wildcards.
DOCUMENTS='s3://ai2-llm/pretraining-data/sources/stackexchange/v0/documents/20240930/*.zst'

NUM_NODES=1
MODEL_NAME="nvidia/quality-classifier-deberta"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=512
PRIORITY="urgent"

# Short deterministic run name: first 8 hex chars of md5(model name + documents).
RUN_HASH=$(printf '%s' "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"

# Set the run name as an environment variable
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"

# Resolve the submitting user before launching: under `set -e` a failed lookup
# aborts here instead of submitting a job with an empty BEAKER_USER_ID.
BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')

# Command run by /bin/bash on every replica. The \${BEAKER_*} references stay
# escaped so they are expanded on the replica, not here. The documents glob is
# wrapped in single quotes so the replica's shell passes it to Python verbatim
# instead of attempting pathname expansion on it.
REMOTE_COMMAND="huggingface-cli download ${MODEL_NAME} && torchrun \
--nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 \
--rdzv_id 12347 --rdzv_backend static \
--rdzv_endpoint \${BEAKER_LEADER_REPLICA_HOSTNAME}:29400 \
--node_rank \${BEAKER_REPLICA_RANK} --rdzv_conf 'read_timeout=3600' \
-m dolma_classifiers.inference \
--source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} \
--use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm \
--model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"

gantry run \
  --task-name "${RUN_NAME}" \
  --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
  --allow-dirty \
  --workspace ai2/davidw-oe-annealing \
  --beaker-image 'petew/olmo-torch23-gantry' \
  --timeout -1 \
  --show-logs \
  --host-networking \
  --venv 'base' \
  --priority "${PRIORITY}" \
  --leader-selection \
  --gpus 8 \
  --replicas "${NUM_NODES}" \
  --preemptible \
  --cluster "${CLUSTER}" \
  --budget ai2/oe-data \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env "BEAKER_USER_ID=${BEAKER_USER_ID}" \
  --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
  --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
  --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
  --shared-memory 10GiB \
  --install "pip install -e classifiers/" \
  --yes \
  -- /bin/bash -c "${REMOTE_COMMAND}"
class DocumentsIterableDataset(IterableDataset[Batch]):
    """Stream tokenized documents from files whose paths arrive on a shared queue.

    Whoever iterates this dataset pulls paths from ``input_paths_queue`` until it is
    empty. Each line of a file is decoded as JSON, the text and id are extracted with
    the configured jq selectors, and the text is tokenized into a single-document
    ``Batch``. After a file has been read to the end, an ``OutputPath`` carrying the
    number of documents it contained is put on ``output_paths_queue``.

    Args:
        input_paths_queue: queue of source paths that still have to be read.
        output_paths_queue: queue receiving ``OutputPath(source, count)`` per finished file.
        tokenizer: tokenizer applied to each document's text.
        max_length: truncation length; falls back to ``tokenizer.model_max_length`` when
            ``None`` (or 0).
        text_selector: jq expression selecting the document text.
        id_selector: jq expression selecting the document id.
    """

    def __init__(
        self,
        input_paths_queue: QueueType[str],
        output_paths_queue: QueueType[OutputPath],
        tokenizer: PreTrainedTokenizer,
        max_length: int | None,
        text_selector: str = '.text',
        id_selector: str = ".id",
    ):
        self.input_paths_queue = input_paths_queue
        self.output_paths_queue = output_paths_queue

        self.text_selector = text_selector
        self.id_selector = id_selector
        self.tokenizer = tokenizer
        self.logger = get_logger(self.__class__.__name__)
        self.max_length = max_length or int(tokenizer.model_max_length)

    @property
    def worker_info(self):
        """Return ``(worker_rank, world_size)`` of the current DataLoader worker.

        Falls back to ``(0, 1)`` when ``get_worker_info()`` returns nothing.
        """
        worker_rank = 0
        world_size = 1
        if (worker_info := get_worker_info()):
            worker_rank = worker_info.id
            world_size = worker_info.num_workers
        return worker_rank, world_size

    def __iter__(self) -> Generator[Batch, None, None]:
        decoder = msgspec.json.Decoder()
        text_selector = jq.compile(self.text_selector)
        id_selector = jq.compile(self.id_selector)

        while True:
            # Take a path without blocking. Checking qsize() and then calling a blocking
            # get() is a check-then-act race: another worker can grab the last path in
            # between, leaving this worker blocked forever on an empty queue.
            try:
                path = self.input_paths_queue.get_nowait()
            except Empty:
                break

            self.logger.info(f"Reading {path}")
            count = 0
            with smart_open.open(path, "rt") as source_file:
                for line in source_file:
                    doc = decoder.decode(line)
                    text = str(text_selector.input(doc).first())
                    id_ = str(id_selector.input(doc).first())
                    encoding = self.tokenizer(
                        text,
                        return_tensors="pt",
                        truncation=True,
                        max_length=self.max_length,
                    )
                    # `lengths` is the length of the text in characters, not in tokens
                    yield Batch(encoding=encoding, ids=[id_], lengths=[len(text)], sources=[path])
                    count += 1

            self.logger.info(f"Read {count:,} documents from {path}")
            # report how many documents this file contained
            self.output_paths_queue.put(OutputPath(source=path, count=count))
pyright: ignore + batch_first=True, + padding_value=pad_token_id, + ) + for key in batch[0].encoding.keys() + } + return Batch( + encoding=padded_encodings, + ids=[id_ for elem in batch for id_ in elem.ids], + lengths=[length for elem in batch for length in elem.lengths], + sources=[source for elem in batch for source in elem.sources], + ) + + +class AttributeRow(NamedTuple): + sources: list[str] + attributes: list[dict[str, Any]] + + +def writer_worker( + error_event: EventType, + scores_queue: QueueType[AttributeRow | None], + output_paths_queue: QueueType[OutputPath], + source_destination_mapping: dict[str, str], + log_every: int = 10_000, +): + + progress_logger = ProgressLogger(log_every=log_every, wandb_logger=WandbLogger()) + console_logger = get_logger("writer_worker") + + files_writers = {} + try: + encoder = msgspec.json.Encoder() + counts = defaultdict(int) + total_count = 0 + + while True: + if scores_queue.qsize() == 0: + time.sleep(0.1) + continue + + element = scores_queue.get() + if element is None: + break + + group_by_source = defaultdict(list) + for source, attribute in zip(element.sources, element.attributes): + group_by_source[source].append(attribute) + if source not in files_writers: + destination_path = source_destination_mapping[source] + files_writers[source] = smart_open.open(destination_path, "wt", encoding="utf-8") + console_logger.info(f"Opened {destination_path} for writing") + + for source, attributes in group_by_source.items(): + files_writers[source].write( + encoder.encode_lines(attributes).decode("utf-8") + ) + progress_logger.increment(docs=len(attributes)) + counts[source] += len(attributes) + total_count += len(attributes) + + if total_count > log_every: + # we at most close one file per log_every documents + try: + # get the paths from the output queue (these have been fully processed) + path = output_paths_queue.get_nowait() + except Empty: + path = None + + if path is not None and path.count == counts[path.source]: + # I've 
def process_documents(
    source_paths: list[str],
    destination_paths: list[str],
    batch_size: int,
    model_name: str,
    model_dtype: str,
    model_compile: bool,
    log_every: int,
    max_length: int | None = None,
    text_selector: str = ".text",
    id_selector: str = ".id",
    num_workers: int = 1,
    prefetch_factor: int = 2,
    suffix: str | None = None
):
    """Score every document in ``source_paths`` and write attributes to ``destination_paths``.

    Documents are read and tokenized by DataLoader workers, scored on this process's GPU,
    and handed to a separate writer process through a queue.

    Args:
        source_paths: input files assigned to this process.
        destination_paths: output file for each entry of ``source_paths`` (same order).
        batch_size: number of documents per forward pass.
        model_name: name passed to ``Registry.get`` to load the classifier.
        model_dtype: dtype name passed to ``Registry.get`` (``"float16"`` when empty).
        model_compile: forwarded to ``Registry.get`` as ``compile``.
        log_every: progress logging interval forwarded to the writer process.
        max_length: tokenizer truncation length (``None`` lets the dataset pick its default).
        text_selector: jq expression selecting the document text.
        id_selector: jq expression selecting the document id.
        num_workers: number of DataLoader workers.
        prefetch_factor: DataLoader prefetch factor; only applied when ``num_workers > 0``.
        suffix: accepted for interface compatibility; not used by this function.

    Raises:
        RuntimeError: if the writer process reported an error.
    """
    if not source_paths:
        # Nothing assigned to this process (e.g. more ranks than files). Bail out before
        # loading the model or indexing source_paths[0] below.
        cleanup()
        return

    classifier = Registry.get(
        model_name=model_name,
        device=f'cuda:{get_local_gpu_rank()}',
        # honour the requested dtype instead of always forcing float16
        dtype=model_dtype or 'float16',
        compile=model_compile,
    )

    # get filesystem for first source path (we assume is the same for all source paths); we will use this
    # to check if destination path exists (file already processed)
    fs = fsspec.get_filesystem_class(urlparse(source_paths[0]).scheme)()

    source_destination_mapping = {
        source_path: destination_path
        for source_path, destination_path in zip(source_paths, destination_paths)
        if not fs.exists(destination_path)
    }

    # a tokenizer may define pad_token_id as None; fall back to 0 in that case as well
    pad_token_id = getattr(classifier.tokenizer, "pad_token_id", 0)
    if pad_token_id is None:
        pad_token_id = 0

    # DataLoader only accepts prefetch_factor when worker processes are used
    loader_kwargs: dict[str, Any] = {}
    if num_workers > 0:
        loader_kwargs["prefetch_factor"] = prefetch_factor

    with torch.no_grad(), mp.Manager() as manager:
        input_paths_queue: QueueType[str] = manager.Queue()
        output_paths_queue: QueueType[OutputPath] = manager.Queue()
        scores_queue: QueueType[AttributeRow | None] = manager.Queue()
        for source_path in source_destination_mapping:
            input_paths_queue.put(source_path)

        writer_process_error = Event()
        writer_process = Process(
            target=writer_worker,
            kwargs=dict(
                scores_queue=scores_queue,
                output_paths_queue=output_paths_queue,
                source_destination_mapping=source_destination_mapping,
                log_every=log_every,
                error_event=writer_process_error,
            ),
        )
        writer_process.start()

        try:
            source_dataset = DocumentsIterableDataset(
                input_paths_queue=input_paths_queue,
                output_paths_queue=output_paths_queue,
                tokenizer=classifier.tokenizer,
                max_length=max_length,
                text_selector=text_selector,
                id_selector=id_selector,
            )

            data_loader = DataLoader(
                source_dataset,
                batch_size=batch_size,
                shuffle=False,
                num_workers=num_workers,
                collate_fn=partial(collate_batch, pad_token_id=pad_token_id),
                **loader_kwargs,
            )

            for batch in data_loader:
                if writer_process_error.is_set():
                    raise RuntimeError("Writer process encountered an error")

                inputs = {k: v.to(classifier.device) for k, v in batch.encoding.items()}
                scores = classifier.score(**inputs)

                # one attribute row per document: each predicted label becomes a
                # [start, end, score] span covering the whole document
                attributes = [
                    {"id": doc_id, "attributes": {pred.label: [[0, doc_length, pred.score]] for pred in doc_preds}}
                    for doc_preds, doc_id, doc_length in zip(scores, batch.ids, batch.lengths)
                ]
                scores_queue.put_nowait(AttributeRow(sources=batch.sources, attributes=attributes))
        finally:
            # Always send the stop sentinel, also when scoring failed: the writer only
            # leaves its loop on `None`, so without it the join() below never returns.
            scores_queue.put(None)
            writer_process.join()

        # Checked after the try/finally so an exception raised while scoring is not
        # replaced by this one.
        if writer_process_error.is_set():
            raise RuntimeError("Writer process encountered an error")

    cleanup()
def main(args: argparse.Namespace) -> None:
    """Resolve input files, skip already-processed ones, and score this rank's share.

    Steps: initialize distributed processing, expand ``args.source_prefix`` into concrete
    paths, derive one destination path per source under ``args.output_prefix``, drop
    sources whose destination already exists (unless ``args.override``), split the
    remaining files into contiguous slices per rank, and call ``process_documents`` on
    this rank's slice.

    Raises:
        RuntimeError: if no CUDA device is available.
        AssertionError: if the source pattern matches no files.
    """
    console_logger = get_logger("main")

    # initialize distributed processing
    rank, world_size = setup()

    # initialize wandb logging (if enabled)
    WandbLogger()

    # check for available GPUs
    if not torch.cuda.is_available():
        raise RuntimeError("No GPUs available, but the script is designed to use multiple GPUs.")

    # if necessary, unglob source prefix
    fs = fsspec.get_filesystem_class((scheme := urlparse(args.source_prefix).scheme))()

    def _with_scheme(path: str) -> str:
        # fs.glob results are re-prefixed with the scheme here; paths without a scheme
        # (local files) are left as they are.
        return f"{scheme}://{path}" if scheme else path

    source_paths = [_with_scheme(p) for p in fs.glob(args.source_prefix)]

    assert len(source_paths) > 0, f"No files found in {args.source_prefix}"

    if all("/documents/" in p for p in source_paths):
        source_prefix = longest_common_sequence([p.split("/documents/", 1)[0] for p in source_paths])
        source_prefix = f"{source_prefix}/documents/"
    else:
        source_prefix = longest_common_sequence(source_paths)

    destination_paths = [
        f'{args.output_prefix.rstrip("/")}/{p.replace(source_prefix, "").lstrip("/")}' for p in source_paths
    ]

    console_logger.info(f"Processing up to {len(source_paths)} files from {args.source_prefix} to {args.output_prefix}")

    # Filter out existing files unless --override is set
    if not args.override:

        # possible existing destinations might contain more files than destination_paths because it glob
        # at the attribute name level, while destination_paths might only be about a subset of documents.
        # Uses the same scheme handling as the source paths: unconditionally prefixing
        # "{scheme}://" turned local paths into "://..." which never matched.
        possible_existing_destinations = set(
            _with_scheme(p) for p in fs.glob(f'{args.output_prefix.rstrip("/")}/**')
        )
        existing_destinations = {p for p in destination_paths if p in possible_existing_destinations}

        console_logger.info(f"Found {len(existing_destinations)} existing files in {args.output_prefix}")

        if len(existing_destinations) >= len(source_paths):
            console_logger.info("No files left to process, exiting")
            return

        pending = [(p, d) for p, d in zip(source_paths, destination_paths) if d not in existing_destinations]
        source_paths = [p for p, _ in pending]
        destination_paths = [d for _, d in pending]

        console_logger.info(f"After filtering, tagging {len(source_paths)} files")

    # Distribute files across processes: contiguous slices, the last rank takes the remainder
    files_per_process = len(source_paths) / world_size
    start_idx = int(rank * files_per_process)
    end_idx = int((rank + 1) * files_per_process) if rank < world_size - 1 else len(source_paths)
    partition_source_paths = source_paths[start_idx:end_idx]
    partition_destination_paths = destination_paths[start_idx:end_idx]

    console_logger.info(f"Partitioned into {world_size} workers of with avg {files_per_process:.2f} files.")
    console_logger.info(f"Processing GPU {rank}/{world_size}: {len(partition_source_paths)} files")

    if not partition_source_paths:
        # More ranks than files: this rank has nothing to do, so do not load a model
        # for an empty slice.
        # NOTE(review): like the early return above, this leaves without tearing down
        # whatever setup() initialized — confirm whether cleanup() should be called.
        console_logger.info(f"No files assigned to GPU {rank}/{world_size}, exiting")
        return

    process_documents(
        model_name=args.model_name,
        model_dtype=args.model_dtype,
        log_every=args.log_every,
        source_paths=partition_source_paths,
        destination_paths=partition_destination_paths,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        max_length=args.max_length,
        text_selector=args.text_key,
        id_selector=args.id_key,
        suffix=args.attribute_suffix,
        model_compile=args.model_compile,
        prefetch_factor=args.prefetch_factor,
    )
+ ) + parser.add_argument( + "-s", + "--source-prefix", + type=str, + required=True, + help="S3 glob pattern for input files (e.g., s3://path/to/docs/*/*.jsonl.gz)", + ) + parser.add_argument("--output-prefix", type=str, default=None, help="S3 prefix to save the results") + parser.add_argument("-b", "--batch-size", type=int, default=32, help="Batch size for processing (default: 32)") + parser.add_argument("-m", "--model-name", type=str, required=True, help="Hugging Face model name") + parser.add_argument( + "--max-length", type=int, default=None, help="Maximum sequence length for tokenization (default: None)" + ) + parser.add_argument("--model-compile", action="store_true", help="Compile the model using torch.compile") + parser.add_argument("--use-wandb", action="store_true", help="Use Weights & Biases for logging") + parser.add_argument("--wandb-project", type=str, default=None, help="Weights & Biases project name") + parser.add_argument("--wandb-entity", type=str, default=None, help="Weights & Biases entity name") + parser.add_argument("--wandb-name", type=str, default=None, help="Gantry task name") + parser.add_argument("--override", action="store_true", help="Override existing files") + parser.add_argument("--text-key", type=str, default=".text", help="JQ key to extract text from documents") + parser.add_argument("--id-key", type=str, default=".id", help="JQ key to extract id from documents") + parser.add_argument("--num-workers", type=int, default=1, help="Number of workers for processing") + parser.add_argument("--log-every", type=int, default=10000, help="Log every n documents") + parser.add_argument("--model-dtype", type=str, default="float16", help="Data type for model") + parser.add_argument("--attribute-suffix", type=str, default=None, help="Optional suffix for attribute keys") + parser.add_argument("--prefetch-factor", type=int, default=2, help="Prefetch factor for DataLoader") + opts = parser.parse_args() + + if opts.output_prefix is None: + if 
"/documents/" not in opts.source_prefix: + raise ValueError("Output prefix is required unless source prefix contains 'documents'") + base, _ = opts.source_prefix.split("/documents/", 1) + opts.output_prefix = f"{base}/attributes/{sanitize_model_name(opts.model_name)}" + + if opts.use_wandb: + WandbLogger.use_wandb = True + WandbLogger.project = opts.wandb_project or WandbLogger.project + WandbLogger.entity = opts.wandb_entity or WandbLogger.entity + # use name provided by user, or name of run in wandb, or sanitize model name + WandbLogger.name = opts.wandb_name or WandbLogger.name or sanitize_model_name(opts.model_name, opts.__dict__) + + return opts diff --git a/classifiers/src/dolma_classifiers/inference/loggers.py b/classifiers/src/dolma_classifiers/inference/loggers.py new file mode 100644 index 00000000..a09437b3 --- /dev/null +++ b/classifiers/src/dolma_classifiers/inference/loggers.py @@ -0,0 +1,107 @@ +import logging +import os +import time + +import wandb + +from .utils import get_rank_and_world_size + + +def get_logger(logger_name: str): + rank, world_size = get_rank_and_world_size() + + # Create a custom formatter + class RankFormatter(logging.Formatter): + def format(self, record): + record.rank = rank + record.world_size = world_size + return super().format(record) + + # Create a logger with the given name + logger = logging.getLogger(f'dolma_classifiers.{logger_name}') + logger.setLevel(logging.INFO) + + # Create a handler for console output + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + + # Create and set the custom formatter + formatter = RankFormatter( + '%(asctime)s [%(rank)d/%(world_size)d] %(levelname)s: %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + console_handler.setFormatter(formatter) + + # Add the handler to the logger + logger.addHandler(console_handler) + + return logger + + +class WandbLogger: + is_initialized = False + use_wandb = False + project = os.environ.get("WANDB_PROJECT", "") + entity 
= os.environ.get("WANDB_ENTITY", "") + name = os.environ.get("GANTRY_TASK_NAME", "") + + def __new__(cls, *args, **kwargs): + rank, _ = get_rank_and_world_size() + if not cls.is_initialized and cls.use_wandb and rank == 0: + assert cls.project, "W&B project name is not set" + assert cls.entity, "W&B entity name is not set" + assert cls.name, "W&B run name is not set" + wandb.init(project=cls.project, entity=cls.entity, name=cls.name) + cls.is_initialized = True + return super().__new__(cls, *args, **kwargs) + + def __init__(self): + self.rank, self.world_size = get_rank_and_world_size() + + def log(self, **kwargs): + if (self.rank == 0) and (self.use_wandb): + if step := kwargs.pop("step", None): + wandb.log(kwargs, step=step) + else: + wandb.log(kwargs) + + +class ProgressLogger: + def __init__(self, log_every: int = 10_000, wandb_logger: WandbLogger | None = None): + self.log_every = log_every + self.logger = get_logger(self.__class__.__name__) + self.start_time = self.prev_time = time.time() + self.total_docs = 0 + self.current_docs = 0 + self.current_files = 0 + self.total_files = 0 + self.wandb_logger = wandb_logger + + def increment(self, docs: int = 0, files: int = 0): + self.current_docs += docs + self.current_files += files + self.total_docs += docs + self.total_files += files + + if self.current_docs >= self.log_every or files > 0: + current_time = time.time() + docs_throughput = self.current_docs / (current_time - self.prev_time) + files_throughput = self.current_files / (current_time - self.prev_time) + + self.logger.info( + f"Throughput: {docs_throughput:.2f} docs/s, {files_throughput:.2f} files/s " + + f" ({self.total_docs:.1e} docs; {self.total_files:,} files)" + ) + if self.wandb_logger is not None: + self.wandb_logger.log( + step=self.total_docs, + instant_doc_throughput=docs_throughput, + total_doc_throughput=self.total_docs / (current_time - self.start_time), + instant_file_throughput=files_throughput, + total_file_throughput=self.total_files / 
(current_time - self.start_time), + total_files=self.total_files, + ) + + self.prev_time = current_time + self.current_docs = 0 + self.current_files = 0 diff --git a/classifiers/src/dolma_classifiers/inference/models.py b/classifiers/src/dolma_classifiers/inference/models.py new file mode 100644 index 00000000..ef1a0dd1 --- /dev/null +++ b/classifiers/src/dolma_classifiers/inference/models.py @@ -0,0 +1,163 @@ +from typing import NamedTuple, Type + +import torch +from huggingface_hub import PyTorchModelHubMixin +from torch import nn +from torch.nn import functional as F +from transformers import ( + AutoConfig, + AutoModel, + AutoModelForSequenceClassification, + AutoTokenizer, + PreTrainedModel, + PreTrainedTokenizer, +) +from transformers.modeling_outputs import SequenceClassifierOutput + +from .loggers import get_logger +from .utils import sanitize_model_name + + +class Prediction(NamedTuple): + label: str + score: float + + +class BaseQualityClassifier: + model: PreTrainedModel + tokenizer: PreTrainedTokenizer + + def __init__( + self, + model_name: str, + device: str, + dtype: str, + compile: bool = False, + trust_remote_code: bool = False, + ): + self.model = self._make_model( + model_name=model_name, + device=device, + dtype=dtype, + compile=compile, + trust_remote_code=trust_remote_code, + ) + self.tokenizer = AutoTokenizer.from_pretrained(model_name) # pyright: ignore + + if len(self.model.config.id2label) > 1: + def label_name_fn(label: str): + return f"{sanitize_model_name(model_name)}_{sanitize_model_name(label)}" + else: + def label_name_fn(label: str): + return sanitize_model_name(model_name) + + self.labels_map = { + id_: label_name_fn(label) + for id_, label in self.model.config.id2label.items() + } + + def _make_model( + self, + model_name: str, + device: str, + dtype: str, + compile: bool, + trust_remote_code: bool, + ) -> PreTrainedModel: + model = AutoModelForSequenceClassification.from_pretrained( + pretrained_model_name_or_path=model_name, + 
torch_dtype=getattr(torch, dtype), + trust_remote_code=trust_remote_code, + ) + model = model.to(torch.device(device)) + + if compile: + model = torch.compile(model) # pyright: ignore + + model.eval() # pyright: ignore + + return model # pyright: ignore + + @property + def device(self) -> torch.device: + return self.model.device + + def score(self, **batch: torch.Tensor) -> list[list[Prediction]]: + outputs = self.model(**batch) + scores = ( + F.softmax(outputs.logits, dim=-1) if outputs.logits.size(-1) != 1 else outputs.logits + ) + return [ + [Prediction(label=self.labels_map[i], score=float(score)) for i, score in enumerate(row)] + for row in scores.float().cpu().numpy() + ] + + +class Registry: + _registry: dict[str, Type[BaseQualityClassifier]] = {} + _logger = get_logger("ModelRegistry") + + def __new__(cls, *args, **kwargs): + return cls + + @classmethod + def add(cls, classifier_name: str): + def _add(classifier: Type[BaseQualityClassifier]): + cls._registry[classifier_name] = classifier + return _add + + @classmethod + def get(cls, model_name: str, **kwargs) -> BaseQualityClassifier: + if model_name not in cls._registry: + cls._logger.warning(f"Classifier {model_name} not found in registry; using default classifier") + return BaseQualityClassifier(model_name=model_name, **kwargs) + else: + return cls._registry[model_name](model_name=model_name, **kwargs) + + +@Registry.add("HuggingFaceFW/fineweb-edu-classifier") +class FineWebEduClassifier(BaseQualityClassifier): + pass + + +class QualityModel(nn.Module, PyTorchModelHubMixin): + def __init__(self, config): + super(QualityModel, self).__init__() + self.model = AutoModel.from_pretrained(config["base_model"]) + self.dropout = nn.Dropout(config["fc_dropout"]) + self.fc = nn.Linear(self.model.config.hidden_size, len(config["id2label"])) + + @property + def device(self): + return self.model.device + + def forward(self, input_ids, attention_mask, **kwargs): + features = self.model(input_ids=input_ids, 
attention_mask=attention_mask).last_hidden_state + dropped = self.dropout(features) + outputs = self.fc(dropped) + return SequenceClassifierOutput(logits=outputs[:, 0, :]) + + +@Registry.add("nvidia/quality-classifier-deberta") +class DebertaQualityClassifier(BaseQualityClassifier): + def _make_model( + self, + model_name: str, + device: str, + dtype: str, + compile: bool, + trust_remote_code: bool, + ) -> PreTrainedModel: + model = QualityModel.from_pretrained(model_name) + model = model.to(getattr(torch, dtype)) + model = model.to(torch.device(device)) + + if compile: + model = torch.compile(model) # pyright: ignore + + model.eval() # pyright: ignore + + # for some reason the config is not loaded automatically; need to set it manually + model.config = AutoConfig.from_pretrained(model_name) # pyright: ignore + + return model # pyright: ignore diff --git a/classifiers/src/dolma_classifiers/inference/utils.py b/classifiers/src/dolma_classifiers/inference/utils.py new file mode 100644 index 00000000..b95f1c0b --- /dev/null +++ b/classifiers/src/dolma_classifiers/inference/utils.py @@ -0,0 +1,59 @@ +import os +import re +from hashlib import md5 +from typing import Any + +import msgspec +import torch +import torch.distributed as dist +from smart_open.compression import ( + _handle_zstd, + get_supported_compression_types, + register_compressor, +) + + +def get_rank_and_world_size(): + if dist.is_initialized(): + return dist.get_rank(), dist.get_world_size() + else: + return 0, 1 + + +def get_local_gpu_rank() -> int: + """Returns the local GPU rank for the current process using torch.distributed.""" + if dist.is_initialized(): + return dist.get_rank() % torch.cuda.device_count() + else: + return 0 + + +def setup() -> tuple[int, int]: + if (rank := os.environ.get("RANK")) and (world_size := os.environ.get("WORLD_SIZE")): + dist.init_process_group("nccl", rank=int(rank), world_size=int(world_size)) + + os.environ["CUDA_VISIBLE_DEVICES"] = str(get_local_gpu_rank()) + + 
return get_rank_and_world_size() + + +def cleanup(): + if dist.is_initialized(): + dist.destroy_process_group() + + +def sanitize_model_name(model_name: str, suffix_data: Any = None) -> str: + replaced_with_underscores = re.sub("[^a-zA-Z0-9_]", "_", model_name) + removed_duplicates = re.sub("_{2,}", "_", replaced_with_underscores) + stripped_trailing_underscores = removed_duplicates.strip("_") + + if suffix_data: + # encode suffix_data and use first 6 characters of md5 hash as suffix + encoder = msgspec.json.Encoder() + stripped_trailing_underscores += f"_{md5(encoder.encode(suffix_data)).hexdigest()[:6]}" + + return stripped_trailing_underscores + + +if ".zstd" not in get_supported_compression_types(): + register_compressor(".zstd", _handle_zstd) diff --git a/classifiers/src/dolma_classifiers/label/__init__.py b/classifiers/src/dolma_classifiers/label/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/classifiers/src/dolma_classifiers/label/__main__.py b/classifiers/src/dolma_classifiers/label/__main__.py new file mode 100644 index 00000000..d3f339be --- /dev/null +++ b/classifiers/src/dolma_classifiers/label/__main__.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 + +import argparse +import glob +import json +import logging +import os +from pathlib import Path +from typing import Any, Dict, List +from urllib.parse import urlparse + +import grequests +import jinja2 +import urllib3 + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +OPENAI_API_ENDPOINT = "https://api.openai.com/v1/chat/completions" + + +class DocumentProcessor: + def __init__( + self, + documents_path: str, + destination: str, + prompt_template: str, + api_key: str, + batch_size: int = 5, + max_retries: int = 3, + retry_delay: int = 1 + ): + self.documents_path = documents_path + self.destination = destination + self.prompt_template = prompt_template + self.api_key = 
api_key + self.batch_size = batch_size + self.max_retries = max_retries + self.retry_delay = retry_delay + self.template = jinja2.Template(prompt_template) + + def _create_request(self, document: Dict[str, Any]) -> grequests.AsyncRequest: + """Create a single grequest for a document.""" + try: + # Render the prompt template with document fields + prompt = self.template.render(**document) + + # Prepare the request payload + payload = { + "model": "gpt-4", + "messages": [ + {"role": "system", "content": "You are a helpful assistant that processes documents."}, + {"role": "user", "content": prompt} + ] + } + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}" + } + + # Create the request object + return grequests.post( + OPENAI_API_ENDPOINT, + json=payload, + headers=headers, + timeout=30 + ), document + + except Exception as e: + logger.error(f"Error creating request: {e}") + return None + + def _process_response(self, response, document: Dict[str, Any]) -> Dict[str, Any]: + """Process a single response from the API.""" + try: + if response.status_code == 200: + result = response.json() + document['gpt4_response'] = result['choices'][0]['message']['content'] + else: + document['error'] = f"API Error: {response.status_code} - {response.text}" + except Exception as e: + document['error'] = f"Processing Error: {str(e)}" + + return document + + def _process_batch(self, batch: List[Dict[str, Any]], output_file: str): + """Process a batch of documents and write results to output file.""" + # Create request objects for the batch + request_pairs = [self._create_request(doc) for doc in batch] + requests, documents = zip(*[pair for pair in request_pairs if pair is not None]) + + # Make async requests + responses = grequests.map(requests, size=len(requests)) + + # Process responses and write to file + with open(output_file, 'a') as f: + for response, document in zip(responses, documents): + result = self._process_response(response, 
document) + f.write(json.dumps(result) + '\n') + + def _download_file(self, url: str, local_path: str) -> str: + """Download a remote file to local storage.""" + with urllib3.PoolManager() as http: + response = http.request('GET', url) + if response.status == 200: + with open(local_path, 'w') as f: + f.write(response.data.decode('utf-8')) + return local_path + else: + raise Exception(f"Failed to download file: {response.status}") + + def _get_file_paths(self) -> List[str]: + """Get list of files to process, handling both local and remote paths.""" + if urlparse(self.documents_path).scheme in ('http', 'https'): + # Handle remote files + temp_dir = Path('temp_downloads') + temp_dir.mkdir(exist_ok=True) + + # Download remote files + local_paths = [] + with urllib3.PoolManager() as http: + response = http.request('GET', self.documents_path) + if response.status == 200: + file_list = response.data.decode('utf-8').splitlines() + for url in file_list: + local_path = temp_dir / Path(urlparse(url).path).name + self._download_file(url, str(local_path)) + local_paths.append(str(local_path)) + return local_paths + else: + # Handle local files + return glob.glob(self.documents_path) + + def process_files(self): + """Main method to process all files.""" + # Create destination directory if it doesn't exist + os.makedirs(self.destination, exist_ok=True) + + # Get list of files to process + file_paths = self._get_file_paths() + logger.info(f"Found {len(file_paths)} files to process") + + for file_path in file_paths: + try: + # Read input file + with open(file_path, 'r') as f: + documents = [json.loads(line) for line in f] + + # Create output file path + output_file = os.path.join( + self.destination, + f"processed_{os.path.basename(file_path)}" + ) + + # Process documents in batches + for i in range(0, len(documents), self.batch_size): + batch = documents[i:i + self.batch_size] + self._process_batch(batch, output_file) + logger.info(f"Processed batch {i//self.batch_size + 1} of 
file {file_path}") + + except Exception as e: + logger.error(f"Error processing file {file_path}: {e}") + +def main(): + parser = argparse.ArgumentParser(description='Process documents with GPT-4') + parser.add_argument('--documents', required=True, help='Glob pattern for input documents') + parser.add_argument('--destination', required=True, help='Output directory') + parser.add_argument('--prompt', required=True, help='Prompt template') + parser.add_argument('--api-key', required=True, help='OpenAI API key') + parser.add_argument('--batch-size', type=int, default=5, help='Batch size for processing') + + args = parser.parse_args() + + # Read prompt template from file if it's a file path + prompt_template = args.prompt + if os.path.isfile(args.prompt): + with open(args.prompt, 'r') as f: + prompt_template = f.read() + + processor = DocumentProcessor( + documents_path=args.documents, + destination=args.destination, + prompt_template=prompt_template, + api_key=args.api_key, + batch_size=args.batch_size + ) + + # Run the processor + processor.process_files() + +if __name__ == "__main__": + main() diff --git a/classifiers/src/dolma_classifiers/label/api.py b/classifiers/src/dolma_classifiers/label/api.py new file mode 100644 index 00000000..f3976c9d --- /dev/null +++ b/classifiers/src/dolma_classifiers/label/api.py @@ -0,0 +1,50 @@ +import os +from dataclasses import dataclass, field + +import aiohttp + + +@dataclass(frozen=True) +class Message: + role: str + content: str + + def to_dict(self): + return { + "role": self.role, + "content": self.content + } + + +@dataclass(frozen=True) +class BaseApiRequest: + endpoint: str + messages: list[Message] + parameters: dict = field(default_factory=dict) + headers: dict = field(default_factory=dict) + + async def make(self): + payload = {**self.parameters, "messages": [message.to_dict() for message in self.messages]} + async with aiohttp.ClientSession() as session: + async with session.post(self.endpoint, json=payload, 
headers=self.headers) as response: + return await response.json() + + +@dataclass(frozen=True) +class Gpt4oRequest(BaseApiRequest): + model: str = "gpt-4o" + temperature: float = 1.0 + top_p: float = 1.0 + headers: dict = field( + default_factory=lambda: { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}" + } + ) + + def __post_init__(self): + self.parameters.update({ + "model": self.model, + "temperature": self.temperature, + "top_p": self.top_p + }) diff --git a/classifiers/src/dolma_classifiers/label/templates.py b/classifiers/src/dolma_classifiers/label/templates.py new file mode 100644 index 00000000..15c5749b --- /dev/null +++ b/classifiers/src/dolma_classifiers/label/templates.py @@ -0,0 +1,107 @@ +from typing import Any, Dict, Optional + +import jq + + +class JqTemplate: + """ + A template engine that processes strings containing JQ expressions in {expression} syntax. + Supports escaping curly braces with {{ and }}. + """ + + def __init__(self, template_string: str): + """ + Initialize the template with a template string. + + Args: + template_string: The template string containing JQ expressions in {expression} syntax + """ + self.template_string = template_string + self._compiled = self._compile_template(template_string) + + @staticmethod + def _compile_template(template_string: str) -> list[tuple[str, Optional[jq.jq]]]: + """ + Compile the template string into a list of (text, expression) tuples. 
+ + Args: + template_string: The template string to compile + + Returns: + List of tuples containing (text, compiled_jq_expression) + + Raises: + ValueError: If there are unmatched braces or invalid JQ expressions + """ + parts = [] + current_pos = 0 + + # Handle escaped braces first + template_string = template_string.replace("{{", "\0LEFT_BRACE\0").replace("}}", "\0RIGHT_BRACE\0") + + while current_pos < len(template_string): + # Find next unescaped opening brace + start = template_string.find("{", current_pos) + + if start == -1: + # No more expressions, add remaining text + text = template_string[current_pos:] + text = text.replace("\0LEFT_BRACE\0", "{").replace("\0RIGHT_BRACE\0", "}") + parts.append((text, None)) + break + + # Add text before the expression + if start > current_pos: + text = template_string[current_pos:start] + text = text.replace("\0LEFT_BRACE\0", "{").replace("\0RIGHT_BRACE\0", "}") + parts.append((text, None)) + + # Find matching closing brace + end = template_string.find("}", start) + if end == -1: + raise ValueError(f"Unmatched opening brace at position {start}") + + # Extract and compile JQ expression + expr = template_string[start + 1:end].strip() + try: + compiled_expr = jq.compile(expr) + except ValueError as e: + raise ValueError(f"Invalid JQ expression '{expr}': {str(e)}") + + parts.append(("", compiled_expr)) + current_pos = end + 1 + + return parts + + def render(self, data: Dict[str, Any]) -> str: + """ + Render the template by evaluating all JQ expressions against the provided data. 
+ + Args: + data: Dictionary containing the data to evaluate expressions against + + Returns: + The rendered template string + + Raises: + ValueError: If any JQ expression fails to evaluate + """ + result = [] + + for text, expr in self._compiled: + result.append(text) + if expr is None: + continue + + try: + # Evaluate expression and get first result + evaluated = expr.input(data).first() + # append the evaluated result to the result list + result.append(str(evaluated or "")) + except StopIteration: + # No results from JQ expression + result.append("") + except Exception as e: + raise ValueError(f"Error evaluating expression: {str(e)}") + + return "".join(result) diff --git a/classifiers/src/dolma_classifiers/train.py b/classifiers/src/dolma_classifiers/train.py new file mode 100644 index 00000000..23747b34 --- /dev/null +++ b/classifiers/src/dolma_classifiers/train.py @@ -0,0 +1,94 @@ +import multiprocessing +from dataclasses import dataclass +from functools import partial +from typing import Callable +from urllib.parse import urlparse + +import fsspec +import jq +import smart_open +from msgspec.json import Decoder +from torch.utils.data import Dataset +from tqdm import tqdm + + +@dataclass(frozen=True) +class Document: + text: str + label: str + + +def _label_selector_fn(row: dict, selector: Callable | None, label: str | None) -> str: + if selector is not None: + return str(selector(row).first()) + elif label is not None: + return str(label) + else: + raise ValueError("Either `label` or `selector` must be provided") + + +def read_file(path: str, label: str | None = None, selector: str | None = None) -> list[Document]: + label_fn = partial(_label_selector_fn, label=label, selector=(jq.compile(selector) if selector else None)) + + decoder = Decoder() + documents = [] + + with smart_open.open(path) as f: + for line in f: + row = decoder.decode(line) + label = label_fn(row) + documents.append(Document(text=row["text"], label=label)) + + return documents + + 
+@dataclass(frozen=True) +class DataConfig: + path: str + label: str | None = None + selector: str | None = None + + @staticmethod + def expand(data_config: "DataConfig", fs: fsspec.AbstractFileSystem | None = None) -> list["DataConfig"]: + fs = fs or fsspec.get_filesystem_class(urlparse(data_config.path).scheme)() + assert fs is not None, f"Could not determine filesystem for {data_config.path}" + paths = [str(p) for p in fs.glob(data_config.path)] if "*" in data_config.path else [data_config.path] + return [DataConfig(path=path, label=data_config.label, selector=data_config.selector) for path in paths] + + +class ClassifierDataset(Dataset): + def __init__( + self, + configs: list[DataConfig], + workers: int = 1, + ): + with multiprocessing.Pool(workers) as pool: + expanded_configs: list[DataConfig] = [ + data_config + for data_configs in tqdm( + pool.imap_unordered(DataConfig.expand, configs), + total=len(configs), + desc="Expanding configs", + ) + for data_config in data_configs + ] + + with multiprocessing.Pool(workers) as pool: + self.documents = list( + tqdm( + pool.imap_unordered( + lambda c: read_file(path=c.path, label=c.label, selector=c.selector), + expanded_configs + ), + total=len(expanded_configs), + desc="Reading files", + ) + ) + + print(f"Read {len(self.documents)} documents from {len(expanded_configs)} configs") + + def __len__(self): + return len(self.documents) + + def __getitem__(self, idx): + return self.documents[idx] diff --git a/classifiers/tests/__init__.py b/classifiers/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/classifiers/tests/test_api.py b/classifiers/tests/test_api.py new file mode 100644 index 00000000..b0e34b95 --- /dev/null +++ b/classifiers/tests/test_api.py @@ -0,0 +1,88 @@ +import aiohttp +import pytest +from aioresponses import aioresponses +from dolma_classifiers.label.api import ( # Replace with your actual module name + BaseApiRequest, + Message, +) + + +@pytest.fixture +def mock_api(): + 
with aioresponses() as m: + yield m + + +@pytest.mark.asyncio +async def test_successful_api_request(mock_api): + # Arrange + endpoint = "https://api.example.com/v1/chat" + expected_response = {"response": "Hello, world!", "status": "success"} + + mock_api.post(endpoint, status=200, payload=expected_response) + + request = BaseApiRequest( + endpoint=endpoint, + messages=[Message(role="user", content="Hello!")], + headers={"Authorization": "Bearer test-token"}, + ) + + # Act + response = await request.make() + + # Assert + assert response == expected_response + + +@pytest.mark.asyncio +async def test_api_request_with_error(mock_api): + # Arrange + endpoint = "https://api.example.com/v1/chat" + error_response = {"error": "Invalid token", "status": "error"} + + mock_api.post(endpoint, status=401, payload=error_response) + + request = BaseApiRequest( + endpoint=endpoint, + messages=[Message(role="user", content="Hello!")], + headers={"Authorization": "Bearer invalid-token"}, + ) + + # Act & Assert + with pytest.raises(aiohttp.ClientResponseError) as exc_info: + await request.make() + assert exc_info.value.status == 401 + + +@pytest.mark.asyncio +async def test_api_request_payload(mock_api): + # Arrange + endpoint = "https://api.example.com/v1/chat" + messages = [Message(role="user", content="Hello!")] + parameters = {"temperature": 0.7} + + expected_payload = {"messages": [{"role": "user", "content": "Hello!"}], "temperature": 0.7} + + def match_payload(url, **kwargs): + assert kwargs["json"] == expected_payload + return True + + mock_api.post(endpoint, status=200, callback=match_payload) + + request = BaseApiRequest(endpoint=endpoint, messages=messages, parameters=parameters) + + # Act + await request.make() # If no assertion error is raised, the payload matched + + +@pytest.mark.asyncio +async def test_network_error(mock_api): + # Arrange + endpoint = "https://api.example.com/v1/chat" + mock_api.post(endpoint, exception=aiohttp.ClientConnectionError()) + + request = 
import unittest

from dolma_classifiers.label.templates import JqTemplate


class TestJqTemplate(unittest.TestCase):
    """Behavioural tests for ``JqTemplate`` rendering and template validation."""

    def setUp(self):
        """Build the sample record that every test renders against."""
        self.test_data = dict(
            name="John",
            age=30,
            address=dict(street="123 Main St", city="Springfield"),
            hobbies=["reading", "hiking", "coding"],
        )

    def _rendered(self, source):
        """Compile ``source`` as a template and render it with the sample record."""
        return JqTemplate(source).render(self.test_data)

    def test_basic_expression(self):
        """A single top-level field is substituted into the text."""
        self.assertEqual(self._rendered("Hello, {.name}!"), "Hello, John!")

    def test_nested_object_access(self):
        """Dotted paths reach into nested objects."""
        output = self._rendered("Address: {.address.street}, {.address.city}")
        self.assertEqual(output, "Address: 123 Main St, Springfield")

    def test_array_access(self):
        """Index syntax selects a single array element."""
        output = self._rendered("First hobby: {.hobbies[0]}")
        self.assertEqual(output, "First hobby: reading")

    def test_complex_jq_expression(self):
        """Piped JQ filters are evaluated inside a placeholder."""
        output = self._rendered('Hobbies: {.hobbies | join(", ")}')
        self.assertEqual(output, "Hobbies: reading, hiking, coding")

    def test_escaped_braces(self):
        """Doubled braces are emitted literally instead of being evaluated."""
        output = self._rendered("User {{.name}} is {.age} years old")
        self.assertEqual(output, "User {.name} is 30 years old")

    def test_multiple_expressions(self):
        """Several placeholders in one template are all substituted."""
        output = self._rendered("{.name} lives at {.address.street}")
        self.assertEqual(output, "John lives at 123 Main St")

    def test_missing_field(self):
        """A path that does not exist in the record renders as empty text."""
        self.assertEqual(self._rendered("Name: {.missing_field}"), "Name: ")

    def test_unmatched_brace(self):
        """An opening brace without a closing one is rejected at construction."""
        self.assertRaises(ValueError, JqTemplate, "Hello {.name")

    def test_invalid_jq_expression(self):
        """A placeholder that is not valid JQ is rejected at construction."""
        self.assertRaises(ValueError, JqTemplate, "Hello {invalid!}")

    def test_empty_template(self):
        """The empty template renders to the empty string."""
        self.assertEqual(self._rendered(""), "")

    def test_template_without_expressions(self):
        """Text without placeholders is returned unchanged."""
        self.assertEqual(self._rendered("Hello, world!"), "Hello, world!")

    def test_adjacent_expressions(self):
        """Back-to-back placeholders are rendered without a separator."""
        self.assertEqual(self._rendered("{.name}{.age}"), "John30")

    def test_whitespace_handling(self):
        """Whitespace padding inside a placeholder does not affect the result."""
        self.assertEqual(self._rendered("Hello, { .name }!"), "Hello, John!")
/usr/bin/env bash + +base_dir="${HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents" + +langs=($(du -sh "${base_dir}"/* 2>/dev/null | sort -hr | awk '{print $2}' | xargs -n1 basename)) + +for lang in "${langs[@]}"; do + documents=() + size=0 + while IFS= read -r -d '' file; do + documents+=("$file") + size=$(expr $size + $(stat -c %s "$file")) + done < <(find "${base_dir}/${lang}" -type f \( -name "*.zst" -o -name "*.gz" -o -name "*.gzip" -o -name "*.json" -o -name "*.jsonl" \) -print0) + + # sort documents by name + documents=($(echo "${documents[@]}" | tr ' ' '\n' | sort)) + + # run deduplication + echo "Running fuzzy dedupe for ${lang} with ${size} bytes Bloom filter (files: ${#documents[@]})" + + # Start the output + document_linearized="documents:\n" + + # Loop through the array and append each element + for doc in "${documents[@]}"; do + document_linearized+=" - $doc\n" + done + + config_yaml=$(cat < to leave some room for other processes + processes=$(( $(expr $(nproc) - 4) < ${#documents[@]} ? $(expr $(nproc) - 4) : ${#documents[@]} )) + + # Create a temporary file for the YAML config + temp_config_file=$(mktemp) + + # Write the YAML config to the temporary file + printf "$config_yaml" > "$temp_config_file" + + + set -ex + # Run dolma with the temporary config file + dolma -c "$temp_config_file" dedupe --processes "${processes}" + # cat "$temp_config_file" + set +ex + + + # Remove the temporary file + rm "$temp_config_file" + rm -rf "/tmp/cc_news_${lang}*" + +done diff --git a/configs/cc-news/dedupe_by_year.sh b/configs/cc-news/dedupe_by_year.sh new file mode 100644 index 00000000..321af0b4 --- /dev/null +++ b/configs/cc-news/dedupe_by_year.sh @@ -0,0 +1,69 @@ +#! 
from argparse import ArgumentParser
from queue import Queue
from tempfile import TemporaryDirectory
from typing import Any, Tuple, Union

import smart_open
from dolma.core.parallel import BaseParallelProcessor


class FindBrokenFilesProcessor(BaseParallelProcessor):
    """Read every source file to the end and report the ones that fail.

    Nothing is written to the destination: the processor only streams each
    file line by line and prints a message for any file whose read raises.
    """

    @classmethod
    def increment_progressbar(
        cls,
        queue: "Queue[Union[Tuple[int, ...], None]]",
        /,
        files: int = 0,
        docs: int = 0,
    ):
        """Forward the ``files`` and ``docs`` counters to the base progress bar."""
        return super().increment_progressbar(queue, files=files, docs=docs)

    @classmethod
    def process_single(
        cls,
        source_path: str,
        destination_path: str,
        queue: Queue,
        **kwargs: Any,
    ):
        """Stream one file line by line and report whether it is readable.

        Args:
            source_path: File to scan; opened as UTF-8 text via ``smart_open``.
            destination_path: Unused; this processor writes no output.
            queue: Progress queue handed to ``increment_progressbar``.
            **kwargs: Unused.

        Any exception raised while opening or reading the file is caught and
        printed together with ``source_path``; it is not re-raised, so one
        broken file does not stop the scan of the others.
        """
        # Bind the counter before the ``try`` so it exists even when opening
        # the file itself fails (previously that path hit UnboundLocalError).
        cnt = 0

        try:
            with smart_open.open(source_path, mode="rt", encoding="utf-8") as f:
                for _ in f:
                    cnt += 1
                    if cnt >= 1000:
                        # Flush in batches to keep queue traffic low.
                        cls.increment_progressbar(queue, docs=cnt)
                        cnt = 0
        except Exception as e:  # pylint: disable=broad-except
            print(f"Error {e} in file {source_path}")

        # Always account for the file. Reporting it only when a remainder was
        # left skipped empty files and files with a multiple of 1000 lines.
        cls.increment_progressbar(queue, docs=cnt, files=1)


def parse_args():
    """Parse the command-line options of the broken-file scanner."""
    ag = ArgumentParser()
    ag.add_argument("-s", "--source-prefix", type=str, required=True)
    ag.add_argument("-n", "--num-processes", type=int, default=1)
    ag.add_argument("-u", "--debug", action="store_true")
    ag.add_argument("-t", "--temp-dir", type=str, default=None)
    return ag.parse_args()


def main():
    """Run the scanner, using a throwaway directory for destination and metadata."""
    args = parse_args()

    with TemporaryDirectory(dir=args.temp_dir) as tmpdir:
        # The processor produces no real output, so both the destination and
        # the metadata prefix point at the temporary directory.
        processor = FindBrokenFilesProcessor(
            source_prefix=args.source_prefix,
            destination_prefix=tmpdir,
            metadata_prefix=tmpdir,
            num_processes=args.num_processes,
            debug=args.debug,
        )

        # run the processor
        processor()


if __name__ == "__main__":
    main()
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2017/*.json.zst + attributes: *attributes + output: *output + filter: *filter + + - name: cc-news_2018 + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2018/*.json.zst + attributes: *attributes + output: *output + filter: *filter + + - name: cc-news_2019 + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2019/*.json.zst + attributes: *attributes + output: *output + filter: *filter + + - name: cc-news_2020 + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2020/*.json.zst + attributes: *attributes + output: *output + filter: *filter + + - name: cc-news_2021 + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2021/*.json.zst + attributes: *attributes + output: *output + filter: *filter + + - name: cc-news_2022 + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2022/*.json.zst + attributes: *attributes + output: *output + filter: *filter + + - name: cc-news_2023 + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2023/*.json.zst + attributes: *attributes + output: *output + filter: *filter + + - name: cc-news_2024 + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2024/*.json.zst + attributes: *attributes + output: *output + filter: *filter + + +work_dir: + input: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v1-resiliparse-year/input + output: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v1-resiliparse-year/output + +processes: 188 diff --git a/configs/cc-news/partition_by_lang.py b/configs/cc-news/partition_by_lang.py new file mode 100644 index 00000000..f39a9279 --- /dev/null +++ b/configs/cc-news/partition_by_lang.py @@ -0,0 +1,145 
from argparse import ArgumentParser
from contextlib import ExitStack
import os
from queue import Queue
from tempfile import TemporaryDirectory
from typing import Any, Dict, Tuple, Union

import msgspec
import smart_open
from dolma.core.parallel import BaseParallelProcessor
from dolma.core.data_types import InputSpecWithMetadataAndAttributes, OutputSpec


def _top_language(attributes: Dict[str, Any], attribute_prefix: str) -> Tuple[str, Any]:
    """Return the ``(language, score)`` pair with the highest score.

    Only keys that start with ``attribute_prefix`` are considered; the language
    name is the key with that leading prefix removed. The score is the last
    item of the first span stored under the key (presumably spans are
    ``[start, end, score]`` -- confirm against the tagger output). Keys whose
    span list is empty are ignored. Returns ``("unk", 0)`` when no key
    qualifies.
    """
    prefix_len = len(attribute_prefix)
    scores = {
        # Slice instead of ``str.replace`` so only the leading prefix is removed.
        key[prefix_len:]: spans[0][-1]
        for key, spans in attributes.items()
        if key.startswith(attribute_prefix) and spans
    }
    if not scores:
        return "unk", 0
    return max(scores.items(), key=lambda item: item[1])


class PartitionByLangProcessor(BaseParallelProcessor):
    """Split each document file into per-language files using a language tagger's attributes."""

    @classmethod
    def increment_progressbar(
        cls,
        queue: "Queue[Union[Tuple[int, ...], None]]",
        /,
        files: int = 0,
        skipped: int = 0,
        written: int = 0,
    ):
        """Forward the ``files``, ``skipped`` and ``written`` counters to the base progress bar."""
        return super().increment_progressbar(queue, files=files, skipped=skipped, written=written)

    @classmethod
    def process_single(
        cls,
        source_path: str,
        destination_path: str,
        queue: Queue,
        **kwargs: Any,
    ):
        """Route every document of one file into a per-language output file.

        The document file and its attribute file (same path with
        ``/documents/`` replaced by ``/attributes/<attribute_name>/``) are read
        in lockstep, one line each. For every pair the best-scoring language is
        chosen; the attributes are merged into the document and the document is
        appended to ``<destination dir>/<language>/<destination file name>``.
        Documents whose best score is below ``lang_min_score`` (or that have no
        language attribute) go to the ``unk`` language; the former are also
        counted as ``skipped`` although they are still written.

        Keyword Args:
            attribute_prefix: Prefix of the attribute keys holding language scores (required).
            attribute_name: Name of the attribute directory to read (required).
            lang_min_score: Minimum score, in ``[0, 1]``, to accept a language.

        Raises:
            AssertionError: If a required keyword is missing or the score is out of range.
            ValueError: If the document and attribute files have a different number of lines.
        """

        attribute_prefix = kwargs.get("attribute_prefix", None)
        attribute_name = kwargs.get("attribute_name", None)
        lang_min_score = float(kwargs.get("lang_min_score", -1))

        assert attribute_prefix is not None, "Attribute prefix is required"
        assert attribute_name is not None, "Attribute name is required"
        assert 0 <= lang_min_score <= 1, "Language min score must be between 0 and 1"

        document_parser = msgspec.json.Decoder(InputSpecWithMetadataAndAttributes)
        attribute_parser = msgspec.json.Decoder(OutputSpec)
        encoder = msgspec.json.Encoder()

        dest_dir, dest_file = os.path.split(destination_path)

        written = skipped = 0

        with ExitStack() as stack:
            source_file = stack.enter_context(smart_open.open(source_path, mode="rt", encoding="utf-8"))
            language_attribute_path = source_path.replace("/documents/", f"/attributes/{attribute_name}/")
            language_attribute_file = stack.enter_context(
                smart_open.open(language_attribute_path, mode="rt", encoding="utf-8")
            )
            # One lazily opened writer per language; all are closed by the stack.
            dst_files = {}

            while True:
                raw_doc = source_file.readline()
                raw_attr = language_attribute_file.readline()

                if not raw_doc and not raw_attr:
                    # both files ended together
                    break

                if not raw_doc or not raw_attr:
                    # One file is longer than the other: the remaining lines
                    # cannot be paired, so fail loudly instead of dropping them.
                    raise ValueError(
                        f"Line count mismatch between {source_path} and {language_attribute_path}"
                    )

                attr = attribute_parser.decode(raw_attr)
                top_lang, top_score = _top_language(attr.attributes, attribute_prefix)

                if top_score < lang_min_score:
                    top_lang = "unk"
                    skipped += 1

                doc = document_parser.decode(raw_doc)
                doc.attributes = {**(doc.attributes or {}), **attr.attributes}

                if top_lang not in dst_files:
                    dir_path = os.path.join(dest_dir, top_lang)
                    os.makedirs(dir_path, exist_ok=True)
                    dst_files[top_lang] = stack.enter_context(
                        smart_open.open(os.path.join(dir_path, dest_file), mode="wt", encoding="utf-8")
                    )

                dst_files[top_lang].write(encoder.encode(doc).decode('utf-8') + "\n")
                written += 1

                if (written + skipped) > 1000:
                    # Flush counters in batches to keep queue traffic low.
                    cls.increment_progressbar(queue, written=written, skipped=skipped)
                    written = skipped = 0

        cls.increment_progressbar(queue, written=written, skipped=skipped, files=1)


def parse_args():
    """Parse the command-line options of the language partitioner."""
    ag = ArgumentParser()
    ag.add_argument("-s", "--source-prefix", type=str, required=True)
    ag.add_argument("-d", "--destination-prefix", type=str, required=True)
    ag.add_argument("-n", "--num-processes", type=int, default=1)
    ag.add_argument("-u", "--debug", action="store_true")
    ag.add_argument("--temp-dir", type=str, default=None)
    ag.add_argument("--attribute-name", type=str, default="glotlid_doc_v3_1e2")
    ag.add_argument("--attribute-prefix", type=str, default="glotlid_doc_v3_1e2__glotlid_doc_v3_1e2__")
    ag.add_argument("--lang-min-score", type=float, default=0.5)
    return ag.parse_args()


def main():
    """Run the partitioner, keeping processor metadata in a throwaway directory."""
    args = parse_args()

    with TemporaryDirectory(dir=args.temp_dir) as tmpdir:
        # create the processor
        processor = PartitionByLangProcessor(
            source_prefix=args.source_prefix,
            destination_prefix=args.destination_prefix,
            metadata_prefix=tmpdir,
            num_processes=args.num_processes,
            debug=args.debug,
        )

        # run the processor
        processor(
            attribute_name=args.attribute_name,
            attribute_prefix=args.attribute_prefix,
            lang_min_score=args.lang_min_score,
        )


if __name__ == "__main__":
    main()
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/abz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/abz_Latn + <<: *output + filter: *filter + + - name: cc-news_ace_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ace_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ace_Arab + <<: *output + filter: *filter + + - name: cc-news_ace_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ace_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ace_Latn + <<: *output + filter: *filter + + - name: cc-news_acf_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/acf_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/acf_Latn + <<: *output + filter: *filter + + - name: cc-news_acm_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/acm_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/acm_Arab + <<: *output + filter: *filter + + - name: cc-news_acn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/acn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/acn_Latn + <<: *output + filter: *filter + + - name: 
cc-news_ade_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ade_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ade_Latn + <<: *output + filter: *filter + + - name: cc-news_ady_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ady_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ady_Cyrl + <<: *output + filter: *filter + + - name: cc-news_aeb_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/aeb_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/aeb_Arab + <<: *output + filter: *filter + + - name: cc-news_afr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/afr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/afr_Latn + <<: *output + filter: *filter + + - name: cc-news_agx_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/agx_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/agx_Cyrl + <<: *output + filter: *filter + + - name: cc-news_aii_Syrc + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/aii_Syrc/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/aii_Syrc + <<: 
*output + filter: *filter + + - name: cc-news_ajp_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ajp_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ajp_Arab + <<: *output + filter: *filter + + - name: cc-news_ajz_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ajz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ajz_Latn + <<: *output + filter: *filter + + - name: cc-news_akb_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/akb_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/akb_Latn + <<: *output + filter: *filter + + - name: cc-news_aln_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/aln_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/aln_Latn + <<: *output + filter: *filter + + - name: cc-news_alq_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/alq_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/alq_Latn + <<: *output + filter: *filter + + - name: cc-news_als_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/als_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/als_Latn + <<: *output + filter: *filter + + - name: cc-news_alt_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/alt_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/alt_Cyrl + <<: *output + filter: *filter + + - name: cc-news_alz_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/alz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/alz_Latn + <<: *output + filter: *filter + + - name: cc-news_amh_Ethi + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/amh_Ethi/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/amh_Ethi + <<: *output + filter: *filter + + - name: cc-news_ami_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ami_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ami_Latn + <<: *output + filter: *filter + + - name: cc-news_amp_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/amp_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/amp_Latn + <<: *output + filter: *filter + + - name: cc-news_ang_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ang_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ang_Latn + <<: *output + filter: *filter + + - name: cc-news_anp_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/anp_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/anp_Deva + <<: *output + filter: *filter + + - name: cc-news_apc_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/apc_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/apc_Arab + <<: *output + filter: *filter + + - name: cc-news_arb_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/arb_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/arb_Arab + <<: *output + filter: *filter + + - name: cc-news_arb_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/arb_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/arb_Latn + <<: *output + filter: *filter + + - name: cc-news_arg_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/arg_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/arg_Latn + <<: *output + filter: *filter + + - name: cc-news_arn_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/arn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/arn_Latn + <<: *output + filter: *filter + + - name: cc-news_arr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/arr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/arr_Latn + <<: *output + filter: *filter + + - name: cc-news_ars_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ars_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ars_Arab + <<: *output + filter: *filter + + - name: cc-news_ary_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ary_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ary_Arab + <<: *output + filter: *filter + + - name: cc-news_arz_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/arz_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/arz_Arab + <<: *output + filter: *filter + + - name: cc-news_asm_Beng + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/asm_Beng/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/asm_Beng + <<: *output + filter: *filter + + - name: 
cc-news_asm_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/asm_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/asm_Latn + <<: *output + filter: *filter + + - name: cc-news_ast_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ast_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ast_Latn + <<: *output + filter: *filter + + - name: cc-news_ata_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ata_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ata_Latn + <<: *output + filter: *filter + + - name: cc-news_atj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/atj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/atj_Latn + <<: *output + filter: *filter + + - name: cc-news_avk_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/avk_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/avk_Latn + <<: *output + filter: *filter + + - name: cc-news_awa_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/awa_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/awa_Deva + <<: 
*output + filter: *filter + + - name: cc-news_ayp_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ayp_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ayp_Arab + <<: *output + filter: *filter + + - name: cc-news_ayr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ayr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ayr_Latn + <<: *output + filter: *filter + + - name: cc-news_azb_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/azb_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/azb_Arab + <<: *output + filter: *filter + + - name: cc-news_azj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/azj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/azj_Latn + <<: *output + filter: *filter + + - name: cc-news_azz_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/azz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/azz_Latn + <<: *output + filter: *filter + + - name: cc-news_bak_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bak_Cyrl/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bak_Cyrl + <<: *output + filter: *filter + + - name: cc-news_bam_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bam_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bam_Latn + <<: *output + filter: *filter + + - name: cc-news_ban_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ban_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ban_Latn + <<: *output + filter: *filter + + - name: cc-news_bar_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bar_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bar_Latn + <<: *output + filter: *filter + + - name: cc-news_bbc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bbc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bbc_Latn + <<: *output + filter: *filter + + - name: cc-news_bcc_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bcc_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bcc_Arab + <<: *output + filter: *filter + + - name: cc-news_bcl_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bcl_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bcl_Latn + <<: *output + filter: *filter + + - name: cc-news_bel_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bel_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bel_Cyrl + <<: *output + filter: *filter + + - name: cc-news_bem_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bem_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bem_Latn + <<: *output + filter: *filter + + - name: cc-news_ben_Beng + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ben_Beng/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ben_Beng + <<: *output + filter: *filter + + - name: cc-news_ben_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ben_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ben_Latn + <<: *output + filter: *filter + + - name: cc-news_bew_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bew_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bew_Latn + <<: *output + filter: *filter + + - name: cc-news_bho_Deva + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bho_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bho_Deva + <<: *output + filter: *filter + + - name: cc-news_bhp_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bhp_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bhp_Latn + <<: *output + filter: *filter + + - name: cc-news_bim_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bim_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bim_Latn + <<: *output + filter: *filter + + - name: cc-news_bis_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bis_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bis_Latn + <<: *output + filter: *filter + + - name: cc-news_bjn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bjn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bjn_Latn + <<: *output + filter: *filter + + - name: cc-news_bla_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bla_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bla_Latn + <<: *output + filter: *filter + + - name: 
cc-news_blk_Mymr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/blk_Mymr/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/blk_Mymr + <<: *output + filter: *filter + + - name: cc-news_bod_Tibt + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bod_Tibt/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bod_Tibt + <<: *output + filter: *filter + + - name: cc-news_bos_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bos_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bos_Latn + <<: *output + filter: *filter + + - name: cc-news_bpr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bpr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bpr_Latn + <<: *output + filter: *filter + + - name: cc-news_bpy_Beng + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bpy_Beng/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bpy_Beng + <<: *output + filter: *filter + + - name: cc-news_bqj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bqj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bqj_Latn + <<: 
*output + filter: *filter + + - name: cc-news_bre_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bre_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bre_Latn + <<: *output + filter: *filter + + - name: cc-news_brh_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/brh_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/brh_Arab + <<: *output + filter: *filter + + - name: cc-news_brx_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/brx_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/brx_Latn + <<: *output + filter: *filter + + - name: cc-news_bsq_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bsq_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bsq_Latn + <<: *output + filter: *filter + + - name: cc-news_bts_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bts_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bts_Latn + <<: *output + filter: *filter + + - name: cc-news_btx_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/btx_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/btx_Latn + <<: *output + filter: *filter + + - name: cc-news_bug_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bug_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bug_Latn + <<: *output + filter: *filter + + - name: cc-news_bul_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bul_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bul_Cyrl + <<: *output + filter: *filter + + - name: cc-news_bum_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bum_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bum_Latn + <<: *output + filter: *filter + + - name: cc-news_bwu_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bwu_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bwu_Latn + <<: *output + filter: *filter + + - name: cc-news_bxr_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bxr_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bxr_Cyrl + <<: *output + filter: *filter + + - name: cc-news_byv_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/byv_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/byv_Latn + <<: *output + filter: *filter + + - name: cc-news_bzd_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bzd_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bzd_Latn + <<: *output + filter: *filter + + - name: cc-news_bzj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/bzj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/bzj_Latn + <<: *output + filter: *filter + + - name: cc-news_caa_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/caa_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/caa_Latn + <<: *output + filter: *filter + + - name: cc-news_cat_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cat_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cat_Latn + <<: *output + filter: *filter + + - name: cc-news_cbk_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cbk_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cbk_Latn + <<: *output + filter: *filter + + - name: cc-news_ccp_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ccp_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ccp_Latn + <<: *output + filter: *filter + + - name: cc-news_cdf_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cdf_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cdf_Latn + <<: *output + filter: *filter + + - name: cc-news_ceb_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ceb_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ceb_Latn + <<: *output + filter: *filter + + - name: cc-news_ces_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ces_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ces_Latn + <<: *output + filter: *filter + + - name: cc-news_cgc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cgc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cgc_Latn + <<: *output + filter: *filter + + - name: cc-news_cha_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cha_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cha_Latn + <<: *output + filter: *filter + + - name: 
cc-news_che_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/che_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/che_Cyrl + <<: *output + filter: *filter + + - name: cc-news_chk_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/chk_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/chk_Latn + <<: *output + filter: *filter + + - name: cc-news_chr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/chr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/chr_Latn + <<: *output + filter: *filter + + - name: cc-news_chv_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/chv_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/chv_Cyrl + <<: *output + filter: *filter + + - name: cc-news_cjk_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cjk_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cjk_Latn + <<: *output + filter: *filter + + - name: cc-news_ckb_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ckb_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ckb_Arab + <<: 
*output + filter: *filter + + - name: cc-news_ckm_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ckm_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ckm_Latn + <<: *output + filter: *filter + + - name: cc-news_cmn_Hani + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cmn_Hani/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cmn_Hani + <<: *output + filter: *filter + + - name: cc-news_cmr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cmr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cmr_Latn + <<: *output + filter: *filter + + - name: cc-news_cnh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cnh_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cnh_Latn + <<: *output + filter: *filter + + - name: cc-news_cnr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cnr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cnr_Latn + <<: *output + filter: *filter + + - name: cc-news_cof_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cof_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cof_Latn + <<: *output + filter: *filter + + - name: cc-news_cos_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cos_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cos_Latn + <<: *output + filter: *filter + + - name: cc-news_cot_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cot_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cot_Latn + <<: *output + filter: *filter + + - name: cc-news_cou_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cou_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cou_Latn + <<: *output + filter: *filter + + - name: cc-news_cpu_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cpu_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cpu_Latn + <<: *output + filter: *filter + + - name: cc-news_crh_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/crh_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/crh_Cyrl + <<: *output + filter: *filter + + - name: cc-news_crh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/crh_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/crh_Latn + <<: *output + filter: *filter + + - name: cc-news_cri_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cri_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cri_Latn + <<: *output + filter: *filter + + - name: cc-news_crk_Cans + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/crk_Cans/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/crk_Cans + <<: *output + filter: *filter + + - name: cc-news_crs_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/crs_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/crs_Latn + <<: *output + filter: *filter + + - name: cc-news_crx_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/crx_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/crx_Latn + <<: *output + filter: *filter + + - name: cc-news_csw_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/csw_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/csw_Latn + <<: *output + filter: *filter + + - name: cc-news_cto_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cto_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cto_Latn + <<: *output + filter: *filter + + - name: cc-news_cuc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cuc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cuc_Latn + <<: *output + filter: *filter + + - name: cc-news_cuk_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cuk_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cuk_Latn + <<: *output + filter: *filter + + - name: cc-news_cym_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/cym_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/cym_Latn + <<: *output + filter: *filter + + - name: cc-news_dag_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dag_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dag_Latn + <<: *output + filter: *filter + + - name: cc-news_dan_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dan_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dan_Latn + <<: *output + filter: *filter + + - name: 
cc-news_dar_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dar_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dar_Cyrl + <<: *output + filter: *filter + + - name: cc-news_ded_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ded_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ded_Latn + <<: *output + filter: *filter + + - name: cc-news_deu_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/deu_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/deu_Latn + <<: *output + filter: *filter + + - name: cc-news_dgr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dgr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dgr_Latn + <<: *output + filter: *filter + + - name: cc-news_dgz_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dgz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dgz_Latn + <<: *output + filter: *filter + + - name: cc-news_dhv_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dhv_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dhv_Latn + <<: 
*output + filter: *filter + + - name: cc-news_dik_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dik_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dik_Latn + <<: *output + filter: *filter + + - name: cc-news_diq_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/diq_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/diq_Latn + <<: *output + filter: *filter + + - name: cc-news_div_Thaa + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/div_Thaa/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/div_Thaa + <<: *output + filter: *filter + + - name: cc-news_doi_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/doi_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/doi_Deva + <<: *output + filter: *filter + + - name: cc-news_dsb_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dsb_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dsb_Latn + <<: *output + filter: *filter + + - name: cc-news_dsh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dsh_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dsh_Latn + <<: *output + filter: *filter + + - name: cc-news_dwr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dwr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dwr_Latn + <<: *output + filter: *filter + + - name: cc-news_dzo_Tibt + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/dzo_Tibt/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/dzo_Tibt + <<: *output + filter: *filter + + - name: cc-news_efi_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/efi_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/efi_Latn + <<: *output + filter: *filter + + - name: cc-news_ekk_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ekk_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ekk_Latn + <<: *output + filter: *filter + + - name: cc-news_ell_Grek + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ell_Grek/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ell_Grek + <<: *output + filter: *filter + + - name: cc-news_eml_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/eml_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/eml_Latn + <<: *output + filter: *filter + + - name: cc-news_eng_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/eng_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/eng_Latn + <<: *output + filter: *filter + + - name: cc-news_enl_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/enl_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/enl_Latn + <<: *output + filter: *filter + + - name: cc-news_enm_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/enm_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/enm_Latn + <<: *output + filter: *filter + + - name: cc-news_epo_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/epo_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/epo_Latn + <<: *output + filter: *filter + + - name: cc-news_ese_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ese_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ese_Latn + <<: *output + filter: *filter + + - name: cc-news_esi_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/esi_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/esi_Latn + <<: *output + filter: *filter + + - name: cc-news_esk_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/esk_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/esk_Latn + <<: *output + filter: *filter + + - name: cc-news_esu_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/esu_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/esu_Latn + <<: *output + filter: *filter + + - name: cc-news_eus_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/eus_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/eus_Latn + <<: *output + filter: *filter + + - name: cc-news_ewe_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ewe_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ewe_Latn + <<: *output + filter: *filter + + - name: cc-news_ewo_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ewo_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ewo_Latn + <<: *output + filter: *filter + + - name: 
cc-news_ext_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ext_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ext_Latn + <<: *output + filter: *filter + + - name: cc-news_fad_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fad_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fad_Latn + <<: *output + filter: *filter + + - name: cc-news_fao_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fao_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fao_Latn + <<: *output + filter: *filter + + - name: cc-news_fas_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fas_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fas_Arab + <<: *output + filter: *filter + + - name: cc-news_fat_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fat_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fat_Latn + <<: *output + filter: *filter + + - name: cc-news_ffm_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ffm_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ffm_Latn + <<: 
*output + filter: *filter + + - name: cc-news_fij_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fij_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fij_Latn + <<: *output + filter: *filter + + - name: cc-news_fil_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fil_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fil_Latn + <<: *output + filter: *filter + + - name: cc-news_fin_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fin_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fin_Latn + <<: *output + filter: *filter + + - name: cc-news_fit_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fit_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fit_Latn + <<: *output + filter: *filter + + - name: cc-news_fkv_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fkv_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fkv_Latn + <<: *output + filter: *filter + + - name: cc-news_fra_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fra_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fra_Latn + <<: *output + filter: *filter + + - name: cc-news_fro_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fro_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fro_Latn + <<: *output + filter: *filter + + - name: cc-news_frp_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/frp_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/frp_Latn + <<: *output + filter: *filter + + - name: cc-news_frr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/frr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/frr_Latn + <<: *output + filter: *filter + + - name: cc-news_fry_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fry_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fry_Latn + <<: *output + filter: *filter + + - name: cc-news_fuf_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fuf_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fuf_Latn + <<: *output + filter: *filter + + - name: cc-news_fuq_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fuq_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fuq_Latn + <<: *output + filter: *filter + + - name: cc-news_fur_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fur_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fur_Latn + <<: *output + filter: *filter + + - name: cc-news_fuv_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/fuv_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/fuv_Latn + <<: *output + filter: *filter + + - name: cc-news_gaa_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gaa_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gaa_Latn + <<: *output + filter: *filter + + - name: cc-news_gag_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gag_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gag_Latn + <<: *output + filter: *filter + + - name: cc-news_gaz_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gaz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gaz_Latn + <<: *output + filter: *filter + + - name: cc-news_gcf_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gcf_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gcf_Latn + <<: *output + filter: *filter + + - name: cc-news_gcr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gcr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gcr_Latn + <<: *output + filter: *filter + + - name: cc-news_ghs_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ghs_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ghs_Latn + <<: *output + filter: *filter + + - name: cc-news_gid_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gid_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gid_Latn + <<: *output + filter: *filter + + - name: cc-news_gil_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gil_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gil_Latn + <<: *output + filter: *filter + + - name: cc-news_gla_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gla_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gla_Latn + <<: *output + filter: *filter + + - name: 
cc-news_gle_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gle_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gle_Latn + <<: *output + filter: *filter + + - name: cc-news_glg_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/glg_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/glg_Latn + <<: *output + filter: *filter + + - name: cc-news_glk_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/glk_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/glk_Arab + <<: *output + filter: *filter + + - name: cc-news_glv_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/glv_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/glv_Latn + <<: *output + filter: *filter + + - name: cc-news_gmh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gmh_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gmh_Latn + <<: *output + filter: *filter + + - name: cc-news_gmv_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gmv_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gmv_Latn + <<: 
*output + filter: *filter + + - name: cc-news_goh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/goh_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/goh_Latn + <<: *output + filter: *filter + + - name: cc-news_gom_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gom_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gom_Deva + <<: *output + filter: *filter + + - name: cc-news_gom_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gom_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gom_Latn + <<: *output + filter: *filter + + - name: cc-news_gor_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gor_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gor_Latn + <<: *output + filter: *filter + + - name: cc-news_gos_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gos_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gos_Latn + <<: *output + filter: *filter + + - name: cc-news_grc_Grek + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/grc_Grek/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/grc_Grek + <<: *output + filter: *filter + + - name: cc-news_grt_Beng + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/grt_Beng/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/grt_Beng + <<: *output + filter: *filter + + - name: cc-news_gsw_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gsw_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gsw_Latn + <<: *output + filter: *filter + + - name: cc-news_guc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/guc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/guc_Latn + <<: *output + filter: *filter + + - name: cc-news_gug_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gug_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gug_Latn + <<: *output + filter: *filter + + - name: cc-news_gui_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gui_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gui_Latn + <<: *output + filter: *filter + + - name: cc-news_guj_Gujr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/guj_Gujr/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/guj_Gujr + <<: *output + filter: *filter + + - name: cc-news_guj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/guj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/guj_Latn + <<: *output + filter: *filter + + - name: cc-news_guk_Ethi + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/guk_Ethi/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/guk_Ethi + <<: *output + filter: *filter + + - name: cc-news_gux_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gux_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gux_Latn + <<: *output + filter: *filter + + - name: cc-news_guz_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/guz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/guz_Latn + <<: *output + filter: *filter + + - name: cc-news_gwi_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gwi_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gwi_Latn + <<: *output + filter: *filter + + - name: cc-news_gym_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/gym_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/gym_Latn + <<: *output + filter: *filter + + - name: cc-news_hac_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hac_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hac_Arab + <<: *output + filter: *filter + + - name: cc-news_hae_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hae_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hae_Latn + <<: *output + filter: *filter + + - name: cc-news_hak_Hani + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hak_Hani/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hak_Hani + <<: *output + filter: *filter + + - name: cc-news_hak_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hak_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hak_Latn + <<: *output + filter: *filter + + - name: cc-news_hat_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hat_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hat_Latn + <<: *output + filter: *filter + + - name: 
cc-news_hau_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hau_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hau_Latn + <<: *output + filter: *filter + + - name: cc-news_haw_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/haw_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/haw_Latn + <<: *output + filter: *filter + + - name: cc-news_hbo_Hebr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hbo_Hebr/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hbo_Hebr + <<: *output + filter: *filter + + - name: cc-news_hch_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hch_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hch_Latn + <<: *output + filter: *filter + + - name: cc-news_heb_Hebr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/heb_Hebr/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/heb_Hebr + <<: *output + filter: *filter + + - name: cc-news_her_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/her_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/her_Latn + <<: 
*output + filter: *filter + + - name: cc-news_hif_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hif_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hif_Latn + <<: *output + filter: *filter + + - name: cc-news_hil_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hil_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hil_Latn + <<: *output + filter: *filter + + - name: cc-news_hin_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hin_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hin_Deva + <<: *output + filter: *filter + + - name: cc-news_hin_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hin_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hin_Latn + <<: *output + filter: *filter + + - name: cc-news_hmo_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hmo_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hmo_Latn + <<: *output + filter: *filter + + - name: cc-news_hne_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hne_Deva/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hne_Deva + <<: *output + filter: *filter + + - name: cc-news_hns_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hns_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hns_Latn + <<: *output + filter: *filter + + - name: cc-news_hrv_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hrv_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hrv_Latn + <<: *output + filter: *filter + + - name: cc-news_hrx_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hrx_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hrx_Latn + <<: *output + filter: *filter + + - name: cc-news_hsb_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hsb_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hsb_Latn + <<: *output + filter: *filter + + - name: cc-news_hun_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hun_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hun_Latn + <<: *output + filter: *filter + + - name: cc-news_hus_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hus_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hus_Latn + <<: *output + filter: *filter + + - name: cc-news_hwc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hwc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hwc_Latn + <<: *output + filter: *filter + + - name: cc-news_hye_Armn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hye_Armn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hye_Armn + <<: *output + filter: *filter + + - name: cc-news_hyw_Armn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/hyw_Armn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/hyw_Armn + <<: *output + filter: *filter + + - name: cc-news_iba_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/iba_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/iba_Latn + <<: *output + filter: *filter + + - name: cc-news_ibg_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ibg_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ibg_Latn + <<: *output + filter: *filter + + - name: cc-news_ibo_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ibo_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ibo_Latn + <<: *output + filter: *filter + + - name: cc-news_icr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/icr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/icr_Latn + <<: *output + filter: *filter + + - name: cc-news_ido_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ido_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ido_Latn + <<: *output + filter: *filter + + - name: cc-news_idu_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/idu_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/idu_Latn + <<: *output + filter: *filter + + - name: cc-news_ike_Cans + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ike_Cans/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ike_Cans + <<: *output + filter: *filter + + - name: cc-news_ile_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ile_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ile_Latn + <<: *output + filter: *filter + + - name: 
cc-news_ilo_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ilo_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ilo_Latn + <<: *output + filter: *filter + + - name: cc-news_ina_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ina_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ina_Latn + <<: *output + filter: *filter + + - name: cc-news_ind_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ind_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ind_Latn + <<: *output + filter: *filter + + - name: cc-news_inh_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/inh_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/inh_Cyrl + <<: *output + filter: *filter + + - name: cc-news_isl_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/isl_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/isl_Latn + <<: *output + filter: *filter + + - name: cc-news_ita_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ita_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ita_Latn + <<: 
*output + filter: *filter + + - name: cc-news_itv_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/itv_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/itv_Latn + <<: *output + filter: *filter + + - name: cc-news_jam_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/jam_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/jam_Latn + <<: *output + filter: *filter + + - name: cc-news_jav_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/jav_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/jav_Latn + <<: *output + filter: *filter + + - name: cc-news_jpn_Jpan + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/jpn_Jpan/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/jpn_Jpan + <<: *output + filter: *filter + + - name: cc-news_kaa_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kaa_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kaa_Cyrl + <<: *output + filter: *filter + + - name: cc-news_kaa_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kaa_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kaa_Latn + <<: *output + filter: *filter + + - name: cc-news_kab_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kab_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kab_Latn + <<: *output + filter: *filter + + - name: cc-news_kak_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kak_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kak_Latn + <<: *output + filter: *filter + + - name: cc-news_kal_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kal_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kal_Latn + <<: *output + filter: *filter + + - name: cc-news_kam_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kam_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kam_Latn + <<: *output + filter: *filter + + - name: cc-news_kan_Knda + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kan_Knda/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kan_Knda + <<: *output + filter: *filter + + - name: cc-news_kan_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kan_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kan_Latn + <<: *output + filter: *filter + + - name: cc-news_kao_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kao_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kao_Latn + <<: *output + filter: *filter + + - name: cc-news_kas_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kas_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kas_Arab + <<: *output + filter: *filter + + - name: cc-news_kas_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kas_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kas_Deva + <<: *output + filter: *filter + + - name: cc-news_kas_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kas_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kas_Latn + <<: *output + filter: *filter + + - name: cc-news_kat_Geor + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kat_Geor/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kat_Geor + <<: *output + filter: *filter + + - name: cc-news_kaz_Cyrl + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kaz_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kaz_Cyrl + <<: *output + filter: *filter + + - name: cc-news_kbd_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kbd_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kbd_Cyrl + <<: *output + filter: *filter + + - name: cc-news_kbp_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kbp_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kbp_Latn + <<: *output + filter: *filter + + - name: cc-news_kca_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kca_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kca_Cyrl + <<: *output + filter: *filter + + - name: cc-news_kck_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kck_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kck_Latn + <<: *output + filter: *filter + + - name: cc-news_kdr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kdr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kdr_Latn + <<: *output + filter: *filter + + - name: 
cc-news_kea_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kea_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kea_Latn + <<: *output + filter: *filter + + - name: cc-news_kff_Telu + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kff_Telu/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kff_Telu + <<: *output + filter: *filter + + - name: cc-news_kha_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kha_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kha_Latn + <<: *output + filter: *filter + + - name: cc-news_khk_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/khk_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/khk_Cyrl + <<: *output + filter: *filter + + - name: cc-news_khm_Khmr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/khm_Khmr/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/khm_Khmr + <<: *output + filter: *filter + + - name: cc-news_khz_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/khz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/khz_Latn + <<: 
*output + filter: *filter + + - name: cc-news_kik_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kik_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kik_Latn + <<: *output + filter: *filter + + - name: cc-news_kin_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kin_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kin_Latn + <<: *output + filter: *filter + + - name: cc-news_kir_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kir_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kir_Cyrl + <<: *output + filter: *filter + + - name: cc-news_kiu_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kiu_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kiu_Latn + <<: *output + filter: *filter + + - name: cc-news_kjh_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kjh_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kjh_Cyrl + <<: *output + filter: *filter + + - name: cc-news_kmb_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kmb_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kmb_Latn + <<: *output + filter: *filter + + - name: cc-news_kmg_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kmg_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kmg_Latn + <<: *output + filter: *filter + + - name: cc-news_kmr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kmr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kmr_Latn + <<: *output + filter: *filter + + - name: cc-news_kmy_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kmy_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kmy_Latn + <<: *output + filter: *filter + + - name: cc-news_knc_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/knc_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/knc_Arab + <<: *output + filter: *filter + + - name: cc-news_knc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/knc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/knc_Latn + <<: *output + filter: *filter + + - name: cc-news_kne_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kne_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kne_Latn + <<: *output + filter: *filter + + - name: cc-news_kog_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kog_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kog_Latn + <<: *output + filter: *filter + + - name: cc-news_koi_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/koi_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/koi_Cyrl + <<: *output + filter: *filter + + - name: cc-news_kor_Hang + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kor_Hang/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kor_Hang + <<: *output + filter: *filter + + - name: cc-news_kos_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kos_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kos_Latn + <<: *output + filter: *filter + + - name: cc-news_kpv_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kpv_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kpv_Cyrl + <<: *output + filter: *filter + + - name: cc-news_krc_Cyrl + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/krc_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/krc_Cyrl + <<: *output + filter: *filter + + - name: cc-news_kri_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kri_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kri_Latn + <<: *output + filter: *filter + + - name: cc-news_krj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/krj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/krj_Latn + <<: *output + filter: *filter + + - name: cc-news_krl_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/krl_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/krl_Latn + <<: *output + filter: *filter + + - name: cc-news_ksd_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ksd_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ksd_Latn + <<: *output + filter: *filter + + - name: cc-news_ksh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ksh_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ksh_Latn + <<: *output + filter: *filter + + - name: 
cc-news_ksw_Mymr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ksw_Mymr/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ksw_Mymr + <<: *output + filter: *filter + + - name: cc-news_ktb_Ethi + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ktb_Ethi/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ktb_Ethi + <<: *output + filter: *filter + + - name: cc-news_kua_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kua_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kua_Latn + <<: *output + filter: *filter + + - name: cc-news_kum_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kum_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kum_Cyrl + <<: *output + filter: *filter + + - name: cc-news_kup_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kup_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kup_Latn + <<: *output + filter: *filter + + - name: cc-news_kus_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kus_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kus_Latn + <<: 
*output + filter: *filter + + - name: cc-news_kwn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kwn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kwn_Latn + <<: *output + filter: *filter + + - name: cc-news_kwy_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kwy_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kwy_Latn + <<: *output + filter: *filter + + - name: cc-news_kxm_Thai + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/kxm_Thai/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/kxm_Thai + <<: *output + filter: *filter + + - name: cc-news_lad_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lad_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lad_Latn + <<: *output + filter: *filter + + - name: cc-news_laj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/laj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/laj_Latn + <<: *output + filter: *filter + + - name: cc-news_lam_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lam_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lam_Latn + <<: *output + filter: *filter + + - name: cc-news_lao_Laoo + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lao_Laoo/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lao_Laoo + <<: *output + filter: *filter + + - name: cc-news_lat_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lat_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lat_Latn + <<: *output + filter: *filter + + - name: cc-news_lez_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lez_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lez_Cyrl + <<: *output + filter: *filter + + - name: cc-news_lij_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lij_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lij_Latn + <<: *output + filter: *filter + + - name: cc-news_lim_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lim_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lim_Latn + <<: *output + filter: *filter + + - name: cc-news_lin_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lin_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lin_Latn + <<: *output + filter: *filter + + - name: cc-news_lip_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lip_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lip_Latn + <<: *output + filter: *filter + + - name: cc-news_lit_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lit_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lit_Latn + <<: *output + filter: *filter + + - name: cc-news_lki_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lki_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lki_Arab + <<: *output + filter: *filter + + - name: cc-news_lld_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lld_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lld_Latn + <<: *output + filter: *filter + + - name: cc-news_lmk_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lmk_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lmk_Latn + <<: *output + filter: *filter + + - name: cc-news_lmo_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lmo_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lmo_Latn + <<: *output + filter: *filter + + - name: cc-news_loz_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/loz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/loz_Latn + <<: *output + filter: *filter + + - name: cc-news_lrc_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lrc_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lrc_Arab + <<: *output + filter: *filter + + - name: cc-news_ltg_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ltg_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ltg_Latn + <<: *output + filter: *filter + + - name: cc-news_ltz_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ltz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ltz_Latn + <<: *output + filter: *filter + + - name: cc-news_lub_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lub_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lub_Latn + <<: *output + filter: *filter + + - name: 
cc-news_lue_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lue_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lue_Latn + <<: *output + filter: *filter + + - name: cc-news_lug_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lug_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lug_Latn + <<: *output + filter: *filter + + - name: cc-news_lun_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lun_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lun_Latn + <<: *output + filter: *filter + + - name: cc-news_luo_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/luo_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/luo_Latn + <<: *output + filter: *filter + + - name: cc-news_lus_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lus_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lus_Latn + <<: *output + filter: *filter + + - name: cc-news_lvs_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lvs_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lvs_Latn + <<: 
*output + filter: *filter + + - name: cc-news_lwg_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lwg_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lwg_Latn + <<: *output + filter: *filter + + - name: cc-news_lzh_Hani + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/lzh_Hani/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/lzh_Hani + <<: *output + filter: *filter + + - name: cc-news_mad_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mad_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mad_Latn + <<: *output + filter: *filter + + - name: cc-news_mag_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mag_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mag_Deva + <<: *output + filter: *filter + + - name: cc-news_mah_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mah_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mah_Latn + <<: *output + filter: *filter + + - name: cc-news_mai_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mai_Deva/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mai_Deva + <<: *output + filter: *filter + + - name: cc-news_mak_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mak_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mak_Latn + <<: *output + filter: *filter + + - name: cc-news_mal_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mal_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mal_Latn + <<: *output + filter: *filter + + - name: cc-news_mal_Mlym + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mal_Mlym/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mal_Mlym + <<: *output + filter: *filter + + - name: cc-news_mar_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mar_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mar_Deva + <<: *output + filter: *filter + + - name: cc-news_mar_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mar_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mar_Latn + <<: *output + filter: *filter + + - name: cc-news_mas_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mas_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mas_Latn + <<: *output + filter: *filter + + - name: cc-news_mbc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mbc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mbc_Latn + <<: *output + filter: *filter + + - name: cc-news_mcp_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mcp_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mcp_Latn + <<: *output + filter: *filter + + - name: cc-news_mdf_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mdf_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mdf_Cyrl + <<: *output + filter: *filter + + - name: cc-news_mer_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mer_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mer_Latn + <<: *output + filter: *filter + + - name: cc-news_mfe_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mfe_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mfe_Latn + <<: *output + filter: *filter + + - name: cc-news_mfy_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mfy_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mfy_Latn + <<: *output + filter: *filter + + - name: cc-news_mhr_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mhr_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mhr_Cyrl + <<: *output + filter: *filter + + - name: cc-news_mhx_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mhx_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mhx_Latn + <<: *output + filter: *filter + + - name: cc-news_min_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/min_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/min_Arab + <<: *output + filter: *filter + + - name: cc-news_min_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/min_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/min_Latn + <<: *output + filter: *filter + + - name: cc-news_mkd_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mkd_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mkd_Cyrl + <<: *output + filter: *filter + + - name: 
cc-news_mkn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mkn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mkn_Latn + <<: *output + filter: *filter + + - name: cc-news_mlt_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mlt_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mlt_Latn + <<: *output + filter: *filter + + - name: cc-news_mnb_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mnb_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mnb_Latn + <<: *output + filter: *filter + + - name: cc-news_mni_Beng + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mni_Beng/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mni_Beng + <<: *output + filter: *filter + + - name: cc-news_mni_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mni_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mni_Latn + <<: *output + filter: *filter + + - name: cc-news_mnk_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mnk_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mnk_Latn + <<: 
*output + filter: *filter + + - name: cc-news_moh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/moh_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/moh_Latn + <<: *output + filter: *filter + + - name: cc-news_mop_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mop_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mop_Latn + <<: *output + filter: *filter + + - name: cc-news_mos_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mos_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mos_Latn + <<: *output + filter: *filter + + - name: cc-news_mqy_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mqy_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mqy_Latn + <<: *output + filter: *filter + + - name: cc-news_mri_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mri_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mri_Latn + <<: *output + filter: *filter + + - name: cc-news_mrj_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mrj_Cyrl/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mrj_Cyrl + <<: *output + filter: *filter + + - name: cc-news_mrw_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mrw_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mrw_Latn + <<: *output + filter: *filter + + - name: cc-news_msb_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/msb_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/msb_Latn + <<: *output + filter: *filter + + - name: cc-news_msm_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/msm_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/msm_Latn + <<: *output + filter: *filter + + - name: cc-news_mui_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mui_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mui_Latn + <<: *output + filter: *filter + + - name: cc-news_mup_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mup_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mup_Deva + <<: *output + filter: *filter + + - name: cc-news_mwl_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mwl_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mwl_Latn + <<: *output + filter: *filter + + - name: cc-news_mww_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mww_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mww_Latn + <<: *output + filter: *filter + + - name: cc-news_mya_Mymr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mya_Mymr/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mya_Mymr + <<: *output + filter: *filter + + - name: cc-news_myv_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/myv_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/myv_Cyrl + <<: *output + filter: *filter + + - name: cc-news_myx_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/myx_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/myx_Latn + <<: *output + filter: *filter + + - name: cc-news_mzn_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/mzn_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/mzn_Arab + <<: *output + filter: *filter + + - name: cc-news_nah_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nah_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nah_Latn + <<: *output + filter: *filter + + - name: cc-news_nan_Hani + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nan_Hani/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nan_Hani + <<: *output + filter: *filter + + - name: cc-news_nan_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nan_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nan_Latn + <<: *output + filter: *filter + + - name: cc-news_nap_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nap_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nap_Latn + <<: *output + filter: *filter + + - name: cc-news_naq_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/naq_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/naq_Latn + <<: *output + filter: *filter + + - name: cc-news_nav_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nav_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nav_Latn + <<: *output + filter: *filter + + - name: 
cc-news_nbl_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nbl_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nbl_Latn + <<: *output + filter: *filter + + - name: cc-news_nbu_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nbu_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nbu_Latn + <<: *output + filter: *filter + + - name: cc-news_ncj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ncj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ncj_Latn + <<: *output + filter: *filter + + - name: cc-news_ncx_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ncx_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ncx_Latn + <<: *output + filter: *filter + + - name: cc-news_ndc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ndc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ndc_Latn + <<: *output + filter: *filter + + - name: cc-news_nde_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nde_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nde_Latn + <<: 
*output + filter: *filter + + - name: cc-news_ndj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ndj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ndj_Latn + <<: *output + filter: *filter + + - name: cc-news_ndo_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ndo_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ndo_Latn + <<: *output + filter: *filter + + - name: cc-news_nds_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nds_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nds_Latn + <<: *output + filter: *filter + + - name: cc-news_new_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/new_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/new_Deva + <<: *output + filter: *filter + + - name: cc-news_nhd_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nhd_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nhd_Latn + <<: *output + filter: *filter + + - name: cc-news_nhe_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nhe_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nhe_Latn + <<: *output + filter: *filter + + - name: cc-news_nia_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nia_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nia_Latn + <<: *output + filter: *filter + + - name: cc-news_njz_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/njz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/njz_Latn + <<: *output + filter: *filter + + - name: cc-news_nki_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nki_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nki_Latn + <<: *output + filter: *filter + + - name: cc-news_nld_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nld_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nld_Latn + <<: *output + filter: *filter + + - name: cc-news_nmz_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nmz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nmz_Latn + <<: *output + filter: *filter + + - name: cc-news_nnb_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nnb_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nnb_Latn + <<: *output + filter: *filter + + - name: cc-news_nno_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nno_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nno_Latn + <<: *output + filter: *filter + + - name: cc-news_nob_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nob_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nob_Latn + <<: *output + filter: *filter + + - name: cc-news_nod_Thai + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nod_Thai/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nod_Thai + <<: *output + filter: *filter + + - name: cc-news_non_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/non_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/non_Latn + <<: *output + filter: *filter + + - name: cc-news_nov_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nov_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nov_Latn + <<: *output + filter: *filter + + - name: cc-news_npi_Deva + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/npi_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/npi_Deva + <<: *output + filter: *filter + + - name: cc-news_npi_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/npi_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/npi_Latn + <<: *output + filter: *filter + + - name: cc-news_npl_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/npl_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/npl_Latn + <<: *output + filter: *filter + + - name: cc-news_nrf_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nrf_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nrf_Latn + <<: *output + filter: *filter + + - name: cc-news_nrm_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nrm_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nrm_Latn + <<: *output + filter: *filter + + - name: cc-news_nso_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nso_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nso_Latn + <<: *output + filter: *filter + + - name: 
cc-news_nsu_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nsu_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nsu_Latn + <<: *output + filter: *filter + + - name: cc-news_nuj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nuj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nuj_Latn + <<: *output + filter: *filter + + - name: cc-news_nus_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nus_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nus_Latn + <<: *output + filter: *filter + + - name: cc-news_nya_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nya_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nya_Latn + <<: *output + filter: *filter + + - name: cc-news_nyf_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nyf_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nyf_Latn + <<: *output + filter: *filter + + - name: cc-news_nyn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nyn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nyn_Latn + <<: 
*output + filter: *filter + + - name: cc-news_nyu_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nyu_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nyu_Latn + <<: *output + filter: *filter + + - name: cc-news_nzi_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/nzi_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/nzi_Latn + <<: *output + filter: *filter + + - name: cc-news_oci_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/oci_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/oci_Latn + <<: *output + filter: *filter + + - name: cc-news_ojb_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ojb_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ojb_Latn + <<: *output + filter: *filter + + - name: cc-news_oke_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/oke_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/oke_Latn + <<: *output + filter: *filter + + - name: cc-news_olo_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/olo_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/olo_Latn + <<: *output + filter: *filter + + - name: cc-news_orv_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/orv_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/orv_Cyrl + <<: *output + filter: *filter + + - name: cc-news_ory_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ory_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ory_Latn + <<: *output + filter: *filter + + - name: cc-news_ory_Orya + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ory_Orya/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ory_Orya + <<: *output + filter: *filter + + - name: cc-news_oss_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/oss_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/oss_Cyrl + <<: *output + filter: *filter + + - name: cc-news_ote_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ote_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ote_Latn + <<: *output + filter: *filter + + - name: cc-news_ots_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ots_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ots_Latn + <<: *output + filter: *filter + + - name: cc-news_otw_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/otw_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/otw_Latn + <<: *output + filter: *filter + + - name: cc-news_pag_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pag_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pag_Latn + <<: *output + filter: *filter + + - name: cc-news_pam_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pam_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pam_Latn + <<: *output + filter: *filter + + - name: cc-news_pan_Guru + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pan_Guru/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pan_Guru + <<: *output + filter: *filter + + - name: cc-news_pan_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pan_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pan_Latn + <<: *output + filter: *filter + + - name: cc-news_pap_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pap_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pap_Latn + <<: *output + filter: *filter + + - name: cc-news_pau_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pau_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pau_Latn + <<: *output + filter: *filter + + - name: cc-news_pbt_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pbt_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pbt_Arab + <<: *output + filter: *filter + + - name: cc-news_pcd_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pcd_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pcd_Latn + <<: *output + filter: *filter + + - name: cc-news_pcm_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pcm_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pcm_Latn + <<: *output + filter: *filter + + - name: cc-news_pdc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pdc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pdc_Latn + <<: *output + filter: *filter + + - name: 
cc-news_pem_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pem_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pem_Latn + <<: *output + filter: *filter + + - name: cc-news_pfl_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pfl_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pfl_Latn + <<: *output + filter: *filter + + - name: cc-news_pis_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pis_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pis_Latn + <<: *output + filter: *filter + + - name: cc-news_pkb_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pkb_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pkb_Latn + <<: *output + filter: *filter + + - name: cc-news_pls_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pls_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pls_Latn + <<: *output + filter: *filter + + - name: cc-news_plt_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/plt_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/plt_Latn + <<: 
*output + filter: *filter + + - name: cc-news_pms_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pms_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pms_Latn + <<: *output + filter: *filter + + - name: cc-news_pnb_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pnb_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pnb_Arab + <<: *output + filter: *filter + + - name: cc-news_pnt_Grek + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pnt_Grek/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pnt_Grek + <<: *output + filter: *filter + + - name: cc-news_pol_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pol_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pol_Latn + <<: *output + filter: *filter + + - name: cc-news_pon_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pon_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pon_Latn + <<: *output + filter: *filter + + - name: cc-news_por_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/por_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/por_Latn + <<: *output + filter: *filter + + - name: cc-news_pui_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pui_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pui_Latn + <<: *output + filter: *filter + + - name: cc-news_pwn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/pwn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/pwn_Latn + <<: *output + filter: *filter + + - name: cc-news_qub_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qub_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qub_Latn + <<: *output + filter: *filter + + - name: cc-news_quc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/quc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/quc_Latn + <<: *output + filter: *filter + + - name: cc-news_quf_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/quf_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/quf_Latn + <<: *output + filter: *filter + + - name: cc-news_quy_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/quy_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/quy_Latn + <<: *output + filter: *filter + + - name: cc-news_quz_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/quz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/quz_Latn + <<: *output + filter: *filter + + - name: cc-news_qve_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qve_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qve_Latn + <<: *output + filter: *filter + + - name: cc-news_qvh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qvh_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qvh_Latn + <<: *output + filter: *filter + + - name: cc-news_qvi_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qvi_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qvi_Latn + <<: *output + filter: *filter + + - name: cc-news_qvo_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qvo_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qvo_Latn + <<: *output + filter: *filter + + - name: cc-news_qvz_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qvz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qvz_Latn + <<: *output + filter: *filter + + - name: cc-news_qwh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qwh_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qwh_Latn + <<: *output + filter: *filter + + - name: cc-news_qxn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qxn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qxn_Latn + <<: *output + filter: *filter + + - name: cc-news_qxo_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qxo_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qxo_Latn + <<: *output + filter: *filter + + - name: cc-news_qxr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/qxr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/qxr_Latn + <<: *output + filter: *filter + + - name: cc-news_rap_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rap_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rap_Latn + <<: *output + filter: *filter + + - name: 
cc-news_rar_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rar_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rar_Latn + <<: *output + filter: *filter + + - name: cc-news_raw_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/raw_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/raw_Latn + <<: *output + filter: *filter + + - name: cc-news_rcf_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rcf_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rcf_Latn + <<: *output + filter: *filter + + - name: cc-news_rhg_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rhg_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rhg_Latn + <<: *output + filter: *filter + + - name: cc-news_rmc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmc_Latn + <<: *output + filter: *filter + + - name: cc-news_rme_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rme_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rme_Latn + <<: 
*output + filter: *filter + + - name: cc-news_rml_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rml_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rml_Latn + <<: *output + filter: *filter + + - name: cc-news_rmn_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmn_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmn_Cyrl + <<: *output + filter: *filter + + - name: cc-news_rmn_Grek + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmn_Grek/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmn_Grek + <<: *output + filter: *filter + + - name: cc-news_rmn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmn_Latn + <<: *output + filter: *filter + + - name: cc-news_rmo_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmo_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmo_Latn + <<: *output + filter: *filter + + - name: cc-news_rmq_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmq_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmq_Latn + <<: *output + filter: *filter + + - name: cc-news_rmy_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmy_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmy_Cyrl + <<: *output + filter: *filter + + - name: cc-news_rmy_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rmy_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rmy_Latn + <<: *output + filter: *filter + + - name: cc-news_rnd_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rnd_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rnd_Latn + <<: *output + filter: *filter + + - name: cc-news_roh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/roh_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/roh_Latn + <<: *output + filter: *filter + + - name: cc-news_ron_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ron_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ron_Cyrl + <<: *output + filter: *filter + + - name: cc-news_ron_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ron_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ron_Latn + <<: *output + filter: *filter + + - name: cc-news_rop_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rop_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rop_Latn + <<: *output + filter: *filter + + - name: cc-news_rue_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rue_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rue_Cyrl + <<: *output + filter: *filter + + - name: cc-news_run_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/run_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/run_Latn + <<: *output + filter: *filter + + - name: cc-news_rus_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/rus_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/rus_Cyrl + <<: *output + filter: *filter + + - name: cc-news_sab_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sab_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sab_Latn + <<: *output + filter: *filter + + - name: cc-news_sag_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sag_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sag_Latn + <<: *output + filter: *filter + + - name: cc-news_sah_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sah_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sah_Cyrl + <<: *output + filter: *filter + + - name: cc-news_san_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/san_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/san_Deva + <<: *output + filter: *filter + + - name: cc-news_san_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/san_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/san_Latn + <<: *output + filter: *filter + + - name: cc-news_sas_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sas_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sas_Latn + <<: *output + filter: *filter + + - name: cc-news_sat_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sat_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sat_Latn + <<: *output + filter: *filter + + - name: 
cc-news_scn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/scn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/scn_Latn + <<: *output + filter: *filter + + - name: cc-news_sco_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sco_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sco_Latn + <<: *output + filter: *filter + + - name: cc-news_sdc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sdc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sdc_Latn + <<: *output + filter: *filter + + - name: cc-news_sdh_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sdh_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sdh_Arab + <<: *output + filter: *filter + + - name: cc-news_seh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/seh_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/seh_Latn + <<: *output + filter: *filter + + - name: cc-news_sgc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sgc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sgc_Latn + <<: 
*output + filter: *filter + + - name: cc-news_sgs_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sgs_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sgs_Latn + <<: *output + filter: *filter + + - name: cc-news_shi_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/shi_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/shi_Latn + <<: *output + filter: *filter + + - name: cc-news_shn_Mymr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/shn_Mymr/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/shn_Mymr + <<: *output + filter: *filter + + - name: cc-news_shu_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/shu_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/shu_Arab + <<: *output + filter: *filter + + - name: cc-news_sin_Sinh + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sin_Sinh/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sin_Sinh + <<: *output + filter: *filter + + - name: cc-news_sju_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sju_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sju_Latn + <<: *output + filter: *filter + + - name: cc-news_skg_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/skg_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/skg_Latn + <<: *output + filter: *filter + + - name: cc-news_skr_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/skr_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/skr_Arab + <<: *output + filter: *filter + + - name: cc-news_slk_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/slk_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/slk_Latn + <<: *output + filter: *filter + + - name: cc-news_slv_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/slv_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/slv_Latn + <<: *output + filter: *filter + + - name: cc-news_sma_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sma_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sma_Latn + <<: *output + filter: *filter + + - name: cc-news_sme_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sme_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sme_Latn + <<: *output + filter: *filter + + - name: cc-news_smj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/smj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/smj_Latn + <<: *output + filter: *filter + + - name: cc-news_smn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/smn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/smn_Latn + <<: *output + filter: *filter + + - name: cc-news_smo_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/smo_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/smo_Latn + <<: *output + filter: *filter + + - name: cc-news_sms_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sms_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sms_Latn + <<: *output + filter: *filter + + - name: cc-news_sna_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sna_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sna_Latn + <<: *output + filter: *filter + + - name: cc-news_snd_Arab + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/snd_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/snd_Arab + <<: *output + filter: *filter + + - name: cc-news_snd_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/snd_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/snd_Latn + <<: *output + filter: *filter + + - name: cc-news_som_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/som_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/som_Latn + <<: *output + filter: *filter + + - name: cc-news_sot_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sot_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sot_Latn + <<: *output + filter: *filter + + - name: cc-news_spa_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/spa_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/spa_Latn + <<: *output + filter: *filter + + - name: cc-news_srd_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/srd_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/srd_Latn + <<: *output + filter: *filter + + - name: 
cc-news_srn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/srn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/srn_Latn + <<: *output + filter: *filter + + - name: cc-news_srp_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/srp_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/srp_Cyrl + <<: *output + filter: *filter + + - name: cc-news_srp_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/srp_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/srp_Latn + <<: *output + filter: *filter + + - name: cc-news_ssw_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ssw_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ssw_Latn + <<: *output + filter: *filter + + - name: cc-news_stq_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/stq_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/stq_Latn + <<: *output + filter: *filter + + - name: cc-news_sun_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/sun_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/sun_Latn + <<: 
*output + filter: *filter + + - name: cc-news_swc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/swc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/swc_Latn + <<: *output + filter: *filter + + - name: cc-news_swe_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/swe_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/swe_Latn + <<: *output + filter: *filter + + - name: cc-news_swg_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/swg_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/swg_Latn + <<: *output + filter: *filter + + - name: cc-news_swh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/swh_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/swh_Latn + <<: *output + filter: *filter + + - name: cc-news_syc_Syrc + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/syc_Syrc/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/syc_Syrc + <<: *output + filter: *filter + + - name: cc-news_syl_Beng + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/syl_Beng/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/syl_Beng + <<: *output + filter: *filter + + - name: cc-news_syl_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/syl_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/syl_Latn + <<: *output + filter: *filter + + - name: cc-news_szl_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/szl_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/szl_Latn + <<: *output + filter: *filter + + - name: cc-news_tah_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tah_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tah_Latn + <<: *output + filter: *filter + + - name: cc-news_tam_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tam_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tam_Latn + <<: *output + filter: *filter + + - name: cc-news_tam_Taml + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tam_Taml/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tam_Taml + <<: *output + filter: *filter + + - name: cc-news_taq_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/taq_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/taq_Latn + <<: *output + filter: *filter + + - name: cc-news_tat_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tat_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tat_Cyrl + <<: *output + filter: *filter + + - name: cc-news_tat_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tat_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tat_Latn + <<: *output + filter: *filter + + - name: cc-news_tay_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tay_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tay_Latn + <<: *output + filter: *filter + + - name: cc-news_tcy_Knda + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tcy_Knda/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tcy_Knda + <<: *output + filter: *filter + + - name: cc-news_tcz_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tcz_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tcz_Latn + <<: *output + filter: *filter + + - name: cc-news_tdt_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tdt_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tdt_Latn + <<: *output + filter: *filter + + - name: cc-news_tdx_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tdx_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tdx_Latn + <<: *output + filter: *filter + + - name: cc-news_tel_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tel_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tel_Latn + <<: *output + filter: *filter + + - name: cc-news_tel_Telu + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tel_Telu/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tel_Telu + <<: *output + filter: *filter + + - name: cc-news_teo_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/teo_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/teo_Latn + <<: *output + filter: *filter + + - name: cc-news_tfr_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tfr_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tfr_Latn + <<: *output + filter: *filter + + - name: 
cc-news_tgk_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tgk_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tgk_Cyrl + <<: *output + filter: *filter + + - name: cc-news_tha_Thai + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tha_Thai/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tha_Thai + <<: *output + filter: *filter + + - name: cc-news_thk_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/thk_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/thk_Latn + <<: *output + filter: *filter + + - name: cc-news_thl_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/thl_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/thl_Deva + <<: *output + filter: *filter + + - name: cc-news_tig_Ethi + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tig_Ethi/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tig_Ethi + <<: *output + filter: *filter + + - name: cc-news_tih_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tih_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tih_Latn + <<: 
*output + filter: *filter + + - name: cc-news_tir_Ethi + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tir_Ethi/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tir_Ethi + <<: *output + filter: *filter + + - name: cc-news_tiv_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tiv_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tiv_Latn + <<: *output + filter: *filter + + - name: cc-news_tlh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tlh_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tlh_Latn + <<: *output + filter: *filter + + - name: cc-news_tll_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tll_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tll_Latn + <<: *output + filter: *filter + + - name: cc-news_tly_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tly_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tly_Latn + <<: *output + filter: *filter + + - name: cc-news_tmc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tmc_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tmc_Latn + <<: *output + filter: *filter + + - name: cc-news_tob_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tob_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tob_Latn + <<: *output + filter: *filter + + - name: cc-news_toi_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/toi_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/toi_Latn + <<: *output + filter: *filter + + - name: cc-news_toj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/toj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/toj_Latn + <<: *output + filter: *filter + + - name: cc-news_ton_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ton_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ton_Latn + <<: *output + filter: *filter + + - name: cc-news_tpi_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tpi_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tpi_Latn + <<: *output + filter: *filter + + - name: cc-news_trv_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/trv_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/trv_Latn + <<: *output + filter: *filter + + - name: cc-news_tsg_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tsg_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tsg_Latn + <<: *output + filter: *filter + + - name: cc-news_tsn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tsn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tsn_Latn + <<: *output + filter: *filter + + - name: cc-news_tso_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tso_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tso_Latn + <<: *output + filter: *filter + + - name: cc-news_tuc_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tuc_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tuc_Latn + <<: *output + filter: *filter + + - name: cc-news_tuk_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tuk_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tuk_Cyrl + <<: *output + filter: *filter + + - name: cc-news_tuk_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tuk_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tuk_Latn + <<: *output + filter: *filter + + - name: cc-news_tum_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tum_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tum_Latn + <<: *output + filter: *filter + + - name: cc-news_tur_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tur_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tur_Latn + <<: *output + filter: *filter + + - name: cc-news_twb_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/twb_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/twb_Latn + <<: *output + filter: *filter + + - name: cc-news_twi_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/twi_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/twi_Latn + <<: *output + filter: *filter + + - name: cc-news_twx_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/twx_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/twx_Latn + <<: *output + filter: *filter + + - name: 
cc-news_tyv_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tyv_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tyv_Cyrl + <<: *output + filter: *filter + + - name: cc-news_tzh_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tzh_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tzh_Latn + <<: *output + filter: *filter + + - name: cc-news_tzj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tzj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tzj_Latn + <<: *output + filter: *filter + + - name: cc-news_tzm_Tfng + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/tzm_Tfng/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/tzm_Tfng + <<: *output + filter: *filter + + - name: cc-news_ubu_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ubu_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ubu_Latn + <<: *output + filter: *filter + + - name: cc-news_udm_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/udm_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/udm_Cyrl + <<: 
*output + filter: *filter + + - name: cc-news_uig_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/uig_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/uig_Arab + <<: *output + filter: *filter + + - name: cc-news_uig_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/uig_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/uig_Latn + <<: *output + filter: *filter + + - name: cc-news_ukr_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ukr_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ukr_Cyrl + <<: *output + filter: *filter + + - name: cc-news_umb_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/umb_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/umb_Latn + <<: *output + filter: *filter + + - name: cc-news_und_Ahom + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Ahom/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Ahom + <<: *output + filter: *filter + + - name: cc-news_und_Armn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Armn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Armn + <<: *output + filter: *filter + + - name: cc-news_und_Bamu + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Bamu/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Bamu + <<: *output + filter: *filter + + - name: cc-news_und_Beng + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Beng/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Beng + <<: *output + filter: *filter + + - name: cc-news_und_Brah + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Brah/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Brah + <<: *output + filter: *filter + + - name: cc-news_und_Brai + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Brai/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Brai + <<: *output + filter: *filter + + - name: cc-news_und_Cakm + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Cakm/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Cakm + <<: *output + filter: *filter + + - name: cc-news_und_Cans + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Cans/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Cans + <<: *output + filter: *filter + + - name: cc-news_und_Copt + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Copt/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Copt + <<: *output + filter: *filter + + - name: cc-news_und_Cpmn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Cpmn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Cpmn + <<: *output + filter: *filter + + - name: cc-news_und_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Cyrl + <<: *output + filter: *filter + + - name: cc-news_und_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Deva + <<: *output + filter: *filter + + - name: cc-news_und_Diak + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Diak/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Diak + <<: *output + filter: *filter + + - name: cc-news_und_Dupl + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Dupl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Dupl + <<: *output + filter: *filter + + - name: cc-news_und_Egyp + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Egyp/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Egyp + <<: *output + filter: *filter + + - name: cc-news_und_Ethi + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Ethi/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Ethi + <<: *output + filter: *filter + + - name: cc-news_und_Glag + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Glag/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Glag + <<: *output + filter: *filter + + - name: cc-news_und_Grek + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Grek/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Grek + <<: *output + filter: *filter + + - name: cc-news_und_Hebr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Hebr/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Hebr + <<: *output + filter: *filter + + - name: 
cc-news_und_Hira + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Hira/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Hira + <<: *output + filter: *filter + + - name: cc-news_und_Hluw + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Hluw/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Hluw + <<: *output + filter: *filter + + - name: cc-news_und_Hmng + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Hmng/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Hmng + <<: *output + filter: *filter + + - name: cc-news_und_Hung + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Hung/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Hung + <<: *output + filter: *filter + + - name: cc-news_und_Java + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Java/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Java + <<: *output + filter: *filter + + - name: cc-news_und_Kana + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Kana/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Kana + <<: 
*output + filter: *filter + + - name: cc-news_und_Khmr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Khmr/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Khmr + <<: *output + filter: *filter + + - name: cc-news_und_Khoj + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Khoj/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Khoj + <<: *output + filter: *filter + + - name: cc-news_und_Kits + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Kits/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Kits + <<: *output + filter: *filter + + - name: cc-news_und_Laoo + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Laoo/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Laoo + <<: *output + filter: *filter + + - name: cc-news_und_Limb + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Limb/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Limb + <<: *output + filter: *filter + + - name: cc-news_und_Lina + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Lina/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Lina + <<: *output + filter: *filter + + - name: cc-news_und_Linb + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Linb/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Linb + <<: *output + filter: *filter + + - name: cc-news_und_Lisu + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Lisu/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Lisu + <<: *output + filter: *filter + + - name: cc-news_und_Marc + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Marc/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Marc + <<: *output + filter: *filter + + - name: cc-news_und_Mult + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Mult/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Mult + <<: *output + filter: *filter + + - name: cc-news_und_Mymr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Mymr/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Mymr + <<: *output + filter: *filter + + - name: cc-news_und_Nshu + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Nshu/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Nshu + <<: *output + filter: *filter + + - name: cc-news_und_Orya + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Orya/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Orya + <<: *output + filter: *filter + + - name: cc-news_und_Rohg + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Rohg/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Rohg + <<: *output + filter: *filter + + - name: cc-news_und_Runr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Runr/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Runr + <<: *output + filter: *filter + + - name: cc-news_und_Saur + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Saur/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Saur + <<: *output + filter: *filter + + - name: cc-news_und_Sgnw + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Sgnw/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Sgnw + <<: *output + filter: *filter + + - name: cc-news_und_Sinh + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Sinh/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Sinh + <<: *output + filter: *filter + + - name: cc-news_und_Takr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Takr/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Takr + <<: *output + filter: *filter + + - name: cc-news_und_Tang + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Tang/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Tang + <<: *output + filter: *filter + + - name: cc-news_und_Thai + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Thai/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Thai + <<: *output + filter: *filter + + - name: cc-news_und_Tibt + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Tibt/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Tibt + <<: *output + filter: *filter + + - name: cc-news_und_Tnsa + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Tnsa/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Tnsa + <<: *output + filter: *filter + + - name: 
cc-news_und_Vaii + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Vaii/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Vaii + <<: *output + filter: *filter + + - name: cc-news_und_Vith + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Vith/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Vith + <<: *output + filter: *filter + + - name: cc-news_und_Xsux + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Xsux/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Xsux + <<: *output + filter: *filter + + - name: cc-news_und_Yiii + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/und_Yiii/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/und_Yiii + <<: *output + filter: *filter + + - name: cc-news_urd_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/urd_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/urd_Arab + <<: *output + filter: *filter + + - name: cc-news_urd_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/urd_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/urd_Latn + <<: 
*output + filter: *filter + + - name: cc-news_uri_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/uri_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/uri_Latn + <<: *output + filter: *filter + + - name: cc-news_uzn_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/uzn_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/uzn_Cyrl + <<: *output + filter: *filter + + - name: cc-news_uzn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/uzn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/uzn_Latn + <<: *output + filter: *filter + + - name: cc-news_uzs_Arab + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/uzs_Arab/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/uzs_Arab + <<: *output + filter: *filter + + - name: cc-news_vec_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vec_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vec_Latn + <<: *output + filter: *filter + + - name: cc-news_ven_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ven_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ven_Latn + <<: *output + filter: *filter + + - name: cc-news_vep_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vep_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vep_Latn + <<: *output + filter: *filter + + - name: cc-news_vid_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vid_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vid_Latn + <<: *output + filter: *filter + + - name: cc-news_vie_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vie_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vie_Latn + <<: *output + filter: *filter + + - name: cc-news_vls_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vls_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vls_Latn + <<: *output + filter: *filter + + - name: cc-news_vmw_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vmw_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vmw_Latn + <<: *output + filter: *filter + + - name: cc-news_vro_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/vro_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/vro_Latn + <<: *output + filter: *filter + + - name: cc-news_war_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/war_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/war_Latn + <<: *output + filter: *filter + + - name: cc-news_wbm_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wbm_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wbm_Latn + <<: *output + filter: *filter + + - name: cc-news_wbp_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wbp_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wbp_Latn + <<: *output + filter: *filter + + - name: cc-news_wed_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wed_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wed_Latn + <<: *output + filter: *filter + + - name: cc-news_wes_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wes_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wes_Latn + <<: *output + filter: *filter + + - name: cc-news_wln_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wln_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wln_Latn + <<: *output + filter: *filter + + - name: cc-news_wls_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wls_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wls_Latn + <<: *output + filter: *filter + + - name: cc-news_wol_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wol_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wol_Latn + <<: *output + filter: *filter + + - name: cc-news_wrs_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wrs_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wrs_Latn + <<: *output + filter: *filter + + - name: cc-news_wsg_Telu + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wsg_Telu/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wsg_Telu + <<: *output + filter: *filter + + - name: cc-news_wuu_Hani + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wuu_Hani/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wuu_Hani + <<: *output + filter: *filter + + - name: 
cc-news_wuv_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/wuv_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/wuv_Latn + <<: *output + filter: *filter + + - name: cc-news_xav_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xav_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xav_Latn + <<: *output + filter: *filter + + - name: cc-news_xho_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xho_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xho_Latn + <<: *output + filter: *filter + + - name: cc-news_xla_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xla_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xla_Latn + <<: *output + filter: *filter + + - name: cc-news_xmf_Geor + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xmf_Geor/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xmf_Geor + <<: *output + filter: *filter + + - name: cc-news_xmm_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xmm_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xmm_Latn + <<: 
*output + filter: *filter + + - name: cc-news_xnn_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xnn_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xnn_Latn + <<: *output + filter: *filter + + - name: cc-news_xsr_Deva + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xsr_Deva/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xsr_Deva + <<: *output + filter: *filter + + - name: cc-news_xum_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/xum_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/xum_Latn + <<: *output + filter: *filter + + - name: cc-news_ydd_Hebr + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/ydd_Hebr/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/ydd_Hebr + <<: *output + filter: *filter + + - name: cc-news_yml_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/yml_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/yml_Latn + <<: *output + filter: *filter + + - name: cc-news_yor_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/yor_Latn/*.json.gz + attributes: *attributes + output: + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/yor_Latn + <<: *output + filter: *filter + + - name: cc-news_yrk_Cyrl + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/yrk_Cyrl/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/yrk_Cyrl + <<: *output + filter: *filter + + - name: cc-news_yua_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/yua_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/yua_Latn + <<: *output + filter: *filter + + - name: cc-news_yue_Hani + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/yue_Hani/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/yue_Hani + <<: *output + filter: *filter + + - name: cc-news_zab_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zab_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zab_Latn + <<: *output + filter: *filter + + - name: cc-news_zai_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zai_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zai_Latn + <<: *output + filter: *filter + + - name: cc-news_zas_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zas_Latn/*.json.gz + 
attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zas_Latn + <<: *output + filter: *filter + + - name: cc-news_zdj_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zdj_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zdj_Latn + <<: *output + filter: *filter + + - name: cc-news_zea_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zea_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zea_Latn + <<: *output + filter: *filter + + - name: cc-news_zgh_Tfng + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zgh_Tfng/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zgh_Tfng + <<: *output + filter: *filter + + - name: cc-news_zne_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zne_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zne_Latn + <<: *output + filter: *filter + + - name: cc-news_zom_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zom_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zom_Latn + <<: *output + filter: *filter + + - name: cc-news_zsm_Latn + documents: + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zsm_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zsm_Latn + <<: *output + filter: *filter + + - name: cc-news_zul_Latn + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-year_dedup-lang/documents/zul_Latn/*.json.gz + attributes: *attributes + output: + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v3-resiliparse-lang_dedup/documents/zul_Latn + <<: *output + filter: *filter + +work_dir: + input: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v3-resiliparse-lang_dedup/input + output: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v3-resiliparse-lang_dedup/output + +processes: 188 diff --git a/configs/peteish-anneal/README.md b/configs/peteish-anneal/README.md new file mode 100644 index 00000000..e69de29b diff --git a/configs/peteish-anneal/digits.sh b/configs/peteish-anneal/digits.sh new file mode 100644 index 00000000..bd020169 --- /dev/null +++ b/configs/peteish-anneal/digits.sh @@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+#
+# Tokenize each dolmino-mix-1124 collection with the dolma2 sigdig tokenizer.
+#
+# For every glob in "collections", the output name is the literal part of the
+# path between "/documents/" and the first "*"; "dolma tokens" then writes to
+# ${HOME}/ai2-llm/preprocessed/dolmino-mix-1124/<tokenizer>/<name>.
+#
+# Requires bash >= 4 (mapfile), nproc and the "dolma" CLI on PATH.
+
+set -euo pipefail
+
+collections=(
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/dclm/*/*.json.zst"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/flan/*.json.gz"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/codesearchnet-owmfilter/*/*.jsonl.gz"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/basic_math/*TRAIN.jsonl"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/gsm8k-synth/resample_v1_6x/*.jsonl"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/gsm_mind/*/*.jsonl"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/gsm8k/*/train/*.jsonl.zst"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/mathcoder2-synthmath/ajibawa-2023/*.jsonl"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/mathcoder2-synthmath/m-a-p_Matrix/*/*.jsonl"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/metamath-owmfilter/*.jsonl.gz"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/tinyGSM-MIND/*/*.jsonl.gz"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/tulu_math/*/*.jsonl"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/pes2o/*.json.gz"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/stackexchange/*.json.gz"
+  "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/wiki/*.json.gz"
+)
+readonly tokenizer="allenai/dolma2-tokenizer-sigdig"
+readonly destination_root="${HOME}/ai2-llm/preprocessed/dolmino-mix-1124/${tokenizer}"
+
+for path in "${collections[@]}"; do
+  # Literal segment between "/documents/" and the first "*",
+  # minus one leading and one trailing slash (e.g. "math/gsm8k").
+  name=${path##*/documents/}
+  name=${name%%"*"*}
+  name=${name#/}
+  name=${name%/}
+  destination="${destination_root}/${name}"
+
+  # Count glob matches without parsing `ls`; an unmatched pattern gives 0.
+  mapfile -t matches < <(compgen -G "${path}")
+
+  echo "Tokenizing $path to $destination"
+  echo "Number of files: ${#matches[@]}"
+
+  if [[ "${name}" == *"dclm"* ]]; then
+    # All cores but four, clamped so small machines never pass 0 or less.
+    processes=$(( $(nproc) - 4 ))
+    (( processes >= 1 )) || processes=1
+  else
+    processes=20
+  fi
+
+  set -x  # trace the exact dolma invocation
+  dolma tokens \
+    --documents "${path}" \
+    --destination "${destination}" \
+    --tokenizer.name_or_path "${tokenizer}" \
+    --tokenizer.eos_token_id 100257 \
+    --tokenizer.pad_token_id 100277 \
+    --no-tokenizer.segment_before_tokenization \
+    --tokenizer.encode_special_tokens \
+    --processes "${processes}" \
+    --seed 3920 \
+    --max_size 1073741824 \
+    --sample_ring_prop \
+    --dtype uint32
+  set +x
+done
diff --git a/configs/peteish-anneal/digits_mix.yaml b/configs/peteish-anneal/digits_mix.yaml new file mode 100644 index 00000000..2512549f --- /dev/null +++ b/configs/peteish-anneal/digits_mix.yaml @@ -0,0 +1,44 @@ +target_size: 200G + +sources: + - source:
s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/dclm/*.npy + mix_percent: 0.5 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/pes2o/*.npy + mix_percent: 0.0585 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/flan/*.npy + mix_percent: 0.1660 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/codesearchnet-owmfilter/*.npy + sample_percent: 1.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/dolmino_math_synth/basic_math/*.npy + sample_percent: 1.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/dolmino_math_synth/gsm_mind/*.npy + sample_percent: 1.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/dolmino_math_synth/gsm8k-synth/resample_v1_6x/*.npy + sample_percent: 1.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/gsm8k/*.npy + sample_percent: 1.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/mathcoder2-synthmath/*/*.npy + sample_percent: 1.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/metamath-owmfilter/*.npy + sample_percent: 1.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/tinyGSM-MIND/*.npy + sample_percent: 1.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/math/tulu_math/*.npy + sample_percent: 1.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/stackexchange/*.npy + sample_percent: 1.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/dolma2-tokenizer-sigdig/wiki/*.npy + sample_percent: 1.0 diff --git a/configs/peteish-anneal/fw2-dedupe/mix.yaml 
b/configs/peteish-anneal/fw2-dedupe/mix.yaml new file mode 100644 index 00000000..e6e30f17 --- /dev/null +++ b/configs/peteish-anneal/fw2-dedupe/mix.yaml @@ -0,0 +1,268 @@ +streams: + - name: dclm + documents: + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0000/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0001/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0002/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0003/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0004/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0005/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0006/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0007/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0008/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0009/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0010/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0011/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0012/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0013/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0014/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0015/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0016/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0017/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0018/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0019/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0020/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0021/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0022/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0023/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0024/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0025/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0026/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0027/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0028/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0029/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0030/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0031/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0032/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0033/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0034/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0035/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0036/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0037/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0038/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0039/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0040/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0041/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0042/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0043/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0044/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0045/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0046/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0047/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0048/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0049/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0050/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0051/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0052/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0053/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0054/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0055/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0056/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0057/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0058/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0059/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0060/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0061/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0062/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0063/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0064/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0065/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0066/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0067/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0068/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0069/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0070/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0071/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0072/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0073/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0074/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0075/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0076/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0077/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0078/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0079/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0080/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0081/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0082/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0083/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0084/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0085/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0086/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0087/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0088/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0089/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0090/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0091/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0092/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0093/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0094/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0095/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0096/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0097/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0098/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0099/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0100/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0101/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0102/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0103/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0104/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0105/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0106/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0107/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0108/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0109/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0110/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0111/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0112/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0113/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0114/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0115/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0116/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0117/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0118/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0119/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0120/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0121/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0122/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0123/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0124/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0125/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0126/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0127/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0128/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0129/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0130/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0131/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0132/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0133/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0134/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0135/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0136/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0137/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0138/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0139/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0140/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0141/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0142/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0143/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0144/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0145/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0146/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0147/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0148/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0149/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0150/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0151/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0152/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0153/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0154/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0155/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0156/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0157/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0158/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0159/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0160/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0161/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0162/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0163/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0164/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0165/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0166/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0167/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0168/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0169/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0170/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0171/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0172/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0173/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0174/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0175/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0176/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0177/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0178/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0179/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0180/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0181/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0182/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0183/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0184/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0185/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0186/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0187/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0188/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0189/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0190/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0191/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0192/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0193/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0194/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0195/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0196/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0197/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0198/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0199/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0200/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0201/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0202/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0203/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0204/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0205/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0206/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0207/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0208/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0209/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0210/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0211/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0212/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0213/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0214/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0215/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0216/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0217/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0218/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0219/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0220/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0221/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0222/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0223/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0224/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0225/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0226/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0227/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0228/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0229/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0230/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0231/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0232/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0233/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0234/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0235/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0236/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0237/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0238/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0239/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0240/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0241/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0242/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0243/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0244/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0245/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0246/*zst + attributes: + - dedupe_para_ngrams_13_1 + output: + max_size_in_bytes: 3_814_697_265 + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2_dedupe/documents + filter: + include: + - >- + (.attributes.dedupe_para_ngrams_13_1 | length == 0) or + 
((.attributes.dedupe_para_ngrams_13_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) <= 0.3) + + syntax: jq + +work_dir: + input: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v0_rep32_ft7percentile_fw2_dedupe/input + output: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v0_rep32_ft7percentile_fw2_dedupe/output + +processes: 188 diff --git a/configs/peteish-anneal/fw2-dedupe/part1.yaml b/configs/peteish-anneal/fw2-dedupe/part1.yaml new file mode 100644 index 00000000..72b86afb --- /dev/null +++ b/configs/peteish-anneal/fw2-dedupe/part1.yaml @@ -0,0 +1,145 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0000/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0001/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0002/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0003/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0004/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0005/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0006/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0007/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0008/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0009/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0010/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0011/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0012/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0013/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0014/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0015/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0016/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0017/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0018/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0019/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0020/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0021/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0022/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0023/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0024/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0025/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0026/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0027/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0028/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0029/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0030/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0031/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0032/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0033/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0034/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0035/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0036/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0037/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0038/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0039/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0040/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0041/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0042/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0043/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0044/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0045/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0046/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0047/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0048/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0049/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0050/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0051/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0052/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0053/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0054/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0055/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0056/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0057/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0058/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0059/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0060/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0061/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0062/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0063/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0064/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0065/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0066/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0067/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0068/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0069/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0070/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0071/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0072/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0073/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0074/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0075/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0076/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0077/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0078/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0079/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0080/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0081/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0082/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0083/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0084/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0085/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0086/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0087/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0088/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0089/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0090/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0091/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0092/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0093/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0094/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0095/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0096/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0097/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0098/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0099/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0100/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0101/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0102/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0103/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0104/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0105/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0106/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0107/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0108/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0109/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0110/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0111/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0112/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0113/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0114/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0115/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0116/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0117/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0118/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0119/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0120/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0121/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0122/*zst + + +dedupe: + name: dedupe_para_ngrams_13_1 + paragraphs: + attribute_name: dedupe_para_ngrams_13_1 + by_ngram: + ngram_length: 13 + stride: 1 + overlap_threshold: 0.5 + skip_short_paragraphs: true + skip_empty: true + +bloom_filter: + file: ${oc.env:HOME}/bloomp/fw2-part1.bin + read_only: false + # set to the number of words + estimated_doc_count: 300_711_504_079 + desired_false_positive_rate: 0.1 + +processes: 16 diff --git a/configs/peteish-anneal/fw2-dedupe/part2.yaml b/configs/peteish-anneal/fw2-dedupe/part2.yaml new file mode 100644 index 00000000..3b393b77 --- /dev/null +++ b/configs/peteish-anneal/fw2-dedupe/part2.yaml @@ -0,0 +1,146 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0123/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0124/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0125/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0126/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0127/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0128/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0129/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0130/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0131/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0132/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0133/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0134/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0135/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0136/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0137/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0138/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0139/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0140/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0141/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0142/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0143/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0144/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0145/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0146/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0147/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0148/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0149/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0150/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0151/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0152/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0153/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0154/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0155/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0156/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0157/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0158/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0159/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0160/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0161/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0162/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0163/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0164/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0165/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0166/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0167/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0168/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0169/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0170/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0171/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0172/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0173/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0174/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0175/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0176/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0177/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0178/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0179/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0180/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0181/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0182/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0183/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0184/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0185/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0186/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0187/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0188/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0189/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0190/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0191/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0192/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0193/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0194/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0195/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0196/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0197/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0198/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0199/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0200/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0201/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0202/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0203/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0204/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0205/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0206/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0207/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0208/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0209/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0210/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0211/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0212/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0213/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0214/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0215/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0216/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0217/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0218/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0219/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0220/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0221/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0222/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0223/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0224/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0225/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0226/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0227/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0228/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0229/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0230/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0231/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0232/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0233/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0234/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0235/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0236/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0237/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0238/*zst + - 
s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0239/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0240/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0241/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0242/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0243/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0244/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0245/*zst + - s3://ai2-llm/pretraining-data/sources/dclm/v0_rep32_ft7percentile_fw2/documents/0246/*zst + + +dedupe: + name: dedupe_para_ngrams_13_1 + paragraphs: + attribute_name: dedupe_para_ngrams_13_1 + by_ngram: + ngram_length: 13 + stride: 1 + overlap_threshold: 0.5 + skip_short_paragraphs: true + skip_empty: true + +bloom_filter: + file: ${oc.env:HOME}/bloomp/fw2-part2.bin + read_only: false + # set to the number of words + estimated_doc_count: 300_711_504_079 + desired_false_positive_rate: 0.1 + +processes: 16 diff --git a/configs/peteish-anneal/mix-fw25.yaml b/configs/peteish-anneal/mix-fw25.yaml new file mode 100644 index 00000000..a46bbd17 --- /dev/null +++ b/configs/peteish-anneal/mix-fw25.yaml @@ -0,0 +1,47 @@ +# HuggingFaceFW_fineweb_edu_classifier/score +# ┏━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┓ +# ┃ value ┃ dist ┃ count ┃ +# ┡━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━┩ +# │ [-1.0, 0.5) │ 0.0556 │ 6,120,032 │ +# │ [0.5, 0.8) │ 0.1115 │ 12,283,459 │ +# │ [0.8, 1.0) │ 0.1117 │ 12,305,723 │ +# │ [1.0, 1.2) │ 0.1123 │ 12,367,897 │ +# │ [1.2, 1.4) │ 0.1110 │ 12,220,008 │ +# │ [1.4, 1.7) │ 0.1098 │ 12,094,336 │ +# │ [1.7, 2.0) │ 0.1106 │ 12,180,628 │ +# │ [2.0, 2.4) │ 0.1109 │ 12,216,375 │ +# │ [2.4, 3.1) │ 0.1113 │ 12,262,622 │ +# │ [3.1, 5.4] │ 0.0553 │ 6,088,265 │ +# └───────────────┴──────────┴─────────────┘ + +streams: + - name:
dclm + documents: + - s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/full/*.zstd + attributes: + - random_number_v1 + - HuggingFaceFW_fineweb_edu_classifier + output: + max_size_in_bytes: 3_814_697_265 + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v1_fwEdu25/documents/full + discard_fields: + - attributes + + compression: + input: zst + output: zst + + filter: + include: + # Keep documents whose blend of FineWeb-Edu score (weight 0.8) and scaled random number (weight 0.2) is >= 2 + - >- + (.attributes.HuggingFaceFW_fineweb_edu_classifier[0][-1] * 0.8) + + (.attributes.random_number_v1__random_number_v1__random[0][-1] * 5 * 0.2) >= 2 + syntax: jq + span_replacement: [] + +work_dir: + input: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v1_fwEdu25/input + output: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v1_fwEdu25/output + +processes: 188 diff --git a/configs/peteish-anneal/mix-nvidia25.yaml b/configs/peteish-anneal/mix-nvidia25.yaml new file mode 100644 index 00000000..88be94a6 --- /dev/null +++ b/configs/peteish-anneal/mix-nvidia25.yaml @@ -0,0 +1,83 @@ +# nvidia_quality_classifier_deberta_Low/score +# ┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┓ +# ┃ value ┃ dist ┃ count ┃ +# ┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━┩ +# │ [0.000, 0.001) │ 0.0935 │ 10,302,917 │ +# │ [0.001, 0.002) │ 0.2006 │ 22,092,046 │ +# │ [0.002, 0.003) │ 0.1299 │ 14,307,537 │ +# │ [0.003, 0.004) │ 0.0756 │ 8,326,514 │ +# │ [0.004, 0.006) │ 0.0847 │ 9,329,927 │ +# │ [0.006, 0.011) │ 0.0887 │ 9,771,702 │ +# │ [0.011, 0.031) │ 0.0860 │ 9,471,766 │ +# │ [0.031, 0.215) │ 0.0955 │ 10,520,370 │ +# │ [0.215, 0.938) │ 0.0956 │ 10,530,852 │ +# │ [0.938, 0.993] │ 0.0498 │ 5,485,714 │ +# └──────────────────┴─────────┴────────────┘ +# nvidia_quality_classifier_deberta_Medium/score +# ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┓ +# ┃ value ┃ dist ┃ count ┃ +# ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━┩ +# │ [0.004, 0.069) │ 0.0558 │ 6,141,162 │ +# │ [0.069, 0.244) │ 0.1110 │ 12,222,713 │ +# │ [0.244, 0.462) │ 0.1113 │ 12,254,640 │ +# │ [0.462, 0.700) │ 0.1110 │ 12,229,105 │ +#
│ [0.700, 0.857) │ 0.1112 │ 12,247,913 │ +# │ [0.857, 0.932) │ 0.1120 │ 12,331,155 │ +# │ [0.932, 0.966) │ 0.1127 │ 12,411,860 │ +# │ [0.966, 0.982) │ 0.1104 │ 12,155,159 │ +# │ [0.982, 0.991) │ 0.1194 │ 13,147,942 │ +# │ [0.991, 0.994] │ 0.0454 │ 4,997,696 │ +# └───────────────────┴──────────┴─────────────┘ +# nvidia_quality_classifier_deberta_High/score +# ┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━┓ +# ┃ value ┃ dist ┃ count ┃ +# ┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━┩ +# │ [0.000, 0.001) │ 0.1535 │ 16,910,071 │ +# │ [0.001, 0.002) │ 0.0819 │ 9,021,261 │ +# │ [0.002, 0.004) │ 0.0869 │ 9,570,058 │ +# │ [0.004, 0.010) │ 0.0959 │ 10,562,952 │ +# │ [0.010, 0.031) │ 0.1079 │ 11,883,866 │ +# │ [0.031, 0.095) │ 0.1038 │ 11,429,509 │ +# │ [0.095, 0.274) │ 0.1057 │ 11,646,421 │ +# │ [0.274, 0.572) │ 0.1060 │ 11,670,182 │ +# │ [0.572, 0.813) │ 0.1058 │ 11,652,356 │ +# │ [0.813, 0.975] │ 0.0526 │ 5,792,669 │ +# └──────────────────┴─────────┴─────────────┘ + +streams: + - name: dclm + documents: + - s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/full/*.zstd + attributes: + - random_number_v1 + - nvidia_quality_classifier_deberta + output: + max_size_in_bytes: 3_814_697_265 + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v1_nvidia25/documents/full + discard_fields: + - attributes + + compression: + input: zst + output: zst + + filter: + include: + # Each expression blends one classifier score (High/Medium/Low) with a random number and applies a threshold + - >- + (.attributes.nvidia_quality_classifier_deberta_High[0][-1] * 0.9) + + (.attributes.random_number_v1__random_number_v1__random[0][-1] * 0.1) >= 0.274 + - >- + (.attributes.nvidia_quality_classifier_deberta_Medium[0][-1] * 0.5) + + (.attributes.random_number_v1__random_number_v1__random[0][-1] * 0.5) >= 0.932 + - >- + (.attributes.nvidia_quality_classifier_deberta_Low[0][-1] * 0.2) + + (.attributes.random_number_v1__random_number_v1__random[0][-1] * 0.8) >= 0.938 + syntax: jq + span_replacement: [] + +work_dir: + input: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v1_nvidia25/input +
output: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v1_nvidia25/output + +processes: 188 diff --git a/configs/peteish-anneal/mmlu-web/decontaminate.sh b/configs/peteish-anneal/mmlu-web/decontaminate.sh new file mode 100644 index 00000000..9e067ddb --- /dev/null +++ b/configs/peteish-anneal/mmlu-web/decontaminate.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_PATH=$(realpath "$0") + +bloom_filter_file=/tmp/oe-eval-data-dedupe_ngrams_8_1-train_dev_test.bin +remote_bloom_filter_file=s3://ai2-llm/bloom-filters/oe-eval-data-dedupe_ngrams_8_1-20241018-train_dev_test.bin + +aws s3 cp $remote_bloom_filter_file $bloom_filter_file +size=331605257 + +dolma dedupe \ + --documents \ + "${HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_minhash_dedup/documents/*.json.zst" \ + --dedupe.name dedupe_ngrams_8_1_all_train \ + --dedupe.paragraphs.attribute_name dedupe_ngrams_8_1_all_train \ + --dedupe.paragraphs.by_ngram.ngram_length 8 \ + --dedupe.paragraphs.by_ngram.skip_short_paragraphs \ + --dedupe.paragraphs.by_ngram.stride 1 \ + --dedupe.paragraphs.by_ngram.overlap_threshold 0 \ + --dedupe.skip_empty \ + --bloom_filter.file $bloom_filter_file \ + --bloom_filter.read_only \ + --bloom_filter.estimated_doc_count $size \ + --bloom_filter.desired_false_positive_rate 0.001 \ + --processes "$(expr $(nproc) - 4)" + + +dolma -c "$(dirname ${SCRIPT_PATH})/remove_all_train.yaml" mix --processes $(expr $(nproc) - 4) diff --git a/configs/peteish-anneal/mmlu-web/dedupe.yaml b/configs/peteish-anneal/mmlu-web/dedupe.yaml new file mode 100644 index 00000000..c67902e9 --- /dev/null +++ b/configs/peteish-anneal/mmlu-web/dedupe.yaml @@ -0,0 +1,22 @@ +documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web/documents/full/*.zst + +dedupe: + name: dedupe_para_ngrams_13_1 + paragraphs: + attribute_name: dedupe_para_ngrams_13_1 + by_ngram: + ngram_length: 13 + stride: 1 + overlap_threshold: 0.5 + skip_short_paragraphs: true + skip_empty: true + +bloom_filter: + 
file: ${oc.env:HOME}/bloom/mmmlu_web_dedupe_para_ngrams_13_1.bloom + read_only: false + # set to the number of words + estimated_doc_count: 64_000_000_000 + desired_false_positive_rate: 0.1 + +processes: 188 diff --git a/configs/peteish-anneal/mmlu-web/make.yaml b/configs/peteish-anneal/mmlu-web/make.yaml new file mode 100644 index 00000000..9df09403 --- /dev/null +++ b/configs/peteish-anneal/mmlu-web/make.yaml @@ -0,0 +1,28 @@ +streams: + - name: dclm + documents: + - s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/full/*.zstd + attributes: + - flashcards_domains_v1 + output: + max_size_in_bytes: 3_814_697_265 + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web/documents/full + discard_fields: + - attributes + + compression: + input: zst + output: zst + + filter: + include: + # Only include documents whose domains have flashcards content + - .attributes.flashcards_domains_v1__flashcards_domains_v1__url != null + syntax: jq + span_replacement: [] + +work_dir: + input: ${oc.env:HOME}/ai2-llm/work_dir/dclm/mmlu_web/input + output: ${oc.env:HOME}/ai2-llm/work_dir/dclm/mmlu_web/output + +processes: 188 diff --git a/configs/peteish-anneal/mmlu-web/mix.yaml b/configs/peteish-anneal/mmlu-web/mix.yaml new file mode 100644 index 00000000..afc89853 --- /dev/null +++ b/configs/peteish-anneal/mmlu-web/mix.yaml @@ -0,0 +1,23 @@ +streams: + - name: dclm + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web/documents/full/*.zst + attributes: + - dedupe_para_ngrams_13_1 + + output: + max_size_in_bytes: 1_073_741_824 + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_deduped/documents/ + filter: + include: + - >- + (.attributes.dedupe_para_ngrams_13_1 | length == 0) or + ((.attributes.dedupe_para_ngrams_13_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) <= 0.3) + + syntax: jq + +work_dir: + input: ${oc.env:HOME}/ai2-llm/work_dir/dclm/v0_mmlu_web_deduped/input + output:
${oc.env:HOME}/ai2-llm/work_dir/dclm/v0_mmlu_web_deduped/output + +processes: 188 diff --git a/configs/peteish-anneal/mmlu-web/remove_all_train.yaml b/configs/peteish-anneal/mmlu-web/remove_all_train.yaml new file mode 100644 index 00000000..b4bc0d56 --- /dev/null +++ b/configs/peteish-anneal/mmlu-web/remove_all_train.yaml @@ -0,0 +1,13 @@ +streams: + - name: dclm + documents: &documents + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_minhash_dedup/documents/*.json.zst + attributes: &attributes + - dedupe_ngrams_8_1_all_train + output: + max_size_in_bytes: 200_000_000 + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_minhash_dedup_decontam/documents + filter: + exclude: + - ([.attributes.dedupe_ngrams_8_1_all_train[] | select(.[2] >= 0.1)] | length != 0) + syntax: jq diff --git a/configs/peteish-anneal/mmlu-web/tokenize.yaml b/configs/peteish-anneal/mmlu-web/tokenize.yaml new file mode 100644 index 00000000..7cdc88f3 --- /dev/null +++ b/configs/peteish-anneal/mmlu-web/tokenize.yaml @@ -0,0 +1,16 @@ +destination: ${oc.env:HOME}/ai2-llm/preprocessed/dclm/v0_mmlu_web_minhash_dedup_decontam/allenai/dolma2-tokenizer +documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_minhash_dedup_decontam/documents/* + +processes: 128 +seed: 3920 +max_size: 4_294_967_296 +dtype: uint32 + +tokenizer: + name_or_path: allenai/dolma2-tokenizer + bos_token_id: null + eos_token_id: 100257 + pad_token_id: 100277 + segment_before_tokenization: false + encode_special_tokens: true diff --git a/configs/peteish-anneal/olmoe.sh b/configs/peteish-anneal/olmoe.sh new file mode 100644 index 00000000..64f6620b --- /dev/null +++ b/configs/peteish-anneal/olmoe.sh @@ -0,0 +1,47 @@ +collections=( + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/dclm/*/*.json.zst" + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/flan/*.json.gz" + 
"${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/codesearchnet-owmfilter/*/*.jsonl.gz" + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/basic_math/*TRAIN.jsonl" + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/gsm8k-synth/resample_v1_6x/*.jsonl" + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/gsm_mind/*/*.jsonl" + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/gsm8k/*/train/*.jsonl.zst" + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/mathcoder2-synthmath/ajibawa-2023/*.jsonl" + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/mathcoder2-synthmath/m-a-p_Matrix/*/*.jsonl" + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/metamath-owmfilter/*.jsonl.gz" + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/tinyGSM-MIND/*/*.jsonl.gz" + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/tulu_math/*/*.jsonl" + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/pes2o/*.json.gz" + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/stackexchange/*.json.gz" + "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/wiki/*.json.gz" +) + +for path in "${collections[@]}"; do + name=$(echo "${path}" | sed -E 's|.*/documents/([^*]+).*|\1|' | sed 's:^/::; s:/$::') + destination="${HOME}/ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/${name}" + + echo "Tokenizing $path to $destination" + echo "Number of files: $(ls -1 $path 2>/dev/null | wc -l)" + + if [[ "$name" == *"dclm"* ]]; then + processes=$(($(nproc) - 4)) + else + processes=20 + fi + + set -ex + dolma tokens \ + --documents "${path}" \ + --destination $destination \ + --no-tokenizer.segment_before_tokenization \ + --tokenizer.name_or_path 
"allenai/gpt-neox-olmo-dolma-v1_5" \ + --tokenizer.eos_token_id 50279 \ + --tokenizer.pad_token_id 1 \ + --tokenizer.encode_special_tokens \ + --processes ${processes} \ + --seed 3920 \ + --max_size 1073741824 \ + --sample_ring_prop \ + --dtype uint16 + set +ex +done diff --git a/configs/peteish-anneal/olmoe_mix.yaml b/configs/peteish-anneal/olmoe_mix.yaml new file mode 100644 index 00000000..4280b988 --- /dev/null +++ b/configs/peteish-anneal/olmoe_mix.yaml @@ -0,0 +1,44 @@ +target_size: 200G + +sources: + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/dclm/*.npy + mix_percent: 0.4922 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/pes2o/*.npy + mix_percent: 0.0652 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/flan/*.npy + mix_percent: 0.1667 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/codesearchnet-owmfilter/*.npy + sample_percent: 2.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/dolmino_math_synth/basic_math/*.npy + sample_percent: 2.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/dolmino_math_synth/gsm_mind/*.npy + sample_percent: 2.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/dolmino_math_synth/gsm8k-synth/resample_v1_6x/*.npy + sample_percent: 2.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/gsm8k/*.npy + sample_percent: 2.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/mathcoder2-synthmath/*/*.npy + sample_percent: 2.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/metamath-owmfilter/*.npy + sample_percent: 2.0 + + - source: 
s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/tinyGSM-MIND/*.npy + sample_percent: 2.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/tulu_math/*.npy + sample_percent: 2.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/stackexchange/*.npy + sample_percent: 2.0 + + - source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/wiki/*.npy + sample_percent: 2.0 diff --git a/configs/peteish-anneal/stackexchange/fuzzy-dedupe.yaml b/configs/peteish-anneal/stackexchange/fuzzy-dedupe.yaml new file mode 100644 index 00000000..2cfdfca8 --- /dev/null +++ b/configs/peteish-anneal/stackexchange/fuzzy-dedupe.yaml @@ -0,0 +1,22 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/stackexchange/v0/documents/20240930/*zst + +dedupe: + name: dedupe_para_ngrams_13_1 + paragraphs: + attribute_name: dedupe_para_ngrams_13_1 + by_ngram: + ngram_length: 13 + stride: 1 + overlap_threshold: 0.5 + skip_short_paragraphs: true + skip_empty: true + +bloom_filter: + file: ${oc.env:HOME}/stackexchange.bin + read_only: false + # set to the number of words + estimated_doc_count: 10_000_000_000 + desired_false_positive_rate: 0.01 + +processes: 16 diff --git a/configs/peteish-anneal/stackexchange/mix-base.yaml b/configs/peteish-anneal/stackexchange/mix-base.yaml new file mode 100644 index 00000000..390af697 --- /dev/null +++ b/configs/peteish-anneal/stackexchange/mix-base.yaml @@ -0,0 +1,26 @@ +streams: + - name: stackexchange + documents: + - s3://ai2-llm/pretraining-data/sources/stackexchange/v0/documents/20240930/*zst + attributes: + - dedupe_para_ngrams_13_1 + output: + max_size_in_bytes: 1_073_741_824 + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/stackexchange/v1_dedupe/documents + filter: + include: + - >- + (.attributes.dedupe_para_ngrams_13_1 | length == 0) or + ((.attributes.dedupe_para_ngrams_13_1 | map(.[2] * (.[1] - .[0])) | add) / (.text
| length) <= 0.3) + exclude: + - >- + .metadata.question_score < 3 + - >- + .metadata.answer_score < 5 + syntax: jq + +work_dir: + input: ${oc.env:HOME}/ai2-llm/work_dir/stackexchange/v1_dedupe_para_ngrams_13_1/input + output: ${oc.env:HOME}/ai2-llm/work_dir/stackexchange/v1_dedupe_para_ngrams_13_1/output + +processes: 188 diff --git a/configs/peteish-anneal/stackexchange/tokens.yaml b/configs/peteish-anneal/stackexchange/tokens.yaml new file mode 100644 index 00000000..9b2ec938 --- /dev/null +++ b/configs/peteish-anneal/stackexchange/tokens.yaml @@ -0,0 +1,16 @@ +destination: ${oc.env:HOME}/ai2-llm/preprocessed/stackexchange/v1_dedupe/documents/allenai/dolma2-tokenizer +documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/stackexchange/v1_dedupe/documents/* + +processes: 16 +seed: 3920 +max_size: 4_294_967_296 +dtype: uint32 + +tokenizer: + name_or_path: allenai/dolma2-tokenizer + bos_token_id: null + eos_token_id: 100257 + pad_token_id: 100277 + segment_before_tokenization: false + encode_special_tokens: true diff --git a/configs/peteish-anneal/tokens-fw25.yaml b/configs/peteish-anneal/tokens-fw25.yaml new file mode 100644 index 00000000..58406235 --- /dev/null +++ b/configs/peteish-anneal/tokens-fw25.yaml @@ -0,0 +1,16 @@ +destination: ${oc.env:HOME}/ai2-llm/preprocessed/sources/dclm/v1_fwEdu25/documents/full/allenai/dolma2-tokenizer +documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v1_fwEdu25/documents/full/* + +processes: 128 +seed: 3920 +max_size: 4_294_967_296 +dtype: uint32 + +tokenizer: + name_or_path: allenai/dolma2-tokenizer + bos_token_id: null + eos_token_id: 100257 + pad_token_id: 100277 + segment_before_tokenization: false + encode_special_tokens: true diff --git a/configs/peteish-anneal/tokens-nvidia25.yaml b/configs/peteish-anneal/tokens-nvidia25.yaml new file mode 100644 index 00000000..162ebdac --- /dev/null +++ b/configs/peteish-anneal/tokens-nvidia25.yaml @@ -0,0 +1,16 @@ +destination: 
${oc.env:HOME}/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/documents/full/allenai/dolma2-tokenizer +documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v1_nvidia25/documents/full/* + +processes: 128 +seed: 3920 +max_size: 4_294_967_296 +dtype: uint32 + +tokenizer: + name_or_path: allenai/dolma2-tokenizer + bos_token_id: null + eos_token_id: 100257 + pad_token_id: 100277 + segment_before_tokenization: false + encode_special_tokens: true diff --git a/pyproject.toml b/pyproject.toml index a4957551..cf67d64d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -112,6 +112,8 @@ dev = [ "isort>=5.10.1", "mypy>=0.971", "pytest>=5.2", + "types-PyYAML", + "types-dateparser" ] # extension to process code code = ["detect-secrets==1.4.0", "beautifulsoup4>=4", "pygments", "regex"] diff --git a/python/dolma/cli/deduper.py b/python/dolma/cli/deduper.py index de6a43d5..d263d4ca 100644 --- a/python/dolma/cli/deduper.py +++ b/python/dolma/cli/deduper.py @@ -192,7 +192,6 @@ def run(cls, parsed_config: DeduperConfig): # perform some path validation to make sure we don't call the mixer with invalid config total_matching_documents = 0 for document in parsed_config.documents: - if not any( fnmatch.fnmatch(dict_config["dedupe"]["document_dir"], part) for part in document.split(os.sep) ): diff --git a/python/dolma/cli/mixer.py b/python/dolma/cli/mixer.py index 943d7f74..2d39d149 100644 --- a/python/dolma/cli/mixer.py +++ b/python/dolma/cli/mixer.py @@ -144,7 +144,6 @@ def run(cls, parsed_config: MixerConfig): # perform some path validation to make sure we don't call the mixer with invalid config total_matching_documents = 0 for document in stream_config.documents: - current_matching_documents = sum(1 for _ in glob_path(document)) if current_matching_documents == 0: # only raise a warning if no documents are found for a single path diff --git a/python/dolma/core/runtime.py b/python/dolma/core/runtime.py index ac5e2a23..d14a4cc0 100644 --- a/python/dolma/core/runtime.py +++ 
b/python/dolma/core/runtime.py @@ -27,8 +27,17 @@ TaggerOutputDictType, ) from .errors import DolmaFatalError, DolmaRetryableFailure, DolmaShardError +from .loggers import get_logger from .parallel import BaseParallelProcessor, QueueType -from .paths import delete_dir, join_path, make_relative, mkdir_p, split_glob, split_path +from .paths import ( + delete_dir, + exists, + join_path, + make_relative, + mkdir_p, + split_glob, + split_path, +) from .registry import TaggerRegistry from .utils import import_modules, make_variable_name @@ -178,10 +187,10 @@ def _make_output_streams( mkdir_p(parent) # open a new file and create a new encoder - io = stack.enter_context(smart_open.open(loc.path, **open_kwargs)) + io_ = stack.enter_context(smart_open.open(loc.path, **open_kwargs)) encoder = msgspec.json.Encoder() opened[loc.path] = TaggerOutputIO( - exp=loc.exp, taggers=set(), path=loc.path, io=io, encoder=encoder + exp=loc.exp, taggers=set(), path=loc.path, io=io_, encoder=encoder ) # keep track of which taggers are writing to this paths @@ -223,7 +232,7 @@ def _write_sample_to_streams( class TaggerProcessor(BaseParallelProcessor): @classmethod - def increment_progressbar( # type: ignore + def increment_progressbar( # type: ignore # pylint: disable=arguments-differ cls, queue: QueueType, # queue must be the first argument, and it should be a positional-only argument /, @@ -245,6 +254,10 @@ def process_single( **kwargs, ): """Lets count run the taggers! 
We will use the destination path to save each tagger output.""" + + # get a logger + logger = get_logger(cls.__name__) + # import tagger modules taggers_modules = kwargs.get("taggers_modules", None) if taggers_modules is not None: @@ -264,7 +277,9 @@ def process_single( # this is the dictionary that will hold the output of each tagger taggers_paths = _determine_output_paths_for_taggers( - experiment_name=experiment_name, destination=destination_path, taggers=taggers + experiment_name=experiment_name, + destination=destination_path, + taggers=taggers, ) # skip on failure @@ -283,6 +298,27 @@ def process_single( # total number of documents processed total_docs_cnt = 0 + if not kwargs.get("ignore_existing", False): + # we group taggers by their path (this is for cases when two taggers are going to same file) + # and then remove all taggers whose output path already exists (only when ignore_existing is not set) + _taggers_by_path: Dict[str, list[str]] = {} + for tagger_name, tagger_path in taggers_paths.items(): + _taggers_by_path.setdefault(tagger_path.path, []).append(tagger_name) + + # actually take care of removal here + for tagger_path, tagger_names in _taggers_by_path.items(): + if exists(tagger_path): + for tagger_name in tagger_names: + logger.info("Skipping %s because %s already exists.", tagger_name, tagger_path) + taggers.pop(tagger_name) + taggers_paths.pop(tagger_name) + + if not taggers: + # if all taggers have been removed, we return early + cls.increment_progressbar(queue, files=1) + logger.info("All taggers for %s have been skipped.", source_path) + return + # creating dedicated decoder speeds up the process # if any of the taggers require metadata, we use a decoder that can handle it # otherwise, we use a decoder that does not parse metadata, which is faster @@ -327,7 +363,7 @@ def process_single( # double the update interval if the queue is full update_interval *= 2 - except Exception as exp: + except Exception as exp: # pylint: disable=broad-except # handle any
exception that might have occurred msg = f"Failed to process {source_path} due to {exp.__class__.__name__}: {' '.join(exp.args)}" if exp.__class__.__name__ == "IncompleteReadError": diff --git a/python/dolma/taggers/language.py b/python/dolma/taggers/language.py index 121fd5c6..91fdedfe 100644 --- a/python/dolma/taggers/language.py +++ b/python/dolma/taggers/language.py @@ -4,14 +4,14 @@ @kylel, @soldni """ -from typing import TYPE_CHECKING, List, Tuple +from typing import TYPE_CHECKING, Iterable, List, Tuple import necessary import regex from anyascii import anyascii from ..core.data_types import DocResult, Document, Span -from ..core.ft_tagger import BaseFastTextTagger +from ..core.ft_tagger import BaseFastTextTagger, Prediction, TextSlice from ..core.registry import TaggerRegistry from ..core.taggers import BaseTagger from ..core.utils import split_paragraphs @@ -32,14 +32,17 @@ with necessary.necessary("lingua", soft=True) as LINGUA_AVAILABLE: if LINGUA_AVAILABLE or TYPE_CHECKING: - from lingua import Language, LanguageDetectorBuilder + from lingua import ( # pylint: disable=import-error # pyright: ignore + Language, + LanguageDetectorBuilder, + ) class BaseLanguageTagger(BaseTagger): INCLUDE_NEGATIVE = True PREDICT_ON_PARAGRAPHS = False - def predict_text(self, text: str) -> List[Tuple[str, float]]: + def predict_text(self, text: str) -> List[Tuple[str, float]]: # pylint: disable=unused-argument return [] def make_negative(self, spans: List[Span]) -> List[Span]: @@ -79,7 +82,7 @@ def __init__(self) -> None: raise ImportError(f"cld3 is not installed, cannot instantiate {self.__class__.__name__}") def predict_text(self, text: str) -> List[Tuple[str, float]]: - pred = cld3.get_language(text) # pyright: ignore + pred = cld3.get_language(text) # pyright: ignore # pylint: disable=possibly-used-before-assignment score = pred.probability if pred.language == "en" else 0.0 return [("en", score)] @@ -114,7 +117,7 @@ def predict_text(self, text: str) -> List[Tuple[str, 
float]]: is_reliable = False for fn in (self._identity_fn, self._to_ascii_input, self._sanitize_input): try: - is_reliable, _, details = cld2.detect(fn(text)) + is_reliable, _, details = cld2.detect(fn(text)) # pylint: disable=possibly-used-before-assignment break except cld2.error: ... @@ -146,13 +149,16 @@ class Cld2EnglishLanguageParagraphTagger(Cld2EnglishLanguageTagger): @TaggerRegistry.add("ft_lang_id_doc_v1") class FastTextAllLanguagesDocumentTagger(BaseLanguageTagger, BaseFastTextTagger): - MODEL_PATH = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin" + MODEL_PATH = "https://dolma-artifacts.org/lang_id_models/fbai/lid.176.bin" INCLUDE_NEGATIVE = False PREDICT_ON_PARAGRAPHS = False def __init__(self): BaseFastTextTagger.__init__(self, model_path=self.MODEL_PATH, model_mode=self.DOCUMENT_LEVEL_TAGGER) + def predict_slice(self, text_slice: TextSlice) -> Iterable[Prediction]: + raise RuntimeError("This method should not be called; please report this issue.") + def predict_text(self, text: str) -> List[Tuple[str, float]]: preds = self.classifier.predict(text.lower().replace("\n", " ").strip(), k=-1) return [(label.replace("__label__", ""), float(score)) for label, score in zip(*preds)] @@ -165,6 +171,16 @@ def predict_text(self, text: str) -> List[Tuple[str, float]]: return [(lang, round(score, 2)) for lang, score in out if score > 0.01] +@TaggerRegistry.add("glotlid_doc_v3") +class FastTextAllLanguagesDocumentGlotV3Tagger(FastTextAllLanguagesDocumentTagger): + MODEL_PATH = "https://dolma-artifacts.org/lang_id_models/cis-lmu/glotlid/model_v3.bin" + + +@TaggerRegistry.add("glotlid_doc_v3_1e2") +class FastTextAllLanguagesDocumentGlotV3MinScoreTagger(FastTextAllLanguagesDocumentMinScoreTagger): + MODEL_PATH = "https://dolma-artifacts.org/lang_id_models/cis-lmu/glotlid/model_v3.bin" + + @TaggerRegistry.add("ft_lang_id_paragraph_v1") class FastTextAllLanguageParagraphTagger(FastTextAllLanguagesDocumentTagger): INCLUDE_NEGATIVE = False @@ -203,7 
+219,8 @@ def __init__(self) -> None: if not LANGDETECT_AVAILABLE: raise ImportError("langdetect is not installed, please run `pip install dolma[lang]`.") - (factory := DetectorFactory()).load_profile(PROFILES_DIRECTORY) + factory = DetectorFactory() # pylint: disable=possibly-used-before-assignment + factory.load_profile(PROFILES_DIRECTORY) # pylint: disable=possibly-used-before-assignment factory.set_seed(0) self.detector = factory.create() super().__init__() @@ -213,7 +230,7 @@ def predict_text(self, text: str) -> List[Tuple[str, float]]: self.detector.append(text) langs = self.detector.get_probabilities() output = [(str(r.lang.strip().lower()), float(r.prob)) for r in langs] - except LangDetectException: + except LangDetectException: # pylint: disable=possibly-used-before-assignment output = [] finally: self.detector.text = "" @@ -253,7 +270,11 @@ def __init__(self) -> None: super().__init__() if not LINGUA_AVAILABLE: raise ImportError("langdetect is not installed, please run `pip install dolma[lang]`.") - self.detector = LanguageDetectorBuilder.from_languages(*Language.all()).build() + + all_languages = Language.all() # pylint: disable=possibly-used-before-assignment + self.detector = LanguageDetectorBuilder.from_languages( # pylint: disable=possibly-used-before-assignment + *all_languages + ).build() def predict_text(self, text: str) -> List[Tuple[str, float]]: langs_conf = self.detector.compute_language_confidence_values(text) or [] diff --git a/python/dolma/taggers/url.py b/python/dolma/taggers/url.py index 374a94df..285d828a 100644 --- a/python/dolma/taggers/url.py +++ b/python/dolma/taggers/url.py @@ -116,8 +116,11 @@ def clean_url(cls, url: str) -> Generator[str, None, None]: if url is None or not url.strip(): return - parsed = urllib3.util.parse_url(url) - yield f"{parsed.host}{(f':{parsed.port}') if parsed.port else ''}{parsed.path or ''}".rstrip("/").lower() + try: + p_url = urllib3.util.parse_url(url) + yield f"{p_url.host}{(f':{p_url.port}') if 
p_url.port else ''}{p_url.path or ''}".rstrip("/").lower() + except Exception: + LOGGER.info(f"Failed to parse URL: {url}") def check_url(self, url: str) -> bool: return url in self.blocklist @@ -215,6 +218,11 @@ class BlocklistProjectNsfwTagger(BaseDomainTagger): BLOCKLIST_PATHS = ["https://dolma-artifacts.org/blocklist_project/blocklist_project-20240207/porn.txt"] +@TaggerRegistry.add("flashcards_domains_v1") +class FlashcardsDomainsTagger(BaseDomainTagger): + BLOCKLIST_PATHS = ["https://dolma-artifacts.org/flashcard_domains/flashcard_domains-20241113/domains.txt"] + + @TaggerRegistry.add("blocklist_project_social_v1") class BlocklistProjectSocialTagger(BaseDomainTagger): BLOCKLIST_PATHS = [ diff --git a/scripts/inspect_tokenized.py b/scripts/inspect_tokenized.py new file mode 100644 index 00000000..42439f9e --- /dev/null +++ b/scripts/inspect_tokenized.py @@ -0,0 +1,47 @@ +import os +import click +from dolma.core.paths import cached_path +import numpy as np +from transformers import AutoTokenizer + + +@click.command() +@click.argument("tokenized_file") +@click.option("--tokenizer-name-or-path", default="allenai/gpt-neox-olmo-dolma-v1_5") +@click.option("--chunk-size", default=1024**2, type=int) +def inspect_tokenized(tokenized_file: str, tokenizer_name_or_path: str, chunk_size: int): + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) + + print('Vocab size:', tokenizer.vocab_size) + print('BOS token:', tokenizer.bos_token_id) + print('EOS token:', tokenizer.eos_token_id) + print('PAD token:', tokenizer.pad_token_id) + print('UNK token:', tokenizer.unk_token_id) + + path = cached_path(tokenized_file) + size = os.path.getsize(path) + data = np.memmap(path, dtype='uint16', mode='r', shape=(size // 2,)) + + collection = [] + i = 0 + while i < len(data): + chunk = data[i : i + chunk_size] + i += chunk_size + + while (chunk == tokenizer.eos_token_id).any(): + # split chunk into before and after eos + eos_idx = np.where(chunk == 
tokenizer.eos_token_id)[0][0] + 1 + collection.extend(chunk[:eos_idx].tolist()) + output = tokenizer.decode(collection) + print('#' * os.get_terminal_size().columns) + print(output) + input("#" * os.get_terminal_size().columns) + # reset collection + collection = [] + chunk = chunk[eos_idx:] + + collection.extend(chunk.tolist()) + + +if __name__ == "__main__": + inspect_tokenized() diff --git a/scripts/make_npy_mix.py b/scripts/make_npy_mix.py new file mode 100644 index 00000000..3f2e819f --- /dev/null +++ b/scripts/make_npy_mix.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 + +import argparse +import fnmatch +import logging +import math +import random +import sys +from dataclasses import dataclass +from typing import Callable, Generator +from urllib.parse import urlparse + +import boto3 +import yaml + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +units_map = { + "B": 9, + "T": 12, + "G": 9, + "M": 6, + "K": 3, +} + + +@dataclass(frozen=True) +class SourceConfig: + source: str + mix_percent: float | None = None + sample_percent: float = 1.0 + + def __post_init__(self): + if self.mix_percent is not None and (self.mix_percent < 0 or self.mix_percent > 1): + raise ValueError("mix_percent must be between 0 and 1") + + @property + def bucket(self) -> str: + return urlparse(self.source).netloc + + @property + def prefix(self) -> str: + path = urlparse(self.source).path.lstrip("/") + for i, char in enumerate(path): + if char in ["*", "?", "["]: + return path[:i] + return path + + def sample(self, total_size: int) -> tuple[list[str], int]: + formatter = make_formatter(total_size) + + try: + all_paths, all_sizes = map(list, zip(*self.glob)) + except ValueError: + raise ValueError(f"No files found for source {self.source}") + + source_size = sum(all_sizes) + + if self.mix_percent is not None: + target_size = int(round(total_size * self.mix_percent)) + else: + target_size = int(round(source_size * self.sample_percent)) + + logger.info( + 
f"Sampling {formatter(target_size)} bytes from {formatter(source_size)} " + f"from {self.source} ({target_size / total_size:.2%})" + ) + + # Randomly sample files + running_size = 0 + selected = [] + + # double while loop to allow for sampling over 100% if needed + while running_size < target_size: + all_paths_copy, all_sizes_copy = all_paths[:], all_sizes[:] + while len(all_paths_copy) > 0: + idx = random.randint(0, len(all_paths_copy) - 1) + path = all_paths_copy.pop(idx) + size = all_sizes_copy.pop(idx) + selected.append(path) + + running_size += size + if running_size >= target_size: + break + + return selected, running_size + + @property + def glob(self) -> Generator[tuple[str, int], None, None]: + client = boto3.client("s3") + + # Use paginator to handle cases with many objects + paginator = client.get_paginator("list_objects_v2") + page_iterator = paginator.paginate(Bucket=self.bucket, Prefix=self.prefix) + + for page in page_iterator: + if "Contents" not in page: + continue + + for obj in page["Contents"]: + path = f"s3://{self.bucket}/{obj['Key']}" + if not path.endswith(".npy"): + continue + + # Use fnmatch to check if the object key matches the pattern + if "*" not in self.source or fnmatch.fnmatch(path, self.source): + yield path, obj["Size"] + + @classmethod + def from_dict(cls, data: dict) -> "SourceConfig": + return cls( + source=data["source"], + mix_percent=data.get("mix_percent"), + sample_percent=data.get("sample_percent") or 1.0, + ) + + +@dataclass(frozen=True) +class SamplingConfig: + target_size: float | int | str + sources: list[SourceConfig] + output: str | None = None + seed: int = 42 + + def __post_init__(self): + if isinstance(self.target_size, str): + # check if string is in the format "xxxS" where S is a suffix for size (e.g. 
G, M, K) + try: + self.size + except ValueError as e: + raise ValueError("Invalid target size format") from e + + if len(self.sources) == 0: + raise ValueError("Must specify at least one source") + + random.seed(self.seed) + + @property + def size(self) -> int: + if isinstance(self.target_size, float) or isinstance(self.target_size, int): + return int(self.target_size) + + suffix = self.target_size[-1].upper() + try: + size = float(self.target_size[:-1]) + except ValueError: + raise ValueError("Invalid target size format") + + digits = units_map.get(suffix) + if digits is None: + raise ValueError("Invalid target size suffix") + return int(size * 10 ** digits) + + @classmethod + def from_yaml(cls, path: str) -> "SamplingConfig": + with open(path) as f: + data = yaml.safe_load(f) + return cls.from_dict(data) + + @classmethod + def from_dict(cls, data: dict) -> "SamplingConfig": + return cls( + target_size=data["target_size"], + sources=[SourceConfig.from_dict(source) for source in data.get("sources", [])], + output=data.get("output"), + ) + + +def make_formatter(total_size: int) -> Callable[[int], str]: + num_digits = (math.floor(math.log10(total_size))) // 3 * 3 + suffix = {v: k for k, v in units_map.items()}.get(num_digits, f"e{num_digits}") + + def formatter(size: int, _num_digits: int = num_digits, _suffix: str = suffix) -> str: + value = size / 10 ** _num_digits + return f"{value:.1f}{_suffix}" + + return formatter + + +def main(): + parser = argparse.ArgumentParser(description="Sample files from S3 datasets") + parser.add_argument("config", type=str, help="Path to config YAML file") + args = parser.parse_args() + + if args.config == "-": + config = SamplingConfig.from_dict(yaml.safe_load(sys.stdin)) + else: + config = SamplingConfig.from_yaml(args.config) + + formatter = make_formatter(config.size) + + total = 0 + rows = ["data:", " paths:"] + for source in config.sources: + paths, size = source.sample(config.size) + logger.info( + f"Selected {len(paths)} files 
from {source.source} " + f"({formatter(size)} bytes; {size / config.size:.2%})" + ) + rows.append(f"\n # {source.source} ({formatter(size)};{size / config.size:.2%})") + rows.extend([f" - {path}" for path in sorted(paths)]) + total += size + + logger.info(f"Total size: {formatter(config.size)} bytes requested, " + f"{formatter(total)} bytes selected ({total / config.size:.2%})") + + output_text = "\n".join(rows) + if config.output: + with open(config.output, "w") as f: + f.write(output_text) + else: + print(output_text) + + +if __name__ == "__main__": + main() diff --git a/scripts/tokenize_sft_dataset.py b/scripts/tokenize_sft_dataset.py new file mode 100644 index 00000000..7b93e65f --- /dev/null +++ b/scripts/tokenize_sft_dataset.py @@ -0,0 +1,216 @@ +""" +Script for preparing the Tulu data for fine-tuning an OLMo model. + +python scripts/tokenize_sft_dataset.py \ + --tokenizer.name_or_path allenai/dolma2-tokenizer \ + --tokenizer.bos_token_id 100257 \ + --tokenizer.eos_token_id 100257 \ + --tokenizer.pad_token_id 100277 \ + --dataset.path allenai/tulu-v3.9-tmp + +""" + +from argparse import ArgumentParser +from dataclasses import dataclass +from functools import partial +from pathlib import Path + +import datasets as ds +import numpy as np + +from dolma.tokenizer.tokenizer import Tokenizer +from dolma.cli.tokenizer import TokenizerConfig +from dolma.cli import field, BaseCli + + +@dataclass +class DatasetConfig: + path: str | None = field(default=None, help="Path or name of the dataset. 
Required.") + name: str | None = field(default=None, help="Defining the name of the dataset configuration.") + split: str | None = field(default='train', help="Name of the split to load.") + + +@dataclass +class TokenizationConfig: + tokenizer: TokenizerConfig = field(default=TokenizerConfig(), help="Configuration for the tokenizer.") + dataset : DatasetConfig = field(default=DatasetConfig(), help="Configuration for the dataset.") + processes: int = field(default=1, help="Number of parallel processes to use.") + output_dir: str = field(help="Output directory to save the tokenized data.") + max_seq_len: int = field(default=4096, help="Maximum sequence length.") + max_label_len: int | None = field(default=None, help="Maximum label length.") + dtype: None | str = field(default=None, help="Data type for the tokenized data.") + max_tokens_per_file: int = field(default=2 ** 32, help="Maximum number of tokens per file.") + + +def run_tokenizer(opts: TokenizationConfig) -> None: + assert opts.tokenizer is not None, "Tokenizer configuration is missing." + assert opts.tokenizer.name_or_path is not None, "Tokenizer name or path must be provided." + assert getattr(opts, "output_dir", None) is not None, "Output directory is missing." 
+ + opts.max_label_len = opts.max_label_len or opts.max_seq_len + + tokenizer_config = {} + if opts.tokenizer.bos_token_id is not None: + tokenizer_config["bos_token_id"] = opts.tokenizer.bos_token_id + if opts.tokenizer.eos_token_id is not None: + tokenizer_config["eos_token_id"] = opts.tokenizer.eos_token_id + if opts.tokenizer.pad_token_id is not None: + tokenizer_config["pad_token_id"] = opts.tokenizer.pad_token_id + + if Path(opts.tokenizer.name_or_path).is_file(): + tokenizer = Tokenizer.from_file(opts.tokenizer.name_or_path, **tokenizer_config) + else: + tokenizer = Tokenizer.from_pretrained(opts.tokenizer.name_or_path, **tokenizer_config) + + expected_bits = int(np.ceil(np.log2(tokenizer.vocab_size) / 16)) * 16 + expected_dtype = f"uint{expected_bits}" + + if opts.dtype is not None and opts.dtype != expected_dtype: + raise ValueError(f"Invalid data type, expected: {expected_dtype}, got: {opts.dtype}") + elif opts.dtype is None: + np_dtype = getattr(np, expected_dtype) + else: + np_dtype = getattr(np, opts.dtype) + + assert opts.dataset is not None, "Dataset configuration is missing." + assert opts.dataset.path is not None, "Dataset path is missing." 
+ + dataset_config = {} + if opts.dataset.name is not None: + dataset_config["name"] = opts.dataset.name + if opts.dataset.split is not None: + dataset_config["split"] = opts.dataset.split + + dataset = ds.load_dataset(opts.dataset.path, **dataset_config) + + # # sample 10k + # dataset = dataset.shuffle(seed=42).select(range(10000)) + + print("Tokenizing dataset...") + dataset = dataset.map( + partial(preprocess, tokenizer=tokenizer, max_seq_len=opts.max_seq_len), + batched=False, + remove_columns=dataset.column_names, # type: ignore + num_proc=opts.processes, # type: ignore + desc="Tokenizing dataset", # type: ignore + ) + + print("Filtering dataset...") + n = len(dataset) # type: ignore + dataset = dataset.filter( + partial(filter_long_sequences, max_label_len=opts.max_label_len, max_seq_len=opts.max_seq_len), + batched=False, + num_proc=opts.processes, + desc="Filtering sequences that are too long", + ) + print(f"Filtered out {n - len(dataset):,d} examples") + + print(f"Saving results to '{opts.output_dir}'...") + output_dir = Path(opts.output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + + total_tokens = len(dataset) * opts.max_seq_len + batch_size = int(np.floor((opts.max_tokens_per_file / total_tokens) * len(dataset))) + print(f"Saving {len(dataset):,d} examples to {output_dir} in batches of {batch_size:,d} examples") + + dataset.map( + partial(save_memmap, output_dir=output_dir, batch_size=batch_size, dtype=np_dtype), + batched=True, + batch_size=batch_size, + num_proc=opts.processes, + desc="Saving memmaps", + remove_columns=dataset.column_names, # type: ignore + with_indices=True, + ) + + +def save_memmap( + data: dict[str, list], + idx: list[int], + output_dir: Path, + batch_size: int, + dtype: np.dtype +) -> dict[str, list]: + output_dir.mkdir(exist_ok=True, parents=True) + + pos = idx[0] // batch_size + size = sum(len(input_ids) for input_ids in data["input_ids"]) + input_ids_mm = np.memmap(output_dir / f"input_ids_{pos:06d}.npy", dtype=dtype, 
mode="w+", shape=(size,)) + label_mask_mm = np.memmap(output_dir / f"label_mask_{pos:06d}.npy", dtype=np.bool_, mode="w+", shape=(size,)) + + offset = 0 + for input_ids, label_mask in zip(data["input_ids"], data["label_mask"]): + n = len(input_ids) + input_ids_mm[offset : offset + n] = input_ids + label_mask_mm[offset : offset + n] = label_mask + offset += n + + input_ids_mm.flush() + label_mask_mm.flush() + + return {} + + +def filter_long_sequences(example: dict, max_label_len: int = 2 ** 30, max_seq_len: int = 2 ** 30) -> bool: + return ( + example["n_labels"] > 0 + and example["n_labels"] <= max_label_len + and example["n_total"] <= max_seq_len + ) + + +def preprocess(example: dict, tokenizer: Tokenizer, max_seq_len: int) -> dict: + eos_token = tokenizer.base_tokenizer.id_to_token(tokenizer.eos_token_id) + + input_ids = [tokenizer.bos_token_id] + label_mask = [False] + + for msg in example["messages"]: + role_tokens = tokenizer.encode(f"<|{msg['role']}|>\n", add_special_tokens=False) + label_mask += [False] * len(role_tokens) + input_ids += role_tokens + + if msg["role"] == "assistant": + content_tokens = tokenizer.encode( + msg["content"].strip() + eos_token + "\n", add_special_tokens=False + ) + label_mask += [True] * len(content_tokens) + # mask out the last '\n' + assert content_tokens[-2] == tokenizer.eos_token_id + label_mask[-1] = False + else: + content_tokens = tokenizer.encode(msg["content"].strip() + "\n", add_special_tokens=False) + label_mask += [False] * len(content_tokens) + input_ids += content_tokens + + input_ids = input_ids[:max_seq_len] + label_mask = label_mask[:max_seq_len] + + n_total = len(input_ids) + + if len(input_ids) < max_seq_len: + pad_len = max_seq_len - len(input_ids) + input_ids += [tokenizer.pad_token_id] * pad_len + label_mask += [False] * pad_len + elif len(input_ids) > max_seq_len: + input_ids = input_ids[:max_seq_len] + label_mask = label_mask[:max_seq_len] + + assert len(input_ids) == len(label_mask) + n_labels = 
sum(label_mask) + + return {"input_ids": input_ids, "label_mask": label_mask, "n_labels": n_labels, "n_total": n_total} + + +class SftTokenizerCli(BaseCli): + CONFIG = TokenizationConfig + DESCRIPTION = "Tokenize the Tulu V2 SFT dataset." + + @classmethod + def run(cls, parsed_config: TokenizationConfig): + run_tokenizer(parsed_config) + + +if __name__ == "__main__": + parser = SftTokenizerCli.make_parser(ArgumentParser()) + SftTokenizerCli.run_from_args(parser.parse_args()) diff --git a/search/README.md b/search/README.md new file mode 100644 index 00000000..eda8873a --- /dev/null +++ b/search/README.md @@ -0,0 +1,80 @@ +# Dolma Search + +Dolma Search is a toolkit for easy indexing and searching of data in Dolma format. It provides functionality to create, manage, and query indexes using the Tantivy search engine. + +## Features + +- Create and manage Tantivy indexes +- Index documents from various sources, including local files and S3 buckets +- Perform searches on indexed data with customizable queries +- Display search results in different formats (JSON, table, or snippet view) + +## Installation + +You can install Dolma Search using pip: + +```shell +git clone https://github.com/allenai/dolma.git +pip install search +``` + +## Usage + +### Indexing + +To index documents, use the `dolma_search.index` module. Here's an example of how to use it: + +```shell +dolma-search index \ + -i /path/to/index \ + -d "s3://ai2-llm/pretraining-data/sources/path/to/documents/*.gz" +``` + +The following command line options are available: + +| Option | Short | Description | Default | +|--------|-------|-------------|---------| +| `--documents` | `-d` | The documents to index. Can be any glob pattern supported by smart-open library. | Required | +| `--index-path` | `-i` | The path to the index. If not provided, an in-memory index will be used. | None | +| `--force` | `-f` | If the index already exists, delete it and create a new one. 
| False | +| `--num-readers` | `-n` | The number of readers to use. | 1 | +| `--num-indexers` | `-N` | The number of indexers to use. | 1 | +| `--reader-batch-size` | `-b` | The batch size for readers. | 1000 | +| `--indexer-batch-size` | `-B` | The batch size for indexers. | 1000 | +| `--heap-size` | `-H` | The heap size for the index writer. | 1GB | +| `--queue-size-per-thread` | `-q` | The size of the queue to use for storing documents. | 125 | + + + +### Searching + +To search the indexed documents, use the `dolma_search.query` module. Here's an example of how to use it: + + +```shell +dolma-search query \ + -i /data/flan_index \ + -q "What is the capital of France?" +``` + +You can also pass search queries from stdin + +```shell +cat queries.txt | dolma-search query -i /data/flan_index +``` + +You can choose which format to display the results in. Valid options are: + +- `json`: Print the results in JSON format with no coloring; perfect for piping to another program that can parse JSONL output. +- `table`: Print the results in a table format with coloring. +- `snippet`: Print the results in a table format with coloring; snippets containing matches, rather than the full document, are displayed. + +Other options for the `query` command include: + +| Option | Short | Description | Default | +|--------|-------|-------------|---------| +| `--index-path` | `-i` | The path to the index. | Required | +| `--query` | `-q` | The query to search for. If not provided, enters interactive mode. If set to "-", reads queries from stdin. | None | +| `--num-hits` | `-n` | The number of hits to return. | 10 | +| `--display-format` | `-f` | The format to display the search results in. Options: table, json, snippet. | json | +| `--selector` | `-s` | The selector used to process the queries. Uses jq syntax. 
| None | diff --git a/search/pyproject.toml b/search/pyproject.toml new file mode 100755 index 00000000..ad36f1a3 --- /dev/null +++ b/search/pyproject.toml @@ -0,0 +1,107 @@ +[project] +name = "dolma-search" +version = "0.1.0" +description = "Toolkit for easy indexing of data in Dolma format." +authors = [ + {name = "Luca Soldaini", email = "lucas@allenai.org" } +] +license = {text = "Apache-2.0"} +readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + "msgspec>=0.18.6", + "jq>=1.8.0,<2.0.0", + "fsspec[http]<=2024.6.1,>=2023.1.0", + "tantivy>=0.18.0", + "smart-open>=7.0.4,<8.0.0", + "rich>=13.5.0,<14.0.0", + "markdownify>=0.13.1,<0.14.0" +] + +[project.urls] +"Homepage" = "https://github.com/allenai/dolma" +"Repository" = "https://github.com/allenai/dolma" +"Bug Tracker" = "https://github.com/allenai/dolma/issues" + + +[tool.setuptools.packages.find] +where = ["python"] + +[tool.setuptools.package-data] +dolma_search = ["py.typed", "*.pyi"] + + +[project.scripts] +dolma-search = "dolma_search.__main__:main" + + +[build-system] +build-backend = "setuptools.build_meta" +requires = [ + "setuptools >= 61.0.0", + "wheel" +] + +[project.optional-dependencies] +dev = [ + "black>=22.6.0", + "isort>=5.10.1", + "mypy>=0.971", + "pytest>=5.2", + "ipython>=8.4.0", + "autopep8>=1.7.0", + "flake8>=5.0", + "ipdb>=0.13.0", + "flake8-pyi>=22.8.1", + "Flake8-pyproject>=1.1.0" +] + +[tool.black] +line-length = 115 +include = '\.pyi?$' +exclude = ''' +( + __pycache__ + | \.git + | \.mypy_cache + | \.pytest_cache + | \.vscode + | \.venv + | \bdist\b + | \bdoc\b +) +''' + +[tool.isort] +profile = "black" +line_length = 115 +multi_line_output = 3 + +[tool.autopep8] +max_line_length = 115 +in-place = true +recursive = true +aggressive = 3 + +[tool.mypy] +python_version = "3.10" +ignore_missing_imports = true +no_site_packages = true +allow_redefinition = false +warn_unused_configs = true +warn_unused_ignores = true +warn_no_return = true +warn_return_any = false 
+warn_unreachable = true +show_error_codes = true +pretty = true + +[tool.mypy-tests] +strict_optional = false + +[tool.flake8] +per-file-ignores = [ + '__init__.py:F401', + '*.pyi:E302,E305', + '*.py:E203' +] diff --git a/search/python/dolma_search/__init__.py b/search/python/dolma_search/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/search/python/dolma_search/__main__.py b/search/python/dolma_search/__main__.py new file mode 100644 index 00000000..3f970e38 --- /dev/null +++ b/search/python/dolma_search/__main__.py @@ -0,0 +1,33 @@ +import argparse +import sys + +from . import index, query + +CLI_DESCRIPTION = "Dolma Search CLI" + + +def main(): + parser = argparse.ArgumentParser(CLI_DESCRIPTION) + subparsers = parser.add_subparsers(dest="command", help="Available commands") + + # Index subparser + index_parser = subparsers.add_parser("index", help=index.INDEX_DESCRIPTION) + index.make_index_parser(index_parser) + + # Query subparser + query_parser = subparsers.add_parser("query", help=query.QUERY_DESCRIPTION) + query.make_search_parser(query_parser) + + args = parser.parse_args() + + if args.command == "index": + index.index_data(args) + elif args.command == "query": + query.search_data(args) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/search/python/dolma_search/common.py b/search/python/dolma_search/common.py new file mode 100644 index 00000000..6ebec22b --- /dev/null +++ b/search/python/dolma_search/common.py @@ -0,0 +1,31 @@ +import shutil +from enum import Enum +from pathlib import Path + +from tantivy import Index, SchemaBuilder + + +class IndexFields(Enum): + TEXT = "text" + ID = "id" + SOURCE = "source" + + +def create_index(path: str | Path | None = None, reuse: bool = False) -> Index: + # Declaring our schema. 
+ schema_builder = SchemaBuilder() + schema_builder.add_text_field(IndexFields.TEXT.value, stored=True) + schema_builder.add_text_field(IndexFields.ID.value, stored=True) + schema_builder.add_text_field(IndexFields.SOURCE.value, stored=True) + schema = schema_builder.build() + + if path: + path = Path(path) + if not reuse and path.exists(): + shutil.rmtree(path) + + path.mkdir(parents=True, exist_ok=True) + + # Creating our index (in memory) + index = Index(schema, path=str(path), reuse=reuse) + return index diff --git a/search/python/dolma_search/index.py b/search/python/dolma_search/index.py new file mode 100644 index 00000000..df7235d3 --- /dev/null +++ b/search/python/dolma_search/index.py @@ -0,0 +1,204 @@ +""" +python -m dolma_decontamination.search.index \ + -i /data/flan_index \ + -d "s3://ai2-llm/pretraining-data/sources/tulu_flan/v1-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_rulebased/documents/train/*.gz" \ + -n 4 \ + -N 12 \ + -b 1000 \ + -B 50000 \ + -f +""" + +import argparse +import json +import logging +import shutil +import time +from contextlib import ExitStack +from functools import partial +from multiprocessing import Manager, Pool, set_start_method +from pathlib import Path +from queue import Queue +from urllib.parse import urlparse + +import fsspec +import smart_open +import tqdm +from tantivy import Document, Index, SchemaBuilder + +from .common import IndexFields, create_index + +INDEX_DESCRIPTION = "Index documents into a tantivy index" + + +QueueType = Queue[Document | None] + + +def get_fs(uri: str) -> fsspec.AbstractFileSystem: + return fsspec.filesystem(urlparse(uri).scheme or "file") + + +def list_path(pattern: str) -> list[str]: + fs = get_fs(pattern) + protocol = urlparse(pattern).scheme + paths = [] + for path in fs.glob(pattern): + if protocol: + paths.append(f"{protocol}://{path}") + else: + paths.append(str(path)) + del fs + return paths + + +def list_paths(glob_patterns: list[str], num_workers: int = 1) -> 
list[str]: + with Pool(processes=num_workers) as pool: + return [p for ps in pool.map(list_path, glob_patterns) for p in ps] + + +def read_file_for_indexing(file_path: str, docs_queue: Queue[list[Document]], batch_size: int = 1_000): + batch: list[Document] = [] + with smart_open.open(file_path, "rt", encoding="utf-8") as stream: + for line in stream: + row = json.loads(line) + doc = Document(**{f.value: (row[f.value] or "") for f in IndexFields}) + batch.append(doc) + + if len(batch) >= batch_size: + docs_queue.put(batch) + batch = [] + + if batch: + docs_queue.put(batch) + + +def read_many_and_index( + index: Index, + paths: list[str], + num_readers: int = 1, + num_indexers: int = 1, + indexer_batch_size: int = 1_000, + reader_batch_size: int = 1_000, + heap_size: int = 1024 * 1024 * 1024, + queue_size: int = 1000, +): + with ExitStack() as stack: + reader_pool = stack.enter_context(Pool(processes=num_readers)) + + files_pbar = stack.enter_context( + tqdm.tqdm(desc="Reading files", unit=" files", unit_scale=True, total=len(paths)) + ) + docs_pbar = stack.enter_context(tqdm.tqdm(desc="Indexing documents", unit=" docs", unit_scale=True)) + + writer_fn = partial(index.writer, num_threads=num_indexers, heap_size=heap_size) + writer = writer_fn() + + docs_queue: Queue[list[Document]] = (manager := Manager()).Queue(queue_size) + + fn = partial(read_file_for_indexing, docs_queue=docs_queue, batch_size=reader_batch_size) + async_results = [reader_pool.apply_async(fn, [p], callback=lambda _: files_pbar.update(1)) for p in paths] + # for p in paths: + # fn(p) + + indexed_count = 0 + while any(not r.ready() for r in async_results) or not docs_queue.empty(): + # check if there are any documents to index + if docs_queue.empty(): + time.sleep(0.1) + else: + batch = docs_queue.get() + for doc in batch: + writer.add_document(doc) + indexed_count += 1 + + if indexed_count >= indexer_batch_size: + docs_pbar.update(indexed_count) + indexed_count = 0 + writer.commit() + + for r in 
async_results: + r.wait() + + if indexed_count: + docs_pbar.update(indexed_count) + writer.commit() + writer.wait_merging_threads() + + +def make_index_parser(parser: argparse.ArgumentParser | None = None): + parser = parser or argparse.ArgumentParser(INDEX_DESCRIPTION) + parser.add_argument( + "-d", + "--documents", + type=str, + required=True, + nargs="+", + help="The documents to index. Can be any glob pattern supported by smart-open library.", + ) + parser.add_argument( + "-i", + "--index-path", + type=str, + help="The path to the index. If not provided, an in-memory index will be used.", + ) + parser.add_argument( + "-f", "--force", action="store_true", help="If the index already exists, delete it and create a new one." + ) + parser.add_argument("-n", "--num-readers", type=int, default=1, help="The number of readers to use.") + parser.add_argument("-N", "--num-indexers", type=int, default=1, help="The number of indexers to use.") + parser.add_argument( + "-b", + "--reader-batch-size", + type=int, + default=1_000, + ) + parser.add_argument( + "-B", + "--indexer-batch-size", + type=int, + default=1_000, + ) + parser.add_argument( + "-H", + "--heap-size", + type=int, + default=1024 * 1024 * 1024, + ) + parser.add_argument( + "-q", + "--queue-size-per-thread", + type=int, + default=125, + help="The size of the queue to use for storing documents.", + ) + return parser + + +def index_data(args: argparse.Namespace): + set_start_method("spawn") + + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__name__) + + index = create_index(args.index_path, reuse=not args.force) + logger.info("Created index" + (f" stored at {args.index_path}" if args.index_path else " in memory")) + + files = list_paths(args.documents, num_workers=args.num_readers) + logger.info(f"Found {len(files)} files to index") + + # add_paths_to_index(args.index_path, files, num_workers=args.num_workers, batch_size=args.batch_size) + read_many_and_index( + index, + paths=files, + 
num_readers=args.num_readers, + num_indexers=args.num_indexers, + indexer_batch_size=args.indexer_batch_size, + reader_batch_size=args.reader_batch_size, + heap_size=args.heap_size, + queue_size=args.queue_size_per_thread * args.num_readers, + ) + logger.info("Indexed all documents") + + +if __name__ == "__main__": + index_data(make_index_parser().parse_args()) diff --git a/search/python/dolma_search/py.typed b/search/python/dolma_search/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/search/python/dolma_search/query.py b/search/python/dolma_search/query.py new file mode 100644 index 00000000..8533f067 --- /dev/null +++ b/search/python/dolma_search/query.py @@ -0,0 +1,156 @@ +import argparse +import json +import sys +from enum import Enum +from typing import Any, Generator, NamedTuple, Type + +import jq +from markdownify import markdownify as md +from rich.console import Console +from rich.markdown import Markdown +from rich.table import Table +from rich.text import Text +from tantivy import Document, Query, Schema, Searcher, SnippetGenerator + +from .common import IndexFields, create_index + +QUERY_DESCRIPTION = "Interactive search tool on a tantivy index" + + +class DisplayFormat(Enum): + TABLE = "table" + JSON = "json" + SNIPPET = "snippet" + + +def make_search_parser(parser: argparse.ArgumentParser | None = None): + parser = parser or argparse.ArgumentParser(QUERY_DESCRIPTION) + parser.add_argument("-i", "--index-path", type=str, required=True, help="The path to the index.") + parser.add_argument("-q", "--query", type=str, default=None, help="The query to search for.") + parser.add_argument("-n", "--num-hits", type=int, default=10, help="The number of hits to return.") + parser.add_argument( + "-f", + "--display-format", + type=DisplayFormat, + default=DisplayFormat.JSON, + choices=list(DisplayFormat), + help="The format to display the search results in.", + ) + parser.add_argument( + "-s", + "--selector", + type=str, + default=None, + 
help="The selector used to process the queries. Uses jq syntax.", + ) + return parser + + +def query_iterator(query: str | None) -> Generator[str, None, None]: + if query is None: + while True: + try: + query = input("Enter a query: ") + yield query + except KeyboardInterrupt: + print("\nExiting...") + break + elif query == "-": + for line in sys.stdin: + yield line.strip() + else: + yield str(query) + + +def apply_selector(queries: Generator[str, None, None], selector: str | None): + selector = jq.compile(selector) if selector else None + fn = lambda query: (str(e) for e in selector.input(json.loads(query)).all()) if selector else (str(query),) + for query in queries: + yield from fn(query) + + +class HitsTuple(NamedTuple): + score: float + doc: dict[str, list[Any]] + rank: int + + def get(self, field: str) -> str: + return str(self.doc[field][0]) + + def to_dict(self) -> dict[str, Any]: + return { + "document": {f.value: self.get(f.value) for f in IndexFields}, + "score": self.score, + "rank": self.rank, + } + + @classmethod + def from_hits(cls: Type["HitsTuple"], hits: list[tuple[float, int]], searcher: Searcher) -> list["HitsTuple"]: + return [ + cls(score=hit_score, doc=searcher.doc(hit_doc_address), rank=rank) # pyright: ignore + for rank, (hit_score, hit_doc_address) in enumerate(hits, start=1) + ] + + +def print_hits_table( + hits: list[HitsTuple], + searcher: Searcher, + schema: Schema, + query: Query, + show_snippets: bool = False, + console: Console | None = None, +): + console = console or Console() + + table = Table(title="Search Results", show_header=True, header_style="bold", show_lines=True) + table.add_column("Score", justify="right", style="green") + table.add_column(IndexFields.ID.value.upper(), style="magenta") + table.add_column(IndexFields.SOURCE.value.capitalize(), style="cyan") + table.add_column(IndexFields.TEXT.value.capitalize(), style="blue") + + for hit in hits: + if show_snippets: + snippet_generator = SnippetGenerator.create( + 
def search_data(args: argparse.Namespace):
    """Run every query derived from ``args`` against the index and print the hits.

    Queries come from ``query_iterator(args.query)`` after being passed through
    ``apply_selector`` with ``args.selector``. Hits are printed as one JSON object per
    line when ``args.display_format`` is ``DisplayFormat.JSON``, otherwise as a table
    (with snippets when the format is ``DisplayFormat.SNIPPET``).

    Raises:
        ValueError: if a query cannot be parsed; the parser's own error is chained as the cause.
    """
    index = create_index(args.index_path, reuse=True)
    searcher = index.searcher()

    console = Console()

    for query in apply_selector(query_iterator(args.query), args.selector):
        try:
            parsed_query = index.parse_query(query)
        except ValueError as e:
            # Chain explicitly so the traceback presents the parser failure as the cause
            # rather than as an unrelated error raised during handling.
            raise ValueError(f"Error parsing query `{query}`: {e}") from e

        hits = searcher.search(parsed_query, limit=args.num_hits).hits
        parsed_hits = HitsTuple.from_hits(hits, searcher)  # pyright: ignore

        if args.display_format == DisplayFormat.JSON:
            for row in parsed_hits:
                print(json.dumps(row.to_dict(), sort_keys=True))
        else:
            print_hits_table(
                hits=parsed_hits,
                searcher=searcher,
                schema=index.schema,
                query=parsed_query,
                show_snippets=(args.display_format == DisplayFormat.SNIPPET),
                console=console,
            )
tokenizer=TokenizerConfig( + name_or_path="allenai/dolma2-tokenizer", + bos_token_id=None, + eos_token_id=100257, + pad_token_id=100277, + segment_before_tokenization=False, + encode_special_tokens=True, + ), + processes=cpu_count(), + max_size=100_000_000, + dtype='uint32', + sample_ring_prop=True, + ) + + + for subset in ["arxiv/*", "code/*", "web"]: + config = deepcopy(base_config) + config.documents = [ + f"/data/math-ai_AutoMathText/v0/documents/{subset}/*.jsonl.gz" + ] + config.destination = f"{config.destination}/{subset.rstrip('/*')}/{config.tokenizer.name_or_path}" + TokenizerCli.run(config) + +if __name__ == "__main__": + main() diff --git a/sources/AutoMathText/tokens.sh b/sources/AutoMathText/tokens.sh new file mode 100644 index 00000000..8aedff38 --- /dev/null +++ b/sources/AutoMathText/tokens.sh @@ -0,0 +1,44 @@ +#! /usr/bin/env bash + +set -ex + + +dolma tokens \ + --documents '/data/math-ai_AutoMathText/v0/documents/arxiv/*/*.jsonl.gz' \ + --destination "${HOME}/ai2-llm/preprocessed/math-ai_AutoMathText/v0/arxiv/allenai/dolma2-tokenizer" \ + --tokenizer.name_or_path 'allenai/dolma2-tokenizer' \ + --tokenizer.eos_token_id 100257 \ + --tokenizer.pad_token_id 100277 \ + --no-tokenizer.segment_before_tokenization \ + --tokenizer.encode_special_tokens \ + --processes 16 \ + --max_size 100_000_000 \ + --dtype 'uint32' \ + --sample_ring_prop + +dolma tokens \ + --documents '/data/math-ai_AutoMathText/v0/documents/code/*/*.jsonl.gz' \ + --destination "${HOME}/ai2-llm/preprocessed/math-ai_AutoMathText/v0/code/allenai/dolma2-tokenizer" \ + --tokenizer.name_or_path 'allenai/dolma2-tokenizer' \ + --tokenizer.eos_token_id 100257 \ + --tokenizer.pad_token_id 100277 \ + --no-tokenizer.segment_before_tokenization \ + --tokenizer.encode_special_tokens \ + --processes 16 \ + --max_size 100_000_000 \ + --dtype 'uint32' \ + --sample_ring_prop + + +dolma tokens \ + --documents '/data/math-ai_AutoMathText/v0/documents/web/*.jsonl.gz' \ + --destination 
class AutoWebMathProcessor(BaseParallelProcessor):
    """Convert raw AutoMathText JSONL files (arxiv, web and code subsets) into Dolma documents."""

    @classmethod
    def increment_progressbar(
        cls,
        queue: Queue,
        /,
        files: int = 0,
        docs: int = 0,
        words: int = 0,
    ):
        """
        Forward progress counters to the parent implementation. We keep
        track of three things:
        - files: the number of files processed
        - docs: the number of documents processed
        - words: the number of whitespace-separated words in those documents
        """
        super().increment_progressbar(
            queue,
            files=files,
            docs=docs,
            words=words,
        )

    @classmethod
    def process_single(
        cls,
        source_path: str,
        destination_path: str,
        queue: Queue,
        **kwargs: Any,
    ):
        """
        This method is called for each file. It reads the file line by line,
        converts every JSON record into a Dolma document, and writes it to the
        destination file (".gz" is appended to destinations ending in ".jsonl").

        Raises:
            ValueError: if a record matches none of the known layouts
                (arxiv, web, code).
        """

        update_every_n_lines = 10_000
        docs = 0
        words = 0

        # Fallback creation date for documents that carry no date of their own
        default_created = datetime.datetime(2024, 1, 23)

        with ExitStack() as stack:
            # open source and destination files
            source_file = stack.enter_context(
                smart_open.open(source_path, "rt")
            )
            if destination_path.endswith(".jsonl"):
                destination_path += ".gz"

            destination_file = stack.enter_context(
                smart_open.open(destination_path, "wt")
            )

            # the two directories above the file name identify source and subset
            *_, source, subset, _ = source_path.split("/")
            for ln in source_file:
                # Reset for every document: previously the value computed for one
                # document was carried into the next, so the min() in the code
                # branch mixed dates across unrelated documents.
                created = default_created

                # we first load the json document
                document = json.loads(ln)
                docs += 1
                docid = md5((ln + source + subset).encode('utf-8')).hexdigest()

                metadata = document.pop("meta")

                if "title" in document and "abstract" in document and "text" in document:
                    # arxiv subset
                    text = f"{document['title']}\n\n{document['abstract']}\n\n{document['text']}"
                    metadata["subset"] = subset
                    metadata["path"] = source_path
                    metadata["url"] = document.pop("url")
                    created = parse_date_arxiv(metadata["timestamp"])

                elif "url" in document and "date" in document:
                    # this is web content
                    created = parse_date_web(document["date"])
                    metadata["date"] = document["date"]
                    metadata["url"] = document["url"]
                    metadata["path"] = source_path
                    text = document["text"]
                elif "text" in document:
                    # this is a code document: use the earliest star/fork event, if any,
                    # as long as it predates the fallback date
                    if metadata.get("max_stars_repo_stars_event_min_datetime", None) is not None:
                        created = min(
                            parse_code_date(metadata["max_stars_repo_stars_event_min_datetime"]),
                            created
                        )
                    if metadata.get("max_forks_repo_forks_event_min_datetime", None) is not None:
                        created = min(
                            parse_code_date(metadata["max_forks_repo_forks_event_min_datetime"]),
                            created
                        )
                    text = document["text"]
                else:
                    raise ValueError(f"Unknown document type: {document}")

                output = {
                    "text": text.strip(),
                    "source": f"{source}_{subset}",
                    "added": format_to_dolma_timestamp(),
                    "created": format_to_dolma_timestamp(created),
                    "id": docid,
                    "metadata": metadata
                }

                words += len(text.split())

                # every document is written out, one JSON object per line
                destination_file.write(json.dumps(output) + "\n")

                # we update the progress bar every
                # update_every_n_lines
                if docs > update_every_n_lines:
                    cls.increment_progressbar(queue, docs=docs, words=words)
                    docs = 0
                    words = 0

            # we update the progress bar one last time
            cls.increment_progressbar(
                queue,
                files=1,
                docs=docs,
                words=words,
            )
#! /usr/bin/env bash

# Tokenize the CodeSearchNet train documents with the dolma2 tokenizer.
# -u aborts if HOME is unset instead of silently writing under "/ai2-llm";
# pipefail is enabled alongside it as part of the usual strict mode.
set -euxo pipefail


dolma tokens \
    --documents 's3://ai2-llm/pretraining-data/sources/code_search_net/v0/documents/train/*/*.jsonl.gz' \
    --destination "${HOME}/ai2-llm/preprocessed/code_search_net/v0/train/allenai/dolma2-tokenizer" \
    --tokenizer.name_or_path 'allenai/dolma2-tokenizer' \
    --tokenizer.eos_token_id 100257 \
    --tokenizer.pad_token_id 100277 \
    --no-tokenizer.segment_before_tokenization \
    --tokenizer.encode_special_tokens \
    --processes 16 \
    --max_size 100_000_000 \
    --dtype 'uint32' \
    --sample_ring_prop
def main():
    """Export every language/split of CodeSearchNet as sharded Dolma JSONL files.

    Each (language, split) pair is written under ``{destination}/{split}/{language}/``
    in shards of at most ``max_docs_per_file`` documents. A shard is opened only when
    there is a document to put in it, so no empty trailing shard is produced when the
    row count is an exact multiple of ``max_docs_per_file``.
    """
    created = format_to_dolma_timestamp(datetime.datetime(2019, 9, 20))

    with tqdm.tqdm(unit=" docs", unit_scale=True) as pbar:
        for language in ["python", "java", "javascript", "go", "ruby", "php"]:
            for split in ["train", "validation", "test"]:
                pbar.set_description(f"Processing {language}/{split}")

                # Load before opening any output so a failed download leaves no empty file behind.
                dataset = datasets.load_dataset(dataset_name, language, split=split)

                fn = 0
                cnt = 0
                f = None
                with ExitStack() as stack:
                    for row in dataset:
                        if f is None:
                            path = f"{destination}/{split}/{language}/{fn:04d}.jsonl.gz"
                            print(f"\nCreating new output file {path}")
                            f = stack.enter_context(smart_open.open(path, "wt"))

                        doc = {
                            "id": md5(row["func_code_url"].encode("utf-8")).hexdigest(),
                            "text": row.pop("whole_func_string"),
                            "source": f"{dataset_name}_{language}_{split}",
                            "added": format_to_dolma_timestamp(),
                            "created": created,
                            "metadata": row
                        }
                        f.write(json.dumps(doc) + "\n")

                        pbar.update(1)
                        cnt += 1
                        if cnt >= max_docs_per_file:
                            # shard is full: close it now, the next row (if any) opens the next one
                            stack.pop_all().close()
                            f = None
                            fn += 1
                            cnt = 0
def main():
    """Convert the MetaMathQA split into a single Dolma-formatted JSONL file at ``destination``.

    Each row becomes one document whose text is the query followed by the response;
    the untouched row is stored under ``metadata``.
    """
    dataset = datasets.load_dataset(dataset_name, split=split)

    # identical for every row, so format it once
    created = format_to_dolma_timestamp(datetime.datetime(2023, 10, 7))

    with smart_open.open(destination, "wt") as f:
        for row in tqdm.tqdm(dataset, desc="Processing dataset"):
            doc_id = md5(json.dumps(row).encode("utf-8")).hexdigest()
            text = row["query"] + "\n" + row["response"]
            source = f"{dataset_name}_{row['type']}_{split}"
            added = format_to_dolma_timestamp()

            output = {
                "text": text,
                "id": doc_id,
                "source": source,
                "added": added,
                "created": created,
                "version": version,
                # "metadata" (not "meta") is the key the other source converters write
                "metadata": {**row}
            }
            f.write(json.dumps(output) + "\n")
"__main__": + main() diff --git a/sources/books/openstax.py b/sources/books/openstax.py new file mode 100644 index 00000000..e69de29b diff --git a/sources/eli5/v0.py b/sources/eli5/v0.py new file mode 100644 index 00000000..2c191b09 --- /dev/null +++ b/sources/eli5/v0.py @@ -0,0 +1,183 @@ +import pandas as pd +from pathlib import Path +import json +import smart_open +from ftfy import fix_text +import re +from contextlib import ExitStack +import datetime + +import tqdm + +DESTINATION_S3 = "s3://ai2-llm/pretraining-data/sources/max-hoffman_eli5/v0/documents" +DCLM_SUBMISSION_SCORE = 3 +DCLM_COMMENT_SCORE = 5 +DCLM_MIN_ANSWERS = 3 +ELI5_CREATED_AT = datetime.datetime(2019, 7, 22) + + +def format_to_dolma_timestamp(timestamp: datetime.datetime | None = None) -> str: + """Format a timestamp as a string using near ISO-8601 format.""" + if timestamp is None: + timestamp = datetime.datetime.now() + return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f")[:23] + "Z" + + + +def safe_json_loads(s: str) -> dict | None: # pyright: ignore + try: + return json.loads(s) + except json.JSONDecodeError: + return None + + +def replace_urls(row: pd.Series) -> str: + text, urls = tuple(row) + if not text: + return text + for i, url in enumerate(urls['url']): + text = text.replace(f"_URL_{i}_", url) + return text + + +def read_eli5_data(split: str, data_dir: str = "/Users/lucas/code/eli5/data"): + data_path = Path(data_dir) + df = pd.read_parquet(data_path / f"eli5_{split}.parquet") + + df["title_urls"] = df["title_urls"].apply(json.loads) + df["selftext_urls"] = df["selftext_urls"].apply(json.loads) + df["answers_urls"] = df["answers_urls"].apply(json.loads) + + # # replace urls in title, selftext, and answers + df["title_with_urls"] = df[["title", "title_urls"]].apply(replace_urls, axis=1) + df["selftext_with_urls"] = df[["selftext", "selftext_urls"]].apply(replace_urls, axis=1) + df["answers_with_urls"] = df[["answers", "answers_urls"]].apply(replace_urls, axis=1) + + # this is the one that 
def main():
    """Convert every ELI5 split into four Dolma-formatted outputs under ``DESTINATION_S3``.

    For each split the following files are written:
    - ``conversation``: one document per question, with title, selftext and all answers;
    - ``individual``: one document per (question, answer) pair;
    - ``individual_filtered``: the pairs whose answer score is at least ``DCLM_COMMENT_SCORE``;
    - ``dclm``: at most one pair per question, taken from questions with at least
      ``DCLM_MIN_ANSWERS`` answers: the highest-scoring answer (longest on ties) whose
      score is at least ``DCLM_COMMENT_SCORE``.
    """

    for split in ["test", "validation", "train"]:
        df = read_eli5_data(split)
        eli5_created_at = format_to_dolma_timestamp(ELI5_CREATED_AT)

        with ExitStack() as stack:
            full_file = stack.enter_context(smart_open.open(f"{DESTINATION_S3}/conversation/{split}.jsonl.gz", "w"))
            dclm_file = stack.enter_context(smart_open.open(f"{DESTINATION_S3}/dclm/{split}.jsonl.gz", "w"))
            format_file = stack.enter_context(smart_open.open(f"{DESTINATION_S3}/individual/{split}.jsonl.gz", "w"))
            screen_file = stack.enter_context(smart_open.open(f"{DESTINATION_S3}/individual_filtered/{split}.jsonl.gz", "w"))

            for i, row in tqdm.tqdm(df.iterrows(), total=len(df), desc=f"Processing {split}"):
                all_text_and_answers = (
                    str(row["title_with_urls"]),
                    str(row["selftext_with_urls"]),
                    *[str(text) for text in row['answers_with_urls']['text']]
                )

                # use two newlines as separator or maximum number of newlines in the text, plus one
                spacing = max(
                    [len(span) for text in all_text_and_answers for span in re.findall(r'\n+', text)] + [1]
                )

                # join with one more newline than the longest run found inside any piece
                full_text = ("\n" * (spacing + 1)).join(all_text_and_answers)

                answer_urls = {
                    f"_URL_{i}_": url for i, url in enumerate(row['answers_urls']['url'])
                }

                metadata = {
                    "q_id": str(row["q_id"]),
                    "title": {
                        "text": str(row["title"]),
                        "urls": [str(url) for url in row["title_urls"]["url"]]
                    },
                    "selftext": {
                        "text": str(row["selftext"]),
                        "urls": [str(url) for url in row["selftext_urls"]["url"]]
                    },
                    "answers": [
                        {
                            "a_id": str(a_id),
                            "text": str(text),
                            "score": int(score),
                            'urls': [str(url) for u_id, url in answer_urls.items() if u_id in text]
                        }
                        for a_id, text, score in
                        zip(row['answers']['a_id'], row['answers']['text'], row['answers']['score'])
                    ]
                }

                full_document = {
                    "text": full_text,
                    "id": str(row["q_id"]),
                    "source": "eli5",
                    "version": "v0_conversation",
                    "created": eli5_created_at,
                    "added": format_to_dolma_timestamp(),
                    "metadata": metadata
                }

                full_file.write(json.dumps(full_document) + "\n")

                dclm_answer = None

                title = fix_text(str(row["title"]))

                # Ascending by score, ties broken by answer length, so the last answer that passes
                # the threshold below is the best one. A tuple key is used because encoding the
                # length as a decimal fraction ("5.10" < "5.5") ordered ties incorrectly.
                for score, a_id, answer in sorted(
                    zip(row['answers']['score'], row['answers']['a_id'], row['answers_with_urls']['text']),
                    key=lambda x: (x[0], len(x[2]))
                ):
                    # use two newlines as separator or maximum number of newlines in the text, plus one
                    spacing = max(
                        [len(span) for span in re.findall(r'\n+', title)] +
                        [len(span) for span in re.findall(r'\n+', answer)] +
                        [1]
                    )
                    text = ("\n" * (spacing + 1)).join([title, fix_text(answer)])
                    answer_metadata = {
                        **{k: v for k, v in metadata.items() if k != "answers"},
                        **[entry for entry in metadata["answers"] if entry["a_id"] == a_id][0]  # pyright: ignore
                    }
                    answer_document = {
                        "text": text,
                        "id": f"{row['q_id']}_{a_id}",
                        "source": "eli5",
                        "version": "v0_individual",
                        "created": eli5_created_at,
                        "added": format_to_dolma_timestamp(),
                        "metadata": answer_metadata
                    }

                    format_file.write(json.dumps(answer_document) + "\n")

                    if score >= DCLM_COMMENT_SCORE and len(row['answers']['a_id']) >= DCLM_MIN_ANSWERS:
                        # overwritten on every qualifying answer; the sort order makes the last one the best
                        dclm_answer = {**answer_document, "version": "v0_dclm"}

                    if score >= DCLM_COMMENT_SCORE:
                        screen_document = {**answer_document, "version": "v0_screen"}
                        screen_file.write(json.dumps(screen_document) + "\n")

                if dclm_answer:
                    dclm_file.write(json.dumps(dclm_answer) + "\n")
a/sources/mathpile/tokens.py b/sources/mathpile/tokens.py new file mode 100644 index 00000000..dec43b2b --- /dev/null +++ b/sources/mathpile/tokens.py @@ -0,0 +1,37 @@ +from copy import deepcopy +from dolma.cli.tokenizer import TokenizationConfig, TokenizerConfig, TokenizerCli +from multiprocessing import cpu_count +import numpy as np +import os + + +def main(): + base_config = TokenizationConfig( + documents=[], + destination=f"{os.environ['HOME'].rstrip('/')}/ai2-llm/preprocessed/mathpile", + tokenizer=TokenizerConfig( + name_or_path="allenai/dolma2-tokenizer", + bos_token_id=None, + eos_token_id=100257, + pad_token_id=100277, + segment_before_tokenization=False, + encode_special_tokens=True, + ), + processes=cpu_count(), + max_size=100_000_000, + dtype='uint32', + sample_ring_prop=True, + ) + + for name in ["MathPile", "MathPile_Commercial"]: + for split in ["train", "validation"]: + for subset in ["arXiv", "commoncrawl", "proofwiki", "stackexchange", "textbooks", "wikipedia"]: + config = deepcopy(base_config) + config.documents = [ + f"/data/mathpile/v0/documents/{name}/{split}/{subset}/*" + ] + config.destination = f"{config.destination}/{name}/{split}/{subset}/{config.tokenizer.name_or_path}" + TokenizerCli.run(config) + +if __name__ == "__main__": + main() diff --git a/sources/mathpile/v0.py b/sources/mathpile/v0.py new file mode 100644 index 00000000..1c765714 --- /dev/null +++ b/sources/mathpile/v0.py @@ -0,0 +1,149 @@ +from contextlib import ExitStack +from hashlib import md5 +from tempfile import TemporaryDirectory +from typing import Any, Optional +import datetime +from queue import Queue +import json +from multiprocessing import cpu_count + +import smart_open +from dolma.core.parallel import BaseParallelProcessor + + +def format_to_dolma_timestamp(timestamp: Optional[datetime.datetime] = None) -> str: + """Format a timestamp as a string using near ISO-8601 format.""" + if timestamp is None: + timestamp = datetime.datetime.now() + return 
class MathpileProcessor(BaseParallelProcessor):
    """Convert raw MathPile files into Dolma-formatted documents."""

    @classmethod
    def increment_progressbar(
        cls,
        queue: Queue,
        /,
        files: int = 0,
        docs: int = 0,
        words: int = 0,
    ):
        """
        Forward progress counters to the parent implementation. We keep
        track of three things:
        - files: the number of files processed
        - docs: the number of documents processed
        - words: the number of whitespace-separated words in those documents
        """
        super().increment_progressbar(
            queue,
            files=files,
            docs=docs,
            words=words,
        )

    @classmethod
    def process_single(
        cls,
        source_path: str,
        destination_path: str,
        queue: Queue,
        **kwargs: Any,
    ):
        """
        This method is called for each file. It reads the file line by line,
        converts every JSON record into a Dolma document, and writes it to
        the destination file.

        Raises:
            ValueError: if a record has neither a "text" field nor a
                "question"/"answers" pair.
        """

        update_every_n_lines = 10_000
        docs = 0
        words = 0
        with ExitStack() as stack:
            # open source and destination files
            source_file = stack.enter_context(
                smart_open.open(source_path, "rt")
            )
            destination_file = stack.enter_context(
                smart_open.open(destination_path, "wt")
            )

            # Set a fixed creation date
            created = datetime.datetime(2023, 12, 29)

            # the three directories above the file name identify source, split and subset
            *_, source, split, path_subset, _ = source_path.split("/")
            for ln in source_file:
                # we first load the json document
                document = json.loads(ln)
                docs += 1

                # Always hash the path-derived subset. Previously this variable was overwritten
                # by the document's own "subset" field below, so every id after the first
                # was computed from the *previous* document's subset.
                docid = md5((ln + source + split + path_subset).encode('utf-8')).hexdigest()

                metadata = {}

                if "text" in document:
                    text = document.pop("text")
                elif "question" in document and "answers" in document:
                    question = document.pop("question")
                    answers = document.pop("answers")

                    text = f"{question.pop('Title').strip()}\n{question.pop('Body').strip()}\n\n"
                    metadata.update({f"question_{k}": v for k, v in question.items()})

                    for answer in answers:
                        text += f"{answer.pop('Body').strip()}\n\n"
                        # NOTE(review): keys are not namespaced per answer, so when answers share
                        # field names only the last answer's values are kept; confirm this is intended
                        metadata.update({f"answer_{k}": v for k, v in answer.items()})
                else:
                    raise ValueError(f"Unknown document type: {document}")

                # subset as declared by the document itself; used for source and metadata
                subset = document.pop("subset")

                output = {
                    "text": text.strip(),
                    "source": f"{source}_{subset}_{split}",
                    "added": format_to_dolma_timestamp(),
                    "created": format_to_dolma_timestamp(created),
                    "id": docid,
                    "metadata": {**document, **metadata, "subset": subset, "split": split, "source": source}
                }

                words += len(text.split())

                # every document is written out, one JSON object per line
                destination_file.write(json.dumps(output) + "\n")

                # we update the progress bar every
                # update_every_n_lines
                if docs > update_every_n_lines:
                    cls.increment_progressbar(queue, docs=docs, words=words)
                    docs = 0
                    words = 0

            # we update the progress bar one last time
            cls.increment_progressbar(
                queue,
                files=1,
                docs=docs,
                words=words,
            )
+ return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f")[:23] + "Z" + + +with smart_open.open(DESTINATION_S3, 'w') as f: + + for row in tqdm(dataset): + spacing = max( + [len(span) for turn in row["conversations"] for span in re.findall(r'\n+', turn['value'])] + [1] + ) + text = ("\n" * (spacing + 1)).join(turn['value'] for turn in row["conversations"]) + + row_id = row['id'] or hashlib.md5(json.dumps(row).encode('utf-8')).hexdigest() + + source = f'openhermes-2.5' + if row['source']: + source += f'-{row["source"]}' + + version = 'v1' + + document = { + 'id': row_id, + 'source': source, + 'version': version, + 'text': text, + 'added': format_to_dolma_timestamp(), + 'created': format_to_dolma_timestamp(OPENHERMES_DATE), + 'metadata': row, + } + + f.write(json.dumps(document) + '\n') diff --git a/sources/stackexchange/README.md b/sources/stackexchange/README.md new file mode 100644 index 00000000..6449c4ff --- /dev/null +++ b/sources/stackexchange/README.md @@ -0,0 +1,261 @@ +# Stack Exchange + +## Instructions + +1. Download the Stack Exchange data from the Internet Archive using the [`download_from_ia.sh`](download_from_ia.sh) script. +2. Convert data to parquet using the [`v0.py`](v0.py) script. +3. 
Load the data into Athena as follows: + +Create comments table: + +```sql +CREATE EXTERNAL TABLE IF NOT EXISTS `lucas`.`se_comments_20240930` ( + Id STRING, + PostId STRING, + Score STRING, + Text STRING, + CreationDate STRING, + UserID STRING, + ContentLicense STRING +) +PARTITIONED BY (forum STRING) +STORED AS PARQUET +LOCATION 's3://ai2-llm/pretraining-data/sources/stackexchange/raw/20240930_parquet/comments/' +TBLPROPERTIES ('parquet.compression'='SNAPPY') +``` + +Then run the following to load the partitions: + +```sql +MSCK REPAIR TABLE lucas.se_comments_20240930; +``` + +Create posts table: + +```sql +CREATE EXTERNAL TABLE IF NOT EXISTS `lucas`.`se_posts_20240930` ( + AcceptedAnswerId BIGINT, + AnswerCount BIGINT, + Body STRING, + ClosedDate STRING, + CommentCount BIGINT, + ContentLicense STRING, + CreationDate STRING, + Id BIGINT, + LastActivityDate STRING, + LastEditDate STRING, + LastEditorDisplayName STRING, + LastEditorUserId BIGINT, + OwnerDisplayName STRING, + OwnerUserId BIGINT, + ParentId BIGINT, + PostTypeId STRING, + Score BIGINT, + Tags STRING, + Title STRING, + ViewCount BIGINT +) +PARTITIONED BY (forum STRING) +STORED AS PARQUET +LOCATION 's3://ai2-llm/pretraining-data/sources/stackexchange/raw/20240930_parquet/posts/' +TBLPROPERTIES ('parquet.compression'='SNAPPY') +``` + +Then run the following to load the partitions: + +```sql +MSCK REPAIR TABLE lucas.se_posts_20240930; +``` + +# Selecting QA pairs + + +```sql +UNLOAD ( + WITH valid_questions AS ( + SELECT + posts.Body, + posts.Id, + posts.CommentCount, + posts.ContentLicense, + posts.CreationDate, + posts.LastActivityDate, + posts.LastEditDate, + posts.LastEditorDisplayName, + posts.LastEditorUserId, + posts.OwnerUserId, + posts.OwnerDisplayName, + posts.Score, + posts.Tags, + posts.ViewCount, + posts.Title, + posts.Forum, + posts.AcceptedAnswerid + FROM "lucas"."se_posts_20240930" as posts + WHERE + posttypeid = 'Question' + AND posts.AnswerCount > 0 + AND posts.acceptedanswerid >= 0 + + 
joined_questions_answers AS (
        -- Pair every question with its accepted answer: same forum, and the
        -- answer id equals the question's AcceptedAnswerId.
        SELECT
            valid_answers.Body AS answer_body,
            valid_answers.Id AS answer_id,
            valid_answers.CommentCount AS answer_comment_count,
            valid_answers.ContentLicense AS answer_content_license,
            valid_answers.CreationDate AS answer_creation_date,
            valid_answers.LastActivityDate AS answer_last_activity_date,
            valid_answers.LastEditDate AS answer_last_edit_date,
            valid_answers.LastEditorDisplayName AS answer_last_editor_display_name,
            valid_answers.LastEditorUserId AS answer_last_editor_user_id,
            valid_answers.OwnerUserId AS answer_owner_user_id,
            valid_answers.OwnerDisplayName AS answer_owner_display_name,
            valid_answers.Score AS answer_score,
            valid_answers.ViewCount AS answer_view_count,
            valid_answers.Forum AS answer_forum,
            valid_questions.Title AS question_title,
            valid_questions.Body AS question_body,
            valid_questions.Id AS question_id,
            valid_questions.CommentCount AS question_comment_count,
            valid_questions.ContentLicense AS question_content_license,
            valid_questions.CreationDate AS question_creation_date,
            valid_questions.LastActivityDate AS question_last_activity_date,
            valid_questions.LastEditDate AS question_last_edit_date,
            valid_questions.LastEditorDisplayName AS question_last_editor_display_name,
            valid_questions.LastEditorUserId AS question_last_editor_user_id,
            valid_questions.OwnerUserId AS question_owner_user_id,
            valid_questions.OwnerDisplayName AS question_owner_display_name,
            valid_questions.Score AS question_score,
            valid_questions.Tags AS question_tags,
            valid_questions.ViewCount AS question_view_count,
            valid_questions.Forum AS question_forum,
            -- Longest run of consecutive newlines inside the QUESTION body
            -- (ARRAY [1] guarantees a minimum of 1 when the body has none).
            -- The final SELECT uses it to size the title/body separator.
            CAST (
                ARRAY_MAX(
                    TRANSFORM(
                        regexp_extract_all(valid_questions.body, '\n+'),
                        x -> LENGTH(x)
                    )
                    || ARRAY [1]
                ) AS INTEGER
            ) as question_max_newline,
            -- Same measure for the ANSWER body. The two source columns were
            -- previously swapped, so each alias described the other text.
            CAST (
                ARRAY_MAX(
                    TRANSFORM(
                        regexp_extract_all(valid_answers.body, '\n+'),
                        x -> LENGTH(x)
                    )
                    || ARRAY [1]
                ) AS INTEGER
            ) as answer_max_newline
        FROM valid_answers
        INNER JOIN valid_questions
            ON valid_questions.forum = valid_answers.forum
            AND valid_questions.acceptedanswerid = valid_answers.id
    )
#!/usr/bin/env bash
#
# Download every ".7z" file of an Internet Archive collection with aria2c.
#
# Usage:
#   download_from_ia.sh -c <collection_id> -d <destination> \
#     [-n <num_processes>] [-k <num_chunks>]
#
# Requires curl, jq and aria2c on the PATH.

set -euo pipefail

# Temporary file holding one download URL per line; removed on any exit path.
url_list=""

# Remove the temporary URL list, if it was created.
cleanup() {
  if [[ -n "${url_list}" ]]; then
    rm -f -- "${url_list}"
  fi
}
trap cleanup EXIT

# Print an error message to stderr.
error() {
  printf 'Error: %s\n' "$*" >&2
}

# Print usage information and exit.
# Arguments: $1 - exit status (default: 1). Status 0 (explicit --help) prints
#                 to stdout; any other status prints to stderr.
usage() {
  local status="${1:-1}"
  local fd=2
  if [[ "${status}" -eq 0 ]]; then
    fd=1
  fi
  cat >&"${fd}" <<EOF
Usage: $0 -c|--collection-id <collection_id> -d|--destination <destination> [-n|--num-processes <num_processes>] [-k|--num-chunks <num_chunks>]
  -c, --collection-id  : The ID of the Internet Archive collection (required)
  -d, --destination    : Location where to save each file from the collection (required)
  -n, --num-processes  : Number of parallel downloads to use (default: 1)
  -k, --num-chunks     : Number of chunks to split the collection into (default: 1)
EOF
  exit "${status}"
}

# Abort with a usage error unless option "$1" is followed by a value.
# Without this guard `shift 2` fails when the option is the last argument,
# nothing is consumed, and the parsing loop never terminates.
require_value() {
  if [[ $# -lt 2 ]]; then
    error "Option $1 requires a value"
    usage
  fi
}

# Succeed when $1 is a decimal integer >= 1 (10# avoids octal parsing of "08").
is_positive_int() {
  [[ "$1" =~ ^[0-9]+$ ]] && ((10#$1 >= 1))
}

main() {
  local collection_id=""
  local destination=""
  local num_processes=1
  local num_chunks=1

  # Parse command-line arguments
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -c | --collection-id)
        require_value "$@"
        collection_id="$2"
        shift 2
        ;;
      -d | --destination)
        require_value "$@"
        destination="$2"
        shift 2
        ;;
      -n | --num-processes)
        require_value "$@"
        num_processes="$2"
        shift 2
        ;;
      -k | --num-chunks)
        require_value "$@"
        num_chunks="$2"
        shift 2
        ;;
      -h | --help)
        usage 0
        ;;
      *)
        printf 'Unknown option: %s\n' "$1" >&2
        usage
        ;;
    esac
  done

  # Check if required arguments are provided
  if [[ -z "${collection_id}" ]]; then
    error "Collection ID is required"
    usage
  fi
  if [[ -z "${destination}" ]]; then
    error "Destination is required"
    usage
  fi

  if ! is_positive_int "${num_processes}"; then
    error "num_processes must be an integer greater than or equal to 1"
    usage
  fi
  if ! is_positive_int "${num_chunks}"; then
    error "num_chunks must be an integer greater than or equal to 1"
    usage
  fi

  # Check that every external tool used below is available
  local tool
  for tool in aria2c jq curl; do
    if ! command -v "${tool}" > /dev/null 2>&1; then
      error "${tool} is not installed or not in the system PATH"
      exit 1
    fi
  done

  url_list=$(mktemp)

  # Turn the collection metadata into one download URL per .7z file.
  local -r jq_filter='{collection_id: .metadata.identifier, name: .files[].name} | select(.name | endswith(".7z")) | "https://archive.org/download/\(.collection_id)/\(.name)"'

  # --fail makes HTTP errors visible; pipefail surfaces a failure of either stage.
  if ! curl -fsS "https://archive.org/metadata/${collection_id}" \
    | jq -r "${jq_filter}" > "${url_list}"; then
    error "Could not list the files of collection '${collection_id}'"
    exit 1
  fi

  # make destination directory if it doesn't exist
  mkdir -p -- "${destination}"

  local num_files
  num_files=$(wc -l < "${url_list}")
  num_files=$((num_files)) # strip the padding some wc implementations add
  echo "Downloading ${num_files} files"

  if [[ "${num_files}" -eq 0 ]]; then
    echo "No files to download"
    exit 1
  fi

  # Download each file in parallel. A failure here is now the script's exit
  # status (it used to be hidden behind the final `rm`).
  aria2c \
    --continue \
    --split "${num_chunks}" \
    --max-connection-per-server "${num_chunks}" \
    -k 1M \
    -j "${num_processes}" \
    -i "${url_list}" \
    -d "${destination}" \
    --show-console-readout=true \
    --summary-interval=5 \
    --console-log-level=notice
}

main "$@"
b/sources/stackexchange/v0.py new file mode 100644 index 00000000..f715a276 --- /dev/null +++ b/sources/stackexchange/v0.py @@ -0,0 +1,236 @@ +import argparse +import os +import re +from contextlib import ExitStack +from typing import Any, Callable, Iterator + +import libarchive # pyright: ignore +import py7zr # pyright: ignore +from resiliparse.extract.html2text import extract_plain_text # pyright: ignore +import pyarrow as pa +import pyarrow.parquet as pq +from libarchive.entry import ArchiveEntry # pyright: ignore +from lxml import etree # pyright: ignore +from tqdm import tqdm + +os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace" + + +post_types = { + "1": "Question", + "2": "Answer", + "3": "Orphaned tag wiki", + "4": "Tag wiki excerpt", + "5": "Tag wiki", + "6": "Moderator nomination", + "7": "Wiki placeholder", + "8": "Privilege wiki", + "9": "Article", + "10": "HelpArticle", + "11": "Unknown", + "12": "Collection", + "13": "ModeratorQuestionnaireResponse", + "14": "Announcement", + "15": "CollectiveDiscussion", + "16": "CollectiveCollection", +} + +POSTS_MAP: dict[str, Callable[[str | None], Any]] = { + "AcceptedAnswerId": lambda x: int(x or 0), + "AnswerCount": lambda x: int(x or 0), + "Body": lambda x: extract_plain_text(x or "").strip(), + "ClosedDate": lambda x: str(x or ""), + "CommentCount": lambda x: int(x or 0), + "CommunityOwnedDate": lambda x: str(x or ""), + "ContentLicense": lambda x: str(x or ""), + "CreationDate": lambda x: str(x or ""), + "Id": lambda x: int(x or 0), + "LastActivityDate": lambda x: str(x or ""), + "LastEditDate": lambda x: str(x or ""), + "LastEditorDisplayName": lambda x: str(x or ""), + "LastEditorUserId": lambda x: int(x or 0), + "OwnerDisplayName": lambda x: str(x or ""), + "OwnerUserId": lambda x: int(x or 0), + "ParentId": lambda x: int(x or 0), + "PostTypeId": lambda x: post_types.get(x or "11", "Unknown"), + "Score": lambda x: int(x or 0), + "Tags": lambda x: str(x or ""), + "Title": lambda x: str(x or ""), + 
"ViewCount": lambda x: int(x or 0), +} + +COMMENTS_MAP: dict[str, Callable[[str | None], Any]] = { + "ContentLicense": lambda x: str(x or ""), + "CreationDate": lambda x: str(x or ""), + "Id": lambda x: int(x or 0), + "PostId": lambda x: int(x or 0), + "Score": lambda x: int(x or 0), + "Text": lambda x: str(x or ""), + "UserDisplayName": lambda x: str(x or ""), + "UserId": lambda x: int(x or 0), +} + +USERS_MAP: dict[str, Callable[[str | None], Any]] = { + "Id": lambda x: int(x or 0), + "Reputation": lambda x: int(x or 0), + "CreationDate": lambda x: str(x or ""), + "DisplayName": lambda x: str(x or ""), + "LastAccessDate": lambda x: str(x or ""), + "WebsiteUrl": lambda x: str(x or ""), + "Location": lambda x: str(x or ""), + "AboutMe": lambda x: str(x or ""), + "Views": lambda x: int(x or 0), + "UpVotes": lambda x: int(x or 0), + "DownVotes": lambda x: int(x or 0), + "ProfileImageUrl": lambda x: str(x or ""), + "EmailHash": lambda x: str(x or ""), + "AccountId": lambda x: int(x or 0), +} + + +def get_7z_uncompressed_size(sz_path, entry_name): + with py7zr.SevenZipFile(sz_path, mode="r") as z: + for entry in z.list(): + if entry.filename == entry_name: + return entry.uncompressed + raise FileNotFoundError(f"File {entry_name} not found in archive {sz_path}") + + +def stream_xml_from_7z( + archive_path: str, filename: str, target_xpath: str = "//*", block_size: int = 8192 +) -> Iterator[etree._Element]: + """ + Stream XML nodes from a file within a 7z archive, parsing them lazily. + + Args: + archive_path (str): Path to the 7z archive + filename (str): Name of the XML file within the archive + target_xpath (str, optional): XPath expression to filter nodes. Defaults to "//*". + block_size (int, optional): Size of blocks to read. Defaults to 8192. 
def stream_xml_from_7z(
    archive_path: str, filename: str, target_xpath: str = "//*", block_size: int = 8192
) -> Iterator[bytes]:
    """
    Stream the lines of one file stored inside a 7z archive, without extracting it.

    The member is read block by block and split on ``\\n``, ``\\r\\n`` or ``\\r``;
    each line is yielded as raw bytes without its line terminator. No XML parsing
    happens here: callers parse the lines they are interested in.

    Args:
        archive_path (str): Path to the 7z archive
        filename (str): Name of the file within the archive
        target_xpath (str, optional): Unused; kept so existing callers keep working.
        block_size (int, optional): Size of blocks to read. Defaults to 8192.

    Yields:
        bytes: One line of the file at a time. A line break that straddles two
        blocks (``\\r`` at the end of one, ``\\n`` at the start of the next) shows
        up as one extra empty line, so callers should tolerate empty lines.

    Note:
        If ``filename`` is not present in the archive, nothing is yielded and no
        error is raised.
    """
    with libarchive.file_reader(archive_path) as archive:
        # Find the target file in the archive
        for entry in archive:
            if entry.pathname != filename:
                continue

            archive_name = os.path.basename(archive_path)
            # Context manager so the bar is closed even if the consumer stops early.
            with tqdm(
                total=get_7z_uncompressed_size(archive_path, filename),
                desc=f"Bytes {archive_name}::{filename}",
                unit="B",
                unit_scale=True,
            ) as pbar:
                prev_line = b""  # partial line carried over from the previous block
                for chunk in entry.get_blocks(block_size):
                    pbar.update(len(chunk))
                    first_seg, *segments = re.split(b"\r*\n|\r", chunk)
                    if segments:
                        # there's at least one line break in the chunk, so the carried line is complete
                        yield prev_line + first_seg
                        yield from segments[:-1]
                        prev_line = segments[-1]
                    else:
                        # no line breaks in the chunk, so we need to accumulate it
                        prev_line += chunk

                # The last line has no terminator to trigger it; emit it explicitly.
                if prev_line:
                    yield prev_line

            # The requested member has been fully read; skip the rest of the archive.
            return
def main():
    """
    Command-line entry point: convert Stack Exchange 7z XML dumps to Parquet.

    ``archive_path`` may be a single archive or a directory; for a directory,
    every ``*.7z`` file directly inside it is converted, in sorted order. For
    each archive, Posts.xml, Comments.xml and Users.xml are written to
    ``<output_dir>/<posts|comments|users>/forum=<forum name>/``.
    """
    parser = argparse.ArgumentParser(description="Convert Stack Exchange 7z XML dumps to Parquet format")
    parser.add_argument("archive_path", help="Path to the 7z archive")
    parser.add_argument("output_dir", help="Directory where Parquet files will be saved")
    parser.add_argument(
        "--batch-size", type=int, default=100000, help="Number of rows to process at once (default: 100000)"
    )
    parser.add_argument("--block-size", type=int, default=8192, help="Size of blocks to read (default: 8192)")

    args = parser.parse_args()

    if os.path.isdir(args.archive_path):
        # Sorted so runs are reproducible (os.listdir order is arbitrary); match the
        # real ".7z" extension rather than any name that merely ends in "7z".
        archive_paths = sorted(
            os.path.join(args.archive_path, p) for p in os.listdir(args.archive_path) if p.endswith(".7z")
        )
    else:
        archive_paths = [args.archive_path]

    entries = [("Posts.xml", POSTS_MAP), ("Comments.xml", COMMENTS_MAP), ("Users.xml", USERS_MAP)]

    for archive_path in tqdm(archive_paths, desc="Archives"):
        # "3dprinting.stackexchange.com.7z" -> "3dprinting_stackexchange_com"
        archive_stem = os.path.basename(archive_path).rsplit(".", 1)[0]
        clean_forum_name = archive_stem.lower().replace(".", "_")

        for entry_name, entry_map in entries:
            clean_entry_name = entry_name.split(".", 1)[0].lower()
            output_path = os.path.join(args.output_dir, clean_entry_name, f"forum={clean_forum_name}")
            process_file(
                archive_path=archive_path,
                output_dir=output_path,
                entry_name=entry_name,
                entry_map=entry_map,  # pyright: ignore
                batch_size=args.batch_size,
                block_size=args.block_size,
            )
def make_search_parser():
    """Build the command-line parser: index path, query documents, hit count and output directory."""
    parser = argparse.ArgumentParser("Interactive search tool on a tantivy index")
    parser.add_argument("-i", "--index-path", type=str, required=True, help="The path to the index.")
    parser.add_argument(
        "-d",
        "--documents",
        type=str,
        required=True,
        nargs="+",
        help="The paths to documents to use as queries.",
    )
    parser.add_argument("-n", "--num-hits", type=int, default=10, help="The number of hits to return.")
    parser.add_argument("-o", "--output", type=str, default=None, help="A directory to write the output to.")
    return parser


class TextNormalizer:
    """
    Turn an arbitrary text span into a plain string of ASCII words.

    Every character that is not an ASCII letter, digit or whitespace is deleted,
    whitespace runs are collapsed to a single space, and the upper-case sequences
    ``AND``, ``OR``, ``NOT`` and ``IN`` are lower-cased wherever they occur
    (including inside longer upper-case words).
    """

    def __init__(self):
        self.whitespace_re = re.compile(r"\s+")
        self.non_alnum_re = re.compile(r"[^a-zA-Z0-9\s]+")

    def __call__(self, text: str) -> str:
        # Delete punctuation first and strip last: stripping before the deletion left a
        # stray leading/trailing space whenever the text started or ended with punctuation.
        text = self.non_alnum_re.sub("", text)
        text = self.whitespace_re.sub(" ", text).strip()
        for keyword in ("AND", "OR", "NOT", "IN"):
            text = text.replace(keyword, keyword.lower())
        return text
def search_data(args: argparse.Namespace):
    """
    Query the index with every flagged span of every input document and write the hits.

    For each JSON-lines document in ``args.documents``, every ``[start, end, score]``
    triple under ``attributes.dedupe_ngrams_8_1`` selects a slice of the document text.
    The slice is normalized, searched in the index at ``args.index_path`` (at most
    ``args.num_hits`` hits), and one JSON line with the query, its hits, the source
    document and the span score is written. Output goes to
    ``<args.output>/NNNNNN.jsonl.zst``; a new file is started every 50,000 queries.

    Raises:
        ValueError: If ``args.output`` is None.
    """
    if args.output is None:
        # Path(None) would otherwise fail with an opaque TypeError.
        raise ValueError("No output directory given; pass -o/--output.")

    index = create_index(args.index_path, reuse=True)
    searcher = index.searcher()

    paths = list_paths(args.documents)
    norm = TextNormalizer()

    Path(args.output).mkdir(parents=True, exist_ok=True)

    def open_shard(shard_id: int):
        return smart_open.open(f"{args.output}/{shard_id:06d}.jsonl.zst", "wt", encoding="utf-8")

    with ExitStack() as stack:
        files_pbar = stack.enter_context(tqdm.tqdm(paths, unit="files", unit_scale=True))
        docs_pbar = stack.enter_context(tqdm.tqdm(unit=" docs", unit_scale=True))
        queries_pbar = stack.enter_context(tqdm.tqdm(unit=" queries", unit_scale=True))

        output_id = 0
        output_file = open_shard(output_id)
        try:
            # Iterating over files_pbar already advances it, so there is no manual update.
            for path in files_pbar:
                # Close each input as soon as it is consumed instead of holding every
                # file open until the whole run finishes.
                with smart_open.open(path, "rt", encoding="utf-8") as f:
                    for line in f:
                        document = json.loads(line)

                        for start, end, score in document.get("attributes", {}).get("dedupe_ngrams_8_1", []):
                            text = document["text"][start:end]
                            normalized_text = norm(text)

                            parsed_query = index.parse_query(normalized_text)
                            hits = searcher.search(parsed_query, limit=args.num_hits).hits
                            parsed_hits = HitsTuple.from_hits(hits, searcher)

                            output = {
                                "query": normalized_text,
                                "hits": [h.to_dict() for h in parsed_hits],
                                "document": document,
                                "span_score": score,
                            }
                            queries_pbar.update(1)
                            output_file.write(json.dumps(output) + "\n")

                            if queries_pbar.n % 50_000 == 0:
                                # Rotate: the current shard is full, start the next one.
                                output_file.close()
                                output_id += 1
                                output_file = open_shard(output_id)

                        docs_pbar.update(1)
        finally:
            output_file.close()
#! /usr/bin/env bash
#
# Tokenize the Tulu-FLAN documents stored on S3 with the dolma2 tokenizer and
# write the result under ${HOME}/ai2-llm/preprocessed/.

set -ex

# Dataset path shared by the S3 source prefix and the local destination.
dataset='tulu_flan/v1-FULLDECON-60M-shots_all-upweight_1-dialog_false-sep_rulebased'
tokenizer='allenai/dolma2-tokenizer'

# The glob stays quoted so it reaches dolma unexpanded.
args=(
  --documents "s3://ai2-llm/pretraining-data/sources/${dataset}/documents/*.json.gz"
  --destination "${HOME}/ai2-llm/preprocessed/${dataset}/${tokenizer}"
  --tokenizer.name_or_path "${tokenizer}"
  --tokenizer.eos_token_id 100257
  --tokenizer.pad_token_id 100277
  --no-tokenizer.segment_before_tokenization
  --tokenizer.encode_special_tokens
  --ring_size 8
  --processes 92
  --max_size 4_000_000_000
  --sample_ring_prop
  --dtype 'uint32'
)

dolma tokens "${args[@]}"