From 476d6d03e023aaf5bf172a29e882e704dfd9342b Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Wed, 23 Oct 2024 23:10:39 -0700 Subject: [PATCH] instructions --- classifiers/README.md | 10 ++++++++-- classifiers/scripts/nvidia-deberta-100b.sh | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/classifiers/README.md b/classifiers/README.md index 722fb246..7ed1dc7a 100644 --- a/classifiers/README.md +++ b/classifiers/README.md @@ -19,6 +19,12 @@ python -m dolma_classifiers.inference \ -m HuggingFaceFW/fineweb-edu-classifier ``` +Run [NVIDIA's DeBERTa quality classifier](https://huggingface.co/nvidia/quality-classifier-deberta) on S3 data with model compilation: - +```bash +python -m dolma_classifiers.inference \ + -s 's3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/*/*zstd' \ + -m nvidia/quality-classifier-deberta \ + --model-compile \ + --max-length 1024 +``` diff --git a/classifiers/scripts/nvidia-deberta-100b.sh b/classifiers/scripts/nvidia-deberta-100b.sh index fe268050..34c98e88 100644 --- a/classifiers/scripts/nvidia-deberta-100b.sh +++ b/classifiers/scripts/nvidia-deberta-100b.sh @@ -5,7 +5,7 @@ DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100*/*.jsonl. NUM_NODES=2 MODEL_NAME="nvidia/quality-classifier-deberta" CLUSTER="ai2/jupiter*" -BATCH_SIZE=1024 +BATCH_SIZE=512 PRIORITY="high" # Generate a hash for the run name by combining model name and documents