new configs
soldni committed Oct 24, 2024
1 parent b29aed9 commit 93d3b6f
Showing 4 changed files with 53 additions and 8 deletions.
45 changes: 45 additions & 0 deletions classifiers/scripts/fineweb_100b.sh
@@ -0,0 +1,45 @@
#! /bin/bash

DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100*/*.jsonl.zstd'

NUM_NODES=2
MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=1024
PRIORITY="high"

# Generate a hash for the run name by combining model name and documents
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"

# Set the run name as an environment variable
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"


gantry run \
--task-name "${RUN_NAME}" \
--description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
--allow-dirty \
--workspace ai2/davidw-oe-annealing \
--beaker-image 'petew/olmo-torch23-gantry' \
--timeout -1 \
--show-logs \
--host-networking \
--venv 'base' \
--priority "${PRIORITY}" \
--leader-selection \
--gpus 8 \
--replicas ${NUM_NODES} \
--preemptible \
--cluster "${CLUSTER}" \
--budget ai2/oe-data \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
--env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
--shared-memory 10GiB \
--install "pip install -e classifiers/" \
--yes \
-- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=420' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4"
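
Reviewer note: before a full two-node gantry launch, the same entry point can be smoke-tested on one GPU. A minimal sketch, not part of this commit; it assumes classifiers/ is installed locally, AWS credentials are already in the environment, and TEST_PREFIX (a placeholder, not a real path) points at a small sample:

#! /bin/bash
# Hypothetical single-GPU smoke test; TEST_PREFIX is a placeholder path.
TEST_PREFIX='s3://my-bucket/test-sample/*.jsonl.zstd'
MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"

pip install -e classifiers/
huggingface-cli download "${MODEL_NAME}"

# --standalone skips the multi-node rendezvous wiring used in the gantry command.
torchrun --standalone --nproc-per-node 1 \
    -m dolma_classifiers.inference \
    --source-prefix "${TEST_PREFIX}" \
    --batch-size 32 \
    --model-name "${MODEL_NAME}" \
    --num-workers 4
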
10 changes: 5 additions & 5 deletions classifiers/scripts/fineweb_dclm07.sh
@@ -9,11 +9,11 @@ BATCH_SIZE=1024
PRIORITY="urgent"

# Test Values
-DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/20b-01/*zstd'
-NUM_NODES=1
-BATCH_SIZE=1024
-CLUSTER="ai2/neptune*"
-PRIORITY="high"
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/20b-01/*zstd'
+# NUM_NODES=1
+# BATCH_SIZE=1024
+# CLUSTER="ai2/neptune*"
+# PRIORITY="high"

# Generate a hash for the run name by combining model name and documents
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
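
Reviewer note: this commit toggles the test overrides off by commenting them out. An alternative sketch (not what the script does) gates them behind a TEST_MODE variable so the file never needs editing; the values below are exactly the ones being commented out here:

# Hypothetical: run with TEST_MODE=1 ./fineweb_dclm07.sh to apply test values.
if [[ "${TEST_MODE:-0}" == "1" ]]; then
    DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/20b-01/*zstd'
    NUM_NODES=1
    BATCH_SIZE=1024
    CLUSTER="ai2/neptune*"
    PRIORITY="high"
fi
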
4 changes: 2 additions & 2 deletions classifiers/src/dolma_classifiers/inference.py
@@ -6,7 +6,7 @@
from itertools import zip_longest
from queue import Empty
from queue import Queue as QueueType
-from typing import Generator, NamedTuple
+from typing import Any, Generator, NamedTuple
from urllib.parse import urlparse

import fsspec
@@ -116,7 +116,7 @@ def collate_batch(batch: list[Batch], pad_token_id: int) -> Batch:

class AttributeRow(NamedTuple):
    source: str
-    attributes: list[dict]
+    attributes: list[dict[str, Any]]


def writer_worker(
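
Reviewer note: the annotation change from list[dict] to list[dict[str, Any]] does not alter behavior; it just tells the type checker that attribute values are heterogeneous. A self-contained sketch (field values are illustrative, not taken from the codebase):

from typing import Any, NamedTuple


class AttributeRow(NamedTuple):
    source: str
    attributes: list[dict[str, Any]]


# dict[str, Any] covers mixed value types such as scores and labels.
row = AttributeRow(
    source="s3://bucket/shard-0000.jsonl.zstd",  # illustrative path
    attributes=[{"fineweb_edu_score": 2.75, "label": 3}],
)
print(row.attributes[0]["label"])  # 3
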
2 changes: 1 addition & 1 deletion classifiers/src/dolma_classifiers/models.py
@@ -31,7 +31,7 @@ def __init__(self, model_name: str, device: str, dtype: str, compile: bool = Fal
        ).to(torch.device(device))

        if compile:
-            self.model = torch.compile(self.model)
+            self.model = torch.compile(self.model)  # pyright: ignore

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model.eval()
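
Reviewer note: the new pyright: ignore is needed because torch.compile is typed as returning a generic callable rather than the concrete module class, so assigning its result back to self.model fails strict checking even though the runtime object still behaves like the model. A standalone sketch of the same pattern (the Linear classifier is illustrative):

import torch


class Classifier:
    def __init__(self, compile_model: bool = False) -> None:
        # Attribute declared as nn.Module, mirroring models.py.
        self.model: torch.nn.Module = torch.nn.Linear(8, 1)
        if compile_model:
            # torch.compile's annotated return type is not an nn.Module
            # subclass, so strict type checkers flag this reassignment;
            # models.py silences the same complaint with pyright: ignore.
            self.model = torch.compile(self.model)  # pyright: ignore


clf = Classifier(compile_model=True)
print(clf.model(torch.randn(2, 8)).shape)  # torch.Size([2, 1])
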
