new configs
soldni committed Oct 24, 2024
1 parent b29aed9 commit 93d3b6f
Showing 4 changed files with 53 additions and 8 deletions.
45 changes: 45 additions & 0 deletions classifiers/scripts/fineweb_100b.sh
@@ -0,0 +1,45 @@
#! /bin/bash

DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100*/*.jsonl.zstd'

NUM_NODES=2
MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=1024
PRIORITY="high"

# Generate a hash for the run name by combining model name and documents
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"

# Set the run name as an environment variable
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"


gantry run \
--task-name "${RUN_NAME}" \
--description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
--allow-dirty \
--workspace ai2/davidw-oe-annealing \
--beaker-image 'petew/olmo-torch23-gantry' \
--timeout -1 \
--show-logs \
--host-networking \
--venv 'base' \
--priority "${PRIORITY}" \
--leader-selection \
--gpus 8 \
--replicas ${NUM_NODES} \
--preemptible \
--cluster "${CLUSTER}" \
--budget ai2/oe-data \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
--env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
--shared-memory 10GiB \
--install "pip install -e classifiers/" \
--yes \
-- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=420' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4"
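
Reviewer note: before a full two-node gantry launch, the same entry point can be smoke-tested on one GPU. A minimal sketch, not part of this commit; it assumes classifiers/ is installed locally, AWS credentials are already in the environment, and TEST_PREFIX (a placeholder, not a real path) points at a small sample:

#! /bin/bash
# Hypothetical single-GPU smoke test; TEST_PREFIX is a placeholder path.
TEST_PREFIX='s3://my-bucket/test-sample/*.jsonl.zstd'
MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"

pip install -e classifiers/
huggingface-cli download "${MODEL_NAME}"

# --standalone skips the multi-node rendezvous wiring used in the gantry command.
torchrun --standalone --nproc-per-node 1 \
    -m dolma_classifiers.inference \
    --source-prefix "${TEST_PREFIX}" \
    --batch-size 32 \
    --model-name "${MODEL_NAME}" \
    --num-workers 4
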
10 changes: 5 additions & 5 deletions classifiers/scripts/fineweb_dclm07.sh
@@ -9,11 +9,11 @@ BATCH_SIZE=1024
PRIORITY="urgent"

# Test Values
-DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/20b-01/*zstd'
-NUM_NODES=1
-BATCH_SIZE=1024
-CLUSTER="ai2/neptune*"
-PRIORITY="high"
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/20b-01/*zstd'
+# NUM_NODES=1
+# BATCH_SIZE=1024
+# CLUSTER="ai2/neptune*"
+# PRIORITY="high"

# Generate a hash for the run name by combining model name and documents
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
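
Reviewer note: this commit toggles the test overrides off by commenting them out. An alternative sketch (not what the script does) gates them behind a TEST_MODE variable so the file never needs editing; the values below are exactly the ones being commented out here:

# Hypothetical: run with TEST_MODE=1 ./fineweb_dclm07.sh to apply test values.
if [[ "${TEST_MODE:-0}" == "1" ]]; then
    DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/20b-01/*zstd'
    NUM_NODES=1
    BATCH_SIZE=1024
    CLUSTER="ai2/neptune*"
    PRIORITY="high"
fi
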
4 changes: 2 additions & 2 deletions classifiers/src/dolma_classifiers/inference.py
@@ -6,7 +6,7 @@
from itertools import zip_longest
from queue import Empty
from queue import Queue as QueueType
-from typing import Generator, NamedTuple
+from typing import Any, Generator, NamedTuple
from urllib.parse import urlparse

import fsspec
@@ -116,7 +116,7 @@ def collate_batch(batch: list[Batch], pad_token_id: int) -> Batch:

class AttributeRow(NamedTuple):
    source: str
-    attributes: list[dict]
+    attributes: list[dict[str, Any]]


def writer_worker(
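
Reviewer note: the annotation change from list[dict] to list[dict[str, Any]] does not alter behavior; it just tells the type checker that attribute values are heterogeneous. A self-contained sketch (field values are illustrative, not taken from the codebase):

from typing import Any, NamedTuple


class AttributeRow(NamedTuple):
    source: str
    attributes: list[dict[str, Any]]


# dict[str, Any] covers mixed value types such as scores and labels.
row = AttributeRow(
    source="s3://bucket/shard-0000.jsonl.zstd",  # illustrative path
    attributes=[{"fineweb_edu_score": 2.75, "label": 3}],
)
print(row.attributes[0]["label"])  # 3
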
2 changes: 1 addition & 1 deletion classifiers/src/dolma_classifiers/models.py
@@ -31,7 +31,7 @@ def __init__(self, model_name: str, device: str, dtype: str, compile: bool = Fal
        ).to(torch.device(device))

        if compile:
-            self.model = torch.compile(self.model)
+            self.model = torch.compile(self.model)  # pyright: ignore

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model.eval()
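
Reviewer note: the new pyright: ignore is needed because torch.compile is typed as returning a generic callable rather than the concrete module class, so assigning its result back to self.model fails strict checking even though the runtime object still behaves like the model. A standalone sketch of the same pattern (the Linear classifier is illustrative):

import torch


class Classifier:
    def __init__(self, compile_model: bool = False) -> None:
        # Attribute declared as nn.Module, mirroring models.py.
        self.model: torch.nn.Module = torch.nn.Linear(8, 1)
        if compile_model:
            # torch.compile's annotated return type is not an nn.Module
            # subclass, so strict type checkers flag this reassignment;
            # models.py silences the same complaint with pyright: ignore.
            self.model = torch.compile(self.model)  # pyright: ignore


clf = Classifier(compile_model=True)
print(clf.model(torch.randn(2, 8)).shape)  # torch.Size([2, 1])
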
