Skip to content

Commit

Permalink
Merge branch 'main' into reddit
Browse files Browse the repository at this point in the history
  • Loading branch information
soldni authored Nov 14, 2023
2 parents 1430d68 + 5a010a2 commit a13238c
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,6 @@ target/
# ignoring test output
/tests/work/
/python/dolma/core/warc

# ignore vscode directory
.vscode
7 changes: 6 additions & 1 deletion python/dolma/tokenizer/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,17 @@ def process_single(cls, source_path: str, destination_path: str, queue: QueueTyp
raise RuntimeError("grouped_source_prefixes should be a list of paths")
source_paths = global_source_paths[int(source_path)]

+ tokenizer_name_or_path = kwargs.pop("tokenizer_name_or_path", None)
+ if tokenizer_name_or_path is None:
+ raise RuntimeError("tokenizer_name_or_path not provided")

cpu_count = multiprocessing.cpu_count()

documents_cnt = tokens_cnt = 0
update_interval = 1
mm_cnt = 0

- tokenizer = Tokenizer.from_pretrained("allenai/eleuther-ai-gpt-neox-20b-pii-special")
+ tokenizer = Tokenizer.from_pretrained(tokenizer_name_or_path)
tokenizer_ring = []
for _ in range(min(ring_size, len(source_paths))):
path = source_paths.pop()
Expand Down Expand Up @@ -218,4 +222,5 @@ def tokenize_in_parallel(
ring_size=ring_size,
max_size=max_size,
dtype=dtype,
+ tokenizer_name_or_path=tokenizer_name_or_path,
)

0 comments on commit a13238c

Please sign in to comment.