From 5a010a2685914b1db7744426abfb4b9ece52da95 Mon Sep 17 00:00:00 2001
From: Luca Soldaini
Date: Wed, 8 Nov 2023 10:41:25 -0800
Subject: [PATCH] Fix Hardcoded Tokenizer (#71)

* fix for hardcoded tokenizer

* ignoring VSCode dir

* style, removed files
---
 .gitignore                         | 3 +++
 python/dolma/tokenizer/executor.py | 7 ++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 4a319441..5edee38b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,3 +64,6 @@ target/
 # ignoring test output
 /tests/work/
 /python/dolma/core/warc
+
+# ignore vscode directory
+.vscode
diff --git a/python/dolma/tokenizer/executor.py b/python/dolma/tokenizer/executor.py
index fa360450..67d0c78f 100644
--- a/python/dolma/tokenizer/executor.py
+++ b/python/dolma/tokenizer/executor.py
@@ -48,13 +48,17 @@ def process_single(cls, source_path: str, destination_path: str, queue: QueueTyp
             raise RuntimeError("grouped_source_prefixes should be a list of paths")
         source_paths = global_source_paths[int(source_path)]
 
+        tokenizer_name_or_path = kwargs.pop("tokenizer_name_or_path", None)
+        if tokenizer_name_or_path is None:
+            raise RuntimeError("tokenizer_name_or_path not provided")
+
         cpu_count = multiprocessing.cpu_count()
 
         documents_cnt = tokens_cnt = 0
         update_interval = 1
         mm_cnt = 0
 
-        tokenizer = Tokenizer.from_pretrained("allenai/eleuther-ai-gpt-neox-20b-pii-special")
+        tokenizer = Tokenizer.from_pretrained(tokenizer_name_or_path)
         tokenizer_ring = []
         for _ in range(min(ring_size, len(source_paths))):
             path = source_paths.pop()
@@ -218,4 +222,5 @@ def tokenize_in_parallel(
         ring_size=ring_size,
         max_size=max_size,
         dtype=dtype,
+        tokenizer_name_or_path=tokenizer_name_or_path,
     )
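
Usage note (illustrative sketch, not part of the patch): after this change,
the tokenizer is chosen by the caller instead of being hardcoded in
process_single. Of the keyword names below, only tokenizer_name_or_path
appears in the diff above; the sources and destination parameter names are
assumptions for illustration.

    # Hypothetical call into the patched module; parameter names other
    # than tokenizer_name_or_path are assumptions, not confirmed by this patch.
    from dolma.tokenizer.executor import tokenize_in_parallel

    tokenize_in_parallel(
        sources=["/data/documents"],    # assumed parameter name
        destination="/data/tokenized",  # assumed parameter name
        # previously hardcoded default, now passed explicitly:
        tokenizer_name_or_path="allenai/eleuther-ai-gpt-neox-20b-pii-special",
    )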