From a88aee1ffe041c1b34805912595725e76ab92b37 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Sat, 28 Oct 2023 21:31:22 -0700 Subject: [PATCH] Remove unnecessary spawn in tokenizer, fix config with multiple paths (#67) * Removed unnecessary spawn * upping version * fixes tokenizer config bugs * fixed unused import --------- Co-authored-by: Ubuntu --- Cargo.lock | 2 +- Cargo.toml | 2 +- pyproject.toml | 2 +- python/dolma/core/parallel.py | 12 ++++++++++-- python/dolma/tokenizer/executor.py | 15 +++++++++------ 5 files changed, 22 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c94883ac..f2db7745 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -682,7 +682,7 @@ dependencies = [ [[package]] name = "dolma" -version = "0.9.0" +version = "0.9.2" dependencies = [ "ahash", "aws-config", diff --git a/Cargo.toml b/Cargo.toml index 6164669c..6fd7ecd5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "dolma" -version = "0.9.1" +version = "0.9.2" edition = "2021" license = "Apache-2.0" diff --git a/pyproject.toml b/pyproject.toml index 1995d749..30248c87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dolma" -version = "0.9.1" +version = "0.9.2" description = "Data filters" license = {text = "Apache-2.0"} readme = "README.md" diff --git a/python/dolma/core/parallel.py b/python/dolma/core/parallel.py index b8206c57..e8f659b5 100644 --- a/python/dolma/core/parallel.py +++ b/python/dolma/core/parallel.py @@ -121,8 +121,16 @@ def __init__( "Check that you have subclassed BaseParallelProcessor correctly!" 
) - if len(self.src_prefixes) != len(self.dst_prefixes) or len(self.src_prefixes) != len(self.meta_prefixes): - raise ValueError("The number of source, destination and metadata prefixes must be the same.") + if len(self.src_prefixes) != len(self.dst_prefixes): + raise ValueError( + "The number of source and destination prefixes must be the same " + f"(got {len(self.src_prefixes)} and {len(self.dst_prefixes)})" + ) + elif len(self.src_prefixes) != len(self.meta_prefixes): + raise ValueError( + "The number of source and metadata prefixes must be the same " + f"(got {len(self.src_prefixes)} and {len(self.meta_prefixes)})" + ) if len(self.src_prefixes) == 0: raise ValueError("At least one source prefix must be provided.") diff --git a/python/dolma/tokenizer/executor.py b/python/dolma/tokenizer/executor.py index ee322ce3..fa360450 100644 --- a/python/dolma/tokenizer/executor.py +++ b/python/dolma/tokenizer/executor.py @@ -144,8 +144,8 @@ def __call__(self, num_readers: Optional[int] = None, **process_single_kwargs: A # so that they can load the correct source paths source_indices = [str(i) for i in range(len(grouped_source_prefixes))] - # check that only one destination and metadata is provided - if len(self.dst_prefixes) != 1 or len(self.meta_prefixes) != 1: + # check that only one value of destination and metadata is provided + if len(set(self.dst_prefixes)) != 1 or len(set(self.meta_prefixes)) != 1: raise ValueError("Only one destination and metadata should be provided.") # make necessary destination directories @@ -188,8 +188,6 @@ def tokenize_in_parallel( dtype: str = "uint16", debug: bool = False, ): - multiprocessing.set_start_method("spawn") - # variables for the nice debugging and tokenizers os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace" os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -203,8 +201,13 @@ def tokenize_in_parallel( parallel_writer = MemMapParallelWriter( source_prefix=sources, - destination_prefix=destination, - metadata_prefix=metadata_dir, + 
# the call action will actually get the first destination and + # make relative paths from there. Unfortunately, BaseParallelProcessor + # expects as many destinations as there are sources, so we employ + # this "hack" (that is, repeating destination len(sources) times) + # to get around that. Same thing applies to metadata_dir. + destination_prefix=[destination for _ in sources], + metadata_prefix=[metadata_dir for _ in sources], num_processes=num_writers, seed=seed, debug=debug,