Remove unnecessary spawn in tokenizer, fix config with multiple paths (#67)

* Removed unnecessary spawn

* Bumped version

* Fixed tokenizer config bugs

* Fixed unused import

---------

Co-authored-by: Ubuntu <[email protected]>
soldni and Ubuntu authored Oct 29, 2023
1 parent 2ee1ae2 commit a88aee1
Showing 5 changed files with 22 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "dolma"
-version = "0.9.1"
+version = "0.9.2"
 edition = "2021"
 license = "Apache-2.0"

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dolma"
-version = "0.9.1"
+version = "0.9.2"
 description = "Data filters"
 license = {text = "Apache-2.0"}
 readme = "README.md"
12 changes: 10 additions & 2 deletions python/dolma/core/parallel.py
@@ -121,8 +121,16 @@ def __init__(
"Check that you have subclassed BaseParallelProcessor correctly!"
)

if len(self.src_prefixes) != len(self.dst_prefixes) or len(self.src_prefixes) != len(self.meta_prefixes):
raise ValueError("The number of source, destination and metadata prefixes must be the same.")
if len(self.src_prefixes) != len(self.dst_prefixes):
raise ValueError(
"The number of source and destination prefixes must be the same "
f"(got {len(self.src_prefixes)} and {len(self.dst_prefixes)})"
)
elif len(self.src_prefixes) != len(self.meta_prefixes):
raise ValueError(
"The number of source and metadata prefixes must be the same."
f"(got {len(self.src_prefixes)} and {len(self.meta_prefixes)})"
)

if len(self.src_prefixes) == 0:
raise ValueError("At least one source prefix must be provided.")
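For context (editorial note, not part of the diff): a standalone sketch of how the split-up check now reports mismatched counts. The prefix values below are made up for illustration.

src_prefixes = ["s3://bucket/docs/a", "s3://bucket/docs/b"]
dst_prefixes = ["s3://bucket/out"]

# Mirrors the first branch of the new validation above.
if len(src_prefixes) != len(dst_prefixes):
    raise ValueError(
        "The number of source and destination prefixes must be the same "
        f"(got {len(src_prefixes)} and {len(dst_prefixes)})"
    )
# ValueError: The number of source and destination prefixes must be the same (got 2 and 1)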
15 changes: 9 additions & 6 deletions python/dolma/tokenizer/executor.py
@@ -144,8 +144,8 @@ def __call__(self, num_readers: Optional[int] = None, **process_single_kwargs: A
         # so that they can load the correct source paths
         source_indices = [str(i) for i in range(len(grouped_source_prefixes))]
 
-        # check that only one destination and metadata is provided
-        if len(self.dst_prefixes) != 1 or len(self.meta_prefixes) != 1:
+        # check that only one value of destination and metadata is provided
+        if len(set(self.dst_prefixes)) != 1 or len(set(self.meta_prefixes)) != 1:
             raise ValueError("Only one destination and metadata should be provided.")
 
         # make necessary destination directories
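Why the set() matters (editorial note, with made-up values): tokenize_in_parallel below now passes the same destination once per source, so the list has length len(sources) even though it holds a single distinct value. Deduplicating through set() preserves the single-destination invariant without rejecting that repetition.

dst_prefixes = ["s3://bucket/out"] * 3   # one entry per source, all identical

print(len(dst_prefixes) != 1)        # True  -> the old length check would raise
print(len(set(dst_prefixes)) != 1)   # False -> the new set-based check passes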
@@ -188,8 +188,6 @@ def tokenize_in_parallel(
     dtype: str = "uint16",
     debug: bool = False,
 ):
-    multiprocessing.set_start_method("spawn")
-
     # variables for the nice debugging and tokenizers
     os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace"
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -203,8 +201,13 @@

     parallel_writer = MemMapParallelWriter(
         source_prefix=sources,
-        destination_prefix=destination,
-        metadata_prefix=metadata_dir,
+        # the __call__ method will actually take the first destination and
+        # make relative paths from there. Unfortunately, BaseParallelProcessor
+        # expects as many destinations as there are sources, so we employ
+        # this "hack" (that is, repeating destination len(sources) times)
+        # to get around that. The same applies to metadata_dir.
+        destination_prefix=[destination for _ in sources],
+        metadata_prefix=[metadata_dir for _ in sources],
         num_processes=num_writers,
         seed=seed,
         debug=debug,
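A toy illustration of the repetition "hack" described in the comment above; the paths are hypothetical:

sources = ["s3://bucket/docs/a", "s3://bucket/docs/b", "s3://bucket/docs/c"]
destination = "s3://bucket/tokenized"

destination_prefix = [destination for _ in sources]
assert len(destination_prefix) == len(sources)   # one destination per source, as BaseParallelProcessor expects
assert len(set(destination_prefix)) == 1         # still a single distinct value, so the set-based check passes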
