From a88aee1ffe041c1b34805912595725e76ab92b37 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Sat, 28 Oct 2023 21:31:22 -0700 Subject: [PATCH] Remove unnecessary spawn in tokenizer, fix config with multiple paths (#67) * Removed unnecessary spawn * upping version * fixes tokenizer config bugs * fixed unused import --------- Co-authored-by: Ubuntu --- Cargo.lock | 2 +- Cargo.toml | 2 +- pyproject.toml | 2 +- python/dolma/core/parallel.py | 12 ++++++++++-- python/dolma/tokenizer/executor.py | 15 +++++++++------ 5 files changed, 22 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c94883ac..f2db7745 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -682,7 +682,7 @@ dependencies = [ [[package]] name = "dolma" -version = "0.9.0" +version = "0.9.2" dependencies = [ "ahash", "aws-config", diff --git a/Cargo.toml b/Cargo.toml index 6164669c..6fd7ecd5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "dolma" -version = "0.9.1" +version = "0.9.2" edition = "2021" license = "Apache-2.0" diff --git a/pyproject.toml b/pyproject.toml index 1995d749..30248c87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dolma" -version = "0.9.1" +version = "0.9.2" description = "Data filters" license = {text = "Apache-2.0"} readme = "README.md" diff --git a/python/dolma/core/parallel.py b/python/dolma/core/parallel.py index b8206c57..e8f659b5 100644 --- a/python/dolma/core/parallel.py +++ b/python/dolma/core/parallel.py @@ -121,8 +121,16 @@ def __init__( "Check that you have subclassed BaseParallelProcessor correctly!" 
) - if len(self.src_prefixes) != len(self.dst_prefixes) or len(self.src_prefixes) != len(self.meta_prefixes): - raise ValueError("The number of source, destination and metadata prefixes must be the same.") + if len(self.src_prefixes) != len(self.dst_prefixes): + raise ValueError( + "The number of source and destination prefixes must be the same " + f"(got {len(self.src_prefixes)} and {len(self.dst_prefixes)})" + ) + elif len(self.src_prefixes) != len(self.meta_prefixes): + raise ValueError( + "The number of source and metadata prefixes must be the same " + f"(got {len(self.src_prefixes)} and {len(self.meta_prefixes)})" + ) if len(self.src_prefixes) == 0: raise ValueError("At least one source prefix must be provided.") diff --git a/python/dolma/tokenizer/executor.py b/python/dolma/tokenizer/executor.py index ee322ce3..fa360450 100644 --- a/python/dolma/tokenizer/executor.py +++ b/python/dolma/tokenizer/executor.py @@ -144,8 +144,8 @@ def __call__(self, num_readers: Optional[int] = None, **process_single_kwargs: A # so that they can load the correct source paths source_indices = [str(i) for i in range(len(grouped_source_prefixes))] - # check that only one destination and metadata is provided - if len(self.dst_prefixes) != 1 or len(self.meta_prefixes) != 1: + # check that only one value of destination and metadata is provided + if len(set(self.dst_prefixes)) != 1 or len(set(self.meta_prefixes)) != 1: raise ValueError("Only one destination and metadata should be provided.") # make necessary destination directories @@ -188,8 +188,6 @@ def tokenize_in_parallel( dtype: str = "uint16", debug: bool = False, ): - multiprocessing.set_start_method("spawn") - # variables for the nice debugging and tokenizers os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace" os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -203,8 +201,13 @@ def tokenize_in_parallel( parallel_writer = MemMapParallelWriter( source_prefix=sources, - destination_prefix=destination, - metadata_prefix=metadata_dir, + 
# the call action will actually get the first destination and + # make relative paths from there. Unfortunately, BaseParallelProcessor + # expects as many destinations as there are sources, so we employ + # this "hack" (that is, repeating destination len(sources) times) + # to get around that. Same thing applies to metadata_dir. + destination_prefix=[destination for _ in sources], + metadata_prefix=[metadata_dir for _ in sources], num_processes=num_writers, seed=seed, debug=debug,