Remove unnecessary spawn in tokenizer, fix config with multiple paths (#67)

* Removed unnecessary spawn

* Bumped version

* Fixed tokenizer config bugs

* Fixed unused import

---------

Co-authored-by: Ubuntu <[email protected]>
soldni and Ubuntu authored Oct 29, 2023
1 parent 2ee1ae2 commit a88aee1
Showing 5 changed files with 22 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "dolma"
-version = "0.9.1"
+version = "0.9.2"
 edition = "2021"
 license = "Apache-2.0"

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dolma"
-version = "0.9.1"
+version = "0.9.2"
 description = "Data filters"
 license = {text = "Apache-2.0"}
 readme = "README.md"
12 changes: 10 additions & 2 deletions python/dolma/core/parallel.py
@@ -121,8 +121,16 @@ def __init__(
"Check that you have subclassed BaseParallelProcessor correctly!"
)

if len(self.src_prefixes) != len(self.dst_prefixes) or len(self.src_prefixes) != len(self.meta_prefixes):
raise ValueError("The number of source, destination and metadata prefixes must be the same.")
if len(self.src_prefixes) != len(self.dst_prefixes):
raise ValueError(
"The number of source and destination prefixes must be the same "
f"(got {len(self.src_prefixes)} and {len(self.dst_prefixes)})"
)
elif len(self.src_prefixes) != len(self.meta_prefixes):
raise ValueError(
"The number of source and metadata prefixes must be the same."
f"(got {len(self.src_prefixes)} and {len(self.meta_prefixes)})"
)

if len(self.src_prefixes) == 0:
raise ValueError("At least one source prefix must be provided.")
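For context (editorial note, not part of the diff): a standalone sketch of how the split-up check now reports mismatched counts. The prefix values below are made up for illustration.

src_prefixes = ["s3://bucket/docs/a", "s3://bucket/docs/b"]
dst_prefixes = ["s3://bucket/out"]

# Mirrors the first branch of the new validation above.
if len(src_prefixes) != len(dst_prefixes):
    raise ValueError(
        "The number of source and destination prefixes must be the same "
        f"(got {len(src_prefixes)} and {len(dst_prefixes)})"
    )
# ValueError: The number of source and destination prefixes must be the same (got 2 and 1)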
15 changes: 9 additions & 6 deletions python/dolma/tokenizer/executor.py
@@ -144,8 +144,8 @@ def __call__(self, num_readers: Optional[int] = None, **process_single_kwargs: A
         # so that they can load the correct source paths
         source_indices = [str(i) for i in range(len(grouped_source_prefixes))]
 
-        # check that only one destination and metadata is provided
-        if len(self.dst_prefixes) != 1 or len(self.meta_prefixes) != 1:
+        # check that only one value of destination and metadata is provided
+        if len(set(self.dst_prefixes)) != 1 or len(set(self.meta_prefixes)) != 1:
             raise ValueError("Only one destination and metadata should be provided.")
 
         # make necessary destination directories
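Why the set() matters (editorial note, with made-up values): tokenize_in_parallel below now passes the same destination once per source, so the list has length len(sources) even though it holds a single distinct value. Deduplicating through set() preserves the single-destination invariant without rejecting that repetition.

dst_prefixes = ["s3://bucket/out"] * 3   # one entry per source, all identical

print(len(dst_prefixes) != 1)        # True  -> the old length check would raise
print(len(set(dst_prefixes)) != 1)   # False -> the new set-based check passes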
@@ -188,8 +188,6 @@ def tokenize_in_parallel(
     dtype: str = "uint16",
     debug: bool = False,
 ):
-    multiprocessing.set_start_method("spawn")
-
     # variables for the nice debugging and tokenizers
     os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace"
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -203,8 +201,13 @@

     parallel_writer = MemMapParallelWriter(
         source_prefix=sources,
-        destination_prefix=destination,
-        metadata_prefix=metadata_dir,
+        # the __call__ method will actually take the first destination and
+        # make relative paths from there. Unfortunately, BaseParallelProcessor
+        # expects as many destinations as there are sources, so we employ
+        # this "hack" (that is, repeating destination len(sources) times)
+        # to get around that. The same applies to metadata_dir.
+        destination_prefix=[destination for _ in sources],
+        metadata_prefix=[metadata_dir for _ in sources],
         num_processes=num_writers,
         seed=seed,
         debug=debug,
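A toy illustration of the repetition "hack" described in the comment above; the paths are hypothetical:

sources = ["s3://bucket/docs/a", "s3://bucket/docs/b", "s3://bucket/docs/c"]
destination = "s3://bucket/tokenized"

destination_prefix = [destination for _ in sources]
assert len(destination_prefix) == len(sources)   # one destination per source, as BaseParallelProcessor expects
assert len(set(destination_prefix)) == 1         # still a single distinct value, so the set-based check passes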
