Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

'File partition' option and 'document' directory specification #213

Merged
merged 21 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "dolma"
version = "1.0.9"
version = "1.0.14"
edition = "2021"
license = "Apache-2.0"

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dolma"
version = "1.0.14.post1"
version = "1.0.5"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You want 1.0.15, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, actually — when I started making dev branches I misread the version. The most recent non-dev release is 1.0.4.

description = "Data filters"
license = { text = "Apache-2.0" }
readme = "README.md"
Expand Down
24 changes: 22 additions & 2 deletions python/dolma/cli/deduper.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import fnmatch
import os
from contextlib import ExitStack
from dataclasses import dataclass
from pathlib import Path
Expand Down Expand Up @@ -99,6 +101,13 @@ class DedupeConfig:
partition_index: Optional[int] = field(
default=0, help="The index of the partition being processed, in the range [0, num_partitions)."
)
file_partition: Optional[bool] = field(
default=False, help="Whether or not to partition at the document level (vs at the span level)"
)
document_dir: Optional[str] = field(
default="documents",
help="The folder in source paths to replace with 'attributes' to store results, if not 'documents'",
)


@dataclass
Expand Down Expand Up @@ -135,7 +144,6 @@ def run(cls, parsed_config: DeduperConfig):
logger = get_logger("tagger")

dict_config: Dict[str, Any] = {}

with ExitStack() as stack:
work_dirs = stack.enter_context(make_workdirs(parsed_config.work_dir))

Expand All @@ -146,6 +154,8 @@ def run(cls, parsed_config: DeduperConfig):
"min_words": parsed_config.dedupe.min_words,
"num_partitions": parsed_config.dedupe.num_partitions,
"partition_index": parsed_config.dedupe.partition_index,
"file_partition": parsed_config.dedupe.file_partition,
"document_dir": parsed_config.dedupe.document_dir,
}
try_name = parsed_config.dedupe.name if not om.is_missing(parsed_config.dedupe, "name") else None

Expand Down Expand Up @@ -182,7 +192,17 @@ def run(cls, parsed_config: DeduperConfig):
# perform some path validation to make sure we don't call the mixer with invalid config
total_matching_documents = 0
for document in parsed_config.documents:
dict_config.setdefault("documents", []).append(str(document))

if not any(
fnmatch.fnmatch(dict_config["dedupe"]["document_dir"], part) for part in document.split(os.sep)
):
raise DolmaConfigError(
f"Path ({document}) does not contain expected document directory: '/{dict_config['dedupe']['document_dir']}/'. "
)

doc = str(document)

dict_config.setdefault("documents", []).append(doc)

current_matching_documents = sum(1 for _ in glob_path(document))
if current_matching_documents == 0:
Expand Down
28 changes: 26 additions & 2 deletions src/deduper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ use crate::s3_util;
use crate::shard::shard_config::{CompressionConfig, WorkDirConfig};
use crate::shard::{find_objects_matching_patterns, FileCache};
use crate::wimbd::tokens::tokenize;

use ahash::RandomState;
use deduper_config::*;
use std::hash::{BuildHasher, Hash, Hasher};

pub fn run(config: DeduperConfig) -> Result<u32, u32> {
let bloom_filter = BloomFilter::initialize(&config.bloom_filter).unwrap();
Expand All @@ -33,7 +34,20 @@ pub fn run(config: DeduperConfig) -> Result<u32, u32> {
let threadpool = ThreadPool::new(config.processes);
let failed_shard_count = AtomicU32::new(0);
let failed_shard_count_ref = Arc::new(failed_shard_count);
let hash_builder = RandomState::with_seeds(0, 1, 2, 3);

for p in paths {
let mut hasher = hash_builder.build_hasher();
p.hash(&mut hasher);
let hashed_path = hasher.finish();

if config.dedupe.file_partition.unwrap_or(false)
&& hashed_path % config.dedupe.num_partitions.unwrap_or(1)
!= config.dedupe.partition_index.unwrap_or(0)
{
continue;
}

let path = p.clone();
let work_dirs = config.work_dir.clone();
let dedupe = config.dedupe.clone();
Expand Down Expand Up @@ -123,7 +137,15 @@ fn write_attributes(

let attrs_location = {
let attr_prefix = format!("/attributes/{}/", attr_key);
docs_location.replace("/documents/", &attr_prefix)
docs_location.replace(
&format!(
"/{}/",
dedupe_config
.document_dir
.unwrap_or(String::from("documents"))
),
&attr_prefix,
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add a test for this?

};
let local_output = cache.prepare_output(&attrs_location, label_temp)?;
let mut num_processed = 0;
Expand Down Expand Up @@ -546,6 +568,8 @@ pub mod deduper_config {
pub skip_empty: Option<bool>,
pub num_partitions: Option<u64>,
pub partition_index: Option<u64>,
pub file_partition: Option<bool>,
pub document_dir: Option<String>,
}

#[derive(Serialize, Deserialize, Clone)]
Expand Down
28 changes: 28 additions & 0 deletions tests/config/filepath-bad.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"documents": [
"tests/data/provided/deduper/pathnotd0cumentz/000.json.gz"
],
"work_dir": {
"input": "tests/work/temp/dedupe-para/input",
"output": "tests/work/temp/dedupe-para/output"
},
"dedupe": {
"name": "dedupe_paragraph_ngrams",
"paragraphs": {
"attribute_name": "bff_duplicate_paragraph_spans",
"by_ngram": {
"ngram_length": 6,
"stride": 3,
"overlap_threshold": 0.5
}
}
},
"bloom_filter": {
"file": "tests/work/para_bloom_filter.bin",
"size_in_bytes": 0,
"read_only": false,
"estimated_doc_count": 1000,
"desired_false_positive_rate": 0.001
},
"processes": 1
}
29 changes: 29 additions & 0 deletions tests/config/filepath-good.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"documents": [
"tests/data/provided/deduper/pathnotd0cumentz/000.json.gz"
],
"work_dir": {
"input": "tests/work/temp/dedupe-para/input",
"output": "tests/work/temp/dedupe-para/output"
},
"dedupe": {
"name": "dedupe_paragraph_ngrams",
"document_dir": "pathnotd0cumentz",
"paragraphs": {
"attribute_name": "bff_duplicate_paragraph_spans",
"by_ngram": {
"ngram_length": 6,
"stride": 3,
"overlap_threshold": 0.5
}
}
},
"bloom_filter": {
"file": "tests/work/para_bloom_filter.bin",
"size_in_bytes": 0,
"read_only": false,
"estimated_doc_count": 1000,
"desired_false_positive_rate": 0.001
},
"processes": 1
}
Binary file not shown.
37 changes: 34 additions & 3 deletions tests/python/test_deduper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing_extensions import TypedDict

from dolma.cli.__main__ import main
from dolma.core.errors import DolmaConfigError
from dolma.core.utils import split_words

from .utils import (
Expand All @@ -24,6 +25,9 @@

TEST_DIR = Path(__file__).parent.parent
DEDUPE_BY_URL = TEST_DIR / "config/dedupe-by-url.json"
DEDUPE_BAD_FILENAME = TEST_DIR / "config/filepath-bad.json"
DEDUPE_GOOD_FILENAME = TEST_DIR / "config/filepath-good.json"

DEDUPE_PARAGRAPHS = TEST_DIR / "config/dedupe-paragraphs.json"
DEDUPE_PARAGRAPH_NGRAMS = TEST_DIR / "config/dedupe-paragraph-ngrams.json"

Expand All @@ -48,13 +52,13 @@ def setUp(self) -> None:

# upload test data
upload_s3_prefix(
s3_prefix=f"{self.remote_test_prefix}", local_prefix="tests/data/provided/deduper/documents/*.gz"
s3_prefix=f"{self.remote_test_prefix}", local_prefix="tests/data/provided/deduper/*/*.gz"
)

# copy provided config files to local temp dir
shutil.copytree(
"tests/data/provided/deduper/documents",
f"{self.local_temp_dir}/tests/data/provided/deduper/documents",
"tests/data/provided/deduper",
f"{self.local_temp_dir}/tests/data/provided/deduper",
dirs_exist_ok=True,
)

Expand Down Expand Up @@ -82,6 +86,33 @@ def test_dedupe_by_url(self):
)
return self._compare_dedupe_output(expected, computed) # pyright: ignore

def test_dedupe_bad_filepath(self):
with open(DEDUPE_BAD_FILENAME, "r") as f:
config = json.load(f)

config["documents"][0] = f'{self.local_temp_dir}/{config["documents"][0]}'
config["bloom_filter"]["file"] = f'{self.local_temp_dir}/{config["bloom_filter"]["file"]}'

with NamedTemporaryFile("w") as f:
json.dump(config, f)
f.flush()

with self.assertRaises(DolmaConfigError):
main(argv=["-c", f.name, "dedupe"])

def test_dedupe_good_filepath(self):
with open(DEDUPE_GOOD_FILENAME, "r") as f:
config = json.load(f)

config["documents"][0] = f'{self.local_temp_dir}/{config["documents"][0]}'
config["bloom_filter"]["file"] = f'{self.local_temp_dir}/{config["bloom_filter"]["file"]}'

with NamedTemporaryFile("w") as f:
json.dump(config, f)
f.flush()

main(argv=["-c", f.name, "dedupe"])

def test_dedupe_paragraphs(self):
with open(DEDUPE_PARAGRAPHS, "r") as f:
config = json.load(f)
Expand Down
Loading