-
Notifications
You must be signed in to change notification settings - Fork 120
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
'File partition' option and 'document' directory specification #213
Merged
Merged
Changes from 9 commits
Commits
Show all changes
21 commits
Select commit
Hold shift + click to select a range
30d8afd
init
Whattabatt 0b724b3
Also pin maturin in action (#208)
undfined 74dec8c
reduced comments
Whattabatt 968f65c
add file path test
Whattabatt e06f67a
fix
Whattabatt 7d8edf1
Merge branch 'main' into file-partition
Whattabatt 2d04498
style
Whattabatt ba9d3c9
test configs
Whattabatt d13ab5c
version bump
Whattabatt 35cecfb
...
Whattabatt cdb92b2
bump?
Whattabatt 3ada6e1
bunmp
Whattabatt b746d9c
.
Whattabatt 526474f
.
Whattabatt 865b250
version weirdness
Whattabatt 0e76d2e
Update pyproject.toml
Whattabatt 1ccd790
Update pyproject.toml
Whattabatt 9858712
ci version bump
Whattabatt 259c4d9
formatting
Whattabatt 6c3ba6c
Erroring at the rust level instead of overriding source
Whattabatt 75a2f98
Update deduper.rs
Whattabatt File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[package] | ||
name = "dolma" | ||
version = "1.0.9" | ||
version = "1.0.14" | ||
edition = "2021" | ||
license = "Apache-2.0" | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,8 +14,9 @@ use crate::s3_util; | |
use crate::shard::shard_config::{CompressionConfig, WorkDirConfig}; | ||
use crate::shard::{find_objects_matching_patterns, FileCache}; | ||
use crate::wimbd::tokens::tokenize; | ||
|
||
use ahash::RandomState; | ||
use deduper_config::*; | ||
use std::hash::{BuildHasher, Hash, Hasher}; | ||
|
||
pub fn run(config: DeduperConfig) -> Result<u32, u32> { | ||
let bloom_filter = BloomFilter::initialize(&config.bloom_filter).unwrap(); | ||
|
@@ -33,7 +34,20 @@ pub fn run(config: DeduperConfig) -> Result<u32, u32> { | |
let threadpool = ThreadPool::new(config.processes); | ||
let failed_shard_count = AtomicU32::new(0); | ||
let failed_shard_count_ref = Arc::new(failed_shard_count); | ||
let hash_builder = RandomState::with_seeds(0, 1, 2, 3); | ||
|
||
for p in paths { | ||
let mut hasher = hash_builder.build_hasher(); | ||
p.hash(&mut hasher); | ||
let hashed_path = hasher.finish(); | ||
|
||
if config.dedupe.file_partition.unwrap_or(false) | ||
&& hashed_path % config.dedupe.num_partitions.unwrap_or(1) | ||
!= config.dedupe.partition_index.unwrap_or(0) | ||
{ | ||
continue; | ||
} | ||
|
||
let path = p.clone(); | ||
let work_dirs = config.work_dir.clone(); | ||
let dedupe = config.dedupe.clone(); | ||
|
@@ -123,7 +137,15 @@ fn write_attributes( | |
|
||
let attrs_location = { | ||
let attr_prefix = format!("/attributes/{}/", attr_key); | ||
docs_location.replace("/documents/", &attr_prefix) | ||
docs_location.replace( | ||
&format!( | ||
"/{}/", | ||
dedupe_config | ||
.document_dir | ||
.unwrap_or(String::from("documents")) | ||
), | ||
&attr_prefix, | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we add a test for this? |
||
}; | ||
let local_output = cache.prepare_output(&attrs_location, label_temp)?; | ||
let mut num_processed = 0; | ||
|
@@ -546,6 +568,8 @@ pub mod deduper_config { | |
pub skip_empty: Option<bool>, | ||
pub num_partitions: Option<u64>, | ||
pub partition_index: Option<u64>, | ||
pub file_partition: Option<bool>, | ||
pub document_dir: Option<String>, | ||
} | ||
|
||
#[derive(Serialize, Deserialize, Clone)] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
{ | ||
"documents": [ | ||
"tests/data/provided/deduper/pathnotd0cumentz/000.json.gz" | ||
], | ||
"work_dir": { | ||
"input": "tests/work/temp/dedupe-para/input", | ||
"output": "tests/work/temp/dedupe-para/output" | ||
}, | ||
"dedupe": { | ||
"name": "dedupe_paragraph_ngrams", | ||
"paragraphs": { | ||
"attribute_name": "bff_duplicate_paragraph_spans", | ||
"by_ngram": { | ||
"ngram_length": 6, | ||
"stride": 3, | ||
"overlap_threshold": 0.5 | ||
} | ||
} | ||
}, | ||
"bloom_filter": { | ||
"file": "tests/work/para_bloom_filter.bin", | ||
"size_in_bytes": 0, | ||
"read_only": false, | ||
"estimated_doc_count": 1000, | ||
"desired_false_positive_rate": 0.001 | ||
}, | ||
"processes": 1 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
{ | ||
"documents": [ | ||
"tests/data/provided/deduper/pathnotd0cumentz/000.json.gz" | ||
], | ||
"work_dir": { | ||
"input": "tests/work/temp/dedupe-para/input", | ||
"output": "tests/work/temp/dedupe-para/output" | ||
}, | ||
"dedupe": { | ||
"name": "dedupe_paragraph_ngrams", | ||
"document_dir": "pathnotd0cumentz", | ||
"paragraphs": { | ||
"attribute_name": "bff_duplicate_paragraph_spans", | ||
"by_ngram": { | ||
"ngram_length": 6, | ||
"stride": 3, | ||
"overlap_threshold": 0.5 | ||
} | ||
} | ||
}, | ||
"bloom_filter": { | ||
"file": "tests/work/para_bloom_filter.bin", | ||
"size_in_bytes": 0, | ||
"read_only": false, | ||
"estimated_doc_count": 1000, | ||
"desired_false_positive_rate": 0.001 | ||
}, | ||
"processes": 1 | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You want 1.0.15, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, actually: when I started making dev branches I misread the version. The most recent non-dev version is 1.0.4.