Skip to content

Commit

Permalink
Merge branch 'common-voice:main' into feature/add-tr-rules
Browse files Browse the repository at this point in the history
  • Loading branch information
HarikalarKutusu authored Aug 19, 2023
2 parents f545008 + 75b5949 commit a94eae3
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 15 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ In the beginning, the WikiExtractor prints out how many processes it will use fo
```bash
cd ../cv-sentence-extractor
pip3 install -r requirements.txt # can be skipped if your language doesn't use the Python segmenter
cargo run --release -- extract -l en -d ../wikiextractor/text/ >> wiki.en.txt
cargo run --release -- -l en -d ../wikiextractor/text/ extract >> wiki.en.txt
```

*Tip: You don't need to wait for this last process to finish before observing the output — wiki.en.txt should accumulate a few thousand sentences within just a few minutes, and you can use that to estimate the quality of the output early on and stop the process if you are not happy.*
Expand Down Expand Up @@ -137,7 +137,7 @@ python WikiExtractor.py --json ../enwikisource-latest-pages-articles.xml
```bash
cd ../cv-sentence-extractor
pip3 install -r requirements.txt # can be skipped if your language doesn't use the Python segmenter
cargo run --release -- extract-wikisource -l en -d ../wikiextractor/text/ >> wiki.en.txt
cargo run --release -- -l en -d ../wikiextractor/text/ extract-wikisource >> wiki.en.txt
```

*Tip: You don't need to wait for this last process to finish before observing the output — wiki.en.txt should accumulate a few thousand sentences within just a few minutes, and you can use that to estimate the quality of the output early on and stop the process if you are not happy.*
Expand All @@ -148,7 +148,7 @@ If you have one or multiple files with one sentence per line, you can use this e

```bash
pip3 install -r requirements.txt # can be skipped if your language doesn't use the Python segmenter
cargo run --release -- extract-file -l en -d ../texts/ >> file.en.txt
cargo run --release -- -l en -d ../texts/ extract-file >> file.en.txt
```

## Using language rules
Expand Down Expand Up @@ -257,7 +257,7 @@ After running step 1 and 2 from the `Usage` section above, run:

```bash
cd ../cv-sentence-extractor
cargo run --release -- extract -l en -d ../wikiextractor/text/ --no-check >> wiki.en.all.txt
cargo run --release -- -l en -d ../wikiextractor/text/ --no-check extract >> wiki.en.all.txt
```

Then you can use the cvtools scripts to generate a list of the word frequency:
Expand Down
2 changes: 1 addition & 1 deletion scripts/providers/wiki-source.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ function extract {
python $WIKI_EXTRACTOR_PATH --processes 4 --json $DUMP_FILE

echo "Running extraction"
cargo run --release -- extract -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH >> $EXTRACTED_SENTENCES_PATH
cargo run --release -- -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH extract >> $EXTRACTED_SENTENCES_PATH
}

function cleanup {
Expand Down
6 changes: 3 additions & 3 deletions scripts/providers/wiki.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@ function extract {

echo "Running extraction"
if [ $TYPE == "blocklist" ]; then
cargo run --release -- extract -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH --no-check >> $EXTRACTED_SENTENCES_PATH
cargo run --release -- -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH --no-check extract >> $EXTRACTED_SENTENCES_PATH
elif [ -f "$TITLE_FILTER_PATH" ]; then
cargo run --release -- extract -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH --title-filter-list $TITLE_FILTER_PATH >> $EXTRACTED_SENTENCES_PATH
cargo run --release -- -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH extract --title-filter-list $TITLE_FILTER_PATH >> $EXTRACTED_SENTENCES_PATH
else
cargo run --release -- extract -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH >> $EXTRACTED_SENTENCES_PATH
cargo run --release -- -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH extract >> $EXTRACTED_SENTENCES_PATH
fi
}

Expand Down
9 changes: 5 additions & 4 deletions src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ enum Commands {
Extract {
/// path to the file containing titles to filter for
#[arg(short, long)]
title_filter_list: String,
title_filter_list: Option<String>,
},

/// Extract sentences from Wikisource dump extracts using WikiExtractor
Expand All @@ -48,15 +48,16 @@ pub fn start() -> Result<(), String> {
match &args.command {
Commands::Extract { title_filter_list } => {
let wikipedia_loader = Wikipedia::new(language, directory);
extract(wikipedia_loader, no_check, title_filter_list)
let filter_list_value = title_filter_list.clone().unwrap_or(String::from(""));
extract(wikipedia_loader, no_check, filter_list_value)
},
Commands::ExtractWikisource => {
let wikipedia_loader = Wikipedia::new(language, directory);
extract(wikipedia_loader, no_check, "")
extract(wikipedia_loader, no_check, String::from(""))
},
Commands::ExtractFile => {
let file_loader = File::new(language, directory);
extract(file_loader, no_check, "")
extract(file_loader, no_check, String::from(""))
}
}
}
6 changes: 3 additions & 3 deletions src/extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use std::io::Read;
use std::path::Path;
use std::path::PathBuf;

pub fn extract(loader: impl Loader, no_check: bool, filter_list_path: &str) -> Result<(), String> {
pub fn extract(loader: impl Loader, no_check: bool, filter_list_path: String) -> Result<(), String> {
let config = loader.get_config();
let rules = load_rules(&config.language);
let training_data = get_training_data(&config.language);
Expand Down Expand Up @@ -191,14 +191,14 @@ fn load_file_names(dir_name: &str, prefix: &str) -> Result<Vec<PathBuf>, String>
.collect::<Result<Vec<PathBuf>, String>>()
}

fn read_filtered_titles(filtered_titles_path: &str) -> HashSet<String> {
fn read_filtered_titles(filtered_titles_path: String) -> HashSet<String> {
if filtered_titles_path.is_empty() {
return HashSet::new();
}

eprintln!("Reading titles from {:?}", filtered_titles_path);
let mut titles = HashSet::new();
let titles_path = Path::new(filtered_titles_path);
let titles_path = Path::new(&filtered_titles_path);
let mut content = String::new();
let mut file = File::open(titles_path).map_err(|e| format!("{}", e)).unwrap();
file.read_to_string(&mut content)
Expand Down

0 comments on commit a94eae3

Please sign in to comment.