Skip to content

Commit

Permalink
Added arguments against common pitfalls
Browse files Browse the repository at this point in the history
  • Loading branch information
TevenLeScao committed Feb 20, 2023
1 parent dd3c9ae commit 6a31b32
Showing 1 changed file with 28 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def parseArgs():
parser.add_argument(
"--config_name",
type=str,
-default="unshuffled_deduplicated_af",
+default=None,
help="Name of the dataset config to pass.",
)
parser.add_argument(
Expand Down Expand Up @@ -92,6 +92,17 @@ def parseArgs():
default="ac_dc/af.arpa.bin",
help="Path to the KenLM model used to compute perplexity scores.",
)
parser.add_argument(
"--max_len_prefilter",
type=int,
default=None,
help="Maximum length of documents to keep. Longer documents might crash the pipeline.",
)
parser.add_argument(
"--remove_meta",
action="store_true",
help="Only keep text column in dataset.",
)
parser.add_argument(
"--num_proc",
type=int,
Expand Down Expand Up @@ -119,6 +130,15 @@ def main():
use_auth_token=True,
)

if args.remove_meta:
dataset = dataset.remove_columns([column for column in dataset.column_names if column != "text"])

print("Filtering too-long documents")
if args.max_len_prefilter is not None:
dataset = dataset.filter(lambda x: len(x["text"]) < args.max_len_prefilter,
num_proc=check_num_proc(args.num_proc))
print("Too-long documents filtered")

dataset_filtering = DatasetFiltering(
dataset=dataset,
lang_dataset_id=args.lang_dataset_id,
Expand All @@ -128,9 +148,16 @@ def main():
num_proc=check_num_proc(args.num_proc),
path_dir_save_dataset=args.path_dir_save_dataset,
)

print("Modifying documents")
dataset_filtering.modifying_documents()
print("Modifying step done")
print("Filtering documents")
dataset_filtering.filtering()
print("Filtering step done")
print("Saving dataset")
dataset_filtering.save_dataset()
print("Dataset saved")


if __name__ == "__main__":
Expand Down

0 comments on commit 6a31b32

Please sign in to comment.