diff --git a/galvasr2/align/export.py b/galvasr2/align/export.py index 71b1bddf..5409ca73 100644 --- a/galvasr2/align/export.py +++ b/galvasr2/align/export.py @@ -93,13 +93,13 @@ def get_sample_size(population_size): margin_of_error = 0.01 fraction_picking = 0.50 z_score = 2.58 # Corresponds to confidence level 99% - numerator = (z_score ** 2 * fraction_picking * (1 - fraction_picking)) / ( - margin_of_error ** 2 + numerator = (z_score**2 * fraction_picking * (1 - fraction_picking)) / ( + margin_of_error**2 ) sample_size = 0 for train_size in range(population_size, 0, -1): - denominator = 1 + (z_score ** 2 * fraction_picking * (1 - fraction_picking)) / ( - margin_of_error ** 2 * train_size + denominator = 1 + (z_score**2 * fraction_picking * (1 - fraction_picking)) / ( + margin_of_error**2 * train_size ) sample_size = int(numerator / denominator) if 2 * sample_size + train_size <= population_size: diff --git a/galvasr2/align/spark/align_cuda_decoder.py b/galvasr2/align/spark/align_cuda_decoder.py index a0d9c61d..baaf7117 100644 --- a/galvasr2/align/spark/align_cuda_decoder.py +++ b/galvasr2/align/spark/align_cuda_decoder.py @@ -147,7 +147,7 @@ def main(argv): mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf( "SC_PHYS_PAGES" ) # e.g. 4015976448 - mem_gib = int((mem_bytes / (1024.0 ** 3)) * 0.9) + mem_gib = int((mem_bytes / (1024.0**3)) * 0.9) (jar_path,) = galvasr2.__path__ jars = ",".join(glob.glob(os.path.join(jar_path, "*.jar"))) print("GALVEZ:jars=", jars) diff --git a/galvasr2/align/spark/align_lib_test.py b/galvasr2/align/spark/align_lib_test.py index 147e1b8a..e8a450ca 100644 --- a/galvasr2/align/spark/align_lib_test.py +++ b/galvasr2/align/spark/align_lib_test.py @@ -128,7 +128,7 @@ def setUpClass(cls): mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf( "SC_PHYS_PAGES" ) # e.g. 4015976448 - mem_gib = int((mem_bytes / (1024.0 ** 3)) * 0.9) + mem_gib = int((mem_bytes / (1024.0**3)) * 0.9) tar_jar = os.path.join( find_runfiles(), "__main__/galvasr2/spark/tar_spark_datasource.jar" ) diff --git a/scripts/archive.org/download_items.py b/scripts/archive.org/download_items.py index 3ba0e763..a236958e 100644 --- a/scripts/archive.org/download_items.py +++ b/scripts/archive.org/download_items.py @@ -27,7 +27,8 @@ # TODO: Consider adding AND NOT format:ASR to disclude speech recognition-based labels. # ALL_CAPTIONED_DATA=f"{LICENSE_WHITELIST} AND (mediatype:audio OR mediatype:movies) AND (closed_captioning:yes OR format:SubRip OR format:\"Web Video Text Tracks\") AND (NOT access-restricted-item:TRUE)", # NON_CAPTIONED_DATA_WITH_TEXT=f"{LICENSE_WHITELIST} AND (format:DjVuTXT AND format:MP3 AND NOT format:SubRip) AND NOT (subject:'librivox')", - CC_BY_SA_EXPANDED_LICENSES_FILTERED_ACCESS=f'{LICENSE_WHITELIST} AND (mediatype:audio OR mediatype:movies) AND (closed_captioning:yes OR format:SubRip OR format:"Web Video Text Tracks") AND (NOT access-restricted-item:TRUE)', + # CC_BY_SA_EXPANDED_LICENSES_FILTERED_ACCESS=f'{LICENSE_WHITELIST} AND (mediatype:audio OR mediatype:movies) AND (closed_captioning:yes OR format:SubRip OR format:"Web Video Text Tracks") AND (NOT access-restricted-item:TRUE)', + CC_BY_SA_ALL_AUDIO_LABELED_OR_UNLABELED=f"{LICENSE_WHITELIST} AND (mediatype:audio OR mediatype:movies) AND (NOT access-restricted-item:TRUE)", ) @@ -89,10 +90,11 @@ def get_metadata(result: dict): print(query) print(f"Dumping metadata for {key}") save_file = key + ".jsonl.gz" - # download_metadata(query, save_file) + download_metadata(query, save_file) download_data( save_file, - f"gs://the-peoples-speech-west-europe/archive_org/Mar_7_2021/{key}", + # Ryan: Change this output path + f"download_output/{key}", ) # download_data("CAPTIONED_DATA.jsonl.gz", "gs://the-peoples-speech-west-europe/archive_org/Aug_18_2020/CAPTIONED_DATA")