From 7c4bf1edf1017881060b1ba74ea39400d37311db Mon Sep 17 00:00:00 2001 From: akashin Date: Sat, 30 Dec 2017 20:12:06 +0000 Subject: [PATCH] [data] Add checks to avoid downloading the same file twice. --- download_utils.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/download_utils.py b/download_utils.py index 135bb9bd..e9b89e59 100644 --- a/download_utils.py +++ b/download_utils.py @@ -26,16 +26,19 @@ def download_file(url, file_path): print("Removed incomplete download") -def download_from_github(version, fn, target_dir): +def download_from_github(version, fn, target_dir, force=False): url = REPOSITORY_PATH + "/releases/download/{0}/{1}".format(version, fn) file_path = os.path.join(target_dir, fn) + if os.path.exists(file_path) and not force: + print("File {} is already downloaded.".format(file_path)) + return download_file(url, file_path) -def sequential_downloader(version, fns, target_dir): +def sequential_downloader(version, fns, target_dir, force=False): os.makedirs(target_dir, exist_ok=True) for fn in fns: - download_from_github(version, fn, target_dir) + download_from_github(version, fn, target_dir, force=force) def link_all_files_from_dir(src_dir, dst_dir): @@ -54,7 +57,7 @@ def link_resources(): link_all_files_from_dir("../readonly/dataset/", ".") -def download_week1_resources(): +def download_week1_resources(force=False): sequential_downloader( "week1", [ @@ -63,11 +66,12 @@ def download_week1_resources(): "test.tsv", "text_prepare_tests.tsv", ], - "data" + "data", + force=force ) -def download_week2_resources(): +def download_week2_resources(force=False): sequential_downloader( "week2", [ @@ -75,5 +79,6 @@ def download_week2_resources(): "validation.txt", "test.txt", ], - "data" + "data", + force=force )