I'm downloading the ADT dataset in a poor network environment. Large files sometimes fail with a 404 error, and the partially downloaded file is then deleted, so the download has to start from scratch. I asked GPT to modify the download code so that interrupted downloads can be resumed:
```python
import os
import shutil
from pathlib import Path
from typing import Tuple
from zipfile import ZipFile, is_zipfile

import requests
from requests.adapters import HTTPAdapter
from tqdm import tqdm
from urllib3.util.retry import Retry

# CHUCK_SIZE_BYTE, STATUS_CODE_DEFAULT, MPS_DATA_TYPE_TO_SAVE_PATH and
# calculate_file_sha1 come from the original downloader module.


def _DatasetDownloader__download_data_from_url(
    self,
    sequence: str,
    data_type: str,
    output_folder: str,
) -> Tuple[bool, int]:
    status_code = STATUS_CODE_DEFAULT
    is_success = True
    sequence_data = self.sequences_data[sequence]
    if data_type not in sequence_data:
        print(f"Data type {data_type} is not available for sequence: {sequence}")
        return is_success, status_code
    network_link = sequence_data[data_type][self._DatasetDownloader__KEY_URL]
    if not network_link:
        print(
            f"Url for sequence '{sequence}', data type '{data_type}' is not available"
        )
        return is_success, status_code
    try:
        download_filename = sequence_data[data_type][
            self._DatasetDownloader__KEY_FILENAME
        ]
        # Download to a .part file so an interrupted transfer can be resumed.
        download_file_path = os.path.join(output_folder, download_filename + ".part")
        final_file_path = os.path.join(output_folder, download_filename)
        sha1sum = sequence_data[data_type][self._DatasetDownloader__KEY_CHECKSUM]
        existing_size = 0
        headers = {}
        if os.path.exists(download_file_path):
            existing_size = os.path.getsize(download_file_path)
            # We no longer check the checksum for partial files.
            print(f"Resuming download of {download_filename} from byte {existing_size}")
            headers["Range"] = f"bytes={existing_size}-"
        session = requests.Session()
        retries = Retry(
            total=5,
            backoff_factor=2,  # exponential backoff between retries
            status_forcelist=[429, 500, 502, 503, 504],
        )
        session.mount("https://", HTTPAdapter(max_retries=retries))
        with session.get(network_link, stream=True, timeout=10, headers=headers) as r:
            r.raise_for_status()
            # If the server ignored the Range header (200 instead of 206),
            # restart from scratch instead of appending duplicate bytes.
            if existing_size > 0 and r.status_code != 206:
                print("Server does not support resume; restarting download")
                existing_size = 0
            total_size_in_bytes = int(r.headers.get("content-length", 0)) + existing_size
            _, _, free_disk_in_bytes = shutil.disk_usage("/")
            if free_disk_in_bytes < total_size_in_bytes:
                raise Exception(
                    f"Insufficient disk space for sequence '{sequence}' data type "
                    f"'{data_type}'. Required {total_size_in_bytes} bytes, "
                    f"available {free_disk_in_bytes} bytes"
                )
            mode = "ab" if existing_size > 0 else "wb"
            with open(download_file_path, mode) as f, tqdm(
                desc=f"Downloading {download_filename}",
                initial=existing_size,
                total=total_size_in_bytes,
                unit="iB",
                unit_scale=True,
            ) as progress_bar:
                for chunk in r.iter_content(chunk_size=CHUCK_SIZE_BYTE):
                    progress_bar.update(len(chunk))
                    f.write(chunk)
            status_code = r.status_code
        calculated_checksum = calculate_file_sha1(download_file_path)
        if sha1sum != calculated_checksum:
            error = (
                f"different checksum value for sequence '{sequence}' "
                f"and data type '{data_type}'\n"
                f"Calculated checksum: {calculated_checksum}, "
                f"expected checksum: {sha1sum}"
            )
            os.remove(download_file_path)  # remove the corrupted partial file
            raise Exception(error)
        # Rename .part to the final filename only once the checksum matches.
        os.rename(download_file_path, final_file_path)
        if is_zipfile(final_file_path):
            # unzip and reorganize
            with ZipFile(final_file_path) as zip_ref:
                unzipped_top_dir_name = sequence
                unzipped_dir = os.path.join(output_folder, unzipped_top_dir_name)
                if not os.path.exists(unzipped_dir):
                    os.makedirs(unzipped_dir)
                if data_type in MPS_DATA_TYPE_TO_SAVE_PATH:
                    # Move MPS files into their expected subfolder.
                    zip_ref.extractall(output_folder)
                    for local_path in zip_ref.namelist():
                        abs_path = os.path.join(output_folder, local_path)
                        if os.path.isfile(abs_path):
                            move_filename = os.path.basename(abs_path)
                            move_path = os.path.join(
                                Path(abs_path).parent,
                                MPS_DATA_TYPE_TO_SAVE_PATH[data_type],
                            )
                            os.makedirs(move_path, exist_ok=True)
                            shutil.move(
                                abs_path, os.path.join(move_path, move_filename)
                            )
                else:
                    zip_ref.extractall(output_folder)
        # Non-zip files need no copy: they are already at the final path.
    except Exception as e:
        print(f"An error occurred: {e}. Status code: {status_code}")
        is_success = False
    return is_success, status_code
```
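If it helps anyone, the patched function can be applied without editing the installed package. Here is a minimal sketch, assuming the class is named `DatasetDownloader` (inferred from the name-mangled attribute accesses above); the import path is a placeholder, not the real module name:

```python
# Hypothetical usage: replace the import path with wherever DatasetDownloader
# actually lives in your projectaria_tools installation.
from dataset_downloader import DatasetDownloader  # placeholder import

# Because of Python name mangling, the private method is stored under the
# mangled attribute name, so assigning to it swaps in the patched version.
DatasetDownloader._DatasetDownloader__download_data_from_url = (
    _DatasetDownloader__download_data_from_url
)
```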
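Note that the resume logic only helps if the server actually honors `Range` requests, i.e. answers with 206 Partial Content rather than 200 with the full body. A small self-contained probe to check that for a given URL; `supports_resume` is a hypothetical helper, not part of the downloader:

```python
import requests


def supports_resume(url: str) -> bool:
    """Return True if the server answers a byte-range request with 206."""
    # Ask for the first two bytes only; servers that honor Range reply 206,
    # servers that ignore it reply 200 with the full body.
    with requests.get(
        url, headers={"Range": "bytes=0-1"}, stream=True, timeout=10
    ) as r:
        return r.status_code == 206
```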