I'm downloading the ADT dataset in a poor network environment. Large files sometimes fail with a 404 error, and the partially downloaded file is then deleted, so the download has to start from scratch. I asked GPT to modify the download code so that interrupted downloads can be resumed:
```python
import os
import shutil
from pathlib import Path
from typing import Tuple
from zipfile import ZipFile, is_zipfile

import requests
from requests.adapters import HTTPAdapter
from tqdm import tqdm
from urllib3.util.retry import Retry

# CHUCK_SIZE_BYTE, STATUS_CODE_DEFAULT, MPS_DATA_TYPE_TO_SAVE_PATH and
# calculate_file_sha1 come from the original downloader module.


def _DatasetDownloader__download_data_from_url(
    self,
    sequence: str,
    data_type: str,
    output_folder: str,
) -> Tuple[bool, int]:
    status_code = STATUS_CODE_DEFAULT
    is_success = True
    sequence_data = self.sequences_data[sequence]
    if data_type not in sequence_data:
        print(f"Data type {data_type} is not available for sequence: {sequence}")
        return is_success, status_code
    network_link = sequence_data[data_type][self._DatasetDownloader__KEY_URL]
    if not network_link:
        print(
            f"Url for sequence '{sequence}', data type '{data_type}' is not available"
        )
        return is_success, status_code
    try:
        download_filename = sequence_data[data_type][
            self._DatasetDownloader__KEY_FILENAME
        ]
        # Download to a .part file so an interrupted transfer can be resumed.
        download_file_path = os.path.join(output_folder, download_filename + ".part")
        final_file_path = os.path.join(output_folder, download_filename)
        sha1sum = sequence_data[data_type][self._DatasetDownloader__KEY_CHECKSUM]
        existing_size = 0
        headers = {}
        if os.path.exists(download_file_path):
            existing_size = os.path.getsize(download_file_path)
            # We no longer check the checksum for partial files.
            print(f"Resuming download of {download_filename} from byte {existing_size}")
            headers["Range"] = f"bytes={existing_size}-"
        session = requests.Session()
        retries = Retry(
            total=5,
            backoff_factor=2,  # exponential backoff between retries
            status_forcelist=[429, 500, 502, 503, 504],
        )
        session.mount("https://", HTTPAdapter(max_retries=retries))
        with session.get(network_link, stream=True, timeout=10, headers=headers) as r:
            r.raise_for_status()
            # If the server ignored the Range header (200 instead of 206),
            # restart from scratch instead of appending duplicate bytes.
            if existing_size > 0 and r.status_code != 206:
                print("Server does not support resume; restarting download")
                existing_size = 0
            total_size_in_bytes = int(r.headers.get("content-length", 0)) + existing_size
            _, _, free_disk_in_bytes = shutil.disk_usage("/")
            if free_disk_in_bytes < total_size_in_bytes:
                raise Exception(
                    f"Insufficient disk space for sequence '{sequence}' data type "
                    f"'{data_type}'. Required {total_size_in_bytes} bytes, "
                    f"available {free_disk_in_bytes} bytes"
                )
            mode = "ab" if existing_size > 0 else "wb"
            with open(download_file_path, mode) as f, tqdm(
                desc=f"Downloading {download_filename}",
                initial=existing_size,
                total=total_size_in_bytes,
                unit="iB",
                unit_scale=True,
            ) as progress_bar:
                for chunk in r.iter_content(chunk_size=CHUCK_SIZE_BYTE):
                    progress_bar.update(len(chunk))
                    f.write(chunk)
            status_code = r.status_code
        calculated_checksum = calculate_file_sha1(download_file_path)
        if sha1sum != calculated_checksum:
            error = (
                f"different checksum value for sequence '{sequence}' "
                f"and data type '{data_type}'\n"
                f"Calculated checksum: {calculated_checksum}, "
                f"expected checksum: {sha1sum}"
            )
            os.remove(download_file_path)  # remove the corrupted partial file
            raise Exception(error)
        # Rename .part to the final filename only once the checksum matches.
        os.rename(download_file_path, final_file_path)
        if is_zipfile(final_file_path):
            # unzip and reorganize
            with ZipFile(final_file_path) as zip_ref:
                unzipped_top_dir_name = sequence
                unzipped_dir = os.path.join(output_folder, unzipped_top_dir_name)
                if not os.path.exists(unzipped_dir):
                    os.makedirs(unzipped_dir)
                if data_type in MPS_DATA_TYPE_TO_SAVE_PATH:
                    # Move MPS files into their expected subfolder.
                    zip_ref.extractall(output_folder)
                    for local_path in zip_ref.namelist():
                        abs_path = os.path.join(output_folder, local_path)
                        if os.path.isfile(abs_path):
                            move_filename = os.path.basename(abs_path)
                            move_path = os.path.join(
                                Path(abs_path).parent,
                                MPS_DATA_TYPE_TO_SAVE_PATH[data_type],
                            )
                            os.makedirs(move_path, exist_ok=True)
                            shutil.move(
                                abs_path, os.path.join(move_path, move_filename)
                            )
                else:
                    zip_ref.extractall(output_folder)
        # Non-zip files need no copy: they are already at the final path.
    except Exception as e:
        print(f"An error occurred: {e}. Status code: {status_code}")
        is_success = False
    return is_success, status_code
```
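If it helps anyone, the patched function can be applied without editing the installed package. Here is a minimal sketch, assuming the class is named `DatasetDownloader` (inferred from the name-mangled attribute accesses above); the import path is a placeholder, not the real module name:

```python
# Hypothetical usage: replace the import path with wherever DatasetDownloader
# actually lives in your projectaria_tools installation.
from dataset_downloader import DatasetDownloader  # placeholder import

# Because of Python name mangling, the private method is stored under the
# mangled attribute name, so assigning to it swaps in the patched version.
DatasetDownloader._DatasetDownloader__download_data_from_url = (
    _DatasetDownloader__download_data_from_url
)
```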
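Note that the resume logic only helps if the server actually honors `Range` requests, i.e. answers with 206 Partial Content rather than 200 with the full body. A small self-contained probe to check that for a given URL; `supports_resume` is a hypothetical helper, not part of the downloader:

```python
import requests


def supports_resume(url: str) -> bool:
    """Return True if the server answers a byte-range request with 206."""
    # Ask for the first two bytes only; servers that honor Range reply 206,
    # servers that ignore it reply 200 with the full body.
    with requests.get(
        url, headers={"Range": "bytes=0-1"}, stream=True, timeout=10
    ) as r:
        return r.status_code == 206
```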