Please support resume download for large files! #177

Open · Dancing-Github opened this issue Jan 16, 2025 · 0 comments

Dancing-Github commented Jan 16, 2025

I'm downloading the ADT dataset in a poor network environment. Downloads of large files sometimes fail with a 404 error, and the partially downloaded data is then deleted. I've asked GPT to modify the download code to support resuming, like this:

# Imports needed by this method, in addition to the downloader module's own
# constants (STATUS_CODE_DEFAULT, CHUCK_SIZE_BYTE, MPS_DATA_TYPE_TO_SAVE_PATH)
# and its calculate_file_sha1 helper:
import os
import shutil
from pathlib import Path
from typing import Tuple
from zipfile import ZipFile, is_zipfile

import requests
from requests.adapters import HTTPAdapter
from tqdm import tqdm
from urllib3.util.retry import Retry


def _DatasetDownloader__download_data_from_url(
    self,
    sequence: str,
    data_type: str,
    output_folder: str,
) -> Tuple[bool, int]:
    status_code = STATUS_CODE_DEFAULT
    is_success = True
    sequence_data = self.sequences_data[sequence]

    if data_type not in sequence_data:
        print(f"Data type {data_type} is not available for sequence: {sequence}")
        return is_success, status_code

    network_link = sequence_data[data_type][self._DatasetDownloader__KEY_URL]
    if not network_link:
        print(
            f"Url for sequence '{sequence}', data type '{data_type}' is not available"
        )
        return is_success, status_code

    try:
        download_filename = sequence_data[data_type][self._DatasetDownloader__KEY_FILENAME]
        download_file_path = os.path.join(output_folder, download_filename + ".part")
        final_file_path = os.path.join(output_folder, download_filename)
        sha1sum = sequence_data[data_type][self._DatasetDownloader__KEY_CHECKSUM]

        existing_size = 0
        headers = {}

        if os.path.exists(download_file_path):
            existing_size = os.path.getsize(download_file_path)
            # We no longer check the checksum for partial files.
            print(f"Resuming download of {download_filename} from byte {existing_size}")
            headers['Range'] = f'bytes={existing_size}-'
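            # A ranged GET asks the server to send only the missing suffix;
            # a server that honors it replies 206 Partial Content.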
        
        session = requests.Session()
        retries = Retry(
            total=5,
            backoff_factor=2,  # exponential backoff: longest sleep between retries is 2 * 2**4 = 32 s
            status_forcelist=[429, 500, 502, 503, 504],
        )
        session.mount("https://", HTTPAdapter(max_retries=retries))
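        # Retries apply to requests made over https://; transient 429/5xx
        # responses are retried automatically with backoff before the
        # download is treated as failed.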

        with session.get(network_link, stream=True, timeout=10, headers=headers) as r:
            r.raise_for_status()
            # If the server ignored the Range header (200 OK instead of
            # 206 Partial Content), restart from scratch rather than
            # appending duplicate bytes to the .part file.
            if existing_size > 0 and r.status_code != 206:
                existing_size = 0
            total_size_in_bytes = int(r.headers.get("content-length", 0)) + existing_size

            _, _, free_disk_in_bytes = shutil.disk_usage(output_folder)
            if free_disk_in_bytes < total_size_in_bytes:
                raise Exception(
                    f"Insufficient disk space for sequence '{sequence}' data type '{data_type}'. "
                    f"Required {total_size_in_bytes} bytes, available {free_disk_in_bytes} bytes"
                )

            mode = 'ab' if existing_size > 0 else 'wb'
            with open(download_file_path, mode) as f:
                progress_bar = tqdm(
                    desc=f"Downloading {download_filename}",
                    initial=existing_size,
                    total=total_size_in_bytes,
                    unit="iB",
                    unit_scale=True,
                )
                for chunk in r.iter_content(chunk_size=CHUCK_SIZE_BYTE):
                    progress_bar.update(len(chunk))
                    f.write(chunk)
                progress_bar.close()
            status_code = r.status_code

        calculated_checksum = calculate_file_sha1(download_file_path)
        if sha1sum != calculated_checksum:
            error = f"different checksum value for sequence '{sequence}' and data type '{data_type}'\n"
            error += f"Calculated checksum: {calculated_checksum}, expected checksum: {sha1sum}"
            os.remove(download_file_path)  # Remove the corrupt .part file so the next attempt starts clean.
            raise Exception(error)

        os.rename(download_file_path, final_file_path)  # Rename .part to final filename once complete

        if is_zipfile(final_file_path):
            # unzip and reorganize
            with ZipFile(final_file_path) as zip_ref:
                unzipped_top_dir_name = sequence
                unzipped_dir = os.path.join(
                    output_folder, unzipped_top_dir_name
                )
                if not os.path.exists(unzipped_dir):
                    os.makedirs(unzipped_dir)

                # MOVE MPS FILES
                if data_type in MPS_DATA_TYPE_TO_SAVE_PATH.keys():
                    zip_ref.extractall(output_folder)
                    for local_path in zip_ref.namelist():
                        abs_path = os.path.join(output_folder, local_path)
                        if os.path.isfile(abs_path):
                            move_filename = os.path.basename(abs_path)
                            move_path = os.path.join(
                                Path(abs_path).parent,
                                MPS_DATA_TYPE_TO_SAVE_PATH[data_type],
                            )
                            os.makedirs(move_path, exist_ok=True)
                            shutil.move(
                                abs_path, os.path.join(move_path, move_filename)
                            )
                else:
                    zip_ref.extractall(output_folder)
        else:
            # No need to copy since we've already renamed it to the final path.
            pass
    except Exception as e:
        print(f"An error occurred: {e}. Status code: {status_code}")
        is_success = False

    return is_success, status_code
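
For reference, here is the same resume technique in isolation: a minimal, self-contained sketch using only requests and os. The URL, file names, and the download_with_resume helper are placeholders for illustration, not part of the actual downloader:

import os
import requests

def download_with_resume(url: str, dest_path: str, chunk_size: int = 8192) -> None:
    """Download url to dest_path, resuming from a .part file if one exists."""
    part_path = dest_path + ".part"
    existing_size = os.path.getsize(part_path) if os.path.exists(part_path) else 0

    # Ask only for the bytes we are missing; servers that honor the Range
    # header reply 206 Partial Content.
    headers = {"Range": f"bytes={existing_size}-"} if existing_size else {}
    with requests.get(url, stream=True, timeout=10, headers=headers) as r:
        r.raise_for_status()
        # Fall back to a full download if the server ignored the Range header.
        if existing_size and r.status_code != 206:
            existing_size = 0
        mode = "ab" if existing_size else "wb"
        with open(part_path, mode) as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)

    # Only reached when the stream finished without raising, so the .part
    # file is complete and can take its final name.
    os.rename(part_path, dest_path)

# Placeholder usage: an interrupted run picks up where it left off.
download_with_resume("https://example.com/big_file.zip", "big_file.zip")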