From efff647a332f9520e7b7d7565893bd16ab26e041 Mon Sep 17 00:00:00 2001 From: Hsin-Yuan Hsieh <84929237+Jerome-Hsieh@users.noreply.github.com> Date: Sat, 21 Dec 2024 22:18:23 +0800 Subject: [PATCH] enhance download_and_extract (#8216) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #5463 ### Description According to issue, the error messages are not very intuitive. I think maybe we can check if the file name matches the downloaded file’s base name before starting the download. If it doesn’t match, it will notify user. ### Types of changes - [x] Non-breaking change (fix or new feature that would not break existing functionality). - [ ] Breaking change (fix or new feature that would cause existing functionality to change). - [ ] New tests added to cover the changes. - [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`. - [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`. - [ ] In-line docstrings updated. - [ ] Documentation updated, tested `make html` command in the `docs/` folder. --------- Signed-off-by: jerome_Hsieh Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- monai/apps/utils.py | 39 +++++++++++++++++++++++++++++- tests/test_download_and_extract.py | 3 ++- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/monai/apps/utils.py b/monai/apps/utils.py index c2e17d3247..95c1450f2a 100644 --- a/monai/apps/utils.py +++ b/monai/apps/utils.py @@ -15,6 +15,7 @@ import json import logging import os +import re import shutil import sys import tarfile @@ -30,7 +31,9 @@ from monai.config.type_definitions import PathLike from monai.utils import look_up_option, min_version, optional_import +requests, has_requests = optional_import("requests") gdown, has_gdown = optional_import("gdown", "4.7.3") +BeautifulSoup, has_bs4 = optional_import("bs4", name="BeautifulSoup") if TYPE_CHECKING: from tqdm import tqdm @@ -298,6 +301,29 @@ def extractall( ) +def get_filename_from_url(data_url: str) -> str: + """ + Get the filename from the URL link. + """ + try: + response = requests.head(data_url, allow_redirects=True) + content_disposition = response.headers.get("Content-Disposition") + if content_disposition: + filename = re.findall('filename="?([^";]+)"?', content_disposition) + if filename: + return str(filename[0]) + if "drive.google.com" in data_url: + response = requests.get(data_url) + if "text/html" in response.headers.get("Content-Type", ""): + soup = BeautifulSoup(response.text, "html.parser") + filename_div = soup.find("span", {"class": "uc-name-size"}) + if filename_div: + return str(filename_div.find("a").text) + return _basename(data_url) + except Exception as e: + raise Exception(f"Error processing URL: {e}") from e + + def download_and_extract( url: str, filepath: PathLike = "", @@ -327,7 +353,18 @@ def download_and_extract( be False. progress: whether to display progress bar. """ + url_filename_ext = "".join(Path(get_filename_from_url(url)).suffixes) + filepath_ext = "".join(Path(_basename(filepath)).suffixes) + if filepath not in ["", "."]: + if filepath_ext == "": + new_filepath = Path(filepath).with_suffix(url_filename_ext) + logger.warning( + f"filepath={filepath}, which missing file extension. Auto-appending extension to: {new_filepath}" + ) + filepath = new_filepath + if filepath_ext and filepath_ext != url_filename_ext: + raise ValueError(f"File extension mismatch: expected extension {url_filename_ext}, but get {filepath_ext}") with tempfile.TemporaryDirectory() as tmp_dir: - filename = filepath or Path(tmp_dir, _basename(url)).resolve() + filename = filepath or Path(tmp_dir, get_filename_from_url(url)).resolve() download_url(url=url, filepath=filename, hash_val=hash_val, hash_type=hash_type, progress=progress) extractall(filepath=filename, output_dir=output_dir, file_type=file_type, has_base=has_base) diff --git a/tests/test_download_and_extract.py b/tests/test_download_and_extract.py index 555f7dc250..439a11bbc1 100644 --- a/tests/test_download_and_extract.py +++ b/tests/test_download_and_extract.py @@ -20,9 +20,10 @@ from parameterized import parameterized from monai.apps import download_and_extract, download_url, extractall -from tests.utils import skip_if_downloading_fails, skip_if_quick, testing_data_config +from tests.utils import SkipIfNoModule, skip_if_downloading_fails, skip_if_quick, testing_data_config +@SkipIfNoModule("requests") class TestDownloadAndExtract(unittest.TestCase): @skip_if_quick