diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index fc05a08c7c..2e816292c1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,13 +15,13 @@ repos:

   # python code formatting
   - repo: https://github.com/psf/black
-    rev: 23.1.0
+    rev: 23.3.0
    hooks:
       - id: black

   # Ruff version.
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: "v0.0.253"
+    rev: "v0.0.275"
     hooks:
       - id: ruff
        exclude: "tests"
@@ -29,7 +29,7 @@ repos:

   # python static type checking
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: "v1.0.1"
+    rev: "v1.4.1"
     hooks:
       - id: mypy
         additional_dependencies: [types-PyYAML]
@@ -37,7 +37,7 @@ repos:

   # notebooks.
   - repo: https://github.com/nbQA-dev/nbQA
-    rev: 1.6.3
+    rev: 1.7.0
     hooks:
       - id: nbqa-black
       - id: nbqa-ruff
@@ -46,12 +46,12 @@ repos:
         args: ["--ignore=I001"]

   - repo: https://github.com/pre-commit/mirrors-prettier
-    rev: v3.0.0-alpha.4
+    rev: v3.0.0-alpha.9-for-vscode
     hooks:
       - id: prettier

   - repo: https://github.com/igorshubovych/markdownlint-cli
-    rev: v0.33.0
+    rev: v0.35.0
     hooks:
       - id: markdownlint

@@ -62,3 +62,9 @@ repos:
         name: Lint Dockerfiles
         description: Runs hadolint to lint Dockerfiles
         args: ["--ignore", "DL3008"]
+  - repo: https://github.com/PyCQA/bandit
+    rev: 1.7.5
+    hooks:
+      - id: bandit
+        args: ["-c", ".ci/ipas_default.config"]
+        additional_dependencies: ["bandit"]
diff --git a/pyproject.toml b/pyproject.toml
index e72fb7d30a..d25e846269 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -108,6 +108,7 @@ source = [
     ".tox/*/site-packages",
 ]

+
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 # NBQA CONFIGURATION                                                           #
 [tool.nbqa.addopts]
diff --git a/src/anomalib/data/utils/augmenter.py b/src/anomalib/data/utils/augmenter.py
index 611bde8374..5ca1a49846 100644
--- a/src/anomalib/data/utils/augmenter.py
+++ b/src/anomalib/data/utils/augmenter.py
@@ -97,8 +97,8 @@ def generate_perturbation(
         perlin_scale = 6
         min_perlin_scale = 0

-        perlin_scalex = 2 ** random.randint(min_perlin_scale, perlin_scale)
-        perlin_scaley = 2 ** random.randint(min_perlin_scale, perlin_scale)
+        perlin_scalex = 2 ** random.randint(min_perlin_scale, perlin_scale)  # nosec: B311
+        perlin_scaley = 2 ** random.randint(min_perlin_scale, perlin_scale)  # nosec: B311

         perlin_noise = random_2d_perlin((nextpow2(height), nextpow2(width)), (perlin_scalex, perlin_scaley))[
             :height, :width
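A note on the `# nosec: B311` markers that appear throughout this patch: bandit's B311 rule flags every use of the standard `random` module, since it is not cryptographically secure, and `# nosec` records that the finding was reviewed and accepted. A minimal sketch of the distinction being drawn (names and values here are illustrative, not from the codebase):

```python
import random   # pseudo-random: fine for augmentations, flagged by bandit B311
import secrets  # cryptographically secure: use for anything security-sensitive

# Non-security use, as in augmenter.py: predictability is harmless here,
# so the bandit finding is suppressed rather than "fixed".
perlin_scale = 2 ** random.randint(0, 6)  # nosec: B311

# Security-sensitive use: reach for `secrets` instead of silencing bandit.
session_token = secrets.token_hex(16)

print(perlin_scale, session_token)
```

Suppressing rather than switching to `secrets` is the right call for augmentations and tests, where reproducibility via seeding matters more than unpredictability.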
diff --git a/src/anomalib/data/utils/download.py b/src/anomalib/data/utils/download.py
index f667547764..60be931e57 100644
--- a/src/anomalib/data/utils/download.py
+++ b/src/anomalib/data/utils/download.py
@@ -9,10 +9,11 @@
 import io
 import logging
 import os
+import re
 import tarfile
 from dataclasses import dataclass
 from pathlib import Path
-from tarfile import TarError, TarFile
+from tarfile import TarFile, TarInfo
 from typing import Iterable
 from urllib.request import urlretrieve
 from zipfile import ZipFile
@@ -205,6 +206,37 @@ def update_to(self, chunk_number: int = 1, max_chunk_size: int = 1, total_size=None):
         self.update(chunk_number * max_chunk_size - self.n)


+def is_file_potentially_dangerous(file_name: str) -> bool:
+    """Check if a file is potentially dangerous.
+
+    Args:
+        file_name (str): Filename.
+
+    Returns:
+        bool: True if the member is potentially dangerous, False otherwise.
+
+    """
+    # Some example criteria. We could expand this.
+    unsafe_patterns = ["/etc/", "/root/"]
+    for pattern in unsafe_patterns:
+        if re.search(pattern, file_name):
+            return True
+    return False
+
+
+def safe_extract(tar_file: TarFile, root: Path, members: list[TarInfo]) -> None:
+    """Extract safe members from a tar archive.
+
+    Args:
+        tar_file (TarFile): TarFile object.
+        root (Path): Root directory where the dataset will be stored.
+        members (List[TarInfo]): List of safe members to be extracted.
+
+    """
+    for member in members:
+        tar_file.extract(member, root)
+
+
 def hash_check(file_path: Path, expected_hash: str) -> None:
     """Raise assert error if hash does not match the calculated hash of the file.

@@ -214,7 +246,7 @@ def hash_check(file_path: Path, expected_hash: str) -> None:
     """
     with file_path.open("rb") as hash_file:
         assert (
-            hashlib.md5(hash_file.read()).hexdigest() == expected_hash
+            hashlib.new(name="md5", data=hash_file.read(), usedforsecurity=False).hexdigest() == expected_hash
         ), f"Downloaded file {file_path} does not match the required hash."


@@ -227,17 +259,26 @@ def extract(file_name: Path, root: Path) -> None:
     """
     logger.info("Extracting dataset into root folder.")
+
+    # Safely extract zip files
     if file_name.suffix == ".zip":
         with ZipFile(file_name, "r") as zip_file:
-            zip_file.extractall(root)
+            for file_info in zip_file.infolist():
+                if not is_file_potentially_dangerous(file_info.filename):
+                    zip_file.extract(file_info, root)
+
+    # Safely extract tar files.
     elif file_name.suffix in (".tar", ".gz", ".xz", ".tgz"):
         with tarfile.open(file_name) as tar_file:
-            safe_extract(tar_file, root)
+            members = tar_file.getmembers()
+            safe_members = [member for member in members if not is_file_potentially_dangerous(member.name)]
+            safe_extract(tar_file, root, safe_members)
+
     else:
         raise ValueError(f"Unrecognized file format: {file_name}")

     logger.info("Cleaning up files.")
-    (file_name).unlink()
+    file_name.unlink()


 def download_and_extract(root: Path, info: DownloadInfo) -> None:
@@ -286,19 +327,3 @@ def is_within_directory(directory: Path, target: Path):
     # TODO: replace with pathlib is_relative_to after switching to Python 3.10
     prefix = os.path.commonprefix([abs_directory, abs_target])
     return prefix == str(abs_directory)
-
-
-def safe_extract(tar_file: TarFile, path: str | Path = "."):
-    """Extract a tar file safely by first checking for attempted path traversal.
-
-    Args:
-        tar_file (TarFile): Tar file to be extracted
-        path (str | Path): path in which the extracted files will be placed
-    """
-    path = Path(path)
-    for member in tar_file.getmembers():
-        member_path = path / member.name
-        if not is_within_directory(path, member_path):
-            raise TarError("Attempted Path Traversal in Tar File")
-
-    tar_file.extractall(path)
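The zip and tar branches above filter member names through `is_file_potentially_dangerous` before extraction, while `is_within_directory` remains available to guard against `../` path traversal. A self-contained sketch of how the two checks can compose, using a throwaway in-memory archive (member names are invented for illustration):

```python
import io
import os
import tarfile
import tempfile
from pathlib import Path


def is_safe_member(member: tarfile.TarInfo, root: Path) -> bool:
    """Reject blocklisted names as well as ../ path traversal."""
    if any(pattern in member.name for pattern in ("/etc/", "/root/")):
        return False
    target = (root / member.name).resolve()
    # commonprefix-style containment check, mirroring is_within_directory
    return os.path.commonprefix([str(root.resolve()), str(target)]) == str(root.resolve())


# Build a small in-memory archive with one safe and one malicious member.
buffer = io.BytesIO()
with tarfile.open(fileobj=buffer, mode="w") as tar:
    for name in ("dataset/train.txt", "../escape.txt"):
        tar.addfile(tarfile.TarInfo(name), io.BytesIO(b""))

buffer.seek(0)
with tempfile.TemporaryDirectory() as tmp, tarfile.open(fileobj=buffer) as tar:
    root = Path(tmp)
    safe = [m for m in tar.getmembers() if is_safe_member(m, root)]
    tar.extractall(root, members=safe)  # members are pre-filtered above
    print([m.name for m in safe])  # ['dataset/train.txt']
```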
diff --git a/src/anomalib/models/ai_vad/clip/clip.py b/src/anomalib/models/ai_vad/clip/clip.py
index e5065f0b32..b2b85fe484 100644
--- a/src/anomalib/models/ai_vad/clip/clip.py
+++ b/src/anomalib/models/ai_vad/clip/clip.py
@@ -10,17 +10,20 @@
 # SPDX-License-Identifier: Apache-2.0

 import hashlib
+import logging
 import os
-import urllib
 import warnings
 from typing import List, Union
+from urllib.parse import urlparse

+import requests
 import torch
 from PIL import Image
 from pkg_resources import packaging
 from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor
 from tqdm import tqdm

+logger = logging.getLogger(__name__)
 from .model import build_model

 try:
@@ -50,36 +53,48 @@
 }


+def _verify_checksum(file_path: str, url: str) -> bool:
+    expected_sha256 = url.split("/")[-2]
+    sha256_hash = hashlib.sha256()
+
+    with open(file_path, "rb") as file:
+        for chunk in iter(lambda: file.read(4096), b""):
+            sha256_hash.update(chunk)
+
+    file_hash = sha256_hash.hexdigest()
+
+    return file_hash == expected_sha256
+
+
 def _download(url: str, root: str):
     os.makedirs(root, exist_ok=True)
-    filename = os.path.basename(url)
-
-    expected_sha256 = url.split("/")[-2]
+    filename = os.path.basename(urlparse(url).path)
     download_target = os.path.join(root, filename)

-    if os.path.exists(download_target) and not os.path.isfile(download_target):
-        raise RuntimeError(f"{download_target} exists and is not a regular file")
-
-    if os.path.isfile(download_target):
-        if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
+    if os.path.exists(download_target):
+        if not os.path.isfile(download_target):
+            raise FileExistsError(f"{download_target} exists and is not a regular file")
+        if _verify_checksum(download_target, url):
             return download_target
-        else:
-            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
-
-    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
-        with tqdm(
-            total=int(source.info().get("Content-Length")), ncols=80, unit="iB", unit_scale=True, unit_divisor=1024
-        ) as loop:
-            while True:
-                buffer = source.read(8192)
-                if not buffer:
-                    break
-
-                output.write(buffer)
-                loop.update(len(buffer))
-
-    if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
-        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match")
+
+        logger.warning("%s exists, but the checksum does not match; re-downloading the file", download_target)
+        os.remove(download_target)
+
+    response = requests.get(url, stream=True, timeout=10.0)  # Timeout is for bandit security linter
+    response.raise_for_status()
+
+    total_size = int(response.headers.get("Content-Length", 0))
+
+    with open(download_target, "wb") as file, tqdm(
+        total=total_size, ncols=80, unit="iB", unit_scale=True, unit_divisor=1024
+    ) as loop:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                file.write(chunk)
+                loop.update(len(chunk))
+
+    if not _verify_checksum(download_target, url):
+        raise RuntimeError("Model has been downloaded but the checksum does not match")

     return download_target
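The new `_verify_checksum` relies on OpenAI's CLIP checkpoint URLs embedding the expected SHA-256 digest as the second-to-last path segment (hence `url.split("/")[-2]`), and it hashes in 4 KiB chunks so large checkpoints are never read into memory at once. A rough, network-free sketch of the same chunked-verification idea (file contents are placeholders):

```python
import hashlib
import tempfile


def sha256_of_file(file_path: str, chunk_size: int = 4096) -> str:
    """Hash a file incrementally so large downloads never sit in RAM."""
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as file:
        for chunk in iter(lambda: file.read(chunk_size), b""):
            sha256_hash.update(chunk)
    return sha256_hash.hexdigest()


with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(b"placeholder checkpoint bytes")

expected = hashlib.sha256(b"placeholder checkpoint bytes").hexdigest()
assert sha256_of_file(tmp.name) == expected
print("checksum OK:", expected[:16], "...")
```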
diff --git a/src/anomalib/models/efficient_ad/torch_model.py b/src/anomalib/models/efficient_ad/torch_model.py
index acda83f43c..7e9a50ed25 100644
--- a/src/anomalib/models/efficient_ad/torch_model.py
+++ b/src/anomalib/models/efficient_ad/torch_model.py
@@ -278,8 +278,8 @@ def choose_random_aug_image(self, image: Tensor) -> Tensor:
             transforms.functional.adjust_saturation,
         ]
         # Sample an augmentation coefficient λ from the uniform distribution U(0.8, 1.2)
-        coefficient = random.uniform(0.8, 1.2)
-        transform_function = random.choice(transform_functions)
+        coefficient = random.uniform(0.8, 1.2)  # nosec: B311
+        transform_function = random.choice(transform_functions)  # nosec: B311
         return transform_function(image, coefficient)

     def forward(self, batch: Tensor, batch_imagenet: Tensor = None) -> Tensor | dict:
diff --git a/tests/pre_merge/utils/metrics/test_adaptive_threshold.py b/tests/pre_merge/utils/metrics/test_adaptive_threshold.py
index 4b78c2a324..799f7b23e6 100644
--- a/tests/pre_merge/utils/metrics/test_adaptive_threshold.py
+++ b/tests/pre_merge/utils/metrics/test_adaptive_threshold.py
@@ -46,8 +46,8 @@ def test_manual_threshold():
     config.metrics.image = ["F1Score"]
     config.metrics.pixel = ["F1Score"]

-    image_threshold = random.random()
-    pixel_threshold = random.random()
+    image_threshold = random.random()  # nosec: B311
+    pixel_threshold = random.random()  # nosec: B311
     config.metrics.threshold.manual_image = image_threshold
     config.metrics.threshold.manual_pixel = pixel_threshold
diff --git a/tools/benchmarking/utils/metrics.py b/tools/benchmarking/utils/metrics.py
index 7696fe0c73..b1b7373d65 100644
--- a/tools/benchmarking/utils/metrics.py
+++ b/tools/benchmarking/utils/metrics.py
@@ -87,7 +87,7 @@ def get_unique_key(str_len: int) -> str:
     Returns:
         str: Random string
     """
-    return "".join([random.choice(string.ascii_lowercase) for _ in range(str_len)])
+    return "".join([random.choice(string.ascii_lowercase) for _ in range(str_len)])  # nosec: B311


 def upload_to_wandb(
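Finally, a companion note to the `hash_check` change in `download.py` earlier in this patch: bandit's B324 rule flags weak digests such as MD5, and Python 3.9+ lets callers declare a non-security use instead of suppressing the finding. A small runnable sketch of the difference (the payload is made up):

```python
import hashlib

data = b"example dataset archive"

# Flagged by bandit B324: MD5 is cryptographically broken.
digest_flagged = hashlib.md5(data).hexdigest()

# Accepted: usedforsecurity=False (Python 3.9+) records that MD5 is only
# an integrity fingerprint here, matching the hash_check change above.
digest_ok = hashlib.new(name="md5", data=data, usedforsecurity=False).hexdigest()

assert digest_flagged == digest_ok
print(digest_ok)
```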