🚜 Address bandit issues (openvinotoolkit#1152)

* Fix metadata path
* Address bandit issues
* Address codacy issues
* Changed the bandit configuration file
* Address PR comments
* bandit fix.
orobix · Jul 20, 2023 · 9323985 · 9323985
1 parent 2083c51
commit 9323985
Show file tree

Hide file tree

Showing 8 changed files with 107 additions and 60 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -15,29 +15,29 @@ repos:
 
   # python code formatting
   - repo: https://github.com/psf/black
-    rev: 23.1.0
+    rev: 23.3.0
     hooks:
       - id: black
 
   # Ruff version.
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: "v0.0.253"
+    rev: "v0.0.275"
     hooks:
       - id: ruff
         exclude: "tests"
         args: ["--fix"]
 
   # python static type checking
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: "v1.0.1"
+    rev: "v1.4.1"
     hooks:
       - id: mypy
         additional_dependencies: [types-PyYAML]
         exclude: "tests"
 
   # notebooks.
   - repo: https://github.com/nbQA-dev/nbQA
-    rev: 1.6.3
+    rev: 1.7.0
     hooks:
       - id: nbqa-black
       - id: nbqa-ruff
@@ -46,12 +46,12 @@ repos:
         args: ["--ignore=I001"]
 
   - repo: https://github.com/pre-commit/mirrors-prettier
-    rev: v3.0.0-alpha.4
+    rev: v3.0.0-alpha.9-for-vscode
     hooks:
       - id: prettier
 
   - repo: https://github.com/igorshubovych/markdownlint-cli
-    rev: v0.33.0
+    rev: v0.35.0
     hooks:
       - id: markdownlint
 
@@ -62,3 +62,9 @@ repos:
         name: Lint Dockerfiles
         description: Runs hadolint to lint Dockerfiles
         args: ["--ignore", "DL3008"]
+  - repo: https://github.com/PyCQA/bandit
+    rev: 1.7.5
+    hooks:
+      - id: bandit
+        args: ["-c", ".ci/ipas_default.config"]
+        additional_dependencies: ["bandit"]
diff --git a/pyproject.toml b/pyproject.toml
@@ -108,6 +108,7 @@ source = [
     ".tox/*/site-packages",
 ]
 
+
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 # NBQA CONFIGURATION                                                          #
 [tool.nbqa.addopts]

diff --git a/src/anomalib/data/utils/augmenter.py b/src/anomalib/data/utils/augmenter.py
@@ -97,8 +97,8 @@ def generate_perturbation(
         perlin_scale = 6
         min_perlin_scale = 0
 
-        perlin_scalex = 2 ** random.randint(min_perlin_scale, perlin_scale)
-        perlin_scaley = 2 ** random.randint(min_perlin_scale, perlin_scale)
+        perlin_scalex = 2 ** random.randint(min_perlin_scale, perlin_scale)  # nosec: B311
+        perlin_scaley = 2 ** random.randint(min_perlin_scale, perlin_scale)  # nosec: B311
 
         perlin_noise = random_2d_perlin((nextpow2(height), nextpow2(width)), (perlin_scalex, perlin_scaley))[
             :height, :width

diff --git a/src/anomalib/data/utils/download.py b/src/anomalib/data/utils/download.py
@@ -9,10 +9,11 @@
 import io
 import logging
 import os
+import re
 import tarfile
 from dataclasses import dataclass
 from pathlib import Path
-from tarfile import TarError, TarFile
+from tarfile import TarFile, TarInfo
 from typing import Iterable
 from urllib.request import urlretrieve
 from zipfile import ZipFile
@@ -205,6 +206,37 @@ def update_to(self, chunk_number: int = 1, max_chunk_size: int = 1, total_size=N
         self.update(chunk_number * max_chunk_size - self.n)
 
 
+def is_file_potentially_dangerous(file_name: str) -> bool:
+    """Check if a file is potentially dangerous.
+
+    Args:
+        file_name (str): Filename.
+
+    Returns:
+        bool: True if the member is potentially dangerous, False otherwise.
+
+    """
+    # Some example criteria. We could expand this.
+    unsafe_patterns = ["/etc/", "/root/"]
+    for pattern in unsafe_patterns:
+        if re.search(pattern, file_name):
+            return True
+    return False
+
+
+def safe_extract(tar_file: TarFile, root: Path, members: list[TarInfo]) -> None:
+    """Extract safe members from a tar archive.
+
+    Args:
+        tar_file (TarFile): TarFile object.
+        root (Path): Root directory where the dataset will be stored.
+        members (List[TarInfo]): List of safe members to be extracted.
+
+    """
+    for member in members:
+        tar_file.extract(member, root)
+
+
 def hash_check(file_path: Path, expected_hash: str) -> None:
     """Raise assert error if hash does not match the calculated hash of the file.
 
@@ -214,7 +246,7 @@ def hash_check(file_path: Path, expected_hash: str) -> None:
     """
     with file_path.open("rb") as hash_file:
         assert (
-            hashlib.md5(hash_file.read()).hexdigest() == expected_hash
+            hashlib.new(name="md5", data=hash_file.read(), usedforsecurity=False).hexdigest() == expected_hash
         ), f"Downloaded file {file_path} does not match the required hash."
 
 
@@ -227,17 +259,26 @@ def extract(file_name: Path, root: Path) -> None:
 
     """
     logger.info("Extracting dataset into root folder.")
+
+    # Safely extract zip files
     if file_name.suffix == ".zip":
         with ZipFile(file_name, "r") as zip_file:
-            zip_file.extractall(root)
+            for file_info in zip_file.infolist():
+                if not is_file_potentially_dangerous(file_info.filename):
+                    zip_file.extract(file_info, root)
+
+    # Safely extract tar files.
     elif file_name.suffix in (".tar", ".gz", ".xz", ".tgz"):
         with tarfile.open(file_name) as tar_file:
-            safe_extract(tar_file, root)
+            members = tar_file.getmembers()
+            safe_members = [member for member in members if not is_file_potentially_dangerous(member.name)]
+            safe_extract(tar_file, root, safe_members)
+
     else:
         raise ValueError(f"Unrecognized file format: {file_name}")
 
     logger.info("Cleaning up files.")
-    (file_name).unlink()
+    file_name.unlink()
 
 
 def download_and_extract(root: Path, info: DownloadInfo) -> None:
@@ -286,19 +327,3 @@ def is_within_directory(directory: Path, target: Path):
     # TODO: replace with pathlib is_relative_to after switching to Python 3.10
     prefix = os.path.commonprefix([abs_directory, abs_target])
     return prefix == str(abs_directory)
-
-
-def safe_extract(tar_file: TarFile, path: str | Path = "."):
-    """Extract a tar file safely by first checking for attempted path traversal.
-
-    Args:
-        tar_file (TarFile): Tar file to be extracted
-        path (str | Path): path in which the extracted files will be placed
-    """
-    path = Path(path)
-    for member in tar_file.getmembers():
-        member_path = path / member.name
-        if not is_within_directory(path, member_path):
-            raise TarError("Attempted Path Traversal in Tar File")
-
-    tar_file.extractall(path)
diff --git a/src/anomalib/models/ai_vad/clip/clip.py b/src/anomalib/models/ai_vad/clip/clip.py
@@ -10,17 +10,20 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import hashlib
+import logging
 import os
-import urllib
 import warnings
 from typing import List, Union
+from urllib.parse import urlparse
 
+import requests
 import torch
 from PIL import Image
 from pkg_resources import packaging
 from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor
 from tqdm import tqdm
 
+logger = logging.getLogger(__name__)
 from .model import build_model
 
 try:
@@ -50,36 +53,48 @@
 }
 
 
+def _verify_checksum(file_path: str, url: str) -> bool:
+    expected_sha256 = url.split("/")[-2]
+    sha256_hash = hashlib.sha256()
+
+    with open(file_path, "rb") as file:
+        for chunk in iter(lambda: file.read(4096), b""):
+            sha256_hash.update(chunk)
+
+    file_hash = sha256_hash.hexdigest()
+
+    return file_hash == expected_sha256
+
+
 def _download(url: str, root: str):
     os.makedirs(root, exist_ok=True)
-    filename = os.path.basename(url)
-
-    expected_sha256 = url.split("/")[-2]
+    filename = os.path.basename(urlparse(url).path)
     download_target = os.path.join(root, filename)
 
-    if os.path.exists(download_target) and not os.path.isfile(download_target):
-        raise RuntimeError(f"{download_target} exists and is not a regular file")
-
-    if os.path.isfile(download_target):
-        if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
+    if os.path.exists(download_target):
+        if not os.path.isfile(download_target):
+            raise FileExistsError(f"{download_target} exists and is not a regular file")
+        if _verify_checksum(download_target, url):
             return download_target
-        else:
-            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
-
-    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
-        with tqdm(
-            total=int(source.info().get("Content-Length")), ncols=80, unit="iB", unit_scale=True, unit_divisor=1024
-        ) as loop:
-            while True:
-                buffer = source.read(8192)
-                if not buffer:
-                    break
-
-                output.write(buffer)
-                loop.update(len(buffer))
-
-    if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
-        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match")
+
+        logger.warning("%s exists, but the checksum does not match; re-downloading the file", download_target)
+        os.remove(download_target)
+
+    response = requests.get(url, stream=True, timeout=10.0)  # Timeout is for bandit security linter
+    response.raise_for_status()
+
+    total_size = int(response.headers.get("Content-Length", 0))
+
+    with open(download_target, "wb") as file, tqdm(
+        total=total_size, ncols=80, unit="iB", unit_scale=True, unit_divisor=1024
+    ) as loop:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                file.write(chunk)
+                loop.update(len(chunk))
+
+    if not _verify_checksum(download_target, url):
+        raise RuntimeError("Model has been downloaded but the checksum does not match")
 
     return download_target
 

diff --git a/src/anomalib/models/efficient_ad/torch_model.py b/src/anomalib/models/efficient_ad/torch_model.py
@@ -278,8 +278,8 @@ def choose_random_aug_image(self, image: Tensor) -> Tensor:
             transforms.functional.adjust_saturation,
         ]
         # Sample an augmentation coefficient λ from the uniform distribution U(0.8, 1.2)
-        coefficient = random.uniform(0.8, 1.2)
-        transform_function = random.choice(transform_functions)
+        coefficient = random.uniform(0.8, 1.2)  # nosec: B311
+        transform_function = random.choice(transform_functions)  # nosec: B311
         return transform_function(image, coefficient)
 
     def forward(self, batch: Tensor, batch_imagenet: Tensor = None) -> Tensor | dict:

diff --git a/tests/pre_merge/utils/metrics/test_adaptive_threshold.py b/tests/pre_merge/utils/metrics/test_adaptive_threshold.py
@@ -46,8 +46,8 @@ def test_manual_threshold():
     config.metrics.image = ["F1Score"]
     config.metrics.pixel = ["F1Score"]
 
-    image_threshold = random.random()
-    pixel_threshold = random.random()
+    image_threshold = random.random()  # nosec: B311
+    pixel_threshold = random.random()  # nosec: B311
     config.metrics.threshold.manual_image = image_threshold
     config.metrics.threshold.manual_pixel = pixel_threshold
 

diff --git a/tools/benchmarking/utils/metrics.py b/tools/benchmarking/utils/metrics.py
@@ -87,7 +87,7 @@ def get_unique_key(str_len: int) -> str:
     Returns:
         str: Random string
     """
-    return "".join([random.choice(string.ascii_lowercase) for _ in range(str_len)])
+    return "".join([random.choice(string.ascii_lowercase) for _ in range(str_len)])  # nosec: B311
 
 
 def upload_to_wandb(