From e5635d0e3ac5343ae9dc94dd01dcade940f9df2f Mon Sep 17 00:00:00 2001 From: Adam Thornton Date: Tue, 31 Dec 2024 16:48:45 -0700 Subject: [PATCH] WIP --- src/rsp_reaper/config.py | 57 ++++++++++-- src/rsp_reaper/models/image.py | 137 ++++++++++++++++++++++++---- src/rsp_reaper/models/rsptag.py | 126 ++++++++++++++----------- src/rsp_reaper/storage/dockerhub.py | 14 +-- src/rsp_reaper/storage/gar.py | 70 +++++++------- src/rsp_reaper/storage/ghcr.py | 14 +-- src/rsp_reaper/storage/registry.py | 25 +++-- 7 files changed, 303 insertions(+), 140 deletions(-) diff --git a/src/rsp_reaper/config.py b/src/rsp_reaper/config.py index fbca659..25957a3 100644 --- a/src/rsp_reaper/config.py +++ b/src/rsp_reaper/config.py @@ -1,9 +1,7 @@ """Configuration for a reaper for a particular container registry.""" from dataclasses import dataclass - -from .models.registry_category import RegistryCategory - +from pathlib import Path @dataclass class RegistryAuth: @@ -13,14 +11,61 @@ class RegistryAuth: username: str password: str +class LatestSemverKeepers: + minor: int | None + patch: int | None + build: int | None + +class OlderSemverKeepers: + major: int | None + minor: int | None + patch: int | None + build: int | None + +@dataclass +class SemverKeepers: + """Within each of latest_major and older, how many minor versions, + how many patch versions within each of those, and how many builds of each + of those, to keep. Older also has a major version number. For instance, + older.major might be 3, and then when version 5.0 came out, you would + keep some images for the 2.x.y, 3.x.y, and 4.x.y series, but no 1.x images. + """ + latest_major: LatestSemverKeepers @dataclass -class ContainerRegistryConfig: - """Configuration for a particular container registry.""" +class RSPKeepers: + """Aliases are never purged. 
+ """ + release: int | None + weekly: int | None + daily: int | None + release_candidate: int | None + experimental: int | None + unknown: int | None + +@dataclass +class KeepPolicy: + """How many of each image category to keep. `-1` or `None` means + "don't reap that category at all". `0` means "purge them all". + """ + untagged: int | None + semver: SemverKeepers | None + rsp: RSPKeepers | None +@dataclass +class RegistryConfig: namespace: str repository: str registry: str - category: RegistryCategory + category: str + keep: KeepPolicy project: str | None = None auth: RegistryAuth | None = None + dry_run: bool = True + debug: bool = True + input_file: Path | None = None + +@dataclass +class Config: + """Configuration for multiple registries.""" + registries: list[RegistryConfig] diff --git a/src/rsp_reaper/models/image.py b/src/rsp_reaper/models/image.py index b71d346..6a2bb8e 100644 --- a/src/rsp_reaper/models/image.py +++ b/src/rsp_reaper/models/image.py @@ -9,7 +9,7 @@ import semver -from .rsptag import RSPImageTag +from .rsptag import RSPImageTag, RSPImageTagCollection, RSPImageType DATEFMT = "%Y-%m-%dT%H:%M:%S.%f%z" LATEST_TAGS = ("latest", "latest_release", "latest_weekly", "latest_daily") @@ -18,10 +18,11 @@ class ImageVersionClass(Enum): - """Images are versioned with either RSP tags or semver tags.""" + """Tagged images are versioned with either RSP tags or semver tags.""" RSP = "rsp" SEMVER = "semver" + UNTAGGED = "untagged" @total_ordering @@ -40,7 +41,7 @@ class Image: date: datetime.datetime | None = None id: int | None = None rsp_image_tag: RSPImageTag | None = None - semver_tag: semver.Version | None = None + semver_tag: semver.VersionInfo | None = None version_class: ImageVersionClass | None = None def __eq__(self, other: object) -> bool: @@ -65,27 +66,71 @@ def _compare(self, other: object) -> int: int or NotImplemented 0 if equal, -1 if self is less than other, 1 if self is greater than other, `NotImplemented` if they're not comparable. 
+ + Notes + ----- + Because we're using this to sort images, which should not have both + RSP and Semver tags, but can certainly be untagged, we are going to + play pretty fast and loose with NotImplemented. Effectively, untagged + images and images whose tag types we cannot parse into some meaningful + order will get shoved to the bottom of the list. """ if not isinstance(other, Image): return NotImplemented + if self.digest == other.digest: + # Tags are not relevant. It's the same image. return 0 - # If they have the same type of tags, sort on those - if self.rsp_image_tag is not None and other.rsp_image_tag is not None: - return self._compare_rsp_image_tags(other.rsp_image_tag) + # If they have the same type of tags, sort on those, and if the tags + # don't tell us, sort on date. - if self.semver_tag is not None and other.semver_tag is not None: - return self._compare_semver_tags(other.semver_tag) + return self._compare_by_class(other) - # Untagged sorts to the bottom - if self.tags and not other.tags: - return 1 - if other.tags and not self.tags: - return -1 + def _compare_by_class(self, other_image: Self) -> int: + myclass = self.version_class + otherclass = other_image.version_class - # Both untagged? 
Sort by date - return self._compare_dates(other) + if myclass == otherclass: + if myclass == ImageVersionClass.RSP: + return self._compare_rsptags(other_image) + elif myclass == ImageVersionClass.SEMVER: + return self._compare_semver(other_image) + else: + return self._compare_dates(other_image) + else: + # Untagged sorts to the bottom + if ( + myclass != ImageVersionClass.UNTAGGED + and otherclass == ImageVersionClass.UNTAGGED + ): + return 1 + elif ( + myclass == ImageVersionClass.UNTAGGED + and otherclass != ImageVersionClass.UNTAGGED + ): + return -1 + # Can't compare a Semver and an RSP image + return NotImplemented + + def _compare_rsptags(self, other_image: Self) -> int: + if self.rsp_image_tag is None: + raise ValueError(f"{self} rsp_image_tag cannot be None") + if other_image.rsp_image_tag is None: + raise ValueError(f"{other_image} rsp_image_tag cannot be None") + return self.rsp_image_tag.compare(other_image.rsp_image_tag) + + def _compare_semver(self, other_image: Self) -> int: + if self.semver_tag is None: + raise ValueError(f"{self} semver_tag cannot be None") + if other_image.semver_tag is None: + raise ValueError(f"{other_image} semver_tag cannot be None") + if self.semver_tag == other_image.semver_tag: + return 0 + if self.semver_tag < other_image.semver_tag: + return -1 + else: + return 1 def _compare_rsp_image_tags(self, other_tag: RSPImageTag) -> int: if self.rsp_image_tag is None: @@ -96,7 +141,7 @@ def _compare_rsp_image_tags(self, other_tag: RSPImageTag) -> int: return 1 return 0 - def _compare_semver_tags(self, other_tag: semver.Version) -> int: + def _compare_semver_tags(self, other_tag: semver.VersionInfo) -> int: if self.semver_tag is None: raise ValueError("semver_tag is None") if self.semver_tag < other_tag: @@ -116,12 +161,21 @@ def _compare_dates(self, other: Self) -> int: return -1 if other.date and not self.date: return 1 - # Give up - return NotImplemented + return self._compare_digests(other) + + def _compare_digests(self, other: 
Self) -> int: + if self.digest == other.digest: + return 0 + if self.digest < other.digest: + return -1 + return 1 def to_dict(self) -> JSONImage: # Differs from asdict, in that set and datetime aren't # JSON-serializable, so we make them a list and a string. + # + # We will just drop the semver/RSP tag fields, and rebuild them + # on load. self_dict = asdict(self) list_tags: list[str] = [] if self.tags: @@ -136,6 +190,43 @@ def to_dict(self) -> JSONImage: def to_json(self) -> str: return json.dumps(self.to_dict()) + def apply_best_tag(self) -> None: + """Choose the best tag (preferring RSP to semver) for an image.""" + collection = RSPImageTagCollection.from_tag_names( + list(self.tags), aliases=set(), cycle=None + ) + self.rsp_image_tag = collection.best_tag() + if self.rsp_image_tag is not None: + self.semver_tag = self.rsp_image_tag.version + if self.rsp_image_tag.image_type == RSPImageType.UNKNOWN: + self.semver_tag = self._semver_from_tags() + if self.semver_tag is None: + self.semver_tag = self._generate_semver() + + def _semver_from_tags(self) -> semver.VersionInfo | None: + raw_tags = list(self.tags) + best_semver: semver.Version | None = None + for tag in raw_tags: + try: + sv = semver.Version.parse(tag) + if best_semver is None or best_semver < sv: + best_semver = sv + except (ValueError, TypeError): + continue + return best_semver + + def _generate_semver(self) -> semver.VersionInfo: + datestr = "unknown-date" + if self.date is not None: + datestr = ( + self.date.isoformat() + .replace(":", "-") + .replace("+", "plus") + .replace(".", "-") + ) + digstr = self.digest.replace(":", "-") + return semver.Version.parse(f"0.0.0-{datestr}+{digstr}") + @classmethod def from_json(cls, inp: JSONImage | str) -> Self: """Much painful assertion that each field is the right type.""" @@ -157,6 +248,14 @@ def from_json(cls, inp: JSONImage | str) -> Self: i_i = inp["id"] if i_i and isinstance(i_i, int): new_id = i_i - return cls( + new_obj = cls( digest=inp["digest"], 
tags=new_tags, date=new_date, id=new_id ) + new_obj.apply_best_tag() + if not new_tags: + new_obj.version_class = ImageVersionClass.UNTAGGED + elif new_obj.rsp_image_tag is not None: + new_obj.version_class = ImageVersionClass.RSP + else: + new_obj.version_class = ImageVersionClass.SEMVER + return new_obj diff --git a/src/rsp_reaper/models/rsptag.py b/src/rsp_reaper/models/rsptag.py index bbd22d8..3b74834 100644 --- a/src/rsp_reaper/models/rsptag.py +++ b/src/rsp_reaper/models/rsptag.py @@ -201,14 +201,82 @@ def from_str(cls, tag: str) -> Self: ) def __eq__(self, other: object) -> bool: - return self._compare(other) == 0 + return self.compare(other) == 0 def __lt__(self, other: object) -> bool: - order = self._compare(other) + order = self.compare(other) if order is NotImplemented: return NotImplemented return order == -1 + def compare(self, other: object) -> int: + """Compare to image tags for sorting purposes. + + Parameters + ---------- + other + The other object, potentially an image tag. + + Returns + ------- + int or NotImplemented + 0 if equal, -1 if self is less than other, 1 if self is greater + than other, `NotImplemented` if they're not comparable. + + Notes + ----- + Because we want to sort the whole list of images, we're playing kind of + fast and loose with comparability. If the images are different types, + we sort them by the priority order of the tag. 
+ """ + if not isinstance(other, RSPImageTag): + return NotImplemented + if self.image_type != other.image_type: + mypriority = self.tag_category_priority() + otherpriority = other.tag_category_priority() + if mypriority != otherpriority: + return 1 if mypriority < otherpriority else -1 + if not (self.version and other.version): + if self.tag == other.tag: + return 0 + return -1 if self.tag < other.tag else 1 + rank = self.version.compare(other.version) + if rank != 0: + return rank + + # semver ignores the build for sorting purposes, but we don't want to + # since we want newer cycles to sort ahead of older cycles (and newer + # cycle builds to sort above older cycle builds) in otherwise matching + # tags, and the cycle information is stored in the build. + if self.version.build == other.version.build: + return 0 + elif self.version.build: + if not other.version.build: + return 1 + else: + return -1 if self.version.build < other.version.build else 1 + else: + return -1 if other.version.build else 0 + + def tag_category_priority(self) -> int: + """Given a tag, return a number representing a rank; lower is better. + + Returns + ------- + int + Tag priority rank. Lower is better. + """ + priority: dict[RSPImageType, int] = {} + for idx, entry in enumerate(RSPImageType): + if entry == RSPImageType.ALIAS: + continue + priority[entry] = idx + # Alias types are worse than UNKNOWN for this purpose (that is, + # they sort to the top of the spawner display, but they are + # useless for 'best tag' purposes) + priority[RSPImageType.ALIAS] = priority[RSPImageType.UNKNOWN] + 1 + return priority[self.image_type] + @classmethod def _from_match( cls, image_type: RSPImageType, match: re.Match, tag: str @@ -355,46 +423,6 @@ def _determine_build( else: return rest if rest else None - def _compare(self, other: object) -> int: - """Compare to image tags for sorting purposes. - - Parameters - ---------- - other - The other object, potentially an image tag. 
- - Returns - ------- - int or NotImplemented - 0 if equal, -1 if self is less than other, 1 if self is greater - than other, `NotImplemented` if they're not comparable. - """ - if not isinstance(other, RSPImageTag): - return NotImplemented - if self.image_type != other.image_type: - return NotImplemented - if not (self.version and other.version): - if self.tag == other.tag: - return 0 - return -1 if self.tag < other.tag else 1 - rank = self.version.compare(other.version) - if rank != 0: - return rank - - # semver ignores the build for sorting purposes, but we don't want to - # since we want newer cycles to sort ahead of older cycles (and newer - # cycle builds to sort above older cycle builds) in otherwise matching - # tags, and the cycle information is stored in the build. - if self.version.build == other.version.build: - return 0 - elif self.version.build: - if not other.version.build: - return 1 - else: - return -1 if self.version.build < other.version.build else 1 - else: - return -1 if other.version.build else 0 - class RSPImageTagCollection: """Hold and perform operations on a set of `RSPImageTag` objects. @@ -478,20 +506,14 @@ def best_tag(self) -> RSPImageTag | None: Alias tags are excluded from consideration in this case. 
""" - priority: dict[RSPImageType, int] = {} - for idx, entry in enumerate(RSPImageType): - if entry == RSPImageType.ALIAS: - continue - priority[entry] = idx chosen: RSPImageTag | None = None rank: int | None = None for tag in self._by_tag: rsptag = RSPImageTag.from_str(tag) - prio = priority.get(rsptag.image_type, None) - if prio is not None: - if rank is None or rank > prio: - rank = prio - chosen = rsptag + prio = rsptag.tag_category_priority() + if rank is None or rank > prio: + rank = prio + chosen = rsptag return chosen def subset( diff --git a/src/rsp_reaper/storage/dockerhub.py b/src/rsp_reaper/storage/dockerhub.py index a876871..f041d98 100644 --- a/src/rsp_reaper/storage/dockerhub.py +++ b/src/rsp_reaper/storage/dockerhub.py @@ -7,6 +7,7 @@ import datetime import json from pathlib import Path +from typing import cast import httpx import structlog @@ -111,15 +112,8 @@ def debug_load_images(self, inputfile: Path) -> None: jsons = inp["data"] self._images = {} for digest in jsons: - tags = jsons[digest]["tags"] - date = jsons[digest]["date"] - self._images[digest] = Image( - digest=digest, - tags=set(tags), - date=datetime.datetime.strptime(date, DATEFMT).astimezone( - datetime.UTC - ), - ) + obj = cast(JSONImage, jsons[digest]) + self._images[digest] = Image.from_json(obj) def _find_untagged(self) -> list[Image]: untagged: list[Image] = [] @@ -129,7 +123,7 @@ def _find_untagged(self) -> list[Image]: untagged.append(img) return untagged - def deprecated_delete_untagged(self) -> None: + def delete_untagged(self) -> None: """Delete all untagged images.""" ### This API goes away Nov. 15, 2023. Possibly December 11. 
# diff --git a/src/rsp_reaper/storage/gar.py b/src/rsp_reaper/storage/gar.py index f51f3fc..93b239a 100644 --- a/src/rsp_reaper/storage/gar.py +++ b/src/rsp_reaper/storage/gar.py @@ -3,6 +3,7 @@ import datetime import json from pathlib import Path +from typing import cast import structlog from google.cloud.artifactregistry_v1 import ( @@ -12,7 +13,8 @@ ) from google.cloud.artifactregistry_v1.types import DockerImage -from ..models.image import DATEFMT, Image, JSONImage +from ..models.image import Image, JSONImage +from ..config import RegistryConfig from ..models.registry_category import RegistryCategory from .registry import ContainerRegistryClient @@ -26,31 +28,36 @@ class GARClient(ContainerRegistryClient): def __init__( self, - location: str, - project_id: str, - repository: str, - image: str, - *, - dry_run: bool = False, - input_file: Path | None = None, + cfg: RegistryConfig ) -> None: - self._location = location - self._project_id = project_id - self._repository = repository - self._image = image - self._registry = f"{location}-docker.pkg.dev" - self._parent = ( - f"projects/{project_id}/locations/{location}" - f"/repositories/{repository}" + if cfg.category != RegistryCategory.GAR.value: + raise ValueError( + "GAR registry client must have value " + f"'{RegistryCategory.GAR.value}', not '{cfg.category}'" + ) + self._category = cfg.category + if not cfg.project: + raise ValueError( + "GAR registry client must have 'project' set" + ) + super()._extract_registry_config(cfg) + gar_loc = "-docker.pkg.dev" + if not self._registry.endswith(gar_loc): + raise ValueError( + f"GAR registry location must end with '{gar_loc}'" + ) + location=self._registry[0:-(len(gar_loc))] + self._project_id = cfg.project + + self._parent:str = ( + f"projects/{self._project_id}/locations/{location}" + f"/repositories/{self._namespace}" ) # "path" is what everything else calls a repository - self._path = f"{project_id}/{repository}/{image}" - self._client = ArtifactRegistryClient() - 
self._logger = structlog.get_logger() - self._images: dict[str, Image] = {} - self._dry_run = dry_run - if input_file: - self.debug_load_images(input_file) + self._path: str = ( + f"{self._project_id}/{self._namespace}/{self._repository}" + ) + self._client: ArtifactRegistryClient = ArtifactRegistryClient() def scan_repo(self) -> None: images: list[DockerImage] = [] @@ -61,7 +68,7 @@ def scan_repo(self) -> None: count = 0 while True: self._logger.debug( - f"Requesting {self._path}: images " + f"Requesting {self._repository}: images " f"{count*page_size + 1}-{(count+1) * page_size}" ) resp = self._client.list_docker_images(request=request) @@ -119,18 +126,13 @@ def debug_load_images(self, inputfile: Path) -> None: jsons = inp["data"] self._images = {} for digest in jsons: - tags = jsons[digest]["tags"] - date = jsons[digest]["date"] - self._images[digest] = Image( - digest=digest, - tags=set(tags), - date=datetime.datetime.strptime(date, DATEFMT).astimezone( - datetime.UTC - ), - ) + obj = cast(JSONImage, jsons[digest]) + self._images[digest] = Image.from_json(obj) def _image_to_name(self, img: Image) -> str: - return f"{self._parent}/packages/{self._image}/versions/{img.digest}" + return ( + f"{self._parent}/packages/{self._repository}/versions/{img.digest}" + ) def _find_untagged(self) -> list[Image]: return [x for x in self._images.values() if not x.tags] diff --git a/src/rsp_reaper/storage/ghcr.py b/src/rsp_reaper/storage/ghcr.py index 5f266ac..77f4002 100644 --- a/src/rsp_reaper/storage/ghcr.py +++ b/src/rsp_reaper/storage/ghcr.py @@ -3,7 +3,7 @@ import datetime import json from pathlib import Path -from typing import Any +from typing import Any, cast import httpx import structlog @@ -104,17 +104,9 @@ def debug_load_images(self, inputfile: Path) -> None: self._images = {} self._image_by_id = {} for digest in jsons: - tags = jsons[digest]["image"]["tags"] - date = jsons[digest]["image"]["date"] id = jsons[digest]["id"] - img = Image( - digest=digest, - 
tags=set(tags), - date=datetime.datetime.strptime(date, DATEFMT).astimezone( - datetime.UTC - ), - id=id, - ) + obj = cast(JSONImage, jsons[digest]) + img = Image.from_json(obj) self._images[digest] = img self._image_by_id[id] = img diff --git a/src/rsp_reaper/storage/registry.py b/src/rsp_reaper/storage/registry.py index 2e79064..aaef4c4 100644 --- a/src/rsp_reaper/storage/registry.py +++ b/src/rsp_reaper/storage/registry.py @@ -4,8 +4,10 @@ from pathlib import Path from typing import Any +import structlog import semver +from ..config import RegistryConfig from ..models.image import Image, ImageVersionClass from ..models.rsptag import RSPImageTag, RSPImageTagCollection @@ -13,6 +15,19 @@ class ContainerRegistryClient: """Collection of methods we expect any registry client to provide.""" + def _extract_registry_config(self, cfg: RegistryConfig) -> None: + # Load the generic registry settings; logger and image map must exist before any debug image load, or the loaded images would be overwritten + self._registry = cfg.registry + self._repository = cfg.repository + self._namespace = cfg.namespace + self._category = cfg.category + self._dry_run = cfg.dry_run + self._debug = cfg.debug + self._logger = structlog.get_logger() + self._images: dict[str, Image] = {} + if cfg.input_file: + self.debug_load_images(cfg.input_file) + @abstractmethod def scan_repo(self) -> None: ... 
@@ -56,10 +71,7 @@ def _categorize_rsp(self) -> None: t_tag = raw_tags[0] if len(img.tags) > 0 else "unknown" rsp_image_tag = RSPImageTag.from_str(t_tag) img.rsp_image_tag = rsp_image_tag - self._sorted_images = sorted( - unsorted, - key=lambda img: img.rsp_image_tag, # type: ignore [arg-type, return-value] - ) + self._sorted_images = sorted(unsorted) def _categorize_semver(self) -> None: unsorted = [x for x in list(self._images.values()) if x is not None] @@ -76,7 +88,4 @@ def _categorize_semver(self) -> None: ) else: img.semver_tag = semver.Version.parse(f"0.0.0-{img.digest}") - self._sorted_images = sorted( - unsorted, - key=lambda img: img.semver_tag, # type: ignore [arg-type, return-value] - ) + self._sorted_images = sorted(unsorted)