From 5329c91e743414b09e955d3049e964cfac3c8be0 Mon Sep 17 00:00:00 2001 From: Chris Kuehl Date: Wed, 15 May 2024 14:04:06 -0700 Subject: [PATCH 1/4] Support traditional "simple" HTML registries --- pypi_browser/app.py | 22 ++++++- pypi_browser/packaging.py | 54 ++++++++++++++++ pypi_browser/pypi.py | 127 +++++++++++++++++++++++++++----------- 3 files changed, 165 insertions(+), 38 deletions(-) diff --git a/pypi_browser/app.py b/pypi_browser/app.py index ef5259d..e0b1968 100644 --- a/pypi_browser/app.py +++ b/pypi_browser/app.py @@ -74,9 +74,16 @@ async def dispatch( config = starlette.config.Config() +pypi_url = config('PYPI_BROWSER_PYPI_URL', default='https://pypi.org').rstrip('/') +repo: pypi.PythonRepository +if pypi_url.endswith('/simple'): + repo = pypi.SimpleRepository(pypi_url) +else: + repo = pypi.LegacyJsonRepository(pypi_url) + pypi_config = pypi.PyPIConfig( + repo=repo, cache_path=config('PYPI_BROWSER_PACKAGE_CACHE_PATH', default='/tmp'), - pypi_url=config('PYPI_BROWSER_PYPI_URL', default='https://pypi.org'), ) templates = Jinja2Templates( @@ -115,16 +122,25 @@ async def package(request: Request) -> Response: return RedirectResponse(request.url_for('package', package=normalized_package_name)) try: - version_to_files = await pypi.files_for_package(pypi_config, package_name) + version_to_files = await pypi.files_by_version(pypi_config, package_name) except pypi.PackageDoesNotExist: return PlainTextResponse( f'Package {package_name!r} does not exist on PyPI.', status_code=404, ) else: + def _version_sort_key(version: str | None) -> packaging.version.Version: + if version is not None: + try: + return packaging.version.parse(version) + except packaging.version.InvalidVersion: + pass + # Not really correct, but just throw everything we can't parse at the bottom. 
+ return packaging.version.Version('0.0.0') + version_to_files_sorted = sorted( version_to_files.items(), - key=lambda item: packaging.version.parse(item[0]), + key=lambda item: _version_sort_key(item[0]), reverse=True, ) return templates.TemplateResponse( diff --git a/pypi_browser/packaging.py b/pypi_browser/packaging.py index ab10a98..b667c58 100644 --- a/pypi_browser/packaging.py +++ b/pypi_browser/packaging.py @@ -12,10 +12,64 @@ from types import TracebackType +# Copied from distlib/wheel.py +WHEEL_FILENAME_RE = re.compile(r''' +(?P<nm>[^-]+) +-(?P<vn>\d+[^-]*) +(-(?P<bn>\d+[^-]*))? +-(?P<py>\w+\d+(\.\w+\d+)*) +-(?P<bi>\w+) +-(?P<ar>\w+(\.\w+)*) +\.whl$ +''', re.IGNORECASE | re.VERBOSE) + + def pep426_normalize(package_name: str) -> str: return re.sub(r'[-_.]+', '-', package_name.strip()).lower() +def _remove_extension(name: str) -> str: + if name.endswith(('gz', 'bz2')): + name, _ = name.rsplit('.', 1) + name, _ = name.rsplit('.', 1) + return name + + +def guess_version_from_filename(filename: str) -> str | None: + # Inspired by https://github.com/chriskuehl/dumb-pypi/blob/a71c3cfeba6/dumb_pypi/main.py#L56 + if filename.endswith('.whl'): + # TODO: Switch to packaging.utils.parse_wheel_filename which enforces + # PEP440 versions for wheels. + m = WHEEL_FILENAME_RE.match(filename) + if m is not None: + return m.group('vn') + else: + raise ValueError(f'Invalid package name: {filename}') + else: + # These don't have a well-defined format like wheels do, so they are + # sort of "best effort", with lots of tests to back them up. + # The most important thing is to correctly parse the name. + name = _remove_extension(filename) + version = None + + if '-' in name: + if name.count('-') == 1: + name, version = name.split('-') + else: + parts = name.split('-') + for i in range(len(parts) - 1, 0, -1): + part = parts[i] + if '.' in part and re.search('[0-9]', part): + name, version = '-'.join(parts[0:i]), '-'.join(parts[i:]) + + # Possible with poorly-named files. 
+ if len(name) <= 0: + raise ValueError(f'Invalid package name: {filename}') + + assert version is None or len(version) > 0, version + return version + + class UnsupportedPackageType(Exception): pass diff --git a/pypi_browser/pypi.py b/pypi_browser/pypi.py index 8506518..cccd551 100644 --- a/pypi_browser/pypi.py +++ b/pypi_browser/pypi.py @@ -1,6 +1,9 @@ +import abc import base64 +import collections import contextlib import dataclasses +import html.parser import itertools import os.path import typing @@ -9,38 +12,98 @@ import aiofiles.os import httpx +from pypi_browser import packaging + + +class PythonRepository(abc.ABC): + + @abc.abstractmethod + async def files_for_package(self, package_name: str) -> typing.Dict[str, str]: + """Return mapping from filename to file URL for files in a package.""" + + +class HTMLAnchorParser(html.parser.HTMLParser): + anchors: set[str] + + def __init__(self) -> None: + super().__init__() + self.anchors = set() + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + if tag == 'a': + if href := dict(attrs).get('href'): + self.anchors.add(href) + @dataclasses.dataclass(frozen=True) -class PyPIConfig: - cache_path: str +class SimpleRepository(PythonRepository): + """Old-style "simple" PyPI registry serving HTML files.""" + # TODO: Also handle PEP691 JSON simple repositories. 
pypi_url: str + async def files_for_package(self, package_name: str) -> typing.Dict[str, str]: + async with httpx.AsyncClient() as client: + resp = await client.get( + f'{self.pypi_url}/{package_name}', + follow_redirects=True, + ) + if resp.status_code == 404: + raise PackageDoesNotExist(package_name) + parser = HTMLAnchorParser() + parser.feed(resp.text) -class PackageDoesNotExist(Exception): - pass + def clean_url(url: str) -> str: + parsed = urllib.parse.urlparse(urllib.parse.urljoin(str(resp.url), url)) + return parsed._replace(fragment='').geturl() + return { + (urllib.parse.urlparse(url).path).split('/')[-1]: clean_url(url) + for url in parser.anchors + } -async def package_metadata( - config: PyPIConfig, - client: httpx.AsyncClient, - package: str, -) -> typing.Dict[typing.Any, typing.Any]: - resp = await client.get(f'{config.pypi_url}/pypi/{package}/json') - if resp.status_code == 404: - raise PackageDoesNotExist(package) - resp.raise_for_status() - return resp.json() +@dataclasses.dataclass(frozen=True) +class LegacyJsonRepository(PythonRepository): + """Non-standardized JSON API compatible with pypi.org's /pypi/*/json endpoints.""" + pypi_url: str -async def files_for_package(config: PyPIConfig, package: str) -> typing.Dict[str, typing.Set[str]]: - async with httpx.AsyncClient() as client: - metadata = await package_metadata(config, client, package) + async def files_for_package(self, package_name: str) -> typing.Dict[str, str]: + async with httpx.AsyncClient() as client: + resp = await client.get( + f'{self.pypi_url}/pypi/{package_name}/json', + follow_redirects=True, + ) + if resp.status_code == 404: + raise PackageDoesNotExist(package_name) + resp.raise_for_status() + return { + file_['filename']: urllib.parse.urljoin(str(resp.url), file_['url']) + for file_ in itertools.chain.from_iterable(resp.json()['releases'].values()) + } - return { - version: {file_['filename'] for file_ in files} - for version, files in metadata['releases'].items() - if 
len(files) > 0 - } + +@dataclasses.dataclass(frozen=True) +class PyPIConfig: + repo: PythonRepository + cache_path: str + + +class PackageDoesNotExist(Exception): + pass + + +async def files_by_version(config: PyPIConfig, package: str) -> typing.Dict[str | None, typing.Set[str]]: + ret = collections.defaultdict(set) + for filename in await config.repo.files_for_package(package): + try: + version = packaging.guess_version_from_filename(filename) + except ValueError: + # Possible with some very poorly-formed packages that used to be + # allowed on PyPI. Just skip them when this happens. + pass + else: + ret[version].add(filename) + return ret class CannotFindFileError(Exception): @@ -81,21 +144,15 @@ async def downloaded_file_path(config: PyPIConfig, package: str, filename: str) if await aiofiles.os.path.exists(stored_path): return stored_path - async with httpx.AsyncClient() as client: - metadata = await package_metadata(config, client, package) - - # Parsing versions from non-wheel Python packages isn't perfectly - # reliable, so just search through all releases until we find a - # matching file. 
- for file_ in itertools.chain.from_iterable(metadata['releases'].values()): - if file_['filename'] == filename: - url = urllib.parse.urljoin(config.pypi_url, file_['url']) - break - else: - raise CannotFindFileError(package, filename) + filename_to_url = await config.repo.files_for_package(package) + try: + url = filename_to_url[filename] + except KeyError: + raise CannotFindFileError(package, filename) - await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True) + await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True) + async with httpx.AsyncClient() as client: async with _atomic_file(stored_path) as f: async with client.stream('GET', url) as resp: resp.raise_for_status() From 4dbf23ed2afba9453bc70a3efb3147c74a37b8d2 Mon Sep 17 00:00:00 2001 From: Chris Kuehl Date: Thu, 16 May 2024 09:00:32 -0700 Subject: [PATCH 2/4] pre-commit autoupdate --- .pre-commit-config.yaml | 5 ++--- pypi_browser/packaging.py | 18 ++++++++++-------- pypi_browser/pypi.py | 10 +++++----- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d820913..7449ceb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,17 +14,16 @@ repos: rev: v3.12.0 hooks: - id: reorder-python-imports - args: ['--py38-plus'] + args: ['--py39-plus'] - repo: https://github.com/asottile/add-trailing-comma rev: v3.1.0 hooks: - id: add-trailing-comma - args: ['--py36-plus'] - repo: https://github.com/asottile/pyupgrade rev: v3.15.2 hooks: - id: pyupgrade - args: ['--py38-plus'] + args: ['--py310-plus'] - repo: https://github.com/hhatto/autopep8 rev: v2.1.0 hooks: diff --git a/pypi_browser/packaging.py b/pypi_browser/packaging.py index b667c58..95e3715 100644 --- a/pypi_browser/packaging.py +++ b/pypi_browser/packaging.py @@ -13,7 +13,8 @@ # Copied from distlib/wheel.py -WHEEL_FILENAME_RE = re.compile(r''' +WHEEL_FILENAME_RE = re.compile( + r''' (?P<nm>[^-]+) -(?P<vn>\d+[^-]*) (-(?P<bn>\d+[^-]*))? 
@@ -21,7 +22,8 @@ -(?P<bi>\w+) -(?P<ar>\w+(\.\w+)*) \.whl$ -''', re.IGNORECASE | re.VERBOSE) +''', re.IGNORECASE | re.VERBOSE, +) def pep426_normalize(package_name: str) -> str: @@ -92,7 +94,7 @@ class PackageEntry: size: int -def _package_entries_from_zipfile(path: str) -> typing.Set[PackageEntry]: +def _package_entries_from_zipfile(path: str) -> set[PackageEntry]: with zipfile.ZipFile(path) as zf: return { PackageEntry( @@ -105,7 +107,7 @@ def _package_entries_from_zipfile(path: str) -> typing.Set[PackageEntry]: } -def _package_entries_from_tarball(path: str) -> typing.Set[PackageEntry]: +def _package_entries_from_tarball(path: str) -> set[PackageEntry]: with tarfile.open(path) as tf: return { PackageEntry( @@ -130,9 +132,9 @@ async def __aenter__(self) -> 'AsyncArchiveFile': async def __aexit__( self, - exc_t: typing.Optional[typing.Type[BaseException]], - exc_v: typing.Optional[BaseException], - exc_tb: typing.Optional[TracebackType], + exc_t: type[BaseException] | None, + exc_v: BaseException | None, + exc_tb: TracebackType | None, ) -> None: await asyncio.to_thread(self.file_.close) @@ -171,7 +173,7 @@ def from_path(cls, path: str) -> 'Package': path=path, ) - async def entries(self) -> typing.Set[PackageEntry]: + async def entries(self) -> set[PackageEntry]: if self.package_format is PackageFormat.ZIPFILE: return await asyncio.to_thread(_package_entries_from_zipfile, self.path) elif self.package_format is PackageFormat.TARBALL: diff --git a/pypi_browser/pypi.py b/pypi_browser/pypi.py index cccd551..e1764e0 100644 --- a/pypi_browser/pypi.py +++ b/pypi_browser/pypi.py @@ -18,7 +18,7 @@ class PythonRepository(abc.ABC): @abc.abstractmethod - async def files_for_package(self, package_name: str) -> typing.Dict[str, str]: + async def files_for_package(self, package_name: str) -> dict[str, str]: """Return mapping from filename to file URL for files in a package.""" @@ -41,7 +41,7 @@ class SimpleRepository(PythonRepository): # TODO: Also handle PEP691 JSON simple 
repositories. pypi_url: str - async def files_for_package(self, package_name: str) -> typing.Dict[str, str]: + async def files_for_package(self, package_name: str) -> dict[str, str]: async with httpx.AsyncClient() as client: resp = await client.get( f'{self.pypi_url}/{package_name}', @@ -67,7 +67,7 @@ class LegacyJsonRepository(PythonRepository): """Non-standardized JSON API compatible with pypi.org's /pypi/*/json endpoints.""" pypi_url: str - async def files_for_package(self, package_name: str) -> typing.Dict[str, str]: + async def files_for_package(self, package_name: str) -> dict[str, str]: async with httpx.AsyncClient() as client: resp = await client.get( f'{self.pypi_url}/pypi/{package_name}/json', @@ -92,9 +92,9 @@ class PackageDoesNotExist(Exception): pass -async def files_by_version(config: PyPIConfig, package: str) -> typing.Dict[str | None, typing.Set[str]]: +async def files_by_version(config: PyPIConfig, package: str) -> dict[str | None, set[str]]: ret = collections.defaultdict(set) - for filename in await config.repo.files_for_package(package): + for filename in await config.repo.files_for_package(package): try: version = packaging.guess_version_from_filename(filename) except ValueError: From da4bd63017edeff7dd57d07dfb1a89438eee063a Mon Sep 17 00:00:00 2001 From: Chris Kuehl Date: Thu, 16 May 2024 09:09:32 -0700 Subject: [PATCH 3/4] Display "None" versions as "(unparseable version)" --- pypi_browser/templates/package.html | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pypi_browser/templates/package.html b/pypi_browser/templates/package.html index dd2fa41..7263f9e 100644 --- a/pypi_browser/templates/package.html +++ b/pypi_browser/templates/package.html @@ -21,7 +21,15 @@

{{package}}

{% for version, files in version_to_files %}