diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d820913..7449ceb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,17 +14,16 @@ repos: rev: v3.12.0 hooks: - id: reorder-python-imports - args: ['--py38-plus'] + args: ['--py39-plus'] - repo: https://github.com/asottile/add-trailing-comma rev: v3.1.0 hooks: - id: add-trailing-comma - args: ['--py36-plus'] - repo: https://github.com/asottile/pyupgrade rev: v3.15.2 hooks: - id: pyupgrade - args: ['--py38-plus'] + args: ['--py310-plus'] - repo: https://github.com/hhatto/autopep8 rev: v2.1.0 hooks: diff --git a/README.md b/README.md index eba11df..55bb4f1 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,17 @@ You can set these environment variables to configure the server: * `PYPI_BROWSER_PYPI_URL`: URL for the PyPI server to use (defaults to `https://pypi.org`) + + If your registry supports the pypi.org-compatible JSON API (e.g. + `{registry}/pypi/{package}/json`), specify your base registry URL without + appending `/simple` (e.g. `https://my-registry`). + + If your registry only supports the traditional HTML "simple" index, specify + the registry URL with `/simple` at the end (e.g. + `https://my-registry/simple`). + + Note that the [PEP691][pep691] JSON-based "simple" API is not yet supported. + * `PYPI_BROWSER_PACKAGE_CACHE_PATH`: Filesystem path to use for caching downloaded files. This will grow forever (the app does not clean it up) so you may want to use `tmpreaper` or similar to manage its size. @@ -77,3 +88,5 @@ $ make start-dev ``` to run a copy of the application locally with hot reloading enabled. 
+ +[pep691]: https://peps.python.org/pep-0691/ diff --git a/pypi_browser/app.py b/pypi_browser/app.py index ef5259d..e0b1968 100644 --- a/pypi_browser/app.py +++ b/pypi_browser/app.py @@ -74,9 +74,16 @@ async def dispatch( config = starlette.config.Config() +pypi_url = config('PYPI_BROWSER_PYPI_URL', default='https://pypi.org').rstrip('/') +repo: pypi.PythonRepository +if pypi_url.endswith('/simple'): + repo = pypi.SimpleRepository(pypi_url) +else: + repo = pypi.LegacyJsonRepository(pypi_url) + pypi_config = pypi.PyPIConfig( + repo=repo, cache_path=config('PYPI_BROWSER_PACKAGE_CACHE_PATH', default='/tmp'), - pypi_url=config('PYPI_BROWSER_PYPI_URL', default='https://pypi.org'), ) templates = Jinja2Templates( @@ -115,16 +122,25 @@ async def package(request: Request) -> Response: return RedirectResponse(request.url_for('package', package=normalized_package_name)) try: - version_to_files = await pypi.files_for_package(pypi_config, package_name) + version_to_files = await pypi.files_by_version(pypi_config, package_name) except pypi.PackageDoesNotExist: return PlainTextResponse( f'Package {package_name!r} does not exist on PyPI.', status_code=404, ) else: + def _version_sort_key(version: str | None) -> packaging.version.Version: + if version is not None: + try: + return packaging.version.parse(version) + except packaging.version.InvalidVersion: + pass + # Not really correct, but just throw everything we can't parse at the bottom. 
+ return packaging.version.Version('0.0.0') + version_to_files_sorted = sorted( version_to_files.items(), - key=lambda item: packaging.version.parse(item[0]), + key=lambda item: _version_sort_key(item[0]), reverse=True, ) return templates.TemplateResponse( diff --git a/pypi_browser/packaging.py b/pypi_browser/packaging.py index ab10a98..95e3715 100644 --- a/pypi_browser/packaging.py +++ b/pypi_browser/packaging.py @@ -12,10 +12,66 @@ from types import TracebackType +# Copied from distlib/wheel.py +WHEEL_FILENAME_RE = re.compile( + r''' +(?P<nm>[^-]+) +-(?P<vn>\d+[^-]*) +(-(?P<bn>\d+[^-]*))? +-(?P<py>\w+\d+(\.\w+\d+)*) +-(?P<bi>\w+) +-(?P<ar>\w+(\.\w+)*) +\.whl$ +''', re.IGNORECASE | re.VERBOSE, +) + + def pep426_normalize(package_name: str) -> str: return re.sub(r'[-_.]+', '-', package_name.strip()).lower() +def _remove_extension(name: str) -> str: + if name.endswith(('gz', 'bz2')): + name, _ = name.rsplit('.', 1) + name, _ = name.rsplit('.', 1) + return name + + +def guess_version_from_filename(filename: str) -> str | None: + # Inspired by https://github.com/chriskuehl/dumb-pypi/blob/a71c3cfeba6/dumb_pypi/main.py#L56 + if filename.endswith('.whl'): + # TODO: Switch to packaging.utils.parse_wheel_filename which enforces + # PEP440 versions for wheels. + m = WHEEL_FILENAME_RE.match(filename) + if m is not None: + return m.group('vn') + else: + raise ValueError(f'Invalid package name: {filename}') + else: + # These don't have a well-defined format like wheels do, so they are + # sort of "best effort", with lots of tests to back them up. + # The most important thing is to correctly parse the name. + name = _remove_extension(filename) + version = None + + if '-' in name: + if name.count('-') == 1: + name, version = name.split('-') + else: + parts = name.split('-') + for i in range(len(parts) - 1, 0, -1): + part = parts[i] + if '.' in part and re.search('[0-9]', part): + name, version = '-'.join(parts[0:i]), '-'.join(parts[i:]) + + # Possible with poorly-named files. 
+ if len(name) <= 0: + raise ValueError(f'Invalid package name: {filename}') + + assert version is None or len(version) > 0, version + return version + + class UnsupportedPackageType(Exception): pass @@ -38,7 +94,7 @@ class PackageEntry: size: int -def _package_entries_from_zipfile(path: str) -> typing.Set[PackageEntry]: +def _package_entries_from_zipfile(path: str) -> set[PackageEntry]: with zipfile.ZipFile(path) as zf: return { PackageEntry( @@ -51,7 +107,7 @@ def _package_entries_from_zipfile(path: str) -> typing.Set[PackageEntry]: } -def _package_entries_from_tarball(path: str) -> typing.Set[PackageEntry]: +def _package_entries_from_tarball(path: str) -> set[PackageEntry]: with tarfile.open(path) as tf: return { PackageEntry( @@ -76,9 +132,9 @@ async def __aenter__(self) -> 'AsyncArchiveFile': async def __aexit__( self, - exc_t: typing.Optional[typing.Type[BaseException]], - exc_v: typing.Optional[BaseException], - exc_tb: typing.Optional[TracebackType], + exc_t: type[BaseException] | None, + exc_v: BaseException | None, + exc_tb: TracebackType | None, ) -> None: await asyncio.to_thread(self.file_.close) @@ -117,7 +173,7 @@ def from_path(cls, path: str) -> 'Package': path=path, ) - async def entries(self) -> typing.Set[PackageEntry]: + async def entries(self) -> set[PackageEntry]: if self.package_format is PackageFormat.ZIPFILE: return await asyncio.to_thread(_package_entries_from_zipfile, self.path) elif self.package_format is PackageFormat.TARBALL: diff --git a/pypi_browser/pypi.py b/pypi_browser/pypi.py index 8506518..e1764e0 100644 --- a/pypi_browser/pypi.py +++ b/pypi_browser/pypi.py @@ -1,6 +1,9 @@ +import abc import base64 +import collections import contextlib import dataclasses +import html.parser import itertools import os.path import typing @@ -9,38 +12,98 @@ import aiofiles.os import httpx +from pypi_browser import packaging + + +class PythonRepository(abc.ABC): + + @abc.abstractmethod + async def files_for_package(self, package_name: str) -> 
dict[str, str]: + """Return mapping from filename to file URL for files in a package.""" + + +class HTMLAnchorParser(html.parser.HTMLParser): + anchors: set[str] + + def __init__(self) -> None: + super().__init__() + self.anchors = set() + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + if tag == 'a': + if href := dict(attrs).get('href'): + self.anchors.add(href) + @dataclasses.dataclass(frozen=True) -class PyPIConfig: - cache_path: str +class SimpleRepository(PythonRepository): + """Old-style "simple" PyPI registry serving HTML files.""" + # TODO: Also handle PEP691 JSON simple repositories. pypi_url: str + async def files_for_package(self, package_name: str) -> dict[str, str]: + async with httpx.AsyncClient() as client: + resp = await client.get( + f'{self.pypi_url}/{package_name}', + follow_redirects=True, + ) + if resp.status_code == 404: + raise PackageDoesNotExist(package_name) + parser = HTMLAnchorParser() + parser.feed(resp.text) -class PackageDoesNotExist(Exception): - pass + def clean_url(url: str) -> str: + parsed = urllib.parse.urlparse(urllib.parse.urljoin(str(resp.url), url)) + return parsed._replace(fragment='').geturl() + return { + (urllib.parse.urlparse(url).path).split('/')[-1]: clean_url(url) + for url in parser.anchors + } -async def package_metadata( - config: PyPIConfig, - client: httpx.AsyncClient, - package: str, -) -> typing.Dict[typing.Any, typing.Any]: - resp = await client.get(f'{config.pypi_url}/pypi/{package}/json') - if resp.status_code == 404: - raise PackageDoesNotExist(package) - resp.raise_for_status() - return resp.json() +@dataclasses.dataclass(frozen=True) +class LegacyJsonRepository(PythonRepository): + """Non-standardized JSON API compatible with pypi.org's /pypi/*/json endpoints.""" + pypi_url: str -async def files_for_package(config: PyPIConfig, package: str) -> typing.Dict[str, typing.Set[str]]: - async with httpx.AsyncClient() as client: - metadata = await package_metadata(config, 
client, package) + async def files_for_package(self, package_name: str) -> dict[str, str]: + async with httpx.AsyncClient() as client: + resp = await client.get( + f'{self.pypi_url}/pypi/{package_name}/json', + follow_redirects=True, + ) + if resp.status_code == 404: + raise PackageDoesNotExist(package_name) + resp.raise_for_status() + return { + file_['filename']: urllib.parse.urljoin(str(resp.url), file_['url']) + for file_ in itertools.chain.from_iterable(resp.json()['releases'].values()) + } - return { - version: {file_['filename'] for file_ in files} - for version, files in metadata['releases'].items() - if len(files) > 0 - } + +@dataclasses.dataclass(frozen=True) +class PyPIConfig: + repo: PythonRepository + cache_path: str + + +class PackageDoesNotExist(Exception): + pass + + +async def files_by_version(config: PyPIConfig, package: str) -> dict[str | None, set[str]]: + ret = collections.defaultdict(set) + for filename in await config.repo.files_for_package(package): + try: + version = packaging.guess_version_from_filename(filename) + except ValueError: + # Possible with some very poorly-formed packages that used to be + # allowed on PyPI. Just skip them when this happens. + pass + else: + ret[version].add(filename) + return ret class CannotFindFileError(Exception): @@ -81,21 +144,15 @@ async def downloaded_file_path(config: PyPIConfig, package: str, filename: str) if await aiofiles.os.path.exists(stored_path): return stored_path - async with httpx.AsyncClient() as client: - metadata = await package_metadata(config, client, package) - - # Parsing versions from non-wheel Python packages isn't perfectly - # reliable, so just search through all releases until we find a - # matching file. 
- for file_ in itertools.chain.from_iterable(metadata['releases'].values()): - if file_['filename'] == filename: - url = urllib.parse.urljoin(config.pypi_url, file_['url']) - break - else: - raise CannotFindFileError(package, filename) + filename_to_url = await config.repo.files_for_package(package) + try: + url = filename_to_url[filename] + except KeyError: + raise CannotFindFileError(package, filename) - await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True) + await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True) + async with httpx.AsyncClient() as client: async with _atomic_file(stored_path) as f: async with client.stream('GET', url) as resp: resp.raise_for_status() diff --git a/pypi_browser/templates/package.html b/pypi_browser/templates/package.html index dd2fa41..7263f9e 100644 --- a/pypi_browser/templates/package.html +++ b/pypi_browser/templates/package.html @@ -21,7 +21,15 @@

{{package}}

{% for version, files in version_to_files %}
-
{{version}}
+
+
+ {% if version is not none %} + {{version}} + {% else %} + (unparseable version) + {% endif %} +
+
{% for file in files|sort %}