From 5329c91e743414b09e955d3049e964cfac3c8be0 Mon Sep 17 00:00:00 2001
From: Chris Kuehl
Date: Wed, 15 May 2024 14:04:06 -0700
Subject: [PATCH 1/4] Support traditional "simple" HTML registries
---
pypi_browser/app.py | 22 ++++++-
pypi_browser/packaging.py | 54 ++++++++++++++++
pypi_browser/pypi.py | 127 +++++++++++++++++++++++++++-----------
3 files changed, 165 insertions(+), 38 deletions(-)
diff --git a/pypi_browser/app.py b/pypi_browser/app.py
index ef5259d..e0b1968 100644
--- a/pypi_browser/app.py
+++ b/pypi_browser/app.py
@@ -74,9 +74,16 @@ async def dispatch(
config = starlette.config.Config()
+pypi_url = config('PYPI_BROWSER_PYPI_URL', default='https://pypi.org').rstrip('/')
+repo: pypi.PythonRepository
+if pypi_url.endswith('/simple'):
+ repo = pypi.SimpleRepository(pypi_url)
+else:
+ repo = pypi.LegacyJsonRepository(pypi_url)
+
pypi_config = pypi.PyPIConfig(
+ repo=repo,
cache_path=config('PYPI_BROWSER_PACKAGE_CACHE_PATH', default='/tmp'),
- pypi_url=config('PYPI_BROWSER_PYPI_URL', default='https://pypi.org'),
)
templates = Jinja2Templates(
@@ -115,16 +122,25 @@ async def package(request: Request) -> Response:
return RedirectResponse(request.url_for('package', package=normalized_package_name))
try:
- version_to_files = await pypi.files_for_package(pypi_config, package_name)
+ version_to_files = await pypi.files_by_version(pypi_config, package_name)
except pypi.PackageDoesNotExist:
return PlainTextResponse(
f'Package {package_name!r} does not exist on PyPI.',
status_code=404,
)
else:
+ def _version_sort_key(version: str | None) -> packaging.version.Version:
+ if version is not None:
+ try:
+ return packaging.version.parse(version)
+ except packaging.version.InvalidVersion:
+ pass
+ # Not really correct, but just throw everything we can't parse at the bottom.
+ return packaging.version.Version('0.0.0')
+
version_to_files_sorted = sorted(
version_to_files.items(),
- key=lambda item: packaging.version.parse(item[0]),
+ key=lambda item: _version_sort_key(item[0]),
reverse=True,
)
return templates.TemplateResponse(
diff --git a/pypi_browser/packaging.py b/pypi_browser/packaging.py
index ab10a98..b667c58 100644
--- a/pypi_browser/packaging.py
+++ b/pypi_browser/packaging.py
@@ -12,10 +12,64 @@
from types import TracebackType
+# Copied from distlib/wheel.py
+WHEEL_FILENAME_RE = re.compile(r'''
+(?P<nm>[^-]+)
+-(?P<vn>\d+[^-]*)
+(-(?P<bn>\d+[^-]*))?
+-(?P<py>\w+\d+(\.\w+\d+)*)
+-(?P<bi>\w+)
+-(?P<ar>\w+(\.\w+)*)
+\.whl$
+''', re.IGNORECASE | re.VERBOSE)
+
+
def pep426_normalize(package_name: str) -> str:
return re.sub(r'[-_.]+', '-', package_name.strip()).lower()
+def _remove_extension(name: str) -> str:
+ if name.endswith(('gz', 'bz2')):
+ name, _ = name.rsplit('.', 1)
+ name, _ = name.rsplit('.', 1)
+ return name
+
+
+def guess_version_from_filename(filename: str) -> str | None:
+ # Inspired by https://github.com/chriskuehl/dumb-pypi/blob/a71c3cfeba6/dumb_pypi/main.py#L56
+ if filename.endswith('.whl'):
+ # TODO: Switch to packaging.utils.parse_wheel_filename which enforces
+ # PEP440 versions for wheels.
+ m = WHEEL_FILENAME_RE.match(filename)
+ if m is not None:
+ return m.group('vn')
+ else:
+ raise ValueError(f'Invalid package name: {filename}')
+ else:
+ # These don't have a well-defined format like wheels do, so they are
+ # sort of "best effort", with lots of tests to back them up.
+ # The most important thing is to correctly parse the name.
+ name = _remove_extension(filename)
+ version = None
+
+ if '-' in name:
+ if name.count('-') == 1:
+ name, version = name.split('-')
+ else:
+ parts = name.split('-')
+ for i in range(len(parts) - 1, 0, -1):
+ part = parts[i]
+ if '.' in part and re.search('[0-9]', part):
+ name, version = '-'.join(parts[0:i]), '-'.join(parts[i:])
+
+ # Possible with poorly-named files.
+ if len(name) <= 0:
+ raise ValueError(f'Invalid package name: {filename}')
+
+ assert version is None or len(version) > 0, version
+ return version
+
+
class UnsupportedPackageType(Exception):
pass
diff --git a/pypi_browser/pypi.py b/pypi_browser/pypi.py
index 8506518..cccd551 100644
--- a/pypi_browser/pypi.py
+++ b/pypi_browser/pypi.py
@@ -1,6 +1,9 @@
+import abc
import base64
+import collections
import contextlib
import dataclasses
+import html.parser
import itertools
import os.path
import typing
@@ -9,38 +12,98 @@
import aiofiles.os
import httpx
+from pypi_browser import packaging
+
+
+class PythonRepository(abc.ABC):
+
+ @abc.abstractmethod
+ async def files_for_package(self, package_name: str) -> typing.Dict[str, str]:
+ """Return mapping from filename to file URL for files in a package."""
+
+
+class HTMLAnchorParser(html.parser.HTMLParser):
+ anchors: set[str]
+
+ def __init__(self) -> None:
+ super().__init__()
+ self.anchors = set()
+
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+ if tag == 'a':
+ if href := dict(attrs).get('href'):
+ self.anchors.add(href)
+
@dataclasses.dataclass(frozen=True)
-class PyPIConfig:
- cache_path: str
+class SimpleRepository(PythonRepository):
+ """Old-style "simple" PyPI registry serving HTML files."""
+ # TODO: Also handle PEP691 JSON simple repositories.
pypi_url: str
+ async def files_for_package(self, package_name: str) -> typing.Dict[str, str]:
+ async with httpx.AsyncClient() as client:
+ resp = await client.get(
+ f'{self.pypi_url}/{package_name}',
+ follow_redirects=True,
+ )
+ if resp.status_code == 404:
+ raise PackageDoesNotExist(package_name)
+ parser = HTMLAnchorParser()
+ parser.feed(resp.text)
-class PackageDoesNotExist(Exception):
- pass
+ def clean_url(url: str) -> str:
+ parsed = urllib.parse.urlparse(urllib.parse.urljoin(str(resp.url), url))
+ return parsed._replace(fragment='').geturl()
+ return {
+ (urllib.parse.urlparse(url).path).split('/')[-1]: clean_url(url)
+ for url in parser.anchors
+ }
-async def package_metadata(
- config: PyPIConfig,
- client: httpx.AsyncClient,
- package: str,
-) -> typing.Dict[typing.Any, typing.Any]:
- resp = await client.get(f'{config.pypi_url}/pypi/{package}/json')
- if resp.status_code == 404:
- raise PackageDoesNotExist(package)
- resp.raise_for_status()
- return resp.json()
+@dataclasses.dataclass(frozen=True)
+class LegacyJsonRepository(PythonRepository):
+ """Non-standardized JSON API compatible with pypi.org's /pypi/*/json endpoints."""
+ pypi_url: str
-async def files_for_package(config: PyPIConfig, package: str) -> typing.Dict[str, typing.Set[str]]:
- async with httpx.AsyncClient() as client:
- metadata = await package_metadata(config, client, package)
+ async def files_for_package(self, package_name: str) -> typing.Dict[str, str]:
+ async with httpx.AsyncClient() as client:
+ resp = await client.get(
+ f'{self.pypi_url}/pypi/{package_name}/json',
+ follow_redirects=True,
+ )
+ if resp.status_code == 404:
+ raise PackageDoesNotExist(package_name)
+ resp.raise_for_status()
+ return {
+ file_['filename']: urllib.parse.urljoin(str(resp.url), file_['url'])
+ for file_ in itertools.chain.from_iterable(resp.json()['releases'].values())
+ }
- return {
- version: {file_['filename'] for file_ in files}
- for version, files in metadata['releases'].items()
- if len(files) > 0
- }
+
+@dataclasses.dataclass(frozen=True)
+class PyPIConfig:
+ repo: PythonRepository
+ cache_path: str
+
+
+class PackageDoesNotExist(Exception):
+ pass
+
+
+async def files_by_version(config: PyPIConfig, package: str) -> typing.Dict[str | None, typing.Set[str]]:
+ ret = collections.defaultdict(set)
+ for filename in await config.repo.files_for_package(package):
+ try:
+ version = packaging.guess_version_from_filename(filename)
+ except ValueError:
+ # Possible with some very poorly-formed packages that used to be
+ # allowed on PyPI. Just skip them when this happens.
+ pass
+ else:
+ ret[version].add(filename)
+ return ret
class CannotFindFileError(Exception):
@@ -81,21 +144,15 @@ async def downloaded_file_path(config: PyPIConfig, package: str, filename: str)
if await aiofiles.os.path.exists(stored_path):
return stored_path
- async with httpx.AsyncClient() as client:
- metadata = await package_metadata(config, client, package)
-
- # Parsing versions from non-wheel Python packages isn't perfectly
- # reliable, so just search through all releases until we find a
- # matching file.
- for file_ in itertools.chain.from_iterable(metadata['releases'].values()):
- if file_['filename'] == filename:
- url = urllib.parse.urljoin(config.pypi_url, file_['url'])
- break
- else:
- raise CannotFindFileError(package, filename)
+ filename_to_url = await config.repo.files_for_package(package)
+ try:
+ url = filename_to_url[filename]
+ except KeyError:
+ raise CannotFindFileError(package, filename)
- await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True)
+ await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True)
+ async with httpx.AsyncClient() as client:
async with _atomic_file(stored_path) as f:
async with client.stream('GET', url) as resp:
resp.raise_for_status()
From 4dbf23ed2afba9453bc70a3efb3147c74a37b8d2 Mon Sep 17 00:00:00 2001
From: Chris Kuehl
Date: Thu, 16 May 2024 09:00:32 -0700
Subject: [PATCH 2/4] pre-commit autoupdate
---
.pre-commit-config.yaml | 5 ++---
pypi_browser/packaging.py | 18 ++++++++++--------
pypi_browser/pypi.py | 10 +++++-----
3 files changed, 17 insertions(+), 16 deletions(-)
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d820913..7449ceb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,17 +14,16 @@ repos:
rev: v3.12.0
hooks:
- id: reorder-python-imports
- args: ['--py38-plus']
+ args: ['--py39-plus']
- repo: https://github.com/asottile/add-trailing-comma
rev: v3.1.0
hooks:
- id: add-trailing-comma
- args: ['--py36-plus']
- repo: https://github.com/asottile/pyupgrade
rev: v3.15.2
hooks:
- id: pyupgrade
- args: ['--py38-plus']
+ args: ['--py310-plus']
- repo: https://github.com/hhatto/autopep8
rev: v2.1.0
hooks:
diff --git a/pypi_browser/packaging.py b/pypi_browser/packaging.py
index b667c58..95e3715 100644
--- a/pypi_browser/packaging.py
+++ b/pypi_browser/packaging.py
@@ -13,7 +13,8 @@
# Copied from distlib/wheel.py
-WHEEL_FILENAME_RE = re.compile(r'''
+WHEEL_FILENAME_RE = re.compile(
+ r'''
 (?P<nm>[^-]+)
 -(?P<vn>\d+[^-]*)
 (-(?P<bn>\d+[^-]*))?
@@ -21,7 +22,8 @@
 -(?P<bi>\w+)
 -(?P<ar>\w+(\.\w+)*)
\.whl$
-''', re.IGNORECASE | re.VERBOSE)
+''', re.IGNORECASE | re.VERBOSE,
+)
def pep426_normalize(package_name: str) -> str:
@@ -92,7 +94,7 @@ class PackageEntry:
size: int
-def _package_entries_from_zipfile(path: str) -> typing.Set[PackageEntry]:
+def _package_entries_from_zipfile(path: str) -> set[PackageEntry]:
with zipfile.ZipFile(path) as zf:
return {
PackageEntry(
@@ -105,7 +107,7 @@ def _package_entries_from_zipfile(path: str) -> typing.Set[PackageEntry]:
}
-def _package_entries_from_tarball(path: str) -> typing.Set[PackageEntry]:
+def _package_entries_from_tarball(path: str) -> set[PackageEntry]:
with tarfile.open(path) as tf:
return {
PackageEntry(
@@ -130,9 +132,9 @@ async def __aenter__(self) -> 'AsyncArchiveFile':
async def __aexit__(
self,
- exc_t: typing.Optional[typing.Type[BaseException]],
- exc_v: typing.Optional[BaseException],
- exc_tb: typing.Optional[TracebackType],
+ exc_t: type[BaseException] | None,
+ exc_v: BaseException | None,
+ exc_tb: TracebackType | None,
) -> None:
await asyncio.to_thread(self.file_.close)
@@ -171,7 +173,7 @@ def from_path(cls, path: str) -> 'Package':
path=path,
)
- async def entries(self) -> typing.Set[PackageEntry]:
+ async def entries(self) -> set[PackageEntry]:
if self.package_format is PackageFormat.ZIPFILE:
return await asyncio.to_thread(_package_entries_from_zipfile, self.path)
elif self.package_format is PackageFormat.TARBALL:
diff --git a/pypi_browser/pypi.py b/pypi_browser/pypi.py
index cccd551..e1764e0 100644
--- a/pypi_browser/pypi.py
+++ b/pypi_browser/pypi.py
@@ -18,7 +18,7 @@
class PythonRepository(abc.ABC):
@abc.abstractmethod
- async def files_for_package(self, package_name: str) -> typing.Dict[str, str]:
+ async def files_for_package(self, package_name: str) -> dict[str, str]:
"""Return mapping from filename to file URL for files in a package."""
@@ -41,7 +41,7 @@ class SimpleRepository(PythonRepository):
# TODO: Also handle PEP691 JSON simple repositories.
pypi_url: str
- async def files_for_package(self, package_name: str) -> typing.Dict[str, str]:
+ async def files_for_package(self, package_name: str) -> dict[str, str]:
async with httpx.AsyncClient() as client:
resp = await client.get(
f'{self.pypi_url}/{package_name}',
@@ -67,7 +67,7 @@ class LegacyJsonRepository(PythonRepository):
"""Non-standardized JSON API compatible with pypi.org's /pypi/*/json endpoints."""
pypi_url: str
- async def files_for_package(self, package_name: str) -> typing.Dict[str, str]:
+ async def files_for_package(self, package_name: str) -> dict[str, str]:
async with httpx.AsyncClient() as client:
resp = await client.get(
f'{self.pypi_url}/pypi/{package_name}/json',
@@ -92,9 +92,9 @@ class PackageDoesNotExist(Exception):
pass
-async def files_by_version(config: PyPIConfig, package: str) -> typing.Dict[str | None, typing.Set[str]]:
+async def files_by_version(config: PyPIConfig, package: str) -> dict[str | None, set[str]]:
ret = collections.defaultdict(set)
- for filename in await config.repo.files_for_package(package):
+ for filename in await config.repo.files_for_package(package):
try:
version = packaging.guess_version_from_filename(filename)
except ValueError:
From da4bd63017edeff7dd57d07dfb1a89438eee063a Mon Sep 17 00:00:00 2001
From: Chris Kuehl
Date: Thu, 16 May 2024 09:09:32 -0700
Subject: [PATCH 3/4] Display "None" versions as "(unparseable version)"
---
pypi_browser/templates/package.html | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/pypi_browser/templates/package.html b/pypi_browser/templates/package.html
index dd2fa41..7263f9e 100644
--- a/pypi_browser/templates/package.html
+++ b/pypi_browser/templates/package.html
@@ -21,7 +21,15 @@ {{package}}
{% for version, files in version_to_files %}