Skip to content

Commit

Permalink
Merge pull request #10 from chriskuehl/support-simple-html
Browse files Browse the repository at this point in the history
Support traditional "simple" HTML registries
  • Loading branch information
chriskuehl authored May 16, 2024
2 parents 3c1da1b + 2aaea3e commit d3ecc67
Show file tree
Hide file tree
Showing 6 changed files with 197 additions and 48 deletions.
5 changes: 2 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,16 @@ repos:
rev: v3.12.0
hooks:
- id: reorder-python-imports
args: ['--py38-plus']
args: ['--py39-plus']
- repo: https://github.com/asottile/add-trailing-comma
rev: v3.1.0
hooks:
- id: add-trailing-comma
args: ['--py36-plus']
- repo: https://github.com/asottile/pyupgrade
rev: v3.15.2
hooks:
- id: pyupgrade
args: ['--py38-plus']
args: ['--py310-plus']
- repo: https://github.com/hhatto/autopep8
rev: v2.1.0
hooks:
Expand Down
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@ You can set these environment variables to configure the server:

* `PYPI_BROWSER_PYPI_URL`: URL for the PyPI server to use (defaults to
`https://pypi.org`)

If your registry supports the pypi.org-compatible JSON API (e.g.
`{registry}/pypi/{package}/json`), specify your base registry URL without
appending `/simple` (e.g. `https://my-registry`).

If your registry only supports the traditional HTML "simple" index, specify
the registry URL with `/simple` at the end (e.g.
`https://my-registry/simple`).

Note that the [PEP 691][pep691] JSON-based "simple" API is not yet supported.

* `PYPI_BROWSER_PACKAGE_CACHE_PATH`: Filesystem path to use for caching
downloaded files. This will grow forever (the app does not clean it up) so
you may want to use `tmpreaper` or similar to manage its size.
Expand All @@ -77,3 +88,5 @@ $ make start-dev
```

to run a copy of the application locally with hot reloading enabled.

[pep691]: https://peps.python.org/pep-0691/
22 changes: 19 additions & 3 deletions pypi_browser/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,16 @@ async def dispatch(


config = starlette.config.Config()

# The configured URL picks the repository flavor: a URL ending in "/simple"
# (after trailing slashes are dropped) is served by the HTML "simple" client,
# anything else by the pypi.org-style JSON client.
pypi_url = config('PYPI_BROWSER_PYPI_URL', default='https://pypi.org').rstrip('/')
repo: pypi.PythonRepository = (
    pypi.SimpleRepository(pypi_url)
    if pypi_url.endswith('/simple')
    else pypi.LegacyJsonRepository(pypi_url)
)

pypi_config = pypi.PyPIConfig(
repo=repo,
cache_path=config('PYPI_BROWSER_PACKAGE_CACHE_PATH', default='/tmp'),
pypi_url=config('PYPI_BROWSER_PYPI_URL', default='https://pypi.org'),
)

templates = Jinja2Templates(
Expand Down Expand Up @@ -115,16 +122,25 @@ async def package(request: Request) -> Response:
return RedirectResponse(request.url_for('package', package=normalized_package_name))

try:
version_to_files = await pypi.files_for_package(pypi_config, package_name)
version_to_files = await pypi.files_by_version(pypi_config, package_name)
except pypi.PackageDoesNotExist:
return PlainTextResponse(
f'Package {package_name!r} does not exist on PyPI.',
status_code=404,
)
else:
def _version_sort_key(version: str | None) -> packaging.version.Version:
    """Return a sortable ``Version`` for a possibly missing or invalid version.

    ``None`` and strings that ``packaging.version.parse`` rejects both map to
    ``Version('0.0.0')``.
    """
    # Not really correct, but just throw everything we can't parse at the bottom.
    fallback = packaging.version.Version('0.0.0')
    if version is None:
        return fallback
    try:
        return packaging.version.parse(version)
    except packaging.version.InvalidVersion:
        return fallback

version_to_files_sorted = sorted(
version_to_files.items(),
key=lambda item: packaging.version.parse(item[0]),
key=lambda item: _version_sort_key(item[0]),
reverse=True,
)
return templates.TemplateResponse(
Expand Down
68 changes: 62 additions & 6 deletions pypi_browser/packaging.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,66 @@
from types import TracebackType


# Copied from distlib/wheel.py
#
# Verbose, case-insensitive pattern matched against wheel (``.whl``) filenames.
# The ``vn`` group is what ``guess_version_from_filename`` returns as the
# version; ``bn`` is an optional segment between ``vn`` and ``py``.
# NOTE(review): the remaining group names presumably abbreviate the other wheel
# filename components (name, python tag, ABI tag, platform) -- confirm against
# distlib before relying on that.
WHEEL_FILENAME_RE = re.compile(
r'''
(?P<nm>[^-]+)
-(?P<vn>\d+[^-]*)
(-(?P<bn>\d+[^-]*))?
-(?P<py>\w+\d+(\.\w+\d+)*)
-(?P<bi>\w+)
-(?P<ar>\w+(\.\w+)*)
\.whl$
''', re.IGNORECASE | re.VERBOSE,
)


def pep426_normalize(package_name: str) -> str:
    """Return the normalized form of a package name.

    Surrounding whitespace is stripped, every run of ``-``, ``_`` and ``.``
    is collapsed into a single ``-``, and the result is lowercased.
    """
    trimmed = package_name.strip()
    collapsed = re.sub(r'[-_.]+', '-', trimmed)
    return collapsed.lower()


def _remove_extension(name: str) -> str:
    """Strip the archive extension from a distribution filename.

    The last ``.``-separated component is always removed. If what remains
    ends in ``.tar`` (as for ``.tar.gz`` or ``.tar.bz2`` archives), that is
    removed as well. Single-component extensions such as ``.tgz``, ``.zip``
    or ``.egg`` therefore lose exactly one component, so a name like
    ``foo-1.0.tgz`` keeps its full ``1.0`` version.

    Args:
        name: The filename, e.g. ``foo-1.0.tar.gz``.

    Returns:
        The filename without its extension, e.g. ``foo-1.0``.

    Raises:
        ValueError: If ``name`` contains no ``.`` at all, so there is no
            extension to remove.
    """
    stem, dot, _ = name.rpartition('.')
    if not dot:
        raise ValueError(f'Invalid package name: {name}')
    # Compressed tarballs carry a two-part extension; drop the inner ".tar".
    return stem.removesuffix('.tar')


def guess_version_from_filename(filename: str) -> str | None:
    """Best-effort guess of the version encoded in a distribution filename.

    Wheel names are matched against ``WHEEL_FILENAME_RE`` and the ``vn``
    group is returned. Any other file has its extension removed and is then
    split on ``-``:

    * no dash: there is no version, ``None`` is returned;
    * exactly one dash: everything after the dash is the version;
    * several dashes: the version starts at the leftmost dash-separated part
      (other than the first) that contains both a ``.`` and a digit; if no
      part qualifies, ``None`` is returned.

    Args:
        filename: Bare filename of a wheel, sdist or other archive.

    Returns:
        The guessed version string, or ``None`` if no version was found.

    Raises:
        ValueError: If a ``.whl`` name does not match the wheel pattern, or
            if the package name or the version would be empty.
    """
    # Inspired by https://github.com/chriskuehl/dumb-pypi/blob/a71c3cfeba6/dumb_pypi/main.py#L56
    if filename.endswith('.whl'):
        # TODO: Switch to packaging.utils.parse_wheel_filename which enforces
        # PEP 440 versions for wheels.
        m = WHEEL_FILENAME_RE.match(filename)
        if m is None:
            raise ValueError(f'Invalid package name: {filename}')
        return m.group('vn')

    # These don't have a well-defined format like wheels do, so they are
    # sort of "best effort", with lots of tests to back them up.
    # The most important thing is to correctly parse the name.
    name = _remove_extension(filename)
    version = None

    if name.count('-') == 1:
        name, version = name.split('-')
    elif '-' in name:
        parts = name.split('-')
        # Deliberately no ``break``: scanning continues toward the front so
        # the leftmost version-looking part wins and later parts become
        # part of the version (e.g. ``0.1.3-py2.7``).
        for i in range(len(parts) - 1, 0, -1):
            part = parts[i]
            if '.' in part and re.search('[0-9]', part):
                name, version = '-'.join(parts[0:i]), '-'.join(parts[i:])

    # Possible with poorly-named files.
    if not name:
        raise ValueError(f'Invalid package name: {filename}')

    # A name such as ``foo-.tar.gz`` yields an empty version. Report it as
    # ValueError like any other malformed name instead of relying on an
    # assert (which is stripped under ``-O`` and is not caught by callers
    # that handle ValueError).
    if version is not None and not version:
        raise ValueError(f'Invalid package name: {filename}')

    return version


class UnsupportedPackageType(Exception):
    """Exception signalling a package type that is not supported."""

Expand All @@ -38,7 +94,7 @@ class PackageEntry:
size: int


def _package_entries_from_zipfile(path: str) -> typing.Set[PackageEntry]:
def _package_entries_from_zipfile(path: str) -> set[PackageEntry]:
with zipfile.ZipFile(path) as zf:
return {
PackageEntry(
Expand All @@ -51,7 +107,7 @@ def _package_entries_from_zipfile(path: str) -> typing.Set[PackageEntry]:
}


def _package_entries_from_tarball(path: str) -> typing.Set[PackageEntry]:
def _package_entries_from_tarball(path: str) -> set[PackageEntry]:
with tarfile.open(path) as tf:
return {
PackageEntry(
Expand All @@ -76,9 +132,9 @@ async def __aenter__(self) -> 'AsyncArchiveFile':

async def __aexit__(
self,
exc_t: typing.Optional[typing.Type[BaseException]],
exc_v: typing.Optional[BaseException],
exc_tb: typing.Optional[TracebackType],
exc_t: type[BaseException] | None,
exc_v: BaseException | None,
exc_tb: TracebackType | None,
) -> None:
await asyncio.to_thread(self.file_.close)

Expand Down Expand Up @@ -117,7 +173,7 @@ def from_path(cls, path: str) -> 'Package':
path=path,
)

async def entries(self) -> typing.Set[PackageEntry]:
async def entries(self) -> set[PackageEntry]:
if self.package_format is PackageFormat.ZIPFILE:
return await asyncio.to_thread(_package_entries_from_zipfile, self.path)
elif self.package_format is PackageFormat.TARBALL:
Expand Down
127 changes: 92 additions & 35 deletions pypi_browser/pypi.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import abc
import base64
import collections
import contextlib
import dataclasses
import html.parser
import itertools
import os.path
import typing
Expand All @@ -9,38 +12,98 @@
import aiofiles.os
import httpx

from pypi_browser import packaging


class PythonRepository(abc.ABC):
    """Abstract interface for a package repository that can list a package's files."""

    @abc.abstractmethod
    async def files_for_package(self, package_name: str) -> dict[str, str]:
        """Return a mapping of filename to file URL for every file of a package.

        Concrete repositories must override this coroutine.
        """


class HTMLAnchorParser(html.parser.HTMLParser):
anchors: set[str]

def __init__(self) -> None:
super().__init__()
self.anchors = set()

def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
if tag == 'a':
if href := dict(attrs).get('href'):
self.anchors.add(href)


@dataclasses.dataclass(frozen=True)
class PyPIConfig:
cache_path: str
class SimpleRepository(PythonRepository):
"""Old-style "simple" PyPI registry serving HTML files."""
# TODO: Also handle PEP691 JSON simple repositories.
pypi_url: str

async def files_for_package(self, package_name: str) -> dict[str, str]:
async with httpx.AsyncClient() as client:
resp = await client.get(
f'{self.pypi_url}/{package_name}',
follow_redirects=True,
)
if resp.status_code == 404:
raise PackageDoesNotExist(package_name)
parser = HTMLAnchorParser()
parser.feed(resp.text)

class PackageDoesNotExist(Exception):
pass
def clean_url(url: str) -> str:
parsed = urllib.parse.urlparse(urllib.parse.urljoin(str(resp.url), url))
return parsed._replace(fragment='').geturl()

return {
(urllib.parse.urlparse(url).path).split('/')[-1]: clean_url(url)
for url in parser.anchors
}

async def package_metadata(
config: PyPIConfig,
client: httpx.AsyncClient,
package: str,
) -> typing.Dict[typing.Any, typing.Any]:
resp = await client.get(f'{config.pypi_url}/pypi/{package}/json')
if resp.status_code == 404:
raise PackageDoesNotExist(package)
resp.raise_for_status()
return resp.json()

@dataclasses.dataclass(frozen=True)
class LegacyJsonRepository(PythonRepository):
"""Non-standardized JSON API compatible with pypi.org's /pypi/*/json endpoints."""
pypi_url: str

async def files_for_package(config: PyPIConfig, package: str) -> typing.Dict[str, typing.Set[str]]:
async with httpx.AsyncClient() as client:
metadata = await package_metadata(config, client, package)
async def files_for_package(self, package_name: str) -> dict[str, str]:
async with httpx.AsyncClient() as client:
resp = await client.get(
f'{self.pypi_url}/pypi/{package_name}/json',
follow_redirects=True,
)
if resp.status_code == 404:
raise PackageDoesNotExist(package_name)
resp.raise_for_status()
return {
file_['filename']: urllib.parse.urljoin(str(resp.url), file_['url'])
for file_ in itertools.chain.from_iterable(resp.json()['releases'].values())
}

return {
version: {file_['filename'] for file_ in files}
for version, files in metadata['releases'].items()
if len(files) > 0
}

@dataclasses.dataclass(frozen=True)
class PyPIConfig:
    """Immutable settings bundle: the repository to query plus a cache path."""
    # Repository implementation used to look up a package's files.
    repo: PythonRepository
    # Filesystem path for cached data.
    cache_path: str


class PackageDoesNotExist(Exception):
    """Raised when the repository answers a package lookup with HTTP 404."""


async def files_by_version(config: PyPIConfig, package: str) -> dict[str | None, set[str]]:
    """Group a package's filenames by the version guessed from each filename.

    The filenames come from ``config.repo.files_for_package``. Files whose
    name yields no version are grouped under the ``None`` key; files whose
    name cannot be processed at all (``ValueError``) are left out.
    """
    grouped = collections.defaultdict(set)
    filenames = await config.repo.files_for_package(package)
    for candidate in filenames:
        try:
            guessed = packaging.guess_version_from_filename(candidate)
        except ValueError:
            # Possible with some very poorly-formed packages that used to be
            # allowed on PyPI. Just skip them when this happens.
            continue
        grouped[guessed].add(candidate)
    return grouped


class CannotFindFileError(Exception):
Expand Down Expand Up @@ -81,21 +144,15 @@ async def downloaded_file_path(config: PyPIConfig, package: str, filename: str)
if await aiofiles.os.path.exists(stored_path):
return stored_path

async with httpx.AsyncClient() as client:
metadata = await package_metadata(config, client, package)

# Parsing versions from non-wheel Python packages isn't perfectly
# reliable, so just search through all releases until we find a
# matching file.
for file_ in itertools.chain.from_iterable(metadata['releases'].values()):
if file_['filename'] == filename:
url = urllib.parse.urljoin(config.pypi_url, file_['url'])
break
else:
raise CannotFindFileError(package, filename)
filename_to_url = await config.repo.files_for_package(package)
try:
url = filename_to_url[filename]
except KeyError:
raise CannotFindFileError(package, filename)

await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True)
await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True)

async with httpx.AsyncClient() as client:
async with _atomic_file(stored_path) as f:
async with client.stream('GET', url) as resp:
resp.raise_for_status()
Expand Down
10 changes: 9 additions & 1 deletion pypi_browser/templates/package.html
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,15 @@ <h1 class="font-monospace">{{package}}</h1>
</p>
{% for version, files in version_to_files %}
<div class="card bg-light mb-3">
<div class="card-header"><h5 class="mb-0">{{version}}</h5></div>
<div class="card-header">
<h5 class="mb-0">
{% if version is not none %}
{{version}}
{% else %}
(unparseable version)
{% endif %}
</h5>
</div>
<div class="list-group list-group-flush">
{% for file in files|sort %}
<a class="list-group-item list-group-item-action" href="{{url_for('package_file', package=package, filename=file)}}">
Expand Down

0 comments on commit d3ecc67

Please sign in to comment.