diff --git a/.github/workflows/packages-anaconda-org.yml b/.github/workflows/packages-anaconda-org.yml index c9d483a19d2b3..ae6e936230b03 100644 --- a/.github/workflows/packages-anaconda-org.yml +++ b/.github/workflows/packages-anaconda-org.yml @@ -22,29 +22,28 @@ jobs: export PYTHONPATH="$( pwd )/src" python -m package_downloads.ntp_time )" - git fetch --depth=1 origin data:data - last_commit_message="$( git --no-pager log -1 --pretty=%s data )" - set -x - case "${last_commit_message}" in - *"${time}" ) + + if git ls-remote --exit-code --quiet --tags -- origin "${time}" ; then echo "skip=1" >> "${GITHUB_OUTPUT}" - ;; - * ) + else echo "skip=0" >> "${GITHUB_OUTPUT}" - esac + fi - if: ${{ github.ref != 'refs/heads/main' || steps.check.outputs.skip == '0' }} name: Fetch id: fetch run: | + set -x + ref="$( git show-ref --head --hash HEAD )" pip install \ ntplib \ aiohttp requests urllib3 \ pandas + git fetch --depth=1 origin data:data + git checkout --quiet data + git checkout --quiet "${ref}" -- ./src time="$( export PYTHONPATH="$( pwd )/src" - mkdir tmp - cd tmp python -m package_downloads.stats_from_anaconda_org )" echo "time=${time}" >> "${GITHUB_OUTPUT}" @@ -52,13 +51,11 @@ jobs: - if: ${{ github.ref != 'refs/heads/main' || steps.check.outputs.skip == '0' }} name: Add changes, commit run: | + set -x git config user.name github-actions git config user.email github-actions@github.com - git checkout data - rm -rf package-downloads - mv tmp/package-downloads ./ - rmdir tmp git add -A ./package-downloads + git rm -rf ./src git status --short | grep -o ^. | sort | uniq -c git commit -qm 'Update package download stats, ${{ steps.fetch.outputs.time }}' git tag '${{ steps.fetch.outputs.time }}' diff --git a/package-downloads.md b/package-downloads.md deleted file mode 100644 index 317aec5fa420b..0000000000000 --- a/package-downloads.md +++ /dev/null @@ -1,182 +0,0 @@ ----- -# Directory structure -``` -package-downloads/ - anaconda.org/ - .json - / - .json - / - .json - / - .json - / - .json -``` - ----- -# JSON structure -``` -// package-downloads/anaconda.org/channel.json -{ - "channel": "", - "download_per_package": [ - {"package": "", "total": }, - ... - ] - "downloads_per_date": [ - {"date": "YYYY-mm-dd", "total": }, - ... - ] -] -``` -``` -// package-downloads/anaconda.org/channel/package.json -{ - "channel": "", - "package": "", - "downloads_per_version": [ - {"version": "", "total": }, - ... - ] - "downloads_per_date": [ - {"date": "YYYY-mm-dd", "total": }, - ... - ] -} -``` -``` -// package-downloads/anaconda.org/channel/package/version.json -{ - "channel": "", - "package": "", - "version": "", - "downloads_per_subdir": [ - {"subdir": "", "total": }, - ... - ] - "downloads_per_date": [ - {"date": "YYYY-mm-dd", "total": }, - ... - ] -} -``` -``` -// package-downloads/anaconda.org/channel/package/version/subdir.json -{ - "channel": "", - "package": "", - "version": "", - "subdir": "", - "downloads_per_basename": [ - {"basename": "", "total": }, - ... - ] - "downloads_per_date": [ - {"date": "YYYY-mm-dd", "total": }, - ... - ] -} -``` -``` -// package-downloads/anaconda.org/channel/package/version/subdir/basename.json -{ - "channel": "", - "package": "", - "version": "", - "subdir": "", - "basename": "", - "downloads_per_date": [ - {"date": "YYYY-mm-dd", "total": }, - ... - ] -} -``` - ----- -# Example - -``` -// package-downloads/anaconda.org/bioconda.json -{ -"channel":"bioconda", -"downloads_per_package":[ -// Limited to top N packages to avoid bloating the channel.json -{"package":"example","total":23456} -], -"downloads_per_date":[ -{"date":"2021-01-01","total":21234}, -// For stretches without download count changes, only record start and end points: -{"date":"2021-01-02","total":22345}, -// {"date":"2021-01-03","total":22345}, // Same count as previous and next date => exclude from .json -{"date":"2021-01-04","total":22345}, // Same count as previous but not next date => include as end point -{"date":"2021-01-05","total":23456} -] -} -``` -``` -// package-downloads/anaconda.org/bioconda/example.json -{ -"channel":"bioconda", -"name":"example-package", -"downloads_per_date":[ -{"date":"2021-01-01","total":21234}, -// For stretches without download count changes, only record start and end points: -{"date":"2021-01-02","total":22345}, -// {"date":"2021-01-03","total":22345}, // Same count as previous and next date => exclude from .json -{"date":"2021-01-04","total":22345}, // Same count as previous but not next date => include as end point -{"date":"2021-01-05","total":23456} -] -} -``` -``` -// package-downloads/anaconda.org/bioconda/example/1.0.0.json -{ -"channel":"bioconda", -"name":"example-package", -"version":"1.0.0", -"downloads_per_date":[ -{"date":"2021-01-01","total":21234}, -// For stretches without download count changes, only record start and end points: -{"date":"2021-01-02","total":22345}, -// {"date":"2021-01-03","total":22345}, // Same count as previous and next date => exclude from .json -// {"date":"2021-01-04","total":22345}, // Same count as previous and next date => exclude from .json -{"date":"2021-01-05","total":23456} -] -} -``` -``` -// package-downloads/anaconda.org/bioconda/example/1.0.0/linux-64.json -{ -"channel":"bioconda", -"name":"example-package", -"version":"1.0.0", -"subdir":"linux-64", -"downloads_per_date":[ -{"date":"2021-01-01","total":11234}, -// For stretches without download count changes, only record start and end points: -{"date":"2021-01-02","total":12345}, -// {"date":"2021-01-03","total":12345}, // Same count as previous and next date => exclude from .json -// {"date":"2021-01-04","total":12345}, // Same count as previous and next date => exclude from .json -{"date":"2021-01-05","total":13456} -] -} -``` -``` -// package-downloads/anaconda.org/bioconda/example/1.0.0/linux-64/example-1.0.0-h1234567_1.tar.bz2.json -{ -"channel":"bioconda", -"name":"example-package", -"version":"1.0.0", -"subdir":"linux-64", -"basename":"example-1.0.0-h1234567_1.tar.bz2", -"downloads_per_date":[ -{"date":"2021-01-01","total":1234}, -// For stretches without download count changes, only record start and end points: -{"date":"2021-01-02","total":2345}, -// {"date":"2021-01-03","total":2345}, // Same count as previous and next date => exclude from .json -// {"date":"2021-01-04","total":2345}, // Same count as previous and next date => exclude from .json -{"date":"2021-01-05","total":3456} -] -} -``` diff --git a/src/package_downloads/_vendor/conda/__pycache__/__init__.cpython-311.pyc b/src/package_downloads/_vendor/conda/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 88378b2b7d9ea..0000000000000 Binary files a/src/package_downloads/_vendor/conda/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/src/package_downloads/_vendor/conda/__pycache__/exceptions.cpython-311.pyc b/src/package_downloads/_vendor/conda/__pycache__/exceptions.cpython-311.pyc deleted file mode 100644 index 8cce4bf2806f5..0000000000000 Binary files a/src/package_downloads/_vendor/conda/__pycache__/exceptions.cpython-311.pyc and /dev/null differ diff --git a/src/package_downloads/_vendor/conda/models/__pycache__/__init__.cpython-311.pyc b/src/package_downloads/_vendor/conda/models/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 1336961b8d848..0000000000000 Binary files a/src/package_downloads/_vendor/conda/models/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/src/package_downloads/_vendor/conda/models/__pycache__/version.cpython-311.pyc b/src/package_downloads/_vendor/conda/models/__pycache__/version.cpython-311.pyc deleted file mode 100644 index 93e8fdff0280c..0000000000000 Binary files a/src/package_downloads/_vendor/conda/models/__pycache__/version.cpython-311.pyc and /dev/null differ diff --git a/src/package_downloads/stats_from_anaconda_org.py b/src/package_downloads/stats_from_anaconda_org.py index 8bff8a1d4820b..70763773163e2 100644 --- a/src/package_downloads/stats_from_anaconda_org.py +++ b/src/package_downloads/stats_from_anaconda_org.py @@ -1,12 +1,13 @@ #! /usr/bin/env python from asyncio import run +from collections import defaultdict from functools import partial from itertools import islice from logging import INFO, basicConfig, getLogger from pathlib import Path from time import sleep -from typing import Any, Dict, Iterable, List, Tuple +from typing import Any, Dict, Iterable, List import re from aiohttp import ClientSession @@ -64,40 +65,35 @@ async def fetch_package_download_counts( continue downloads.append( { - "top": TOP_DIR, - "channel": channel, "package": package, "version": package_file_info["version"], "subdir": package_file_info["attrs"]["subdir"], - "build": package_file_info["attrs"]["build"], - "extension": PACKAGE_EXTENSION_RE.search(package_file_info["basename"])[0], + # "build": package_file_info["attrs"]["build"], + # "extension": PACKAGE_EXTENSION_RE.search(package_file_info["basename"])[0], "total": max(0, package_file_info["ndownloads"]), } ) - df = pd.DataFrame( + return pd.DataFrame( sorted( downloads, key=lambda e: ( - e["top"], - e["channel"], e["package"], VersionOrder(e["version"]), # VersionOrder can be ambiguous (e.g., "1.1" == "1.01"), so compare by str, too. e["version"], e["subdir"], - e["build"], - e["extension"], + # e["build"], + # e["extension"], ), ) ) - return df.set_index(df.loc[:, :"package"].columns.tolist()) async def get_batch_package_download_counts( date: str, channel_name: str, package_names: List[str] ) -> Iterable[pd.DataFrame]: - retries_per_chunk = 5 - retry_delay = 15 + retries_per_chunk = 2 + retry_delay = 60 retry = 0 while True: try: @@ -118,42 +114,66 @@ async def get_batch_package_download_counts( sleep(retry_delay) -async def save_counts(counts: Tuple[Tuple[str, ...], pd.DataFrame]) -> None: - index, totals = counts - path = Path(BASE_DIR).joinpath(*index[:-1], index[-1] + ".tsv") - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(totals.to_csv(sep="\t", lineterminator="\n", index=False)) - - -async def save_channel_stats( +async def get_channel_stats( date: str, channel_name: str, package_names: List[str] ) -> pd.DataFrame: - fetch_count = 0 - totals_list: List[pd.DataFrame] = [] chunk_size = 500 + fetch_count = 0 + stats_list: List[pd.DataFrame] = [] for chunk_package_names in chunked_lists(package_names, chunk_size): - chunk_totals = pd.concat( + stats_list.extend( await get_batch_package_download_counts(date, channel_name, chunk_package_names) ) - fetch_count += len(chunk_package_names) - log("save_counts: %s: %d of %d", channel_name, fetch_count, len(package_names)) + log("get_channel_stats: %s: %d of %d", channel_name, fetch_count, len(package_names)) + return pd.concat(stats_list) - grouped = chunk_totals.groupby(chunk_totals.index.names) - await gather_map(save_counts, grouped) - totals_list.append(grouped.sum("total")) - totals = pd.concat(totals_list) - while True: - names = totals.index.droplevel(-1).names - totals.reset_index(inplace=True) - totals.set_index(names, inplace=True) - grouped = totals.groupby(totals.index.names) - if len(totals.index.names) > 1: - await gather_map(save_counts, grouped) - totals = grouped.sum("total") - continue - return totals +def read_tsv(path: Path, **kwargs: Any) -> pd.DataFrame: + return pd.read_csv(path, sep="\t", dtype=defaultdict(lambda: str, total=int)) + + +def write_tsv(path: Path, data_frame: pd.DataFrame) -> None: + data_frame.to_csv(path, sep="\t", lineterminator="\n", index=True) + + +async def save_packages_stats(channel_dir: Path, totals: pd.DataFrame) -> None: + log("save_packages_stats: %s", channel_dir.name) + packages_totals = totals.groupby("package", sort=True) + write_tsv(channel_dir / "packages.tsv", packages_totals.sum("total")) + + versions_dir = channel_dir / "versions" + versions_dir.mkdir(parents=True, exist_ok=True) + for package, package_totals in packages_totals: + version_totals = package_totals.groupby("version", sort=False) + write_tsv(versions_dir / f"{package}.tsv", version_totals.sum("total")) + + +async def save_historic_channel_stats( + date: str, channel_dir: Path, totals: pd.DataFrame +) -> None: + channel_totals = pd.DataFrame([{"date": date, "total": totals["total"].sum()}]) + channel_tsv = channel_dir / "channel.tsv" + if channel_tsv.exists(): + channel_totals = pd.concat([read_tsv(channel_tsv), channel_totals]) + channel_totals.set_index("date", inplace=True) + write_tsv(channel_tsv, channel_totals) + + +async def save_channel_stats(date: str, channel_name: str, package_names: List[str]) -> None: + totals = await get_channel_stats(date, channel_name, package_names) + + log("save_channel_stats: %s: entries %d", channel_name, len(totals)) + + channel_dir = Path(BASE_DIR) / TOP_DIR / channel_name + channel_dir.mkdir(parents=True, exist_ok=True) + + await save_historic_channel_stats(date, channel_dir, totals) + + subdirs_totals = totals.groupby("subdir", sort=True) + write_tsv(channel_dir / "subdirs.tsv", subdirs_totals.sum("total")) + + await save_packages_stats(channel_dir, totals) async def main() -> str: @@ -164,14 +184,8 @@ async def main() -> str: for channel_name, channel_url in channels.items() } date = session.date - totals = pd.DataFrame() for channel_name, package_names in channel_package_names.items(): - channel_totals = await save_channel_stats(date, channel_name, package_names) - totals = pd.concat((totals, channel_totals)) - totals.insert(0, "date", date) - for index, entry in totals.groupby(totals.index.names[0]): - path = Path(BASE_DIR).joinpath(index + ".tsv") - path.write_text(entry.to_csv(sep="\t", lineterminator="\n", index=False)) + await save_channel_stats(date, channel_name, package_names) return date