Skip to content

Commit

Permalink
public
Browse files Browse the repository at this point in the history
  • Loading branch information
yzqzss committed Sep 27, 2023
1 parent 860b5d9 commit 4ae4d21
Show file tree
Hide file tree
Showing 4 changed files with 234 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
__pycache__
.venv/

index-v2.json
sourceCodes.txt
118 changes: 118 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import asyncio
import logging
import requests
from pathlib import Path
import json
import tqdm
import argparse

from swh import git_swh

# URL of the full F-Droid repository index (a large JSON document).
json_url = 'https://f-droid.org/repo/index-v2.json'
# Local cache of the downloaded index; reused unless --refresh is given.
json_cache = Path('index-v2.json')
# Text file with one already-submitted repo URL per line; used to skip repeats.
success_repos = Path('success_repos.txt')


def parse_args():
    """Parse command-line options for the F-Droid -> Software Heritage submitter."""
    argp = argparse.ArgumentParser()
    argp.add_argument('--refresh', action='store_true', help='Redownload index-v2.json')
    argp.add_argument('--swh-token', type=str)
    argp.add_argument('--list-only', action='store_true', help='Only list repos to sourceCodes.txt, do not submit to SWH')
    return argp.parse_args()

async def main():
    """Fetch the F-Droid package index, collect upstream sourceCode URLs,
    and (unless --list-only) submit each repository to Software Heritage.
    """
    args = parse_args()

    # Load the (large) index from the local cache, or download it.
    if not args.refresh and json_cache.exists():
        with json_cache.open('rb') as f:
            data = json.load(f)
    else:
        r = requests.get(json_url, stream=True)
        r.raise_for_status()
        raw = b''
        for chunk in tqdm.tqdm(r.iter_content(chunk_size=1024 * 1024), unit='chunk', unit_scale=True):
            raw += chunk
        with json_cache.open('wb') as f:
            f.write(raw)
        # BUG FIX: measure the size of the raw bytes. The original parsed
        # first and then took len() of the resulting dict (its key count),
        # which always printed "0 MiB".
        print(len(raw) // 1024 // 1024, 'MiB')
        data = json.loads(raw)
    packages = data.get('packages', {})

    print('Sorting new packages...1')
    # Sort packages newest-first by their 'added' timestamp (milliseconds).
    # (The original copied every package into new_packages first; the
    # date-based filter was commented out, so the copy was a no-op.)
    print('Sorting new packages...2')
    sorted_new_packages = dict(
        sorted(packages.items(), key=lambda kv: kv[1]['metadata']['added'], reverse=True)
    )
    print(len(sorted_new_packages))

    # Collect the upstream repository URL of every package that has one.
    sourceCodes = set()
    for package in sorted_new_packages.values():
        added = package['metadata']['added']
        sourceCode = package['metadata'].get("sourceCode")
        if sourceCode:  # some packages have sourceCode == None
            print(added, sourceCode)
            sourceCodes.add(sourceCode)

    with open("sourceCodes.txt", "w") as f:
        f.write("\n".join(sourceCodes)+"\n")

    print("sourceCodes.txt written")

    if args.list_only:
        return

    # Check the token once, before starting any work (the original asserted
    # inside the loop, after coroutines had already been created).
    assert args.swh_token, 'Please provide --swh-token'

    success_repos_text = success_repos.read_text() if success_repos.exists() else ''

    cors_list = []    # pending git_swh coroutines for the current batch
    cors_codes = []   # source URLs matching those coroutines
    cors_workers = 10

    logging.info('Starting...')

    # very bad implementation, but it's just a simple script :)

    async def _flush_batch():
        """Await the current batch and append newly processed URLs to success_repos."""
        nonlocal cors_list, cors_codes, success_repos_text
        await asyncio.gather(*cors_list)
        success_repos_text = success_repos.read_text() if success_repos.exists() else ''
        with success_repos.open('a') as f:
            for cors_code in cors_codes:
                if cors_code not in success_repos_text:
                    f.write(cors_code + '\n')
        cors_list = []
        cors_codes = []

    for sourceCode in sourceCodes:
        # Plain substring check against the whole file; good enough here.
        if sourceCode in success_repos_text:
            logging.info('Skipping %s', sourceCode)
            continue
        cors_codes.append(sourceCode)

        cor = git_swh(sourceCode, swh_token=args.swh_token)
        logging.info('Starting %s', sourceCode)
        await asyncio.sleep(0.5)
        cors_list.append(cor)
        if len(cors_list) >= cors_workers:
            await _flush_batch()

    # BUG FIX: the original dropped any final batch smaller than cors_workers,
    # leaving those coroutines un-awaited and the repos unsubmitted/unrecorded.
    if cors_list:
        await _flush_batch()

if __name__ == '__main__':
    # Script entry point: enable INFO-level logging, then run the async main.
    logging.basicConfig(level=logging.INFO)
    asyncio.run(main())
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
requests
tqdm
httpx
108 changes: 108 additions & 0 deletions swh.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import asyncio
import logging
import time
import traceback
from typing import Optional
from urllib.parse import urljoin

import httpx


async def validate_git_url(client: httpx.AsyncClient, url: Optional[str]):
    """Check whether *url* looks like a reachable smart-HTTP git repository.

    Raises ValueError when *url* is not a string. Returns False for
    non-http(s) URLs, for hosts that stay unreachable after 5 attempts, and
    for endpoints that do not answer with a git-upload-pack advertisement;
    True otherwise.
    """
    if not isinstance(url, str):
        raise ValueError('Invalid URL')
    if not (url.startswith('https://') or url.startswith('http://')):
        return False

    base = url if url.endswith('/') else url + '/'
    refs_url = urljoin(base, 'info/refs')
    logging.info('GET %s', refs_url)

    query = {
        'service': 'git-upload-pack',
    }
    req_headers = {
        'User-Agent': 'code/0.1.0',
        'Git-Protocol': 'version=2',
    }

    response = None
    attempts = 5
    while attempts > 0:
        attempts -= 1
        try:
            response = await client.get(refs_url, params=query, headers=req_headers, follow_redirects=True)
        except Exception:
            traceback.print_exc()
            await asyncio.sleep(3)
        else:
            break

    if response is None:
        return False
    # Smart-HTTP git servers advertise refs with exactly this content type.
    return response.headers.get('Content-Type') == 'application/x-git-upload-pack-advertisement'

async def post_git_url(client: httpx.AsyncClient, url: str, swh_token: str):
    """Submit *url* to the Software Heritage "save code now" API.

    Retries connection errors up to 10 times (then gives up silently) and
    waits out HTTP 429 responses using the x-ratelimit-reset header.
    Raises ValueError for other non-200 responses or unexpected content
    types. Returns None.
    """
    # API shape: POST .../api/1/origin/save/git/url/<origin-url>/
    if not url.endswith('/'):
        url += '/'
    headers = {
        'Authorization': f'Bearer {swh_token}',
    }
    e = 0
    while True:
        try:
            r = await client.post(f'https://archive.softwareheritage.org/api/1/origin/save/git/url/{url}', headers=headers, follow_redirects=True)
        except Exception:
            e += 1
            if e > 10:
                return  # give up on this origin after 10 connection failures
            await asyncio.sleep(3)
            # BUG FIX: the original fell through here and dereferenced `r`,
            # which is unbound on the first failure (UnboundLocalError) or
            # stale on later ones. Retry instead.
            continue
        logging.info('X-RateLimit-Remaining: %s', r.headers.get('X-RateLimit-Remaining'))
        if r.status_code == 429:
            # Sleep until the advertised reset time, plus a safety margin.
            waiting_to = int(r.headers.get("x-ratelimit-reset", time.time())) - time.time() + 10
            logging.warning(f'Hitting rate limit. (sleep {waiting_to}s)')
            await asyncio.sleep(waiting_to)
            continue
        break
    if r.status_code != 200:
        if r.status_code == 429:  # defensive: 429 is normally consumed above
            logging.warning(f'Hitting rate limit: {r.headers}')
            raise ValueError(f'429 Too Many Requests: {r.text}')
        raise ValueError(f'Invalid status code: {r.status_code}')
    if r.headers.get('Content-Type') != 'application/json':
        raise ValueError(f'Invalid Content-Type: {r.headers.get("Content-Type")}')
    r_json = r.json()
    # A KeyError below means the API answered with an unexpected payload.
    save_task_status = r_json['save_task_status']
    save_request_status = r_json['save_request_status']
    request_url = r_json['request_url']
    # NOTE(review): the polling loop below is unreachable because of this
    # early return — presumably fire-and-forget to save rate-limit budget.
    # Remove the `return` to re-enable status polling; confirm intent.
    return
    while True:
        await asyncio.sleep(10)
        r = await client.get(request_url, headers=headers, follow_redirects=True)
        logging.info('X-RateLimit-Remaining: %s', r.headers.get('X-RateLimit-Remaining'))
        if r.status_code != 200:
            raise ValueError(f'Invalid status code: {r.status_code}')
        if r.headers.get('Content-Type') != 'application/json':
            raise ValueError(f'Invalid Content-Type: {r.headers.get("Content-Type")}')
        r_json = r.json()
        save_request_status = r_json['save_request_status']
        save_task_status = r_json['save_task_status']
        if save_task_status in ['succeeded', 'failed']:
            logging.info('save_task_status: %s %s', save_task_status, save_request_status)
            break
        logging.info('save_task_status: %s %s', save_task_status, save_request_status)

async def git_swh(git_url: str, swh_token: str):
    """Validate *git_url* and, if it is a live git repo, submit it to SWH.

    Returns the string 'Invalid git repository' when validation fails;
    otherwise None. Exceptions during submission are printed, not raised.
    """
    async with httpx.AsyncClient() as client:
        if not await validate_git_url(client=client, url=git_url):
            logging.warning('Invalid git repository')
            return 'Invalid git repository'

        try:
            await post_git_url(client=client, url=git_url, swh_token=swh_token)
        except Exception:
            traceback.print_exc()
    return
return

0 comments on commit 4ae4d21

Please sign in to comment.