Showing 4 changed files with 234 additions and 0 deletions.
@@ -0,0 +1,5 @@
__pycache__
.venv/

index-v2.json
sourceCodes.txt
@@ -0,0 +1,118 @@
import asyncio
import logging
import requests
from pathlib import Path
import json
import tqdm
import argparse

from swh import git_swh

# Fetch the F-Droid index, collect each package's sourceCode URL into sourceCodes.txt,
# and (unless --list-only) submit the repositories to Software Heritage via git_swh.
json_url = 'https://f-droid.org/repo/index-v2.json'
json_cache = Path('index-v2.json')
success_repos = Path('success_repos.txt')


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--refresh', action='store_true', help='Redownload index-v2.json')
    parser.add_argument('--swh-token', type=str)
    parser.add_argument('--list-only', action='store_true', help='Only list repos to sourceCodes.txt, do not submit to SWH')
    return parser.parse_args()


async def main():
    args = parse_args()
    if not args.refresh and json_cache.exists():
        with json_cache.open('rb') as f:
            data = json.load(f)
    else:
        r = requests.get(json_url, stream=True)
        r.raise_for_status()
        data = b''
        for chunk in tqdm.tqdm(r.iter_content(chunk_size=1024 * 1024), unit='chunk', unit_scale=True):
            data += chunk
        with json_cache.open('wb') as f:
            f.write(data)
        print(len(data) // 1024 // 1024, 'MiB')  # size of the downloaded index
        data: dict = json.loads(data)
    packages = data.get('packages', {})

    # now = time.time()

    new_packages = {}
    for package_name in packages:
        package = packages[package_name]
        # if package['metadata']['added']/1000 > now - 60*60*24*30:
        #     # print(package_name)
        new_packages.update({package_name: package})

    print('Sorting new packages...1')
    # newest first, by package["metadata"]["added"]
    new_packages_names = list(new_packages)
    new_packages_added = [new_packages[package_name]['metadata']['added'] for package_name in new_packages_names]
    new_packages_added, new_packages_names = zip(*sorted(zip(new_packages_added, new_packages_names), reverse=True))
    print('Sorting new packages...2')
    sorted_new_packages = {}
    for package_name in new_packages_names:
        # print(new_packages[package_name]['metadata']['added'], package_name)
        sorted_new_packages.update({package_name: new_packages[package_name]})
    print(len(sorted_new_packages))

    sourceCodes = set()
    for package in sorted_new_packages.values():
        # rprint(package)
        added = package['metadata']['added']
        sourceCode = package['metadata'].get("sourceCode")
        if sourceCode:  # some are None
            print(added, sourceCode)
            sourceCodes.add(sourceCode)
            # time.sleep(0.1)

    with open("sourceCodes.txt", "w") as f:
        f.write("\n".join(sourceCodes) + "\n")

    print("sourceCodes.txt written")

    if args.list_only:
        return

    success_repos_text = success_repos.read_text() if success_repos.exists() else ''

    cors_list = []
    cors_codes = []
    cors_workers = 10

    logging.info('Starting...')

    # very bad implementation, but it's just a simple script :)

    for sourceCode in sourceCodes:
        if sourceCode in success_repos_text:
            logging.info('Skipping %s', sourceCode)
            continue
        cors_codes.append(sourceCode)

        assert args.swh_token, 'Please provide --swh-token'

        cor = git_swh(sourceCode, swh_token=args.swh_token)
        logging.info('Starting %s', sourceCode)
        await asyncio.sleep(0.5)
        cors_list.append(cor)
        if len(cors_list) >= cors_workers:
            # run the current batch, then record the submitted repos
            await asyncio.gather(*cors_list)

            success_repos_text = success_repos.read_text() if success_repos.exists() else ''

            with success_repos.open('a') as f:
                for cors_code in cors_codes:
                    if cors_code not in success_repos_text:
                        f.write(cors_code + '\n')
            cors_list = []
            cors_codes = []

    # flush a final batch smaller than cors_workers so its coroutines are still awaited
    if cors_list:
        await asyncio.gather(*cors_list)
        with success_repos.open('a') as f:
            for cors_code in cors_codes:
                if cors_code not in success_repos_text:
                    f.write(cors_code + '\n')


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    asyncio.run(main())
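
As a side note, the two-step timestamp sort in the script above (zipping 'added' values with package names and then rebuilding the dict) could be written as a single sorted() call. A minimal equivalent sketch, assuming the same {package_name: package} structure; unlike the zip-based version it does not break timestamp ties by package name:

# Sketch: order packages by metadata['added'], newest first, in one pass.
sorted_new_packages = dict(
    sorted(new_packages.items(), key=lambda kv: kv[1]['metadata']['added'], reverse=True)
)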
@@ -0,0 +1,3 @@
requests
tqdm
httpx
@@ -0,0 +1,108 @@
import asyncio
import logging
import time
import traceback
from typing import Optional
from urllib.parse import urljoin

import httpx


async def validate_git_url(client: httpx.AsyncClient, url: Optional[str]):
    if not isinstance(url, str):
        raise ValueError('Invalid URL')
    if not url.startswith('https://') and not url.startswith('http://'):
        return False

    if not url.endswith('/'):
        url += '/'

    params = {
        'service': 'git-upload-pack',
    }
    headers = {
        'User-Agent': 'code/0.1.0',
        'Git-Protocol': 'version=2',
    }
    refs_path = 'info/refs'
    refs_url = urljoin(url, refs_path)
    logging.info('GET %s', refs_url)
    r = None
    for _ in range(5):
        try:
            r = await client.get(refs_url, params=params, headers=headers, follow_redirects=True)
            break
        except Exception:
            traceback.print_exc()
            await asyncio.sleep(3)
    if r is None:
        return False
    if r.headers.get('Content-Type') != 'application/x-git-upload-pack-advertisement':
        # raise ValueError(f'Invalid Content-Type: {r.headers.get("Content-Type")}')
        return False

    return True


async def post_git_url(client: httpx.AsyncClient, url: str, swh_token: str):
    # POST https://archive.softwareheritage.org/api/1/origin/save/git/url/https://github.com/${GITHUB_REPOSITORY}/
    if not url.endswith('/'):
        url += '/'
    headers = {
        'Authorization': f'Bearer {swh_token}',
    }
    e = 0
    while True:
        try:
            r = await client.post(f'https://archive.softwareheritage.org/api/1/origin/save/git/url/{url}', headers=headers, follow_redirects=True)
        except Exception:
            e += 1
            if e > 10:
                return
            await asyncio.sleep(3)
            continue  # retry; r is not bound when the request itself failed
        logging.info('X-RateLimit-Remaining: %s', r.headers.get('X-RateLimit-Remaining'))
        if r.status_code == 429:
            waiting_to = int(r.headers.get("x-ratelimit-reset", time.time())) - time.time() + 10
            logging.warning(f'Hitting rate limit. (sleep {waiting_to}s)')
            await asyncio.sleep(waiting_to)
            continue
        break
    if r.status_code != 200:
        if r.status_code == 429:
            logging.warning(f'Hitting rate limit: {r.headers}')
            raise ValueError(f'429 Too Many Requests: {r.text}')
        raise ValueError(f'Invalid status code: {r.status_code}')
    if r.headers.get('Content-Type') != 'application/json':
        raise ValueError(f'Invalid Content-Type: {r.headers.get("Content-Type")}')
    r_json = r.json()
    save_task_status = r_json['save_task_status']
    save_request_status = r_json['save_request_status']
    request_url = r_json['request_url']
    return
    # NOTE: the polling loop below is unreachable because of the return above;
    # it would poll request_url until the save task succeeds or fails.
    while True:
        await asyncio.sleep(10)
        r = await client.get(request_url, headers=headers, follow_redirects=True)
        logging.info('X-RateLimit-Remaining: %s', r.headers.get('X-RateLimit-Remaining'))
        if r.status_code != 200:
            raise ValueError(f'Invalid status code: {r.status_code}')
        if r.headers.get('Content-Type') != 'application/json':
            raise ValueError(f'Invalid Content-Type: {r.headers.get("Content-Type")}')
        r_json = r.json()
        save_request_status = r_json['save_request_status']
        save_task_status = r_json['save_task_status']
        if save_task_status in ['succeeded', 'failed']:
            logging.info('save_task_status: %s %s', save_task_status, save_request_status)
            break
        logging.info('save_task_status: %s %s', save_task_status, save_request_status)


async def git_swh(git_url: str, swh_token: str):
    async with httpx.AsyncClient() as client:
        is_valid_repo = await validate_git_url(client=client, url=git_url)
        if not is_valid_repo:
            logging.warning('Invalid git repository')
            return 'Invalid git repository'

        try:
            await post_git_url(client=client, url=git_url, swh_token=swh_token)
        except Exception:
            traceback.print_exc()
            return
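
Assuming this last module is saved as swh.py (the main script imports it with "from swh import git_swh"), it can also be driven on its own; a minimal usage sketch with a placeholder repository URL and token:

# Sketch: validate a single repository URL and submit it to Software Heritage.
# The URL and token below are placeholders, not values from this commit.
import asyncio
import logging

from swh import git_swh

logging.basicConfig(level=logging.INFO)
asyncio.run(git_swh('https://example.org/some/repo.git', swh_token='YOUR_SWH_TOKEN'))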