From 02e8aa71fa6637c88609764db02645f54ef5f75d Mon Sep 17 00:00:00 2001 From: Rabenherz112 Date: Sat, 4 May 2024 14:51:31 +0200 Subject: [PATCH 1/5] Rewrite of add_github_metadata function - Created new function add_gh_metadata - Use github graphql api to get all github metadata - Get all metadata already via old function - Get latest release with tag and date - Get commit history with commit count (only for the current month) - Created new function gh_metadata_cleanup - Clean up old commit history which is older than 12 months This code is not tested yet, tbd. --- hecat/processors/github_metadata.py | 152 ++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/hecat/processors/github_metadata.py b/hecat/processors/github_metadata.py index a13d152..b269463 100644 --- a/hecat/processors/github_metadata.py +++ b/hecat/processors/github_metadata.py @@ -32,7 +32,9 @@ import sys import logging +import requests import re +import json import os import time from datetime import datetime @@ -49,6 +51,12 @@ class DummyGhMetadata(dict): def __init__(self): self.stargazers_count = 0 self.archived = False + self.current_release = { + "tag": None, + "published_at": None + } + self.last_commit_date = None + self.commit_history = {} def get_gh_metadata(step, github_url, g, errors): """get github project metadata from Github API""" @@ -114,3 +122,147 @@ def add_github_metadata(step): logging.error("There were errors during processing") print('\n'.join(errors)) sys.exit(1) + +def add_gh_metadata(step): + """gather github project data and add it to source YAML files""" + GITHUB_TOKEN = os.environ['GITHUB_TOKEN'] + errors = [] + github_projects = [] + # Load software data + software_list = load_yaml_data(step['module_options']['source_directory'] + '/software') + logging.info('updating software data from Github API') + # Check if the source code URL is a GitHub repository and add it to the queue to be processed + for software in software_list: + if 
'source_code_url' in software: + if re.search(r'^https://github.com/[\w\.\-]+/[\w\.\-]+/?$', software['source_code_url']): + # Check if we only want to update missing metadata or all metadata + if 'gh_metadata_only_missing' in step['module_options'].keys() and step['module_options']['gh_metadata_only_missing']: + if ('stargazers_count' not in software) or ('updated_at' not in software) or ('archived' not in software) or ('current_release' not in software) or ('last_commit_date' not in software) or ('commit_history' not in software): + github_projects.append(software) + else: + logging.debug('all metadata already present, skipping %s', software['source_code_url']) + # If key is not present, update all metadata + else: + github_projects.append(software) + # TODO: Why do we need to check the website_url? We can exspect that the source_code_url is always present and the website_url is optional and even if changed it would not point to a github repository + elif 'website_url' in software: + if re.search(r'^https://github.com/[\w\.\-]+/[\w\.\-]+/?$' , software['website_url']): + # Check if we only want to update missing metadata or all metadata + if 'gh_metadata_only_missing' in step['module_options'].keys() and step['module_options']['gh_metadata_only_missing']: + if ('stargazers_count' not in software) or ('updated_at' not in software) or ('archived' not in software) or ('current_release' not in software) or ('last_commit_date' not in software) or ('commit_history' not in software): + github_projects.append(software) + else: + logging.debug('all metadata already present, skipping %s', software['website_url']) + # If key is not present, update all metadata + else: + github_projects.append(software) + # Get the metadata for the GitHub repositories + GITHUB_GRAPHQL_API = "https://api.github.com/graphql" + headers = {"Authorization": f"Bearer {GITHUB_TOKEN}"} + + # Get the URLs of the queued repositories + github_urls = [software['source_code_url'] for software in 
github_projects] + repos = [re.sub('https://github.com/', '', url) for url in github_urls] + projectindex = 0 + + # Split the list of repositories into batches of 100 + n = 100 + batches = [repos[i * n:(i + 1) * n] for i in range((len(repos) + n - 1) // n )] + + + for batch in batches: + repos_query = " ".join([f"repo:{repo}" for repo in batch]) + + # Get the current year and month + now = datetime.now() + year_month = now.strftime("%Y-%m") + + query = f""" + {{ + search( + type: REPOSITORY + query: "{repos_query}" + first: 100 + ) {{ + repos: edges {{ + repo: node {{ + ... on Repository {{ + name + stargazerCount + archived + releases(last: 1) {{ + edges {{ + node {{ + tagName + publishedAt + }} + }} + }} + defaultBranchRef {{ + target {{ + ... on Commit {{ + committedDate + history(since: "{year_month}-01T00:00:00", until: "{year_month}-31T23:59:59") {{ + totalCount + }} + }} + }} + }} + }} + }} + }} + }} + }} + """ + try: + response = requests.post(GITHUB_GRAPHQL_API, json={"query": query}, headers=headers) + data = response.json() + except Exception as e: + errors.append(str(e)) + + for edge in data["data"]["search"]["repos"]: + repo = edge["repo"] + software = github_projects[projectindex] + software["stargazer_count"] = repo["stargazerCount"] + software["archived"] = repo["archived"] + if repo["releases"]["edges"]: + software["current_release"] = { + "tag": repo["releases"]["edges"][0]["node"]["tagName"], + "published_at": repo["releases"]["edges"][0]["node"]["publishedAt"] + } + else: + software["current_release"] = { + "tag": None, + "published_at": None + } + software["last_commit_date"] = repo["defaultBranchRef"]["target"]["committedDate"] + if year_month in software["commit_history"]: + software["commit_history"][year_month] = repo["defaultBranchRef"]["target"]["history"]["totalCount"] + else: + software["commit_history"].update({ + year_month: repo["defaultBranchRef"]["target"]["history"]["totalCount"] + }) + projectindex += 1 + 
write_software_yaml(step, software) + + if errors: + logging.error("There were errors during processing") + print('\n'.join(errors)) + sys.exit(1) + +def gh_metadata_cleanup(step): + """remove github metadata from source YAML files""" + software_list = load_yaml_data(step['module_options']['source_directory'] + '/software') + logging.info('cleaning up old github metadata from software data') + # Get the current year and month + now = datetime.now() + year_month = now.strftime("%Y-%m") + # Check if commit_history exists and remove any entries that are older the 12 months + for software in software_list: + if 'commit_history' in software: + for key in list(software['commit_history'].keys()): + if key < year_month: + del software['commit_history'][key] + logging.debug('removing commit history %s for %s', key, software['name']) + write_software_yaml(step, software) + \ No newline at end of file From 54067b4adcc45557de6ff3ad5834435396bdfdf3 Mon Sep 17 00:00:00 2001 From: Rabenherz112 Date: Sat, 4 May 2024 15:55:34 +0200 Subject: [PATCH 2/5] Bug fixes, removed unused code - Removed old `get_gh_metadata` function and renamed new function to the same name - Set GitHub graphql API batch amount to 60 to avoid API errors - Fixed issue that `isArchived` field did not exist in the response - Added simple error handling for the case that the github metadata could not be fetched - Fixed duplicated values for `stargazers_count` and `updated_at` - Fixed date syntax for `current_release/published_at` and `commit_history` --- hecat/processors/github_metadata.py | 80 ++++++----------------------- 1 file changed, 15 insertions(+), 65 deletions(-) diff --git a/hecat/processors/github_metadata.py b/hecat/processors/github_metadata.py index b269463..e302a0a 100644 --- a/hecat/processors/github_metadata.py +++ b/hecat/processors/github_metadata.py @@ -58,23 +58,6 @@ def __init__(self): self.last_commit_date = None self.commit_history = {} -def get_gh_metadata(step, github_url, g, errors): 
- """get github project metadata from Github API""" - if 'sleep_time' in step['module_options']: - time.sleep(step['module_options']['sleep_time']) - project = re.sub('https://github.com/', '', github_url) - project = re.sub('/$', '', project) - try: - gh_metadata = g.get_repo(project) - latest_commit_date = gh_metadata.get_commits()[0].commit.committer.date - except github.GithubException as github_error: - error_msg = '{} : {}'.format(github_url, github_error) - logging.error(error_msg) - errors.append(error_msg) - gh_metadata = DummyGhMetadata() - latest_commit_date = datetime.strptime('1970-01-01', '%Y-%m-%d') - return gh_metadata, latest_commit_date - def write_software_yaml(step, software): """write software data to yaml file""" dest_file = '{}/{}'.format( @@ -85,45 +68,6 @@ def write_software_yaml(step, software): yaml.dump(software, yaml_file) def add_github_metadata(step): - """gather github project data and add it to source YAML files""" - GITHUB_TOKEN = os.environ['GITHUB_TOKEN'] - g = github.Github(GITHUB_TOKEN) - errors = [] - software_list = load_yaml_data(step['module_options']['source_directory'] + '/software') - logging.info('updating software data from Github API') - for software in software_list: - github_url = '' - if 'source_code_url' in software: - if re.search(r'^https://github.com/[\w\.\-]+/[\w\.\-]+/?$', software['source_code_url']): - github_url = software['source_code_url'] - elif 'website_url' in software: - if re.search(r'^https://github.com/[\w\.\-]+/[\w\.\-]+/?$', software['website_url']): - github_url = software['website_url'] - if github_url: - logging.debug('%s is a github project URL', github_url) - if 'gh_metadata_only_missing' in step['module_options'].keys() and step['module_options']['gh_metadata_only_missing']: - if ('stargazers_count' not in software) or ('updated_at' not in software) or ('archived' not in software): - logging.info('Missing metadata for %s, gathering it from Github API', software['name']) - gh_metadata, 
latest_commit_date = get_gh_metadata(step, github_url, g, errors) - software['stargazers_count'] = int(gh_metadata.stargazers_count) - software['updated_at'] = datetime.strftime(latest_commit_date, "%Y-%m-%d") - software['archived'] = bool(gh_metadata.archived) - write_software_yaml(step, software) - else: - logging.debug('all metadata already present, skipping %s', github_url) - else: - logging.info('Gathering metadata for %s from Github API', github_url) - gh_metadata, latest_commit_date = get_gh_metadata(step, github_url, g, errors) - software['stargazers_count'] = gh_metadata.stargazers_count - software['updated_at'] = datetime.strftime(latest_commit_date, "%Y-%m-%d") - software['archived'] = gh_metadata.archived - write_software_yaml(step, software) - if errors: - logging.error("There were errors during processing") - print('\n'.join(errors)) - sys.exit(1) - -def add_gh_metadata(step): """gather github project data and add it to source YAML files""" GITHUB_TOKEN = os.environ['GITHUB_TOKEN'] errors = [] @@ -165,8 +109,9 @@ def add_gh_metadata(step): repos = [re.sub('https://github.com/', '', url) for url in github_urls] projectindex = 0 - # Split the list of repositories into batches of 100 - n = 100 + # Split the list of repositories into batches of 60 + # TODO: While more should be supported, I don't get it to work with 75 or more, as the API returns an error + n = 60 batches = [repos[i * n:(i + 1) * n] for i in range((len(repos) + n - 1) // n )] @@ -182,14 +127,14 @@ def add_gh_metadata(step): search( type: REPOSITORY query: "{repos_query}" - first: 100 + first: 60 ) {{ repos: edges {{ repo: node {{ ... 
on Repository {{ name stargazerCount - archived + isArchived releases(last: 1) {{ edges {{ node {{ tagName publishedAt @@ -217,25 +162,31 @@ def add_gh_metadata(step): try: response = requests.post(GITHUB_GRAPHQL_API, json={"query": query}, headers=headers) data = response.json() + if 'errors' in data: + for error in data['errors']: + errors.append(error['message']) + sys.exit(1) except Exception as e: errors.append(str(e)) for edge in data["data"]["search"]["repos"]: repo = edge["repo"] software = github_projects[projectindex] - software["stargazer_count"] = repo["stargazerCount"] - software["archived"] = repo["archived"] + software["stargazers_count"] = repo["stargazerCount"] + software["archived"] = repo["isArchived"] if repo["releases"]["edges"]: software["current_release"] = { "tag": repo["releases"]["edges"][0]["node"]["tagName"], - "published_at": repo["releases"]["edges"][0]["node"]["publishedAt"] + "published_at": datetime.strptime(repo["releases"]["edges"][0]["node"]["publishedAt"], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d') } else: software["current_release"] = { "tag": None, "published_at": None } - software["last_commit_date"] = repo["defaultBranchRef"]["target"]["committedDate"] + software["updated_at"] = datetime.strptime(repo["defaultBranchRef"]["target"]["committedDate"], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d') + if 'commit_history' not in software: + software['commit_history'] = {} if year_month in software["commit_history"]: software["commit_history"][year_month] = repo["defaultBranchRef"]["target"]["history"]["totalCount"] else: @@ -265,4 +216,3 @@ def gh_metadata_cleanup(step): del software['commit_history'][key] logging.debug('removing commit history %s for %s', key, software['name']) write_software_yaml(step, software) - \ No newline at end of file From d46ed73bf59d2cd8a1d410e8f8bf4fae4fa75878 Mon Sep 17 00:00:00 2001 From: Rabenherz112 Date: Sat, 4 May 2024 16:18:26 +0200 Subject: [PATCH 3/5] More fixes - Re-implement sleep time for GitHub API to avoid rate 
limit - Fix gh_metadata_cleanup task --- hecat/processors/github_metadata.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hecat/processors/github_metadata.py b/hecat/processors/github_metadata.py index e302a0a..49a9f7d 100644 --- a/hecat/processors/github_metadata.py +++ b/hecat/processors/github_metadata.py @@ -195,6 +195,10 @@ def add_github_metadata(step): }) projectindex += 1 write_software_yaml(step, software) + + # Sleep for the specified amount of time before the next request + if 'sleep_time' in step['module_options']: + time.sleep(step['module_options']['sleep_time']) if errors: logging.error("There were errors during processing") @@ -207,12 +211,12 @@ def gh_metadata_cleanup(step): logging.info('cleaning up old github metadata from software data') # Get the current year and month now = datetime.now() - year_month = now.strftime("%Y-%m") + year_month_12_months_ago = (now.replace(year = now.year - 1)).strftime("%Y-%m") # Check if commit_history exists and remove any entries that are older the 12 months for software in software_list: if 'commit_history' in software: for key in list(software['commit_history'].keys()): - if key < year_month: + if key < year_month_12_months_ago: del software['commit_history'][key] logging.debug('removing commit history %s for %s', key, software['name']) write_software_yaml(step, software) From f3f64c6eee9a0f4d1339712e5a368623e223d9f2 Mon Sep 17 00:00:00 2001 From: Rabenherz112 Date: Sat, 4 May 2024 17:07:45 +0200 Subject: [PATCH 4/5] Fixed tag bug to always show the latest version --- hecat/processors/github_metadata.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/hecat/processors/github_metadata.py b/hecat/processors/github_metadata.py index 49a9f7d..f6d2b4d 100644 --- a/hecat/processors/github_metadata.py +++ b/hecat/processors/github_metadata.py @@ -135,7 +135,7 @@ def add_github_metadata(step): name stargazerCount isArchived - releases(last: 1) {{ + releases(first: 
1) {{ edges {{ node {{ tagName @@ -174,16 +174,11 @@ def add_github_metadata(step): software = github_projects[projectindex] software["stargazers_count"] = repo["stargazerCount"] software["archived"] = repo["isArchived"] - if repo["releases"]["edges"]: + if repo["releases"]["edges"] and len(repo["releases"]["edges"]) > 0: software["current_release"] = { "tag": repo["releases"]["edges"][0]["node"]["tagName"], "published_at": datetime.strptime(repo["releases"]["edges"][0]["node"]["publishedAt"], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d') } - else: - software["current_release"] = { - "tag": None, - "published_at": None - } software["updated_at"] = datetime.strptime(repo["defaultBranchRef"]["target"]["committedDate"], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d') if 'commit_history' not in software: software['commit_history'] = {} From 08c64ede3b3899d00757806af31559df265da8ce Mon Sep 17 00:00:00 2001 From: Rabenherz112 Date: Tue, 7 May 2024 21:07:24 +0200 Subject: [PATCH 5/5] Fixed wrong affiliation in the metadata to files These changes have been tested by running a full metadata processing on the awesome-selfhosted-data repository and checking the metadata files for the correct affiliation. 
Bug Fixes: - Metadata is now not being assigned via an index but instead by matching the `url` field in the return of GraphQL query to the `source_code_url` Logging: - Added more information about the status of the metadata processing (as this can now take a while to process) - Added more debug information for Ratelimit Information from GitHub API Defaults: - Added a default wait-time between API requests to GitHub to avoid hitting the rate limit (default is now 60 seconds, can be configured in the `hecat.yml` file) - Added a default batch-size for the metadata processing (default is now 30, can be configured in the `hecat.yml` file) Others: - Added new function `extract_repo_name` to extract the repo name from the `source_code_url` - Added try-catch block to catch exceptions when writing metadata to a file - Updated documentation to reflect new batch_size configuration option and new API restrictions from GitHub Co-authored-by: Le Duc Lischetzke --- hecat/processors/github_metadata.py | 128 +++++++++++++++++++--------- 1 file changed, 87 insertions(+), 41 deletions(-) diff --git a/hecat/processors/github_metadata.py b/hecat/processors/github_metadata.py index f6d2b4d..9c91a90 100644 --- a/hecat/processors/github_metadata.py +++ b/hecat/processors/github_metadata.py @@ -8,7 +8,8 @@ module_options: source_directory: tests/awesome-selfhosted-data # directory containing YAML data and software subdirectory gh_metadata_only_missing: False # (default False) only gather metadata for software entries in which one of stargazers_count,updated_at, archived is missing - sleep_time: 3.7 # (default 0) sleep for this amount of time before each request to Github API + sleep_time: 3.7 # (default 60) sleep for this amount of time before each request to Github API + batch_size: 10 # (default 30) number of repositories to include in each batch request to Github API source_directory: path to directory where data files reside. 
Directory structure: ├── software @@ -26,8 +27,8 @@ env: GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} -When using GITHUB_TOKEN, the API rate limit is 1,000 requests per hour per repository [[1]](https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28#rate-limits-for-requests-from-github-actions) -Not that each call to get_gh_metadata() results in 2 API requests (on for the repo/stargazers count, one for the latest commit date) +When using GITHUB_TOKEN, the API rate limit is 1,000 requests per hour per repository [[1]](https://docs.github.com/en/graphql/overview/rate-limits-and-node-limits-for-the-graphql-api#primary-rate-limit) +The call to get_gh_metadata() performs an API request for every batch with `batch_size` repositories (which returns all metadata in one request). """ import sys @@ -39,7 +40,6 @@ import time from datetime import datetime import ruamel.yaml -import github from ..utils import load_yaml_data, to_kebab_case yaml = ruamel.yaml.YAML(typ='rt') @@ -67,6 +67,10 @@ def write_software_yaml(step, software): with open(dest_file, 'w+', encoding="utf-8") as yaml_file: yaml.dump(software, yaml_file) +def extract_repo_name(url): + re_result = re.search(r'^https?:\/\/github\.com\/[^\/]+\/([^\/]+)\/?$', url) + return re_result.group(1) if re_result else None + def add_github_metadata(step): """gather github project data and add it to source YAML files""" GITHUB_TOKEN = os.environ['GITHUB_TOKEN'] @@ -107,71 +111,106 @@ def add_github_metadata(step): # Get the URLs of the queued repositories github_urls = [software['source_code_url'] for software in github_projects] repos = [re.sub('https://github.com/', '', url) for url in github_urls] - projectindex = 0 - - # Split the list of repositories into batches of 60 - # TODO: While more should be supported, I don't get it to work with 75 or more, as the API returns an error - n = 60 - batches = [repos[i * n:(i + 1) * n] for i in range((len(repos) + n - 1) // n )] + # TODO: While more 
should be supported, I don't get it to work with 50 or more, as the API returns an error + # This limit was tested with a personal access token, batch_size = 30, timeout = 60 (new default if not provided in a config); Worked fine for the full repo (enable debug messages to see usage of API Rate limit stats) + # Split repo list into batches of batch_size + if 'batch_size' in step['module_options']: + batch_size = step['module_options']['batch_size'] + else: + # Default batch_size of repos if not specified in config file + batch_size = 30 + batches = [repos[i * batch_size:(i + 1) * batch_size] for i in range((len(repos) + batch_size - 1) // batch_size )] + counter = 0 for batch in batches: + counter += 1 + logging.info(f"Processing batch {counter}/{len(batches)}") + repos_query = " ".join([f"repo:{repo}" for repo in batch]) # Get the current year and month now = datetime.now() year_month = now.strftime("%Y-%m") + # TODO: More accurate lookup would be for last month, but would mean we lag 1 month behind + #dateMonth = (now.month - 2) % 12 + 1 + #dateYear = now.year - 1 if dateMonth == 12 else now.year + #year_last_month = datetime.date(dateYear, dateMonth, 1).strftime("%Y-%m") query = f""" - {{ - search( - type: REPOSITORY - query: "{repos_query}" - first: 60 - ) {{ - repos: edges {{ - repo: node {{ - ... on Repository {{ - name - stargazerCount - isArchived - releases(first: 1) {{ - edges {{ - node {{ +{{ + search( + type: REPOSITORY + query: "{repos_query}" + first: {batch_size} + ) {{ + repos: edges {{ + repo: node {{ + ... on Repository {{ + url + stargazerCount + isArchived + releases(first: 1) {{ + edges {{ + node {{ tagName publishedAt - }} }} - }} - defaultBranchRef {{ - target {{ - ... on Commit {{ + }} + }} + defaultBranchRef {{ + target {{ + ... 
on Commit {{ committedDate history(since: "{year_month}-01T00:00:00", until: "{year_month}-31T23:59:59") {{ - totalCount + totalCount }} - }} }} - }} }} - }} }} - }} }} + }} + }} + }} +}} """ + res_header = None try: response = requests.post(GITHUB_GRAPHQL_API, json={"query": query}, headers=headers) + res_header = response.headers + # Check status code + if response.status_code != 200: + # print body + errors.append(f'Response code of POST request (GraphQL): {response.status_code}') data = response.json() if 'errors' in data: for error in data['errors']: errors.append(error['message']) - sys.exit(1) + sys.exit(4) except Exception as e: errors.append(str(e)) - + + # casefold header names + if res_header: + res_header = {k.casefold(): v for k, v in res_header.items()} + rl_arr = [] + rl_arr.append(res_header['x-ratelimit-limit']) if 'x-ratelimit-limit' in res_header else rl_arr.append('-1') + rl_arr.append(res_header['x-ratelimit-remaining']) if 'x-ratelimit-remaining' in res_header else rl_arr.append('-1') + rl_arr.append(res_header['x-ratelimit-used']) if 'x-ratelimit-used' in res_header else rl_arr.append('-1') + rl_arr.append(res_header['x-ratelimit-reset']) if 'x-ratelimit-reset' in res_header else rl_arr.append('-1') + logging.debug(f"Rate limit (Limit/Remain/Used/Reset): {'/'.join(rl_arr)}") + for edge in data["data"]["search"]["repos"]: repo = edge["repo"] - software = github_projects[projectindex] + software = None + for project in github_projects: + if extract_repo_name(repo["url"]).casefold() == extract_repo_name(project['source_code_url']).casefold(): + software = project + break + if not software: + logging.error('could not find software entry for %s', repo["url"]) + continue + software["stargazers_count"] = repo["stargazerCount"] software["archived"] = repo["isArchived"] if repo["releases"]["edges"] and len(repo["releases"]["edges"]) > 0: @@ -188,17 +227,24 @@ def add_github_metadata(step): software["commit_history"].update({ year_month: 
repo["defaultBranchRef"]["target"]["history"]["totalCount"] }) - projectindex += 1 - write_software_yaml(step, software) + try: + write_software_yaml(step, software) + except Exception as e: + errors.append(str(e)) + logging.error('could not write software entry for %s', repo["url"]) + continue # Sleep for the specified amount of time before the next request if 'sleep_time' in step['module_options']: time.sleep(step['module_options']['sleep_time']) + else: + # Default time between Github GraphQL API requests if not specified in config file + time.sleep(60) if errors: logging.error("There were errors during processing") print('\n'.join(errors)) - sys.exit(1) + sys.exit(2) def gh_metadata_cleanup(step): """remove github metadata from source YAML files"""