From d89bf1cd5f4d0869feaf4334212d80bdc2bd5695 Mon Sep 17 00:00:00 2001
From: Kara Moraw
Date: Thu, 4 Jul 2024 16:08:47 +0100
Subject: [PATCH] amend docs for GitHub crawling scripts

---
 src/github/README.md              | 13 +++++++++++++
 src/github/crawl_contents.py      | 11 ++++++-----
 src/github/crawl_contributions.py | 11 ++++++-----
 src/github/crawl_engagement.py    | 11 ++++++-----
 src/github/crawl_issues.py        | 11 ++++++-----
 src/github/crawl_metadata.py      | 15 ++++++++-------
 6 files changed, 45 insertions(+), 27 deletions(-)
 create mode 100644 src/github/README.md

diff --git a/src/github/README.md b/src/github/README.md
new file mode 100644
index 0000000..204510a
--- /dev/null
+++ b/src/github/README.md
@@ -0,0 +1,13 @@
+# Mining GitHub repositories
+
+All scripts in this directory expect an argument `-f` pointing to a CSV file containing a column of GitHub repository IDs in the form `user_name/repo_name`, and an argument `-n` giving the name of that column.
+Run the scripts with `--help` for more detail.
+Additionally, all scripts use utilities provided in [`utils.py`](./utils.py), e.g. to instantiate the GitHub object and catch rate limit errors.
+
+Each script produces one or more CSV files with data mined for all repositories.
+They therefore run for a long time and are likely to hit API rate limits.
+Rate limit errors are caught by the scripts, which then wait until the rate limit has reset (hourly).
+You should use a valid GitHub API token as described in the root README.
+You will only be able to reach repositories readable with your token, i.e. all public repositories plus any private repositories your GitHub account has access to.
+
+Information on the collected data and the resulting schemas is documented in the wiki associated with this repository.
\ No newline at end of file
diff --git a/src/github/crawl_contents.py b/src/github/crawl_contents.py
index 8d4ec87..82ba947 100644
--- a/src/github/crawl_contents.py
+++ b/src/github/crawl_contents.py
@@ -116,7 +116,7 @@ def query_contents(row: pd.Series, id_key: str, g: Github):
     return row
 
 def crawl_repos(df, name, target_folder, verbose):
-    """For each repository, retrieve contributions, contents, readme info, stars,
+    """For each repository, retrieve contents and readme info.
 
     Args:
         df (pd.DataFrame): dataset containing GitHub repository identifiers
@@ -147,9 +147,9 @@ def crawl_repos(df, name, target_folder, verbose):
     end = time.time()
     print(f"Done - {end-start:.2f} seconds.")
 
-def main(path, name, verbose):
+def main(path, name, datadir, verbose):
     df = pd.read_csv(path)
-    target_folder = '../data'
+    target_folder = datadir
     crawl_repos(df, name, target_folder, verbose)
 
 if __name__ == "__main__":
@@ -157,10 +157,11 @@ def main(path, name, verbose):
     resource.setrlimit(resource.RLIMIT_AS, (2000000000, hard))
     parser = argparse.ArgumentParser(
         prog="crawl",
-        description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository."
+        description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository."
) parser.add_argument("-f", "--file", required=True, type=str, help="CSV file") parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID") + parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to") parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output") args = parser.parse_args() - main(args.file, args.name, args.verbose) + main(args.file, args.name, args.datadir, args.verbose) diff --git a/src/github/crawl_contributions.py b/src/github/crawl_contributions.py index 63b1d02..9e9b2c9 100644 --- a/src/github/crawl_contributions.py +++ b/src/github/crawl_contributions.py @@ -50,7 +50,7 @@ def query_contributions(row: pd.Series, id_key: str, g: Github): return row def crawl_repos(df, name, target_folder, verbose): - """For each repository, retrieve contributions, contents, readme info, stars, forks and issues. All stored as CSV. + """For each repository, retrieve contributions and store as CSV. Args: df (pd.DataFrame): dataset containing GitHub repository identifiers @@ -72,18 +72,19 @@ def crawl_repos(df, name, target_folder, verbose): end = time.time() print(f"Done - {end-start:.2f} seconds.") -def main(path, name, verbose): +def main(path, name, datadir, verbose): df = pd.read_csv(path) - target_folder = '../data' + target_folder = datadir crawl_repos(df, name, target_folder, verbose) if __name__ == "__main__": parser = argparse.ArgumentParser( prog="crawl", - description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository." + description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository." ) parser.add_argument("-f", "--file", required=True, type=str, help="CSV file") parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID") + parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to") parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output") args = parser.parse_args() - main(args.file, args.name, args.verbose) \ No newline at end of file + main(args.file, args.name, args.datadir, args.verbose) \ No newline at end of file diff --git a/src/github/crawl_engagement.py b/src/github/crawl_engagement.py index 6cbfb3d..952ac6d 100644 --- a/src/github/crawl_engagement.py +++ b/src/github/crawl_engagement.py @@ -88,7 +88,7 @@ def query_forks(row: pd.Series, id_key: str, g: Github): return row def crawl_repos(df, name, target_folder, verbose): - """For each repository, retrieve contributions, contents, readme info, stars, forks and issues. All stored as CSV. + """For each repository, retrieve stars and forks. Stored as separate CSV. Args: df (pd.DataFrame): dataset containing GitHub repository identifiers @@ -118,18 +118,19 @@ def crawl_repos(df, name, target_folder, verbose): end = time.time() print(f"Done - {end-start:.2f} seconds.") -def main(path, name, verbose): +def main(path, name, datadir, verbose): df = pd.read_csv(path) - target_folder = '../data' + target_folder = datadir crawl_repos(df, name, target_folder, verbose) if __name__ == "__main__": parser = argparse.ArgumentParser( prog="crawl", - description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository." 
+ description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository." ) parser.add_argument("-f", "--file", required=True, type=str, help="CSV file") parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID") + parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to") parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output") args = parser.parse_args() - main(args.file, args.name, args.verbose) \ No newline at end of file + main(args.file, args.name, args.datadir, args.verbose) \ No newline at end of file diff --git a/src/github/crawl_issues.py b/src/github/crawl_issues.py index 9e328c5..04f1d94 100644 --- a/src/github/crawl_issues.py +++ b/src/github/crawl_issues.py @@ -57,7 +57,7 @@ def query_issues(row: pd.Series, id_key: str, g: Github): return row def crawl_repos(df, name, target_folder, verbose): - """For each repository, retrieve contributions, contents, readme info, stars, forks and issues. All stored as CSV. + """For each repository, retrieve issues and store as CSV. Args: df (pd.DataFrame): dataset containing GitHub repository identifiers @@ -79,18 +79,19 @@ def crawl_repos(df, name, target_folder, verbose): end = time.time() print(f"Done - {end-start:.2f} seconds.") -def main(path, name, verbose): +def main(path, name, datadir, verbose): df = pd.read_csv(path) - target_folder = '../data' + target_folder = datadir crawl_repos(df, name, target_folder, verbose) if __name__ == "__main__": parser = argparse.ArgumentParser( prog="crawl", - description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository." + description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository." ) parser.add_argument("-f", "--file", required=True, type=str, help="CSV file") parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID") + parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to") parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output") args = parser.parse_args() - main(args.file, args.name, args.verbose) + main(args.file, args.name, args.datadir, args.verbose) diff --git a/src/github/crawl_metadata.py b/src/github/crawl_metadata.py index 089e0a2..78a62f6 100644 --- a/src/github/crawl_metadata.py +++ b/src/github/crawl_metadata.py @@ -9,7 +9,7 @@ @wrap_query def query_metadata(row: pd.Series, id_key: str, g: Github): - """Gets stargazers of a repository. + """Gets archival status, creation date, wiki existance and page existance for a repository. Args: row (pd.Series): contains column with repository ID @@ -20,7 +20,7 @@ def query_metadata(row: pd.Series, id_key: str, g: Github): pd.Series: added columns ['archived', 'created_at', 'has_wiki', 'has_pages'] """ data = {k: [] for k in ['archived', 'created_at', 'has_wiki', 'has_pages']} - repo = safe_load_repo(g, row[id_key], "query_stars") + repo = safe_load_repo(g, row[id_key], "query_metadata") if repo is None: return None for tries in range(2): @@ -40,7 +40,7 @@ def query_metadata(row: pd.Series, id_key: str, g: Github): return row def crawl_repos(df, name, target_folder, verbose): - """For each repository, retrieve contributions, contents, readme info, stars, forks and issues. All stored as CSV. 
+ """For each repository, retrieve metadata and store as CSV. Args: df (pd.DataFrame): dataset containing GitHub repository identifiers @@ -62,18 +62,19 @@ def crawl_repos(df, name, target_folder, verbose): end = time.time() print(f"Done - {end-start:.2f} seconds.") -def main(path, name, verbose): +def main(path, name, datadir, verbose): df = pd.read_csv(path) - target_folder = '../data' + target_folder = datadir crawl_repos(df, name, target_folder, verbose) if __name__ == "__main__": parser = argparse.ArgumentParser( prog="crawl", - description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository." + description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository." ) parser.add_argument("-f", "--file", required=True, type=str, help="CSV file") parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID") + parser.add_argument("--datadir", default="../../data/raw/github/", help="directory to write GitHub data to") parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output") args = parser.parse_args() - main(args.file, args.name, args.verbose) \ No newline at end of file + main(args.file, args.name, args.datadir, args.verbose) \ No newline at end of file