amend docs for GitHub crawling scripts
karacolada committed Jul 4, 2024
1 parent 0e82d99 commit d89bf1c
Showing 6 changed files with 45 additions and 27 deletions.
13 changes: 13 additions & 0 deletions src/github/README.md
@@ -0,0 +1,13 @@
# Mining GitHub repositories

All scripts in this directory expect an argument `-f` pointing to a CSV file containing a column of GitHub repository IDs, i.e. `user_name/repo_name`, and an argument `-n` indicating the name of that column.
Run the scripts with `--help` for more detail.
Additionally, all scripts use utilities provided in [`utils.py`](./utils.py), e.g. to instantiate the GitHub object and catch rate limit errors.
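For example, assuming an input file `repos.csv` with an ID column named `github_repo_id` (both names are illustrative), a crawl could be started with `python3 crawl_metadata.py -f repos.csv -n github_repo_id`; the optional `--datadir` argument controls where the output CSV files are written.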

Each script produces one or more CSV files with data mined for all repositories.
The scripts therefore run for a long time and are likely to hit API rate limits.
Rate limit errors are caught by the scripts, which then wait until the rate limit resets (hourly).
Use a valid GitHub API token as described in the root README.
You will only be able to reach repositories readable with your token, i.e. any public repository and any repository your GitHub account has access to.
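The rate-limit handling might look roughly like the sketch below; this is illustrative only, as the actual logic lives in [`utils.py`](./utils.py) (e.g. behind its `wrap_query` decorator) and may differ.

```python
# Illustrative sketch only: utils.py may implement this differently.
import os
import time

from github import Github
from github.GithubException import RateLimitExceededException

def wait_for_reset(g: Github) -> None:
    """Sleep until the API rate limit resets (GitHub resets it hourly)."""
    seconds_left = g.rate_limiting_resettime - time.time()
    time.sleep(max(seconds_left, 0) + 5)  # small buffer past the reset time

def run_query(query, g: Github, *args, **kwargs):
    """Run one query function, waiting out rate limits instead of failing."""
    while True:
        try:
            return query(g, *args, **kwargs)
        except RateLimitExceededException:
            wait_for_reset(g)

# Instantiate the GitHub object from a token, as described in the root README.
g = Github(os.environ["GITHUB_TOKEN"])
```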

Information on the collected data and resulting schemas is listed in the wiki associated with this repository.
11 changes: 6 additions & 5 deletions src/github/crawl_contents.py
@@ -116,7 +116,7 @@ def query_contents(row: pd.Series, id_key: str, g: Github):
return row

def crawl_repos(df, name, target_folder, verbose):
"""For each repository, retrieve contributions, contents, readme info, stars,
"""For each repository, retrieve contents and readme info.
Args:
df (pd.DataFrame): dataset containing GitHub repository identifiers
@@ -147,20 +147,21 @@ def crawl_repos(df, name, target_folder, verbose):
end = time.time()
print(f"Done - {end-start:.2f} seconds.")

def main(path, name, verbose):
def main(path, name, datadir, verbose):
df = pd.read_csv(path)
target_folder = '../data'
target_folder = datadir
crawl_repos(df, name, target_folder, verbose)

if __name__ == "__main__":
soft, hard = resource.getrlimit(resource.RLIMIT_AS)
resource.setrlimit(resource.RLIMIT_AS, (2000000000, hard))
parser = argparse.ArgumentParser(
prog="crawl",
description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository."
description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository."
)
parser.add_argument("-f", "--file", required=True, type=str, help="CSV file")
parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID")
parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.file, args.name, args.verbose)
main(args.file, args.name, args.datadir, args.verbose)
11 changes: 6 additions & 5 deletions src/github/crawl_contributions.py
@@ -50,7 +50,7 @@ def query_contributions(row: pd.Series, id_key: str, g: Github):
return row

def crawl_repos(df, name, target_folder, verbose):
"""For each repository, retrieve contributions, contents, readme info, stars, forks and issues. All stored as CSV.
"""For each repository, retrieve contributions and store as CSV.
Args:
df (pd.DataFrame): dataset containing GitHub repository identifiers
@@ -72,18 +72,19 @@ def crawl_repos(df, name, target_folder, verbose):
end = time.time()
print(f"Done - {end-start:.2f} seconds.")

def main(path, name, verbose):
def main(path, name, datadir, verbose):
df = pd.read_csv(path)
target_folder = '../data'
target_folder = datadir
crawl_repos(df, name, target_folder, verbose)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="crawl",
description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository."
description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository."
)
parser.add_argument("-f", "--file", required=True, type=str, help="CSV file")
parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID")
parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.file, args.name, args.verbose)
main(args.file, args.name, args.datadir, args.verbose)
11 changes: 6 additions & 5 deletions src/github/crawl_engagement.py
@@ -88,7 +88,7 @@ def query_forks(row: pd.Series, id_key: str, g: Github):
return row

def crawl_repos(df, name, target_folder, verbose):
"""For each repository, retrieve contributions, contents, readme info, stars, forks and issues. All stored as CSV.
"""For each repository, retrieve stars and forks. Stored as separate CSV.
Args:
df (pd.DataFrame): dataset containing GitHub repository identifiers
@@ -118,18 +118,19 @@ def crawl_repos(df, name, target_folder, verbose):
end = time.time()
print(f"Done - {end-start:.2f} seconds.")

def main(path, name, verbose):
def main(path, name, datadir, verbose):
df = pd.read_csv(path)
target_folder = '../data'
target_folder = datadir
crawl_repos(df, name, target_folder, verbose)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="crawl",
description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository."
description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository."
)
parser.add_argument("-f", "--file", required=True, type=str, help="CSV file")
parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID")
parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.file, args.name, args.verbose)
main(args.file, args.name, args.datadir, args.verbose)
11 changes: 6 additions & 5 deletions src/github/crawl_issues.py
@@ -57,7 +57,7 @@ def query_issues(row: pd.Series, id_key: str, g: Github):
return row

def crawl_repos(df, name, target_folder, verbose):
"""For each repository, retrieve contributions, contents, readme info, stars, forks and issues. All stored as CSV.
"""For each repository, retrieve issues and store as CSV.
Args:
df (pd.DataFrame): dataset containing GitHub repository identifiers
@@ -79,18 +79,19 @@ def crawl_repos(df, name, target_folder, verbose):
end = time.time()
print(f"Done - {end-start:.2f} seconds.")

def main(path, name, verbose):
def main(path, name, datadir, verbose):
df = pd.read_csv(path)
target_folder = '../data'
target_folder = datadir
crawl_repos(df, name, target_folder, verbose)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="crawl",
description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository."
description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository."
)
parser.add_argument("-f", "--file", required=True, type=str, help="CSV file")
parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID")
parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.file, args.name, args.verbose)
main(args.file, args.name, args.datadir, args.verbose)
15 changes: 8 additions & 7 deletions src/github/crawl_metadata.py
@@ -9,7 +9,7 @@

@wrap_query
def query_metadata(row: pd.Series, id_key: str, g: Github):
"""Gets stargazers of a repository.
"""Gets archival status, creation date, wiki existance and page existance for a repository.
Args:
row (pd.Series): contains column with repository ID
@@ -20,7 +20,7 @@ def query_metadata(row: pd.Series, id_key: str, g: Github):
pd.Series: added columns ['archived', 'created_at', 'has_wiki', 'has_pages']
"""
data = {k: [] for k in ['archived', 'created_at', 'has_wiki', 'has_pages']}
repo = safe_load_repo(g, row[id_key], "query_stars")
repo = safe_load_repo(g, row[id_key], "query_metadata")
if repo is None:
return None
for tries in range(2):
@@ -40,7 +40,7 @@ def query_metadata(row: pd.Series, id_key: str, g: Github):
return row

def crawl_repos(df, name, target_folder, verbose):
"""For each repository, retrieve contributions, contents, readme info, stars, forks and issues. All stored as CSV.
"""For each repository, retrieve metadata and store as CSV.
Args:
df (pd.DataFrame): dataset containing GitHub repository identifiers
@@ -62,18 +62,19 @@ def crawl_repos(df, name, target_folder, verbose):
end = time.time()
print(f"Done - {end-start:.2f} seconds.")

def main(path, name, verbose):
def main(path, name, datadir, verbose):
df = pd.read_csv(path)
target_folder = '../data'
target_folder = datadir
crawl_repos(df, name, target_folder, verbose)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="crawl",
description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository."
description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository."
)
parser.add_argument("-f", "--file", required=True, type=str, help="CSV file")
parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID")
parser.add_argument("--datadir", default="../../data/raw/github/", help="directory to write GitHub data to")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.file, args.name, args.verbose)
main(args.file, args.name, args.datadir, args.verbose)
