amend docs for GitHub crawling scripts
karacolada committed Jul 4, 2024
1 parent 0e82d99 commit d89bf1c
Showing 6 changed files with 45 additions and 27 deletions.
13 changes: 13 additions & 0 deletions src/github/README.md
@@ -0,0 +1,13 @@
# Mining GitHub repositories

All scripts in this directory expect an argument `-f` pointing to a CSV file containing a column of GitHub repository IDs, i.e. `user_name/repo_name`, and an argument `-n` indicating the name of that column.
Run the scripts with `--help` for more detail.
Additionally, all scripts use utilities provided in [`utils.py`](./utils.py), e.g. to instantiate the GitHub object and catch rate limit errors.
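For example, assuming an input file `repos.csv` with an ID column named `github_repo_id` (both names are illustrative), a crawl could be started with `python3 crawl_metadata.py -f repos.csv -n github_repo_id`; the optional `--datadir` argument controls where the output CSV files are written.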

Each script produces one or more CSV files with data mined for all repositories.
The scripts therefore run for a long time and are likely to hit API rate limits.
Rate limit errors are caught by the scripts, which then wait until the rate limit resets (hourly).
Use a valid GitHub API token as described in the root README.
You will only be able to reach repositories readable with your token, i.e. any public repository and any repository your GitHub account has access to.
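The rate-limit handling might look roughly like the sketch below; this is illustrative only, as the actual logic lives in [`utils.py`](./utils.py) (e.g. behind its `wrap_query` decorator) and may differ.

```python
# Illustrative sketch only: utils.py may implement this differently.
import os
import time

from github import Github
from github.GithubException import RateLimitExceededException

def wait_for_reset(g: Github) -> None:
    """Sleep until the API rate limit resets (GitHub resets it hourly)."""
    seconds_left = g.rate_limiting_resettime - time.time()
    time.sleep(max(seconds_left, 0) + 5)  # small buffer past the reset time

def run_query(query, g: Github, *args, **kwargs):
    """Run one query function, waiting out rate limits instead of failing."""
    while True:
        try:
            return query(g, *args, **kwargs)
        except RateLimitExceededException:
            wait_for_reset(g)

# Instantiate the GitHub object from a token, as described in the root README.
g = Github(os.environ["GITHUB_TOKEN"])
```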

Information on the collected data and resulting schemas is listed in the wiki associated with this repository.
11 changes: 6 additions & 5 deletions src/github/crawl_contents.py
@@ -116,7 +116,7 @@ def query_contents(row: pd.Series, id_key: str, g: Github):
return row

def crawl_repos(df, name, target_folder, verbose):
"""For each repository, retrieve contributions, contents, readme info, stars,
"""For each repository, retrieve contents and readme info.
Args:
df (pd.DataFrame): dataset containing GitHub repository identifiers
@@ -147,20 +147,21 @@ def crawl_repos(df, name, target_folder, verbose):
end = time.time()
print(f"Done - {end-start:.2f} seconds.")

def main(path, name, verbose):
def main(path, name, datadir, verbose):
df = pd.read_csv(path)
target_folder = '../data'
target_folder = datadir
crawl_repos(df, name, target_folder, verbose)

if __name__ == "__main__":
soft, hard = resource.getrlimit(resource.RLIMIT_AS)
resource.setrlimit(resource.RLIMIT_AS, (2000000000, hard))
parser = argparse.ArgumentParser(
prog="crawl",
description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository."
description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository."
)
parser.add_argument("-f", "--file", required=True, type=str, help="CSV file")
parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID")
parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.file, args.name, args.verbose)
main(args.file, args.name, args.datadir, args.verbose)
11 changes: 6 additions & 5 deletions src/github/crawl_contributions.py
@@ -50,7 +50,7 @@ def query_contributions(row: pd.Series, id_key: str, g: Github):
return row

def crawl_repos(df, name, target_folder, verbose):
"""For each repository, retrieve contributions, contents, readme info, stars, forks and issues. All stored as CSV.
"""For each repository, retrieve contributions and store as CSV.
Args:
df (pd.DataFrame): dataset containing GitHub repository identifiers
@@ -72,18 +72,19 @@ def crawl_repos(df, name, target_folder, verbose):
end = time.time()
print(f"Done - {end-start:.2f} seconds.")

def main(path, name, verbose):
def main(path, name, datadir, verbose):
df = pd.read_csv(path)
target_folder = '../data'
target_folder = datadir
crawl_repos(df, name, target_folder, verbose)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="crawl",
description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository."
description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository."
)
parser.add_argument("-f", "--file", required=True, type=str, help="CSV file")
parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID")
parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.file, args.name, args.verbose)
main(args.file, args.name, args.datadir, args.verbose)
11 changes: 6 additions & 5 deletions src/github/crawl_engagement.py
@@ -88,7 +88,7 @@ def query_forks(row: pd.Series, id_key: str, g: Github):
return row

def crawl_repos(df, name, target_folder, verbose):
"""For each repository, retrieve contributions, contents, readme info, stars, forks and issues. All stored as CSV.
"""For each repository, retrieve stars and forks. Stored as separate CSV.
Args:
df (pd.DataFrame): dataset containing GitHub repository identifiers
@@ -118,18 +118,19 @@ def crawl_repos(df, name, target_folder, verbose):
end = time.time()
print(f"Done - {end-start:.2f} seconds.")

def main(path, name, verbose):
def main(path, name, datadir, verbose):
df = pd.read_csv(path)
target_folder = '../data'
target_folder = datadir
crawl_repos(df, name, target_folder, verbose)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="crawl",
description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository."
description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository."
)
parser.add_argument("-f", "--file", required=True, type=str, help="CSV file")
parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID")
parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.file, args.name, args.verbose)
main(args.file, args.name, args.datadir, args.verbose)
11 changes: 6 additions & 5 deletions src/github/crawl_issues.py
@@ -57,7 +57,7 @@ def query_issues(row: pd.Series, id_key: str, g: Github):
return row

def crawl_repos(df, name, target_folder, verbose):
"""For each repository, retrieve contributions, contents, readme info, stars, forks and issues. All stored as CSV.
"""For each repository, retrieve issues and store as CSV.
Args:
df (pd.DataFrame): dataset containing GitHub repository identifiers
@@ -79,18 +79,19 @@ def crawl_repos(df, name, target_folder, verbose):
end = time.time()
print(f"Done - {end-start:.2f} seconds.")

def main(path, name, verbose):
def main(path, name, datadir, verbose):
df = pd.read_csv(path)
target_folder = '../data'
target_folder = datadir
crawl_repos(df, name, target_folder, verbose)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="crawl",
description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository."
description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository."
)
parser.add_argument("-f", "--file", required=True, type=str, help="CSV file")
parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID")
parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.file, args.name, args.verbose)
main(args.file, args.name, args.datadir, args.verbose)
15 changes: 8 additions & 7 deletions src/github/crawl_metadata.py
@@ -9,7 +9,7 @@

@wrap_query
def query_metadata(row: pd.Series, id_key: str, g: Github):
"""Gets stargazers of a repository.
"""Gets archival status, creation date, wiki existance and page existance for a repository.
Args:
row (pd.Series): contains column with repository ID
@@ -20,7 +20,7 @@ def query_metadata(row: pd.Series, id_key: str, g: Github):
pd.Series: added columns ['archived', 'created_at', 'has_wiki', 'has_pages']
"""
data = {k: [] for k in ['archived', 'created_at', 'has_wiki', 'has_pages']}
repo = safe_load_repo(g, row[id_key], "query_stars")
repo = safe_load_repo(g, row[id_key], "query_metadata")
if repo is None:
return None
for tries in range(2):
@@ -40,7 +40,7 @@ def query_metadata(row: pd.Series, id_key: str, g: Github):
return row

def crawl_repos(df, name, target_folder, verbose):
"""For each repository, retrieve contributions, contents, readme info, stars, forks and issues. All stored as CSV.
"""For each repository, retrieve metadata and store as CSV.
Args:
df (pd.DataFrame): dataset containing GitHub repository identifiers
@@ -62,18 +62,19 @@ def crawl_repos(df, name, target_folder, verbose):
end = time.time()
print(f"Done - {end-start:.2f} seconds.")

def main(path, name, verbose):
def main(path, name, datadir, verbose):
df = pd.read_csv(path)
target_folder = '../data'
target_folder = datadir
crawl_repos(df, name, target_folder, verbose)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="crawl",
description="Given a dataframe with columns user_name and repo_name, gather data from the corresponding GitHub repository."
description="Given a dataframe with a column indicating the GitHub repository ID, gather data from the corresponding GitHub repository."
)
parser.add_argument("-f", "--file", required=True, type=str, help="CSV file")
parser.add_argument("-n", "--name", required=True, type=str, help="name of column containing github ID")
parser.add_argument("--datadir", default="../../data/raw/github/", help="directory to write GitHub data to")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.file, args.name, args.verbose)
main(args.file, args.name, args.datadir, args.verbose)
