clean_eprints_links.py
import argparse
import configparser
import os
import re

import Levenshtein
import pandas as pd
from github import Github, GithubException
from unidecode import unidecode


def get_access_token():
    """Read GitHub API access token from config file.

    Returns:
        str: access token
    """
config = configparser.ConfigParser()
config.read('../../config.cfg')
return config['ACCESS']['token']


def clean_by_pattern(row, domain):
    """Extract clean repository links from raw parsed URLs.

    Parsing results often contain other text at the end of the link. The real link
    is often separated from the trailing text by a dot or a number (from the next
    footnote), so we match against domain/username/reponame and assume that the
    repo name cannot contain dots or numbers. This is not strictly true, but
    usually works well. Early cut-offs can also be corrected in the clean_by_user
    function.

    Args:
        row (pd.Series): row with a column 'domain_url'
        domain (str): domain to match against, e.g. github.com

    Returns:
        list<str>: list of cleaned links
    """
pattern = rf"{domain}/[A-Za-z0-9-]+/[A-Za-z_\-]+"
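    # Illustrative example (hypothetical input): with domain "github.com",
    # "github.com/alice/some-tool.2 Footnote" yields ["github.com/alice/some-tool"].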
cleaned = re.findall(pattern, unidecode(row['domain_url']))
return cleaned


def clean_by_user(row, column, verbose):
    """Clean links by looking up the user's repositories via the GitHub API.

    Args:
        row (pd.Series): row to process
        column (str): name of the column containing the extracted link
        verbose (bool): whether to produce additional output

    Returns:
        str: repository identifier (i.e. 'user/repo'), or None if no match is found
    """
    g = Github(get_access_token())
    _, username, repo_name = unidecode(row[column]).split("/")
    try:
        repo_id = f"{username}/{repo_name}"
        g.get_repo(repo_id)  # raises GithubException if the repository does not exist
except GithubException:
try:
user = g.get_user(username)
except GithubException: # user not found
return None
        # Fall back to fuzzy matching: pick the user's repository whose name is
        # closest to the extracted (possibly truncated) repo name.
        bestmatch = ""
        maxratio = 0.
        for r in user.get_repos():
            ratio = Levenshtein.ratio(r.name, repo_name)
            if ratio > 0.7 and ratio > maxratio:
                bestmatch = r.name
                maxratio = ratio
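        # Illustrative example (assumed values): Levenshtein.ratio("my-tool", "my-too")
        # is about 0.92, so a repo name truncated by the pattern step still clears the 0.7 cutoff.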
if bestmatch != "":
repo_id = f"{username}/{bestmatch}"
if verbose:
print(f"Matched user {username}'s repo {bestmatch} with extracted link {row[column]}.")
else: # no match found
repo_id = None
return repo_id


def main(repo, date, domain, datadir, verbose):
repo_urls_path = os.path.join(datadir, f"repo_urls/extracted_urls_{repo}_{date}_{domain}.csv")
df = pd.read_csv(repo_urls_path)
# clean by advanced URL pattern
df["pattern_cleaned_url"] = df.apply(clean_by_pattern, args=(domain,), axis=1)
df = df.explode("pattern_cleaned_url", ignore_index=True) # expand DataFrame for when multiple links are found
df.drop_duplicates(subset=['title', 'author_for_reference', 'pattern_cleaned_url'], inplace=True)
df.dropna(axis=0, subset=['pattern_cleaned_url'], inplace=True)
# check whether the repository exists using the GitHub API
if domain == "github.com":
df["github_user_cleaned_url"] = df.apply(clean_by_user, args=("pattern_cleaned_url", verbose), axis=1)
#df.drop_duplicates(subset=['title', 'author_for_reference', 'github_user_cleaned_url'], inplace=True)
df.dropna(axis=0, subset=['github_user_cleaned_url'], inplace=True)
cleaned_urls_path = os.path.join(datadir, f"cleaned_repo_urls/cleaned_urls_{repo}_{date}_{domain}.csv")
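    # Illustrative example (assumed values): repo="example.eprints.org", date="2021-2022",
    # domain="github.com" writes to
    # <datadir>/cleaned_repo_urls/cleaned_urls_example.eprints.org_2021-2022_github.com.csv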
# store as CSV
df.to_csv(cleaned_urls_path, index=False)
if verbose:
print(f"Saved cleaned URLs in {cleaned_urls_path}.csv")


if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="clean_eprints_links",
description="Clean the repository links retrieved from ePrints. Only works for Github Links for now."
)
parser.add_argument("--repo", required=True, type=str, help="name of ePrints repository (i.e. domain)")
parser.add_argument("--date", required=True, type=str, help="date range for filtering ePrints, e.g. 2021-2022")
parser.add_argument("--domain", required=True, type=str, help="domain to match against (only one can be provided for now, e.g. github.com)")
parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.repo, args.date, args.domain, args.datadir, args.verbose)
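
# Example invocation (illustrative values, assuming the default data layout):
#   python clean_eprints_links.py --repo example.eprints.org --date 2021-2022 --domain github.com -v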