parse_pdfs.py
import requests
import re
import os
import argparse
import resource
import pandas as pd
from io import BytesIO
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer


def get_domain_urls(row, domain, verbose):
    """Downloads the file and records matches of URLs of the given domain found in it, if it is a PDF.

    Args:
        row (pd.Series): contains a 'pdf_url' column pointing to the PDF
        domain (str): domain to scan for, e.g. github.com
        verbose (bool): print progress information

    Returns:
        pd.Series: row with added columns ['page_no', 'domain_url']
    """
    pattern = rf"(?P<url>https?://(www\.)?{re.escape(domain)}[^\s]+)"
    matches = {k: [] for k in ['page_no', 'domain_url']}
    pdf = requests.get(row['pdf_url'], stream=True)
    content_type = pdf.headers.get('content-type', '')
    content_length = int(pdf.headers.get('content-length', 0))
    if pdf.status_code == 200 and "pdf" in content_type and content_length < 5e7:  # ignore files larger than 50 MB to avoid OOM errors
        if verbose:
            print(f"Parsing {row['pdf_url']} of size {content_length}")
        try:
            page_layouts = extract_pages(BytesIO(pdf.content))
            for page_no, page_layout in enumerate(page_layouts):
                for element in page_layout:
                    if isinstance(element, LTTextContainer):
                        text = element.get_text()
                        for match in re.finditer(pattern, text):
                            matches['page_no'].append(page_no)
                            matches['domain_url'].append(match.group("url"))
        except Exception:
            pass  # skip PDFs that pdfminer cannot parse
    elif pdf.status_code == 200 and "pdf" in content_type:
        if verbose:
            print(f"Ignoring {row['pdf_url']} of size {content_length}")
    for k, v in matches.items():
        row[k] = v
    return row


def main(repo, date, domain, datadir, verbose):
    path = os.path.join(datadir, f"publication_urls/extracted_pdf_urls_{repo}_{date}.csv")
    df = pd.read_csv(path)
    # download each file and search it for URLs that contain the domain
    d = df.apply(get_domain_urls, axis=1, args=(domain, verbose))
    print(d.head())
    if verbose:
        print(f"Extracted URLs of domain {domain} from repository {repo}.")
    # reformat to one matched URL per row, then store as CSV
    d = d.dropna().explode(['page_no', 'domain_url'])
    d.dropna(axis=0, how='all', subset=['domain_url'], inplace=True)
    links_path = os.path.join(datadir, f"repo_urls/extracted_urls_{repo}_{date}_{domain}.csv")
    d.to_csv(links_path, index=False)
    if verbose:
        print(f"Saved extracted URLs in {links_path}.")


if __name__ == "__main__":
    # cap the process's address space at ~2 GB so oversized PDFs fail fast instead of exhausting memory
    soft, hard = resource.getrlimit(resource.RLIMIT_AS)
    resource.setrlimit(resource.RLIMIT_AS, (2000000000, hard))
    parser = argparse.ArgumentParser(
        prog="parse_pdfs",
        description="Scan the downloadable publications for links of a specific domain name, e.g. github.com."
    )
    parser.add_argument("--repo", required=True, type=str, help="name of ePrints repository (i.e. domain)")
    parser.add_argument("--date", required=True, type=str, help="date range for filtering ePrints, e.g. 2021-2022")
    parser.add_argument("--domain", required=True, type=str, help="domain to match against (only one can be provided for now, e.g. github.com)")
    parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to read ePrints data from and write results to")
    parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
    args = parser.parse_args()
    main(args.repo, args.date, args.domain, args.datadir, args.verbose)
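
# Example invocation (the repository name below is a hypothetical placeholder;
# substitute the ePrints domain you actually crawled):
#   python parse_pdfs.py --repo eprints.example.ac.uk --date 2021-2022 --domain github.com -v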