parse_eprints.py
import requests
import os
from lxml import etree
import argparse
import pandas as pd
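
# Overall flow (summary of the functions below): query the ePrints advanced-search
# export endpoint for a given date range, parse the returned XML export, collect the
# PDF download URLs for each publication, and store them as a CSV via main().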


def get_paper_list(repo, date, path):
    """Sends request to repository for papers in the specified date range. Writes output to XML file.

    Args:
        repo (str): domain name of ePrints repository
        date (str): date range to consider
        path (str): path to write XML data to
    """
    request = f"https://{repo}/cgi/search/archive/advanced?screen=Search&" \
              "output=XML&" \
              "_action_export_redir=Export&" \
              "dataset=archive&" \
              "_action_search=Search&" \
              "documents_merge=ALL&" \
              "documents=&" \
              "eprintid=&" \
              "title_merge=ALL&" \
              "title=&" \
              "contributors_name_merge=ALL&" \
              "contributors_name=&" \
              "abstract_merge=ALL&" \
              "abstract=&" \
              f"date={date}&" \
              "keywords_merge=ALL&" \
              "keywords=&" \
              "divisions_merge=ANY&" \
              "pres_type=paper&" \
              "refereed=EITHER&" \
              "publication%2Fseries_name_merge=ALL&" \
              "publication%2Fseries_name=&" \
              "documents.date_embargo=&" \
              "lastmod=&" \
              "pure_uuid=&" \
              "contributors_id=&" \
              "satisfyall=ALL&" \
              "order=contributors_name%2F-date%2Ftitle"
    response = requests.get(request)
    with open(path, "wb") as f:
        f.write(response.content)
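
# Illustrative usage sketch for get_paper_list (kept as a comment so it does not run on
# import). The repository domain and output path below are assumed values for the
# example, not part of this script:
# get_paper_list(
#     repo="eprints.example.ac.uk",
#     date="2021-2022",
#     path="../../data/raw/eprints/exports/export_eprints.example.ac.uk_2021-2022.xml",
# )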


def get_specific_fields_content(element, field_name):
    """Returns the text content of an element's child fields with the given name.

    Args:
        element (lxml.etree._Element): XML element to analyse
        field_name (str): name of field to look for

    Returns:
        list<str>: list of contents found in children of element of given name
    """
    contents = []
    for child in list(element):
        if field_name == etree.QName(child.tag).localname:
            contents.append(child.text)
    return contents


def get_specific_fields_elements(element, field_name):
    """Returns XML subelements of the given element with the given name.

    Args:
        element (lxml.etree._Element): XML element to analyse
        field_name (str): name of field to look for

    Returns:
        list<lxml.etree._Element>: list of children found of the given name
    """
    elements = []
    for child in list(element):
        if field_name == etree.QName(child.tag).localname:
            elements.append(child)
    return elements
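
# Minimal sketch of how the two helpers above behave, assuming a tiny hand-written,
# namespace-free XML fragment (the fragment is an assumption for illustration only):
# elem = etree.fromstring("<eprint><title>A Paper</title><title>Alt Title</title></eprint>")
# get_specific_fields_content(elem, "title")   # -> ["A Paper", "Alt Title"]
# get_specific_fields_elements(elem, "title")  # -> [<Element title>, <Element title>]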


def parse_pdf_urls(path):
    """Extracts download URLs of PDFs from XML file.

    Args:
        path (str): path to XML file

    Yields:
        dict: contains title, download URL for PDF, name of one of the authors
    """
    with open(path, "rb") as f:
        tree = etree.parse(f)
    root = tree.getroot()
    children = list(root)
    for c in children:
        urls = []
        title = get_specific_fields_content(c, "title")[0]
        date = get_specific_fields_content(c, "date")[0]
        creators = get_specific_fields_elements(c, "creators")
        try:
            author_for_reference = get_specific_fields_elements(get_specific_fields_elements(creators[0], "item")[0], "name")[0]
            author_name_for_reference = f"{get_specific_fields_content(author_for_reference, 'given')[0]} {get_specific_fields_content(author_for_reference, 'family')[0]}"
        except IndexError:
            print(f"No author found for {title}.")
            author_name_for_reference = ""
        documents_holders = get_specific_fields_elements(c, "documents")
        for documents_list in documents_holders:
            documents = get_specific_fields_elements(documents_list, "document")
            for document in documents:
                files_holders = get_specific_fields_elements(document, "files")
                for files_list in files_holders:
                    files = get_specific_fields_elements(files_list, "file")
                    for file in files:
                        urls += get_specific_fields_content(file, "url")
        if len(urls) > 0:  # NOTE: can sometimes include jpegs, docx etc.
            n = len(urls)
            yield {"title": [title for _ in range(n)], "date": [date for _ in range(n)], "url": urls, "author_for_reference": [author_name_for_reference for _ in range(n)]}
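
# Sketch of consuming the generator above, assuming an XML export already exists at an
# illustrative path; each yielded dict holds equal-length lists, so records can be
# concatenated column-wise, which is exactly what main() does below:
# for record in parse_pdf_urls("exports/export_example_2021-2022.xml"):
#     print(record["title"][0], record["url"])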


def main(repo, date, local, datadir, verbose):
    path = os.path.join(datadir, f"exports/export_{repo}_{date}.xml")
    if not local:  # download XML data using ePrints search engine
        get_paper_list(repo, date, path)
        if verbose:
            print("Downloaded XML list of publications.")
    # look for URLs to downloadable files for each publication
    pdf_dict = {'title': [], 'date': [], 'author_for_reference': [], 'pdf_url': []}
    for temp_dict in parse_pdf_urls(path):
        pdf_dict['title'] += temp_dict['title']
        pdf_dict['date'] += temp_dict['date']
        pdf_dict['author_for_reference'] += temp_dict['author_for_reference']
        pdf_dict['pdf_url'] += temp_dict['url']
    if verbose:
        print(f"Extracted PDF download URLs from repository {repo}.")
    # instantiate DataFrame, preliminary cleaning, store as CSV
    df = pd.DataFrame(pdf_dict)
    df.drop_duplicates(subset=['pdf_url'], inplace=True)
    df.dropna(inplace=True)
    extracted_path = os.path.join(datadir, f"publication_urls/extracted_pdf_urls_{repo}_{date}.csv")
    df.to_csv(extracted_path, index=False)
    if verbose:
        print(f"Saved extracted URLs in {extracted_path}.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="parse_eprints",
        description="Query ePrints repository for publications, extract PDF download URLs."
    )
    parser.add_argument("--repo", required=True, type=str, help="name of ePrints repository (i.e. domain)")
    parser.add_argument("--date", required=True, type=str, help="date range for filtering ePrints, e.g. 2021-2022")
    parser.add_argument("--local", action="store_true", help="use local ePrints XML output instead of downloading from web")
    parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to")
    parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
    args = parser.parse_args()
    main(args.repo, args.date, args.local, args.datadir, args.verbose)