-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetch_pubs_xml.py
118 lines (96 loc) · 3.35 KB
/
fetch_pubs_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""
Fetch documents from the Web of Science (WOS) Web Services Expanded API and write each record to an XML file.
$ python fetch_pubs_xml.py --help
"""
import argparse
import sys
import os
from string import Template
import time
import xml.etree.ElementTree as ET
import json
from lib import wose
from log_setup import get_logger
logger = get_logger()
def ln(uri):
    """Return the last '/'-separated segment of a URI-like object.

    `uri` must provide a ``toPython()`` method returning a string; the text
    after the final '/' is returned (the whole string if there is no '/').
    """
    _head, _sep, local_name = uri.toPython().rpartition('/')
    return local_name
def make_out_dir(path):
    """Ensure the output directory exists and return its resolved path.

    Args:
        path: Directory to create; may be relative or contain symlinks.

    Returns:
        The ``os.path.realpath`` form of `path`.

    Raises:
        OSError: If the directory cannot be created (for example because a
            regular file already occupies that path).
    """
    rp = os.path.realpath(path)
    if not os.path.isdir(rp):
        try:
            # makedirs (rather than mkdir) so nested output paths work too.
            os.makedirs(rp)
        except OSError:
            # Tolerate another process creating the directory between the
            # check and the create; re-raise anything else.
            if not os.path.isdir(rp):
                raise
    return rp
# SOAP envelope for the `woksearch:search` operation.
# Template placeholders: $query (user query text), $first (index of the first
# record to retrieve) and $count (number of records to retrieve).
# The bare word TSPAN is NOT a Template placeholder: prep_qstring() replaces
# it with a rendered <timeSpan> fragment, or with an empty string when no
# start date is given.
# databaseId ("WOS") and queryLanguage ("en") are hard-coded.
QUERY = Template("""
<soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/"
xmlns:woksearch="http://woksearch.v3.wokmws.thomsonreuters.com"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<soapenv:Header/>
<soapenv:Body>
<woksearch:search>
<queryParameters>
<databaseId>WOS</databaseId>
<userQuery>$query</userQuery>
TSPAN
<queryLanguage>en</queryLanguage>
</queryParameters>
<retrieveParameters>
<firstRecord>$first</firstRecord>
<count>$count</count>
</retrieveParameters>
</woksearch:search>
</soapenv:Body>
</soapenv:Envelope>
""")
# <timeSpan> fragment spliced into QUERY in place of its TSPAN marker.
# Template placeholders: $start and $end (the CLI help text gives dates such
# as 2012-01-01 as examples; presumably ISO yyyy-mm-dd is expected).
TSPAN = Template("""
<timeSpan>
<begin>$start</begin>
<end>$end</end>
</timeSpan>
""")
def prep_qstring(query, first=1, count=25, start=None, end=None):
    """Render the SOAP search request body for a query.

    Args:
        query: User query text, inserted verbatim into ``<userQuery>``.
            NOTE(review): the text is not XML-escaped here, so characters
            such as ``&`` or ``<`` must already be escaped by the caller.
        first: Value for ``<firstRecord>``.
        count: Value for ``<count>``.
        start: Begin date of the time span. When None, no ``<timeSpan>``
            element is emitted and `end` is ignored.
        end: End date of the time span. Defaults to today's date in ISO
            format when `start` is given and `end` is None.

    Returns:
        The request XML as a string.
    """
    # Imported locally: the module does not import `date` at file level, so
    # defaulting `end` previously raised NameError.
    from datetime import date

    if start is None:
        tspan = ""
    else:
        if end is None:
            end = date.today().isoformat()
        tspan = TSPAN.substitute(start=start, end=end)

    q = QUERY.substitute(query=query, first=first, count=count)
    # TSPAN is a plain-text marker in QUERY, not a Template placeholder.
    return q.replace("TSPAN", tspan)
def get_path(ut, base_path="/tmp/wose2/"):
    """Return the XML file path for a record UID, creating its directory.

    Files are sharded into sub-directories named after the first two
    characters of the identifier once leading zeros are removed.

    Args:
        ut: Record identifier, optionally prefixed with ``"WOS:"``.
        base_path: Root directory under which the shard directory is created.

    Returns:
        ``<base_path>/<shard>/<identifier>.xml``

    Raises:
        OSError: If the shard directory cannot be created.
    """
    prefix = "WOS:"
    # Strip the prefix as a whole. str.lstrip("WOS:") would remove any run of
    # the characters W, O, S and ':' and could eat into the identifier itself.
    if ut.startswith(prefix):
        ut = ut[len(prefix):]
    num = ut.lstrip("0")[:2]
    path = os.path.join(base_path, num)
    if not os.path.isdir(path):
        try:
            os.makedirs(path)
        except OSError:
            # Tolerate a concurrent creation of the same directory.
            if not os.path.isdir(path):
                raise
    return os.path.join(path, ut + ".xml")
def main():
    """Command-line entry point: run a WOS query and write each record to XML.

    Parses the command-line arguments, builds the SOAP request with
    prep_qstring(), authenticates unless a session id was supplied, fetches
    the records through wose.raw_query() and writes one XML file per record
    under the output directory.

    Environment:
        WOS_USER, WOS_PASSWORD: read only when no --session id is given.
    """
    parser = argparse.ArgumentParser(description='Fetch Web of Science Documents')
    parser.add_argument('--session', '-s', default=None, help="WOS session id")
    parser.add_argument('--start', default=None, required=True, help="Date start. E.g 2012-01-01")
    parser.add_argument('--end', default=None, required=True, help="Date end. E.g 2012-02-01")
    parser.add_argument('--query', '-q', required=True)
    parser.add_argument('--out', '-o', default="wos")
    args = parser.parse_args(sys.argv[1:])

    logger.info("Query: {}".format(args.query))
    # Example query: "OG=(Technical University of Denmark)"
    q = prep_qstring(args.query, count=100, start=args.start, end=args.end)
    logger.info("Fetching publications from WoS")
    logger.info("WOS query: {}".format(q))

    # Authenticate only if no session ID is passed in; the credentials are
    # looked up here so that --session works without them being set.
    sid = args.session
    if sid is None:
        user = os.environ['WOS_USER']
        password = os.environ['WOS_PASSWORD']
        wos = wose.Session(user=user, password=password)
        sid = wos.authenticate()
        logger.info("Session ID: {}.".format(sid))

    qid, num, records = wose.raw_query(q, sid, get_all=True)
    logger.info("{} records found.".format(len(records)))

    # Make output dir
    outd = make_out_dir(args.out)
    for rec in records:
        ut = rec.find('./UID').text
        path = get_path(ut, base_path=outd)
        # ET.tostring() returns bytes on Python 3 (and a byte str on
        # Python 2), so the file must be opened in binary mode.
        with open(path, 'wb') as outfile:
            outfile.write(ET.tostring(rec))


if __name__ == "__main__":
    main()