forked from wpoa/open-access-media-importer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
oa-pmc-ids
executable file
·52 lines (46 loc) · 1.99 KB
/
oa-pmc-ids
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from argparse import ArgumentParser
from datetime import datetime
from requests import get
from requests.exceptions import Timeout
from sys import argv, stderr, stdout
from xml.etree.cElementTree import dump, fromstring
# Command-line interface. Both date options are plain strings here; they are
# converted to dates further down in the script.
parser = ArgumentParser(
    description='List PMC IDs for articles in the PubMed Central Open Access subset.',
    epilog='Caveat: All dates are given in local time in Bethesda, Maryland: either EST (-05:00) or EDT (-04:00), depending on the time of year.',
)
parser.add_argument(
    '--from',
    type=str,
    help='Only list articles updated on or after the specified date (YYYY-MM-DD).',
)
parser.add_argument(
    '--until',
    type=str,
    help='Only list articles updated before the specified date (YYYY-MM-DD).',
)
parser.add_argument(
    '--verbose',
    action='store_true',
    dest='verbose',
    help='Output a dot to stderr for each successful HTTP request. Output a dash to stderr for each unsuccessful HTTP request.',
)

# Parsed once at start-up; `verbose` is read as a global by get_records().
args = parser.parse_args()
verbose = args.verbose
def parse_date(text):
    """Convert a 'YYYY-MM-DD' string into a ``datetime.date``.

    Raises ValueError if *text* does not match the '%Y-%m-%d' format.
    """
    moment = datetime.strptime(text, '%Y-%m-%d')
    return moment.date()
def get_records(url):
    """Yield each ``record`` element from a paginated XML listing.

    Fetches *url*, parses the response body as XML and yields every element
    matched by ``'*//record'``. If the document contains a
    ``resumption/link`` element, its ``href`` attribute becomes the next URL
    to fetch; otherwise iteration ends.

    When the module-level ``verbose`` flag is set, a '.' is written to
    stderr for each successful request and a '-' for each request that
    timed out, as promised by the ``--verbose`` help text. Progress marks
    must not go to stdout, which carries the ID listing.

    Raises:
        AssertionError: if the response status is not OK or the body is
            empty. Raised explicitly so the checks survive ``python -O``.
    """
    while url:
        # A timed-out request is retried until it succeeds (no retry limit).
        while True:
            try:
                request = get(url, timeout=3)
            except Timeout:
                if verbose:
                    stderr.write('-')
                    stderr.flush()
            else:
                break

        if not request.ok:
            raise AssertionError(
                'HTTP status %s for %s' % (request.status_code, url))
        if not request.text:
            raise AssertionError('empty response body for %s' % url)

        if verbose:
            stderr.write('.')
            stderr.flush()

        tree = fromstring(request.text)
        for record in tree.iterfind('*//record'):
            yield record

        # Follow the resumption link to the next page, if there is one.
        resumption_link = tree.find('*//resumption/link')
        if resumption_link is not None:
            url = resumption_link.attrib['href']
        else:
            url = None
# Both date filters are optional on the command line. Only the ones actually
# supplied are validated and forwarded; previously an omitted option reached
# strptime() as None and crashed with a TypeError.
date_from = getattr(args, 'from')  # 'from' is a reserved word: no args.from
date_until = args.until

query = []
if date_from is not None:
    date_from = parse_date(date_from)
    query.append('from=%s' % date_from.isoformat())
if date_until is not None:
    date_until = parse_date(date_until)
    query.append('until=%s' % date_until.isoformat())

url = 'http://www.pubmedcentral.nih.gov/utils/oa/oa.fcgi'
if query:
    url += '?' + '&'.join(query)

# Stream the IDs to stdout as the records arrive, one separator after each.
records = get_records(url)
for record in records:
    stdout.write(record.attrib['id'])
    stdout.write(' ')