-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexternal_sources.py
140 lines (122 loc) · 6.24 KB
/
external_sources.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import requests
from typing import Iterator, Iterable
from urllib.error import HTTPError
import logging
import re
from itertools import tee, filterfalse
from datetime import datetime as dt
from orcid import ORCiDFuzzyDate, ORCiDContributor
# Module-level logger, named after this module; used by OpenAlexClient below.
logger = logging.getLogger(__name__)
# NOTE(review): this configures logging at DEBUG level as a side effect of
# importing the module. Confirm that is intended for library use rather than
# leaving logging configuration to the application entry point.
logging.basicConfig(encoding='utf-8', level=logging.DEBUG)
def partition(predicate, iterable):
    """Split *iterable* into two lists according to *predicate*.

    Returns a ``(rejected, accepted)`` tuple: the first list holds the entries
    for which *predicate* is false, the second those for which it is true.
    Relative order is preserved within each list.

    The predicate is evaluated twice per entry (once per pass); if it is
    slow, consider wrapping it with functools.lru_cache().
    """
    # Example: partition(is_odd, range(10)) -> [0, 2, 4, 6, 8], [1, 3, 5, 7, 9]
    # tee() lets a one-shot iterator be walked by both passes.
    rejected_source, accepted_source = tee(iterable)
    rejected = [*filterfalse(predicate, rejected_source)]
    accepted = [*filter(predicate, accepted_source)]
    return rejected, accepted
class OpenAlexClient:
    '''
    Class for querying OpenAlex for authors and works.

    Every request carries a ``mailto`` parameter built from the e-mail address
    given at construction time. HTTP error responses are logged and reported to
    the caller as ``None`` rather than raised.
    '''

    OPEN_ALEX_URL = 'https://api.openalex.org'

    def __init__(self, ror: str, email_address: str):
        '''
        :param ror: the ROR for the institution with which the authors are associated.
        :param email_address: for use with the OpenAlex API to gain access to the "polite" pool
        '''
        self.polite_param = {'mailto': email_address}
        self.ror = ror

    def get_author_ids(self, author_name: str) -> dict:
        '''
        Given an author's name (and the ROR supplied to the constructor), retrieve the
        author's record from OpenAlex. Uses the last_known_institutions parameter as a filter.

        :param author_name: name to search for in the author display name.
        :return: the decoded JSON response, or None if OpenAlex answered with an HTTP error status.
        '''
        params = {
            'filter': f'display_name.search:{author_name},last_known_institutions.ror:{self.ror}',
            **self.polite_param,
        }
        response = requests.get(f'{OpenAlexClient.OPEN_ALEX_URL}/authors', params=params)
        try:
            response.raise_for_status()
        # raise_for_status() raises requests' own HTTPError, which is unrelated
        # to urllib.error.HTTPError; catching the urllib class never matches.
        except requests.exceptions.HTTPError:
            logger.error(response.text)
            return None
        return response.json()

    def get_works(self, author_id: str, titles: Iterable[str], years: Iterable[str]) -> Iterator[list]:
        '''
        Retrieves OpenAlex metadata, given an OpenAlex author ID and a list of titles and years of publication.

        Titles and years are paired positionally; one item is yielded per pair, in order.

        :param author_id: OpenAlex author ID used in the author.id filter.
        :param titles: titles to search for.
        :param years: should have None or empty string where no year supplied
        :return: yields the decoded JSON response for each title, or None where
                 OpenAlex answered with an HTTP error status.
        '''
        # The context manager closes the session (and its pooled connections)
        # when the generator is exhausted or closed.
        with requests.Session() as session:
            session.params = self.polite_param
            for title, year in zip(titles, years):
                logger.debug('Getting info for %s', title)
                # Need to remove colons and commas from titles for querying
                title_param = re.sub(r'[:,]', ' ', title)
                filters = [f'display_name.search:{title_param}', f'author.id:{author_id}']
                if year:
                    filters.append(f'publication_year:{int(year)}')
                response = session.get(f'{OpenAlexClient.OPEN_ALEX_URL}/works',
                                       params={'filter': ','.join(filters)})
                try:
                    response.raise_for_status()
                except requests.exceptions.HTTPError:
                    logger.error(response.text)
                    yield None
                else:
                    yield response.json()
class OpenAlexMapping:
    '''
    Converts OpenAlex API work records into the field layout used for ORCiD works.
    '''

    # Maps OpenAlex work types to ORCiD work types.
    # https://api.openalex.org/works?group_by=type
    # NOTE(review): confirm 'dissertation' and 'encyclopaedia-entry' against the
    # ORCiD schema version in use (some versions spell these 'dissertation-thesis'
    # and 'encyclopedia-entry').
    OPENALEX_TYPE_MAPPING = {
        'article': 'journal-article',
        'book-chapter': 'book-chapter',
        'book': 'book',
        'dataset': 'data-set',
        'dissertation': 'dissertation',
        'preprint': 'preprint',
        'reference-entry': 'encyclopaedia-entry',
        'review': 'book-review',
        'report': 'report',
        'other': 'other',
        'peer-review': 'review',
        'standard': 'standards-and-policy',
        'editorial': 'other',
        'erratum': 'other',
        'letter': 'other',
        'supplementary-materials': 'other',
    }

    def resolve_duplicates(self, items: list[dict]) -> dict:
        '''
        In the case of multiple matches returned by the OpenAlex API, returns one based on the following logic:
        - prefer published type to preprint
        - choose the (non-preprint) item with the highest score

        :param items: non-empty list of OpenAlex work records, in the order returned by the API.
        :return: the first non-preprint record, or the first record if all are preprints.
        :raises IndexError: if items is empty.
        '''
        # OpenAlex results are returned in descending order of relevance, so the
        # first non-preprint encountered is the best-scoring published match.
        for item in items:
            if item.get('type') != 'preprint':
                return item
        return items[0]

    def extract_authors(self, authorships: list[dict]) -> Iterator[dict[str, str]]:
        '''
        Extracts ORCiD-relevant information from the authorships object of an OpenAlex work.

        :param authorships: list of authorship records, each holding an 'author' dict.
        :return: yields one dict per authorship with the author's display name and
                 ORCiD (None when the author record has no 'orcid' key).
        '''
        for authorship in authorships:
            author = authorship['author']
            yield {
                'credit_name': author['display_name'],
                'contributor_orcid': author.get('orcid'),
            }

    def to_orcid_work(self, works: dict[str, list]) -> dict[str, str]:
        '''
        Extract work identifiers and other metadata from a set of OpenAlex API works, converting fields to those used by ORCiD

        :param works: decoded OpenAlex /works response (may be None or empty).
        :return: a dict of ORCiD work fields, or None when there is no response,
                 no result, or the selected work has no DOI.
        '''
        if not works:
            return None
        results = works.get('results') or []
        if not results:
            return None
        # Always resolve, so that two matches (preprint + published) are handled
        # the same way as three or more; a single result is returned unchanged.
        work = self.resolve_duplicates(results)
        # If it's missing a DOI, don't use it
        # OpenAlex uses the DOI as the canonical ID for works, so the field should be populated if a DOI is available
        if not work.get('doi'):
            return None
        # Both primary_location and its source can be present but null.
        primary_location = work.get('primary_location') or {}
        source = primary_location.get('source') or {}
        return {
            'title': work['title'],
            # Types missing from the mapping fall back to 'other' instead of raising KeyError.
            '_type': self.OPENALEX_TYPE_MAPPING.get(work['type'], 'other'),
            'journal_title': source.get('display_name'),
            'publication_date': ORCiDFuzzyDate.create_from_date(work['publication_date']),
            'doi': work['doi'],
            'url': primary_location.get('landing_page_url'),
            'contributors': ORCiDContributor.add_contributors(list(self.extract_authors(work['authorships']))),
        }