Skip to content

Commit

Permalink
#24 #3 better DOI resolution framework
Browse files Browse the repository at this point in the history
  • Loading branch information
mrwunderbar666 committed Jul 25, 2023
1 parent f638366 commit 2feb1a5
Show file tree
Hide file tree
Showing 2 changed files with 208 additions and 3 deletions.
202 changes: 202 additions & 0 deletions flaskinventory/external/doi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
import requests
from typing import Union
from dateutil import parser as dateparser
from thefuzz import fuzz


def resolve(doi: str) -> Union[dict, bool]:
# clean input string
doi = doi.replace("https://doi.org/", "")
doi = doi.replace("http://doi.org/", "")
doi = doi.replace("doi.org/", "")


api = 'https://api.crossref.org/works/'

r = requests.get(api + doi)

if r.status_code != 200:
return False

publication = r.json()

if publication['status'] != 'ok':
return False

publication = publication['message']

result = {'doi': doi}

result['venue'] = publication.get('container-title')
if isinstance(result['venue'], list):
result['venue'] = result['venue'][0]
result['title'] = publication.get('title')

if isinstance(result['title'], list):
result['title'] = result['title'][0]

result['paper_kind'] = publication.get('type')

if publication.get('created'):
result['date_published'] = dateparser.parse(publication['created']['date-time'])

if publication.get('link'):
result['url'] = publication['link'][0]['URL']

if publication.get('author'):
authors = []
for i, author in enumerate(publication['author']):
author_name = f"{author.get('family', '')}, {author.get('given')}"
# authors.append(Scalar(author_name, facets={'sequence': i}))

result['authors'] = authors


return result



def openalex_getauthorname(author_id: str) -> dict:
api = "https://api.openalex.org/people/"
r = requests.get(api + author_id, params={'mailto': "[email protected]"})
j = r.json()
result = {'openalex': author_id}
if 'display_name' in j:
result['name'] = j['display_name']
if 'orcid' in j:
result['orcid'] = j['orcid']
if 'last_known_institution' in j:
try:
# prevent circular imports
from flaskinventory.flaskdgraph.dgraph_types import Scalar
institution = Scalar(j['last_known_institution']['display_name'],
facets={'openalex': j['last_known_institution']['id']})
result['last_known_institution'] = institution
except:
pass
return result

def openalex_getauthor_ids(query: str) -> list:
api = "https://api.openalex.org/authors"

query = query.replace(",", " ")
query = query.replace(";", " ")

params = {"search": query,
'mailto': "[email protected]"}

r = requests.get(api, params=params)

r.raise_for_status()

return r.json()['results']


def doi_org(doi: str) -> dict:
api = "https://doi.org/"

headers = {'Accept': "application/vnd.citationstyles.csl+json"}

r = requests.get(api + doi, headers=headers)

r.raise_for_status()

j = r.json()

result = {'doi': j['DOI'],
'url': j['URL'],
'title': j.get('title'),
'description': j.get('abstract'),
}

try:
result['date_published'] = j['issued']['date-parts'][0][0]
except:
pass

result['_authors_fallback'] = []
result['_authors_fallback|sequence'] = {}

for i, author in enumerate(j['author']):
try:
name = author['family'] + ', ' + author['given']
result['_authors_fallback'].append(name)
result['_authors_fallback|sequence'][str(i)] = i
except KeyError:
continue

return result

def datacite(doi: str) -> dict:
api = "https://api.datacite.org/dois/"

r = requests.get(api + doi, params={'affiliation': 'true'})

r.raise_for_status()

j = r.json()['data']['attributes']

result = {}

for d in j['dates']:
if d['dateType'] == 'Issued':
result['date_published'] = d['date']

for desc in j['descriptions']:
if desc['descriptionType'] == 'Abstract':
result['description'] = desc['description']

result['doi'] = j['doi']
result['date_published'] = dateparser.parse(j['published'])

for t in j['titles']:
if t['lang'] == 'en':
result['title'] = t['title']

result['url'] = j['url']

for author in j['creators']:
# print(author)
pass

return result

def orcid_search(name: str=None, family: str= None, given: str=None, affiliation: str = None) -> dict:
""" Fallback to orcid as last ressort"""
pass

if __name__ == '__main__':

doi = '10.25522/manifesto.mpdssa.2020b'

# result = datacite(doi)

# print(result)

affiliation = "WZB Berlin Social Science Center"

author = "Zehnter, Lisa"

candidates = openalex_getauthor_ids(author)

print(candidates)

for c in candidates:
c['x_concepts'] = None
score = 0
if not 'relevance_score' in c:
c['relevance_score'] = 1
try:
score = fuzz.partial_token_sort_ratio(c['last_known_institution']['display_name'], affiliation)
c['institution_match'] = score
c['overall_score'] = c['relevance_score'] * score
if c['orcid']:
c['overall_score'] = c['overall_score'] * 2
except:
c['overall_score'] = 0

from pprint import pprint

pprint(candidates)

# print(sorted(candidates, key=lambda x: x['overall_score'], reverse=True)[0])
9 changes: 6 additions & 3 deletions tools/wp2_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,22 +175,25 @@
df.languages = df.languages.str.replace(k, v)

query_string = '''query language($language: string) {
q(func: match(name, $language, 2)) @filter(type(Language)) { uid _unique_name name }
q(func: type(Language)) @filter(match(name, $language, 2) OR eq(alternate_names, $language)) { uid _unique_name name }
}'''

languages = list(set(df.languages.apply(lambda x: [y.strip() for y in x.split(';')]).explode().unique().tolist()))
languages.remove('')

language_uid_mapping = {}

failed = []

for language_name in languages:
language = client.txn(read_only=True).query(
query_string, variables={'$language': language_name})
query_string, variables={'$language': language_name.strip()})
try:
j = json.loads(language.json)
language_uid_mapping[language_name] = j['q'][0]['uid']
language_uid_mapping[language_name.strip()] = j['q'][0]['uid']
except:
print('Could not find language:', language_name)
failed.append(language_name)

""" Channels """

Expand Down

0 comments on commit 2feb1a5

Please sign in to comment.