Consistent DOI ID keys
lwrubel committed Jun 25, 2024
1 parent 1d8d33c commit c8af82d
Showing 6 changed files with 28 additions and 7 deletions.
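
Every harvester touched by this commit normalizes DOIs the same way: strip the "https://doi.org/" resolver prefix so that publication dictionaries are keyed by the bare DOI (e.g. "10.1109/lra.2018.2890209") no matter which API the record came from. A minimal sketch of the shared idea, using a hypothetical doi_id helper that does not appear in the commit itself:

    def doi_id(doi: str) -> str:
        # Strip the resolver prefix so a URL form and a bare DOI
        # normalize to the same dictionary key.
        return doi.replace("https://doi.org/", "")

    assert doi_id("https://doi.org/10.1234/abcd") == "10.1234/abcd"
    assert doi_id("10.1234/abcd") == "10.1234/abcd"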
3 changes: 3 additions & 0 deletions .gitignore
@@ -163,4 +163,7 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/

+# rialto-airflow
+data/
+.DS_Store
3 changes: 2 additions & 1 deletion rialto_airflow/harvest/dimensions.py
@@ -36,7 +36,8 @@ def dois_from_orcid(orcid):
        logging.warning("Truncated results for ORCID %s", orcid)
    for pub in result["publications"]:
        if pub.get("doi"):
-           yield pub["doi"]
+           doi_id = pub["doi"].replace("https://doi.org/", "")
+           yield doi_id


def doi_orcids_pickle(authors_csv, pickle_file, limit=None) -> None:
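
Worth noting: str.replace removes the substring wherever it occurs, not just at the start of the string. For the URL-prefixed DOIs Dimensions returns this is equivalent, but str.removeprefix (Python 3.9+) states the intent more strictly; a sketch, not part of the commit:

    url = "https://doi.org/10.1234/abcd"
    assert url.replace("https://doi.org/", "") == "10.1234/abcd"
    assert url.removeprefix("https://doi.org/") == "10.1234/abcd"  # Python 3.9+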
18 changes: 16 additions & 2 deletions rialto_airflow/harvest/openalex.py
@@ -46,17 +46,31 @@ def dois_from_orcid(orcid: str, limit=None):
    time.sleep(1)

    logging.info(f"looking up dois for orcid {orcid}")

    # get the first (and hopefully only) openalex id for the orcid
    authors = Authors().filter(orcid=orcid).get()
    if len(authors) == 0:
        return []
    elif len(authors) > 1:
        logging.warning(f"found more than one openalex author id for {orcid}")
    author_id = authors[0]["id"]

    # get all the works for the openalex author id
    work_count = 0
    for page in Works().filter(author={"id": author_id}).paginate(per_page=200):
        for pub in page:
            if pub.get("doi"):
                work_count += 1
                if limit is not None and work_count > limit:
                    return
                yield pub.get("doi").replace("https://doi.org/", "")


def works_from_author_id(author_id, limit=None):
    """
    Pass in the OpenAlex Author ID and get back an iterator of works.
    """
    work_count = 0
    for page in Works().filter(author={"id": author_id}).paginate(per_page=200):
        for pub in page:
            if pub.get("doi"):
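
After this change dois_from_orcid is a generator that yields at most limit bare DOI strings; work_count is incremented before the comparison, so the limit check trips on the first publication past the cap. A hedged usage sketch (assumes OpenAlex API access through the pyalex-backed Authors/Works calls this module uses, and borrows the ORCID from the test suite):

    from rialto_airflow.harvest.openalex import dois_from_orcid

    # Yields bare IDs like "10.1234/abcd", never "https://doi.org/..." URLs.
    dois = list(dois_from_orcid("0000-0002-0770-2940", limit=5))
    assert len(dois) <= 5
    assert not any(d.startswith("https://doi.org/") for d in dois)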
9 changes: 5 additions & 4 deletions rialto_airflow/harvest/sul_pub.py
@@ -4,7 +4,7 @@
import requests


-sul_pub_fields = [
+SUL_PUB_FIELDS = [
    "authorship",
    "title",
    "abstract",
@@ -35,7 +35,7 @@

def sul_pub_csv(csv_file, host, key, since=None, limit=None):
    with open(csv_file, "w") as csvfile:
-       writer = csv.DictWriter(csvfile, fieldnames=sul_pub_fields)
+       writer = csv.DictWriter(csvfile, fieldnames=SUL_PUB_FIELDS)
        writer.writeheader()
        for row in harvest(host, key, since, limit):
            writer.writerow(row)
Expand Down Expand Up @@ -73,7 +73,7 @@ def harvest(host, key, since, limit):
                more = False
                break

-           pub = {key: record[key] for key in record if key in sul_pub_fields}
+           pub = {key: record[key] for key in record if key in SUL_PUB_FIELDS}
            pub["doi"] = extract_doi(record)

            yield pub
@@ -82,5 +82,6 @@ def harvest(host, key, since, limit):
def extract_doi(record):
    for id in record.get("identifier"):
        if id["type"] == "doi":
-           return id["id"]
+           doi_id = id["id"].replace("https://doi.org/", "")
+           return doi_id
    return None
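
extract_doi now returns a bare ID as well. A sketch of the expected behavior, assuming the record shape the code implies (an "identifier" list of dicts with "type" and "id" keys):

    record = {"identifier": [{"type": "doi", "id": "https://doi.org/10.1234/abcd"}]}
    assert extract_doi(record) == "10.1234/abcd"
    assert extract_doi({"identifier": []}) is None  # no DOI-typed identifier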
1 change: 1 addition & 0 deletions test/harvest/test_dimensions.py
@@ -22,6 +22,7 @@ def test_doi_orcids_dict(tmpdir):

    assert len(doi_orcids) > 0
    assert doi_orcids["10.1109/lra.2018.2890209"] == ["0000-0002-0770-2940"]
+   assert "https://doi.org/" not in list(doi_orcids.keys())[0], "doi is an ID"


def test_publications_from_dois():
1 change: 1 addition & 0 deletions test/harvest/test_openalex.py
@@ -30,6 +30,7 @@ def test_doi_orcids_pickle(tmp_path):
    assert len(mapping) > 0

    doi = list(mapping.keys())[0]
+   assert "https://doi.org/" not in doi, "doi is an ID"
    assert "/" in doi

    orcids = mapping[doi]
