Skip to content

Commit

Permalink
Consistent DOI ID keys
Browse files Browse the repository at this point in the history
  • Loading branch information
lwrubel committed Jun 25, 2024
1 parent 485e4bb commit 41034b0
Show file tree
Hide file tree
Showing 6 changed files with 14 additions and 6 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,7 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# rialto-airflow
data/
.DS_Store

3 changes: 2 additions & 1 deletion rialto_airflow/harvest/dimensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ def dois_from_orcid(orcid):
logging.warning("Truncated results for ORCID %s", orcid)
for pub in result["publications"]:
if pub.get("doi"):
yield pub["doi"]
doi_id = pub["doi"].replace("https://doi.org/", "")
yield doi_id


def doi_orcids_pickle(authors_csv, pickle_file, limit=None) -> None:
Expand Down
3 changes: 2 additions & 1 deletion rialto_airflow/harvest/openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ def dois_from_orcid(orcid: str):
# not all publications have DOIs
doi = pub.get("doi")
if doi:
yield doi
doi_id = doi.replace("https://doi.org/", "")
yield doi_id


def works_from_author_id(author_id, limit=None):
Expand Down
9 changes: 5 additions & 4 deletions rialto_airflow/harvest/sul_pub.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import requests


sul_pub_fields = [
SUL_PUB_FIELDS = [
"authorship",
"title",
"abstract",
Expand Down Expand Up @@ -35,7 +35,7 @@

def sul_pub_csv(csv_file, host, key, since=None, limit=None):
with open(csv_file, "w") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=sul_pub_fields)
writer = csv.DictWriter(csvfile, fieldnames=SUL_PUB_FIELDS)
writer.writeheader()
for row in harvest(host, key, since, limit):
writer.writerow(row)
Expand Down Expand Up @@ -73,7 +73,7 @@ def harvest(host, key, since, limit):
more = False
break

pub = {key: record[key] for key in record if key in sul_pub_fields}
pub = {key: record[key] for key in record if key in SUL_PUB_FIELDS}
pub["doi"] = extract_doi(record)

yield pub
Expand All @@ -82,5 +82,6 @@ def harvest(host, key, since, limit):
def extract_doi(record):
    """Return the bare DOI ID found in a publication record, or None.

    Scans the record's "identifier" list for the first entry whose "type"
    is "doi" and returns its "id" with any "https://doi.org/" resolver
    prefix stripped, so that DOIs are keyed consistently as IDs rather
    than URLs.

    Args:
        record: a dict that may contain an "identifier" key holding a list
            of dicts with "type" and "id" keys.

    Returns:
        The DOI ID string (e.g. "10.1109/lra.2018.2890209"), or None when
        the record has no identifiers or none of them is a DOI.
    """
    # A record without an "identifier" key (or with a null value) simply has
    # no DOI; fall back to an empty list instead of iterating over None.
    for identifier in record.get("identifier") or []:
        if identifier.get("type") == "doi":
            # Normalise URL-style DOIs to the bare ID form.
            return identifier["id"].replace("https://doi.org/", "")
    return None
1 change: 1 addition & 0 deletions test/harvest/test_dimensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def test_doi_orcids_dict(tmpdir):

assert len(doi_orcids) > 0
assert doi_orcids["10.1109/lra.2018.2890209"] == ["0000-0002-0770-2940"]
assert "https://doi.org/" not in doi_orcids.keys()[0], "doi is an ID"


def test_publications_from_dois():
Expand Down
1 change: 1 addition & 0 deletions test/harvest/test_openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def test_doi_orcids_pickle(tmp_path):
assert len(mapping) > 0

doi = list(mapping.keys())[0]
assert "https://doi.org/" not in doi, "doi is an ID"
assert "/" in doi

orcids = mapping[doi]
Expand Down

0 comments on commit 41034b0

Please sign in to comment.