From c8af82d31c0f1b8d5ef5a0e2a8776a7bed60eb86 Mon Sep 17 00:00:00 2001 From: Laura Wrubel Date: Tue, 25 Jun 2024 08:59:38 -0400 Subject: [PATCH] Consistent DOI ID keys --- .gitignore | 3 +++ rialto_airflow/harvest/dimensions.py | 3 ++- rialto_airflow/harvest/openalex.py | 18 ++++++++++++++++-- rialto_airflow/harvest/sul_pub.py | 9 +++++---- test/harvest/test_dimensions.py | 1 + test/harvest/test_openalex.py | 1 + 6 files changed, 28 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 09da277..96a430d 100644 --- a/.gitignore +++ b/.gitignore @@ -163,4 +163,7 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +# rialto-airflow data/ +.DS_Store + diff --git a/rialto_airflow/harvest/dimensions.py b/rialto_airflow/harvest/dimensions.py index d308b4b..fe69bab 100644 --- a/rialto_airflow/harvest/dimensions.py +++ b/rialto_airflow/harvest/dimensions.py @@ -36,7 +36,8 @@ def dois_from_orcid(orcid): logging.warning("Truncated results for ORCID %s", orcid) for pub in result["publications"]: if pub.get("doi"): - yield pub["doi"] + doi_id = pub["doi"].replace("https://doi.org/", "") + yield doi_id def doi_orcids_pickle(authors_csv, pickle_file, limit=None) -> None: diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py index d7d493b..2ed59b9 100644 --- a/rialto_airflow/harvest/openalex.py +++ b/rialto_airflow/harvest/openalex.py @@ -46,7 +46,7 @@ def dois_from_orcid(orcid: str, limit=None): time.sleep(1) logging.info(f"looking up dois for orcid {orcid}") - + # get the first (and hopefully only) openalex id for the orcid authors = Authors().filter(orcid=orcid).get() if len(authors) == 0: @@ -54,9 +54,23 @@ def dois_from_orcid(orcid: str, limit=None): elif len(authors) > 1: logging.warn(f"found more than one openalex author id for {orcid}") author_id = authors[0]["id"] - + # get all the works for the openalex author id work_count = 0 + for page in Works().filter(author={"id": author_id}).paginate(per_page=200): + for pub in page: + if pub.get("doi"): + work_count += 1 + if limit is not None and work_count > limit: + return + yield pub.get("doi").replace("https://doi.org/", "") + + +def works_from_author_id(author_id, limit=None): + """ + Pass in the OpenAlex Author ID and get back an iterator of works. + """ + work_count = 0 for page in Works().filter(author={"id": author_id}).paginate(per_page=200): for pub in page: if pub.get("doi"): diff --git a/rialto_airflow/harvest/sul_pub.py b/rialto_airflow/harvest/sul_pub.py index 6d0bd2d..02c3e41 100644 --- a/rialto_airflow/harvest/sul_pub.py +++ b/rialto_airflow/harvest/sul_pub.py @@ -4,7 +4,7 @@ import requests -sul_pub_fields = [ +SUL_PUB_FIELDS = [ "authorship", "title", "abstract", @@ -35,7 +35,7 @@ def sul_pub_csv(csv_file, host, key, since=None, limit=None): with open(csv_file, "w") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=sul_pub_fields) + writer = csv.DictWriter(csvfile, fieldnames=SUL_PUB_FIELDS) writer.writeheader() for row in harvest(host, key, since, limit): writer.writerow(row) @@ -73,7 +73,7 @@ def harvest(host, key, since, limit): more = False break - pub = {key: record[key] for key in record if key in sul_pub_fields} + pub = {key: record[key] for key in record if key in SUL_PUB_FIELDS} pub["doi"] = extract_doi(record) yield pub @@ -82,5 +82,6 @@ def harvest(host, key, since, limit): def extract_doi(record): for id in record.get("identifier"): if id["type"] == "doi": - return id["id"] + doi_id = id["id"].replace("https://doi.org/", "") + return doi_id return None diff --git a/test/harvest/test_dimensions.py b/test/harvest/test_dimensions.py index f7007bd..9aff083 100644 --- a/test/harvest/test_dimensions.py +++ b/test/harvest/test_dimensions.py @@ -22,6 +22,7 @@ def test_doi_orcids_dict(tmpdir): assert len(doi_orcids) > 0 assert doi_orcids["10.1109/lra.2018.2890209"] == ["0000-0002-0770-2940"] + assert "https://doi.org/" not in list(doi_orcids.keys())[0], "doi is an ID" def test_publications_from_dois(): diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py index 1fa0171..c355c81 100644 --- a/test/harvest/test_openalex.py +++ b/test/harvest/test_openalex.py @@ -30,6 +30,7 @@ def test_doi_orcids_pickle(tmp_path): assert len(mapping) > 0 doi = list(mapping.keys())[0] + assert "https://doi.org/" not in doi, "doi is an ID" assert "/" in doi orcids = mapping[doi]