From f899f236101bb010ff2abe764a7298a98e95a90a Mon Sep 17 00:00:00 2001 From: Laura Wrubel Date: Mon, 8 Jul 2024 10:38:20 -0400 Subject: [PATCH 1/3] Retry DOI lookups when invalid DOI in batch --- rialto_airflow/harvest/openalex.py | 20 ++++++++++++++++---- test/data/openalex-dois.csv | 2 +- test/harvest/test_openalex.py | 4 +++- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py index 8cb78f3..ca24d6f 100644 --- a/rialto_airflow/harvest/openalex.py +++ b/rialto_airflow/harvest/openalex.py @@ -6,7 +6,7 @@ from urllib.parse import quote from more_itertools import batched -from pyalex import Authors, Works, config +from pyalex import Authors, Works, config, api from rialto_airflow.utils import invert_dict @@ -89,9 +89,21 @@ def publications_from_dois(dois: list): time.sleep(1) doi_list = quote("|".join([doi for doi in doi_batch])) - for page in Works().filter(doi=doi_list).paginate(per_page=200): - for pub in page: - yield normalize_publication(pub) + try: + for page in Works().filter(doi=doi_list).paginate(per_page=200): + for pub in page: + yield normalize_publication(pub) + except api.QueryError: + # try dois individually + for doi in doi_batch: + try: + pubs = Works().filter(doi=doi).get() + if len(pubs) > 1: + logging.warn(f"Found multiple publications for DOI {doi}") + yield normalize_publication(pubs[0]) + except api.QueryError as e: + logging.error(f"OpenAlex QueryError for {doi}: {e}") + continue def normalize_publication(pub) -> dict: diff --git a/test/data/openalex-dois.csv b/test/data/openalex-dois.csv index a56178f..65dc6c9 100644 --- a/test/data/openalex-dois.csv +++ b/test/data/openalex-dois.csv @@ -1,6 +1,6 @@ doi 10.1002/adma.202103646 -10.1001/jamacardio.2021.6059 +"10.1001/jamacardio,2021.6059" 10.3389/fimmu.2022.832501 10.1161/strokeaha.122.040540 10.1001/jamainternmed.2023.2561 diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py index fdd054c..7127942 100644 --- a/test/harvest/test_openalex.py +++ b/test/harvest/test_openalex.py @@ -48,7 +48,9 @@ def test_publications_from_dois(): # look up the publication metadata for them pubs = list(openalex.publications_from_dois(dois)) - assert len(pubs) == 231, "should paginate (page size=200)" + assert ( + len(pubs) == 230 + ), "should paginate (page size=200) and have skipped invalid DOI" assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique" assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for." assert len(pubs[0].keys()) == 51, "first publication has 51 columns" From 05ab9e255eba7735e3ae7af1c8ce24023465bbac Mon Sep 17 00:00:00 2001 From: Laura Wrubel Date: Mon, 8 Jul 2024 11:26:25 -0400 Subject: [PATCH 2/3] Add test of QueryError --- test/data/openalex-dois.csv | 2 +- test/harvest/test_openalex.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/test/data/openalex-dois.csv b/test/data/openalex-dois.csv index 65dc6c9..a56178f 100644 --- a/test/data/openalex-dois.csv +++ b/test/data/openalex-dois.csv @@ -1,6 +1,6 @@ doi 10.1002/adma.202103646 -"10.1001/jamacardio,2021.6059" +10.1001/jamacardio.2021.6059 10.3389/fimmu.2022.832501 10.1161/strokeaha.122.040540 10.1001/jamainternmed.2023.2561 diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py index 7127942..08598d2 100644 --- a/test/harvest/test_openalex.py +++ b/test/harvest/test_openalex.py @@ -48,15 +48,22 @@ def test_publications_from_dois(): # look up the publication metadata for them pubs = list(openalex.publications_from_dois(dois)) - assert ( - len(pubs) == 230 - ), "should paginate (page size=200) and have skipped invalid DOI" + assert len(pubs) == 231, "should paginate (page size=200)" assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique" - assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for." assert len(pubs[0].keys()) == 51, "first publication has 51 columns" assert len(pubs[1].keys()) == 51, "second publication has 51 columns" +def test_publications_from_invalid_dois(caplog): + # Error may change if OpenAlex API or pyalex changes + invalid_dois = ["doi-with-comma,a", "10.1145/3442188.3445922"] + assert len(list(openalex.publications_from_dois(invalid_dois))) == 1 + assert ( + "OpenAlex QueryError for doi-with-comma,a: Invalid query parameter" + in caplog.text + ), "logs error message" + + def test_publications_csv(tmp_path): pubs_csv = tmp_path / "openalex-pubs.csv" openalex.publications_csv( From bb285248115e2df00a9049757a2f732f3c54dc09 Mon Sep 17 00:00:00 2001 From: Laura Wrubel Date: Mon, 8 Jul 2024 11:35:23 -0400 Subject: [PATCH 3/3] Restore assertion --- test/harvest/test_openalex.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py index 08598d2..71b4c01 100644 --- a/test/harvest/test_openalex.py +++ b/test/harvest/test_openalex.py @@ -50,6 +50,7 @@ def test_publications_from_dois(): pubs = list(openalex.publications_from_dois(dois)) assert len(pubs) == 231, "should paginate (page size=200)" assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique" + assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for." assert len(pubs[0].keys()) == 51, "first publication has 51 columns" assert len(pubs[1].keys()) == 51, "second publication has 51 columns"