From c8af82d31c0f1b8d5ef5a0e2a8776a7bed60eb86 Mon Sep 17 00:00:00 2001
From: Laura Wrubel <lwrubel@stanford.edu>
Date: Tue, 25 Jun 2024 08:59:38 -0400
Subject: [PATCH] Consistent DOI ID keys

---
 .gitignore                           |  3 +++
 rialto_airflow/harvest/dimensions.py |  3 ++-
 rialto_airflow/harvest/openalex.py   | 18 ++++++++++++++++--
 rialto_airflow/harvest/sul_pub.py    |  9 +++++----
 test/harvest/test_dimensions.py      |  1 +
 test/harvest/test_openalex.py        |  1 +
 6 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index 09da277..96a430d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,4 +163,6 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
+# rialto-airflow
 data/
+.DS_Store
diff --git a/rialto_airflow/harvest/dimensions.py b/rialto_airflow/harvest/dimensions.py
index d308b4b..fe69bab 100644
--- a/rialto_airflow/harvest/dimensions.py
+++ b/rialto_airflow/harvest/dimensions.py
@@ -36,7 +36,8 @@ def dois_from_orcid(orcid):
         logging.warning("Truncated results for ORCID %s", orcid)
     for pub in result["publications"]:
         if pub.get("doi"):
-            yield pub["doi"]
+            doi_id = pub["doi"].replace("https://doi.org/", "")
+            yield doi_id
 
 
 def doi_orcids_pickle(authors_csv, pickle_file, limit=None) -> None:
diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py
index d7d493b..2ed59b9 100644
--- a/rialto_airflow/harvest/openalex.py
+++ b/rialto_airflow/harvest/openalex.py
@@ -54,9 +54,23 @@ def dois_from_orcid(orcid: str, limit=None):
     elif len(authors) > 1:
         logging.warn(f"found more than one openalex author id for {orcid}")
     author_id = authors[0]["id"]
 
     # get all the works for the openalex author id
     work_count = 0
+    for page in Works().filter(author={"id": author_id}).paginate(per_page=200):
+        for pub in page:
+            if pub.get("doi"):
+                work_count += 1
+                if limit is not None and work_count > limit:
+                    return
+                yield pub.get("doi").replace("https://doi.org/", "")
+
+
+def works_from_author_id(author_id, limit=None):
+    """
+    Pass in the OpenAlex Author ID and get back an iterator of works.
+    """
+    work_count = 0
     for page in Works().filter(author={"id": author_id}).paginate(per_page=200):
         for pub in page:
             if pub.get("doi"):
diff --git a/rialto_airflow/harvest/sul_pub.py b/rialto_airflow/harvest/sul_pub.py
index 6d0bd2d..02c3e41 100644
--- a/rialto_airflow/harvest/sul_pub.py
+++ b/rialto_airflow/harvest/sul_pub.py
@@ -4,7 +4,7 @@
 import requests
 
 
-sul_pub_fields = [
+SUL_PUB_FIELDS = [
     "authorship",
     "title",
     "abstract",
@@ -35,7 +35,7 @@
 
 def sul_pub_csv(csv_file, host, key, since=None, limit=None):
     with open(csv_file, "w") as csvfile:
-        writer = csv.DictWriter(csvfile, fieldnames=sul_pub_fields)
+        writer = csv.DictWriter(csvfile, fieldnames=SUL_PUB_FIELDS)
         writer.writeheader()
         for row in harvest(host, key, since, limit):
             writer.writerow(row)
@@ -73,7 +73,7 @@ def harvest(host, key, since, limit):
                 more = False
                 break
 
-            pub = {key: record[key] for key in record if key in sul_pub_fields}
+            pub = {key: record[key] for key in record if key in SUL_PUB_FIELDS}
             pub["doi"] = extract_doi(record)
 
             yield pub
@@ -82,5 +82,10 @@ def harvest(host, key, since, limit):
 def extract_doi(record):
-    for id in record.get("identifier"):
-        if id["type"] == "doi":
-            return id["id"]
+    """
+    Return the bare DOI (without the https://doi.org/ prefix) from a sul_pub
+    record's identifiers, or None if the record has no DOI identifier.
+    """
+    # a record without an "identifier" list is treated as having no DOI
+    for identifier in record.get("identifier") or []:
+        if identifier["type"] == "doi":
+            return identifier["id"].replace("https://doi.org/", "")
     return None
diff --git a/test/harvest/test_dimensions.py b/test/harvest/test_dimensions.py
index f7007bd..9aff083 100644
--- a/test/harvest/test_dimensions.py
+++ b/test/harvest/test_dimensions.py
@@ -22,6 +22,7 @@ def test_doi_orcids_dict(tmpdir):
 
     assert len(doi_orcids) > 0
     assert doi_orcids["10.1109/lra.2018.2890209"] == ["0000-0002-0770-2940"]
+    assert all("https://doi.org/" not in doi for doi in doi_orcids), "doi is an ID"
 
 
 def test_publications_from_dois():
diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py
index 1fa0171..c355c81 100644
--- a/test/harvest/test_openalex.py
+++ b/test/harvest/test_openalex.py
@@ -30,6 +30,7 @@ def test_doi_orcids_pickle(tmp_path):
     assert len(mapping) > 0
 
     doi = list(mapping.keys())[0]
+    assert all("https://doi.org/" not in key for key in mapping), "doi is an ID"
     assert "/" in doi
 
     orcids = mapping[doi]