Commit
Merge pull request #99 from sul-dlss-labs/data-fields
Add new authors.csv, adjust published columns and tests
edsu authored Sep 26, 2024
2 parents 6d294e8 + fd68077 commit a0cfdb1
Showing 5 changed files with 21 additions and 34 deletions.
6 changes: 1 addition & 5 deletions rialto_airflow/harvest/merge_pubs.py
@@ -57,11 +57,7 @@ def dimensions_pubs_df(dimensions_pubs):
"document_type",
"funders",
"funding_section",
"linkout",
"open_access",
"publisher",
"research_orgs",
"researchers",
"title",
"type",
"year",
@@ -86,7 +82,7 @@ def openalex_pubs_df(openalex_pubs):
"publication_year",
"title",
"type",
"best_oa_location",
"open_access",
),
)
df = df.rename(lambda column_name: "openalex_" + column_name)
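The hunk above drops several Dimensions columns and swaps the OpenAlex best_oa_location column for open_access in the columns that merge_pubs selects. A minimal sketch of that select-and-prefix pattern in Polars (the scan_csv call and the function signature are assumptions, not the repo's exact code):

    import polars as pl

    def openalex_pubs_df(openalex_pubs_csv: str) -> pl.LazyFrame:
        # lazily read the harvested OpenAlex CSV and keep only the merge columns
        df = pl.scan_csv(openalex_pubs_csv)
        df = df.select(["doi", "publication_year", "title", "type", "open_access"])
        # prefix every column so it cannot collide with SUL-Pub or Dimensions columns
        return df.rename(lambda column_name: "openalex_" + column_name)
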
15 changes: 9 additions & 6 deletions rialto_airflow/harvest/openalex.py
@@ -37,7 +37,7 @@ def doi_orcids_pickle(authors_csv, pickle_file, limit=None):

def dois_from_orcid(orcid: str, limit=None):
"""
Pass in the ORCID ID and get back an iterator of DOIs for publications authored by that person.
Pass in the ORCID ID and get back a list of DOIs for publications authored by that person.
"""

# TODO: I think we can maybe have this function take a list of orcids and
@@ -57,16 +57,18 @@ def dois_from_orcid(orcid: str, limit=None):
author_id = authors[0]["id"]

# get all the works for the openalex author id
work_count = 0
dois = set()
for page in (
Works().filter(author={"id": author_id}).select(["doi"]).paginate(per_page=200)
):
for pub in page:
if pub.get("doi"):
work_count += 1
if limit is not None and work_count > limit:
return
yield pub.get("doi").replace("https://doi.org/", "")
doi = pub.get("doi").replace("https://doi.org/", "")
dois.add(doi)
if limit is not None and len(dois) == limit:
return list(dois)

return list(dois)


def publications_csv(dois: list, csv_file: str) -> None:
@@ -147,6 +149,7 @@ def normalize_publication(pub) -> dict:
"id",
"ids",
"indexed_in",
"institution_assertions",
"institutions_distinct_count",
"is_authors_truncated",
"is_paratext",
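Earlier in this file, the changes to dois_from_orcid turn it from a generator that yielded DOIs into a function that returns a de-duplicated list, stopping once the optional limit is reached. A hedged sketch of the revised function assembled from the visible diff lines (the Authors lookup and the empty-result handling are assumptions filled in around the diff):

    from pyalex import Authors, Works

    def dois_from_orcid(orcid: str, limit=None):
        # look up the OpenAlex author record for this ORCID (assumed lookup step)
        authors = Authors().filter(orcid=orcid).get()
        if len(authors) == 0:
            return []
        author_id = authors[0]["id"]

        # collect unique DOIs across pages of 200 works, stopping at the limit
        dois = set()
        for page in (
            Works().filter(author={"id": author_id}).select(["doi"]).paginate(per_page=200)
        ):
            for pub in page:
                if pub.get("doi"):
                    doi = pub.get("doi").replace("https://doi.org/", "")
                    dois.add(doi)
                    if limit is not None and len(dois) == limit:
                        return list(dois)

        return list(dois)
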
13 changes: 0 additions & 13 deletions rialto_airflow/harvest/sul_pub.py
@@ -7,29 +7,16 @@
SUL_PUB_FIELDS = [
"authorship",
"title",
"abstract",
"author",
"year",
"type",
"mesh_headings",
"publisher",
"journal",
"provenance",
"doi",
"issn",
"sulpubid",
"sw_id",
"pmid",
"identifier",
"last_updated",
"pages",
"date",
"country",
"booktitle",
"edition",
"series",
"chapter",
"editor",
]


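The trimmed SUL_PUB_FIELDS list now names only the columns kept downstream. As an illustration of how such a field list is typically applied (subset_record is a hypothetical helper, not code from the repo), each raw SUL-Pub record can be reduced to exactly these keys:

    SUL_PUB_FIELDS = ["authorship", "title", "doi", "sulpubid"]  # abbreviated for the example

    def subset_record(record: dict) -> dict:
        # keep only the configured fields, leaving any missing ones as None
        return {field: record.get(field) for field in SUL_PUB_FIELDS}
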
10 changes: 5 additions & 5 deletions test/harvest/test_merge_pubs.py
@@ -84,7 +84,7 @@ def openalex_pubs_csv(tmp_path):
"title",
"type",
"doi",
"best_oa_location",
"open_access",
]
writer.writerow(header)
writer.writerow(
@@ -97,7 +97,7 @@ def openalex_pubs_csv(tmp_path):
"A Publication",
"article",
"https://doi.org/10.0000/cccc",
'{is_oa: true, landing_page_url: "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1398957", pdf_url: null, source: { id: "https://openalex.org/S2764455111", display_name: "PubMed Central", issn_l: null, issn: null, host_organization: "https://openalex.org/I1299303238", type: "repository" }, license: null, version: "publishedVersion"}',
"green",
]
)
writer.writerow(
@@ -110,7 +110,7 @@ def openalex_pubs_csv(tmp_path):
"A Research Article",
"article",
"https://doi.org/10.0000/1234",
"",
"bronze",
]
)
return fixture_file
@@ -165,7 +165,7 @@ def test_openalex_pubs_df(openalex_pubs_csv):
df = lazy_df.collect()
assert df.shape[0] == 2
assert "bogus" not in df.columns, "Unneeded columns have been dropped"
assert "openalex_best_oa_location" in df.columns
assert "openalex_open_access" in df.columns
assert df["openalex_doi"].to_list() == ["10.0000/cccc", "10.0000/1234"]


@@ -193,7 +193,7 @@ def test_merge(tmp_path, sul_pubs_csv, openalex_pubs_csv, dimensions_pubs_csv):
assert output.is_file(), "output file has been created"
df = pl.read_parquet(output)
assert df.shape[0] == 5
assert df.shape[1] == 25
assert df.shape[1] == 21
assert set(df["doi"].to_list()) == set(
["10.0000/aaaa", "10.0000/1234", "10.0000/cccc", "10.0000/dddd", "10.0000/eeee"]
)
11 changes: 6 additions & 5 deletions test/harvest/test_openalex.py
@@ -18,7 +18,7 @@ def test_dois_from_orcid_paging():
# for Shanhui Fan who has a lot of publications (> 1300)
dois = list(openalex.dois_from_orcid("0000-0002-0081-9732", limit=300))
assert len(dois) == 300, "paging is limiting to 200 works"
assert len(set(dois)) == 300, "the dois are unique"
assert len(set(dois)) == len(dois), "the dois are unique"


def test_doi_orcids_pickle(tmp_path):
@@ -48,11 +48,12 @@ def test_publications_from_dois():

# look up the publication metadata for them
pubs = list(openalex.publications_from_dois(dois))
assert len(pubs) == 231, "should paginate (page size=200)"
assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique"

# >= is used because sometimes there can be multiple works for a DOI!
assert len(pubs) >= 231, "should paginate (page size=200)"
assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for."
assert len(pubs[0].keys()) == 52, "first publication has 52 columns"
assert len(pubs[1].keys()) == 52, "second publication has 52 columns"
assert len(pubs[0].keys()) == 53, "first publication has 53 columns"
assert len(pubs[1].keys()) == 53, "second publication has 53 columns"


def test_publications_from_invalid_dois(caplog):
