Skip to content

Commit

Permalink
Use low_memory for merge_pubs
Browse files (browse the repository at this point in the history)
  • Loading branch information
lwrubel committed Jul 18, 2024
1 parent 8f70f65 commit bd21116
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions rialto_airflow/harvest/merge_pubs.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -49,6 +49,7 @@ def dimensions_pubs_df(dimensions_pubs):
df = pl.scan_csv(
dimensions_pubs,
schema_overrides={"volume": pl.String, "pmid": pl.String, "year": pl.String},
+ low_memory=True,
)
df = df.select(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
Expand All @@ -74,7 +75,9 @@ def openalex_pubs_df(openalex_pubs):
"""
Create an openalex pubs LazyFrame and rename columns
"""
- df = pl.scan_csv(openalex_pubs, schema_overrides={"publication_year": pl.String})
+ df = pl.scan_csv(
+ openalex_pubs, schema_overrides={"publication_year": pl.String}, low_memory=True
+ )
df = df.select(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
pl.col(
Expand All @@ -89,7 +92,11 @@ def sulpub_df(sul_pub):
"""
Create a sulpub LazyFrame and rename columns
"""
- df = pl.scan_csv(sul_pub, schema_overrides={"year": pl.String, "pmid": pl.String})
+ df = pl.scan_csv(
+ sul_pub,
+ schema_overrides={"year": pl.String, "pmid": pl.String},
+ low_memory=True,
+ )
df = df.drop_nulls("doi")
df = df.with_columns(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String)
Expand Down

0 comments on commit bd21116

Please sign in to comment.