Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Small update to the ontology cache, with all cached ontologies regenerated #174

Merged
merged 4 commits into from
Oct 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sdrf_pipelines/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.29"
__version__ = "0.0.30"
Binary file modified sdrf_pipelines/ols/bto.parquet
Binary file not shown.
Binary file modified sdrf_pipelines/ols/chebi.parquet
Binary file not shown.
Binary file modified sdrf_pipelines/ols/cl.parquet
Binary file not shown.
Binary file modified sdrf_pipelines/ols/clo.parquet
Binary file not shown.
Binary file removed sdrf_pipelines/ols/efo-base.parquet
Binary file not shown.
Binary file added sdrf_pipelines/ols/efo.parquet
Binary file not shown.
Binary file modified sdrf_pipelines/ols/mondo.parquet
Binary file not shown.
Binary file modified sdrf_pipelines/ols/ncbitaxon.parquet
Binary file not shown.
Binary file modified sdrf_pipelines/ols/ncit.parquet
Binary file not shown.
35 changes: 32 additions & 3 deletions sdrf_pipelines/ols/ols.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,24 @@ def build_ontology_index(ontology_file: str, output_file: str = None, ontology_n
terms = [term for term in terms if "label" in term]
df = pd.DataFrame(terms)

df.to_parquet(output_file, compression="gzip")
# Convert to lowercase as needed
df["accession"] = df["accession"].str.lower()
df["label"] = df["label"].str.lower()
df["ontology"] = df["ontology"].str.lower()

# Enforce data types (schema)
df["accession"] = df["accession"].astype("string") # Ensuring a string type
df["label"] = df["label"].astype("string") # Ensuring a string type
df["ontology"] = df["ontology"].astype("string") # Ensuring a string type

# Remove terms with no label or accession; if none remain, log a warning and raise
df = df.dropna(subset=["label", "accession"])
if df.empty:
logger.warning("No terms found in %s", ontology_file)
raise ValueError(f"No terms found in {ontology_file}")
logger.info("Terms found in %s: %s", ontology_file, len(df))

df.to_parquet(output_file, compression="gzip", index=False)
logger.info("Index has finished, output file: %s", output_file)

def besthit(self, name, **kwargs):
Expand Down Expand Up @@ -411,13 +428,25 @@ def cache_search(self, term: str, ontology: str, full_search: bool = False) -> l
return []

if ontology is not None:
# Query for case-insensitive search and ensure all fields are cast to string
duckdb_conn = duckdb.execute(
"""SELECT * FROM read_parquet(?) WHERE lower(label) = lower(?) AND lower(ontology) = lower(?)""",
"""SELECT CAST(accession AS VARCHAR) AS accession,
CAST(label AS VARCHAR) AS label,
CAST(ontology AS VARCHAR) AS ontology
FROM read_parquet(?)
WHERE lower(CAST(label AS VARCHAR)) = lower(?)
AND lower(CAST(ontology AS VARCHAR)) = lower(?)""",
(self.parquet_files, term, ontology),
)
else:
# Query for case-insensitive search without ontology
duckdb_conn = duckdb.execute(
"""SELECT * FROM read_parquet(?) WHERE lower(label) = lower(?)""", (self.parquet_files, term)
"""SELECT CAST(accession AS VARCHAR) AS accession,
CAST(label AS VARCHAR) AS label,
CAST(ontology AS VARCHAR) AS ontology
FROM read_parquet(?)
WHERE lower(CAST(label AS VARCHAR)) = lower(?)""",
(self.parquet_files, term),
)
df = duckdb_conn.fetchdf()

Expand Down
Binary file modified sdrf_pipelines/ols/pato.parquet
Binary file not shown.
Binary file modified sdrf_pipelines/ols/pride.parquet
Binary file not shown.
Binary file modified sdrf_pipelines/ols/psi-ms.parquet
Binary file not shown.
Binary file modified sdrf_pipelines/ols/uberon.parquet
Binary file not shown.
Binary file modified sdrf_pipelines/ols/unimod.parquet
Binary file not shown.
Loading