diff --git a/sdrf_pipelines/__init__.py b/sdrf_pipelines/__init__.py index a3024cf..ca5c44e 100644 --- a/sdrf_pipelines/__init__.py +++ b/sdrf_pipelines/__init__.py @@ -1 +1 @@ -__version__ = "0.0.29" +__version__ = "0.0.30" diff --git a/sdrf_pipelines/ols/bto.parquet b/sdrf_pipelines/ols/bto.parquet index c9ab5c9..7f4a83f 100644 Binary files a/sdrf_pipelines/ols/bto.parquet and b/sdrf_pipelines/ols/bto.parquet differ diff --git a/sdrf_pipelines/ols/chebi.parquet b/sdrf_pipelines/ols/chebi.parquet index b65f279..c6a806a 100644 Binary files a/sdrf_pipelines/ols/chebi.parquet and b/sdrf_pipelines/ols/chebi.parquet differ diff --git a/sdrf_pipelines/ols/cl.parquet b/sdrf_pipelines/ols/cl.parquet index 8df93e8..520deb8 100644 Binary files a/sdrf_pipelines/ols/cl.parquet and b/sdrf_pipelines/ols/cl.parquet differ diff --git a/sdrf_pipelines/ols/clo.parquet b/sdrf_pipelines/ols/clo.parquet index 0db3334..dba023b 100644 Binary files a/sdrf_pipelines/ols/clo.parquet and b/sdrf_pipelines/ols/clo.parquet differ diff --git a/sdrf_pipelines/ols/efo-base.parquet b/sdrf_pipelines/ols/efo-base.parquet deleted file mode 100644 index 74135b5..0000000 Binary files a/sdrf_pipelines/ols/efo-base.parquet and /dev/null differ diff --git a/sdrf_pipelines/ols/efo.parquet b/sdrf_pipelines/ols/efo.parquet new file mode 100644 index 0000000..c1811c9 Binary files /dev/null and b/sdrf_pipelines/ols/efo.parquet differ diff --git a/sdrf_pipelines/ols/mondo.parquet b/sdrf_pipelines/ols/mondo.parquet index 54f890d..04626bb 100644 Binary files a/sdrf_pipelines/ols/mondo.parquet and b/sdrf_pipelines/ols/mondo.parquet differ diff --git a/sdrf_pipelines/ols/ncbitaxon.parquet b/sdrf_pipelines/ols/ncbitaxon.parquet index 5e2f982..5c0bf7c 100644 Binary files a/sdrf_pipelines/ols/ncbitaxon.parquet and b/sdrf_pipelines/ols/ncbitaxon.parquet differ diff --git a/sdrf_pipelines/ols/ncit.parquet b/sdrf_pipelines/ols/ncit.parquet index eb0c25f..c5b13f3 100644 Binary files a/sdrf_pipelines/ols/ncit.parquet and b/sdrf_pipelines/ols/ncit.parquet differ diff --git a/sdrf_pipelines/ols/ols.py b/sdrf_pipelines/ols/ols.py index ebfc4c6..45111bc 100644 --- a/sdrf_pipelines/ols/ols.py +++ b/sdrf_pipelines/ols/ols.py @@ -236,7 +236,24 @@ def build_ontology_index(ontology_file: str, output_file: str = None, ontology_n terms = [term for term in terms if "label" in term] df = pd.DataFrame(terms) - df.to_parquet(output_file, compression="gzip") + # Convert to lowercase as needed + df["accession"] = df["accession"].str.lower() + df["label"] = df["label"].str.lower() + df["ontology"] = df["ontology"].str.lower() + + # Enforce data types (schema) + df["accession"] = df["accession"].astype("string") # Ensuring a string type + df["label"] = df["label"].astype("string") # Ensuring a string type + df["ontology"] = df["ontology"].astype("string") # Ensuring a string type + + # Remove terms with no label or accession and print a warning + df = df.dropna(subset=["label", "accession"]) + if df.empty: + logger.warning("No terms found in %s", ontology_file) + raise ValueError(f"No terms found in {ontology_file}") + logger.info("Terms found in %s: %s", ontology_file, len(df)) + + df.to_parquet(output_file, compression="gzip", index=False) logger.info("Index has finished, output file: %s", output_file) def besthit(self, name, **kwargs): @@ -411,13 +428,25 @@ def cache_search(self, term: str, ontology: str, full_search: bool = False) -> l return [] if ontology is not None: + # Query for case-insensitive search and ensure all fields are cast to string duckdb_conn = duckdb.execute( - """SELECT * FROM read_parquet(?) WHERE lower(label) = lower(?) AND lower(ontology) = lower(?)""", + """SELECT CAST(accession AS VARCHAR) AS accession, + CAST(label AS VARCHAR) AS label, + CAST(ontology AS VARCHAR) AS ontology + FROM read_parquet(?) + WHERE lower(CAST(label AS VARCHAR)) = lower(?) + AND lower(CAST(ontology AS VARCHAR)) = lower(?)""", (self.parquet_files, term, ontology), ) else: + # Query for case-insensitive search without ontology duckdb_conn = duckdb.execute( - """SELECT * FROM read_parquet(?) WHERE lower(label) = lower(?)""", (self.parquet_files, term) + """SELECT CAST(accession AS VARCHAR) AS accession, + CAST(label AS VARCHAR) AS label, + CAST(ontology AS VARCHAR) AS ontology + FROM read_parquet(?) + WHERE lower(CAST(label AS VARCHAR)) = lower(?)""", + (self.parquet_files, term), ) df = duckdb_conn.fetchdf() diff --git a/sdrf_pipelines/ols/pato.parquet b/sdrf_pipelines/ols/pato.parquet index 58eeb40..313b3cf 100644 Binary files a/sdrf_pipelines/ols/pato.parquet and b/sdrf_pipelines/ols/pato.parquet differ diff --git a/sdrf_pipelines/ols/pride.parquet b/sdrf_pipelines/ols/pride.parquet index 77305f0..379ce71 100644 Binary files a/sdrf_pipelines/ols/pride.parquet and b/sdrf_pipelines/ols/pride.parquet differ diff --git a/sdrf_pipelines/ols/psi-ms.parquet b/sdrf_pipelines/ols/psi-ms.parquet index 32d0742..b6bd7ef 100644 Binary files a/sdrf_pipelines/ols/psi-ms.parquet and b/sdrf_pipelines/ols/psi-ms.parquet differ diff --git a/sdrf_pipelines/ols/uberon.parquet b/sdrf_pipelines/ols/uberon.parquet index e54f395..d412876 100644 Binary files a/sdrf_pipelines/ols/uberon.parquet and b/sdrf_pipelines/ols/uberon.parquet differ diff --git a/sdrf_pipelines/ols/unimod.parquet b/sdrf_pipelines/ols/unimod.parquet index ed8c630..9ee890a 100644 Binary files a/sdrf_pipelines/ols/unimod.parquet and b/sdrf_pipelines/ols/unimod.parquet differ