From 23e307fb0db4d32696c628a8c5639f4e8cbb7940 Mon Sep 17 00:00:00 2001 From: "Gwendolyn O. Gusak" <81755070+GwennyGit@users.noreply.github.com> Date: Wed, 16 Aug 2023 14:55:05 +0100 Subject: [PATCH] Merge hot fix for polish into main (#96) * Changed PyPI version badge For the next release the PyPI version badge stems now from 'shields.io' and not from 'badge.fury.io'. * Changed colour for refineGEMs version badge * Adjusted handling of BioCyc identifiers in polish_annotations #95 #58 * Added requirement for importlib_resources=5.13.0 to Pipfile * Added code to cope with missing sub-database prefixes for BioCyc identifiers #95 * Changed NaN identifier handling #95 * Fixed issue III: None prefix identifier pairs in invalid_curies.tsv #95 * Adjusted files with version for release 1.2.2 --- Pipfile | 1 + Pipfile.lock | 2 +- README.md | 4 +- docs/source/conf.py | 2 +- refinegems/polish.py | 133 ++++++++++++++++++++++++++++++++----------- setup.py | 2 +- 6 files changed, 107 insertions(+), 37 deletions(-) diff --git a/Pipfile b/Pipfile index 45ff4bc8..a1059a14 100644 --- a/Pipfile +++ b/Pipfile @@ -8,6 +8,7 @@ cobra = "==0.22.0" biopython = "==1.79" bioregistry = "==0.10.1" bioservices = "==1.7.11" +importlib_resources = "==5.13.0" memote = "==0.13.0" pandas = "==1.2.4" numpy = "==1.20.3" diff --git a/Pipfile.lock b/Pipfile.lock index 63506f4a..11c84474 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1694,7 +1694,7 @@ }, "refinegems": { "path": ".", - "version": "==1.2.1" + "version": "==1.2.2" }, "requests": { "hashes": [ diff --git a/README.md b/README.md index ada426f2..57b87a01 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -![GitHub Pipenv locked dependency version](https://img.shields.io/github/pipenv/locked/dependency-version/draeger-lab/refinegems/refinegems) +![GitHub Pipenv locked dependency 
version](https://img.shields.io/github/pipenv/locked/dependency-version/draeger-lab/refinegems/refinegems?label=refineGEMs&color=B4A069) ![GitHub Pipenv locked Python version](https://img.shields.io/github/pipenv/locked/python-version/draeger-lab/refinegems) [![Documentation Status](https://readthedocs.org/projects/refinegems/badge/?version=latest)](https://refinegems.readthedocs.io/en/latest/?badge=latest) ![GitHub last commit (branch)](https://img.shields.io/github/last-commit/draeger-lab/refinegems/main) ![Repo Size](https://img.shields.io/github/repo-size/draeger-lab/refinegems) -[![PyPI version](https://badge.fury.io/py/refineGEMs.svg)](https://badge.fury.io/py/refineGEMs) +![PyPI version](https://img.shields.io/pypi/v/refinegems?label=PyPI%20package&color=neongreen) ![PyPI - Format](https://img.shields.io/pypi/format/refinegems) [![PyPI downloads](https://img.shields.io/pypi/dm/refinegems.svg)](https://pypistats.org/packages/refinegems) [![DOI](https://zenodo.org/badge/359867657.svg)](https://zenodo.org/badge/latestdoi/359867657) diff --git a/docs/source/conf.py b/docs/source/conf.py index 5d2645f3..ba5fd774 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,7 +23,7 @@ author = 'Famke Bäuerle and Gwendolyn O. Gusak' # The full version, including alpha/beta/rc tags -release = '1.2.1' +release = '1.2.2' # -- General configuration --------------------------------------------------- diff --git a/refinegems/polish.py b/refinegems/polish.py index 98d5d673..d58941f9 100644 --- a/refinegems/polish.py +++ b/refinegems/polish.py @@ -19,6 +19,10 @@ from datetime import date __author__ = "Famke Baeuerle and Gwendolyn O. 
Gusak" + + +#------------------------------------------------ Constant variables --------------------------------------------------# +BIOCYC_TIER1_DATABASES_PREFIXES = ['META', 'ECO', 'ECOLI', 'HUMAN'] #----------- Functions to add URIs from the entity IDs to the annotation field for metabolites & reactions ------------# @@ -547,7 +551,7 @@ def cv_ncbiprotein(gene_list, email, protein_fasta: str, lab_strain: bool=False) logging.warning(f'The following {len(genes_missing_annotation)} genes have no annotation, name & label (locus tag): {genes_missing_annotation}') -#------------------- Functions to change the CURIE pattern/CVTerm qualifier & qualifier type --------------------------# +#------------------- Functions to change the CURIE pattern/CVTerm qualifier & qualifier type --------------------------# def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[str]], list[str]]: """| Gets a list of URIs | & maps the database prefixes to their respective identifier sets @@ -575,14 +579,50 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st curie = manager.parse_curie(extracted_curie) # Contains valid db prefix to identifiers pairs curie = list(curie) # Turn tuple into list to allow item assignment - if not curie[0]: # Need to do own parsing if prefix is not valid + if curie[0]: # Prefix is valid but to have same result for same databases need to do a bit of own parsing + if re.fullmatch('^biocyc$', curie[0], re.IGNORECASE): # Check for biocyc to also add metacyc if possible + # Always add META if BioCyc sub-database prefixes are missing + curie = curie if curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else [curie[0], f'META:{curie[1]}'] + + if 'META' in curie[1]: + if is_valid_identifier(*curie): # Get the valid BioCyc identifier & Add to dictionary + prefix, identifier = normalize_parsed_curie(*curie) + + if not curie_dict or (prefix not in curie_dict): + curie_dict[prefix] = SortedSet() +
curie_dict[prefix].add(identifier) + else: + invalid_curies.append(f'{curie[0]}:{curie[1]}') + # Add the MetaCyc identifier additionally + curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier + if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE): + curie[0] = 'metacyc.reaction' + else: + curie[0] = 'metacyc.compound' + elif 'metacyc.' in curie[0]: + if is_valid_identifier(*curie): # Get the valid MetaCyc identifier & Add to dictionary + prefix, identifier = normalize_parsed_curie(*curie) + + if not curie_dict or (prefix not in curie_dict): + curie_dict[prefix] = SortedSet() + curie_dict[prefix].add(identifier) + else: + invalid_curies.append(f'{curie[0]}:{curie[1]}') + + # Add the BioCyc identifier additionally + curie = ['biocyc', f'META:{curie[1]}'] # Metacyc identifier comes after 'META:' in biocyc identifier + elif re.fullmatch('^brenda$', curie[0], re.IGNORECASE): # Brenda & EC code is the same + curie[0] = 'eccode' + + elif not curie[0]: # Need to do own parsing if prefix is not valid # Get CURIEs irrespective of pattern if '/' in extracted_curie: extracted_curie = extracted_curie.split('/') # Check for NaN identifiers if re.fullmatch('^nan$', extracted_curie[0], re.IGNORECASE) or re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE): + # Only return strings where the database prefix is 'NaN' but a possible identifier could be contained if re.fullmatch('^nan$', extracted_curie[0], re.IGNORECASE) and not re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE): invalid_curies.append(f'{extracted_curie[0]}:{extracted_curie[1]}') continue @@ -595,34 +635,45 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st curie = (wrong_prefix[0], f'{wrong_prefix[1]}/{"/".join(extracted_curie[1:len(extracted_curie)])}') elif re.fullmatch('^brenda$', extracted_curie[0], re.IGNORECASE): # Brenda & EC code is the same curie = ('eccode', extracted_curie[1]) - elif re.fullmatch('^biocyc$', 
extracted_curie[0], re.IGNORECASE) or ('metacyc.' in extracted_curie[0]): # Check for bio- & metacyc + elif re.fullmatch('^biocyc$', extracted_curie[0], re.IGNORECASE): # Check for biocyc to also add metacyc if possible + # Always add META if BioCyc sub-database prefixes are missing + extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}' curie = ['biocyc', extracted_curie[1]] - if is_valid_identifier(*curie): # Get all valid identifiers + if 'META' in curie[1]: + if is_valid_identifier(*curie): # Get the valid BioCyc identifier & Add to dictionary + prefix, identifier = normalize_parsed_curie(*curie) + + if not curie_dict or (prefix not in curie_dict): + curie_dict[prefix] = SortedSet() + curie_dict[prefix].add(identifier) + else: + invalid_curies.append(f'{curie[0]}:{curie[1]}') + + # Add the MetaCyc identifier additionally + curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier + if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE): + curie[0] = 'metacyc.reaction' + else: + curie[0] = 'metacyc.compound' + elif 'metacyc.'
in extracted_curie[0]: + curie = extracted_curie + if is_valid_identifier(*curie): # Get the valid MetaCyc identifier & Add to dictionary prefix, identifier = normalize_parsed_curie(*curie) - + if not curie_dict or (prefix not in curie_dict): curie_dict[prefix] = SortedSet() curie_dict[prefix].add(identifier) - else: - invalid_curies.append(f'{prefix}:{identifier}') - - if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE): - curie[0] = 'metacyc.reaction' - else: - curie[0] = 'metacyc.compound' + invalid_curies.append(f'{curie[0]}:{curie[1]}') + # Add BioCyc identifier additionally + curie = ['biocyc', f'META:{curie[1]}'] # Metacyc identifier comes after 'META:' in biocyc identifier elif re.fullmatch('^chebi$', extracted_curie[0], re.IGNORECASE): new_curie = extracted_curie[1].split(':') - curie = (new_curie[0].lower(), new_curie[1]) - - # Checks for old pattern of SBO term URIs ('MIRIAM/sbo/SBO:identifier') - elif re.search('^sbo:', extracted_curie[1], re.IGNORECASE): - prefix = extracted_curie[0] - identifier = extracted_curie[1].split(':')[1] - + elif re.search('^sbo:', extracted_curie[1], re.IGNORECASE): # Checks for old pattern of SBO term URIs ('MIRIAM/sbo/SBO:identifier') + curie = [extracted_curie[0], extracted_curie[1].split(':')[1]] else: if re.fullmatch('^brenda$', extracted_curie[0], re.IGNORECASE) or re.fullmatch('^ec-code$', extracted_curie[0], re.IGNORECASE): # Brenda equals EC code, EC code in URI = ec-code curie[0] = 'eccode' @@ -635,27 +686,45 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st extracted_curie = extracted_curie.split(':') # Check for NaN identifiers - if re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE) or re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE): + if re.fullmatch('^nan$', extracted_curie[0], re.IGNORECASE) or re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE): + # Only return strings where the database prefix is 'NaN' but a possible identifier could be contained + if
re.fullmatch('^nan$', extracted_curie[0], re.IGNORECASE) and not re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE): + invalid_curies.append(f'{extracted_curie[0]}:{extracted_curie[1]}') continue + elif re.fullmatch('^biocyc$', extracted_curie[0], re.IGNORECASE): # Check for biocyc to also add metacyc if possible + # Always add META if BioCyc sub-database prefixes are missing + extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}' + curie = ['biocyc', extracted_curie[1]] - if re.fullmatch('^biocyc$', extracted_curie[0], re.IGNORECASE) or ('metacyc.' in extracted_curie[0]): # Check for bio- & metacyc - curie = ['biocyc', extracted_curie[-1]] - - if is_valid_identifier(*curie): # Get all valid identifiers + if 'META' in curie[1]: + if is_valid_identifier(*curie): # Get the valid BioCyc identifier & Add to dictionary + prefix, identifier = normalize_parsed_curie(*curie) + + if not curie_dict or (prefix not in curie_dict): + curie_dict[prefix] = SortedSet() + curie_dict[prefix].add(identifier) + else: + invalid_curies.append(f'{curie[0]}:{curie[1]}') + + # Add MetaCyc identifier additionally + curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier + if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE): + curie[0] = 'metacyc.reaction' + else: + curie[0] = 'metacyc.compound' + elif 'metacyc.'
in extracted_curie[0]: + curie = extracted_curie + if is_valid_identifier(*curie): # Get the valid MetaCyc identifier & Add to dictionary prefix, identifier = normalize_parsed_curie(*curie) if not curie_dict or (prefix not in curie_dict): curie_dict[prefix] = SortedSet() curie_dict[prefix].add(identifier) - else: - invalid_curies.append(f'{prefix}:{identifier}') - - if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE): - curie[0] = 'metacyc.reaction' - else: - curie[0] = 'metacyc.compound' + invalid_curies.append(f'{curie[0]}:{curie[1]}') + # Add BioCyc identifier additionally + curie = ['biocyc', f'META:{curie[1]}'] # Metacyc identifier comes after 'META:' in biocyc identifier else: if re.fullmatch('^brenda$', extracted_curie[0], re.IGNORECASE) or re.fullmatch('^ec-code$', extracted_curie[0], re.IGNORECASE): # Brenda equals EC code, EC code in URI = ec-code curie[0] = 'eccode' @@ -906,7 +975,7 @@ def polish_annotations(model: libModel, bioregistry: bool, new_pattern: bool, fi f'These invalid CURIEs are saved to {curies_filename}') invalid_curies_df = parse_dict_to_dataframe(all_entity2invalid_curies) invalid_curies_df.columns = ['entity', 'invalid_curie'] - invalid_curies_df[['prefix', 'identifier']] = invalid_curies_df.invalid_curie.str.split(':', expand = True) + invalid_curies_df[['prefix', 'identifier']] = invalid_curies_df.invalid_curie.str.split(':', n=1, expand = True) # Required for identifiers that also contain a ':' invalid_curies_df = invalid_curies_df.drop('invalid_curie', axis=1) invalid_curies_df.to_csv(curies_filename, sep='\t') diff --git a/setup.py b/setup.py index efb16e3a..83b20559 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ readme = readme_file.read() setup(name='refineGEMs', - version='1.2.1', + version='1.2.2', description='refineGEMs: a python package intended to help with the curation of genome-scale metabolic models (GEMS)', long_description=readme, long_description_content_type='text/markdown',