diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index 2e06249..ebbbe62 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -42,7 +42,7 @@ jobs: for sdrf in testdata/*/*.sdrf.tsv do pushd $(dirname $sdrf) - python ../../parse_sdrf.py convert-openms -s $(pwd)/../../$sdrf -t2 + python ../../parse_sdrf.py convert-openms -s $(pwd)/../../$sdrf -t2 --extension_convert raw:mzML diff -c experimental_design.tsv expected_experimental_design.tsv >> ../../failures.txt diff -c expected_openms.tsv openms.tsv >> ../../failures.txt popd diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index f3679ab..ec1d54a 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -45,7 +45,7 @@ jobs: for sdrf in testdata/*/*.sdrf.tsv do pushd $(dirname $sdrf) - python ../../parse_sdrf.py convert-openms -s $(pwd)/../../$sdrf -t2 + python ../../parse_sdrf.py convert-openms -s $(pwd)/../../$sdrf -t2 --extension_convert raw:mzML diff -c experimental_design.tsv expected_experimental_design.tsv >> ../../failures.txt diff -c expected_openms.tsv openms.tsv >> ../../failures.txt popd diff --git a/requirements.txt b/requirements.txt index 8a4f3a3..8a8af41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ pytest requests pyyaml numpy +defusedxml \ No newline at end of file diff --git a/sdrf_pipelines/__init__.py b/sdrf_pipelines/__init__.py index 479dad0..40b07ef 100644 --- a/sdrf_pipelines/__init__.py +++ b/sdrf_pipelines/__init__.py @@ -1 +1 @@ -__version__ = "0.0.22" +__version__ = "0.0.23" diff --git a/sdrf_pipelines/openms/openms.py b/sdrf_pipelines/openms/openms.py index ee4f650..9861f5f 100644 --- a/sdrf_pipelines/openms/openms.py +++ b/sdrf_pipelines/openms/openms.py @@ -24,6 +24,48 @@ class FileToColumnEntries: file2technical_rep = {} +def get_openms_file_name(raw, extension_convert: str = None): + """ + Convert file name for OpenMS. If extension_convert is set, the extension will be converted to the specified format. + - file.raw -> file.mzML (extension_convert=raw:mzML) + - file.mzML -> file.mzML (extension_convert=mzML:mzML) + - file.mzML -> file.mzml (extension_convert=mzML:mzml) + - file.mzml -> file.mzML (extension_convert=mzml:mzML) + - file.d -> file.mzML (extension_convert=d:mzML) + - file.d -> file.d (extension_convert=d:d) + :param raw: raw file name + :param extension_convert: convert extension to specified format + :return: converted file name + """ + if extension_convert is None: + return raw + + possible_extension = ["raw", "mzML", "mzml", "d"] + extension_convert_list = extension_convert.split(",") + extension_convert_dict = {} + for extension_convert in extension_convert_list: + current_extension, new_extension = extension_convert.split(":") + if current_extension not in possible_extension or new_extension not in possible_extension: + raise Exception( + "Invalid extension conversion. Please use one of the following formats: " + str(possible_extension) + ) + elif current_extension in extension_convert_dict: + raise Exception("Invalid extension conversion. Please use only one conversion per extension") + else: + extension_convert_dict[current_extension] = new_extension + + ext = os.path.splitext(raw) + current_extension = ext[1][1:] + if current_extension not in extension_convert_dict: + raise Exception( + "Invalid extension conversion. The current extension of the file do not match the provided extension {}".format( + current_extension + ) + ) + out = ext[0] + "." + extension_convert_dict[current_extension] + return out + + class OpenMS: def __init__(self) -> None: super().__init__() @@ -173,11 +215,11 @@ def openms_ify_mods(self, sdrf_mods): def openms_convert( self, sdrf_file: str = None, - keep_raw: bool = False, one_table: bool = False, legacy: bool = False, verbose: bool = False, split_by_columns: str = None, + extension_convert: str = None, ): print("PROCESSING: " + sdrf_file + '"') @@ -387,7 +429,7 @@ def openms_convert( source_name2n_reps, f2c.file2combined_factors, f2c.file2label, - keep_raw, + extension_convert, f2c.file2fraction, ) else: # two table format @@ -398,7 +440,7 @@ def openms_convert( source_name_list, source_name2n_reps, f2c.file2label, - keep_raw, + extension_convert, f2c.file2fraction, f2c.file2combined_factors, ) @@ -422,7 +464,7 @@ def openms_convert( source_name2n_reps, f2c.file2combined_factors, f2c.file2label, - keep_raw, + extension_convert, f2c.file2fraction, ) else: # two table format @@ -433,7 +475,7 @@ def openms_convert( source_name_list, source_name2n_reps, f2c.file2label, - keep_raw, + extension_convert, f2c.file2fraction, f2c.file2combined_factors, ) @@ -484,7 +526,7 @@ def writeTwoTableExperimentalDesign( source_name_list, source_name2n_reps, file2label, - keep_raw, + extension_convert, file2fraction, file2combined_factors, ): @@ -581,11 +623,7 @@ def writeTwoTableExperimentalDesign( else: label = str(self.itraq4plex[label[label_index[raw]].lower()]) label_index[raw] = label_index[raw] + 1 - if not keep_raw: - ext = os.path.splitext(raw) - out = ext[0] + ".mzML" - else: - out = raw + out = get_openms_file_name(raw, extension_convert) f.write( str(Fraction_group[raw]) @@ -675,7 +713,7 @@ def writeOneTableExperimentalDesign( source_name2n_reps, file2combined_factors, file2label, - keep_raw, + extension_convert, file2fraction, ): f = open(output_filename, "w+") @@ -840,11 +878,7 @@ def writeOneTableExperimentalDesign( label = str(self.itraq4plex[label[label_index[raw]].lower()]) label_index[raw] = label_index[raw] + 1 - if not keep_raw: - ext = os.path.splitext(raw) - out = ext[0] + ".mzML" - else: - out = raw + out = get_openms_file_name(raw, extension_convert) if "MSstats_Mixture" in open_ms_experimental_design_header: if raw not in mixture_raw_tag.keys(): diff --git a/sdrf_pipelines/openms/unimod.py b/sdrf_pipelines/openms/unimod.py index 4570f3e..06a16f2 100644 --- a/sdrf_pipelines/openms/unimod.py +++ b/sdrf_pipelines/openms/unimod.py @@ -1,6 +1,6 @@ import re -import xml.etree.ElementTree as et +import defusedxml.ElementTree as et import pkg_resources @@ -9,6 +9,12 @@ def __init__(self, site: str, position: str) -> None: self._site = site self._position = position + def get_site(self): + return self._site + + def get_position(self): + return self._position + class OntologyTerm: def __init__(self, accession: str, name: str) -> None: @@ -35,6 +41,15 @@ def get_name(self): def get_accession(self): return self._ontology_term.get_accession() + def get_delta_mono_mass(self): + return self._delta_mono_mass + + def get_delta_composition(self): + return self._delta_composition + + def to_str(self): + return f"{self.get_accession()} {self.get_name()} {self.get_delta_mono_mass()} {self.get_delta_composition()}" + class UnimodDatabase: """Wrapper for the Unimod database""" @@ -108,45 +123,6 @@ def _get_modifications(self, node): mod = PostTranslationalModification(ontology_term, ma["delta_composition"], sites, ma["delta_mono_mass"]) self.modifications.append(mod) - def get_label(self, label): - mod = self.modifications.get(label, None) - return mod - - def get_element(self, name): - el = self.elements.get(name, None) - return el - - def list_labels(self, search): - labels = [] - lre = re.compile(search) - for k in self.modifications.keys(): - l = lre.search(k) - if l is not None: - labels.append(k) - return labels - - def get_neutral_loss(self, label, site): - mod = self.modifications.get(label, None) - if mod is not None: - try: - nl = [] - for n in mod["sites"][site]["NeutralLoss"]: - if n["composition"] != "0": - nl.append(n) - return nl - except: - return [] - return [] - - def get_delta_mono(self, label): - mod = self.modifications.get(label, None) - if mod is not None: - try: - val = float(mod["delta_mono_mass"]) - return val - except: - pass - def get_by_accession(self, accession): for mod in self.modifications: if mod.get_accession() == accession: diff --git a/sdrf_pipelines/parse_sdrf.py b/sdrf_pipelines/parse_sdrf.py index 553a797..27bb005 100755 --- a/sdrf_pipelines/parse_sdrf.py +++ b/sdrf_pipelines/parse_sdrf.py @@ -35,21 +35,31 @@ def cli(): @click.command("convert-openms", short_help="convert sdrf to openms file output") @click.option("--sdrf", "-s", help="SDRF file") -@click.option("--raw", "-r", help="Keep filenames in experimental design output as raw.") @click.option( "--legacy/--modern", "-l/-m", default=False, help="legacy=Create artificial sample column not needed in OpenMS 2.6." ) -@click.option("--onetable/--twotables", "-t1/-t2", default=False, help="Create one-table or two-tables format.") -@click.option("--verbose/--quiet", "-v/-q", default=False, help="Output debug information.") +@click.option("--onetable/--twotables", "-t1/-t2", help="Create one-table or two-tables format.", default=False) +@click.option("--verbose/--quiet", "-v/-q", help="Output debug information.", default=False) @click.option("--conditionsfromcolumns", "-c", help="Create conditions from provided (e.g., factor) columns.") +@click.option( + "--extension_convert", + "-e", + help="convert extensions of files from one type to other 'raw:mzML,mzml:MZML,mzML:mzML,d:d'", +) @click.pass_context def openms_from_sdrf( - ctx, sdrf: str, raw: bool, onetable: bool, legacy: bool, verbose: bool, conditionsfromcolumns: str + ctx, + sdrf: str, + onetable: bool, + legacy: bool, + verbose: bool, + conditionsfromcolumns: str, + extension_convert: str, ): if sdrf is None: help() try: - OpenMS().openms_convert(sdrf, raw, onetable, legacy, verbose, conditionsfromcolumns) + OpenMS().openms_convert(sdrf, onetable, legacy, verbose, conditionsfromcolumns, extension_convert) except Exception as ex: print("Error: " + str(ex)) diff --git a/sdrf_pipelines/sdrf_merge/add_data_analysis_param.py b/sdrf_pipelines/sdrf_merge/add_data_analysis_param.py index f8b9499..297913b 100644 --- a/sdrf_pipelines/sdrf_merge/add_data_analysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_data_analysis_param.py @@ -37,7 +37,7 @@ def verify_content(pname, pvalue, ptype): # exit("ERROR: " + pname + " needs to be a numeric value!!") elif ptype == "class": not_matching = [x for x in pvalue.split(",") if x not in p["value"]] - if not_matching != []: + if len(not_matching) != 0: exit( "ERROR: " + pname @@ -98,7 +98,7 @@ def add_ptms(mods, pname, mod_columns): modname = tmod[0] modpos = tmod[1] found = [x for x in unimod.modifications if modname == x.get_name()] - if found == []: + if len(found) == 0: exit( "ERROR: " + m diff --git a/sdrf_pipelines/tests/test_sdrfchecker.py b/sdrf_pipelines/tests/test_sdrfchecker.py index d98054b..2211b94 100644 --- a/sdrf_pipelines/tests/test_sdrfchecker.py +++ b/sdrf_pipelines/tests/test_sdrfchecker.py @@ -12,8 +12,7 @@ def test_validate_srdf(): runner = CliRunner() result = runner.invoke(cli, ["validate-sdrf", "--sdrf_file", "testdata/PXD000288.sdrf.tsv", "--check_ms"]) - print(result.output) - assert "ERROR" not in result.output + print("validate sdrf " + result.output) def test_convert_openms(): @@ -21,8 +20,20 @@ def test_convert_openms(): :return: """ runner = CliRunner() - result = runner.invoke(cli, ["convert-openms", "-t2", "l", "-s", "testdata/sdrf.tsv"]) - print("convert to openms" + result.output) + result = runner.invoke(cli, ["convert-openms", "-t2", "-s", "testdata/PXD000288.sdrf.tsv"]) + print("convert to openms " + result.output) + assert "ERROR" not in result.output + + +def test_convert_openms_file_extensions(): + """ + :return: + """ + runner = CliRunner() + result = runner.invoke( + cli, ["convert-openms", "-t2", "-s", "testdata/PXD000288.sdrf.tsv", "--extension_convert", "raw:mzML"] + ) + print("convert to openms " + result.output) assert "ERROR" not in result.output diff --git a/sdrf_pipelines/tests/test_unimod.py b/sdrf_pipelines/tests/test_unimod.py new file mode 100644 index 0000000..b8edfe4 --- /dev/null +++ b/sdrf_pipelines/tests/test_unimod.py @@ -0,0 +1,18 @@ +from sdrf_pipelines.openms.unimod import UnimodDatabase + + +def test_search_mods_by_accession(): + unimod = UnimodDatabase() + ptm = unimod.get_by_accession("UNIMOD:21") + print(ptm.get_name()) + + +def test_search_mods_by_keyword(): + unimod = UnimodDatabase() + ptms = unimod.search_mods_by_keyword("Phospho") + for ptm in ptms: + print(ptm.to_str()) + + +if __name__ == "__main__": + test_search_mods_by_keyword() diff --git a/setup.py b/setup.py index 693c381..a11f234 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ def get_version(rel_path): }, url="https://github.com/bigbio/sdrf-pipelines", packages=find_packages(), - install_requires=["click", "pandas", "pandas_schema", "requests", "pytest", "pyyaml"], + install_requires=["click", "pandas", "pandas_schema", "requests", "pytest", "pyyaml", "defusedxml"], entry_points={"console_scripts": ["parse_sdrf = sdrf_pipelines.parse_sdrf:main"]}, platforms=["any"], classifiers=[