diff --git a/sdrf_pipelines/parse_sdrf.py b/sdrf_pipelines/parse_sdrf.py index d10e36c..4b519e4 100755 --- a/sdrf_pipelines/parse_sdrf.py +++ b/sdrf_pipelines/parse_sdrf.py @@ -142,8 +142,9 @@ def maxquant_from_sdrf( is_flag=True, ) @click.option("--skip_factor_validation", help="Disable the validation of factor values in SDRF", is_flag=True) +@click.option("--skip_experimental_design_validation", help="Disable the validation of experimental design", is_flag=True) @click.pass_context -def validate_sdrf(ctx, sdrf_file: str, template: str, skip_ms_validation: bool, skip_factor_validation: bool): +def validate_sdrf(ctx, sdrf_file: str, template: str, skip_ms_validation: bool, skip_factor_validation: bool, skip_experimental_design_validation: bool): """ Command to validate the SDRF file. The validation is based on the template provided by the user. User can select the template to be used for validation. If no template is provided, the default template will be used. @@ -154,6 +155,7 @@ def validate_sdrf(ctx, sdrf_file: str, template: str, skip_ms_validation: bool, @param template: template to be used for validation @param skip_ms_validation: flag to skip the validation of mass spectrometry fields @param skip_factor_validation: flag to skip the validation of factor values + @param skip_experimental_design_validation: flag to skip the validation of experimental design """ if sdrf_file is None: @@ -173,6 +175,9 @@ def validate_sdrf(ctx, sdrf_file: str, template: str, skip_ms_validation: bool, if not skip_factor_validation: errors = errors + df.validate_factor_values() + if not skip_experimental_design_validation: + errors = errors + df.validate_experimental_design() + for error in errors: print(error) diff --git a/sdrf_pipelines/sdrf/sdrf.py b/sdrf_pipelines/sdrf/sdrf.py index 87b8e3e..204e1b4 100644 --- a/sdrf_pipelines/sdrf/sdrf.py +++ b/sdrf_pipelines/sdrf/sdrf.py @@ -32,7 +32,7 @@ def _constructor(self): def get_sdrf_columns(self): """ - This method return the name of 
def validate_experimental_design(self) -> List[LogicError]:
    """
    Validate the experimental design described by the SDRF.

    At the moment a single rule is enforced: the pairing between ``assay name`` and
    ``comment[data file]`` must be one-to-one, i.e. a raw file can only have one associated
    assay name and an assay name can only point to one raw file.

    :return: A list of LogicError objects if the experimental design is incorrect, otherwise an empty list.
    """
    # Start from a fresh list; the helper appends to it and hands the same list back.
    return self.check_inconsistencies_assay_file([])


def check_inconsistencies_assay_file(self, errors: List[LogicError]) -> List[LogicError]:
    """
    Check that the combination of ``assay name`` and ``comment[data file]`` is unique in self.

    Two directions are checked independently:
    - an assay name that is associated with more than one distinct data file;
    - a data file that is associated with more than one distinct assay name.

    Rows that repeat the very same (assay name, data file) pair are not reported, because only
    the number of *distinct* partner values per group is counted.

    :param errors: list of errors collected so far; new errors are appended to it in place.
    :return: The same ``errors`` list, extended with one LogicError per violated direction
        (or with a single LogicError when a required column is missing).
    """
    assay_col = "assay name"
    file_col = "comment[data file]"

    # Without both columns the groupby below would raise a bare KeyError and abort the whole
    # validation run; report the problem as a regular validation error instead.
    missing_columns = [col for col in (assay_col, file_col) if col not in self.columns]
    if missing_columns:
        error_message = (
            f"Cannot check the combination assay name and comment[data file], "
            f"missing column(s): {missing_columns}"
        )
        errors.append(LogicError(error_message, error_type=logging.ERROR))
        return errors

    # Direction 1: group by assay name -> the offending index values are ASSAY NAMES that
    # own several distinct raw files.
    files_per_assay = self.groupby(assay_col)[file_col].nunique()
    assays_with_many_files = files_per_assay[files_per_assay > 1]
    if len(assays_with_many_files) > 0:
        cell_index = assays_with_many_files.index.tolist()
        error_message = f"Multiple raw files with the same assay: {cell_index}, the combination assay name and comment[data file] should be unique"
        errors.append(LogicError(error_message, error_type=logging.ERROR))

    # Direction 2: group by data file -> the offending index values are RAW FILES that are
    # shared by several distinct assay names.
    assays_per_file = self.groupby(file_col)[assay_col].nunique()
    files_with_many_assays = assays_per_file[assays_per_file > 1]
    if len(files_with_many_assays) > 0:
        cell_index = files_with_many_assays.index.tolist()
        error_message = f"Multiple assays with the same raw files: {cell_index}, the combination assay name and comment[data file] should be unique"
        errors.append(LogicError(error_message, error_type=logging.ERROR))

    return errors
value[concentration of] E1S1 Mus musculus immune system B cell not applicable mercury poisoning enrichment of phosphorylated protein 1 none CLO:0009575 run 1 proteomic profiling by mass spectrometry 1342_01.RAW 1 1 AC=MS:1002038;NT=label free sample NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 NT=Acetyl;AC=UNIMOD:1;PP=Protein N-term;MT=variable NT=Phospho;MT=variable;TA=S,T,Y;AC=UNIMOD:21 NT=LTQ;AC=MS:1000447 https://ftp.ebi.ac.uk/pride-archive/2013/12/PXD000312/1342_01.RAW none not applicable -E1S2 Mus musculus immune system B cell not applicable mercury poisoning enrichment of phosphorylated protein 1 none CLO:0009575 run 2 proteomic profiling by mass spectrometry 1342_02.RAW 1 1 AC=MS:1002038;NT=label free sample NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 NT=Acetyl;AC=UNIMOD:1;PP=Protein N-term;MT=variable NT=Phospho;MT=variable;TA=S,T,Y;AC=UNIMOD:21 NT=LTQ;AC=MS:1000447 https://ftp.ebi.ac.uk/pride-archive/2013/12/PXD000312/1342_02.RAW none not applicable +E1S2 Mus musculus immune system B cell not applicable mercury poisoning enrichment of phosphorylated protein 1 none CLO:0009575 run 1 proteomic profiling by mass spectrometry 1342_02.RAW 1 1 AC=MS:1002038;NT=label free sample NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 NT=Acetyl;AC=UNIMOD:1;PP=Protein N-term;MT=variable NT=Phospho;MT=variable;TA=S,T,Y;AC=UNIMOD:21 NT=LTQ;AC=MS:1000447 https://ftp.ebi.ac.uk/pride-archive/2013/12/PXD000312/1342_02.RAW none not applicable E1S3 Mus musculus immune system B cell not applicable mercury poisoning enrichment of phosphorylated protein 1 none CLO:0009575 run 3 proteomic profiling by mass spectrometry 1342_03.RAW 1 1 AC=MS:1002038;NT=label free sample NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 
NT=Acetyl;AC=UNIMOD:1;PP=Protein N-term;MT=variable NT=Phospho;MT=variable;TA=S,T,Y;AC=UNIMOD:21 NT=LTQ;AC=MS:1000447 https://ftp.ebi.ac.uk/pride-archive/2013/12/PXD000312/1342_03.RAW none not applicable E1S4 Mus musculus immune system B cell not applicable mercury poisoning enrichment of phosphorylated protein 1 none CLO:0009575 run 4 proteomic profiling by mass spectrometry 1342_04.RAW 1 1 AC=MS:1002038;NT=label free sample NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 NT=Acetyl;AC=UNIMOD:1;PP=Protein N-term;MT=variable NT=Phospho;MT=variable;TA=S,T,Y;AC=UNIMOD:21 NT=LTQ;AC=MS:1000447 https://ftp.ebi.ac.uk/pride-archive/2013/12/PXD000312/1342_04.RAW none not applicable E1S5 Mus musculus immune system B cell not applicable mercury poisoning enrichment of phosphorylated protein 1 mercury dichloride CLO:0009575 run 5 proteomic profiling by mass spectrometry 1342_05.RAW 1 1 AC=MS:1002038;NT=label free sample NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 NT=Acetyl;AC=UNIMOD:1;PP=Protein N-term;MT=variable NT=Phospho;MT=variable;TA=S,T,Y;AC=UNIMOD:21 NT=LTQ;AC=MS:1000447 https://ftp.ebi.ac.uk/pride-archive/2013/12/PXD000312/1342_05.RAW mercury dichloride 250 uM