diff --git a/CHANGELOG.md b/CHANGELOG.md index f50c5cbb..d4083f36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +# Version 0.9.0 + +* Updates to PointFinder database handling + * Adds the ability to handle promoters (regions with both promoter nucleotide information and non-promoter codon information) + * Adds handling of insertions and deletions in nucleotide and codon sequence + * Updates list of supported PointFinder species to `salmonella`, `campylobacter`, `enterococcus_faecalis`, `enterococcus_faecium`, `escherichia_coli`, `helicobacter_pylori`. +* Switch name `e.coli` to `escherichia_coli` in PointFinder gene-drug key to match organism name in PointFinder database. + # Version 0.8.0 * Fixed issue when using older version of pandas (#136) (0.8.0.dev0). diff --git a/README.md b/README.md index a134733c..0d466d37 100644 --- a/README.md +++ b/README.md @@ -241,12 +241,12 @@ staramr db restore-default By default, the ResFinder/PointFinder/PlasmidFinder genes listed in [genes_to_exclude.tsv][] will be excluded from the final results. To pass a custom list of genes the option `--exclude-genes-file` can be used, where the file specified will contains a list of the sequence ids (one per line) from the ResFinder/PointFinder/PlasmidFinder databases. For example: ``` -#gene_id +gene_id aac(6')-Iaa_1_NC_003197 ColpVC_1__JX133088 ``` -Please make sure to include `#gene_id` in the first line. The default exclusion list can also be disabled with `--no-exclude-genes`. +Please make sure to include `gene_id` in the first line. The default exclusion list can also be disabled with `--no-exclude-genes`. # Output diff --git a/staramr/__init__.py b/staramr/__init__.py index 32a90a3b..b3f7e62a 100644 --- a/staramr/__init__.py +++ b/staramr/__init__.py @@ -1 +1 @@ -__version__ = '0.8.0' +__version__ = '0.9.0.dev0' diff --git a/staramr/blast/pointfinder/PointfinderBlastDatabase.py b/staramr/blast/pointfinder/PointfinderBlastDatabase.py index 58219a69..8b33d030 100644 --- a/staramr/blast/pointfinder/PointfinderBlastDatabase.py +++ b/staramr/blast/pointfinder/PointfinderBlastDatabase.py @@ -71,6 +71,28 @@ def get_resistance_nucleotides(self, gene, nucleotide_mutations): """ return self._pointfinder_info.get_resistance_nucleotides(gene, nucleotide_mutations) + def get_resistance_promoter(self, gene, nucleotide_mutations): + """ + Gets a list of resistance nucleotides and codons located within the promoter, derived from the list + of nucleotide mutations. + :param gene: The name of the gene. + :param nucleotide_mutations: The positions of the nucleotide mutations. + :return: The resistance positions (both nucleotide and codon positions). + """ + + # Get the mutations in the nucleotide (non-gene) part: + # Filter the list for negative coordinate positions. + nucleotide_part = list(filter(lambda x: (x._nucleotide_position_amr_gene < 0), nucleotide_mutations)) + resistance_nucleotides = self._pointfinder_info.get_resistance_nucleotides(gene, nucleotide_part) + + # Get the mutations in the coding part: + # Filter the list for non-negative coordinate positions. + codon_part = list(filter(lambda x: (x._nucleotide_position_amr_gene >= 0), nucleotide_mutations)) + resistance_codons = self._pointfinder_info.get_resistance_codons(gene, codon_part) + + # Combine and return the results: + return resistance_nucleotides + resistance_codons + def get_organism(self): """ Gets the particular organism of this database. @@ -87,7 +109,8 @@ def get_available_organisms(cls): A Class Method to get a list of organisms that are currently supported by staramr. :return: The list of organisms currently supported by staramr. """ - return ['salmonella', 'campylobacter'] + return ['salmonella', 'campylobacter', 'enterococcus_faecalis', 'enterococcus_faecium', + 'escherichia_coli', 'helicobacter_pylori'] @classmethod def get_organisms(cls, database_dir): diff --git a/staramr/blast/pointfinder/PointfinderDatabaseInfo.py b/staramr/blast/pointfinder/PointfinderDatabaseInfo.py index 08de3560..bca32238 100644 --- a/staramr/blast/pointfinder/PointfinderDatabaseInfo.py +++ b/staramr/blast/pointfinder/PointfinderDatabaseInfo.py @@ -2,6 +2,9 @@ from os import path import pandas as pd +import Bio.Seq +from staramr.blast.results.pointfinder.codon.CodonMutationPosition import CodonMutationPosition +from staramr.blast.results.pointfinder.codon.CodonInsertionPosition import CodonInsertionPosition from staramr.exceptions.GenotypePhenotypeMatchException import GenotypePhenotypeMatchException @@ -33,7 +36,15 @@ def from_file(cls, file): :param file: The file containing drug resistance mutations. :return: A new PointfinderDatabaseInfo. """ - pointfinder_info = pd.read_csv(file, sep='\t', index_col=False) + + with open(file) as f: + line = f.readline() + + line = line.lstrip("#") + column_names = line.split() + + pointfinder_info = pd.read_csv(file, sep='\t', index_col=False, comment='#', header=None, names=column_names) + return cls(pointfinder_info, file) @classmethod @@ -44,6 +55,18 @@ def from_pandas_table(cls, database_info_dataframe): :return: A new PointfinderDatabaseInfo. """ return cls(database_info_dataframe) + + @staticmethod + def to_codons(regex_match): + # Sometimes, the regex will match a string with a comma and return multiple matches. + # Ex: TTCATGGAC,TTC + # We need to make sure we handle these commas and multiple matches. + entries = regex_match.string.split(",") + + for i in range(0, len(entries)): + entries[i] = Bio.Seq.translate(entries[i], table='Standard').upper() + + return ",".join(entries) def _resistance_table_hacks(self, table): """ @@ -55,15 +78,72 @@ def _resistance_table_hacks(self, table): if self._file and 'salmonella' in str(self._file) and path.exists( path.join(path.dirname(self._file), '16S_rrsD.fsa')): logger.debug("Replacing [16S] with [16S_rrsD] for pointfinder organism [salmonella]") - table[['#Gene_ID']] = table[['#Gene_ID']].replace('16S', '16S_rrsD') + table[['Gene_ID']] = table[['Gene_ID']].replace('16S', '16S_rrsD') + + # We need to fix table entries where codon deletions are listed, + # but no reference nucleotides are listed in the corresponding entry. + # This will mean that this specific entry (for some reason) + # is using 0-based nucleotide coordinates and nucleotides for the Res_codon + # instead of 1-based codon coordinates and codons for the Res_codon. + # However, we need make an exception for entries where "promoter", "16S", or "23S" + # are found in the 'Gene_ID'. + + table.loc[(table['Ref_nuc'] == '-') & (table['Ref_codon'] == 'del') + & (table['Gene_ID'].str.contains('promoter') == False) + & (table['Gene_ID'].str.contains('16S') == False) + & (table['Gene_ID'].str.contains('23S') == False), "Codon_pos"] += 3 + # We increment by 3 because the table is using base-0 coordinates + # and we have base-1 coordindates for everything codon-related. + # However, what's listed is nucleotide coordinates for the codon + # mutation, so we actually need to bump it by 3 nucleotides. + + # Furthermore, we need to convert 'Res_codon' entries related to the above problem + # (codon deletions) from nucleotides into codons. We choose to convert nucleotides into + # codons instead of the other way around so that can more accurately compare observed mutations + # to the table entries. + table.loc[(table['Ref_nuc'] == '-') & (table['Ref_codon'] == 'del') + & (table['Gene_ID'].str.contains('promoter') == False) + & (table['Gene_ID'].str.contains('16S') == False) + & (table['Gene_ID'].str.contains('23S') == False), "Res_codon"] = table.loc[(table['Ref_nuc'] == '-') & (table['Ref_codon'] == 'del') + & (table['Gene_ID'].str.contains('promoter') == False) + & (table['Gene_ID'].str.contains('16S') == False) + & (table['Gene_ID'].str.contains('23S') == False), "Res_codon"].str.replace('[A-Z,]+', self.to_codons, regex=True) def _get_resistance_codon_match(self, gene, codon_mutation): table = self._pointfinder_info - matches = table[(table['#Gene_ID'] == gene) - & (table['Codon_pos'] == codon_mutation.get_mutation_position()) - & (table['Ref_codon'] == codon_mutation.get_database_amr_gene_mutation()) - & (table['Res_codon'].str.contains(codon_mutation.get_input_genome_mutation(), regex=False))] + # We need to handle codon deletions as a special case: + if(type(codon_mutation) is CodonMutationPosition + and codon_mutation.get_input_genome_amino_acid() == 'del'): + matches = table[(table['Gene_ID'] == gene) + # Codon deletions are listed using nucleotide coordinates + & (table['Codon_pos'] == codon_mutation.get_mutation_position() * 3) + # Such coords are denoted in nucleotide coordinates in the reference table for some reason, + # so we need to convert to nucleotide coordinates before making the comparison. + & (table['Ref_codon'] == codon_mutation.get_database_amr_gene_mutation()) + & (table['Res_codon'].str.contains(codon_mutation.get_input_genome_mutation(), regex=False))] + + # We need to handle codon insertions as a special case: + # Pointfinder mis-reports the position of codon insertions. For example: + # ref: ACG --- ACG + # query: ACG GGG ACG + # ref_pos: 1 2 + # Pointfinder is incorrectly reporting the insertion as 2_3insG instead of the correct 1_2insG, + # which is to say, the insertion happens between reference codon coordinates 1 and 2. + # We need to shift by 1 in our interpretation. + elif type(codon_mutation) is CodonInsertionPosition: + matches = table[(table['Gene_ID'] == gene) + # Codon inerstions need to be shifted by 1: + & (table['Codon_pos'] == codon_mutation.get_mutation_position() + 1) + & (table['Ref_codon'] == codon_mutation.get_database_amr_gene_mutation()) + & (table['Res_codon'].str.contains(codon_mutation.get_input_genome_mutation(), regex=False))] + + # Normal cases: + else: + matches = table[(table['Gene_ID'] == gene) + & (table['Codon_pos'] == codon_mutation.get_mutation_position()) + & (table['Ref_codon'] == codon_mutation.get_database_amr_gene_mutation()) + & (table['Res_codon'].str.contains(codon_mutation.get_input_genome_mutation(), regex=False))] if len(matches.index) > 1: # If more then one match, try to match Res_codon exactly to subselect @@ -72,10 +152,7 @@ def _get_resistance_codon_match(self, gene, codon_mutation): if len(matches_subset.index) >= 1: matches = matches_subset - if len(matches.index) > 1: - raise GenotypePhenotypeMatchException("Error, multiple matches for gene=" + str(gene) + ", codon_mutation=" + str(codon_mutation)) - else: - return matches + return matches def _get_resistance_nucleotide_match(self, gene, nucleotide_mutations): return self._get_resistance_codon_match(gene, nucleotide_mutations) diff --git a/staramr/blast/results/pointfinder/BlastResultsParserPointfinder.py b/staramr/blast/results/pointfinder/BlastResultsParserPointfinder.py index ee8b34f2..d7a1fec6 100644 --- a/staramr/blast/results/pointfinder/BlastResultsParserPointfinder.py +++ b/staramr/blast/results/pointfinder/BlastResultsParserPointfinder.py @@ -4,6 +4,7 @@ from staramr.blast.results.BlastResultsParser import BlastResultsParser from staramr.blast.results.pointfinder.PointfinderHitHSP import PointfinderHitHSP from staramr.blast.results.pointfinder.nucleotide.PointfinderHitHSPRNA import PointfinderHitHSPRNA +from staramr.blast.results.pointfinder.nucleotide.PointfinderHitHSPPromoter import PointfinderHitHSPPromoter logger = logging.getLogger('BlastResultsParserPointfinder') @@ -25,6 +26,7 @@ class BlastResultsParserPointfinder(BlastResultsParser): Contig Start End + Pointfinder Position '''.strip().split('\n')] SORT_COLUMNS = ['Isolate ID', 'Gene'] @@ -47,11 +49,13 @@ def _create_hit(self, file, database_name, blast_record): logger.debug("database_name=%s", database_name) if (database_name == '16S_rrsD') or (database_name == '23S'): return PointfinderHitHSPRNA(file, blast_record) + elif ('promoter' in database_name): + return PointfinderHitHSPPromoter(file, blast_record, database_name) else: return PointfinderHitHSP(file, blast_record) def _get_result(self, hit, db_mutation): - return [hit.get_genome_id(), + result = [hit.get_genome_id(), hit.get_amr_gene_id() + " (" + db_mutation.get_mutation_string_short() + ")", db_mutation.get_type(), db_mutation.get_mutation_position(), @@ -61,9 +65,12 @@ def _get_result(self, hit, db_mutation): str(hit.get_hsp_length()) + "/" + str(hit.get_amr_gene_length()), hit.get_genome_contig_id(), hit.get_genome_contig_start(), - hit.get_genome_contig_end() + hit.get_genome_contig_end(), + db_mutation.get_pointfinder_mutation_string() ] + return result + def _get_result_rows(self, hit, database_name): database_mutations = hit.get_mutations() @@ -75,6 +82,8 @@ def _get_result_rows(self, hit, database_name): if (database_name == '16S_rrsD') or (database_name == '23S'): database_resistance_mutations = self._blast_database.get_resistance_nucleotides(gene, database_mutations) + elif ('promoter' in database_name): + database_resistance_mutations = self._blast_database.get_resistance_promoter(gene, database_mutations) else: database_resistance_mutations = self._blast_database.get_resistance_codons(gene, database_mutations) logger.debug("database_resistance_mutations=%s", database_resistance_mutations) diff --git a/staramr/blast/results/pointfinder/BlastResultsParserPointfinderResistance.py b/staramr/blast/results/pointfinder/BlastResultsParserPointfinderResistance.py index ec862f57..80c7f810 100644 --- a/staramr/blast/results/pointfinder/BlastResultsParserPointfinderResistance.py +++ b/staramr/blast/results/pointfinder/BlastResultsParserPointfinderResistance.py @@ -1,6 +1,7 @@ import logging from staramr.blast.results.pointfinder.BlastResultsParserPointfinder import BlastResultsParserPointfinder +from staramr.blast.results.pointfinder.codon.CodonInsertionPosition import CodonInsertionPosition """ Class used to parse out BLAST results for PointFinder, including phenotyhpes/resistances. @@ -23,6 +24,7 @@ class BlastResultsParserPointfinderResistance(BlastResultsParserPointfinder): Contig Start End + Pointfinder Position '''.strip().split('\n')] def __init__(self, file_blast_map, arg_drug_table, blast_database, pid_threshold, plength_threshold, @@ -43,14 +45,21 @@ def __init__(self, file_blast_map, arg_drug_table, blast_database, pid_threshold self._arg_drug_table = arg_drug_table def _get_result(self, hit, db_mutation): + + # We need to correct for Pointfinder codon insertions being off by 1. + if type(db_mutation) is CodonInsertionPosition: + mutation_position = db_mutation.get_mutation_position() + 1 + else: + mutation_position = db_mutation.get_mutation_position() + drug = self._arg_drug_table.get_drug(self._blast_database.get_organism(), hit.get_amr_gene_id(), - db_mutation.get_mutation_position()) + mutation_position) gene_name = hit.get_amr_gene_id() + " (" + db_mutation.get_mutation_string_short() + ")" if drug is None: drug = 'unknown[' + gene_name + ']' - return [hit.get_genome_id(), + result = [hit.get_genome_id(), gene_name, drug, db_mutation.get_type(), @@ -61,5 +70,8 @@ def _get_result(self, hit, db_mutation): str(hit.get_hsp_length()) + "/" + str(hit.get_amr_gene_length()), hit.get_genome_contig_id(), hit.get_genome_contig_start(), - hit.get_genome_contig_end() + hit.get_genome_contig_end(), + db_mutation.get_pointfinder_mutation_string() ] + + return result diff --git a/staramr/blast/results/pointfinder/MutationPosition.py b/staramr/blast/results/pointfinder/MutationPosition.py index dd2dac97..d1d11a10 100644 --- a/staramr/blast/results/pointfinder/MutationPosition.py +++ b/staramr/blast/results/pointfinder/MutationPosition.py @@ -25,6 +25,11 @@ def get_nucleotide_position(self): """ return self._nucleotide_position_amr_gene + def get_pointfinder_mutation_string(self): + # This function exists in order to show mutations in Pointfinder database coordinates, + # which are off-by-one in the case of codon insertions. + return self.get_mutation_string_short() + def get_mutation_string_short(self): return self.get_database_amr_gene_mutation() + str( self.get_mutation_position()) + self.get_input_genome_mutation() diff --git a/staramr/blast/results/pointfinder/PointfinderHitHSP.py b/staramr/blast/results/pointfinder/PointfinderHitHSP.py index 7d4a783d..ad1aca45 100644 --- a/staramr/blast/results/pointfinder/PointfinderHitHSP.py +++ b/staramr/blast/results/pointfinder/PointfinderHitHSP.py @@ -2,6 +2,9 @@ from staramr.blast.results.AMRHitHSP import AMRHitHSP from staramr.blast.results.pointfinder.codon.CodonMutationPosition import CodonMutationPosition +from staramr.blast.results.pointfinder.codon.CodonInsertionPosition import CodonInsertionPosition + +import pandas as pd logger = logging.getLogger('PointfinderHitHSP') @@ -27,22 +30,35 @@ def get_amr_gene_name(self): """ return self._blast_record['qseqid'] - def _get_match_positions(self): - amr_seq = self.get_amr_gene_seq() - genome_seq = self.get_genome_contig_hsp_seq() - - return [i for i, (x, y) in enumerate(zip(amr_seq, genome_seq)) if x != y] - def _get_mutation_positions(self, start): + mutation_positions = [] mutation_positions_filtered = [] codon_starts = [] amr_seq = self.get_amr_gene_seq() genome_seq = self.get_genome_contig_hsp_seq() - # Only return mutation position objects with unique codon start positions - mutation_positions = [CodonMutationPosition(i, amr_seq, genome_seq, start) for i in self._get_match_positions()] + amr_pos = 0 + + for i in range(len(amr_seq)): + # Insertion: "-" in the reference: + if amr_seq[i] == "-": + # left side + offset = i - amr_pos # accounting for string index and reference index possibly being different + mutation = CodonInsertionPosition(i, amr_seq, genome_seq, start, offset=offset) + mutation_positions.append(mutation) + # Mismatch or Deletion: + elif (amr_seq[i] != genome_seq[i]): + offset = i - amr_pos + mutation = CodonMutationPosition(i, amr_seq, genome_seq, start, offset=offset) + mutation_positions.append(mutation) + amr_pos += 1 + # Match: + else: + amr_pos += 1 + + # Only return mutation position objects with unique codon start positions for m in mutation_positions: if m._codon_start not in codon_starts: codon_starts.append(m._codon_start) diff --git a/staramr/blast/results/pointfinder/codon/CodonInsertionPosition.py b/staramr/blast/results/pointfinder/codon/CodonInsertionPosition.py new file mode 100644 index 00000000..6622b503 --- /dev/null +++ b/staramr/blast/results/pointfinder/codon/CodonInsertionPosition.py @@ -0,0 +1,40 @@ +import math + +from staramr.blast.results.pointfinder.codon.CodonMutationPosition import CodonMutationPosition + +""" +A class defining a codon-based insertion mutation for PointFinder. + +This class is needed to handle the special case of codon insertions needing to report +the codon to the LEFT of the insertion, not the sequence position of the insertion itself. +""" + + +class CodonInsertionPosition(CodonMutationPosition): + + def __init__(self, match_position, database_amr_gene_string, input_genome_blast_string, database_amr_gene_start, offset=0): + """ + Creates a new CodonMutationPosition. + :param match_position: The particular position (0-based index) of the BLAST match string for this mutation. + :param database_amr_gene_string: The database amr gene string from BLAST. + :param input_genome_blast_string: The genome BLAST string from the input genome. + :param database_amr_gene_start: The start coordinates of the BLAST amr gene hit. + :param offset: The amount to offset the mutation by (important for promoter mutations). + """ + super().__init__(match_position, database_amr_gene_string, input_genome_blast_string, database_amr_gene_start, offset) + + # We need to correctly set the codon position to be 1 to the left (-1) of the observed insertion codon. + self._codon_start = math.ceil(self._nucleotide_position_amr_gene / 3) - 1 + + def __repr__(self): + return ( + 'CodonInsertionPosition(_database_amr_gene_start={_database_amr_gene_start}, _nucleotide_position_amr_gene={_nucleotide_position_amr_gene}, ' + '_codon_start={_codon_start}, _database_amr_gene_codon={_database_amr_gene_codon}, _input_genome_codon={_input_genome_codon})').format( + **self.__dict__) + + def get_pointfinder_mutation_string(self): + # Since the position codon insertions in the Pointfinder database is off by one, we need + # to adjust the position in the report to show what Pointfinder shows. + return self.get_database_amr_gene_mutation() + str( + self.get_mutation_position() + 1) + self.get_input_genome_mutation() + diff --git a/staramr/blast/results/pointfinder/codon/CodonMutationPosition.py b/staramr/blast/results/pointfinder/codon/CodonMutationPosition.py index d1f15430..0bbc4f11 100644 --- a/staramr/blast/results/pointfinder/codon/CodonMutationPosition.py +++ b/staramr/blast/results/pointfinder/codon/CodonMutationPosition.py @@ -11,15 +11,16 @@ class CodonMutationPosition(MutationPosition): - def __init__(self, match_position, database_amr_gene_string, input_genome_blast_string, database_amr_gene_start): + def __init__(self, match_position, database_amr_gene_string, input_genome_blast_string, database_amr_gene_start, offset=0): """ Creates a new CodonMutationPosition. :param match_position: The particular position (0-based index) of the BLAST match string for this mutation. :param database_amr_gene_string: The database amr gene string from BLAST. :param input_genome_blast_string: The genome BLAST string from the input genome. :param database_amr_gene_start: The start coordinates of the BLAST amr gene hit. + :param offset: The amount to offset the mutation by (important for promoter mutations). """ - super().__init__(match_position, database_amr_gene_start) + super().__init__(match_position - offset, database_amr_gene_start) self._codon_start = math.ceil(self._nucleotide_position_amr_gene / 3) frame_shift = (self._nucleotide_position_amr_gene - 1) % 3 @@ -80,17 +81,27 @@ def get_mutation_string(self): + ' -> ' + self.get_input_genome_amino_acid() + ')' def get_database_amr_gene_mutation(self): - if '-' in self.get_database_amr_gene_codon(): - return self.get_database_amr_gene_amino_acid() + # If there's an insertion ('-'), then return 'ins' instead of '-': + if '-' in self._database_amr_gene_codon: + return 'ins' + # If there's a deletion in the INPUT sequence, then we need to return 'del' + # instead of the actual reference sequence + # (which would normally be some sequence) + if '-' in self._input_genome_codon: + return 'del' + # If neither an insertion or deletion, return the actual gene sequence: else: - return self.get_database_amr_gene_amino_acid().upper() + return Bio.Seq.translate(self.get_database_amr_gene_codon(), table='Standard').upper() def get_input_genome_mutation(self): - # Keep 'ins' or 'del' lowercase - if '-' in self.get_input_genome_codon(): - return self.get_input_genome_amino_acid() + # If there's a deletion in the input genome, the database format actually + # requires us to return the reference sequence that was deleted, + # not the '-' in the sequence that you would expect: + if '-' in self._input_genome_codon: + return Bio.Seq.translate(self.get_database_amr_gene_codon(), table='Standard').upper() + # If not a deletion, return the actual input genome sequence: else: - return self.get_input_genome_amino_acid().upper() + return Bio.Seq.translate(self.get_input_genome_codon(), table='Standard').upper() def get_type(self): return 'codon' diff --git a/staramr/blast/results/pointfinder/nucleotide/NucleotideMutationPosition.py b/staramr/blast/results/pointfinder/nucleotide/NucleotideMutationPosition.py index 2a5ee6f2..765b1372 100644 --- a/staramr/blast/results/pointfinder/nucleotide/NucleotideMutationPosition.py +++ b/staramr/blast/results/pointfinder/nucleotide/NucleotideMutationPosition.py @@ -7,15 +7,16 @@ class NucleotideMutationPosition(MutationPosition): - def __init__(self, match_position, database_amr_gene_string, input_genome_string, database_amr_gene_start): + def __init__(self, match_position, database_amr_gene_string, input_genome_string, database_amr_gene_start, offset=0): """ Creates a new NucleotideMutationPosition. :param match_position: The particular position (0-based index) of the BLAST match string for this mutation. :param database_amr_gene_string: The amr gene string. :param input_genome_string: The input genome string. :param database_amr_gene_start: The start coordinates of the BLAST database hit. + :param offset: The amount to offset the mutation by (particularly for negative-coordinate promoter mutations). """ - super().__init__(match_position, database_amr_gene_start) + super().__init__(match_position - offset, database_amr_gene_start) self._database_amr_gene_mutation = database_amr_gene_string[match_position].upper() self._input_genome_mutation = input_genome_string[match_position].upper() @@ -27,10 +28,27 @@ def get_mutation_position(self): return self.get_nucleotide_position() def get_database_amr_gene_mutation(self): - return self._database_amr_gene_mutation + # If there's an insertion ('-'), then return 'ins' instead of '-': + if '-' in self._database_amr_gene_mutation: + return 'ins' + # If there's a deletion in the INPUT sequence, then we need to return 'del' + # instead of the actual AMR reference sequence + # (which would normally be some sequence) + if '-' in self._input_genome_mutation: + return 'del' + # If neither an insertion or deletion, return the actual AMR gene sequence: + else: + return self._database_amr_gene_mutation def get_input_genome_mutation(self): - return self._input_genome_mutation + # If there's a deletion in the input genome, the database format actually + # requires us to return the reference sequence that was deleted, + # not the '-' in the sequence that you would expect: + if '-' in self._input_genome_mutation: + return self._database_amr_gene_mutation + # If not a deletion, return the actual input genome sequence: + else: + return self._input_genome_mutation def get_mutation_string(self): return self.get_database_amr_gene_mutation() + ' -> ' + self.get_input_genome_mutation() diff --git a/staramr/blast/results/pointfinder/nucleotide/PointfinderHitHSPPromoter.py b/staramr/blast/results/pointfinder/nucleotide/PointfinderHitHSPPromoter.py new file mode 100644 index 00000000..4d74d3cb --- /dev/null +++ b/staramr/blast/results/pointfinder/nucleotide/PointfinderHitHSPPromoter.py @@ -0,0 +1,120 @@ +from staramr.blast.results.pointfinder.PointfinderHitHSP import PointfinderHitHSP +from staramr.blast.results.pointfinder.codon.CodonMutationPosition import CodonMutationPosition +from staramr.blast.results.pointfinder.nucleotide.NucleotideMutationPosition import NucleotideMutationPosition + + +class PointfinderHitHSPPromoter(PointfinderHitHSP): + """ + This class represents a Pointfinder BLAST hit and is responsible for parsing the related BLAST hit. + """ + + def __init__(self, file, blast_record, database_name): + """ + Creates a new PointfinderHitHSPPromoter from a promoter-related BLAST hit. + :param file: The input file. + :param blast_record: The Bio.Blast.Record this hit came from. + :param database_name: The name of the BLAST database. This name must be named in the same way the + Pointfinder promoters are named (ex: "embA_promoter_size_115bp"). + """ + + self._parse_database_name(database_name) + super().__init__(file, blast_record) + + def _get_mutation_positions(self, start): + nucleotide_mutations = [] + codon_mutations = [] + + amr_seq = self.get_amr_gene_seq() + genome_seq = self.get_genome_contig_hsp_seq() + + amr_pos = 0 + + # Get all the nucleotide match positions up to the offset: + for i in range(self.offset): + + # Insertion: "-" in the reference: + if amr_seq[i] == "-": + # left side + offset = i - amr_pos + self.offset + 1 # accounting for string index and reference index possibly being different + mutation = NucleotideMutationPosition(i, amr_seq, genome_seq, start, offset=offset + 1) + nucleotide_mutations.append(mutation) + # Mismatch or Deletion: + elif (amr_seq[i] != genome_seq[i]): + offset = i - amr_pos + self.offset + mutation = NucleotideMutationPosition(i, amr_seq, genome_seq, start, offset=offset + 1) + nucleotide_mutations.append(mutation) + amr_pos += 1 + # Match: + else: + amr_pos += 1 + + # Merge adjacent nucleotide insertions: + nucleotide_mutations_merged = [] + while(len(nucleotide_mutations) > 0): + current = nucleotide_mutations.pop(0) + + # If the merged list has something in it + # and the current mutation is an insertion + # and the last item in the merged list is an insertion + # and the last item in the merged list has the same position: + if(len(nucleotide_mutations_merged) > 0 + and current.get_database_amr_gene_mutation() == 'ins' + and nucleotide_mutations_merged[-1].get_database_amr_gene_mutation() == 'ins' + and nucleotide_mutations_merged[-1].get_mutation_position() == current.get_mutation_position()): + # Add the nucleotide to the existing: + nucleotide_mutations_merged[-1]._database_amr_gene_mutation += current._database_amr_gene_mutation + nucleotide_mutations_merged[-1]._input_genome_mutation += current._input_genome_mutation + + else: + nucleotide_mutations_merged.append(current) + + # Get all the codon match positions after the offset: + for i in range(self.offset, len(amr_seq)): + + # Insertion: "-" in the reference: + if amr_seq[i] == "-": + # left side + offset = i - amr_pos + self.offset # accounting for string index and reference index possibly being different + mutation = CodonMutationPosition(i - 1, amr_seq, genome_seq, start, offset=offset) + codon_mutations.append(mutation) + # Mismatch or Deletion: + elif (amr_seq[i] != genome_seq[i]): + offset = i - amr_pos + self.offset + mutation = CodonMutationPosition(i, amr_seq, genome_seq, start, offset=offset) + codon_mutations.append(mutation) + amr_pos += 1 + # Match: + else: + amr_pos += 1 + + codon_mutations_filtered = [] + codon_starts = [] + + # Only return codon mutation position objects with unique codon start positions + for m in codon_mutations: + if m._codon_start not in codon_starts: + codon_starts.append(m._codon_start) + codon_mutations_filtered.append(m) + + # Combine lists and return all positions: + combined_mutations = nucleotide_mutations_merged + codon_mutations_filtered + + return combined_mutations + + def _parse_database_name(self, database_name): + """ + Parses the name of the database in order to obtain the promoter offset. + The database name is expected to have the following format: + + [GENENAME]_promoter_size_[SIZE]bp + + example: + + embA_promoter_size_115bp + """ + + tokens = database_name.split("_") # split the name into tokens + size_string = tokens[len(tokens) - 1] # get the last token + size = int(size_string.replace('bp', '')) # remove the 'bp' and convert to an int + + self.offset = size diff --git a/staramr/blast/results/pointfinder/nucleotide/PointfinderHitHSPRNA.py b/staramr/blast/results/pointfinder/nucleotide/PointfinderHitHSPRNA.py index e4248a76..5538b7f2 100644 --- a/staramr/blast/results/pointfinder/nucleotide/PointfinderHitHSPRNA.py +++ b/staramr/blast/results/pointfinder/nucleotide/PointfinderHitHSPRNA.py @@ -13,9 +13,29 @@ def __init__(self, file, blast_record): super().__init__(file, blast_record) def _get_mutation_positions(self, start): + mutation_positions = [] + amr_seq = self.get_amr_gene_seq() genome_seq = self.get_genome_contig_hsp_seq() - # @formatter:off - return [NucleotideMutationPosition(i, amr_seq, genome_seq, start) for i in self._get_match_positions()] - # @formatter:on + amr_pos = 0 + + for i in range(len(amr_seq)): + + # Insertion: "-" in the reference: + if amr_seq[i] == "-": + # left side + offset = i - amr_pos # accounting for string index and reference index possibly being different + mutation = NucleotideMutationPosition(i - 1, amr_seq, genome_seq, start, offset=offset) + mutation_positions.append(mutation) + # Mismatch or Deletion: + elif (amr_seq[i] != genome_seq[i]): + offset = i - amr_pos + mutation = NucleotideMutationPosition(i, amr_seq, genome_seq, start, offset=offset) + mutation_positions.append(mutation) + amr_pos += 1 + # Match: + else: + amr_pos += 1 + + return mutation_positions diff --git a/staramr/databases/exclude/ExcludeGenesList.py b/staramr/databases/exclude/ExcludeGenesList.py index ec398dda..a1c79fbe 100644 --- a/staramr/databases/exclude/ExcludeGenesList.py +++ b/staramr/databases/exclude/ExcludeGenesList.py @@ -18,7 +18,7 @@ def tolist(self): Converts the exclude genes data to a list. :return: A list with genes to exclude. """ - return self._data['#gene_id'].tolist() + return self._data['gene_id'].tolist() @classmethod def get_default_exclude_file(cls): diff --git a/staramr/databases/exclude/data/genes_to_exclude.tsv b/staramr/databases/exclude/data/genes_to_exclude.tsv index f40b090f..47ab30a0 100644 --- a/staramr/databases/exclude/data/genes_to_exclude.tsv +++ b/staramr/databases/exclude/data/genes_to_exclude.tsv @@ -1,2 +1,2 @@ -#gene_id +gene_id aac(6')-Iaa_1_NC_003197 diff --git a/staramr/databases/resistance/data/ARG_drug_key_pointfinder.tsv b/staramr/databases/resistance/data/ARG_drug_key_pointfinder.tsv index 787d7584..117783eb 100644 --- a/staramr/databases/resistance/data/ARG_drug_key_pointfinder.tsv +++ b/staramr/databases/resistance/data/ARG_drug_key_pointfinder.tsv @@ -49,71 +49,71 @@ salmonella 16S_rrsD 1065 spectinomycin salmonella 16S_rrsD 1192 spectinomycin salmonella parC 57 None salmonella acrB 171 azithromycin -e.coli gyrA 51 ciprofloxacin I/R,nalidixic acid -e.coli gyrA 67 ciprofloxacin I/R,nalidixic acid -e.coli gyrA 81 ciprofloxacin I/R,nalidixic acid -e.coli gyrA 82 ciprofloxacin I/R,nalidixic acid -e.coli gyrA 83 ciprofloxacin I/R,nalidixic acid -e.coli gyrA 84 ciprofloxacin I/R,nalidixic acid -e.coli gyrA 87 ciprofloxacin I/R,nalidixic acid -e.coli gyrA 106 ciprofloxacin I/R,nalidixic acid -e.coli gyrA 196 ciprofloxacin I/R,nalidixic acid -e.coli gyrB 136 aminocoumarin -e.coli gyrB 426 ciprofloxacin I/R,nalidixic acid -e.coli gyrB 447 ciprofloxacin I/R,nalidixic acid -e.coli parC 56 ciprofloxacin I/R,nalidixic acid -e.coli parC 57 None -e.coli parC 60 ciprofloxacin I/R,nalidixic acid -e.coli parC 78 ciprofloxacin I/R,nalidixic acid -e.coli parC 80 ciprofloxacin I/R,nalidixic acid -e.coli parC 84 ciprofloxacin I/R,nalidixic acid -e.coli parC 108 ciprofloxacin I/R,nalidixic acid -e.coli parE 416 ciprofloxacin I/R,nalidixic acid -e.coli parE 423 ciprofloxacin I/R,nalidixic acid -e.coli parE 439 ciprofloxacin I/R,nalidixic acid -e.coli parE 444 ciprofloxacin I/R,nalidixic acid -e.coli parE 458 ciprofloxacin I/R,nalidixic acid -e.coli parE 460 ciprofloxacin I/R,nalidixic acid -e.coli parE 464 ciprofloxacin I/R,nalidixic acid -e.coli parE 470 ciprofloxacin I/R,nalidixic acid -e.coli parE 475 ciprofloxacin I/R,nalidixic acid -e.coli parE 476 ciprofloxacin I/R,nalidixic acid -e.coli parE 529 ciprofloxacin I/R,nalidixic acid -e.coli pmrA 39 colistin -e.coli pmrA 81 colistin -e.coli pmrB 161 colistin -e.coli folP 64 sulfisoxazole -e.coli rpoB 146 rifamycin -e.coli rpoB 513 rifamycin -e.coli rpoB 526 rifamycin -e.coli rpoB 529 rifamycin -e.coli rpoB 531 rifamycin -e.coli rpoB 533 rifamycin -e.coli rpoB 563 rifamycin -e.coli rpoB 564 rifamycin -e.coli rpoB 687 rifamycin -e.coli 23S 2059 erythromycin,azithromycin -e.coli 16S_rrsB 523 streptomycin -e.coli 16S_rrsB 527 streptomycin -e.coli 16S_rrsB 528 streptomycin -e.coli 16S_rrsB 964 tetracycline -e.coli 16S_rrsB 1053 tetracycline -e.coli 16S_rrsB 1054 tetracycline -e.coli 16S_rrsB 1055 tetracycline -e.coli 16S_rrsB 1058 tetracycline -e.coli 16S_rrsB 1064 spectinomycin -e.coli 16S_rrsB 1066 spectinomycin -e.coli 16S_rrsB 1068 spectinomycin -e.coli 16S_rrsB 1406 gentamicin,kanamycin,tobramycin -e.coli 16S_rrsB 1408 gentamicin,kanamycin,neomycin,paromomycin -e.coli 16S_rrsC 794 kasugamicin -e.coli 16S_rrsC 926 kasugamicin -e.coli 16S_rrsC 1519 kasugamicin -e.coli 16S_rrsH 1192 spectinomycin -e.coli ampC_promoter_size_53bp -42 ampicillin,amoxicillin/clavulanic acid,cefoxitin -e.coli ampC_promoter_size_53bp -32 ampicillin,amoxicillin/clavulanic acid,cefoxitin -e.coli ampC_promoter_size_53bp -13 ampicillin,amoxicillin/clavulanic acid,cefoxitin -e.coli ampC_promoter_size_53bp -12 ampicillin,amoxicillin/clavulanic acid,cefoxitin +escherichia_coli gyrA 51 ciprofloxacin I/R,nalidixic acid +escherichia_coli gyrA 67 ciprofloxacin I/R,nalidixic acid +escherichia_coli gyrA 81 ciprofloxacin I/R,nalidixic acid +escherichia_coli gyrA 82 ciprofloxacin I/R,nalidixic acid +escherichia_coli gyrA 83 ciprofloxacin I/R,nalidixic acid +escherichia_coli gyrA 84 ciprofloxacin I/R,nalidixic acid +escherichia_coli gyrA 87 ciprofloxacin I/R,nalidixic acid +escherichia_coli gyrA 106 ciprofloxacin I/R,nalidixic acid +escherichia_coli gyrA 196 ciprofloxacin I/R,nalidixic acid +escherichia_coli gyrB 136 aminocoumarin +escherichia_coli gyrB 426 ciprofloxacin I/R,nalidixic acid +escherichia_coli gyrB 447 ciprofloxacin I/R,nalidixic acid +escherichia_coli parC 56 ciprofloxacin I/R,nalidixic acid +escherichia_coli parC 57 None +escherichia_coli parC 60 ciprofloxacin I/R,nalidixic acid +escherichia_coli parC 78 ciprofloxacin I/R,nalidixic acid +escherichia_coli parC 80 ciprofloxacin I/R,nalidixic acid +escherichia_coli parC 84 ciprofloxacin I/R,nalidixic acid +escherichia_coli parC 108 ciprofloxacin I/R,nalidixic acid +escherichia_coli parE 416 ciprofloxacin I/R,nalidixic acid +escherichia_coli parE 423 ciprofloxacin I/R,nalidixic acid +escherichia_coli parE 439 ciprofloxacin I/R,nalidixic acid +escherichia_coli parE 444 ciprofloxacin I/R,nalidixic acid +escherichia_coli parE 458 ciprofloxacin I/R,nalidixic acid +escherichia_coli parE 460 ciprofloxacin I/R,nalidixic acid +escherichia_coli parE 464 ciprofloxacin I/R,nalidixic acid +escherichia_coli parE 470 ciprofloxacin I/R,nalidixic acid +escherichia_coli parE 475 ciprofloxacin I/R,nalidixic acid +escherichia_coli parE 476 ciprofloxacin I/R,nalidixic acid +escherichia_coli parE 529 ciprofloxacin I/R,nalidixic acid +escherichia_coli pmrA 39 colistin +escherichia_coli pmrA 81 colistin +escherichia_coli pmrB 161 colistin +escherichia_coli folP 64 sulfisoxazole +escherichia_coli rpoB 146 rifamycin +escherichia_coli rpoB 513 rifamycin +escherichia_coli rpoB 526 rifamycin +escherichia_coli rpoB 529 rifamycin +escherichia_coli rpoB 531 rifamycin +escherichia_coli rpoB 533 rifamycin +escherichia_coli rpoB 563 rifamycin +escherichia_coli rpoB 564 rifamycin +escherichia_coli rpoB 687 rifamycin +escherichia_coli 23S 2059 erythromycin,azithromycin +escherichia_coli 16S_rrsB 523 streptomycin +escherichia_coli 16S_rrsB 527 streptomycin +escherichia_coli 16S_rrsB 528 streptomycin +escherichia_coli 16S_rrsB 964 tetracycline +escherichia_coli 16S_rrsB 1053 tetracycline +escherichia_coli 16S_rrsB 1054 tetracycline +escherichia_coli 16S_rrsB 1055 tetracycline +escherichia_coli 16S_rrsB 1058 tetracycline +escherichia_coli 16S_rrsB 1064 spectinomycin +escherichia_coli 16S_rrsB 1066 spectinomycin +escherichia_coli 16S_rrsB 1068 spectinomycin +escherichia_coli 16S_rrsB 1406 gentamicin,kanamycin,tobramycin +escherichia_coli 16S_rrsB 1408 gentamicin,kanamycin,neomycin,paromomycin +escherichia_coli 16S_rrsC 794 kasugamicin +escherichia_coli 16S_rrsC 926 kasugamicin +escherichia_coli 16S_rrsC 1519 kasugamicin +escherichia_coli 16S_rrsH 1192 spectinomycin +escherichia_coli ampC_promoter_size_53bp -42 ampicillin,amoxicillin/clavulanic acid,cefoxitin +escherichia_coli ampC_promoter_size_53bp -32 ampicillin,amoxicillin/clavulanic acid,cefoxitin +escherichia_coli ampC_promoter_size_53bp -13 ampicillin,amoxicillin/clavulanic acid,cefoxitin +escherichia_coli ampC_promoter_size_53bp -12 ampicillin,amoxicillin/clavulanic acid,cefoxitin campylobacter gyrA 70 ciprofloxacin,nalidixic acid campylobacter gyrA 85 ciprofloxacin,nalidixic acid campylobacter gyrA 86 ciprofloxacin,nalidixic acid diff --git a/staramr/databases/resistance/data/info.ini b/staramr/databases/resistance/data/info.ini index 08641f86..897d70b0 100644 --- a/staramr/databases/resistance/data/info.ini +++ b/staramr/databases/resistance/data/info.ini @@ -1,3 +1,3 @@ [Versions] -pointfinder_gene_drug_version = 072621 +pointfinder_gene_drug_version = 072621.1 resfinder_gene_drug_version = 072621 diff --git a/staramr/tests/integration/data/ahpC_promoter_size_180bp-F10I.fsa b/staramr/tests/integration/data/ahpC_promoter_size_180bp-F10I.fsa new file mode 100644 index 00000000..42af93c5 --- /dev/null +++ b/staramr/tests/integration/data/ahpC_promoter_size_180bp-F10I.fsa @@ -0,0 +1,14 @@ +>ahpC_promoter_size_180bp +GAACCACTGCTTTGCCGCCACCGCGGCGAACGCGCGAAGCCCGGCCACGGCCGGCTAGCACCTCTTGGCG +GCGATGCCGATAAATATGGTGTGATATATCACCTTTGCCTGACAGCGACTTCACGGCACGATGGAATGTC +GCAACCAAATGCATTGTCCGCTTTGATGATGAGGAGAGTC +ATGCCACTGCTAACCATTGGCGATCAAaTCCCCGCCTACCAGCTCACCGCTCTCATCGGCGGTGACCTGT +CCAAGGTCGACGCCAAGCAGCCCGGCGACTACTTCACCACTATCACCAGTGACGAACACCCAGGCAAGTG +GCGGGTGGTGTTCTTTTGGCCGAAAGACTTCACGTTCGTGTGCCCTACCGAGATCGCGGCGTTCAGCAAG +CTCAATGACGAGTTCGAGGACCGCGACGCCCAGATCCTGGGGGTTTCGATTGACAGCGAATTCGCGCATT +TCCAGTGGCGTGCACAGCACAACGACCTCAAAACGTTACCCTTCCCGATGCTCTCCGACATCAAGCGCGA +ACTCAGCCAAGCCGCAGGTGTCCTCAACGCCGACGGTGTGGCCGACCGCGTGACCTTTATCGTCGACCCC +AACAACGAGATCCAGTTCGTCTCGGCCACCGCCGGTTCGGTGGGACGCAACGTCGATGAGGTACTGCGAG +TGCTCGACGCCCTCCAGTCCGACGAGCTGTGCGCATGCAACTGGCGCAAGGGCGACCCGACGCTAGACGC +TGGCGAACTCCTCAAGGCTTCGGCCTAA + diff --git a/staramr/tests/integration/data/ampC_promoter_size_53bp-Cn42T.fsa b/staramr/tests/integration/data/ampC_promoter_size_53bp-Cn42T.fsa new file mode 100644 index 00000000..6e83bdd9 --- /dev/null +++ b/staramr/tests/integration/data/ampC_promoter_size_53bp-Cn42T.fsa @@ -0,0 +1,4 @@ +>ampC_promoter_size_53bp +TGGCTGCTATCtTGACAGTTGTCACGCTGATTGGTGTCGTTACAATCTAACGC +ATCGCCAATGTAAATCCGGCCCGCCTATGGCGGGCCGTTTTGTATGGAAACCAGACCCTATGTTCAAAACGACGCTCTGCACCTTATTAATT + diff --git a/staramr/tests/integration/data/ampC_promoter_size_53bp-n13insG.fsa b/staramr/tests/integration/data/ampC_promoter_size_53bp-n13insG.fsa new file mode 100644 index 00000000..71dcbdcd --- /dev/null +++ b/staramr/tests/integration/data/ampC_promoter_size_53bp-n13insG.fsa @@ -0,0 +1,4 @@ +>ampC_promoter_size_53bp +TGGCTGCTATCCTGACAGTTGTCACGCTGATTGGTGTCGTTgACAATCTAACGC +ATCGCCAATGTAAATCCGGCCCGCCTATGGCGGGCCGTTTTGTATGGAAACCAGACCCTATGTTCAAAACGACGCTCTGCACCTTATTAATT + diff --git a/staramr/tests/integration/data/ampC_promoter_size_53bp-n16insGT.fsa b/staramr/tests/integration/data/ampC_promoter_size_53bp-n16insGT.fsa new file mode 100644 index 00000000..2a9910fa --- /dev/null +++ b/staramr/tests/integration/data/ampC_promoter_size_53bp-n16insGT.fsa @@ -0,0 +1,3 @@ +>ampC_promoter_size_53bp +TGGCTGCTATCCTGACAGTTGTCACGCTGATTGGTGTCgtGTTACAATCTAACGC +ATCGCCAATGTAAATCCGGCCCGCCTATGGCGGGCCGTTTTGTATGGAAACCAGACCCTATGTTCAAAACGACGCTCTGCACCTTATTAATT diff --git a/staramr/tests/integration/data/mtrR_promoter_size_66bp-n57del.fsa b/staramr/tests/integration/data/mtrR_promoter_size_66bp-n57del.fsa new file mode 100644 index 00000000..46ba8a15 --- /dev/null +++ b/staramr/tests/integration/data/mtrR_promoter_size_66bp-n57del.fsa @@ -0,0 +1,13 @@ +>mtrR_promoter_size_66bp +TGCACGGATAAAAGTCTTTTTTATAATCCGCCCTCGTCAAACCGACCCGAAACGAAAACGCCATT +ATGAGAAAAACCAAAACCGAAGCCTTGAAAACCAAAGAACACCTGATGCTTGCCGCCTTGGAAACCTTTT +ACCGCAAAGGGATTGCCCGCACCTCGCTCAACGAAATCGCCCAAGCCGCCGGCGTAACGCGCGGCGCGCT +CTATTGGCATTTCAAAAATAAGGAAGACTTGTTTGACGCGTTGTTCCAACGTATCTGCGACGACATCGAA +AACTGCATCGCGCAAGATGCCGCAGATGCCGAAGGAGGTTCTTGGACGGTATTCCGCCACACGCTGCTGC +ACTTTTTCGAGCGGCTGCAAAGCAACGACATCCACTACAAATTCCACAACATCCTGTTTTTAAAGTGCGA +ACATACGGAACAAAACGCCGCCGTTATCGCCATTGCCCGCAAGCATCAGGCAATCTGGCGCGAGAAAATT +ACCGCCGTTTTGACCGAAGCGGTGGAAAATCAGGATTTGGCTGACGATTTGGACAAGGAAACGGCGGTCA +TCTTCATCAAATCGACGTTGGACGGGCTGATTTGGCGTTGGTTCTCTTCCGGCGAAAGTTTCGATTTGGG +CAAAACCGCCCCGCGCATCATCGGGATAATGATGGACAACTTGGAAAACCATCCCTGCCTGCGCCGGAAA +TAA + diff --git a/staramr/tests/integration/data/pbp5-ins465S.fsa b/staramr/tests/integration/data/pbp5-ins465S.fsa new file mode 100644 index 00000000..e65d78b6 --- /dev/null +++ b/staramr/tests/integration/data/pbp5-ins465S.fsa @@ -0,0 +1,36 @@ +>pbp5 +ATGAAAAGAAGTGACAAGCACGGCAAAAATCGAACAGGCGCTTATATTGCCGGCGCAGTG +ATTTTAATAGTAACTGCAAGTGGCGGTTATTTTTACTACCGGCACTACCAAGAAAGCCAA +GCAGTAGAAGCTGGAGAAAAGACGGTTGAGCAATTTGTCCAAGCTTTAAACAAAGGAGAT +TATAACAAAGCTGCAGGAATGGCATCGAAAAAGGCAGCAAATAAAAGTGCATTATCTGAA +AAAGAGATCTTAGAAAAATACCAAAATATATACGGTGCTGCCGATGTCAAAGGACTTGAG +ATATCAAATCTAAAAGTAGATAAAAAAGATGATTCTACTTATAGCTTTTCATATAAAGCA +AAGATGAATACCTCATTAGGTGAATTGAAAGATCTTTCTTATAAAGGAACATTAGACAGA +AATGATGGAAAAACCACGATCAACTGGCAGCCTAACTTGGTTTTTCCAGAAATGGAAGGA +AATGACAAAGTAAGTCTGACCACGCAAGAAGCAACAAGAGGGAACATTTTAGATCGAAAT +GGGGAACCATTAGCAACAACCGGCAAACTAAAACAATTAGGAGTCGTTCCAAGCAAACTT +GGGGATGGGGACGAAAAAACAGCCAATATCAAAGCCATTGCTTCTGCATTCGACTTAACA +GAAGATGCTATCAATCAGGCTATTTCACAAAGCTGGGTACAACCCGATTACTTCGTCCCA +TTGAAAATCATTGATGGAGCAACGCCAGAACTTCCAGCTGGAGCTACCATCCAAGAAGTA +GACGGCAGATATTATCCTTTGGGTGAAGCAGCTGCTCAACTGATTGGTTACGTGGGAGAT +ATCACAGCAGAAGATATTGATAAAAATCCAGAATTAAGCAGTAATGGAAAAATCGGACGA +TCTGGTTTGGAAATGGCTTTTGATAAGGATCTTCGTGGGACTACAGGTGGAAAATTAAGC +ATCACAGATACAGACGGTGTCGAGAAAAAGGTTCTGATCGAGCATGAAGTCCAAAACGGA +AAAGATATCAAATTGACAATCGATGCAAAGGCACAAAAAACAGCTTTCGACAGTCTAGGA +GGAAAAGCTGGATCGACTGTTGCGACAACGCCAAAAACCGGTGATCTTCTTGCGCTTGCT +AGCTCTCCAAGCTATGATCCAAACAAAATGACAAACGGGATCTCACAAGAAGACTACAAA +GCTTATGAAGAAAATCCTGAACAACCATTCATCAGCCGATTTGCGACAGGTTATGCTCCT +GGCTCTACGTTTAAAATGATCACAGCAGCAATCGGTCTCGACAACGGCACTATCGATCCA +AATGAAGTGTTGACGATCAACGGGCTTAAATGGCAAAAAGATAGTTCTTGGGGATCGTAT +CAAGTAACTCGTGTTagcAGTGATGTGTCACAAGTAGACTTAAAAACTGCTTTGATTTATTCC +GATAATATATATATGGCACAAGAAACGTTGAAAATGGGGGAGAAAAATTTCCGTGCAGGT +TTGGATAAATTCATTTTTGGTGAAGACCTTGATTTGCCAATCAGTATGAATCCAGCACAA +ATTTCTAATGAAGAGAGCTTTAATTCAGATATCTTGTTAGCTGATACTGGATATGGACAA +GGCGAACTTCTAATTAATCCTATCCAGCAAGCAGCAATGTATTCTGTTTTTGCCAACAAT +GGCACACTTGTCTATCCTAAATTGATTGCAGATAAAGAGACAAAAGATAAGAAGAATGTC +ATCGGCGAAACAGCAGTACAAACGATCGTGCCAGATCTGAGAGAAGTTGTGCAAGATGTA +AATGGTACAGCACATTCTCTTTCTGCTTTAGGGATTCCATTGGCAGCGAAAACTGGTACA +GCGGAAATCAAAGAAAAACAGGATGAAAAAGGGAAAGAGAACAGTTTCTTGTTTGCTTTC +AACCCTGATGACCAAGGATATATGATGGTGAGCATGTTGGAAAATAAAGAAGATGATGAT +TCAGCAACTAAACGAGCACCCGAACTATTACAATACCTCAACCAAAATTATCAATAA + diff --git a/staramr/tests/integration/data/rpsE-del78gtt.fsa b/staramr/tests/integration/data/rpsE-del78gtt.fsa new file mode 100644 index 00000000..4ef7d3a0 --- /dev/null +++ b/staramr/tests/integration/data/rpsE-del78gtt.fsa @@ -0,0 +1,10 @@ +>rpsE +ATGGCAAAACATGAAATTGAAGAACGCGGTGACGGTCTGATTGAAAAGATGGTCGCAGTTAACCGTGTAA +CCAAAGTAAAAGGTGGTCGCATTATGGCTTTCTCTGCGCTAACTGTTGTTGGTGATGGAGATGGTCG +TATTGGTATGGGTAAAGGTAAGTCAAAAGAAGTGCCTGTTGCAGTCCAAAAGGCGATGGATCAAGCACGA +CGCTCTATGATTAAGGTACCATTAAAAAATGGTACGATCCATCATGAGGTTATTGGTCGACATGGTGCTA +CTAAAGTATTTATGCAGCCTGCTAAAGAGGGTAGTGGCGTAAAAGCCGGTGGACCTATGCGTTTGGTTTT +TGATGCTATGGGCATTCATAATATTTCCGCCAAAGTGCACGGATCTACTAACCCATATAATATCGTACGT +GCAACATTAGATGGTTTGTCTAAGTTGCATACTCCTGCTGATATCGCAGCCAAACGTGGCTTGACAGTGG +AAGACATTTTGGGAGTTAACCATGGCTGA + diff --git a/staramr/tests/integration/databases/test_BlastDatabaseRepositories.py b/staramr/tests/integration/databases/test_BlastDatabaseRepositories.py index 1e09e199..2e509258 100644 --- a/staramr/tests/integration/databases/test_BlastDatabaseRepositories.py +++ b/staramr/tests/integration/databases/test_BlastDatabaseRepositories.py @@ -92,7 +92,7 @@ def testInfo(self): 'Pointfinder commits invalid') self.assertEqual(database_info['pointfinder_organisms_all'], 'campylobacter, e.coli, gonorrhoeae, salmonella, tuberculosis', 'Pointfinder organisms are invalid') - self.assertEqual(database_info['pointfinder_organisms_valid'], 'campylobacter, salmonella', + self.assertEqual(database_info['pointfinder_organisms_valid'], 'campylobacter, enterococcus_faecalis, enterococcus_faecium, escherichia_coli, helicobacter_pylori, salmonella', 'Pointfinder organisms are invalid') self.assertEqual(database_info['plasmidfinder_db_commit'], self.PLASMIDFINDER_VALID_COMMIT, 'Plasmidfinder commits invalid') diff --git a/staramr/tests/integration/detection/test_AMRDetection.py b/staramr/tests/integration/detection/test_AMRDetection.py index 4cc8c76e..db6ad6d6 100644 --- a/staramr/tests/integration/detection/test_AMRDetection.py +++ b/staramr/tests/integration/detection/test_AMRDetection.py @@ -1073,5 +1073,280 @@ def testPointfinderEFaeciumIns466Success(self): self.assertEqual(expected_records['pbp5'].seq.upper(), records['pbp5'].seq.upper(), "records don't match") """ + def testPointfinderEscherichiaColiCn42TSuccess(self): + # C-42T (negative coordinate) + # This tests the ability to find a single point mutation in the nucleotide part of a promoter. + # Verified CGE identifies the ampC-promoter (g.-42C>T) mutation. + pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'escherichia_coli') + blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2, + self.blast_out.name) + amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler, + self.pointfinder_drug_table, pointfinder_database, + output_dir=self.outdir.name) + + file = path.join(self.test_data_dir, "ampC_promoter_size_53bp-Cn42T.fsa") + files = [file] + amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0) + + pointfinder_results = amr_detection.get_pointfinder_results() + self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result') + + result = pointfinder_results[pointfinder_results['Gene'] == 'ampC_promoter_size_53bp (C-42T)'] + self.assertEqual(len(result.index), 1, 'Wrong number of results detected') + self.assertEqual(result.index[0], 'ampC_promoter_size_53bp-Cn42T', msg='Wrong file') + self.assertEqual(result['Type'].iloc[0], 'nucleotide', msg='Wrong type') + self.assertEqual(result['Position'].iloc[0], -42, msg='Wrong nucleotide position') + self.assertEqual(result['Mutation'].iloc[0], 'C -> T', msg='Wrong mutation') + self.assertAlmostEqual(result['%Identity'].iloc[0], 99.31, places=2, msg='Wrong pid') + self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap') + self.assertEqual(result['HSP Length/Total Length'].iloc[0], '145/145', msg='Wrong lengths') + self.assertEqual(result['Predicted Phenotype'].iloc[0], 'ampicillin, amoxicillin/clavulanic acid, cefoxitin', + 'Wrong phenotype') + + hit_file = path.join(self.outdir.name, 'pointfinder_ampC_promoter_size_53bp-Cn42T.fsa') + records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) + + self.assertEqual(len(records), 1, 'Wrong number of hit records') + + expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) + self.assertEqual(expected_records['ampC_promoter_size_53bp'].seq.upper(), records['ampC_promoter_size_53bp'].seq.upper(), "records don't match") + + def testPointfinderMycobacteriumTuberculosisF10ISuccess(self): + # F10I + # This tests the ability to find a single mutation in the codon part of a promoter. + # Verified CGE finds the ahpC-promoter (p.F10I) mutation. + pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'mycobacterium_tuberculosis') + blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2, + self.blast_out.name) + amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler, + self.pointfinder_drug_table, pointfinder_database, + output_dir=self.outdir.name) + + file = path.join(self.test_data_dir, "ahpC_promoter_size_180bp-F10I.fsa") + files = [file] + amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0) + + pointfinder_results = amr_detection.get_pointfinder_results() + self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result') + + result = pointfinder_results[pointfinder_results['Gene'] == 'ahpC_promoter_size_180bp (F10I)'] + self.assertEqual(len(result.index), 1, 'Wrong number of results detected') + self.assertEqual(result.index[0], 'ahpC_promoter_size_180bp-F10I', msg='Wrong file') + self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type') + self.assertEqual(result['Position'].iloc[0], 10, msg='Wrong codon position') + self.assertEqual(result['Mutation'].iloc[0], 'TTC -> ATC (F -> I)', msg='Wrong mutation') + self.assertAlmostEqual(result['%Identity'].iloc[0], 99.87, places=2, msg='Wrong pid') + self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap') + self.assertEqual(result['HSP Length/Total Length'].iloc[0], '768/768', msg='Wrong lengths') + self.assertEqual(result['Predicted Phenotype'].iloc[0], 'unknown[ahpC_promoter_size_180bp (F10I)]', + 'Wrong phenotype') + + hit_file = path.join(self.outdir.name, 'pointfinder_ahpC_promoter_size_180bp-F10I.fsa') + records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) + + self.assertEqual(len(records), 1, 'Wrong number of hit records') + + expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) + self.assertEqual(expected_records['ahpC_promoter_size_180bp'].seq.upper(), records['ahpC_promoter_size_180bp'].seq.upper(), "records don't match") + + def testPointfinderEscherichiaColin13insGSuccess(self): + # Insert a G at -13 in the promoter. + # (ampC_promoter_size_53bp / ampC promoter / -13 / - / ins / G) + # This tests the ability to find a single insertion in the nucleotide part of a promoter. + # Verified CGE identifies the ampC-promoter (g.-13_-12insG) mutation. + pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'escherichia_coli') + blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2, + self.blast_out.name) + amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler, + self.pointfinder_drug_table, pointfinder_database, + output_dir=self.outdir.name) + + file = path.join(self.test_data_dir, "ampC_promoter_size_53bp-n13insG.fsa") + files = [file] + amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0) + + pointfinder_results = amr_detection.get_pointfinder_results() + self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result') + + result = pointfinder_results[pointfinder_results['Gene'] == 'ampC_promoter_size_53bp (ins-13G)'] + self.assertEqual(len(result.index), 1, 'Wrong number of results detected') + self.assertEqual(result.index[0], 'ampC_promoter_size_53bp-n13insG', msg='Wrong file') + self.assertEqual(result['Type'].iloc[0], 'nucleotide', msg='Wrong type') + self.assertEqual(result['Position'].iloc[0], -13, msg='Wrong nucleotide position') + self.assertEqual(result['Mutation'].iloc[0], 'ins -> G', msg='Wrong mutation') + self.assertAlmostEqual(result['%Identity'].iloc[0], 99.31, places=2, msg='Wrong pid') + self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.69, places=2, msg='Wrong overlap') # Blast reports 100.69 + self.assertEqual(result['HSP Length/Total Length'].iloc[0], '146/145', msg='Wrong lengths') + self.assertEqual(result['Predicted Phenotype'].iloc[0], 'ampicillin, amoxicillin/clavulanic acid, cefoxitin', + 'Wrong phenotype') + + hit_file = path.join(self.outdir.name, 'pointfinder_ampC_promoter_size_53bp-n13insG.fsa') + records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) + + self.assertEqual(len(records), 1, 'Wrong number of hit records') + + expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) + self.assertEqual(expected_records['ampC_promoter_size_53bp'].seq.upper(), records['ampC_promoter_size_53bp'].seq.upper(), "records don't match") + + def testPointfinderEscherichiaColin16insGTSuccess(self): + # Insert a GT at -16 in the promoter. + # (ampC_promoter_size_53bp / ampC promoter / -16 / - / ins / GT) + # This tests the ability to find a double insertion in the nucleotide part of a promoter. + # Verified CGE identifies the ampC-promoter (g.-16_-15insGT) mutation. + pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'escherichia_coli') + blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2, + self.blast_out.name) + amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler, + self.pointfinder_drug_table, pointfinder_database, + output_dir=self.outdir.name) + + file = path.join(self.test_data_dir, "ampC_promoter_size_53bp-n16insGT.fsa") + files = [file] + amr_detection.run_amr_detection(files, 90, 90, 90, 90,0,0,0,0,0) + + pointfinder_results = amr_detection.get_pointfinder_results() + self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result') + + result = pointfinder_results[pointfinder_results['Gene'] == 'ampC_promoter_size_53bp (ins-16GT)'] + self.assertEqual(len(result.index), 1, 'Wrong number of results detected') + self.assertEqual(result.index[0], 'ampC_promoter_size_53bp-n16insGT', msg='Wrong file') + self.assertEqual(result['Type'].iloc[0], 'nucleotide', msg='Wrong type') + self.assertEqual(result['Position'].iloc[0], -16, msg='Wrong nucleotide position') + self.assertEqual(result['Mutation'].iloc[0], 'ins -> GT', msg='Wrong mutation') + self.assertAlmostEqual(result['%Identity'].iloc[0], 98.639, places=2, msg='Wrong pid') + self.assertAlmostEqual(result['%Overlap'].iloc[0], 101.38, places=2, msg='Wrong overlap') + self.assertEqual(result['HSP Length/Total Length'].iloc[0], '147/145', msg='Wrong lengths') + self.assertEqual(result['Predicted Phenotype'].iloc[0], 'unknown[ampC_promoter_size_53bp (ins-16GT)]', + 'Wrong phenotype') + + hit_file = path.join(self.outdir.name, 'pointfinder_ampC_promoter_size_53bp-n16insGT.fsa') + records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) + + self.assertEqual(len(records), 1, 'Wrong number of hit records') + + expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) + self.assertEqual(expected_records['ampC_promoter_size_53bp'].seq.upper(), records['ampC_promoter_size_53bp'].seq.upper(), "records don't match") + + def testPointfinderNeisseriaGonorrhoeaen57delSuccess(self): + # Delete the A nucleotide at -57 in the promoter. + # (mtrR_promoter_size_66bp / mtrR promoter / -57 / del / A) + # This tests the ability to find a single deletion in the nucleotide part of a promoter. + # Verified CGE identifies the mtrR-promoter (g.-57_-57del) mutation. + pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'neisseria_gonorrhoeae') + blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2, + self.blast_out.name) + amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler, + self.pointfinder_drug_table, pointfinder_database, + output_dir=self.outdir.name) + + file = path.join(self.test_data_dir, "mtrR_promoter_size_66bp-n57del.fsa") + files = [file] + amr_detection.run_amr_detection(files, 90, 90, 90, 90,0,0,0,0,0) + + pointfinder_results = amr_detection.get_pointfinder_results() + self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result') + + result = pointfinder_results[pointfinder_results['Gene'] == 'mtrR_promoter_size_66bp (del-57A)'] + self.assertEqual(len(result.index), 1, 'Wrong number of results detected') + self.assertEqual(result.index[0], 'mtrR_promoter_size_66bp-n57del', msg='Wrong file') + self.assertEqual(result['Type'].iloc[0], 'nucleotide', msg='Wrong type') + self.assertEqual(result['Position'].iloc[0], -57, msg='Wrong nucleotide position') + self.assertEqual(result['Mutation'].iloc[0], 'del -> A', msg='Wrong mutation') + self.assertAlmostEqual(result['%Identity'].iloc[0], 99.86, places=2, msg='Wrong pid') + self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.0, places=2, msg='Wrong overlap') + self.assertEqual(result['HSP Length/Total Length'].iloc[0], '699/699', msg='Wrong lengths') + self.assertEqual(result['Predicted Phenotype'].iloc[0], 'unknown[mtrR_promoter_size_66bp (del-57A)]', + 'Wrong phenotype') + + hit_file = path.join(self.outdir.name, 'pointfinder_mtrR_promoter_size_66bp-n57del.fsa') + records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) + + self.assertEqual(len(records), 1, 'Wrong number of hit records') + + expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) + self.assertEqual(expected_records['mtrR_promoter_size_66bp'].seq.upper(), records['mtrR_promoter_size_66bp'].seq.upper().replace('-', ''), "records don't match") + + def testPointfinderNeisseriaGonorrhoeae78delSuccess(self): + # Delete the GTT/V nucleotides/codon at pos 78 (nucleotide coords) / pos 27 (codon coords) + # Reminder that the Pointfinder database uses 0-base for nucleotides and 1-base for codons. + # DB entry: + # rpsE / rpsE / 78 / - / del / GTT / Spectinomycin / 23183436 + # This tests the ability to find a single codon deletion. + # Verified CGE identifies the rpsE codon deletion (rpsE:p.V27_None79del): + pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'neisseria_gonorrhoeae') + blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2, + self.blast_out.name) + amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler, + self.pointfinder_drug_table, pointfinder_database, + output_dir=self.outdir.name) + + file = path.join(self.test_data_dir, "rpsE-del78gtt.fsa") + files = [file] + amr_detection.run_amr_detection(files, 90, 90, 90, 90,0,0,0,0,0) + + pointfinder_results = amr_detection.get_pointfinder_results() + self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result') + + result = pointfinder_results[pointfinder_results['Gene'] == 'rpsE (del27V)'] + self.assertEqual(len(result.index), 1, 'Wrong number of results detected') + self.assertEqual(result.index[0], 'rpsE-del78gtt', msg='Wrong file') + self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type') + self.assertEqual(result['Position'].iloc[0], 27, msg='Wrong codon position') + self.assertEqual(result['Mutation'].iloc[0], 'GTT -> --- (V -> del)', msg='Wrong mutation') + self.assertAlmostEqual(result['%Identity'].iloc[0], 99.42, places=2, msg='Wrong pid') + self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.0, places=2, msg='Wrong overlap') + self.assertEqual(result['HSP Length/Total Length'].iloc[0], '519/519', msg='Wrong lengths') + self.assertEqual(result['Predicted Phenotype'].iloc[0], 'unknown[rpsE (del27V)]', + 'Wrong phenotype') + + hit_file = path.join(self.outdir.name, 'pointfinder_rpsE-del78gtt.fsa') + records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) + + self.assertEqual(len(records), 1, 'Wrong number of hit records') + + expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) + self.assertEqual(expected_records['rpsE'].seq.upper(), records['rpsE'].seq.upper().replace('-', ''), "records don't match") + + def testPointfinderEnterococcusFaecium466insSSuccess(self): + # Insert a AGC (S) at position 466 (codon coords). + # DB entry: + # pbp5 / pbp5 / 466 / - / ins / S,D / Ampicillin / 25182648,25182648 + # This tests the ability to find a single codon insertion. + # Verified CGE identifies the codon insertion pbp5 (ins466_None467insS). + pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'enterococcus_faecium') + blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2, + self.blast_out.name) + amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler, + self.pointfinder_drug_table, pointfinder_database, + output_dir=self.outdir.name) + + file = path.join(self.test_data_dir, "pbp5-ins465S.fsa") + files = [file] + amr_detection.run_amr_detection(files, 90, 90, 90, 90,0,0,0,0,0) + + pointfinder_results = amr_detection.get_pointfinder_results() + self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result') + + result = pointfinder_results[pointfinder_results['Gene'] == 'pbp5 (ins465S)'] + self.assertEqual(len(result.index), 1, 'Wrong number of results detected') + self.assertEqual(result.index[0], 'pbp5-ins465S', msg='Wrong file') + self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type') + self.assertEqual(result['Position'].iloc[0], 465, msg='Wrong codon position') + self.assertEqual(result['Mutation'].iloc[0], '--- -> AGC (ins -> S)', msg='Wrong mutation') + self.assertAlmostEqual(result['%Identity'].iloc[0], 99.85, places=2, msg='Wrong pid') + self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.15, places=2, msg='Wrong overlap') + self.assertEqual(result['HSP Length/Total Length'].iloc[0], '2040/2037', msg='Wrong lengths') + self.assertEqual(result['Predicted Phenotype'].iloc[0], 'unknown[pbp5 (ins465S)]', + 'Wrong phenotype') + + hit_file = path.join(self.outdir.name, 'pointfinder_pbp5-ins465S.fsa') + records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) + + self.assertEqual(len(records), 1, 'Wrong number of hit records') + + expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) + self.assertEqual(expected_records['pbp5'].seq.upper(), records['pbp5'].seq.upper().replace('-', ''), "records don't match") + + if __name__ == '__main__': unittest.main() diff --git a/staramr/tests/unit/blast/pointfinder/test_PointfinderDatabaseInfo.py b/staramr/tests/unit/blast/pointfinder/test_PointfinderDatabaseInfo.py index 2f62e92f..d215bcc8 100644 --- a/staramr/tests/unit/blast/pointfinder/test_PointfinderDatabaseInfo.py +++ b/staramr/tests/unit/blast/pointfinder/test_PointfinderDatabaseInfo.py @@ -14,7 +14,7 @@ def setUp(self): ['gyrA', 'gyrA', 1, 2, 'GAT', 'D', 'N,H', 'Quinolones', 15848289], ], columns=( - '#Gene_ID', 'Gene_name', 'No of mutations needed', 'Codon_pos', 'Ref_nuc', 'Ref_codon', 'Res_codon', + 'Gene_ID', 'Gene_name', 'No of mutations needed', 'Codon_pos', 'Ref_nuc', 'Ref_codon', 'Res_codon', 'Resistance', 'PMID')) self.database = PointfinderDatabaseInfo.from_pandas_table(pandas_pointfinder_table) diff --git a/staramr/tests/unit/blast/results/pointfinder/codon/test_CodonMutationPosition.py b/staramr/tests/unit/blast/results/pointfinder/codon/test_CodonMutationPosition.py index b699b617..4ceb643f 100644 --- a/staramr/tests/unit/blast/results/pointfinder/codon/test_CodonMutationPosition.py +++ b/staramr/tests/unit/blast/results/pointfinder/codon/test_CodonMutationPosition.py @@ -181,9 +181,9 @@ def testMutationPositionGapStart(self): self.assertEqual(mutation.get_mutation_position(), 1, 'Incorrect mutation start') self.assertEqual(mutation.get_database_amr_gene_codon(), 'ATC', 'Incorrect database codon') self.assertEqual(mutation.get_input_genome_codon(), '-TC', 'Incorrect query codon') - self.assertEqual(mutation.get_database_amr_gene_mutation(), 'I', 'Incorrect database amino acid') - self.assertEqual(mutation.get_input_genome_mutation(), 'del', 'Incorrect query amino acid') - self.assertEqual(mutation.get_mutation_string_short(), 'I1del', 'Incorrect string') + self.assertEqual(mutation.get_database_amr_gene_mutation(), 'del', 'Incorrect database amino acid') # actually reversed, see description in function + self.assertEqual(mutation.get_input_genome_mutation(), 'I', 'Incorrect query amino acid') + self.assertEqual(mutation.get_mutation_string_short(), 'del1I', 'Incorrect string') def testMutationPositionGapMiddle(self): mutation_position = 1 @@ -201,9 +201,9 @@ def testMutationPositionGapMiddle(self): self.assertEqual(mutation.get_mutation_position(), 1, 'Incorrect mutation start') self.assertEqual(mutation.get_database_amr_gene_codon(), 'ATC', 'Incorrect database codon') self.assertEqual(mutation.get_input_genome_codon(), 'A-C', 'Incorrect query codon') - self.assertEqual(mutation.get_database_amr_gene_mutation(), 'I', 'Incorrect database amino acid') - self.assertEqual(mutation.get_input_genome_mutation(), 'del', 'Incorrect query amino acid') - self.assertEqual(mutation.get_mutation_string_short(), 'I1del', 'Incorrect string') + self.assertEqual(mutation.get_database_amr_gene_mutation(), 'del', 'Incorrect database amino acid') # actually reversed, see description in function + self.assertEqual(mutation.get_input_genome_mutation(), 'I', 'Incorrect query amino acid') + self.assertEqual(mutation.get_mutation_string_short(), 'del1I', 'Incorrect string') def testMutationPositionGapEnd(self): mutation_position = 2 @@ -221,9 +221,9 @@ def testMutationPositionGapEnd(self): self.assertEqual(mutation.get_mutation_position(), 1, 'Incorrect mutation start') self.assertEqual(mutation.get_database_amr_gene_codon(), 'ATC', 'Incorrect database codon') self.assertEqual(mutation.get_input_genome_codon(), 'AT-', 'Incorrect query codon') - self.assertEqual(mutation.get_database_amr_gene_mutation(), 'I', 'Incorrect database amino acid') - self.assertEqual(mutation.get_input_genome_mutation(), 'del', 'Incorrect query amino acid') - self.assertEqual(mutation.get_mutation_string_short(), 'I1del', 'Incorrect string') + self.assertEqual(mutation.get_database_amr_gene_mutation(), 'del', 'Incorrect database amino acid') # actually reversed, see description in function + self.assertEqual(mutation.get_input_genome_mutation(), 'I', 'Incorrect query amino acid') + self.assertEqual(mutation.get_mutation_string_short(), 'del1I', 'Incorrect string') def testMutationPositionGapMiddleEnd(self): mutation_position = 2 @@ -241,9 +241,9 @@ def testMutationPositionGapMiddleEnd(self): self.assertEqual(mutation.get_mutation_position(), 1, 'Incorrect mutation start') self.assertEqual(mutation.get_database_amr_gene_codon(), 'ATC', 'Incorrect database codon') self.assertEqual(mutation.get_input_genome_codon(), 'AT-', 'Incorrect query codon') - self.assertEqual(mutation.get_database_amr_gene_mutation(), 'I', 'Incorrect database amino acid') - self.assertEqual(mutation.get_input_genome_mutation(), 'del', 'Incorrect query amino acid') - self.assertEqual(mutation.get_mutation_string_short(), 'I1del', 'Incorrect string') + self.assertEqual(mutation.get_database_amr_gene_mutation(), 'del', 'Incorrect database amino acid') # actually reversed, see description in function + self.assertEqual(mutation.get_input_genome_mutation(), 'I', 'Incorrect query amino acid') + self.assertEqual(mutation.get_mutation_string_short(), 'del1I', 'Incorrect string') def testMutationPositionGapStartMiddleEnd(self): mutation_position = 3 @@ -262,9 +262,9 @@ def testMutationPositionGapStartMiddleEnd(self): self.assertEqual(mutation.get_mutation_position(), 2, 'Incorrect mutation start') self.assertEqual(mutation.get_database_amr_gene_codon(), 'ATC', 'Incorrect database codon') self.assertEqual(mutation.get_input_genome_codon(), '---', 'Incorrect query codon') - self.assertEqual(mutation.get_database_amr_gene_mutation(), 'I', 'Incorrect database amino acid') - self.assertEqual(mutation.get_input_genome_mutation(), 'del', 'Incorrect query amino acid') - self.assertEqual(mutation.get_mutation_string_short(), 'I2del', 'Incorrect string') + self.assertEqual(mutation.get_database_amr_gene_mutation(), 'del', 'Incorrect database amino acid') # actually reversed, see description in function + self.assertEqual(mutation.get_input_genome_mutation(), 'I', 'Incorrect query amino acid') + self.assertEqual(mutation.get_mutation_string_short(), 'del2I', 'Incorrect string') def testMutationPositionGapPreviousCodon(self): mutation_position = 3 @@ -282,9 +282,9 @@ def testMutationPositionGapPreviousCodon(self): self.assertEqual(mutation.get_mutation_position(), 2, 'Incorrect mutation start') self.assertEqual(mutation.get_database_amr_gene_codon(), 'ATC', 'Incorrect database codon') self.assertEqual(mutation.get_input_genome_codon(), '---', 'Incorrect query codon') - self.assertEqual(mutation.get_database_amr_gene_mutation(), 'I', 'Incorrect database amino acid') - self.assertEqual(mutation.get_input_genome_mutation(), 'del', 'Incorrect query amino acid') - self.assertEqual(mutation.get_mutation_string_short(), 'I2del', 'Incorrect string') + self.assertEqual(mutation.get_database_amr_gene_mutation(), 'del', 'Incorrect database amino acid') # actually reversed, see description in function + self.assertEqual(mutation.get_input_genome_mutation(), 'I', 'Incorrect query amino acid') + self.assertEqual(mutation.get_mutation_string_short(), 'del2I', 'Incorrect string') def testMutationPositionGapLargerPreviousCodon(self): mutation_position = 3 @@ -302,9 +302,9 @@ def testMutationPositionGapLargerPreviousCodon(self): self.assertEqual(mutation.get_mutation_position(), 2, 'Incorrect mutation start') self.assertEqual(mutation.get_database_amr_gene_codon(), 'ATC', 'Incorrect database codon') self.assertEqual(mutation.get_input_genome_codon(), '---', 'Incorrect query codon') - self.assertEqual(mutation.get_database_amr_gene_mutation(), 'I', 'Incorrect database amino acid') - self.assertEqual(mutation.get_input_genome_mutation(), 'del', 'Incorrect query amino acid') - self.assertEqual(mutation.get_mutation_string_short(), 'I2del', 'Incorrect string') + self.assertEqual(mutation.get_database_amr_gene_mutation(), 'del', 'Incorrect database amino acid') # actually reversed, see description in function + self.assertEqual(mutation.get_input_genome_mutation(), 'I', 'Incorrect query amino acid') + self.assertEqual(mutation.get_mutation_string_short(), 'del2I', 'Incorrect string') def testMutationPositionGapBefore(self): mutation_position = 3 @@ -322,9 +322,9 @@ def testMutationPositionGapBefore(self): self.assertEqual(mutation.get_mutation_position(), 2, 'Incorrect mutation start') self.assertEqual(mutation.get_database_amr_gene_codon(), 'ATC', 'Incorrect database codon') self.assertEqual(mutation.get_input_genome_codon(), 'A--', 'Incorrect query codon') - self.assertEqual(mutation.get_database_amr_gene_mutation(), 'I', 'Incorrect database amino acid') - self.assertEqual(mutation.get_input_genome_mutation(), 'del', 'Incorrect query amino acid') - self.assertEqual(mutation.get_mutation_string_short(), 'I2del', 'Incorrect string') + self.assertEqual(mutation.get_database_amr_gene_mutation(), 'del', 'Incorrect database amino acid') # actually reversed, see description in function + self.assertEqual(mutation.get_input_genome_mutation(), 'I', 'Incorrect query amino acid') # reversed + self.assertEqual(mutation.get_mutation_string_short(), 'del2I', 'Incorrect string') def testMutationPositionGapBeforeAfter(self): mutation_position = 3 @@ -342,9 +342,9 @@ def testMutationPositionGapBeforeAfter(self): self.assertEqual(mutation.get_mutation_position(), 2, 'Incorrect mutation start') self.assertEqual(mutation.get_database_amr_gene_codon(), 'ATC', 'Incorrect database codon') self.assertEqual(mutation.get_input_genome_codon(), 'A--', 'Incorrect query codon') - self.assertEqual(mutation.get_database_amr_gene_mutation(), 'I', 'Incorrect database amino acid') - self.assertEqual(mutation.get_input_genome_mutation(), 'del', 'Incorrect query amino acid') - self.assertEqual(mutation.get_mutation_string_short(), 'I2del', 'Incorrect string') + self.assertEqual(mutation.get_database_amr_gene_mutation(), 'del', 'Incorrect database amino acid') # actually reversed, see description in function + self.assertEqual(mutation.get_input_genome_mutation(), 'I', 'Incorrect query amino acid') + self.assertEqual(mutation.get_mutation_string_short(), 'del2I', 'Incorrect string') def testMutationPositionGapReferenceStart(self): mutation_position = 0 diff --git a/staramr/tests/unit/blast/results/pointfinder/nucleotide/test_NucleotideMutationPosition.py b/staramr/tests/unit/blast/results/pointfinder/nucleotide/test_NucleotideMutationPosition.py index 54cf5728..defc4360 100644 --- a/staramr/tests/unit/blast/results/pointfinder/nucleotide/test_NucleotideMutationPosition.py +++ b/staramr/tests/unit/blast/results/pointfinder/nucleotide/test_NucleotideMutationPosition.py @@ -65,9 +65,9 @@ def testMutationPositionGapStart(self): amr_gene_start) self.assertEqual(mutation.get_mutation_position(), 1, 'Incorrect nucleotide position') - self.assertEqual(mutation.get_database_amr_gene_mutation(), 'A', 'Incorrect database mutation') - self.assertEqual(mutation.get_input_genome_mutation(), '-', 'Incorrect query mutation') - self.assertEqual(mutation.get_mutation_string_short(), 'A1-', 'Incorrect string') + self.assertEqual(mutation.get_database_amr_gene_mutation(), 'del', 'Incorrect database mutation') + self.assertEqual(mutation.get_input_genome_mutation(), 'A', 'Incorrect query mutation') + self.assertEqual(mutation.get_mutation_string_short(), 'del1A', 'Incorrect string') def testMutationPositionGapEnd(self): mutation_position = 3 @@ -81,9 +81,9 @@ def testMutationPositionGapEnd(self): amr_gene_start) self.assertEqual(mutation.get_mutation_position(), 4, 'Incorrect nucleotide position') - self.assertEqual(mutation.get_database_amr_gene_mutation(), 'G', 'Incorrect database mutation') - self.assertEqual(mutation.get_input_genome_mutation(), '-', 'Incorrect query mutation') - self.assertEqual(mutation.get_mutation_string_short(), 'G4-', 'Incorrect string') + self.assertEqual(mutation.get_database_amr_gene_mutation(), 'del', 'Incorrect database mutation') + self.assertEqual(mutation.get_input_genome_mutation(), 'G', 'Incorrect query mutation') + self.assertEqual(mutation.get_mutation_string_short(), 'del4G', 'Incorrect string') def testMutationPositionGapReference(self): mutation_position = 0 @@ -97,6 +97,6 @@ def testMutationPositionGapReference(self): amr_gene_start) self.assertEqual(mutation.get_mutation_position(), 1, 'Incorrect nucleotide position') - self.assertEqual(mutation.get_database_amr_gene_mutation(), '-', 'Incorrect database mutation') + self.assertEqual(mutation.get_database_amr_gene_mutation(), 'ins', 'Incorrect database mutation') self.assertEqual(mutation.get_input_genome_mutation(), 'A', 'Incorrect query mutation') - self.assertEqual(mutation.get_mutation_string_short(), '-1A', 'Incorrect string') + self.assertEqual(mutation.get_mutation_string_short(), 'ins1A', 'Incorrect string') diff --git a/staramr/tests/unit/blast/results/pointfinder/test_PointfinderHitHSP.py b/staramr/tests/unit/blast/results/pointfinder/test_PointfinderHitHSP.py index 6c95406f..0831be34 100644 --- a/staramr/tests/unit/blast/results/pointfinder/test_PointfinderHitHSP.py +++ b/staramr/tests/unit/blast/results/pointfinder/test_PointfinderHitHSP.py @@ -22,3 +22,42 @@ def testBuildPointfinderHitHSPInvalidQueryCoords(self): blast_record = pd.Series({'sstart': 1, 'send': 10, 'qstart': 10, 'qend': 1, 'sstrand': 'plus'}) self.assertRaises(InvalidPositionException, PointfinderHitHSP, None, blast_record) + + def testGetMutations_MultipleIndels(self): + # This tests correctly handling the following sequence of mutations: + # insertion, insertion, deletion, deletion + + blast_record = pd.Series({ + "qseqid":"pmrA", + "sseqid":"pmrA", + "pident":98.222, + "length":675, + "qstart":1, + "qend":669, + "sstart":1, + "send":669, + "slen":669, + "qlen":669, + "sstrand":"plus", + "sseq":"ATGAAAATTCTGATTGTTGAAGACGATCCCACGCTGTTATTGCAGGGACTGATTCTGGCGGCGCAAACCGAAGGCTACGCGTGCGATGGCGTGACAACCGCGCGGATGGCGGAACAAAGCCTTGAGGCAGGTCATTACAGCCTGGTGGTACTGGATTTAGGGTTACCCGACGAAGATGGACTGCATTTTCTCGCCCGTATCCGGCAGAAAAAATATACCCTGCCGGTACTGATCCTCACCAAAGCTCGCGATACGCTGACCGACAAAATCGCCGGGCTGGATGTCGGTGCCGACGACTATCTGGTGAAGCCTTTTGCGCTGGAAGAGTTACATGCCCGTATCCGCGCCCTGCTACGACGCCATAATAATCAGGGCGAAAGTGAGCTGATTGTTGGCAATCTGACGCTGAACATGGGTCGCCGTCAGGTATGGATGGGCGGTGAAGAGTTGATT---ACGCCCAAAGAATATGCTCTGCTGTCACGGTTAATGCTCAAAGCAGGCAGTCCGGTGCATCGGGAAATTCTCTACAACGACATCTATAACTGGGACAATGAACCCTCGACCAACACCCTGGAAGTGCATATCCACAAT---CGCGACAAAGTGGGCAAAGCCCGTATCCGCACCGTGCGCGGCTTTGGCTATATGCTGGTCGCGAATGAGGAAAACTAA", + "qseq":"ATGAAAATTCTGATTGTTGAAGACGAT---ACGCTGTTATTGCAGGGACTGATTCTGGCGGCGCAAACCGAAGGCTACGCGTGCGATGGCGTGACAACCGCGCGGATGGCGGAACAAAGCCTTGAGGCAGGTCATTACAGCCTGGTGGTACTGGATTTAGGGTTACCCGACGAAGATGGACTGCATTTTCTCGCCCGTATCCGGCAGAAAAAATATACCCTGCCGGTACTGATCCTCACC---GCTCGCGATACGCTGACCGACAAAATCGCCGGGCTGGATGTCGGTGCCGACGACTATCTGGTGAAGCCTTTTGCGCTGGAAGAGTTACATGCCCGTATCCGCGCCCTGCTACGACGCCATAATAATCAGGGCGAAAGTGAGCTGATTGTTGGCAATCTGACGCTGAACATGGGTCGCCGTCAGGTATGGATGGGCGGTGAAGAGTTGATTCTGACGCCCAAAGAATATGCTCTGCTGTCACGGTTAATGCTCAAAGCAGGCAGTCCGGTGCATCGGGAAATTCTCTACAACGACATCTATAACTGGGACAATGAACCCTCGACCAACACCCTGGAAGTGCATATCCACAATCTGCGCGACAAAGTGGGCAAAGCCCGTATCCGCACCGTGCGCGGCTTTGGCTATATGCTGGTCGCGAATGAGGAAAACTAA", + "plength":100.89686098654708}) + + hit = PointfinderHitHSP("staramr/tests/unit/data/pmrA-multi-indel.fsa", blast_record) + mutations = hit.get_mutations() + + self.assertEqual(mutations[0]._codon_start, 9, msg='Wrong codon position.') + self.assertEqual(mutations[0]._database_amr_gene_codon, "---", msg='Wrong AMR codon.') + self.assertEqual(mutations[0]._input_genome_codon, "CCC", msg='Wrong input codon.') + + self.assertEqual(mutations[1]._codon_start, 79, msg='Wrong codon position.') + self.assertEqual(mutations[1]._database_amr_gene_codon, "---", msg='Wrong AMR codon.') + self.assertEqual(mutations[1]._input_genome_codon, "AAA", msg='Wrong input codon.') + + self.assertEqual(mutations[2]._codon_start, 150, msg='Wrong codon position.') + self.assertEqual(mutations[2]._database_amr_gene_codon, "CTG", msg='Wrong AMR codon.') + self.assertEqual(mutations[2]._input_genome_codon, "---", msg='Wrong input codon.') + + self.assertEqual(mutations[3]._codon_start, 197, msg='Wrong codon position.') + self.assertEqual(mutations[3]._database_amr_gene_codon, "CTG", msg='Wrong AMR codon.') + self.assertEqual(mutations[3]._input_genome_codon, "---", msg='Wrong input codon.')