Merge development

phac-nml · Oct 18, 2022 · bdb5c3d · bdb5c3d
2 parents f0b29eb + 4b2d549
commit bdb5c3d
Show file tree

Hide file tree

Showing 31 changed files with 905 additions and 148 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,11 @@
+# Version 0.9.0
+
+* Updates to PointFinder database handling
+    * Adds the ability to handle promoters (regions with both promoter nucleotide information and non-promoter codon information)
+    * Adds handling of insertions and deletions in nucleotide and codon sequence
+    * Updates list of supported PointFinder species to `salmonella`, `campylobacter`, `enterococcus_faecalis`, `enterococcus_faecium`, `escherichia_coli`, `helicobacter_pylori`.
+* Switch name `e.coli` to `escherichia_coli` in PointFinder gene-drug key to match organism name in PointFinder database.
+
 # Version 0.8.0
 
 * Fixed issue when using older version of pandas (#136) (0.8.0.dev0).

diff --git a/README.md b/README.md
@@ -241,12 +241,12 @@ staramr db restore-default
 By default, the ResFinder/PointFinder/PlasmidFinder genes listed in [genes_to_exclude.tsv][] will be excluded from the final results. To pass a custom list of genes the option `--exclude-genes-file` can be used, where the file specified will contains a list of the sequence ids (one per line) from the ResFinder/PointFinder/PlasmidFinder databases. For example:
 
 ```
-#gene_id
+gene_id
 aac(6')-Iaa_1_NC_003197
 ColpVC_1__JX133088
 ```
 
-Please make sure to include `#gene_id` in the first line. The default exclusion list can also be disabled with `--no-exclude-genes`.
+Please make sure to include `gene_id` in the first line. The default exclusion list can also be disabled with `--no-exclude-genes`.
 
 # Output
 

diff --git a/staramr/__init__.py b/staramr/__init__.py
@@ -1 +1 @@
-__version__ = '0.8.0'
+__version__ = '0.9.0.dev0'
diff --git a/staramr/blast/pointfinder/PointfinderBlastDatabase.py b/staramr/blast/pointfinder/PointfinderBlastDatabase.py
@@ -71,6 +71,28 @@ def get_resistance_nucleotides(self, gene, nucleotide_mutations):
         """
         return self._pointfinder_info.get_resistance_nucleotides(gene, nucleotide_mutations)
 
+    def get_resistance_promoter(self, gene, nucleotide_mutations):
+        """
+        Gets a list of resistance nucleotides and codons located within the promoter, derived from the list
+        of nucleotide mutations.
+        :param gene: The name of the gene.
+        :param nucleotide_mutations: The positions of the nucleotide mutations.
+        :return: The resistance positions (both nucleotide and codon positions).
+        """
+
+        # Get the mutations in the nucleotide (non-gene) part:
+        # Filter the list for negative coordinate positions.
+        nucleotide_part = list(filter(lambda x: (x._nucleotide_position_amr_gene < 0), nucleotide_mutations))
+        resistance_nucleotides = self._pointfinder_info.get_resistance_nucleotides(gene, nucleotide_part)
+
+        # Get the mutations in the coding part:
+        # Filter the list for non-negative coordinate positions.
+        codon_part = list(filter(lambda x: (x._nucleotide_position_amr_gene >= 0), nucleotide_mutations))
+        resistance_codons = self._pointfinder_info.get_resistance_codons(gene, codon_part)
+
+        # Combine and return the results:
+        return resistance_nucleotides + resistance_codons
+
     def get_organism(self):
         """
         Gets the particular organism of this database.
@@ -87,7 +109,8 @@ def get_available_organisms(cls):
         A Class Method to get a list of organisms that are currently supported by staramr.
         :return: The list of organisms currently supported by staramr.
         """
-        return ['salmonella', 'campylobacter']
+        return ['salmonella', 'campylobacter', 'enterococcus_faecalis', 'enterococcus_faecium',
+                'escherichia_coli', 'helicobacter_pylori']
 
     @classmethod
     def get_organisms(cls, database_dir):

diff --git a/staramr/blast/pointfinder/PointfinderDatabaseInfo.py b/staramr/blast/pointfinder/PointfinderDatabaseInfo.py
@@ -2,6 +2,9 @@
 from os import path
 
 import pandas as pd
+import Bio.Seq
+from staramr.blast.results.pointfinder.codon.CodonMutationPosition import CodonMutationPosition
+from staramr.blast.results.pointfinder.codon.CodonInsertionPosition import CodonInsertionPosition
 
 from staramr.exceptions.GenotypePhenotypeMatchException import GenotypePhenotypeMatchException
 
@@ -33,7 +36,15 @@ def from_file(cls, file):
         :param file: The file containing drug resistance mutations.
         :return: A new PointfinderDatabaseInfo.
         """
-        pointfinder_info = pd.read_csv(file, sep='\t', index_col=False)
+
+        with open(file) as f:
+            line = f.readline()
+
+        line = line.lstrip("#")
+        column_names = line.split()
+
+        pointfinder_info = pd.read_csv(file, sep='\t', index_col=False, comment='#', header=None, names=column_names)
+
         return cls(pointfinder_info, file)
 
     @classmethod
@@ -44,6 +55,18 @@ def from_pandas_table(cls, database_info_dataframe):
         :return: A new PointfinderDatabaseInfo.
         """
         return cls(database_info_dataframe)
+
+    @staticmethod
+    def to_codons(regex_match):
+        # Sometimes, the regex will match a string with a comma and return multiple matches.
+        # Ex: TTCATGGAC,TTC
+        # We need to make sure we handle these commas and multiple matches.
+        entries = regex_match.string.split(",")
+
+        for i in range(0, len(entries)):
+            entries[i] = Bio.Seq.translate(entries[i], table='Standard').upper()
+
+        return ",".join(entries)
 
     def _resistance_table_hacks(self, table):
         """
@@ -55,15 +78,72 @@ def _resistance_table_hacks(self, table):
         if self._file and 'salmonella' in str(self._file) and path.exists(
                 path.join(path.dirname(self._file), '16S_rrsD.fsa')):
             logger.debug("Replacing [16S] with [16S_rrsD] for pointfinder organism [salmonella]")
-            table[['#Gene_ID']] = table[['#Gene_ID']].replace('16S', '16S_rrsD')
+            table[['Gene_ID']] = table[['Gene_ID']].replace('16S', '16S_rrsD')
+
+        # We need to fix table entries where codon deletions are listed,
+        # but no reference nucleotides are listed in the corresponding entry.
+        # This will mean that this specific entry (for some reason)
+        # is using 0-based nucleotide coordinates and nucleotides for the Res_codon
+        # instead of 1-based codon coordinates and codons for the Res_codon.
+        # However, we need make an exception for entries where "promoter", "16S", or "23S"
+        # are found in the 'Gene_ID'.
+
+        table.loc[(table['Ref_nuc'] == '-') & (table['Ref_codon'] == 'del')
+        & (table['Gene_ID'].str.contains('promoter') == False)
+        & (table['Gene_ID'].str.contains('16S') == False)
+        & (table['Gene_ID'].str.contains('23S') == False), "Codon_pos"] += 3
+        # We increment by 3 because the table is using base-0 coordinates
+        # and we have base-1 coordindates for everything codon-related.
+        # However, what's listed is nucleotide coordinates for the codon
+        # mutation, so we actually need to bump it by 3 nucleotides.
+
+        # Furthermore, we need to convert 'Res_codon' entries related to the above problem
+        # (codon deletions) from nucleotides into codons. We choose to convert nucleotides into
+        # codons instead of the other way around so that can more accurately compare observed mutations
+        # to the table entries.
+        table.loc[(table['Ref_nuc'] == '-') & (table['Ref_codon'] == 'del')
+        & (table['Gene_ID'].str.contains('promoter') == False)
+        & (table['Gene_ID'].str.contains('16S') == False)
+        & (table['Gene_ID'].str.contains('23S') == False), "Res_codon"] = table.loc[(table['Ref_nuc'] == '-') & (table['Ref_codon'] == 'del')
+        & (table['Gene_ID'].str.contains('promoter') == False)
+        & (table['Gene_ID'].str.contains('16S') == False)
+        & (table['Gene_ID'].str.contains('23S') == False), "Res_codon"].str.replace('[A-Z,]+', self.to_codons, regex=True)
 
     def _get_resistance_codon_match(self, gene, codon_mutation):
         table = self._pointfinder_info
 
-        matches = table[(table['#Gene_ID'] == gene)
-                        & (table['Codon_pos'] == codon_mutation.get_mutation_position())
-                        & (table['Ref_codon'] == codon_mutation.get_database_amr_gene_mutation())
-                        & (table['Res_codon'].str.contains(codon_mutation.get_input_genome_mutation(), regex=False))]
+        # We need to handle codon deletions as a special case:
+        if(type(codon_mutation) is CodonMutationPosition
+        and codon_mutation.get_input_genome_amino_acid() == 'del'):
+            matches = table[(table['Gene_ID'] == gene)
+                    # Codon deletions are listed using nucleotide coordinates
+                    & (table['Codon_pos'] == codon_mutation.get_mutation_position() * 3)
+                        # Such coords are denoted in nucleotide coordinates in the reference table for some reason,
+                        # so we need to convert to nucleotide coordinates before making the comparison.
+                    & (table['Ref_codon'] == codon_mutation.get_database_amr_gene_mutation())
+                    & (table['Res_codon'].str.contains(codon_mutation.get_input_genome_mutation(), regex=False))]
+
+        # We need to handle codon insertions as a special case:
+        # Pointfinder mis-reports the position of codon insertions. For example:
+        # ref:     ACG --- ACG
+        # query:   ACG GGG ACG
+        # ref_pos: 1       2
+        # Pointfinder is incorrectly reporting the insertion as 2_3insG instead of the correct 1_2insG,
+        # which is to say, the insertion happens between reference codon coordinates 1 and 2.
+        # We need to shift by 1 in our interpretation.
+        elif type(codon_mutation) is CodonInsertionPosition:
+            matches = table[(table['Gene_ID'] == gene)
+                    # Codon inerstions need to be shifted by 1:
+                    & (table['Codon_pos'] == codon_mutation.get_mutation_position() + 1)
+                    & (table['Ref_codon'] == codon_mutation.get_database_amr_gene_mutation())
+                    & (table['Res_codon'].str.contains(codon_mutation.get_input_genome_mutation(), regex=False))]
+
+        # Normal cases:
+        else:
+            matches = table[(table['Gene_ID'] == gene)
+                            & (table['Codon_pos'] == codon_mutation.get_mutation_position())
+                            & (table['Ref_codon'] == codon_mutation.get_database_amr_gene_mutation())
+                            & (table['Res_codon'].str.contains(codon_mutation.get_input_genome_mutation(), regex=False))]
 
         if len(matches.index) > 1:
             # If more then one match, try to match Res_codon exactly to subselect
@@ -72,10 +152,7 @@ def _get_resistance_codon_match(self, gene, codon_mutation):
             if len(matches_subset.index) >= 1:
                 matches = matches_subset
 
-        if len(matches.index) > 1:
-                raise GenotypePhenotypeMatchException("Error, multiple matches for gene=" + str(gene) + ", codon_mutation=" + str(codon_mutation))
-        else:
-            return matches
+        return matches
 
     def _get_resistance_nucleotide_match(self, gene, nucleotide_mutations):
         return self._get_resistance_codon_match(gene, nucleotide_mutations)

diff --git a/staramr/blast/results/pointfinder/BlastResultsParserPointfinder.py b/staramr/blast/results/pointfinder/BlastResultsParserPointfinder.py
@@ -4,6 +4,7 @@
 from staramr.blast.results.BlastResultsParser import BlastResultsParser
 from staramr.blast.results.pointfinder.PointfinderHitHSP import PointfinderHitHSP
 from staramr.blast.results.pointfinder.nucleotide.PointfinderHitHSPRNA import PointfinderHitHSPRNA
+from staramr.blast.results.pointfinder.nucleotide.PointfinderHitHSPPromoter import PointfinderHitHSPPromoter
 
 logger = logging.getLogger('BlastResultsParserPointfinder')
 
@@ -25,6 +26,7 @@ class BlastResultsParserPointfinder(BlastResultsParser):
     Contig
     Start
     End
+    Pointfinder Position
     '''.strip().split('\n')]
     SORT_COLUMNS = ['Isolate ID', 'Gene']
 
@@ -47,11 +49,13 @@ def _create_hit(self, file, database_name, blast_record):
         logger.debug("database_name=%s", database_name)
         if (database_name == '16S_rrsD') or (database_name == '23S'):
             return PointfinderHitHSPRNA(file, blast_record)
+        elif ('promoter' in database_name):
+            return PointfinderHitHSPPromoter(file, blast_record, database_name)            
         else:
             return PointfinderHitHSP(file, blast_record)
 
     def _get_result(self, hit, db_mutation):
-        return [hit.get_genome_id(),
+        result = [hit.get_genome_id(),
                 hit.get_amr_gene_id() + " (" + db_mutation.get_mutation_string_short() + ")",
                 db_mutation.get_type(),
                 db_mutation.get_mutation_position(),
@@ -61,9 +65,12 @@ def _get_result(self, hit, db_mutation):
                 str(hit.get_hsp_length()) + "/" + str(hit.get_amr_gene_length()),
                 hit.get_genome_contig_id(),
                 hit.get_genome_contig_start(),
-                hit.get_genome_contig_end()
+                hit.get_genome_contig_end(),
+                db_mutation.get_pointfinder_mutation_string()
                 ]
 
+        return result
+
     def _get_result_rows(self, hit, database_name):
         database_mutations = hit.get_mutations()
 
@@ -75,6 +82,8 @@ def _get_result_rows(self, hit, database_name):
 
         if (database_name == '16S_rrsD') or (database_name == '23S'):
             database_resistance_mutations = self._blast_database.get_resistance_nucleotides(gene, database_mutations)
+        elif ('promoter' in database_name):
+            database_resistance_mutations = self._blast_database.get_resistance_promoter(gene, database_mutations)
         else:
             database_resistance_mutations = self._blast_database.get_resistance_codons(gene, database_mutations)
         logger.debug("database_resistance_mutations=%s", database_resistance_mutations)

diff --git a/staramr/blast/results/pointfinder/BlastResultsParserPointfinderResistance.py b/staramr/blast/results/pointfinder/BlastResultsParserPointfinderResistance.py
@@ -1,6 +1,7 @@
 import logging
 
 from staramr.blast.results.pointfinder.BlastResultsParserPointfinder import BlastResultsParserPointfinder
+from staramr.blast.results.pointfinder.codon.CodonInsertionPosition import CodonInsertionPosition
 
 """
 Class used to parse out BLAST results for PointFinder, including phenotyhpes/resistances.
@@ -23,6 +24,7 @@ class BlastResultsParserPointfinderResistance(BlastResultsParserPointfinder):
     Contig
     Start
     End
+    Pointfinder Position
     '''.strip().split('\n')]
 
     def __init__(self, file_blast_map, arg_drug_table, blast_database, pid_threshold, plength_threshold,
@@ -43,14 +45,21 @@ def __init__(self, file_blast_map, arg_drug_table, blast_database, pid_threshold
         self._arg_drug_table = arg_drug_table
 
     def _get_result(self, hit, db_mutation):
+
+        # We need to correct for Pointfinder codon insertions being off by 1.
+        if type(db_mutation) is CodonInsertionPosition:
+            mutation_position = db_mutation.get_mutation_position() + 1
+        else:
+            mutation_position = db_mutation.get_mutation_position()
+
         drug = self._arg_drug_table.get_drug(self._blast_database.get_organism(), hit.get_amr_gene_id(),
-                                             db_mutation.get_mutation_position())
+                                             mutation_position)
         gene_name = hit.get_amr_gene_id() + " (" + db_mutation.get_mutation_string_short() + ")"
 
         if drug is None:
             drug = 'unknown[' + gene_name + ']'
 
-        return [hit.get_genome_id(),
+        result = [hit.get_genome_id(),
                 gene_name,
                 drug,
                 db_mutation.get_type(),
@@ -61,5 +70,8 @@ def _get_result(self, hit, db_mutation):
                 str(hit.get_hsp_length()) + "/" + str(hit.get_amr_gene_length()),
                 hit.get_genome_contig_id(),
                 hit.get_genome_contig_start(),
-                hit.get_genome_contig_end()
+                hit.get_genome_contig_end(),
+                db_mutation.get_pointfinder_mutation_string()
                 ]
+
+        return result
diff --git a/staramr/blast/results/pointfinder/MutationPosition.py b/staramr/blast/results/pointfinder/MutationPosition.py
@@ -25,6 +25,11 @@ def get_nucleotide_position(self):
         """
         return self._nucleotide_position_amr_gene
 
+    def get_pointfinder_mutation_string(self):
+        # This function exists in order to show mutations in Pointfinder database coordinates,
+        # which are off-by-one in the case of codon insertions.
+        return self.get_mutation_string_short()
+
     def get_mutation_string_short(self):
         return self.get_database_amr_gene_mutation() + str(
             self.get_mutation_position()) + self.get_input_genome_mutation()

diff --git a/staramr/blast/results/pointfinder/PointfinderHitHSP.py b/staramr/blast/results/pointfinder/PointfinderHitHSP.py
@@ -2,6 +2,9 @@
 
 from staramr.blast.results.AMRHitHSP import AMRHitHSP
 from staramr.blast.results.pointfinder.codon.CodonMutationPosition import CodonMutationPosition
+from staramr.blast.results.pointfinder.codon.CodonInsertionPosition import CodonInsertionPosition
+
+import pandas as pd
 
 logger = logging.getLogger('PointfinderHitHSP')
 
@@ -27,22 +30,35 @@ def get_amr_gene_name(self):
         """
         return self._blast_record['qseqid']
 
-    def _get_match_positions(self):
-        amr_seq = self.get_amr_gene_seq()
-        genome_seq = self.get_genome_contig_hsp_seq()
-
-        return [i for i, (x, y) in enumerate(zip(amr_seq, genome_seq)) if x != y]
-
     def _get_mutation_positions(self, start):
+        mutation_positions = []
         mutation_positions_filtered = []
         codon_starts = []
 
         amr_seq = self.get_amr_gene_seq()
         genome_seq = self.get_genome_contig_hsp_seq()
 
-        # Only return mutation position objects with unique codon start positions
-        mutation_positions = [CodonMutationPosition(i, amr_seq, genome_seq, start) for i in self._get_match_positions()]
+        amr_pos = 0
+
+        for i in range(len(amr_seq)):
 
+            # Insertion: "-" in the reference:
+            if amr_seq[i] == "-":
+                # left side
+                offset = i - amr_pos  # accounting for string index and reference index possibly being different
+                mutation = CodonInsertionPosition(i, amr_seq, genome_seq, start, offset=offset)
+                mutation_positions.append(mutation)
+            # Mismatch or Deletion:
+            elif (amr_seq[i] != genome_seq[i]):
+                offset = i - amr_pos
+                mutation = CodonMutationPosition(i, amr_seq, genome_seq, start, offset=offset)
+                mutation_positions.append(mutation)
+                amr_pos += 1
+            # Match:
+            else:
+                amr_pos += 1
+
+        # Only return mutation position objects with unique codon start positions
         for m in mutation_positions:
             if m._codon_start not in codon_starts:
                 codon_starts.append(m._codon_start)