Skip to content

Commit

Permalink
Merge development
Browse files Browse the repository at this point in the history
  • Loading branch information
apetkau committed Oct 18, 2022
2 parents f0b29eb + 4b2d549 commit bdb5c3d
Show file tree
Hide file tree
Showing 31 changed files with 905 additions and 148 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# Version 0.9.0

* Updates to PointFinder database handling
* Adds the ability to handle promoters (regions with both promoter nucleotide information and non-promoter codon information)
* Adds handling of insertions and deletions in nucleotide and codon sequence
* Updates list of supported PointFinder species to `salmonella`, `campylobacter`, `enterococcus_faecalis`, `enterococcus_faecium`, `escherichia_coli`, `helicobacter_pylori`.
* Switch name `e.coli` to `escherichia_coli` in PointFinder gene-drug key to match organism name in PointFinder database.

# Version 0.8.0

* Fixed issue when using older version of pandas (#136) (0.8.0.dev0).
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -241,12 +241,12 @@ staramr db restore-default
By default, the ResFinder/PointFinder/PlasmidFinder genes listed in [genes_to_exclude.tsv][] will be excluded from the final results. To pass a custom list of genes the option `--exclude-genes-file` can be used, where the file specified will contains a list of the sequence ids (one per line) from the ResFinder/PointFinder/PlasmidFinder databases. For example:

```
#gene_id
gene_id
aac(6')-Iaa_1_NC_003197
ColpVC_1__JX133088
```

Please make sure to include `#gene_id` in the first line. The default exclusion list can also be disabled with `--no-exclude-genes`.
Please make sure to include `gene_id` in the first line. The default exclusion list can also be disabled with `--no-exclude-genes`.

# Output

Expand Down
2 changes: 1 addition & 1 deletion staramr/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.8.0'
__version__ = '0.9.0.dev0'
25 changes: 24 additions & 1 deletion staramr/blast/pointfinder/PointfinderBlastDatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,28 @@ def get_resistance_nucleotides(self, gene, nucleotide_mutations):
"""
return self._pointfinder_info.get_resistance_nucleotides(gene, nucleotide_mutations)

def get_resistance_promoter(self, gene, nucleotide_mutations):
"""
Gets a list of resistance nucleotides and codons located within the promoter, derived from the list
of nucleotide mutations.
:param gene: The name of the gene.
:param nucleotide_mutations: The positions of the nucleotide mutations.
:return: The resistance positions (both nucleotide and codon positions).
"""

# Get the mutations in the nucleotide (non-gene) part:
# Filter the list for negative coordinate positions.
nucleotide_part = list(filter(lambda x: (x._nucleotide_position_amr_gene < 0), nucleotide_mutations))
resistance_nucleotides = self._pointfinder_info.get_resistance_nucleotides(gene, nucleotide_part)

# Get the mutations in the coding part:
# Filter the list for non-negative coordinate positions.
codon_part = list(filter(lambda x: (x._nucleotide_position_amr_gene >= 0), nucleotide_mutations))
resistance_codons = self._pointfinder_info.get_resistance_codons(gene, codon_part)

# Combine and return the results:
return resistance_nucleotides + resistance_codons

def get_organism(self):
"""
Gets the particular organism of this database.
Expand All @@ -87,7 +109,8 @@ def get_available_organisms(cls):
A Class Method to get a list of organisms that are currently supported by staramr.
:return: The list of organisms currently supported by staramr.
"""
return ['salmonella', 'campylobacter']
return ['salmonella', 'campylobacter', 'enterococcus_faecalis', 'enterococcus_faecium',
'escherichia_coli', 'helicobacter_pylori']

@classmethod
def get_organisms(cls, database_dir):
Expand Down
97 changes: 87 additions & 10 deletions staramr/blast/pointfinder/PointfinderDatabaseInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
from os import path

import pandas as pd
import Bio.Seq
from staramr.blast.results.pointfinder.codon.CodonMutationPosition import CodonMutationPosition
from staramr.blast.results.pointfinder.codon.CodonInsertionPosition import CodonInsertionPosition

from staramr.exceptions.GenotypePhenotypeMatchException import GenotypePhenotypeMatchException

Expand Down Expand Up @@ -33,7 +36,15 @@ def from_file(cls, file):
:param file: The file containing drug resistance mutations.
:return: A new PointfinderDatabaseInfo.
"""
pointfinder_info = pd.read_csv(file, sep='\t', index_col=False)

with open(file) as f:
line = f.readline()

line = line.lstrip("#")
column_names = line.split()

pointfinder_info = pd.read_csv(file, sep='\t', index_col=False, comment='#', header=None, names=column_names)

return cls(pointfinder_info, file)

@classmethod
Expand All @@ -44,6 +55,18 @@ def from_pandas_table(cls, database_info_dataframe):
:return: A new PointfinderDatabaseInfo.
"""
return cls(database_info_dataframe)

@staticmethod
def to_codons(regex_match):
# Sometimes, the regex will match a string with a comma and return multiple matches.
# Ex: TTCATGGAC,TTC
# We need to make sure we handle these commas and multiple matches.
entries = regex_match.string.split(",")

for i in range(0, len(entries)):
entries[i] = Bio.Seq.translate(entries[i], table='Standard').upper()

return ",".join(entries)

def _resistance_table_hacks(self, table):
"""
Expand All @@ -55,15 +78,72 @@ def _resistance_table_hacks(self, table):
if self._file and 'salmonella' in str(self._file) and path.exists(
path.join(path.dirname(self._file), '16S_rrsD.fsa')):
logger.debug("Replacing [16S] with [16S_rrsD] for pointfinder organism [salmonella]")
table[['#Gene_ID']] = table[['#Gene_ID']].replace('16S', '16S_rrsD')
table[['Gene_ID']] = table[['Gene_ID']].replace('16S', '16S_rrsD')

# We need to fix table entries where codon deletions are listed,
# but no reference nucleotides are listed in the corresponding entry.
# This will mean that this specific entry (for some reason)
# is using 0-based nucleotide coordinates and nucleotides for the Res_codon
# instead of 1-based codon coordinates and codons for the Res_codon.
# However, we need make an exception for entries where "promoter", "16S", or "23S"
# are found in the 'Gene_ID'.

table.loc[(table['Ref_nuc'] == '-') & (table['Ref_codon'] == 'del')
& (table['Gene_ID'].str.contains('promoter') == False)
& (table['Gene_ID'].str.contains('16S') == False)
& (table['Gene_ID'].str.contains('23S') == False), "Codon_pos"] += 3
# We increment by 3 because the table is using base-0 coordinates
# and we have base-1 coordindates for everything codon-related.
# However, what's listed is nucleotide coordinates for the codon
# mutation, so we actually need to bump it by 3 nucleotides.

# Furthermore, we need to convert 'Res_codon' entries related to the above problem
# (codon deletions) from nucleotides into codons. We choose to convert nucleotides into
# codons instead of the other way around so that can more accurately compare observed mutations
# to the table entries.
table.loc[(table['Ref_nuc'] == '-') & (table['Ref_codon'] == 'del')
& (table['Gene_ID'].str.contains('promoter') == False)
& (table['Gene_ID'].str.contains('16S') == False)
& (table['Gene_ID'].str.contains('23S') == False), "Res_codon"] = table.loc[(table['Ref_nuc'] == '-') & (table['Ref_codon'] == 'del')
& (table['Gene_ID'].str.contains('promoter') == False)
& (table['Gene_ID'].str.contains('16S') == False)
& (table['Gene_ID'].str.contains('23S') == False), "Res_codon"].str.replace('[A-Z,]+', self.to_codons, regex=True)

def _get_resistance_codon_match(self, gene, codon_mutation):
table = self._pointfinder_info

matches = table[(table['#Gene_ID'] == gene)
& (table['Codon_pos'] == codon_mutation.get_mutation_position())
& (table['Ref_codon'] == codon_mutation.get_database_amr_gene_mutation())
& (table['Res_codon'].str.contains(codon_mutation.get_input_genome_mutation(), regex=False))]
# We need to handle codon deletions as a special case:
if(type(codon_mutation) is CodonMutationPosition
and codon_mutation.get_input_genome_amino_acid() == 'del'):
matches = table[(table['Gene_ID'] == gene)
# Codon deletions are listed using nucleotide coordinates
& (table['Codon_pos'] == codon_mutation.get_mutation_position() * 3)
# Such coords are denoted in nucleotide coordinates in the reference table for some reason,
# so we need to convert to nucleotide coordinates before making the comparison.
& (table['Ref_codon'] == codon_mutation.get_database_amr_gene_mutation())
& (table['Res_codon'].str.contains(codon_mutation.get_input_genome_mutation(), regex=False))]

# We need to handle codon insertions as a special case:
# Pointfinder mis-reports the position of codon insertions. For example:
# ref: ACG --- ACG
# query: ACG GGG ACG
# ref_pos: 1 2
# Pointfinder is incorrectly reporting the insertion as 2_3insG instead of the correct 1_2insG,
# which is to say, the insertion happens between reference codon coordinates 1 and 2.
# We need to shift by 1 in our interpretation.
elif type(codon_mutation) is CodonInsertionPosition:
matches = table[(table['Gene_ID'] == gene)
# Codon inerstions need to be shifted by 1:
& (table['Codon_pos'] == codon_mutation.get_mutation_position() + 1)
& (table['Ref_codon'] == codon_mutation.get_database_amr_gene_mutation())
& (table['Res_codon'].str.contains(codon_mutation.get_input_genome_mutation(), regex=False))]

# Normal cases:
else:
matches = table[(table['Gene_ID'] == gene)
& (table['Codon_pos'] == codon_mutation.get_mutation_position())
& (table['Ref_codon'] == codon_mutation.get_database_amr_gene_mutation())
& (table['Res_codon'].str.contains(codon_mutation.get_input_genome_mutation(), regex=False))]

if len(matches.index) > 1:
# If more then one match, try to match Res_codon exactly to subselect
Expand All @@ -72,10 +152,7 @@ def _get_resistance_codon_match(self, gene, codon_mutation):
if len(matches_subset.index) >= 1:
matches = matches_subset

if len(matches.index) > 1:
raise GenotypePhenotypeMatchException("Error, multiple matches for gene=" + str(gene) + ", codon_mutation=" + str(codon_mutation))
else:
return matches
return matches

def _get_resistance_nucleotide_match(self, gene, nucleotide_mutations):
return self._get_resistance_codon_match(gene, nucleotide_mutations)
Expand Down
13 changes: 11 additions & 2 deletions staramr/blast/results/pointfinder/BlastResultsParserPointfinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from staramr.blast.results.BlastResultsParser import BlastResultsParser
from staramr.blast.results.pointfinder.PointfinderHitHSP import PointfinderHitHSP
from staramr.blast.results.pointfinder.nucleotide.PointfinderHitHSPRNA import PointfinderHitHSPRNA
from staramr.blast.results.pointfinder.nucleotide.PointfinderHitHSPPromoter import PointfinderHitHSPPromoter

logger = logging.getLogger('BlastResultsParserPointfinder')

Expand All @@ -25,6 +26,7 @@ class BlastResultsParserPointfinder(BlastResultsParser):
Contig
Start
End
Pointfinder Position
'''.strip().split('\n')]
SORT_COLUMNS = ['Isolate ID', 'Gene']

Expand All @@ -47,11 +49,13 @@ def _create_hit(self, file, database_name, blast_record):
logger.debug("database_name=%s", database_name)
if (database_name == '16S_rrsD') or (database_name == '23S'):
return PointfinderHitHSPRNA(file, blast_record)
elif ('promoter' in database_name):
return PointfinderHitHSPPromoter(file, blast_record, database_name)
else:
return PointfinderHitHSP(file, blast_record)

def _get_result(self, hit, db_mutation):
return [hit.get_genome_id(),
result = [hit.get_genome_id(),
hit.get_amr_gene_id() + " (" + db_mutation.get_mutation_string_short() + ")",
db_mutation.get_type(),
db_mutation.get_mutation_position(),
Expand All @@ -61,9 +65,12 @@ def _get_result(self, hit, db_mutation):
str(hit.get_hsp_length()) + "/" + str(hit.get_amr_gene_length()),
hit.get_genome_contig_id(),
hit.get_genome_contig_start(),
hit.get_genome_contig_end()
hit.get_genome_contig_end(),
db_mutation.get_pointfinder_mutation_string()
]

return result

def _get_result_rows(self, hit, database_name):
database_mutations = hit.get_mutations()

Expand All @@ -75,6 +82,8 @@ def _get_result_rows(self, hit, database_name):

if (database_name == '16S_rrsD') or (database_name == '23S'):
database_resistance_mutations = self._blast_database.get_resistance_nucleotides(gene, database_mutations)
elif ('promoter' in database_name):
database_resistance_mutations = self._blast_database.get_resistance_promoter(gene, database_mutations)
else:
database_resistance_mutations = self._blast_database.get_resistance_codons(gene, database_mutations)
logger.debug("database_resistance_mutations=%s", database_resistance_mutations)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging

from staramr.blast.results.pointfinder.BlastResultsParserPointfinder import BlastResultsParserPointfinder
from staramr.blast.results.pointfinder.codon.CodonInsertionPosition import CodonInsertionPosition

"""
Class used to parse out BLAST results for PointFinder, including phenotyhpes/resistances.
Expand All @@ -23,6 +24,7 @@ class BlastResultsParserPointfinderResistance(BlastResultsParserPointfinder):
Contig
Start
End
Pointfinder Position
'''.strip().split('\n')]

def __init__(self, file_blast_map, arg_drug_table, blast_database, pid_threshold, plength_threshold,
Expand All @@ -43,14 +45,21 @@ def __init__(self, file_blast_map, arg_drug_table, blast_database, pid_threshold
self._arg_drug_table = arg_drug_table

def _get_result(self, hit, db_mutation):

# We need to correct for Pointfinder codon insertions being off by 1.
if type(db_mutation) is CodonInsertionPosition:
mutation_position = db_mutation.get_mutation_position() + 1
else:
mutation_position = db_mutation.get_mutation_position()

drug = self._arg_drug_table.get_drug(self._blast_database.get_organism(), hit.get_amr_gene_id(),
db_mutation.get_mutation_position())
mutation_position)
gene_name = hit.get_amr_gene_id() + " (" + db_mutation.get_mutation_string_short() + ")"

if drug is None:
drug = 'unknown[' + gene_name + ']'

return [hit.get_genome_id(),
result = [hit.get_genome_id(),
gene_name,
drug,
db_mutation.get_type(),
Expand All @@ -61,5 +70,8 @@ def _get_result(self, hit, db_mutation):
str(hit.get_hsp_length()) + "/" + str(hit.get_amr_gene_length()),
hit.get_genome_contig_id(),
hit.get_genome_contig_start(),
hit.get_genome_contig_end()
hit.get_genome_contig_end(),
db_mutation.get_pointfinder_mutation_string()
]

return result
5 changes: 5 additions & 0 deletions staramr/blast/results/pointfinder/MutationPosition.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ def get_nucleotide_position(self):
"""
return self._nucleotide_position_amr_gene

def get_pointfinder_mutation_string(self):
# This function exists in order to show mutations in Pointfinder database coordinates,
# which are off-by-one in the case of codon insertions.
return self.get_mutation_string_short()

def get_mutation_string_short(self):
return self.get_database_amr_gene_mutation() + str(
self.get_mutation_position()) + self.get_input_genome_mutation()
Expand Down
32 changes: 24 additions & 8 deletions staramr/blast/results/pointfinder/PointfinderHitHSP.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

from staramr.blast.results.AMRHitHSP import AMRHitHSP
from staramr.blast.results.pointfinder.codon.CodonMutationPosition import CodonMutationPosition
from staramr.blast.results.pointfinder.codon.CodonInsertionPosition import CodonInsertionPosition

import pandas as pd

logger = logging.getLogger('PointfinderHitHSP')

Expand All @@ -27,22 +30,35 @@ def get_amr_gene_name(self):
"""
return self._blast_record['qseqid']

def _get_match_positions(self):
amr_seq = self.get_amr_gene_seq()
genome_seq = self.get_genome_contig_hsp_seq()

return [i for i, (x, y) in enumerate(zip(amr_seq, genome_seq)) if x != y]

def _get_mutation_positions(self, start):
mutation_positions = []
mutation_positions_filtered = []
codon_starts = []

amr_seq = self.get_amr_gene_seq()
genome_seq = self.get_genome_contig_hsp_seq()

# Only return mutation position objects with unique codon start positions
mutation_positions = [CodonMutationPosition(i, amr_seq, genome_seq, start) for i in self._get_match_positions()]
amr_pos = 0

for i in range(len(amr_seq)):

# Insertion: "-" in the reference:
if amr_seq[i] == "-":
# left side
offset = i - amr_pos # accounting for string index and reference index possibly being different
mutation = CodonInsertionPosition(i, amr_seq, genome_seq, start, offset=offset)
mutation_positions.append(mutation)
# Mismatch or Deletion:
elif (amr_seq[i] != genome_seq[i]):
offset = i - amr_pos
mutation = CodonMutationPosition(i, amr_seq, genome_seq, start, offset=offset)
mutation_positions.append(mutation)
amr_pos += 1
# Match:
else:
amr_pos += 1

# Only return mutation position objects with unique codon start positions
for m in mutation_positions:
if m._codon_start not in codon_starts:
codon_starts.append(m._codon_start)
Expand Down
Loading

0 comments on commit bdb5c3d

Please sign in to comment.