introduce user-provided hmms (#327)
* fix table in readme
* add hmm parameter
* bump PyHMMER dependency to 0.10.14
* introduce MIN_HMM_EVALUE constant
* implement HMM search
* add tests
* adjust ranks of user-provided proteins and HMMs

Increase the rank of user-provided proteins to 101 so that they rank higher than user-provided HMMs, which are ranked 100 and thus still higher than all non-user-provided annotation sources.

* print user hmm parameter to stdout
* add hmms to readme
* remove orf debug print
* finish user-hmm feature
* polish readme
* add a second test HMM with short description format
* discard hmm hit source
* refactor expert hit annotation selection
* take into account annotation score to rank/score equal expert hits
* bump PyHMMER version to 0.10.15
oschwengers authored Oct 10, 2024
1 parent ffacde1 commit 8774d3c
Showing 15 changed files with 2,529 additions and 52 deletions.
26 changes: 24 additions & 2 deletions README.md
@@ -240,8 +240,8 @@ original locus id | new locus id | type | topology | name
NODE_1 | chrom | `chromosome` | `circular` | `-`
NODE_2 | p1 | `plasmid` | `c` | `pXYZ1`
NODE_3 | p2 | `p` | `c` | `pXYZ2`
NODE_4 | special-contig-name-xyz | `-` | -
NODE_5 | `` | `-` | -
NODE_4 | special-contig-name-xyz | `-` | `-` | `-`
NODE_5 | `` | `-` | `-` | `-`

#### User-provided regions

@@ -274,6 +274,27 @@ dbxrefs | `<empty>`, `db:id`, `,` separated list | `VFDB:VF0511`

Protein sequences provided in short Fasta or GenBank format are searched with default thresholds of 90%, 80% and 80% for minimum identity, query coverage and subject coverage, respectively.

#### User-provided HMMs

Bakta accepts user-provided trusted HMMs via `--hmms` in HMMER's text format. If provided, Bakta adheres to the *trusted cutoffs* (`TC`) specified in the HMM headers. In addition, a maximum E-value threshold of 1e-6 is applied. By default, Bakta uses the HMM description line as the product description. Further information can be provided via the HMM description line using the *short* format explained above in the [User-provided protein sequences](#user-provided-protein-sequences) section.

```bash
# default
HMMER3/f [3.1b2 | February 2015]
NAME id
ACC id
DESC product
LENG 435
TC 600 600

# short
NAME id
ACC id
DESC gene~~~product~~~dbxrefs
LENG 435
TC 600 600
```
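
For illustration, a run supplying such an HMM file might look like the following; the database path and file names are placeholders, not prescribed by this feature:

```bash
# hypothetical example: annotate an assembly using user-provided trusted HMMs
bakta --db <db-path> --hmms my_models.hmm --output results/ assembly.fasta
```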

### Output

Annotation results are provided in standard bioinformatics file formats:
@@ -394,6 +415,7 @@ Annotation:
Replicon information table (tsv/csv)
--regions REGIONS Path to pre-annotated regions in GFF3 or Genbank format (regions only, no functional annotations).
--proteins PROTEINS Fasta file of trusted protein sequences for CDS annotation
--hmms HMMS HMM file of trusted hidden Markov models in HMMER format for CDS annotation
--meta Run in metagenome mode. This only affects CDS prediction.

Workflow:
16 changes: 15 additions & 1 deletion bakta/config.py
@@ -55,6 +55,7 @@
replicons = None
compliant = None
user_proteins = None
user_hmms = None
meta = None
regions = None

@@ -162,7 +163,7 @@ def setup(args):
taxon = None

# annotation configurations
global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, meta, regions
global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, user_hmms, meta, regions
complete = args.complete
log.info('complete=%s', complete)
prodigal_tf = args.prodigal_tf
@@ -236,6 +237,19 @@ def setup(args):
sys.exit(f'ERROR: replicon table file ({replicons}) not valid!')
log.info('replicon-table=%s', replicons)
user_proteins = check_user_proteins(args)
user_hmms = args.hmms
if(user_hmms is not None):
try:
if(user_hmms == ''):
raise ValueError('File path argument must be non-empty')
user_hmms_path = Path(user_hmms).resolve()
check_readability('HMM', user_hmms_path)
check_content_size('HMM', user_hmms_path)
user_hmms = user_hmms_path
except:
log.error('provided HMM file not valid! path=%s', user_hmms)
sys.exit(f'ERROR: HMM file ({user_hmms}) not valid!')

regions = args.regions
if(regions is not None):
try:
1 change: 1 addition & 0 deletions bakta/constants.py
@@ -26,6 +26,7 @@
MIN_PSC_IDENTITY = 0.9 # min protein identity for PSC detection
MIN_SORF_COVERAGE = 0.9 # min sORF coverage for PSC detection
MIN_SORF_IDENTITY = 0.9 # min sORF identity for PSC detection
MIN_HMM_EVALUE = 1e-6 # min evalue for CDS HMM searches
HYPOTHETICAL_PROTEIN = 'hypothetical protein' # hypothetical protein product description
CDS_MAX_OVERLAPS = 30 # max overlap [bp] allowed for user-provided/de novo-predicted CDS overlaps
CDS_SOURCE_USER = 'user-provided'
73 changes: 73 additions & 0 deletions bakta/expert/protein_hmms.py
@@ -0,0 +1,73 @@
import logging

from typing import Sequence

import pyhmmer

import bakta.config as cfg
import bakta.constants as bc
import bakta.features.orf as orf


log = logging.getLogger('EXPERT_AA_HMM')


def search(cdss: Sequence[dict], user_hmms_path):
"""Detect Pfam-A entries"""
cds_found = set()
orf_by_aa_digest = orf.get_orf_dictionary(cdss)
alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino()
proteins: list[pyhmmer.easel.DigitalSequence] = [ pyhmmer.easel.TextSequence(sequence=cds['aa'], name=bytes(orf.get_orf_key(cds), 'UTF-8')).digitize(alphabet) for cds in cdss ]
with pyhmmer.plan7.HMMFile(user_hmms_path) as hmms_fh:
hmms = list(hmms_fh)
for hmm_query_hits in pyhmmer.hmmsearch(hmms, proteins, bit_cutoffs='trusted', cpus=cfg.threads):
hmm_id = hmm_query_hits.query.accession.decode()
hmm_length = hmm_query_hits.query.M
hmm_description = hmm_query_hits.query.description.decode()
hmms_description_fields = hmm_description.split('~~~')
for hmm_query_hit in hmm_query_hits.reported:
aa_identifier = hmm_query_hit.name.decode()
cds = orf_by_aa_digest[aa_identifier]
if hmm_query_hit.evalue > bc.MIN_HMM_EVALUE:
log.debug(
'discard low evalue: contig=%s, start=%i, stop=%i, strand=%s, id=%s, evalue=%1.1e, bitscore=%f',
cds['contig'], cds['start'], cds['stop'], cds['strand'], hmm_id, hmm_query_hit.evalue, hmm_query_hit.score
)
else:
hit_domain_lengths_sum = sum([len(dom.alignment.hmm_sequence) for dom in hmm_query_hit.domains.included])
hit = {
'type': 'user_hmms',
'rank': 100,
'id': hmm_id,
'length': hit_domain_lengths_sum,
'aa_cov': hit_domain_lengths_sum / len(cds['aa']),
'hmm_cov': hit_domain_lengths_sum / hmm_length,
'evalue': hmm_query_hit.evalue,
'score': hmm_query_hit.score,
'start': hmm_query_hit.best_domain.alignment.target_from,
'stop': hmm_query_hit.best_domain.alignment.target_to,
'db_xrefs': [f'UserHMM:{hmm_id}'],
'gene': None,
'product': None
}
# read user-provided gene, product, db_xrefs via HMM's description field provided as <gene>~~~<product>~~~<db_xrefs>
if(len(hmms_description_fields) == 1):
hit['product'] = hmm_description if hmm_description != '' else None
elif(len(hmms_description_fields) == 3):
(gene, product, dbxrefs) = hmms_description_fields
hit['gene'] = gene
hit['product'] = product
hit['db_xrefs'].extend(set(dbxrefs.split(',')))
else:
log.warning('wrong HMM description format: id=%s, desc=%s', hmm_id, hmm_description)

cds.setdefault('expert', [])
cds['expert'].append(hit)
log.debug(
'hit: source=UserHMMs, rank=100, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, hmm-id=%s, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
cds['contig'], cds['start'], cds['stop'], cds['strand'], hit['aa_cov'], hit['hmm_cov'], hmm_id, hit['gene'], hit['product'], hit['evalue'], hit['score']
)
cds_found.add(aa_identifier)

log.info('found=%i', len(cds_found))
return cds_found
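
Since the search above calls `pyhmmer.hmmsearch` with `bit_cutoffs='trusted'`, every model in the user-provided file must define `TC` lines; models without them would be expected to make the trusted-cutoff search fail. A minimal standalone sketch (an illustration, not part of this commit) to check a file before passing it via `--hmms`:

```python
# sketch: verify that all HMMs in a file define trusted cutoffs (TC lines)
from pathlib import Path
import sys

import pyhmmer


def check_trusted_cutoffs(hmm_path: Path) -> bool:
    """Return True if every HMM in the file carries trusted cutoffs."""
    ok = True
    with pyhmmer.plan7.HMMFile(hmm_path) as hmm_file:
        for hmm in hmm_file:
            name = hmm.name.decode() if hmm.name else '<unnamed>'
            if not hmm.cutoffs.trusted_available():
                print(f'{name}: no TC line found')
                ok = False
    return ok


if __name__ == '__main__':
    sys.exit(0 if check_trusted_cutoffs(Path(sys.argv[1])) else 1)
```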
2 changes: 1 addition & 1 deletion bakta/expert/protein_sequences.py
@@ -117,7 +117,7 @@ def write_user_protein_sequences(aa_fasta_path: Path):
with aa_fasta_path.open('w') as fh_out:
for user_protein in user_proteins:
(model_id, min_id, min_query_cov, min_model_cov, gene, product, dbxrefs, seq) = user_protein
fh_out.write(f">{model_id} UserProteins~~~{100}~~~{min_id}~~~{min_query_cov}~~~{min_model_cov}~~~{gene}~~~{product}~~~{','.join(dbxrefs)}\n{seq}\n")
fh_out.write(f">{model_id} UserProteins~~~{101}~~~{min_id}~~~{min_query_cov}~~~{min_model_cov}~~~{gene}~~~{product}~~~{','.join(dbxrefs)}\n{seq}\n")
log.debug(
'imported user aa: id=%s, length=%i, min-id=%f, min-query-cov=%f, min-model-cov=%f, gene=%s, product=%s, dbxrefs=%s',
model_id, len(seq), min_id, min_query_cov, min_model_cov, gene, product, dbxrefs
62 changes: 30 additions & 32 deletions bakta/features/annotation.py
@@ -101,18 +101,17 @@ def combine_annotation(feature: dict):
product = ips_product
for db_xref in ips['db_xrefs']:
db_xrefs.add(db_xref)
rank = 0
for hit in expert_hits:
db_xrefs.update(hit.get('db_xrefs', []))
expert_rank = hit['rank']
if(expert_rank > rank):
expert_genes = hit.get('gene', None)
if(expert_genes):
expert_genes = expert_genes.replace('/', ',').split(',')
genes.update(expert_genes)
gene = expert_genes[0]
product = hit.get('product', None)
rank = expert_rank

if(len(expert_hits) > 0):
top_expert_hit = sorted(expert_hits, key=lambda k: (k['rank'], k.get('score', 0), calc_annotation_score(k)), reverse=True)[0]
expert_genes = top_expert_hit.get('gene', None)
if(expert_genes):
expert_genes = expert_genes.replace('/', ',').split(',')
genes.update(expert_genes)
gene = expert_genes[0]
product = top_expert_hit.get('product', None)
for hit in expert_hits:
db_xrefs.update(hit.get('db_xrefs', []))

if(product):
product = revise_cds_product(product)
@@ -404,8 +403,8 @@ def detect_feature_overlaps(genome: dict):
elif(sorf['start'] == overlap_sorf['start'] and sorf['stop'] == overlap_sorf['stop']):
continue # same
else: # overlap -> remove sorf
score_sorf = calc_sorf_annotation_score(sorf)
score_overlap_sorf = calc_sorf_annotation_score(overlap_sorf)
score_sorf = calc_cds_annotation_score(sorf)
score_overlap_sorf = calc_cds_annotation_score(overlap_sorf)

if(score_sorf < score_overlap_sorf): # lower annotation score
overlap = f"[{max(sorf['start'], overlap_sorf['start'])},{min(sorf['stop'], overlap_sorf['stop'])}]"
@@ -431,39 +430,38 @@
)


def calc_sorf_annotation_score(sorf: dict) -> int:
def calc_cds_annotation_score(cds: dict) -> int:
"""Calc an annotation score rewarding each identification & annotation"""
score = 0

if('ups' in sorf):
if('ups' in cds):
score += 1

ips = sorf.get('ips', None)
ips = cds.get('ips', None)
if(ips):
score += 1
ips_gene = ips.get('gene', None)
if(ips_gene):
score += 1
ips_product = ips.get('product', None)
if(ips_product):
score += 1
score += calc_annotation_score(ips)

psc = sorf.get('psc', None)
psc = cds.get('psc', None)
if(psc):
score += 1
psc_gene = psc.get('gene', None)
if(psc_gene):
score += 1
psc_product = psc.get('product', None)
if(psc_product):
score += 1
score += calc_annotation_score(psc)
log.debug(
'sorf score: contig=%s, start=%i, stop=%i, gene=%s, product=%s, score=%i',
sorf['contig'], sorf['start'], sorf['stop'], sorf.get('gene', '-'), sorf.get('product', '-'), score
'cds score: contig=%s, start=%i, stop=%i, gene=%s, product=%s, score=%i',
cds['contig'], cds['start'], cds['stop'], cds.get('gene', '-'), cds.get('product', '-'), score
)
return score


def calc_annotation_score(orf:dict) -> int:
score = 0
if(orf.get('gene', None)):
score += 1
if(orf.get('product', None)):
score += 1
return score


def extract_protein_gene_symbol(product: str) -> str:
gene_symbols = []
for part in product.split(' '): # try to extract valid gene symbols
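
The refactored expert-hit selection (rank first, then bitscore, then annotation completeness) can be illustrated in isolation; the hit dictionaries below are hypothetical and only mirror the keys used by the expert modules:

```python
# sketch: pick the best expert hit by rank, then bitscore, then annotation score,
# mirroring the selection in combine_annotation above
def calc_annotation_score(hit: dict) -> int:
    score = 0
    if hit.get('gene'):
        score += 1
    if hit.get('product'):
        score += 1
    return score


expert_hits = [
    {'type': 'user_hmms', 'rank': 100, 'score': 650.0, 'gene': 'abcX', 'product': 'ABC transporter X'},
    {'type': 'user_proteins', 'rank': 101, 'score': 420.0, 'gene': 'abcY', 'product': 'ABC transporter Y'},
]
top_expert_hit = sorted(expert_hits, key=lambda k: (k['rank'], k.get('score', 0), calc_annotation_score(k)), reverse=True)[0]
print(top_expert_hit['gene'])  # abcY: the higher rank (101) wins despite the lower bitscore
```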
2 changes: 1 addition & 1 deletion bakta/features/orf.py
@@ -24,7 +24,7 @@ def detect_spurious(orfs: Sequence[dict]):
for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads):
for hit in top_hits:
orf = orf_by_aa_digest[hit.name.decode()]
if hit.evalue > 1E-5:
if hit.evalue > bc.MIN_HMM_EVALUE:
log.debug(
'discard low spurious E value: contig=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f',
orf['contig'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score
13 changes: 10 additions & 3 deletions bakta/main.py
@@ -15,6 +15,7 @@
import bakta.io.insdc as insdc
import bakta.expert.amrfinder as exp_amr
import bakta.expert.protein_sequences as exp_aa_seq
import bakta.expert.protein_hmms as exp_aa_hmms
import bakta.features.annotation as anno
import bakta.features.t_rna as t_rna
import bakta.features.tm_rna as tm_rna
@@ -62,9 +63,10 @@ def main():
print(f'\tinput: {cfg.genome_path}')
print(f"\tdb: {cfg.db_path}, version {cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}")
if(cfg.replicons): print(f'\treplicon table: {cfg.replicons}')
if(cfg.regions): print(f'\tregion table: {cfg.regions}')
if(cfg.prodigal_tf): print(f'\tprodigal training file: {cfg.prodigal_tf}')
if(cfg.regions): print(f'\tregion table: {cfg.regions}')
if(cfg.user_proteins): print(f'\tuser proteins: {cfg.user_proteins}')
if(cfg.user_hmms): print(f'\tuser hmms: {cfg.user_hmms}')
print(f'\toutput: {cfg.output_path}')
if(cfg.force): print(f'\tforce: {cfg.force}')
print(f'\ttmp directory: {cfg.tmp_path}')
@@ -292,6 +294,11 @@ def main():
user_aa_found = exp_aa_seq.search(cdss, cds_aa_path, 'user_proteins', user_aa_path)
print(f'\t\tuser protein sequences: {len(user_aa_found)}')

if(cfg.user_hmms):
log.debug('conduct expert system: user HMM')
user_hmm_found = exp_aa_hmms.search(cdss, cfg.user_hmms)
print(f'\t\tuser HMM sequences: {len(user_hmm_found)}')

if(cfg.gram != bc.GRAM_UNKNOWN):
sig_peptides_found = sig_peptides.search(cdss, cds_aa_path)
print(f'\tsignal peptides: {len(sig_peptides_found)}')
@@ -366,8 +373,8 @@ def main():
if(len(sorfs_not_found) > 0):
if(cfg.db_info['type'] == 'full'):
log.debug('search sORF PSC')
sorf_pscs, sorfs_not_found = s_orf.search_pscs(sorfs_not_found)
sorf_pscs_psccs.extend(sorf_pscs)
cdss_not_found_tmp, sorfs_not_found = s_orf.search_pscs(sorfs_not_found)
sorf_pscs_psccs.extend(cdss_not_found_tmp)
print(f'\tfound PSCs: {len(sorf_pscs_psccs)}')
else:
log.debug('search sORF PSCC')