introduce user-provided hmms (#327)
* fix table in readme
* add hmm parameter
* bump PyHMMER dependency to 0.10.14
* introduce MIN_HMM_EVALUE constant
* implement HMM search
* add tests
* adjust ranks of user-provided proteins and HMMs

Increase the rank of user-provided proteins to 101 so that they rank higher than user-provided HMMs, which are ranked 100 and thus still higher than all non-user-provided annotation sources.

* print user hmm parameter to stdout
* add hmms to readme
* remove orf debug print
* finish user-hmm feature
* polish readme
* add a second test HMM with short description format
* discard hmm hit source
* refactor expert hit annotation selection
* take into account annotation score to rank/score equal expert hits
* bump PyHMMER version to 0.10.15
oschwengers authored Oct 10, 2024
1 parent ffacde1 commit 8774d3c
Showing 15 changed files with 2,529 additions and 52 deletions.
26 changes: 24 additions & 2 deletions README.md
@@ -240,8 +240,8 @@ original locus id | new locus id | type | topology | name
NODE_1 | chrom | `chromosome` | `circular` | `-`
NODE_2 | p1 | `plasmid` | `c` | `pXYZ1`
NODE_3 | p2 | `p` | `c` | `pXYZ2`
NODE_4 | special-contig-name-xyz | `-` | -
NODE_5 | `` | `-` | -
NODE_4 | special-contig-name-xyz | `-` | `-` | `-`
NODE_5 | `` | `-` | `-` | `-`

#### User-provided regions

@@ -274,6 +274,27 @@ dbxrefs | `<empty>`, `db:id`, `,` separated list | `VFDB:VF0511`

Protein sequences provided in short Fasta or GenBank format are searched with default thresholds of 90%, 80% and 80% for minimum identity, query coverage and subject coverage, respectively.

#### User-provided HMMs

Bakta accepts user-provided trusted HMMs via `--hmms` in HMMER's text format. If provided, Bakta adheres to the *trusted cutoffs* (`TC`) specified in the HMM headers. In addition, a maximum E-value threshold of 1e-6 is applied. By default, Bakta uses the HMM description line as the product description. Further information can be provided via the HMM description line using the *short* format explained above in the [User-provided protein sequences](#user-provided-protein-sequences) section.

```bash
# default
HMMER3/f [3.1b2 | February 2015]
NAME id
ACC id
DESC product
LENG 435
TC 600 600

# short
NAME id
ACC id
DESC gene~~~product~~~dbxrefs
LENG 435
TC 600 600
```
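
For illustration, a run supplying such an HMM file might look like the following; the database path and file names are placeholders, not prescribed by this feature:

```bash
# hypothetical example: annotate an assembly using user-provided trusted HMMs
bakta --db <db-path> --hmms my_models.hmm --output results/ assembly.fasta
```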

### Output

Annotation results are provided in standard bioinformatics file formats:
@@ -394,6 +415,7 @@ Annotation:
Replicon information table (tsv/csv)
--regions REGIONS Path to pre-annotated regions in GFF3 or Genbank format (regions only, no functional annotations).
--proteins PROTEINS Fasta file of trusted protein sequences for CDS annotation
--hmms HMMS HMM file of trusted hidden Markov models in HMMER format for CDS annotation
--meta Run in metagenome mode. This only affects CDS prediction.

Workflow:
16 changes: 15 additions & 1 deletion bakta/config.py
@@ -55,6 +55,7 @@
replicons = None
compliant = None
user_proteins = None
user_hmms = None
meta = None
regions = None

@@ -162,7 +163,7 @@ def setup(args):
taxon = None

# annotation configurations
global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, meta, regions
global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, user_hmms, meta, regions
complete = args.complete
log.info('complete=%s', complete)
prodigal_tf = args.prodigal_tf
@@ -236,6 +237,19 @@ def setup(args):
sys.exit(f'ERROR: replicon table file ({replicons}) not valid!')
log.info('replicon-table=%s', replicons)
user_proteins = check_user_proteins(args)
user_hmms = args.hmms
if(user_hmms is not None):
try:
if(user_hmms == ''):
raise ValueError('File path argument must be non-empty')
user_hmms_path = Path(user_hmms).resolve()
check_readability('HMM', user_hmms_path)
check_content_size('HMM', user_hmms_path)
user_hmms = user_hmms_path
except:
log.error('provided HMM file not valid! path=%s', user_hmms)
sys.exit(f'ERROR: HMM file ({user_hmms}) not valid!')

regions = args.regions
if(regions is not None):
try:
1 change: 1 addition & 0 deletions bakta/constants.py
@@ -26,6 +26,7 @@
MIN_PSC_IDENTITY = 0.9 # min protein identity for PSC detection
MIN_SORF_COVERAGE = 0.9 # min sORF coverage for PSC detection
MIN_SORF_IDENTITY = 0.9 # min sORF identity for PSC detection
MIN_HMM_EVALUE = 1e-6 # min evalue for CDS HMM searches
HYPOTHETICAL_PROTEIN = 'hypothetical protein' # hypothetical protein product description
CDS_MAX_OVERLAPS = 30 # max overlap [bp] allowed for user-provided/de novo-predicted CDS overlaps
CDS_SOURCE_USER = 'user-provided'
73 changes: 73 additions & 0 deletions bakta/expert/protein_hmms.py
@@ -0,0 +1,73 @@
import logging

from typing import Sequence

import pyhmmer

import bakta.config as cfg
import bakta.constants as bc
import bakta.features.orf as orf


log = logging.getLogger('EXPERT_AA_HMM')


def search(cdss: Sequence[dict], user_hmms_path):
"""Detect Pfam-A entries"""
cds_found = set()
orf_by_aa_digest = orf.get_orf_dictionary(cdss)
alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino()
proteins: list[pyhmmer.easel.DigitalSequence] = [ pyhmmer.easel.TextSequence(sequence=cds['aa'], name=bytes(orf.get_orf_key(cds), 'UTF-8')).digitize(alphabet) for cds in cdss ]
with pyhmmer.plan7.HMMFile(user_hmms_path) as hmms_fh:
hmms = list(hmms_fh)
for hmm_query_hits in pyhmmer.hmmsearch(hmms, proteins, bit_cutoffs='trusted', cpus=cfg.threads):
hmm_id = hmm_query_hits.query.accession.decode()
hmm_length = hmm_query_hits.query.M
hmm_description = hmm_query_hits.query.description.decode()
hmms_description_fields = hmm_description.split('~~~')
for hmm_query_hit in hmm_query_hits.reported:
aa_identifier = hmm_query_hit.name.decode()
cds = orf_by_aa_digest[aa_identifier]
if hmm_query_hit.evalue > bc.MIN_HMM_EVALUE:
log.debug(
'discard low evalue: contig=%s, start=%i, stop=%i, strand=%s, id=%s, evalue=%1.1e, bitscore=%f',
cds['contig'], cds['start'], cds['stop'], cds['strand'], hmm_id, hmm_query_hit.evalue, hmm_query_hit.score
)
else:
hit_domain_lengths_sum = sum([len(dom.alignment.hmm_sequence) for dom in hmm_query_hit.domains.included])
hit = {
'type': 'user_hmms',
'rank': 100,
'id': hmm_id,
'length': hit_domain_lengths_sum,
'aa_cov': hit_domain_lengths_sum / len(cds['aa']),
'hmm_cov': hit_domain_lengths_sum / hmm_length,
'evalue': hmm_query_hit.evalue,
'score': hmm_query_hit.score,
'start': hmm_query_hit.best_domain.alignment.target_from,
'stop': hmm_query_hit.best_domain.alignment.target_to,
'db_xrefs': [f'UserHMM:{hmm_id}'],
'gene': None,
'product': None
}
# read user-provided gene, product, db_xrefs via HMM's description field provided as <gene>~~~<product>~~~<db_xrefs>
if(len(hmms_description_fields) == 1):
hit['product'] = hmm_description if hmm_description != '' else None
elif(len(hmms_description_fields) == 3):
(gene, product, dbxrefs) = hmms_description_fields
hit['gene'] = gene
hit['product'] = product
hit['db_xrefs'].extend(set(dbxrefs.split(',')))
else:
log.warning('wrong HMM description format: id=%s, desc=%s', hmm_id, hmm_description)

cds.setdefault('expert', [])
cds['expert'].append(hit)
log.debug(
'hit: source=UserHMMs, rank=100, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, hmm-id=%s, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
cds['contig'], cds['start'], cds['stop'], cds['strand'], hit['aa_cov'], hit['hmm_cov'], hmm_id, hit['gene'], hit['product'], hit['evalue'], hit['score']
)
cds_found.add(aa_identifier)

log.info('found=%i', len(cds_found))
return cds_found
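
Since the search above calls `pyhmmer.hmmsearch` with `bit_cutoffs='trusted'`, every model in the user-provided file must define `TC` lines; models without them would be expected to make the trusted-cutoff search fail. A minimal standalone sketch (an illustration, not part of this commit) to check a file before passing it via `--hmms`:

```python
# sketch: verify that all HMMs in a file define trusted cutoffs (TC lines)
from pathlib import Path
import sys

import pyhmmer


def check_trusted_cutoffs(hmm_path: Path) -> bool:
    """Return True if every HMM in the file carries trusted cutoffs."""
    ok = True
    with pyhmmer.plan7.HMMFile(hmm_path) as hmm_file:
        for hmm in hmm_file:
            name = hmm.name.decode() if hmm.name else '<unnamed>'
            if not hmm.cutoffs.trusted_available():
                print(f'{name}: no TC line found')
                ok = False
    return ok


if __name__ == '__main__':
    sys.exit(0 if check_trusted_cutoffs(Path(sys.argv[1])) else 1)
```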
2 changes: 1 addition & 1 deletion bakta/expert/protein_sequences.py
@@ -117,7 +117,7 @@ def write_user_protein_sequences(aa_fasta_path: Path):
with aa_fasta_path.open('w') as fh_out:
for user_protein in user_proteins:
(model_id, min_id, min_query_cov, min_model_cov, gene, product, dbxrefs, seq) = user_protein
fh_out.write(f">{model_id} UserProteins~~~{100}~~~{min_id}~~~{min_query_cov}~~~{min_model_cov}~~~{gene}~~~{product}~~~{','.join(dbxrefs)}\n{seq}\n")
fh_out.write(f">{model_id} UserProteins~~~{101}~~~{min_id}~~~{min_query_cov}~~~{min_model_cov}~~~{gene}~~~{product}~~~{','.join(dbxrefs)}\n{seq}\n")
log.debug(
'imported user aa: id=%s, length=%i, min-id=%f, min-query-cov=%f, min-model-cov=%f, gene=%s, product=%s, dbxrefs=%s',
model_id, len(seq), min_id, min_query_cov, min_model_cov, gene, product, dbxrefs
62 changes: 30 additions & 32 deletions bakta/features/annotation.py
@@ -101,18 +101,17 @@ def combine_annotation(feature: dict):
product = ips_product
for db_xref in ips['db_xrefs']:
db_xrefs.add(db_xref)
rank = 0
for hit in expert_hits:
db_xrefs.update(hit.get('db_xrefs', []))
expert_rank = hit['rank']
if(expert_rank > rank):
expert_genes = hit.get('gene', None)
if(expert_genes):
expert_genes = expert_genes.replace('/', ',').split(',')
genes.update(expert_genes)
gene = expert_genes[0]
product = hit.get('product', None)
rank = expert_rank

if(len(expert_hits) > 0):
top_expert_hit = sorted(expert_hits, key=lambda k: (k['rank'], k.get('score', 0), calc_annotation_score(k)), reverse=True)[0]
expert_genes = top_expert_hit.get('gene', None)
if(expert_genes):
expert_genes = expert_genes.replace('/', ',').split(',')
genes.update(expert_genes)
gene = expert_genes[0]
product = top_expert_hit.get('product', None)
for hit in expert_hits:
db_xrefs.update(hit.get('db_xrefs', []))

if(product):
product = revise_cds_product(product)
@@ -404,8 +403,8 @@ def detect_feature_overlaps(genome: dict):
elif(sorf['start'] == overlap_sorf['start'] and sorf['stop'] == overlap_sorf['stop']):
continue # same
else: # overlap -> remove sorf
score_sorf = calc_sorf_annotation_score(sorf)
score_overlap_sorf = calc_sorf_annotation_score(overlap_sorf)
score_sorf = calc_cds_annotation_score(sorf)
score_overlap_sorf = calc_cds_annotation_score(overlap_sorf)

if(score_sorf < score_overlap_sorf): # lower annotation score
overlap = f"[{max(sorf['start'], overlap_sorf['start'])},{min(sorf['stop'], overlap_sorf['stop'])}]"
@@ -431,39 +430,38 @@
)


def calc_sorf_annotation_score(sorf: dict) -> int:
def calc_cds_annotation_score(cds: dict) -> int:
"""Calc an annotation score rewarding each identification & annotation"""
score = 0

if('ups' in sorf):
if('ups' in cds):
score += 1

ips = sorf.get('ips', None)
ips = cds.get('ips', None)
if(ips):
score += 1
ips_gene = ips.get('gene', None)
if(ips_gene):
score += 1
ips_product = ips.get('product', None)
if(ips_product):
score += 1
score += calc_annotation_score(ips)

psc = sorf.get('psc', None)
psc = cds.get('psc', None)
if(psc):
score += 1
psc_gene = psc.get('gene', None)
if(psc_gene):
score += 1
psc_product = psc.get('product', None)
if(psc_product):
score += 1
score += calc_annotation_score(psc)
log.debug(
'sorf score: contig=%s, start=%i, stop=%i, gene=%s, product=%s, score=%i',
sorf['contig'], sorf['start'], sorf['stop'], sorf.get('gene', '-'), sorf.get('product', '-'), score
'cds score: contig=%s, start=%i, stop=%i, gene=%s, product=%s, score=%i',
cds['contig'], cds['start'], cds['stop'], cds.get('gene', '-'), cds.get('product', '-'), score
)
return score


def calc_annotation_score(orf:dict) -> int:
score = 0
if(orf.get('gene', None)):
score += 1
if(orf.get('product', None)):
score += 1
return score


def extract_protein_gene_symbol(product: str) -> str:
gene_symbols = []
for part in product.split(' '): # try to extract valid gene symbols
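
The refactored expert-hit selection (rank first, then bitscore, then annotation completeness) can be illustrated in isolation; the hit dictionaries below are hypothetical and only mirror the keys used by the expert modules:

```python
# sketch: pick the best expert hit by rank, then bitscore, then annotation score,
# mirroring the selection in combine_annotation above
def calc_annotation_score(hit: dict) -> int:
    score = 0
    if hit.get('gene'):
        score += 1
    if hit.get('product'):
        score += 1
    return score


expert_hits = [
    {'type': 'user_hmms', 'rank': 100, 'score': 650.0, 'gene': 'abcX', 'product': 'ABC transporter X'},
    {'type': 'user_proteins', 'rank': 101, 'score': 420.0, 'gene': 'abcY', 'product': 'ABC transporter Y'},
]
top_expert_hit = sorted(expert_hits, key=lambda k: (k['rank'], k.get('score', 0), calc_annotation_score(k)), reverse=True)[0]
print(top_expert_hit['gene'])  # abcY: the higher rank (101) wins despite the lower bitscore
```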
2 changes: 1 addition & 1 deletion bakta/features/orf.py
@@ -24,7 +24,7 @@ def detect_spurious(orfs: Sequence[dict]):
for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads):
for hit in top_hits:
orf = orf_by_aa_digest[hit.name.decode()]
if hit.evalue > 1E-5:
if hit.evalue > bc.MIN_HMM_EVALUE:
log.debug(
'discard low spurious E value: contig=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f',
orf['contig'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score
13 changes: 10 additions & 3 deletions bakta/main.py
@@ -15,6 +15,7 @@
import bakta.io.insdc as insdc
import bakta.expert.amrfinder as exp_amr
import bakta.expert.protein_sequences as exp_aa_seq
import bakta.expert.protein_hmms as exp_aa_hmms
import bakta.features.annotation as anno
import bakta.features.t_rna as t_rna
import bakta.features.tm_rna as tm_rna
@@ -62,9 +63,10 @@ def main():
print(f'\tinput: {cfg.genome_path}')
print(f"\tdb: {cfg.db_path}, version {cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}")
if(cfg.replicons): print(f'\treplicon table: {cfg.replicons}')
if(cfg.regions): print(f'\tregion table: {cfg.regions}')
if(cfg.prodigal_tf): print(f'\tprodigal training file: {cfg.prodigal_tf}')
if(cfg.regions): print(f'\tregion table: {cfg.regions}')
if(cfg.user_proteins): print(f'\tuser proteins: {cfg.user_proteins}')
if(cfg.user_hmms): print(f'\tuser hmms: {cfg.user_hmms}')
print(f'\toutput: {cfg.output_path}')
if(cfg.force): print(f'\tforce: {cfg.force}')
print(f'\ttmp directory: {cfg.tmp_path}')
@@ -292,6 +294,11 @@ def main():
user_aa_found = exp_aa_seq.search(cdss, cds_aa_path, 'user_proteins', user_aa_path)
print(f'\t\tuser protein sequences: {len(user_aa_found)}')

if(cfg.user_hmms):
log.debug('conduct expert system: user HMM')
user_hmm_found = exp_aa_hmms.search(cdss, cfg.user_hmms)
print(f'\t\tuser HMM sequences: {len(user_hmm_found)}')

if(cfg.gram != bc.GRAM_UNKNOWN):
sig_peptides_found = sig_peptides.search(cdss, cds_aa_path)
print(f'\tsignal peptides: {len(sig_peptides_found)}')
@@ -366,8 +373,8 @@ def main():
if(len(sorfs_not_found) > 0):
if(cfg.db_info['type'] == 'full'):
log.debug('search sORF PSC')
sorf_pscs, sorfs_not_found = s_orf.search_pscs(sorfs_not_found)
sorf_pscs_psccs.extend(sorf_pscs)
cdss_not_found_tmp, sorfs_not_found = s_orf.search_pscs(sorfs_not_found)
sorf_pscs_psccs.extend(cdss_not_found_tmp)
print(f'\tfound PSCs: {len(sorf_pscs_psccs)}')
else:
log.debug('search sORF PSCC')