Merge branch 'main' into fix-cds-products

oschwengers · Oct 10, 2024 · be07551 · be07551
2 parents cb8815c + 5d15f39
commit be07551
Show file tree

Hide file tree

Showing 25 changed files with 2,753 additions and 164 deletions.
diff --git a/README.md b/README.md
@@ -240,8 +240,8 @@ original locus id  |  new locus id  |  type  |  topology  |  name
 NODE_1 | chrom | `chromosome` | `circular` | `-`
 NODE_2 | p1 | `plasmid` | `c` | `pXYZ1`
 NODE_3 | p2 | `p`  |  `c` | `pXYZ2`
-NODE_4 | special-contig-name-xyz |  `-` | -
-NODE_5 | `` |  `-` | -
+NODE_4 | special-contig-name-xyz |  `-` | `-` | `-`
+NODE_5 | `` |  `-` | `-` | `-`
 
 #### User-provided regions
 
@@ -274,6 +274,27 @@ dbxrefs | `<empty>`, `db:id`, `,` separated list  | `VFDB:VF0511`
 
 Protein sequences provided in short Fasta or GenBank format are searched with default thresholds of 90%, 80% and 80% for minimal identity, query and subject coverage, respectively.
 
+#### User-provided HMMs
+
+Bakta accepts user-provided trusted HMMs via `--hmms` in HMMER's text format. If set, Bakta will adhere to the *trusted cutoff* specified in the HMM header. In addition, a max. evalue threshold of 1e-6 is applied. By default, Bakta uses the HMM description line as a product description. Further information can be provided via the HMM description line using the *short* format as explained above in the [User-provided protein sequences](####user-provided-protein-sequences) section.
+
+```bash
+# default
+HMMER3/f [3.1b2 | February 2015]
+NAME  id
+ACC   id
+DESC  product
+LENG  435
+TC    600 600
+
+# short
+NAME  id
+ACC   id
+DESC  gene~~~product~~~dbxrefs
+LENG  435
+TC    600 600
+```
+
 ### Output
 
 Annotation results are provided in standard bioinformatics file formats:
@@ -285,6 +306,7 @@ Annotation results are provided in standard bioinformatics file formats:
 - `<prefix>.fna`: replicon/contig DNA sequences as FASTA
 - `<prefix>.ffn`: feature nucleotide sequences as FASTA
 - `<prefix>.faa`: CDS/sORF amino acid sequences as FASTA
+- `<prefix>.inference.tsv`: inference metrics (score, evalue, coverage, identity) for annotated accessions as TSV
 - `<prefix>.hypotheticals.tsv`: further information on hypothetical protein CDS as simple human readble tab separated values
 - `<prefix>.hypotheticals.faa`: hypothetical protein CDS amino acid sequences as FASTA
 - `<prefix>.json`: all (internal) annotation & sequence information as JSON
@@ -393,6 +415,7 @@ Annotation:
                         Replicon information table (tsv/csv)
   --regions REGIONS     Path to pre-annotated regions in GFF3 or Genbank format (regions only, no functional annotations).
   --proteins PROTEINS   Fasta file of trusted protein sequences for CDS annotation
+  --hmms HMMS           HMM file of trusted hidden markov models in HMMER format for CDS annotation
   --meta                Run in metagenome mode. This only affects CDS prediction.
 
 Workflow:

diff --git a/bakta/config.py b/bakta/config.py
@@ -2,7 +2,6 @@
 import multiprocessing as mp
 import os
 import re
-import shutil
 import sys
 import tempfile
 
@@ -55,6 +54,7 @@
 replicons = None
 compliant = None
 user_proteins = None
+user_hmms = None
 meta = None
 regions = None
 
@@ -162,7 +162,7 @@ def setup(args):
         taxon = None
 
     # annotation configurations
-    global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, meta, regions
+    global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, user_hmms, meta, regions
     complete = args.complete
     log.info('complete=%s', complete)
     prodigal_tf = args.prodigal_tf
@@ -236,6 +236,19 @@ def setup(args):
             sys.exit(f'ERROR: replicon table file ({replicons}) not valid!')
     log.info('replicon-table=%s', replicons)
     user_proteins = check_user_proteins(args)
+    user_hmms = args.hmms
+    if(user_hmms is not None):
+        try:
+            if(user_hmms == ''):
+                raise ValueError('File path argument must be non-empty')
+            user_hmms_path = Path(user_hmms).resolve()
+            check_readability('HMM', user_hmms_path)
+            check_content_size('HMM', user_hmms_path)
+            user_hmms = user_hmms_path
+        except:
+            log.error('provided HMM file not valid! path=%s', user_hmms)
+            sys.exit(f'ERROR: HMM file ({user_hmms}) not valid!')
+
     regions = args.regions
     if(regions is not None):
         try:

diff --git a/bakta/constants.py b/bakta/constants.py
@@ -26,6 +26,7 @@
 MIN_PSC_IDENTITY = 0.9  # min protein identity for PSC detection
 MIN_SORF_COVERAGE = 0.9  # min sORF coverage for PSC detection
 MIN_SORF_IDENTITY = 0.9  # min sORF identity for PSC detection
+MIN_HMM_EVALUE = 1e-6  # min evalue for CDS HMM searches
 HYPOTHETICAL_PROTEIN = 'hypothetical protein'  # hypothetical protein product description
 CDS_MAX_OVERLAPS = 30  # max overlap [bp] allowed for user-provided/de novo-predicted CDS overlaps
 CDS_SOURCE_USER = 'user-provided'

diff --git a/bakta/expert/protein_hmms.py b/bakta/expert/protein_hmms.py
@@ -0,0 +1,73 @@
+import logging
+
+from typing import Sequence
+
+import pyhmmer
+
+import bakta.config as cfg
+import bakta.constants as bc
+import bakta.features.orf as orf
+
+
+log = logging.getLogger('EXPERT_AA_HMM')
+
+
+def search(cdss: Sequence[dict], user_hmms_path):
+    """Detect Pfam-A entries"""
+    cds_found = set()
+    orf_by_aa_digest = orf.get_orf_dictionary(cdss)
+    alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino()
+    proteins: list[pyhmmer.easel.DigitalSequence] = [ pyhmmer.easel.TextSequence(sequence=cds['aa'], name=bytes(orf.get_orf_key(cds), 'UTF-8')).digitize(alphabet) for cds in cdss ]
+    with pyhmmer.plan7.HMMFile(user_hmms_path) as hmms_fh:
+        hmms = list(hmms_fh)
+        for hmm_query_hits in pyhmmer.hmmsearch(hmms, proteins, bit_cutoffs='trusted', cpus=cfg.threads):
+            hmm_id = hmm_query_hits.query.accession.decode()
+            hmm_length = hmm_query_hits.query.M
+            hmm_description = hmm_query_hits.query.description.decode()
+            hmms_description_fields = hmm_description.split('~~~')
+            for hmm_query_hit in hmm_query_hits.reported:
+                aa_identifier = hmm_query_hit.name.decode()
+                cds = orf_by_aa_digest[aa_identifier]
+                if hmm_query_hit.evalue > bc.MIN_HMM_EVALUE:
+                    log.debug(
+                        'discard low evalue: contig=%s, start=%i, stop=%i, strand=%s, id=%s, evalue=%1.1e, bitscore=%f',
+                        cds['contig'], cds['start'], cds['stop'], cds['strand'], hmm_id, hmm_query_hit.evalue, hmm_query_hit.score
+                    )
+                else:
+                    hit_domain_lengths_sum = sum([len(dom.alignment.hmm_sequence) for dom in hmm_query_hit.domains.included])
+                    hit = {
+                        'type': 'user_hmms',
+                        'rank': 100,
+                        'id': hmm_id,
+                        'length': hit_domain_lengths_sum,
+                        'aa_cov': hit_domain_lengths_sum / len(cds['aa']),
+                        'hmm_cov': hit_domain_lengths_sum / hmm_length,
+                        'evalue': hmm_query_hit.evalue,
+                        'score': hmm_query_hit.score,
+                        'start': hmm_query_hit.best_domain.alignment.target_from,
+                        'stop': hmm_query_hit.best_domain.alignment.target_to,
+                        'db_xrefs': [f'UserHMM:{hmm_id}'],
+                        'gene': None,
+                        'product': None
+                    }
+                    # read user-provided gene, product, db_xrefs via HMM's description field provided as <gene>~~~<product>~~~<db_xrefs>
+                    if(len(hmms_description_fields) == 1):
+                        hit['product'] = hmm_description if hmm_description != '' else None
+                    elif(len(hmms_description_fields) == 3):
+                        (gene, product, dbxrefs) = hmms_description_fields
+                        hit['gene'] = gene
+                        hit['product'] = product
+                        hit['db_xrefs'].extend(set(dbxrefs.split(',')))
+                    else:
+                        log.warning('wrong HMM description format: id=%s, desc=%s', hmm_id, hmm_description)
+
+                    cds.setdefault('expert', [])
+                    cds['expert'].append(hit)
+                    log.debug(
+                        'hit: source=UserHMMs, rank=99, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, hmm-id=%s, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
+                        cds['contig'], cds['start'], cds['stop'], cds['strand'], hit['aa_cov'], hit['hmm_cov'], hmm_id, hit['gene'], hit['product'], hit['evalue'], hit['score']
+                    )
+                    cds_found.add(aa_identifier)
+
+    log.info('found=%i', len(cds_found))
+    return cds_found
diff --git a/bakta/expert/protein_sequences.py b/bakta/expert/protein_sequences.py
@@ -5,13 +5,13 @@
 from pathlib import Path
 from typing import Sequence
 
+from Bio import SeqIO
+from xopen import xopen
+
 import bakta.config as cfg
 import bakta.constants as bc
 import bakta.features.orf as orf
 
-from Bio import SeqIO
-from xopen import xopen
-
 
 log = logging.getLogger('EXPERT_AA_SEQ')
 
@@ -26,7 +26,7 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
         '--db', str(db_path),
         '--out', str(diamond_output_path),
         '--max-target-seqs', '1',  # single best output
-        '--outfmt', '6', 'qseqid', 'sseqid', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'stitle',
+        '--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'stitle',
         '--threads', str(cfg.threads),
         '--tmpdir', str(cfg.tmp_path),  # use tmp folder
         '--block-size', '4',  # increase block size for faster executions
@@ -51,13 +51,13 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
     cds_by_hexdigest = orf.get_orf_dictionary(cdss)
     with diamond_output_path.open() as fh:
         for line in fh:
-            (aa_identifier, model_id, model_length, alignment_length, identity, evalue, bitscore, model_title) = line.strip().split('\t')
+            (aa_identifier, model_id, query_length, model_length, alignment_length, identity, evalue, bitscore, model_title) = line.strip().split('\t')
             cds = cds_by_hexdigest[aa_identifier]
             query_cov = int(alignment_length) / len(cds['aa'])
             model_cov = int(alignment_length) / int(model_length)
             identity = float(identity) / 100
-            evalue = float(evalue)
             bitscore = float(bitscore)
+            evalue = float(evalue)
             (source, rank, min_identity, min_query_cov, min_model_cov, gene, product, dbxrefs) = model_title.split(' ', 1)[1].split('~~~')
             rank = int(rank)
             min_identity = float(min_identity) / 100
@@ -72,19 +72,19 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
                     'gene': gene if gene != '' else None,
                     'product': product,
                     'query_cov': query_cov,
-                    'model_cov': model_cov,
+                    'subject_cov': model_cov,
                     'identity': identity,
+                    'score': bitscore,
                     'evalue': evalue,
-                    'bitscore': bitscore,
                     'db_xrefs': [] if dbxrefs == '' else dbxrefs.split(',')
                 }
                 if(expert_system == 'user_proteins'):
                     hit['db_xrefs'].append(f'UserProtein:{model_id}')
                 cds.setdefault('expert', [])
                 cds['expert'].append(hit)
                 log.debug(
-                    'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, identity=%0.3f, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
-                    source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, gene, product, evalue, bitscore
+                    'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, gene=%s, product=%s',
+                    source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, bitscore, evalue, gene, product
                 )
                 cds_found.add(aa_identifier)
 
@@ -117,7 +117,7 @@ def write_user_protein_sequences(aa_fasta_path: Path):
             with aa_fasta_path.open('w') as fh_out:
                 for user_protein in user_proteins:
                     (model_id, min_id, min_query_cov, min_model_cov, gene, product, dbxrefs, seq) = user_protein
-                    fh_out.write(f">{model_id} UserProteins~~~{100}~~~{min_id}~~~{min_query_cov}~~~{min_model_cov}~~~{gene}~~~{product}~~~{','.join(dbxrefs)}\n{seq}\n")
+                    fh_out.write(f">{model_id} UserProteins~~~{101}~~~{min_id}~~~{min_query_cov}~~~{min_model_cov}~~~{gene}~~~{product}~~~{','.join(dbxrefs)}\n{seq}\n")
                     log.debug(
                         'imported user aa: id=%s, length=%i, min-id=%f, min-query-cov=%f, min-model-cov=%f, gene=%s, product=%s, dbxrefs=%s',
                         model_id, len(seq), min_id, min_query_cov, min_model_cov, gene, product, dbxrefs

diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py
@@ -29,7 +29,7 @@
 RE_PROTEIN_WRONG_PRIMES = re.compile(r'[\u2032\u0060\u00B4]')  # prime (′), grave accent (`), acute accent (´)
 RE_PROTEIN_WEIGHT = re.compile(r' [0-9]+(?:\.[0-9]+)? k?da ', flags=re.IGNORECASE)
 RE_PROTEIN_SYMBOL = re.compile(r'[A-Z][a-z]{2}[A-Z][0-9]?')
-RE_DOMAIN_OF_UNKNOWN_FUCTION = re.compile(r'(DUF\d{3,4})', flags=re.IGNORECASE)
+RE_DOMAIN_OF_UNKNOWN_FUNCTION = re.compile(r'(DUF\d{3,4})', flags=re.IGNORECASE)
 RE_UNCHARACTERIZED_PROTEIN_FAMILY = re.compile(r'(UPF\d{3,4})', flags=re.IGNORECASE)
 RE_GENE_CAPITALIZED = re.compile(r'^[A-Z].+', flags=re.DOTALL)
 RE_GENE_SUSPECT_CHARS = re.compile(r'[\?]', flags=re.DOTALL)
@@ -105,18 +105,17 @@ def combine_annotation(feature: dict):
             product = ips_product
         for db_xref in ips['db_xrefs']:
             db_xrefs.add(db_xref)
-    rank = 0
-    for hit in expert_hits:
-        db_xrefs.update(hit.get('db_xrefs', []))
-        expert_rank = hit['rank']
-        if(expert_rank > rank):
-            expert_genes = hit.get('gene', None)
-            if(expert_genes):
-                expert_genes = expert_genes.replace('/', ',').split(',')
-                genes.update(expert_genes)
-                gene = expert_genes[0]
-            product = hit.get('product', None)
-            rank = expert_rank
+
+    if(len(expert_hits) > 0):
+        top_expert_hit = sorted(expert_hits,key=lambda k: (k['rank'], k.get('score', 0), calc_annotation_score(k)), reverse=True)[0]
+        expert_genes = top_expert_hit.get('gene', None)
+        if(expert_genes):
+            expert_genes = expert_genes.replace('/', ',').split(',')
+            genes.update(expert_genes)
+            gene = expert_genes[0]
+        product = top_expert_hit.get('product', None)
+        for hit in expert_hits:
+            db_xrefs.update(hit.get('db_xrefs', []))
 
     if(product):
         product = revise_cds_product(product)
@@ -408,8 +407,8 @@ def detect_feature_overlaps(genome: dict):
                 elif(sorf['start'] == overlap_sorf['start'] and sorf['stop'] == overlap_sorf['stop']):
                     continue  # same
                 else:  # overlap -> remove sorf
-                    score_sorf = calc_sorf_annotation_score(sorf)
-                    score_overlap_sorf = calc_sorf_annotation_score(overlap_sorf)
+                    score_sorf = calc_cds_annotation_score(sorf)
+                    score_overlap_sorf = calc_cds_annotation_score(overlap_sorf)
 
                     if(score_sorf < score_overlap_sorf):  # lower annotation score
                         overlap = f"[{max(sorf['start'], overlap_sorf['start'])},{min(sorf['stop'], overlap_sorf['stop'])}]"
@@ -435,39 +434,38 @@ def detect_feature_overlaps(genome: dict):
                         )
 
 
-def calc_sorf_annotation_score(sorf: dict) -> int:
+def calc_cds_annotation_score(cds: dict) -> int:
     """Calc an annotation score rewarding each identification & annotation"""
     score = 0
 
-    if('ups' in sorf):
+    if('ups' in cds):
         score += 1
 
-    ips = sorf.get('ips', None)
+    ips = cds.get('ips', None)
     if(ips):
         score += 1
-        ips_gene = ips.get('gene', None)
-        if(ips_gene):
-            score += 1
-        ips_product = ips.get('product', None)
-        if(ips_product):
-            score += 1
+        score += calc_annotation_score(ips)
 
-    psc = sorf.get('psc', None)
+    psc = cds.get('psc', None)
     if(psc):
         score += 1
-        psc_gene = psc.get('gene', None)
-        if(psc_gene):
-            score += 1
-        psc_product = psc.get('product', None)
-        if(psc_product):
-            score += 1
+        score += calc_annotation_score(psc)
     log.debug(
-        'sorf score: contig=%s, start=%i, stop=%i, gene=%s, product=%s, score=%i',
-        sorf['contig'], sorf['start'], sorf['stop'], sorf.get('gene', '-'), sorf.get('product', '-'), score
+        'cds score: contig=%s, start=%i, stop=%i, gene=%s, product=%s, score=%i',
+        cds['contig'], cds['start'], cds['stop'], cds.get('gene', '-'), cds.get('product', '-'), score
     )
     return score
 
 
+def calc_annotation_score(orf:dict) -> int:
+    score = 0
+    if(orf.get('gene', None)):
+        score += 1
+    if(orf.get('product', None)):
+        score += 1
+    return score
+
+
 def extract_protein_gene_symbol(product: str) -> str:
     gene_symbols = []
     for part in product.split(' '):  # try to extract valid gene symbols
@@ -575,7 +573,7 @@ def revise_cds_product(product: str):
 
     old_product = product
     dufs = []  # replace DUF-containing products
-    for m in RE_DOMAIN_OF_UNKNOWN_FUCTION.finditer(product):
+    for m in RE_DOMAIN_OF_UNKNOWN_FUNCTION.finditer(product):
         dufs.append(m.group(1).upper())
     if(len(dufs) >= 1):
         product = f"{' '.join(dufs)} domain{'s' if len(dufs) > 1 else ''}-containing protein"