Skip to content

Commit

Permalink
Merge branch 'main' into fix-cds-products
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers authored Oct 10, 2024
2 parents cb8815c + 5d15f39 commit be07551
Show file tree
Hide file tree
Showing 25 changed files with 2,753 additions and 164 deletions.
27 changes: 25 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,8 @@ original locus id | new locus id | type | topology | name
NODE_1 | chrom | `chromosome` | `circular` | `-`
NODE_2 | p1 | `plasmid` | `c` | `pXYZ1`
NODE_3 | p2 | `p` | `c` | `pXYZ2`
NODE_4 | special-contig-name-xyz | `-` | -
NODE_5 | `` | `-` | -
NODE_4 | special-contig-name-xyz | `-` | `-` | `-`
NODE_5 | `` | `-` | `-` | `-`

#### User-provided regions

Expand Down Expand Up @@ -274,6 +274,27 @@ dbxrefs | `<empty>`, `db:id`, `,` separated list | `VFDB:VF0511`

Protein sequences provided in short Fasta or GenBank format are searched with default thresholds of 90%, 80% and 80% for minimal identity, query and subject coverage, respectively.

#### User-provided HMMs

Bakta accepts user-provided trusted HMMs via `--hmms` in HMMER's text format. If set, Bakta will adhere to the *trusted cutoff* specified in the HMM header. In addition, a max. evalue threshold of 1e-6 is applied. By default, Bakta uses the HMM description line as a product description. Further information can be provided via the HMM description line using the *short* format as explained above in the [User-provided protein sequences](####user-provided-protein-sequences) section.

```bash
# default
HMMER3/f [3.1b2 | February 2015]
NAME id
ACC id
DESC product
LENG 435
TC 600 600

# short
NAME id
ACC id
DESC gene~~~product~~~dbxrefs
LENG 435
TC 600 600
```

### Output

Annotation results are provided in standard bioinformatics file formats:
Expand All @@ -285,6 +306,7 @@ Annotation results are provided in standard bioinformatics file formats:
- `<prefix>.fna`: replicon/contig DNA sequences as FASTA
- `<prefix>.ffn`: feature nucleotide sequences as FASTA
- `<prefix>.faa`: CDS/sORF amino acid sequences as FASTA
- `<prefix>.inference.tsv`: inference metrics (score, evalue, coverage, identity) for annotated accessions as TSV
- `<prefix>.hypotheticals.tsv`: further information on hypothetical protein CDS as simple human readble tab separated values
- `<prefix>.hypotheticals.faa`: hypothetical protein CDS amino acid sequences as FASTA
- `<prefix>.json`: all (internal) annotation & sequence information as JSON
Expand Down Expand Up @@ -393,6 +415,7 @@ Annotation:
Replicon information table (tsv/csv)
--regions REGIONS Path to pre-annotated regions in GFF3 or Genbank format (regions only, no functional annotations).
--proteins PROTEINS Fasta file of trusted protein sequences for CDS annotation
--hmms HMMS HMM file of trusted hidden markov models in HMMER format for CDS annotation
--meta Run in metagenome mode. This only affects CDS prediction.

Workflow:
Expand Down
17 changes: 15 additions & 2 deletions bakta/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import multiprocessing as mp
import os
import re
import shutil
import sys
import tempfile

Expand Down Expand Up @@ -55,6 +54,7 @@
replicons = None
compliant = None
user_proteins = None
user_hmms = None
meta = None
regions = None

Expand Down Expand Up @@ -162,7 +162,7 @@ def setup(args):
taxon = None

# annotation configurations
global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, meta, regions
global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, user_hmms, meta, regions
complete = args.complete
log.info('complete=%s', complete)
prodigal_tf = args.prodigal_tf
Expand Down Expand Up @@ -236,6 +236,19 @@ def setup(args):
sys.exit(f'ERROR: replicon table file ({replicons}) not valid!')
log.info('replicon-table=%s', replicons)
user_proteins = check_user_proteins(args)
user_hmms = args.hmms
if(user_hmms is not None):
try:
if(user_hmms == ''):
raise ValueError('File path argument must be non-empty')
user_hmms_path = Path(user_hmms).resolve()
check_readability('HMM', user_hmms_path)
check_content_size('HMM', user_hmms_path)
user_hmms = user_hmms_path
except:
log.error('provided HMM file not valid! path=%s', user_hmms)
sys.exit(f'ERROR: HMM file ({user_hmms}) not valid!')

regions = args.regions
if(regions is not None):
try:
Expand Down
1 change: 1 addition & 0 deletions bakta/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
MIN_PSC_IDENTITY = 0.9 # min protein identity for PSC detection
MIN_SORF_COVERAGE = 0.9 # min sORF coverage for PSC detection
MIN_SORF_IDENTITY = 0.9 # min sORF identity for PSC detection
MIN_HMM_EVALUE = 1e-6 # min evalue for CDS HMM searches
HYPOTHETICAL_PROTEIN = 'hypothetical protein' # hypothetical protein product description
CDS_MAX_OVERLAPS = 30 # max overlap [bp] allowed for user-provided/de novo-predicted CDS overlaps
CDS_SOURCE_USER = 'user-provided'
Expand Down
73 changes: 73 additions & 0 deletions bakta/expert/protein_hmms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import logging

from typing import Sequence

import pyhmmer

import bakta.config as cfg
import bakta.constants as bc
import bakta.features.orf as orf


log = logging.getLogger('EXPERT_AA_HMM')


def search(cdss: Sequence[dict], user_hmms_path):
"""Detect Pfam-A entries"""
cds_found = set()
orf_by_aa_digest = orf.get_orf_dictionary(cdss)
alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino()
proteins: list[pyhmmer.easel.DigitalSequence] = [ pyhmmer.easel.TextSequence(sequence=cds['aa'], name=bytes(orf.get_orf_key(cds), 'UTF-8')).digitize(alphabet) for cds in cdss ]
with pyhmmer.plan7.HMMFile(user_hmms_path) as hmms_fh:
hmms = list(hmms_fh)
for hmm_query_hits in pyhmmer.hmmsearch(hmms, proteins, bit_cutoffs='trusted', cpus=cfg.threads):
hmm_id = hmm_query_hits.query.accession.decode()
hmm_length = hmm_query_hits.query.M
hmm_description = hmm_query_hits.query.description.decode()
hmms_description_fields = hmm_description.split('~~~')
for hmm_query_hit in hmm_query_hits.reported:
aa_identifier = hmm_query_hit.name.decode()
cds = orf_by_aa_digest[aa_identifier]
if hmm_query_hit.evalue > bc.MIN_HMM_EVALUE:
log.debug(
'discard low evalue: contig=%s, start=%i, stop=%i, strand=%s, id=%s, evalue=%1.1e, bitscore=%f',
cds['contig'], cds['start'], cds['stop'], cds['strand'], hmm_id, hmm_query_hit.evalue, hmm_query_hit.score
)
else:
hit_domain_lengths_sum = sum([len(dom.alignment.hmm_sequence) for dom in hmm_query_hit.domains.included])
hit = {
'type': 'user_hmms',
'rank': 100,
'id': hmm_id,
'length': hit_domain_lengths_sum,
'aa_cov': hit_domain_lengths_sum / len(cds['aa']),
'hmm_cov': hit_domain_lengths_sum / hmm_length,
'evalue': hmm_query_hit.evalue,
'score': hmm_query_hit.score,
'start': hmm_query_hit.best_domain.alignment.target_from,
'stop': hmm_query_hit.best_domain.alignment.target_to,
'db_xrefs': [f'UserHMM:{hmm_id}'],
'gene': None,
'product': None
}
# read user-provided gene, product, db_xrefs via HMM's description field provided as <gene>~~~<product>~~~<db_xrefs>
if(len(hmms_description_fields) == 1):
hit['product'] = hmm_description if hmm_description != '' else None
elif(len(hmms_description_fields) == 3):
(gene, product, dbxrefs) = hmms_description_fields
hit['gene'] = gene
hit['product'] = product
hit['db_xrefs'].extend(set(dbxrefs.split(',')))
else:
log.warning('wrong HMM description format: id=%s, desc=%s', hmm_id, hmm_description)

cds.setdefault('expert', [])
cds['expert'].append(hit)
log.debug(
'hit: source=UserHMMs, rank=99, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, hmm-id=%s, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
cds['contig'], cds['start'], cds['stop'], cds['strand'], hit['aa_cov'], hit['hmm_cov'], hmm_id, hit['gene'], hit['product'], hit['evalue'], hit['score']
)
cds_found.add(aa_identifier)

log.info('found=%i', len(cds_found))
return cds_found
22 changes: 11 additions & 11 deletions bakta/expert/protein_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
from pathlib import Path
from typing import Sequence

from Bio import SeqIO
from xopen import xopen

import bakta.config as cfg
import bakta.constants as bc
import bakta.features.orf as orf

from Bio import SeqIO
from xopen import xopen


log = logging.getLogger('EXPERT_AA_SEQ')

Expand All @@ -26,7 +26,7 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
'--db', str(db_path),
'--out', str(diamond_output_path),
'--max-target-seqs', '1', # single best output
'--outfmt', '6', 'qseqid', 'sseqid', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'stitle',
'--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'stitle',
'--threads', str(cfg.threads),
'--tmpdir', str(cfg.tmp_path), # use tmp folder
'--block-size', '4', # increase block size for faster executions
Expand All @@ -51,13 +51,13 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
cds_by_hexdigest = orf.get_orf_dictionary(cdss)
with diamond_output_path.open() as fh:
for line in fh:
(aa_identifier, model_id, model_length, alignment_length, identity, evalue, bitscore, model_title) = line.strip().split('\t')
(aa_identifier, model_id, query_length, model_length, alignment_length, identity, evalue, bitscore, model_title) = line.strip().split('\t')
cds = cds_by_hexdigest[aa_identifier]
query_cov = int(alignment_length) / len(cds['aa'])
model_cov = int(alignment_length) / int(model_length)
identity = float(identity) / 100
evalue = float(evalue)
bitscore = float(bitscore)
evalue = float(evalue)
(source, rank, min_identity, min_query_cov, min_model_cov, gene, product, dbxrefs) = model_title.split(' ', 1)[1].split('~~~')
rank = int(rank)
min_identity = float(min_identity) / 100
Expand All @@ -72,19 +72,19 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
'gene': gene if gene != '' else None,
'product': product,
'query_cov': query_cov,
'model_cov': model_cov,
'subject_cov': model_cov,
'identity': identity,
'score': bitscore,
'evalue': evalue,
'bitscore': bitscore,
'db_xrefs': [] if dbxrefs == '' else dbxrefs.split(',')
}
if(expert_system == 'user_proteins'):
hit['db_xrefs'].append(f'UserProtein:{model_id}')
cds.setdefault('expert', [])
cds['expert'].append(hit)
log.debug(
'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, identity=%0.3f, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, gene, product, evalue, bitscore
'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, gene=%s, product=%s',
source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, bitscore, evalue, gene, product
)
cds_found.add(aa_identifier)

Expand Down Expand Up @@ -117,7 +117,7 @@ def write_user_protein_sequences(aa_fasta_path: Path):
with aa_fasta_path.open('w') as fh_out:
for user_protein in user_proteins:
(model_id, min_id, min_query_cov, min_model_cov, gene, product, dbxrefs, seq) = user_protein
fh_out.write(f">{model_id} UserProteins~~~{100}~~~{min_id}~~~{min_query_cov}~~~{min_model_cov}~~~{gene}~~~{product}~~~{','.join(dbxrefs)}\n{seq}\n")
fh_out.write(f">{model_id} UserProteins~~~{101}~~~{min_id}~~~{min_query_cov}~~~{min_model_cov}~~~{gene}~~~{product}~~~{','.join(dbxrefs)}\n{seq}\n")
log.debug(
'imported user aa: id=%s, length=%i, min-id=%f, min-query-cov=%f, min-model-cov=%f, gene=%s, product=%s, dbxrefs=%s',
model_id, len(seq), min_id, min_query_cov, min_model_cov, gene, product, dbxrefs
Expand Down
66 changes: 32 additions & 34 deletions bakta/features/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
RE_PROTEIN_WRONG_PRIMES = re.compile(r'[\u2032\u0060\u00B4]') # prime (′), grave accent (`), acute accent (´)
RE_PROTEIN_WEIGHT = re.compile(r' [0-9]+(?:\.[0-9]+)? k?da ', flags=re.IGNORECASE)
RE_PROTEIN_SYMBOL = re.compile(r'[A-Z][a-z]{2}[A-Z][0-9]?')
RE_DOMAIN_OF_UNKNOWN_FUCTION = re.compile(r'(DUF\d{3,4})', flags=re.IGNORECASE)
RE_DOMAIN_OF_UNKNOWN_FUNCTION = re.compile(r'(DUF\d{3,4})', flags=re.IGNORECASE)
RE_UNCHARACTERIZED_PROTEIN_FAMILY = re.compile(r'(UPF\d{3,4})', flags=re.IGNORECASE)
RE_GENE_CAPITALIZED = re.compile(r'^[A-Z].+', flags=re.DOTALL)
RE_GENE_SUSPECT_CHARS = re.compile(r'[\?]', flags=re.DOTALL)
Expand Down Expand Up @@ -105,18 +105,17 @@ def combine_annotation(feature: dict):
product = ips_product
for db_xref in ips['db_xrefs']:
db_xrefs.add(db_xref)
rank = 0
for hit in expert_hits:
db_xrefs.update(hit.get('db_xrefs', []))
expert_rank = hit['rank']
if(expert_rank > rank):
expert_genes = hit.get('gene', None)
if(expert_genes):
expert_genes = expert_genes.replace('/', ',').split(',')
genes.update(expert_genes)
gene = expert_genes[0]
product = hit.get('product', None)
rank = expert_rank

if(len(expert_hits) > 0):
top_expert_hit = sorted(expert_hits,key=lambda k: (k['rank'], k.get('score', 0), calc_annotation_score(k)), reverse=True)[0]
expert_genes = top_expert_hit.get('gene', None)
if(expert_genes):
expert_genes = expert_genes.replace('/', ',').split(',')
genes.update(expert_genes)
gene = expert_genes[0]
product = top_expert_hit.get('product', None)
for hit in expert_hits:
db_xrefs.update(hit.get('db_xrefs', []))

if(product):
product = revise_cds_product(product)
Expand Down Expand Up @@ -408,8 +407,8 @@ def detect_feature_overlaps(genome: dict):
elif(sorf['start'] == overlap_sorf['start'] and sorf['stop'] == overlap_sorf['stop']):
continue # same
else: # overlap -> remove sorf
score_sorf = calc_sorf_annotation_score(sorf)
score_overlap_sorf = calc_sorf_annotation_score(overlap_sorf)
score_sorf = calc_cds_annotation_score(sorf)
score_overlap_sorf = calc_cds_annotation_score(overlap_sorf)

if(score_sorf < score_overlap_sorf): # lower annotation score
overlap = f"[{max(sorf['start'], overlap_sorf['start'])},{min(sorf['stop'], overlap_sorf['stop'])}]"
Expand All @@ -435,39 +434,38 @@ def detect_feature_overlaps(genome: dict):
)


def calc_sorf_annotation_score(sorf: dict) -> int:
def calc_cds_annotation_score(cds: dict) -> int:
"""Calc an annotation score rewarding each identification & annotation"""
score = 0

if('ups' in sorf):
if('ups' in cds):
score += 1

ips = sorf.get('ips', None)
ips = cds.get('ips', None)
if(ips):
score += 1
ips_gene = ips.get('gene', None)
if(ips_gene):
score += 1
ips_product = ips.get('product', None)
if(ips_product):
score += 1
score += calc_annotation_score(ips)

psc = sorf.get('psc', None)
psc = cds.get('psc', None)
if(psc):
score += 1
psc_gene = psc.get('gene', None)
if(psc_gene):
score += 1
psc_product = psc.get('product', None)
if(psc_product):
score += 1
score += calc_annotation_score(psc)
log.debug(
'sorf score: contig=%s, start=%i, stop=%i, gene=%s, product=%s, score=%i',
sorf['contig'], sorf['start'], sorf['stop'], sorf.get('gene', '-'), sorf.get('product', '-'), score
'cds score: contig=%s, start=%i, stop=%i, gene=%s, product=%s, score=%i',
cds['contig'], cds['start'], cds['stop'], cds.get('gene', '-'), cds.get('product', '-'), score
)
return score


def calc_annotation_score(orf:dict) -> int:
score = 0
if(orf.get('gene', None)):
score += 1
if(orf.get('product', None)):
score += 1
return score


def extract_protein_gene_symbol(product: str) -> str:
gene_symbols = []
for part in product.split(' '): # try to extract valid gene symbols
Expand Down Expand Up @@ -575,7 +573,7 @@ def revise_cds_product(product: str):

old_product = product
dufs = [] # replace DUF-containing products
for m in RE_DOMAIN_OF_UNKNOWN_FUCTION.finditer(product):
for m in RE_DOMAIN_OF_UNKNOWN_FUNCTION.finditer(product):
dufs.append(m.group(1).upper())
if(len(dufs) >= 1):
product = f"{' '.join(dufs)} domain{'s' if len(dufs) > 1 else ''}-containing protein"
Expand Down
Loading

0 comments on commit be07551

Please sign in to comment.