From 1706350dfd39c103d2f5c4e558b37a8ba1fbadbe Mon Sep 17 00:00:00 2001 From: jhahnfel Date: Thu, 22 Jun 2023 12:03:53 +0200 Subject: [PATCH 1/5] replace hmmer with pyhmmer --- README.md | 3 +- bakta/features/cds.py | 123 ++++++++++++++++++++---------------------- bakta/features/orf.py | 90 +++++++++++++++---------------- bakta/main.py | 8 +-- environment.yml | 2 +- test/test-genomes.nf | 2 +- 6 files changed, 107 insertions(+), 121 deletions(-) diff --git a/README.md b/README.md index 54c255d4..c7c59656 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ Bakta requires the following 3rd party software tools which must be installed an - INFERNAL (1.1.4) - PILER-CR (1.06) - Pyrodigal (2.0.2) -- Hmmer (3.3.2) +- PyHMMER (0.8.2) - Diamond (2.0.14) - Blast+ (2.12.0) - AMRFinderPlus (3.10.23) @@ -738,6 +738,7 @@ Bakta is *standing on the shoulder of giants* taking advantage of many great sof - Pyrodigal - Diamond - BLAST+ +- PyHMMER - HMMER - AMRFinderPlus - DeepSig diff --git a/bakta/features/cds.py b/bakta/features/cds.py index 562e6fff..128ccce7 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -10,6 +10,7 @@ from pathlib import Path import pyrodigal +import pyhmmer from Bio.Seq import Seq from Bio.SeqUtils.ProtParam import ProteinAnalysis @@ -166,75 +167,69 @@ def create_cdss(genes, contig): def predict_pfam(cdss: Sequence[dict]) -> Sequence[dict]: """Detect Pfam-A entries""" - fasta_path = cfg.tmp_path.joinpath('hypotheticals.faa') - orf.write_internal_faa(cdss, fasta_path) - output_path = cfg.tmp_path.joinpath('cds.hypotheticals.pfam.tsv') - cmd = [ - 'hmmsearch', - '--noali', - '--cut_ga', # use gathering cutoff - '--domtblout', str(output_path), - '--cpu', str(cfg.threads if cfg.threads <= 4 else 4), - str(cfg.db_path.joinpath('pfam')), - str(fasta_path) + alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino() + proteins: list[pyhmmer.easel.DigitalSequence] = [ + pyhmmer.easel.TextSequence(sequence=cds['aa'], + name=bytes(orf.get_orf_key(cds), + 'UTF-8')).digitize(alphabet) for cds in cdss ] - log.debug('cmd=%s', cmd) - proc = sp.run( - cmd, - cwd=str(cfg.tmp_path), - env=cfg.env, - stdout=sp.PIPE, - stderr=sp.PIPE, - universal_newlines=True - ) - if(proc.returncode != 0): - log.debug('stdout=\'%s\', stderr=\'%s\'', proc.stdout, proc.stderr) - log.warning('pfam detection failed! hmmsearch-error-code=%d', proc.returncode) - raise Exception(f'hmmsearch error! error code: {proc.returncode}') + + hits: list[dict] = [] + with pyhmmer.plan7.HMMFile(cfg.db_path.joinpath('pfam')) as hmm: + for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads): + for hit in top_hits: + hits.append( + { + 'query': hit.name.decode(), + 'subject_name': hit.best_domain.alignment.hmm_name.decode(), + 'subject_id': hit.best_domain.alignment.hmm_accession.decode(), + 'bitscore': hit.score, + 'evalue': hit.evalue, + 'domain_length': len(hit.best_domain.alignment.hmm_sequence), + 'domain_start': hit.best_domain.alignment.hmm_from, + 'domain_stop': hit.best_domain.alignment.hmm_to, + 'aa_start': hit.best_domain.alignment.target_from, + 'aa_stop': hit.best_domain.alignment.target_to + } + ) pfam_hits = [] - cds_pfams_hits = {} + cds_pfams_hits = [] orf_by_aa_digest = orf.get_orf_dictionary(cdss) - with output_path.open() as fh: - for line in fh: - if(line[0] != '#'): - cols = bc.RE_MULTIWHITESPACE.split(line.strip()) - aa_identifier = cols[0] - cds = orf_by_aa_digest[aa_identifier] - - domain_length = int(cols[5]) - domain_start = int(cols[15]) - domain_stop = int(cols[16]) - domain_cov = (domain_stop - domain_start + 1) / domain_length - aa_start = int(cols[19]) - aa_stop = int(cols[20]) - aa_cov = (aa_stop - aa_start + 1) / len(cds['aa']) - - pfam = OrderedDict() - pfam['id'] = cols[4] - pfam['name'] = cols[3] - pfam['length'] = domain_length - pfam['aa_cov'] = aa_cov - pfam['hmm_cov'] = domain_cov - pfam['evalue'] = float(cols[6]) - pfam['score'] = float(cols[7]) - pfam['start'] = aa_start - pfam['stop'] = aa_stop - - if('pfams' not in cds): - cds['pfams'] = [] - cds['pfams'].append(pfam) - if('db_xrefs' not in cds): - cds['db_xrefs'] = [] - cds['db_xrefs'].append(f"PFAM:{pfam['id']}") - pfam_hits.append(cds) - cds_pfams_hits[aa_identifier] = cds - log.info( - 'pfam detected: contig=%s, start=%i, stop=%i, strand=%s, pfam-id=%s, length=%i, aa-start=%i, aa-stop=%i, aa-cov=%1.1f, hmm-cov=%1.1f, evalue=%1.1e, bitscore=%1.1f, name=%s', - cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'], pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name'] - ) + for hit in hits: + cds = orf_by_aa_digest[hit['query']] + + domain_length = hit['domain_length'] + domain_start = hit['domain_start'] + domain_stop = hit['domain_stop'] + domain_cov = (domain_stop - domain_start + 1) / domain_length + aa_start = hit['aa_start'] + aa_stop = hit['aa_stop'] + aa_cov = (aa_stop - aa_start + 1) / len(cds['aa']) + + pfam = OrderedDict() + pfam['name'] = hit['subject_name'] + pfam['id'] = hit['subject_id'] + pfam['length'] = domain_length + pfam['aa_cov'] = aa_cov + pfam['hmm_cov'] = domain_cov + pfam['evalue'] = hit['evalue'] + pfam['score'] = hit['bitscore'] + pfam['start'] = aa_start + pfam['stop'] = aa_stop + + cds.setdefault('pfams', []) + cds['pfams'].append(pfam) + cds.setdefault('db_xrefs', []) + cds['db_xrefs'].append(f"PFAM:{pfam['id']}") + pfam_hits.append(cds) + cds_pfams_hits.append(cds) + log.info( + 'pfam detected: contig=%s, start=%i, stop=%i, strand=%s, pfam-id=%s, length=%i, aa-start=%i, aa-stop=%i, aa-cov=%1.1f, hmm-cov=%1.1f, evalue=%1.1e, bitscore=%1.1f, name=%s', + cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'], pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name'] + ) log.info('predicted-pfams=%i, CDS-w/-pfams=%i', len(pfam_hits), len(cds_pfams_hits)) - return cds_pfams_hits.values() + return cds_pfams_hits def analyze_proteins(cdss: Sequence[dict]): diff --git a/bakta/features/orf.py b/bakta/features/orf.py index 5dcaa171..3937d502 100644 --- a/bakta/features/orf.py +++ b/bakta/features/orf.py @@ -5,6 +5,8 @@ from pathlib import Path from typing import Dict, Sequence +import pyhmmer + import bakta.config as cfg import bakta.constants as bc @@ -12,59 +14,51 @@ log = logging.getLogger('ORF') -def detect_spurious(orfs: Sequence[dict], orf_aa_path: Path): +def detect_spurious(orfs: Sequence[dict]): """Detect spurious ORFs with AntiFam""" - output_path = cfg.tmp_path.joinpath('cds.spurious.hmm.tsv') - cmd = [ - 'hmmsearch', - '--noali', - '--cut_ga', # use gathering cutoff - '--tblout', str(output_path), - '--cpu', str(cfg.threads if cfg.threads <= 4 else 4), - str(cfg.db_path.joinpath('antifam')), - str(orf_aa_path) + alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino() + proteins: list[pyhmmer.easel.DigitalSequence] = [ + pyhmmer.easel.TextSequence(sequence=orf['aa'], + name=bytes(get_orf_key(orf), + 'UTF-8')).digitize(alphabet) for orf in orfs ] - log.debug('cmd=%s', cmd) - proc = sp.run( - cmd, - cwd=str(cfg.tmp_path), - env=cfg.env, - stdout=sp.PIPE, - stderr=sp.PIPE, - universal_newlines=True - ) - if(proc.returncode != 0): - log.debug('stdout=\'%s\', stderr=\'%s\'', proc.stdout, proc.stderr) - log.warning('spurious ORF detection failed! hmmsearch-error-code=%d', proc.returncode) - raise Exception(f'hmmsearch error! error code: {proc.returncode}') + + hits: list[dict] = [] + with pyhmmer.plan7.HMMFile(cfg.db_path.joinpath('antifam')) as hmm: + for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads): + for hit in top_hits: + hits.append( + { + 'query': hit.name.decode(), + 'subject_name': hit.best_domain.alignment.hmm_name.decode(), + 'subject_id': hit.best_domain.alignment.hmm_accession.decode(), + 'bitscore': hit.score, + 'evalue': hit.evalue, + } + ) discarded_orfs = [] orf_by_aa_digest = get_orf_dictionary(orfs) - with output_path.open() as fh: - for line in fh: - if(line[0] != '#'): - (aa_identifier, _, subject_name, subject_id, evalue, bitscore, _) = line.strip().split(maxsplit=6) - orf = orf_by_aa_digest[aa_identifier] - evalue = float(evalue) - bitscore = float(bitscore) - if(evalue > 1E-5): - log.debug( - 'discard low spurious E value: contig=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f', - orf['contig'], orf['start'], orf['stop'], orf['strand'], subject_name, evalue, bitscore - ) - else: - discard = OrderedDict() - discard['type'] = bc.DISCARD_TYPE_SPURIOUS - discard['description'] = f'(partial) homology to spurious sequence HMM (AntiFam:{subject_id})' - discard['score'] = bitscore - discard['evalue'] = evalue - - orf['discarded'] = discard - discarded_orfs.append(orf) - log.info( - 'discard spurious: contig=%s, start=%i, stop=%i, strand=%s, homology=%s, evalue=%1.1e, bitscore=%f', - orf['contig'], orf['start'], orf['stop'], orf['strand'], subject_name, evalue, bitscore - ) + for hit in hits: + orf = orf_by_aa_digest[hit['query']] + if hit['evalue'] > 1E-5: + log.debug( + 'discard low spurious E value: contig=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f', + orf['contig'], orf['start'], orf['stop'], orf['strand'], hit['subject_name'], hit['evalue'], hit['bitscore'] + ) + else: + discard = OrderedDict() + discard['type'] = bc.DISCARD_TYPE_SPURIOUS + discard['description'] = f'(partial) homology to spurious sequence HMM (AntiFam:{hit["subject_id"]})' + discard['score'] = hit['bitscore'] + discard['evalue'] = hit['evalue'] + + orf['discarded'] = discard + discarded_orfs.append(orf) + log.info( + 'discard spurious: contig=%s, start=%i, stop=%i, strand=%s, homology=%s, evalue=%1.1e, bitscore=%f', + orf['contig'], orf['start'], orf['stop'], orf['strand'], hit['subject_name'], hit['evalue'], hit['bitscore'] + ) log.info('discarded=%i', len(discarded_orfs)) return discarded_orfs diff --git a/bakta/main.py b/bakta/main.py index b473e59c..89ad2fe4 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -230,9 +230,7 @@ def main(): if(len(cdss) > 0): log.debug('detect spurious CDS') - cds_aa_path = cfg.tmp_path.joinpath('cds.spurious.faa') - orf.write_internal_faa(cdss, cds_aa_path) - discarded_cdss = orf.detect_spurious(cdss, cds_aa_path) + discarded_cdss = orf.detect_spurious(cdss) print(f'\tdiscarded spurious: {len(discarded_cdss)}') cdss = [cds for cds in cdss if 'discarded' not in cds] @@ -339,9 +337,7 @@ def main(): if(len(sorfs) > 0): log.debug('detect spurious sORF') - sorf_aa_path = cfg.tmp_path.joinpath('sorf.spurious.faa') - orf.write_internal_faa(sorfs, sorf_aa_path) - discarded_sorfs = orf.detect_spurious(sorfs, sorf_aa_path) + discarded_sorfs = orf.detect_spurious(sorfs) print(f'\tdiscarded spurious: {len(discarded_sorfs)}') sorfs = [sorf for sorf in sorfs if 'discarded' not in sorf] diff --git a/environment.yml b/environment.yml index e464ae24..0ec9b4c4 100644 --- a/environment.yml +++ b/environment.yml @@ -14,7 +14,7 @@ dependencies: - aragorn>=1.2.41 - infernal>=1.1.4 - piler-cr - - hmmer>=3.3.2 + - pyhmmer>=0.8.2 - diamond>=2.0.14 - blast>=2.12.0 - ncbi-amrfinderplus>=3.11.2 diff --git a/test/test-genomes.nf b/test/test-genomes.nf index 5026d8ba..d175e620 100644 --- a/test/test-genomes.nf +++ b/test/test-genomes.nf @@ -65,6 +65,6 @@ process bakta { script: completeOption = complete ? '--complete' : '' """ - ${baseDir}/../bin/bakta --db ${pathDb} --verbose --prefix ${accession} --genus ${genus} --species "${species}" --strain "${strain}" --keep-contig-headers --threads ${task.cpus} ${completeOption} assembly.fna + ${baseDir}/../bin/bakta --force --db ${pathDb} --verbose --prefix ${accession} --genus ${genus} --species "${species}" --strain "${strain}" --keep-contig-headers --threads ${task.cpus} ${completeOption} assembly.fna """ } From d4d5b8deda2203cd130c2243e664b3a4444b919a Mon Sep 17 00:00:00 2001 From: jhahnfel Date: Mon, 26 Jun 2023 09:30:18 +0200 Subject: [PATCH 2/5] Add correct version of pyhmmer --- environment.yml | 2 +- setup.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 0ec9b4c4..fbc4e100 100644 --- a/environment.yml +++ b/environment.yml @@ -14,7 +14,7 @@ dependencies: - aragorn>=1.2.41 - infernal>=1.1.4 - piler-cr - - pyhmmer>=0.8.2 + - pyhmmer>=0.8.1 - diamond>=2.0.14 - blast>=2.12.0 - ncbi-amrfinderplus>=3.11.2 diff --git a/setup.py b/setup.py index 9e5a7804..2cdd5e3a 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,8 @@ 'requests >= 2.25.1', 'alive-progress >= 3.0.1', 'PyYAML >= 6.0', - 'pyrodigal >= 2.1.0' + 'pyrodigal >= 2.1.0', + 'pyhmmer >= 0.8.1' ], entry_points={ 'console_scripts': [ From d0ad7ac965c9e7b1a918a028061b4a790f8c5e84 Mon Sep 17 00:00:00 2001 From: jhahnfel Date: Mon, 26 Jun 2023 10:00:19 +0200 Subject: [PATCH 3/5] Undo --force parameter in test-genomes.nf --- test/test-genomes.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test-genomes.nf b/test/test-genomes.nf index d175e620..5026d8ba 100644 --- a/test/test-genomes.nf +++ b/test/test-genomes.nf @@ -65,6 +65,6 @@ process bakta { script: completeOption = complete ? '--complete' : '' """ - ${baseDir}/../bin/bakta --force --db ${pathDb} --verbose --prefix ${accession} --genus ${genus} --species "${species}" --strain "${strain}" --keep-contig-headers --threads ${task.cpus} ${completeOption} assembly.fna + ${baseDir}/../bin/bakta --db ${pathDb} --verbose --prefix ${accession} --genus ${genus} --species "${species}" --strain "${strain}" --keep-contig-headers --threads ${task.cpus} ${completeOption} assembly.fna """ } From a693f4333e82131d33e84a54b952df8ca2f24bec Mon Sep 17 00:00:00 2001 From: jhahnfel Date: Mon, 26 Jun 2023 12:05:14 +0200 Subject: [PATCH 4/5] Correct PyHMMER version --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c7c59656..bcb6c41b 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ Bakta requires the following 3rd party software tools which must be installed an - INFERNAL (1.1.4) - PILER-CR (1.06) - Pyrodigal (2.0.2) -- PyHMMER (0.8.2) +- PyHMMER (0.8.1) - Diamond (2.0.14) - Blast+ (2.12.0) - AMRFinderPlus (3.10.23) From 0a1af0176bb298232e572db7fc2c4639a50af7e6 Mon Sep 17 00:00:00 2001 From: jhahnfel Date: Mon, 26 Jun 2023 12:58:11 +0200 Subject: [PATCH 5/5] Refactor hmmscan hit loops --- bakta/features/cds.py | 79 ++++++++++++++++--------------------------- bakta/features/orf.py | 56 +++++++++++++----------------- 2 files changed, 52 insertions(+), 83 deletions(-) diff --git a/bakta/features/cds.py b/bakta/features/cds.py index 128ccce7..70b57ef2 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -174,60 +174,39 @@ def predict_pfam(cdss: Sequence[dict]) -> Sequence[dict]: 'UTF-8')).digitize(alphabet) for cds in cdss ] - hits: list[dict] = [] + pfam_hits = [] + cds_pfams_hits = [] + orf_by_aa_digest = orf.get_orf_dictionary(cdss) with pyhmmer.plan7.HMMFile(cfg.db_path.joinpath('pfam')) as hmm: for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads): for hit in top_hits: - hits.append( - { - 'query': hit.name.decode(), - 'subject_name': hit.best_domain.alignment.hmm_name.decode(), - 'subject_id': hit.best_domain.alignment.hmm_accession.decode(), - 'bitscore': hit.score, - 'evalue': hit.evalue, - 'domain_length': len(hit.best_domain.alignment.hmm_sequence), - 'domain_start': hit.best_domain.alignment.hmm_from, - 'domain_stop': hit.best_domain.alignment.hmm_to, - 'aa_start': hit.best_domain.alignment.target_from, - 'aa_stop': hit.best_domain.alignment.target_to - } + cds = orf_by_aa_digest[hit.name.decode()] + + domain_cov = (hit.best_domain.alignment.hmm_to - hit.best_domain.alignment.hmm_from + 1) / len(hit.best_domain.alignment.hmm_sequence) + aa_cov = (hit.best_domain.alignment.target_to - hit.best_domain.alignment.target_from + 1) / len(cds['aa']) + + pfam = OrderedDict() + pfam['name'] = hit.best_domain.alignment.hmm_name.decode() + pfam['id'] = hit.best_domain.alignment.hmm_accession.decode() + pfam['length'] = len(hit.best_domain.alignment.hmm_sequence) + pfam['aa_cov'] = aa_cov + pfam['hmm_cov'] = domain_cov + pfam['evalue'] = hit.evalue + pfam['score'] = hit.score + pfam['start'] = hit.best_domain.alignment.target_from + pfam['stop'] = hit.best_domain.alignment.target_to + + cds.setdefault('pfams', []) + cds['pfams'].append(pfam) + cds.setdefault('db_xrefs', []) + cds['db_xrefs'].append(f"PFAM:{pfam['id']}") + pfam_hits.append(cds) + cds_pfams_hits.append(cds) + log.info( + 'pfam detected: contig=%s, start=%i, stop=%i, strand=%s, pfam-id=%s, length=%i, aa-start=%i, aa-stop=%i, aa-cov=%1.1f, hmm-cov=%1.1f, evalue=%1.1e, bitscore=%1.1f, name=%s', + cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'], + pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name'] ) - - pfam_hits = [] - cds_pfams_hits = [] - orf_by_aa_digest = orf.get_orf_dictionary(cdss) - for hit in hits: - cds = orf_by_aa_digest[hit['query']] - - domain_length = hit['domain_length'] - domain_start = hit['domain_start'] - domain_stop = hit['domain_stop'] - domain_cov = (domain_stop - domain_start + 1) / domain_length - aa_start = hit['aa_start'] - aa_stop = hit['aa_stop'] - aa_cov = (aa_stop - aa_start + 1) / len(cds['aa']) - - pfam = OrderedDict() - pfam['name'] = hit['subject_name'] - pfam['id'] = hit['subject_id'] - pfam['length'] = domain_length - pfam['aa_cov'] = aa_cov - pfam['hmm_cov'] = domain_cov - pfam['evalue'] = hit['evalue'] - pfam['score'] = hit['bitscore'] - pfam['start'] = aa_start - pfam['stop'] = aa_stop - - cds.setdefault('pfams', []) - cds['pfams'].append(pfam) - cds.setdefault('db_xrefs', []) - cds['db_xrefs'].append(f"PFAM:{pfam['id']}") - pfam_hits.append(cds) - cds_pfams_hits.append(cds) - log.info( - 'pfam detected: contig=%s, start=%i, stop=%i, strand=%s, pfam-id=%s, length=%i, aa-start=%i, aa-stop=%i, aa-cov=%1.1f, hmm-cov=%1.1f, evalue=%1.1e, bitscore=%1.1f, name=%s', - cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'], pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name'] - ) log.info('predicted-pfams=%i, CDS-w/-pfams=%i', len(pfam_hits), len(cds_pfams_hits)) return cds_pfams_hits diff --git a/bakta/features/orf.py b/bakta/features/orf.py index 3937d502..9d2b7bc8 100644 --- a/bakta/features/orf.py +++ b/bakta/features/orf.py @@ -23,42 +23,32 @@ def detect_spurious(orfs: Sequence[dict]): 'UTF-8')).digitize(alphabet) for orf in orfs ] - hits: list[dict] = [] + discarded_orfs = [] + orf_by_aa_digest = get_orf_dictionary(orfs) with pyhmmer.plan7.HMMFile(cfg.db_path.joinpath('antifam')) as hmm: for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads): for hit in top_hits: - hits.append( - { - 'query': hit.name.decode(), - 'subject_name': hit.best_domain.alignment.hmm_name.decode(), - 'subject_id': hit.best_domain.alignment.hmm_accession.decode(), - 'bitscore': hit.score, - 'evalue': hit.evalue, - } - ) - - discarded_orfs = [] - orf_by_aa_digest = get_orf_dictionary(orfs) - for hit in hits: - orf = orf_by_aa_digest[hit['query']] - if hit['evalue'] > 1E-5: - log.debug( - 'discard low spurious E value: contig=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f', - orf['contig'], orf['start'], orf['stop'], orf['strand'], hit['subject_name'], hit['evalue'], hit['bitscore'] - ) - else: - discard = OrderedDict() - discard['type'] = bc.DISCARD_TYPE_SPURIOUS - discard['description'] = f'(partial) homology to spurious sequence HMM (AntiFam:{hit["subject_id"]})' - discard['score'] = hit['bitscore'] - discard['evalue'] = hit['evalue'] - - orf['discarded'] = discard - discarded_orfs.append(orf) - log.info( - 'discard spurious: contig=%s, start=%i, stop=%i, strand=%s, homology=%s, evalue=%1.1e, bitscore=%f', - orf['contig'], orf['start'], orf['stop'], orf['strand'], hit['subject_name'], hit['evalue'], hit['bitscore'] - ) + orf = orf_by_aa_digest[hit.name.decode()] + if hit.evalue > 1E-5: + log.debug( + 'discard low spurious E value: contig=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f', + orf['contig'], orf['start'], orf['stop'], orf['strand'], + hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score + ) + else: + discard = OrderedDict() + discard['type'] = bc.DISCARD_TYPE_SPURIOUS + discard['description'] = f'(partial) homology to spurious sequence HMM ' \ + f'(AntiFam:{hit.best_domain.alignment.hmm_accession.decode()})' + discard['score'] = hit.score + discard['evalue'] = hit.evalue + + orf['discarded'] = discard + discarded_orfs.append(orf) + log.info( + 'discard spurious: contig=%s, start=%i, stop=%i, strand=%s, homology=%s, evalue=%1.1e, bitscore=%f', + orf['contig'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score + ) log.info('discarded=%i', len(discarded_orfs)) return discarded_orfs