Skip to content

Commit

Permalink
PD-5191, PD-5192: Only operons with >= 80% identity are reported; --t…
Browse files Browse the repository at this point in the history
…hreads
  • Loading branch information
Vyacheslav Brover committed Dec 14, 2024
1 parent 942a86a commit 52fb71b
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 23 deletions.
2 changes: 2 additions & 0 deletions common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1918,6 +1918,7 @@ void Root::saveFile (const string &fName) const



#if 0
void Root::trace (ostream& os,
const string& title) const
{
Expand All @@ -1927,6 +1928,7 @@ void Root::trace (ostream& os,
os << title << ": ";
saveText (os);
}
#endif



Expand Down
17 changes: 14 additions & 3 deletions common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ constexpr size_t no_index = numeric_limits<size_t>::max ();
static_assert ((size_t) 0 - 1 == no_index);

constexpr double NaN = numeric_limits<double>::quiet_NaN ();



// Global variables
Expand Down Expand Up @@ -165,6 +165,11 @@ void beep ();



template <typename T>
inline bool isNan (T x)
{ return x != x; }


// Comparison templates

template <typename T>
Expand Down Expand Up @@ -465,10 +470,14 @@ template <typename T /*container*/>
typedef map<string/*key*/,string/*value*/> KeyValue;

inline string find (const KeyValue& kv,
const string& key)
const string& key,
bool force)
{ const auto& it = kv. find (key);
if (it == kv. end ())
{ if (force)
return key;
throw runtime_error ("Key \"" + key + "\" is not found");
}
return it->second;
}

Expand Down Expand Up @@ -2087,15 +2096,17 @@ struct Root
saveText (oss);
return oss. str ();
}
#if 0
void trace (ostream& os,
const string& title) const;
#endif
virtual void saveXml (Xml::File& /*f*/) const
{ throwf ("Root::saveXml() is not implemented"); }
virtual Json* toJson (JsonContainer* /*parent_arg*/,
const string& /*name_arg*/) const
{ throwf ("Root::toJson() is not implemented"); }
virtual bool empty () const
{ return true; }
{ throwf ("Root::empty() is not implemented"); }
virtual void clear ()
{ throwf ("Root::clear() is not implemented"); }
// Postcondition: empty()
Expand Down
63 changes: 44 additions & 19 deletions stxtyper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@
* Dependencies: NCBI BLAST, gunzip (optional)
*
* Release changes:
* PD-5191 strong operons should not be partial
* --threads: reduces time by 30% for large input DNA
* Bug in frame shifts detection (for multiple frame shifts in the same protein)
* 1.0.29 12/12/2024 PD-5192 Only operons with >= 80% identity are reported // Only BLAST HSPs with identity >= 80% are considered
* 1.0.28 12/03/2024 tblastn -gapextend 2
* 10/30/2024 colorizeDir()
* 1.0.27 10/23/2024 PD-5155 "Hierarchy node" with mixed types is <stx1>::<stx2>
Expand Down Expand Up @@ -117,6 +121,7 @@ map<string,double> stxClass2identity;
// PAR
constexpr size_t intergenic_max {36}; // Max. intergenic region in the reference set + 2
constexpr size_t slack = 30;
constexpr double identity_min = 0.8; // PD-5192

const string stxS ("stx");
const string na ("NA");
Expand Down Expand Up @@ -150,6 +155,7 @@ struct BlastAlignment
, targetStart {0}, targetEnd {0}, targetLen {0};
// Positions are 0-based
// targetStart < targetEnd
size_t frame {no_index};
bool stopCodon {false};
bool frameshift {false};

Expand Down Expand Up @@ -253,6 +259,8 @@ struct BlastAlignment
refStart--;
targetStart--;

frame = (targetStart % 3) + 1;

//targetAlign = targetEnd - targetStart;
//QC_ASSERT (targetAlign_aa % 3 == 0);
//targetAlign_aa /= 3;
Expand Down Expand Up @@ -280,6 +288,7 @@ struct BlastAlignment
QC_ASSERT (refEnd - refStart <= length);
}
QC_ASSERT (! targetName. empty ());
QC_ASSERT (between<size_t> (frame, 1, 3));
QC_ASSERT (contains (stxClass2identity, stxClass));
QC_ASSERT (isLeft (stxType, stxClass));
QC_ASSERT (subunit == 'A' || subunit == 'B');
Expand All @@ -291,6 +300,10 @@ struct BlastAlignment
QC_IMPLY (! frameshift, length == targetSeq. size ());
QC_ASSERT (stxType. size () == 2);
}
void saveText (ostream &os) const
{
os << targetName << " (" << targetStart + 1 << '-' << targetEnd << ") " << refAccession << " (" << refStart + 1 << '-' << refEnd << ")" << '\n';
}
void saveTsvOut (TsvOut& td,
bool verboseP) const
{ if (! td. live ())
Expand Down Expand Up @@ -384,9 +397,11 @@ struct BlastAlignment
ASSERT (targetStart > prev. targetStart);
targetStart = prev. targetStart;
if (targetStrand)
refStart = prev. refStart;
//refStart = prev. refStart;
minimize (refStart, prev. refStart);
else
refEnd = prev. refEnd;
//refEnd = prev. refEnd;
maximize (refEnd, prev. refEnd);
// Approximately
length += prev. length;
nident += prev. nident;
Expand All @@ -397,8 +412,6 @@ struct BlastAlignment
stopCodon = true;
frameshift = true;
}
size_t getFrame () const
{ return (targetStart % 3) + 1; }
double getIdentity () const
{ return (double) nident / (double) (length); }
size_t getAbsCoverage () const
Expand All @@ -420,6 +433,10 @@ struct BlastAlignment
{ ASSERT (! truncated ());
return ! refStart && refEnd + 1 == refLen;
}
bool partial () const
{ return getRelCoverage () < 1.0
&& ! getExtended ();
}
bool insideEq (const BlastAlignment &other) const
{ return targetStart >= other. targetStart
&& targetEnd <= other. targetEnd;
Expand Down Expand Up @@ -681,17 +698,17 @@ struct Operon
return string ("2 ") + a [312] + a [318] + b [34];
return "2";
}
bool partial () const
{ return (getA () -> getRelCoverage () < 1.0 && ! getA () -> getExtended ())
|| (getB () -> getRelCoverage () < 1.0 && ! getB () -> getExtended ());
}
bool ambig () const
{ return getA () -> ambig ()
|| getB () -> ambig ();
}
public:
double getIdentity () const
{ return double (al1->nident + al2->nident) / double (al1->length/*refLen*/ + al2->length/*refLen*/); }
bool partial () const
{ return getA () -> partial ()
|| getB () -> partial ();
}
bool insideEq (const Operon &other) const
{ return al1->targetStrand == other. al1->targetStrand
&& al1->targetStart + slack >= other. al1->targetStart
Expand Down Expand Up @@ -723,11 +740,11 @@ struct Operon

void goodBlasts2operons (const VectorPtr<BlastAlignment> &goodBlastAls,
Vector<Operon> &operons,
bool sameType,
bool sameClass,
bool strong,
TsvOut &logTd)
{
IMPLY (sameType, strong);
IMPLY (sameClass, strong);

LOG ("\nGood blasts:");

Expand All @@ -744,7 +761,7 @@ void goodBlasts2operons (const VectorPtr<BlastAlignment> &goodBlastAls,
while ( start < i
&& ! ( goodBlastAls [start] -> targetName == alB->targetName
&& goodBlastAls [start] -> targetStrand == alB->targetStrand
&& ( ! sameType
&& ( ! sameClass
|| goodBlastAls [start] -> stxClass == alB->stxClass
)
)
Expand All @@ -758,7 +775,7 @@ void goodBlasts2operons (const VectorPtr<BlastAlignment> &goodBlastAls,
continue;
ASSERT (alA->targetName == alB->targetName);
ASSERT (alA->targetStrand == alB->targetStrand);
IMPLY (sameType, alA->stxClass == alB->stxClass);
IMPLY (sameClass, alA->stxClass == alB->stxClass);
ASSERT (alA->subunit <= alB->subunit);
if (alA->subunit == alB->subunit)
break;
Expand All @@ -777,6 +794,9 @@ void goodBlasts2operons (const VectorPtr<BlastAlignment> &goodBlastAls,
if ( ! strong
|| ( op. getIdentity () >= stxClass2identity [op. al1->stxClass]
&& op. getIdentity () >= stxClass2identity [op. al2->stxClass]
&& ! op. al1->truncated ()
&& ! op. al2->truncated ()
&& ! op. partial () // PD-5191
)
)
{
Expand Down Expand Up @@ -818,7 +838,7 @@ void goodBlasts2operons (const VectorPtr<BlastAlignment> &goodBlastAls,
struct ThisApplication final : ShellApplication
{
ThisApplication ()
: ShellApplication ("Determine stx type(s) of a genome, print .tsv-file", true, false, true, true)
: ShellApplication ("Determine stx type(s) of a genome, print .tsv-file", true, true/*threadsUsed*/, true, true)
{
addKey ("nucleotide", "Input nucleotide FASTA file (can be gzipped)", "", 'n', "NUC_FASTA");
//addKey ("translation_table", "NCBI genetic code for translated BLAST", "11", 't', "TRANSLATION_TABLE");
Expand Down Expand Up @@ -890,10 +910,10 @@ struct ThisApplication final : ShellApplication
exec (fullProg ("makeblastdb") + "-in " + dna_flat + " -dbtype nucl -out " + tmp + "/db -logfile " + tmp + "/db.log > /dev/null", tmp + "db.log");
findProg ("tblastn");
const string blast_fmt ("-outfmt '6 sseqid qseqid sstart send slen qstart qend qlen sseq qseq'");
exec (fullProg ("tblastn") + " -query " + execDir + "stx.prot -db " + tmp + "/db "
+ "-comp_based_stats 0 -evalue 1e-10 -seg no -max_target_seqs 10000 -word_size 5 -gapextend 2 -db_gencode " + to_string (gencode)
exec (fullProg ("tblastn") + " -query " + execDir + "stx.prot -db " + tmp + "/db"
+ " -comp_based_stats 0 -evalue 1e-10 -seg no -max_target_seqs 10000 -word_size 5 -gapextend 2 -db_gencode " + to_string (gencode)
+ " -mt_mode 1 -num_threads " + to_string (threads_max) // Reduces time by 30% for large DNA
//+ " -task tblastn-fast -threshold 100 -window_size 15" // from amrfinder.cpp: Reduces time by 9%
//+ " -num_threads 10 -mt_mode 1" // Reduces time by 30%
+ " " + blast_fmt + " -out " + blastOut + " > /dev/null 2> " + tmp + "/blast-err", tmp + "/blast-err");
}

Expand Down Expand Up @@ -982,7 +1002,7 @@ struct ThisApplication final : ShellApplication
LOG (f. line);
auto al = new BlastAlignment (f. line);
al->qc ();
blastAls << al;
blastAls << al;
}
}

Expand All @@ -1001,11 +1021,12 @@ struct ThisApplication final : ShellApplication
&& al->refAccession == prev->refAccession
&& al->targetStart > prev->targetStart
&& (int) al->targetStart - (int) prev->targetEnd < 10 // PAR
&& al->getFrame () != prev->getFrame ()
&& al->frame != prev->frame
)
{
{
var_cast (al) -> merge (*prev);
al->qc ();
//al->saveText (cout);
var_cast (prev) -> reported = true;
}
al->saveTsvOut (logTd, true);
Expand Down Expand Up @@ -1079,6 +1100,8 @@ struct ThisApplication final : ShellApplication
{
op. saveTsvOut (logTd, true);
op. qc ();
if (op. getIdentity () < identity_min)
continue;
bool found = false;
for (const Operon& goodOp : goodOperons)
if ( op. al1->targetName == goodOp. al1->targetName
Expand Down Expand Up @@ -1139,6 +1162,8 @@ struct ThisApplication final : ShellApplication
ASSERT (al1);
if (al1->reported)
continue;
if (al1->getIdentity () < identity_min)
continue;
Operon op (*al1);
goodOperons << std::move (op);
FFOR_START (size_t, j, i + 1, goodBlastAls. size ())
Expand Down
2 changes: 1 addition & 1 deletion version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.0.28
1.0.29

0 comments on commit 52fb71b

Please sign in to comment.