Merge pull request #394 from psj1997/main

strobealign-aemb module for metagenomic binning
ksahlin · Mar 4, 2024 · bd183f5 · bd183f5
2 parents 79cdd96 + 5a1ab9e
commit bd183f5
Show file tree

Hide file tree

Showing 11 changed files with 192 additions and 47 deletions.
diff --git a/README.md b/README.md
@@ -113,6 +113,11 @@ strobealign ref.fa reads.1.fastq.gz reads.2.fastq.gz | samtools sort -o sorted.b
 This is usually faster than doing the two steps separately because fewer
 intermediate files are created.
 
+To output the estimated abundance of every contig, use `--aemb`. Each output line has the format `contig_id<TAB>abundance_value`:
+```
+strobealign ref.fa reads.fq --aemb > abundance.txt                # Single-end reads
+strobealign ref.fa reads1.fq reads2.fq --aemb > abundance.txt     # Paired-end reads
+```
 
 ## Command-line options
 
@@ -127,6 +132,7 @@ options. Some important ones are:
 * `--eqx`: Emit `=` and `X` CIGAR operations instead of `M`.
 * `-x`: Only map reads, do not do no base-level alignment. This switches the
   output format from SAM to [PAF](https://github.com/lh3/miniasm/blob/master/PAF.md).
+* `--aemb`: Output the estimated abundance of every contig. Each output line has the format `contig_id<TAB>abundance_value`.
 * `--rg-id=ID`: Add RG tag to each SAM record.
 * `--rg=TAG:VALUE`: Add read group metadata to the SAM header. This can be
   specified multiple times. Example: `--rg-id=1 --rg=SM:mysamle --rg=LB:mylibrary`.

diff --git a/src/aln.cpp b/src/aln.cpp
@@ -862,13 +862,17 @@ std::vector<ScoredAlignmentPair> align_paired(
     return high_scores;
 }
 
-// Only used for PAF output
+// Used for PAF and abundances output
 inline void get_best_map_location(
     std::vector<Nam> &nams1,
     std::vector<Nam> &nams2,
     InsertSizeDistribution &isize_est,
     Nam &best_nam1,
-    Nam &best_nam2
+    Nam &best_nam2,
+    int read1_len,
+    int read2_len,
+    std::vector<double> &abundances,
+    bool output_abundance
 ) {
     std::vector<NamPair> nam_pairs = get_best_scoring_nam_pairs(nams1, nams2, isize_est.mu, isize_est.sigma);
     best_nam1.ref_start = -1; //Unmapped until proven mapped
@@ -903,6 +907,52 @@ inline void get_best_map_location(
     if (score_joint > score_indiv) { // joint score is better than individual
         best_nam1 = n1_joint_max;
         best_nam2 = n2_joint_max;
+
+        if (output_abundance){
+            // we loop twice because we need to count the number of best pairs
+            size_t n_best = 0;
+            for (auto &[score, n1, n2] : nam_pairs){
+                if ((n1.score + n2.score) == score_joint){
+                    ++n_best;
+                } else {
+                    break;
+                }
+            }
+            for (auto &[score, n1, n2] : nam_pairs){
+                if ((n1.score + n2.score) == score_joint){
+                    if (n1.ref_start >= 0) {
+                        abundances[n1.ref_id] += float(read1_len) / float(n_best);
+                    }
+                    if (n2.ref_start >= 0) {
+                        abundances[n2.ref_id] += float(read2_len) / float(n_best);
+                    }
+                } else {
+                    break;
+                }
+            }
+        }
+    } else if (output_abundance) {
+        for (auto &[nams, read_len]: {  std::make_pair(std::cref(nams1), read1_len),
+                                        std::make_pair(std::cref(nams2), read2_len) }) {
+            size_t best_score = 0;
+            // We loop twice because we need to count the number of NAMs with best score
+            for (auto &nam : nams) {
+                if (nam.score == nams[0].score){
+                    ++best_score;
+                } else {
+                    break;
+                }
+            }
+            for (auto &nam: nams) {
+                if (nam.ref_start < 0) {
+                    continue;
+                }
+                if (nam.score != nams[0].score){
+                    break;
+                }
+                abundances[nam.ref_id] += float(read_len) / float(best_score);
+            }
+        }
     }
 
     if (isize_est.sample_size < 400 && score_joint > score_indiv) {
@@ -957,7 +1007,8 @@ void align_or_map_paired(
     const IndexParameters& index_parameters,
     const References& references,
     const StrobemerIndex& index,
-    std::minstd_rand& random_engine
+    std::minstd_rand& random_engine,
+    std::vector<double> &abundances
 ) {
     std::array<Details, 2> details;
     std::array<std::vector<Nam>, 2> nams_pair;
@@ -991,18 +1042,24 @@ void align_or_map_paired(
     }
 
     Timer extend_timer;
-    if (!map_param.is_sam_out) {
+    if (map_param.output_format != OutputFormat::SAM) { // PAF or abundance
         Nam nam_read1;
         Nam nam_read2;
-        get_best_map_location(nams_pair[0], nams_pair[1], isize_est,
-                              nam_read1,
-                              nam_read2);
-        output_hits_paf_PE(outstring, nam_read1, record1.name,
-                           references,
-                           record1.seq.length());
-        output_hits_paf_PE(outstring, nam_read2, record2.name,
-                           references,
-                           record2.seq.length());
+        get_best_map_location(
+                nams_pair[0], nams_pair[1],
+                isize_est,
+                nam_read1, nam_read2,
+                record1.seq.length(), record2.seq.length(),
+                abundances,
+                map_param.output_format == OutputFormat::Abundance);
+        if (map_param.output_format == OutputFormat::PAF) {
+            output_hits_paf_PE(outstring, nam_read1, record1.name,
+                            references,
+                            record1.seq.length());
+            output_hits_paf_PE(outstring, nam_read2, record2.name,
+                            references,
+                            record2.seq.length());
+        }
     } else {
         Read read1(record1.seq);
         Read read2(record2.seq);
@@ -1082,7 +1139,8 @@ void align_or_map_single(
     const IndexParameters& index_parameters,
     const References& references,
     const StrobemerIndex& index,
-    std::minstd_rand& random_engine
+    std::minstd_rand& random_engine,
+    std::vector<double> &abundances
 ) {
     Details details;
     Timer strobe_timer;
@@ -1111,15 +1169,41 @@ void align_or_map_single(
 
 
     Timer extend_timer;
-    if (!map_param.is_sam_out) {
-        output_hits_paf(outstring, nams, record.name, references,
-                        record.seq.length());
-    } else {
-        align_single(
-            aligner, sam, nams, record, index_parameters.syncmer.k,
-            references, details, map_param.dropoff_threshold, map_param.max_tries,
-            map_param.max_secondary, random_engine
-        );
+    size_t n_best = 0;
+    switch (map_param.output_format) {
+        case OutputFormat::Abundance: {
+            if (!nams.empty()){
+                for (auto &t : nams){
+                    if (t.score == nams[0].score){
+                        ++n_best;
+                    }else{
+                        break;
+                    }
+                }
+
+                for (auto &nam: nams) {
+                    if (nam.ref_start < 0) {
+                        continue;
+                    }
+                    if (nam.score != nams[0].score){
+                        break;
+                    }
+                    abundances[nam.ref_id] += float(record.seq.length()) / float(n_best);
+                }
+            }
+        }
+        break;
+        case OutputFormat::PAF:
+            output_hits_paf(outstring, nams, record.name, references,
+                            record.seq.length());
+            break;
+        case OutputFormat::SAM:
+            align_single(
+                aligner, sam, nams, record, index_parameters.syncmer.k,
+                references, details, map_param.dropoff_threshold, map_param.max_tries,
+                map_param.max_secondary, random_engine
+            );
+            break;
     }
     statistics.tot_extend += extend_timer.duration();
     statistics += details;

diff --git a/src/aln.hpp b/src/aln.hpp
@@ -56,14 +56,20 @@ struct AlignmentStatistics {
     }
 };
 
+enum class OutputFormat {
+    SAM,        // default; reads go through align_single() / the paired alignment path
+    PAF,        // mapping only; hits are written with output_hits_paf()/output_hits_paf_PE()
+    Abundance   // selected by --aemb; best-scoring NAMs are accumulated into `abundances`
+};
+
 struct MappingParameters {
     int r { 150 };
     int max_secondary { 0 };
     float dropoff_threshold { 0.5 };
     int rescue_level { 2 };
     int max_tries { 20 };
     int rescue_cutoff;
-    bool is_sam_out { true };
+    OutputFormat output_format {OutputFormat::SAM};
     CigarOps cigar_ops{CigarOps::M};
     bool output_unmapped { true };
     bool details{false};
@@ -88,7 +94,8 @@ void align_or_map_paired(
     const IndexParameters& index_parameters,
     const References& references,
     const StrobemerIndex& index,
-    std::minstd_rand& random_engine
+    std::minstd_rand& random_engine,
+    std::vector<double> &abundances
 );
 
 void align_or_map_single(
@@ -101,7 +108,8 @@ void align_or_map_single(
     const IndexParameters& index_parameters,
     const References& references,
     const StrobemerIndex& index,
-    std::minstd_rand& random_engine
+    std::minstd_rand& random_engine,
+    std::vector<double> &abundances
 );
 
 // Private declarations, only needed for tests

diff --git a/src/cmdline.cpp b/src/cmdline.cpp
@@ -27,6 +27,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
     args::Flag v(parser, "v", "Verbose output", {'v'});
     args::Flag no_progress(parser, "no-progress", "Disable progress report (enabled by default if output is a terminal)", {"no-progress"});
     args::Flag x(parser, "x", "Only map reads, no base level alignment (produces PAF file)", {'x'});
+    args::Flag aemb(parser, "aemb", "Output the estimated abundance value of contigs, the format of output file is: contig_id \t abundance_value", {"aemb"});
     args::Flag interleaved(parser, "interleaved", "Interleaved input", {"interleaved"});
     args::ValueFlag<std::string> index_statistics(parser, "PATH", "Print statistics of indexing to PATH", {"index-statistics"});
     args::Flag i(parser, "index", "Do not map reads; only generate the strobemer index and write it to disk. If read files are provided, they are used to estimate read length", {"create-index", 'i'});
@@ -97,6 +98,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
     if (index_statistics) { opt.logfile_name = args::get(index_statistics); }
     if (i) { opt.only_gen_index = true; }
     if (use_index) { opt.use_index = true; }
+    if (aemb) {opt.is_abundance_out = true; }
 
     // SAM output
     if (eqx) { opt.cigar_eqx = true; }

diff --git a/src/cmdline.hpp b/src/cmdline.hpp
@@ -18,6 +18,7 @@ struct CommandLineOptions {
     bool only_gen_index { false };
     bool use_index { false };
     bool is_sam_out { true };
+    bool is_abundance_out {false};
 
     // SAM output
     bool cigar_eqx { false };

diff --git a/src/main.cpp b/src/main.cpp
@@ -105,6 +105,12 @@ InputBuffer get_input_buffer(const CommandLineOptions& opt) {
     }
 }
 
+void output_abundance(const std::vector<double>& abundances, const References& references){  // one "name<TAB>abundance/length" line per reference, on stdout
+        for (size_t i = 0; i < references.size(); ++i) {
+            std::cout << references.names[i] << '\t' << std::fixed << std::setprecision(6) << abundances[i] / double(references.sequences[i].size()) << '\n';  // '\n', not std::endl: avoid a flush per contig
+        }
+}
+
 void show_progress_until_done(std::vector<int>& worker_done, std::vector<AlignmentStatistics>& stats) {
     Timer timer;
     bool reported = false;
@@ -155,6 +161,11 @@ int run_strobealign(int argc, char **argv) {
     if (opt.c >= 64 || opt.c <= 0) {
         throw BadParameter("c must be greater than 0 and less than 64");
     }
+
+    if (!opt.is_sam_out && opt.is_abundance_out){
+        throw BadParameter("Can not use -x and --aemb at the same time");
+    }
+
     InputBuffer input_buffer = get_input_buffer(opt);
     if (!opt.r_set && !opt.reads_filename1.empty()) {
         opt.r = estimate_read_length(input_buffer);
@@ -184,7 +195,10 @@ int run_strobealign(int argc, char **argv) {
     map_param.dropoff_threshold = opt.dropoff_threshold;
     map_param.rescue_level = opt.rescue_level;
     map_param.max_tries = opt.max_tries;
-    map_param.is_sam_out = opt.is_sam_out;
+    map_param.output_format = (
+            opt.is_abundance_out ? OutputFormat::Abundance :
+            opt.is_sam_out ? OutputFormat::SAM :
+                OutputFormat::PAF);
     map_param.cigar_ops = opt.cigar_eqx ? CigarOps::EQX : CigarOps::M;
     map_param.output_unmapped = opt.output_unmapped;
     map_param.details = opt.details;
@@ -288,32 +302,31 @@ int run_strobealign(int argc, char **argv) {
     }
 
     std::ostream out(buf);
-
-    if (map_param.is_sam_out) {
-        std::stringstream cmd_line;
-        for(int i = 0; i < argc; ++i) {
-            cmd_line << argv[i] << " ";
-        }
-
-        out << sam_header(references, opt.read_group_id, opt.read_group_fields);
-        if (opt.pg_header) {
-            out << pg_header(cmd_line.str());
-        }
+
+    if (map_param.output_format == OutputFormat::SAM) {
+            std::stringstream cmd_line;
+            for(int i = 0; i < argc; ++i) {
+                cmd_line << argv[i] << " ";
+            }
+            out << sam_header(references, opt.read_group_id, opt.read_group_fields);
+            if (opt.pg_header) {
+                out << pg_header(cmd_line.str());
+            }
     }
 
     std::vector<AlignmentStatistics> log_stats_vec(opt.n_threads);
-
+    
     logger.info() << "Running in " << (opt.is_SE ? "single-end" : "paired-end") << " mode" << std::endl;
 
     OutputBuffer output_buffer(out);
-
     std::vector<std::thread> workers;
     std::vector<int> worker_done(opt.n_threads);  // each thread sets its entry to 1 when it’s done
+    std::vector<std::vector<double>> worker_abundances(opt.n_threads, std::vector<double>(references.size(), 0));
     for (int i = 0; i < opt.n_threads; ++i) {
         std::thread consumer(perform_task, std::ref(input_buffer), std::ref(output_buffer),
             std::ref(log_stats_vec[i]), std::ref(worker_done[i]), std::ref(aln_params),
             std::ref(map_param), std::ref(index_parameters), std::ref(references),
-            std::ref(index), std::ref(opt.read_group_id));
+            std::ref(index), std::ref(opt.read_group_id), std::ref(worker_abundances[i]));
         workers.push_back(std::move(consumer));
     }
     if (opt.show_progress && isatty(2)) {
@@ -329,6 +342,19 @@ int run_strobealign(int argc, char **argv) {
         tot_statistics += it;
     }
 
+    if (map_param.output_format == OutputFormat::Abundance) {
+        std::vector<double> abundances(references.size(), 0);
+        for (size_t i = 0; i < worker_abundances.size(); ++i) {
+            for (size_t j = 0; j < worker_abundances[i].size(); ++j) {
+                abundances[j] += worker_abundances[i][j];
+            }
+        }
+
+        // output the abundance file
+        output_abundance(abundances, references);
+    }
+
+
     logger.info() << "Total mapping sites tried: " << tot_statistics.tot_all_tried << std::endl
         << "Total calls to ssw: " << tot_statistics.tot_aligner_calls << std::endl
         << "Inconsistent NAM ends: " << tot_statistics.inconsistent_nams << std::endl