Skip to content

Commit

Permalink
Merge pull request #394 from psj1997/main
Browse files Browse the repository at this point in the history
strobealign-aemb module for metagenomic binning
  • Loading branch information
marcelm authored Mar 4, 2024
2 parents 79cdd96 + 5a1ab9e commit bd183f5
Show file tree
Hide file tree
Showing 11 changed files with 192 additions and 47 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,11 @@ strobealign ref.fa reads.1.fastq.gz reads.2.fastq.gz | samtools sort -o sorted.b
This is usually faster than doing the two steps separately because fewer
intermediate files are created.

To output the estimated abundance of every contig, the format of output file is: contig_id \t abundance_value:
```
strobealign ref.fa reads.fq --aemb > abundance.txt # Single-end reads
strobealign ref.fa reads1.fq reads2.fq --aemb > abundance.txt # Paired-end reads
```

## Command-line options

Expand All @@ -127,6 +132,7 @@ options. Some important ones are:
* `--eqx`: Emit `=` and `X` CIGAR operations instead of `M`.
* `-x`: Only map reads, do not do no base-level alignment. This switches the
output format from SAM to [PAF](https://github.com/lh3/miniasm/blob/master/PAF.md).
* `--aemb`: Output the estimated abundance value of every contig, the format of output file is: contig_id \t abundance_value.
* `--rg-id=ID`: Add RG tag to each SAM record.
* `--rg=TAG:VALUE`: Add read group metadata to the SAM header. This can be
specified multiple times. Example: `--rg-id=1 --rg=SM:mysamle --rg=LB:mylibrary`.
Expand Down
130 changes: 107 additions & 23 deletions src/aln.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -862,13 +862,17 @@ std::vector<ScoredAlignmentPair> align_paired(
return high_scores;
}

// Only used for PAF output
// Used for PAF and abundances output
inline void get_best_map_location(
std::vector<Nam> &nams1,
std::vector<Nam> &nams2,
InsertSizeDistribution &isize_est,
Nam &best_nam1,
Nam &best_nam2
Nam &best_nam2,
int read1_len,
int read2_len,
std::vector<double> &abundances,
bool output_abundance
) {
std::vector<NamPair> nam_pairs = get_best_scoring_nam_pairs(nams1, nams2, isize_est.mu, isize_est.sigma);
best_nam1.ref_start = -1; //Unmapped until proven mapped
Expand Down Expand Up @@ -903,6 +907,52 @@ inline void get_best_map_location(
if (score_joint > score_indiv) { // joint score is better than individual
best_nam1 = n1_joint_max;
best_nam2 = n2_joint_max;

if (output_abundance){
// we loop twice because we need to count the number of best pairs
size_t n_best = 0;
for (auto &[score, n1, n2] : nam_pairs){
if ((n1.score + n2.score) == score_joint){
++n_best;
} else {
break;
}
}
for (auto &[score, n1, n2] : nam_pairs){
if ((n1.score + n2.score) == score_joint){
if (n1.ref_start >= 0) {
abundances[n1.ref_id] += float(read1_len) / float(n_best);
}
if (n2.ref_start >= 0) {
abundances[n2.ref_id] += float(read2_len) / float(n_best);
}
} else {
break;
}
}
}
} else if (output_abundance) {
for (auto &[nams, read_len]: { std::make_pair(std::cref(nams1), read1_len),
std::make_pair(std::cref(nams2), read2_len) }) {
size_t best_score = 0;
// We loop twice because we need to count the number of NAMs with best score
for (auto &nam : nams) {
if (nam.score == nams[0].score){
++best_score;
} else {
break;
}
}
for (auto &nam: nams) {
if (nam.ref_start < 0) {
continue;
}
if (nam.score != nams[0].score){
break;
}
abundances[nam.ref_id] += float(read_len) / float(best_score);
}
}
}

if (isize_est.sample_size < 400 && score_joint > score_indiv) {
Expand Down Expand Up @@ -957,7 +1007,8 @@ void align_or_map_paired(
const IndexParameters& index_parameters,
const References& references,
const StrobemerIndex& index,
std::minstd_rand& random_engine
std::minstd_rand& random_engine,
std::vector<double> &abundances
) {
std::array<Details, 2> details;
std::array<std::vector<Nam>, 2> nams_pair;
Expand Down Expand Up @@ -991,18 +1042,24 @@ void align_or_map_paired(
}

Timer extend_timer;
if (!map_param.is_sam_out) {
if (map_param.output_format != OutputFormat::SAM) { // PAF or abundance
Nam nam_read1;
Nam nam_read2;
get_best_map_location(nams_pair[0], nams_pair[1], isize_est,
nam_read1,
nam_read2);
output_hits_paf_PE(outstring, nam_read1, record1.name,
references,
record1.seq.length());
output_hits_paf_PE(outstring, nam_read2, record2.name,
references,
record2.seq.length());
get_best_map_location(
nams_pair[0], nams_pair[1],
isize_est,
nam_read1, nam_read2,
record1.seq.length(), record2.seq.length(),
abundances,
map_param.output_format == OutputFormat::Abundance);
if (map_param.output_format == OutputFormat::PAF) {
output_hits_paf_PE(outstring, nam_read1, record1.name,
references,
record1.seq.length());
output_hits_paf_PE(outstring, nam_read2, record2.name,
references,
record2.seq.length());
}
} else {
Read read1(record1.seq);
Read read2(record2.seq);
Expand Down Expand Up @@ -1082,7 +1139,8 @@ void align_or_map_single(
const IndexParameters& index_parameters,
const References& references,
const StrobemerIndex& index,
std::minstd_rand& random_engine
std::minstd_rand& random_engine,
std::vector<double> &abundances
) {
Details details;
Timer strobe_timer;
Expand Down Expand Up @@ -1111,15 +1169,41 @@ void align_or_map_single(


Timer extend_timer;
if (!map_param.is_sam_out) {
output_hits_paf(outstring, nams, record.name, references,
record.seq.length());
} else {
align_single(
aligner, sam, nams, record, index_parameters.syncmer.k,
references, details, map_param.dropoff_threshold, map_param.max_tries,
map_param.max_secondary, random_engine
);
size_t n_best = 0;
switch (map_param.output_format) {
case OutputFormat::Abundance: {
if (!nams.empty()){
for (auto &t : nams){
if (t.score == nams[0].score){
++n_best;
}else{
break;
}
}

for (auto &nam: nams) {
if (nam.ref_start < 0) {
continue;
}
if (nam.score != nams[0].score){
break;
}
abundances[nam.ref_id] += float(record.seq.length()) / float(n_best);
}
}
}
break;
case OutputFormat::PAF:
output_hits_paf(outstring, nams, record.name, references,
record.seq.length());
break;
case OutputFormat::SAM:
align_single(
aligner, sam, nams, record, index_parameters.syncmer.k,
references, details, map_param.dropoff_threshold, map_param.max_tries,
map_param.max_secondary, random_engine
);
break;
}
statistics.tot_extend += extend_timer.duration();
statistics += details;
Expand Down
14 changes: 11 additions & 3 deletions src/aln.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,20 @@ struct AlignmentStatistics {
}
};

enum class OutputFormat {
SAM,
PAF,
Abundance
};

struct MappingParameters {
int r { 150 };
int max_secondary { 0 };
float dropoff_threshold { 0.5 };
int rescue_level { 2 };
int max_tries { 20 };
int rescue_cutoff;
bool is_sam_out { true };
OutputFormat output_format {OutputFormat::SAM};
CigarOps cigar_ops{CigarOps::M};
bool output_unmapped { true };
bool details{false};
Expand All @@ -88,7 +94,8 @@ void align_or_map_paired(
const IndexParameters& index_parameters,
const References& references,
const StrobemerIndex& index,
std::minstd_rand& random_engine
std::minstd_rand& random_engine,
std::vector<double> &abundances
);

void align_or_map_single(
Expand All @@ -101,7 +108,8 @@ void align_or_map_single(
const IndexParameters& index_parameters,
const References& references,
const StrobemerIndex& index,
std::minstd_rand& random_engine
std::minstd_rand& random_engine,
std::vector<double> &abundances
);

// Private declarations, only needed for tests
Expand Down
2 changes: 2 additions & 0 deletions src/cmdline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
args::Flag v(parser, "v", "Verbose output", {'v'});
args::Flag no_progress(parser, "no-progress", "Disable progress report (enabled by default if output is a terminal)", {"no-progress"});
args::Flag x(parser, "x", "Only map reads, no base level alignment (produces PAF file)", {'x'});
args::Flag aemb(parser, "aemb", "Output the estimated abundance value of contigs, the format of output file is: contig_id \t abundance_value", {"aemb"});
args::Flag interleaved(parser, "interleaved", "Interleaved input", {"interleaved"});
args::ValueFlag<std::string> index_statistics(parser, "PATH", "Print statistics of indexing to PATH", {"index-statistics"});
args::Flag i(parser, "index", "Do not map reads; only generate the strobemer index and write it to disk. If read files are provided, they are used to estimate read length", {"create-index", 'i'});
Expand Down Expand Up @@ -97,6 +98,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
if (index_statistics) { opt.logfile_name = args::get(index_statistics); }
if (i) { opt.only_gen_index = true; }
if (use_index) { opt.use_index = true; }
if (aemb) {opt.is_abundance_out = true; }

// SAM output
if (eqx) { opt.cigar_eqx = true; }
Expand Down
1 change: 1 addition & 0 deletions src/cmdline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ struct CommandLineOptions {
bool only_gen_index { false };
bool use_index { false };
bool is_sam_out { true };
bool is_abundance_out {false};

// SAM output
bool cigar_eqx { false };
Expand Down
56 changes: 41 additions & 15 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,12 @@ InputBuffer get_input_buffer(const CommandLineOptions& opt) {
}
}

void output_abundance(const std::vector<double>& abundances, const References& references){
for (size_t i = 0; i < references.size(); ++i) {
std::cout << references.names[i] << '\t' << std::fixed << std::setprecision(6) << abundances[i] / double(references.sequences[i].size()) << std::endl;
}
}

void show_progress_until_done(std::vector<int>& worker_done, std::vector<AlignmentStatistics>& stats) {
Timer timer;
bool reported = false;
Expand Down Expand Up @@ -155,6 +161,11 @@ int run_strobealign(int argc, char **argv) {
if (opt.c >= 64 || opt.c <= 0) {
throw BadParameter("c must be greater than 0 and less than 64");
}

if (!opt.is_sam_out && opt.is_abundance_out){
throw BadParameter("Can not use -x and --aemb at the same time");
}

InputBuffer input_buffer = get_input_buffer(opt);
if (!opt.r_set && !opt.reads_filename1.empty()) {
opt.r = estimate_read_length(input_buffer);
Expand Down Expand Up @@ -184,7 +195,10 @@ int run_strobealign(int argc, char **argv) {
map_param.dropoff_threshold = opt.dropoff_threshold;
map_param.rescue_level = opt.rescue_level;
map_param.max_tries = opt.max_tries;
map_param.is_sam_out = opt.is_sam_out;
map_param.output_format = (
opt.is_abundance_out ? OutputFormat::Abundance :
opt.is_sam_out ? OutputFormat::SAM :
OutputFormat::PAF);
map_param.cigar_ops = opt.cigar_eqx ? CigarOps::EQX : CigarOps::M;
map_param.output_unmapped = opt.output_unmapped;
map_param.details = opt.details;
Expand Down Expand Up @@ -288,32 +302,31 @@ int run_strobealign(int argc, char **argv) {
}

std::ostream out(buf);

if (map_param.is_sam_out) {
std::stringstream cmd_line;
for(int i = 0; i < argc; ++i) {
cmd_line << argv[i] << " ";
}

out << sam_header(references, opt.read_group_id, opt.read_group_fields);
if (opt.pg_header) {
out << pg_header(cmd_line.str());
}

if (map_param.output_format == OutputFormat::SAM) {
std::stringstream cmd_line;
for(int i = 0; i < argc; ++i) {
cmd_line << argv[i] << " ";
}
out << sam_header(references, opt.read_group_id, opt.read_group_fields);
if (opt.pg_header) {
out << pg_header(cmd_line.str());
}
}

std::vector<AlignmentStatistics> log_stats_vec(opt.n_threads);

logger.info() << "Running in " << (opt.is_SE ? "single-end" : "paired-end") << " mode" << std::endl;

OutputBuffer output_buffer(out);

std::vector<std::thread> workers;
std::vector<int> worker_done(opt.n_threads); // each thread sets its entry to 1 when it’s done
std::vector<std::vector<double>> worker_abundances(opt.n_threads, std::vector<double>(references.size(), 0));
for (int i = 0; i < opt.n_threads; ++i) {
std::thread consumer(perform_task, std::ref(input_buffer), std::ref(output_buffer),
std::ref(log_stats_vec[i]), std::ref(worker_done[i]), std::ref(aln_params),
std::ref(map_param), std::ref(index_parameters), std::ref(references),
std::ref(index), std::ref(opt.read_group_id));
std::ref(index), std::ref(opt.read_group_id), std::ref(worker_abundances[i]));
workers.push_back(std::move(consumer));
}
if (opt.show_progress && isatty(2)) {
Expand All @@ -329,6 +342,19 @@ int run_strobealign(int argc, char **argv) {
tot_statistics += it;
}

if (map_param.output_format == OutputFormat::Abundance) {
std::vector<double> abundances(references.size(), 0);
for (size_t i = 0; i < worker_abundances.size(); ++i) {
for (size_t j = 0; j < worker_abundances[i].size(); ++j) {
abundances[j] += worker_abundances[i][j];
}
}

// output the abundance file
output_abundance(abundances, references);
}


logger.info() << "Total mapping sites tried: " << tot_statistics.tot_all_tried << std::endl
<< "Total calls to ssw: " << tot_statistics.tot_aligner_calls << std::endl
<< "Inconsistent NAM ends: " << tot_statistics.inconsistent_nams << std::endl
Expand Down
Loading

0 comments on commit bd183f5

Please sign in to comment.