Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

strobealign-aemb module for metagenomic binning #394

Merged
merged 29 commits into from
Mar 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
a4a22c4
Update aemb module
psj1997 Feb 12, 2024
f113da2
Code improve; documentation update; testing update
psj1997 Feb 20, 2024
3ced054
Merge branch 'ksahlin:main' into main
psj1997 Feb 20, 2024
c09d367
Update single-end abundance testing file
psj1997 Feb 20, 2024
0719ec5
Update paired-end abundance testing file
psj1997 Feb 20, 2024
7418bb6
Vendor zstr instead of fetching it from Git at build time
marcelm Jan 22, 2024
e9c453c
Use poolstl to sort randstrobes in parallel
marcelm Dec 7, 2023
c689d89
Bump poolSTL and use poolstl::pluggable_sort
marcelm Jan 14, 2024
bc96aa7
Bump to poolSTL 0.3.4
marcelm Jan 22, 2024
1c63767
Update baseline commit
marcelm Jan 22, 2024
6e07fcc
Bump poolSTL to 0.3.5
marcelm Jan 31, 2024
3540334
Update baseline commit
marcelm Jan 31, 2024
af350a6
Ensure sorting of randstrobes is reproducible
marcelm Feb 14, 2024
f1de1e2
Update baseline commit
marcelm Feb 14, 2024
dad60c8
Introduce canonical read length 75
marcelm Feb 16, 2024
3eebfe9
Explicit error if too many sequences are used
luispedro Feb 9, 2024
9a79d89
Update aemb module
psj1997 Feb 12, 2024
358a5b2
Code improve; documentation update; testing update
psj1997 Feb 20, 2024
cec26e8
Update single-end abundance testing file
psj1997 Feb 20, 2024
7be8e2c
Update paired-end abundance testing file
psj1997 Feb 20, 2024
47770ee
RFCT Simplify code by merging duplications
luispedro Feb 21, 2024
677edcc
RFCT Simplify output_format handling
luispedro Feb 21, 2024
ff7222a
RFCT Merge duplicated code
luispedro Feb 21, 2024
0948cd1
Merge branch 'main' into rfct_aemb
psj1997 Feb 21, 2024
782fed8
Merge pull request #3 from luispedro/rfct_aemb
psj1997 Feb 21, 2024
372b868
BUG fix of RFCT aemb
psj1997 Feb 21, 2024
383515d
Remove unused vector in single-end mode
psj1997 Feb 21, 2024
8c32350
Minor fix
psj1997 Feb 26, 2024
5a1ab9e
Update src/main.cpp
psj1997 Feb 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,11 @@ strobealign ref.fa reads.1.fastq.gz reads.2.fastq.gz | samtools sort -o sorted.b
This is usually faster than doing the two steps separately because fewer
intermediate files are created.

To output the estimated abundance of every contig, the format of output file is: contig_id \t abundance_value:
```
strobealign ref.fa reads.fq --aemb > abundance.txt # Single-end reads
strobealign ref.fa reads1.fq reads2.fq --aemb > abundance.txt # Paired-end reads
```

## Command-line options

Expand All @@ -127,6 +132,7 @@ options. Some important ones are:
* `--eqx`: Emit `=` and `X` CIGAR operations instead of `M`.
* `-x`: Only map reads, do not do no base-level alignment. This switches the
output format from SAM to [PAF](https://github.com/lh3/miniasm/blob/master/PAF.md).
* `--aemb`: Output the estimated abundance value of every contig, the format of output file is: contig_id \t abundance_value.
* `--rg-id=ID`: Add RG tag to each SAM record.
* `--rg=TAG:VALUE`: Add read group metadata to the SAM header. This can be
specified multiple times. Example: `--rg-id=1 --rg=SM:mysamle --rg=LB:mylibrary`.
Expand Down
130 changes: 107 additions & 23 deletions src/aln.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -862,13 +862,17 @@ std::vector<ScoredAlignmentPair> align_paired(
return high_scores;
}

// Only used for PAF output
// Used for PAF and abundances output
inline void get_best_map_location(
std::vector<Nam> &nams1,
std::vector<Nam> &nams2,
InsertSizeDistribution &isize_est,
Nam &best_nam1,
Nam &best_nam2
Nam &best_nam2,
int read1_len,
int read2_len,
std::vector<double> &abundances,
bool output_abundance
) {
std::vector<NamPair> nam_pairs = get_best_scoring_nam_pairs(nams1, nams2, isize_est.mu, isize_est.sigma);
best_nam1.ref_start = -1; //Unmapped until proven mapped
Expand Down Expand Up @@ -903,6 +907,52 @@ inline void get_best_map_location(
if (score_joint > score_indiv) { // joint score is better than individual
best_nam1 = n1_joint_max;
best_nam2 = n2_joint_max;

if (output_abundance){
// we loop twice because we need to count the number of best pairs
size_t n_best = 0;
for (auto &[score, n1, n2] : nam_pairs){
if ((n1.score + n2.score) == score_joint){
++n_best;
} else {
break;
}
}
for (auto &[score, n1, n2] : nam_pairs){
if ((n1.score + n2.score) == score_joint){
if (n1.ref_start >= 0) {
abundances[n1.ref_id] += float(read1_len) / float(n_best);
}
if (n2.ref_start >= 0) {
abundances[n2.ref_id] += float(read2_len) / float(n_best);
}
} else {
break;
}
}
}
} else if (output_abundance) {
for (auto &[nams, read_len]: { std::make_pair(std::cref(nams1), read1_len),
std::make_pair(std::cref(nams2), read2_len) }) {
size_t best_score = 0;
// We loop twice because we need to count the number of NAMs with best score
for (auto &nam : nams) {
if (nam.score == nams[0].score){
++best_score;
} else {
break;
}
}
for (auto &nam: nams) {
if (nam.ref_start < 0) {
continue;
}
if (nam.score != nams[0].score){
break;
}
abundances[nam.ref_id] += float(read_len) / float(best_score);
}
}
}

if (isize_est.sample_size < 400 && score_joint > score_indiv) {
Expand Down Expand Up @@ -957,7 +1007,8 @@ void align_or_map_paired(
const IndexParameters& index_parameters,
const References& references,
const StrobemerIndex& index,
std::minstd_rand& random_engine
std::minstd_rand& random_engine,
std::vector<double> &abundances
) {
std::array<Details, 2> details;
std::array<std::vector<Nam>, 2> nams_pair;
Expand Down Expand Up @@ -991,18 +1042,24 @@ void align_or_map_paired(
}

Timer extend_timer;
if (!map_param.is_sam_out) {
if (map_param.output_format != OutputFormat::SAM) { // PAF or abundance
Nam nam_read1;
Nam nam_read2;
get_best_map_location(nams_pair[0], nams_pair[1], isize_est,
nam_read1,
nam_read2);
output_hits_paf_PE(outstring, nam_read1, record1.name,
references,
record1.seq.length());
output_hits_paf_PE(outstring, nam_read2, record2.name,
references,
record2.seq.length());
get_best_map_location(
nams_pair[0], nams_pair[1],
isize_est,
nam_read1, nam_read2,
record1.seq.length(), record2.seq.length(),
abundances,
map_param.output_format == OutputFormat::Abundance);
if (map_param.output_format == OutputFormat::PAF) {
output_hits_paf_PE(outstring, nam_read1, record1.name,
references,
record1.seq.length());
output_hits_paf_PE(outstring, nam_read2, record2.name,
references,
record2.seq.length());
}
} else {
Read read1(record1.seq);
Read read2(record2.seq);
Expand Down Expand Up @@ -1082,7 +1139,8 @@ void align_or_map_single(
const IndexParameters& index_parameters,
const References& references,
const StrobemerIndex& index,
std::minstd_rand& random_engine
std::minstd_rand& random_engine,
std::vector<double> &abundances
) {
Details details;
Timer strobe_timer;
Expand Down Expand Up @@ -1111,15 +1169,41 @@ void align_or_map_single(


Timer extend_timer;
if (!map_param.is_sam_out) {
output_hits_paf(outstring, nams, record.name, references,
record.seq.length());
} else {
align_single(
aligner, sam, nams, record, index_parameters.syncmer.k,
references, details, map_param.dropoff_threshold, map_param.max_tries,
map_param.max_secondary, random_engine
);
size_t n_best = 0;
switch (map_param.output_format) {
case OutputFormat::Abundance: {
if (!nams.empty()){
for (auto &t : nams){
if (t.score == nams[0].score){
++n_best;
}else{
break;
}
}

for (auto &nam: nams) {
if (nam.ref_start < 0) {
continue;
}
if (nam.score != nams[0].score){
break;
}
abundances[nam.ref_id] += float(record.seq.length()) / float(n_best);
}
}
}
break;
case OutputFormat::PAF:
output_hits_paf(outstring, nams, record.name, references,
record.seq.length());
break;
case OutputFormat::SAM:
align_single(
aligner, sam, nams, record, index_parameters.syncmer.k,
references, details, map_param.dropoff_threshold, map_param.max_tries,
map_param.max_secondary, random_engine
);
break;
}
statistics.tot_extend += extend_timer.duration();
statistics += details;
Expand Down
14 changes: 11 additions & 3 deletions src/aln.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,20 @@ struct AlignmentStatistics {
}
};

enum class OutputFormat {
SAM,
PAF,
Abundance
};

struct MappingParameters {
int r { 150 };
int max_secondary { 0 };
float dropoff_threshold { 0.5 };
int rescue_level { 2 };
int max_tries { 20 };
int rescue_cutoff;
bool is_sam_out { true };
OutputFormat output_format {OutputFormat::SAM};
CigarOps cigar_ops{CigarOps::M};
bool output_unmapped { true };
bool details{false};
Expand All @@ -88,7 +94,8 @@ void align_or_map_paired(
const IndexParameters& index_parameters,
const References& references,
const StrobemerIndex& index,
std::minstd_rand& random_engine
std::minstd_rand& random_engine,
std::vector<double> &abundances
);

void align_or_map_single(
Expand All @@ -101,7 +108,8 @@ void align_or_map_single(
const IndexParameters& index_parameters,
const References& references,
const StrobemerIndex& index,
std::minstd_rand& random_engine
std::minstd_rand& random_engine,
std::vector<double> &abundances
);

// Private declarations, only needed for tests
Expand Down
2 changes: 2 additions & 0 deletions src/cmdline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
args::Flag v(parser, "v", "Verbose output", {'v'});
args::Flag no_progress(parser, "no-progress", "Disable progress report (enabled by default if output is a terminal)", {"no-progress"});
args::Flag x(parser, "x", "Only map reads, no base level alignment (produces PAF file)", {'x'});
args::Flag aemb(parser, "aemb", "Output the estimated abundance value of contigs, the format of output file is: contig_id \t abundance_value", {"aemb"});
args::Flag interleaved(parser, "interleaved", "Interleaved input", {"interleaved"});
args::ValueFlag<std::string> index_statistics(parser, "PATH", "Print statistics of indexing to PATH", {"index-statistics"});
args::Flag i(parser, "index", "Do not map reads; only generate the strobemer index and write it to disk. If read files are provided, they are used to estimate read length", {"create-index", 'i'});
Expand Down Expand Up @@ -97,6 +98,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
if (index_statistics) { opt.logfile_name = args::get(index_statistics); }
if (i) { opt.only_gen_index = true; }
if (use_index) { opt.use_index = true; }
if (aemb) {opt.is_abundance_out = true; }

// SAM output
if (eqx) { opt.cigar_eqx = true; }
Expand Down
1 change: 1 addition & 0 deletions src/cmdline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ struct CommandLineOptions {
bool only_gen_index { false };
bool use_index { false };
bool is_sam_out { true };
bool is_abundance_out {false};

// SAM output
bool cigar_eqx { false };
Expand Down
56 changes: 41 additions & 15 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,12 @@ InputBuffer get_input_buffer(const CommandLineOptions& opt) {
}
}

void output_abundance(const std::vector<double>& abundances, const References& references){
for (size_t i = 0; i < references.size(); ++i) {
std::cout << references.names[i] << '\t' << std::fixed << std::setprecision(6) << abundances[i] / double(references.sequences[i].size()) << std::endl;
}
}

void show_progress_until_done(std::vector<int>& worker_done, std::vector<AlignmentStatistics>& stats) {
Timer timer;
bool reported = false;
Expand Down Expand Up @@ -155,6 +161,11 @@ int run_strobealign(int argc, char **argv) {
if (opt.c >= 64 || opt.c <= 0) {
throw BadParameter("c must be greater than 0 and less than 64");
}

if (!opt.is_sam_out && opt.is_abundance_out){
throw BadParameter("Can not use -x and --aemb at the same time");
}

InputBuffer input_buffer = get_input_buffer(opt);
if (!opt.r_set && !opt.reads_filename1.empty()) {
opt.r = estimate_read_length(input_buffer);
Expand Down Expand Up @@ -184,7 +195,10 @@ int run_strobealign(int argc, char **argv) {
map_param.dropoff_threshold = opt.dropoff_threshold;
map_param.rescue_level = opt.rescue_level;
map_param.max_tries = opt.max_tries;
map_param.is_sam_out = opt.is_sam_out;
map_param.output_format = (
opt.is_abundance_out ? OutputFormat::Abundance :
opt.is_sam_out ? OutputFormat::SAM :
OutputFormat::PAF);
map_param.cigar_ops = opt.cigar_eqx ? CigarOps::EQX : CigarOps::M;
map_param.output_unmapped = opt.output_unmapped;
map_param.details = opt.details;
Expand Down Expand Up @@ -288,32 +302,31 @@ int run_strobealign(int argc, char **argv) {
}

std::ostream out(buf);

if (map_param.is_sam_out) {
std::stringstream cmd_line;
for(int i = 0; i < argc; ++i) {
cmd_line << argv[i] << " ";
}

out << sam_header(references, opt.read_group_id, opt.read_group_fields);
if (opt.pg_header) {
out << pg_header(cmd_line.str());
}

if (map_param.output_format == OutputFormat::SAM) {
std::stringstream cmd_line;
for(int i = 0; i < argc; ++i) {
cmd_line << argv[i] << " ";
}
out << sam_header(references, opt.read_group_id, opt.read_group_fields);
if (opt.pg_header) {
out << pg_header(cmd_line.str());
}
}

std::vector<AlignmentStatistics> log_stats_vec(opt.n_threads);

logger.info() << "Running in " << (opt.is_SE ? "single-end" : "paired-end") << " mode" << std::endl;

OutputBuffer output_buffer(out);

std::vector<std::thread> workers;
std::vector<int> worker_done(opt.n_threads); // each thread sets its entry to 1 when it’s done
std::vector<std::vector<double>> worker_abundances(opt.n_threads, std::vector<double>(references.size(), 0));
for (int i = 0; i < opt.n_threads; ++i) {
std::thread consumer(perform_task, std::ref(input_buffer), std::ref(output_buffer),
std::ref(log_stats_vec[i]), std::ref(worker_done[i]), std::ref(aln_params),
std::ref(map_param), std::ref(index_parameters), std::ref(references),
std::ref(index), std::ref(opt.read_group_id));
std::ref(index), std::ref(opt.read_group_id), std::ref(worker_abundances[i]));
workers.push_back(std::move(consumer));
}
if (opt.show_progress && isatty(2)) {
Expand All @@ -329,6 +342,19 @@ int run_strobealign(int argc, char **argv) {
tot_statistics += it;
}

if (map_param.output_format == OutputFormat::Abundance) {
std::vector<double> abundances(references.size(), 0);
for (size_t i = 0; i < worker_abundances.size(); ++i) {
for (size_t j = 0; j < worker_abundances[i].size(); ++j) {
abundances[j] += worker_abundances[i][j];
}
}

// output the abundance file
output_abundance(abundances, references);
}


logger.info() << "Total mapping sites tried: " << tot_statistics.tot_all_tried << std::endl
<< "Total calls to ssw: " << tot_statistics.tot_aligner_calls << std::endl
<< "Inconsistent NAM ends: " << tot_statistics.inconsistent_nams << std::endl
Expand Down
Loading
Loading