Skip to content

Commit

Permalink
Merge pull request #261 from ksahlin/meqx
Browse files Browse the repository at this point in the history
Emit M instead of =/X CIGAR operations by default
  • Loading branch information
marcelm authored Apr 11, 2023
2 parents f4f133c + 56742b9 commit f041502
Show file tree
Hide file tree
Showing 16 changed files with 153 additions and 47 deletions.
16 changes: 9 additions & 7 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,27 @@

## development version

* #258: Fix compilation on MinGW. Thanks @teepean.
* #260: Include full command line in the SAM PG header.
* #258: Fixed compilation on MinGW. Thanks @teepean.
* #260: Include full command line in the SAM PG header. Thanks @telmin.
* #20: By default, emit `M` CIGAR operations instead of `=` and `X`.
Added option `--eqx` to use `=` and `X` as before.

## v0.9.0 (2023-03-16)

* Add progress report (only shown if output is not a terminal; can be
* Added progress report (only shown if output is not a terminal; can be
disabled with `--no-progress`)
* PR #250: Avoid overeager soft clipping by adding an “end bonus” to the
alignment score if the alignment reaches the 5' or 3' end of the read.
This is equivalent to penalizing soft-clipping and improves mapping
accuracy, in particular for short reads, as candidate mapping sites with and
without soft clipping are compared more fairly. Use `-L` to change the end
bonus. (This emulates a feature found in BWA-MEM.)
* Issue #238: Fix occasionally incorrect soft clipping.
* PR #239: Fix an uninitialized variable that could lead to nondeterministic
* Issue #238: Fixed occasionally incorrect soft clipping.
* PR #239: Fixed an uninitialized variable that could lead to nondeterministic
results.
* Issue #137: Compute TLEN (in SAM output) correctly
* PR #255: Add support for reading gzip-compressed reference FASTA files.
* Issue #222: Make it possible again to build strobealign from the release
* PR #255: Added support for reading gzip-compressed reference FASTA files.
* Issue #222: Made it possible again to build strobealign from the release
tarball (not only from the Git repository).

## v0.8.0 (2023-02-01)
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ options. Some important ones are:
`--create-index`, see [index files](#index-files).
* `-t N`, `--threads=N`: Use N threads. This mainly applies to the mapping step
as the indexing step is only partially parallelized.
* `--eqx`: Emit `=` and `X` CIGAR operations instead of `M`.
* `-x`: Only map reads, do not do no base-level alignment. This switches the
output format from SAM to [PAF](https://github.com/lh3/miniasm/blob/master/PAF.md).
* `--rg-id=ID`: Add RG tag to each SAM record.
Expand Down
33 changes: 15 additions & 18 deletions src/aln.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,14 +118,14 @@ static inline void align_SE_secondary_hits(
if (sam_aln.sw_score > best_align_sw_score) {
min_mapq_diff = std::max(0, sam_aln.sw_score - best_align_sw_score); // new distance to next best match
best_align_sw_score = sam_aln.sw_score;
best_sam_aln = sam_aln;
best_sam_aln = std::move(sam_aln);
if (max_secondary == 1) {
best_align_dist = sam_aln.global_ed;
best_align_dist = best_sam_aln.global_ed;
}
}
// sam_aln.ed = 10000; // init
if (max_secondary > 1) {
alignments.push_back(sam_aln);
alignments.emplace_back(sam_aln);
}
cnt++;
}
Expand Down Expand Up @@ -181,7 +181,7 @@ static inline alignment align_segment(

if (hamming_dist >= 0 && (((float) hamming_dist / read_segm_len) < 0.05) ) { //Hamming distance worked fine, no need to ksw align
auto info = hamming_align(read_segm, ref_segm_ham, aligner.parameters.match, aligner.parameters.mismatch, aligner.parameters.end_bonus);
sam_aln_segm.cigar = info.cigar.to_string();
sam_aln_segm.cigar = std::move(info.cigar);
sam_aln_segm.ed = info.edit_distance;
sam_aln_segm.sw_score = info.sw_score;
sam_aln_segm.ref_start = ref_start + ext_left + info.query_start;
Expand All @@ -193,7 +193,7 @@ static inline alignment align_segment(
}
}
auto info = aligner.align(read_segm, ref_segm);
sam_aln_segm.cigar = info.cigar.to_string();
sam_aln_segm.cigar = std::move(info.cigar);
sam_aln_segm.ed = info.edit_distance;
sam_aln_segm.sw_score = info.sw_score;
sam_aln_segm.ref_start = ref_start + info.ref_start;
Expand Down Expand Up @@ -262,7 +262,7 @@ static inline alignment get_alignment(
}
int softclipped = info.query_start + (query.size() - info.query_end);
alignment sam_aln;
sam_aln.cigar = info.cigar.to_string();
sam_aln.cigar = std::move(info.cigar);
sam_aln.ed = info.edit_distance;
sam_aln.global_ed = info.edit_distance + softclipped;
sam_aln.sw_score = info.sw_score;
Expand Down Expand Up @@ -310,8 +310,8 @@ static inline alignment get_alignment_unused(
int hamming_dist = hamming_distance(r_tmp, ref_segm);

if (hamming_dist >= 0 && (((float) hamming_dist / ref_segm_size) < 0.05) ) { //Hamming distance worked fine, no need to ksw align
const auto info = hamming_align(r_tmp, ref_segm, aligner.parameters.match, aligner.parameters.mismatch, aligner.parameters.end_bonus);
sam_aln.cigar = info.cigar.to_string();
auto info = hamming_align(r_tmp, ref_segm, aligner.parameters.match, aligner.parameters.mismatch, aligner.parameters.end_bonus);
sam_aln.cigar = std::move(info.cigar);
sam_aln.ed = info.edit_distance;
sam_aln.sw_score = info.sw_score; // aln_params.match*(read_len-hamming_dist) - aln_params.mismatch*hamming_dist;
sam_aln.ref_start = ref_start + ext_left + info.query_start;
Expand Down Expand Up @@ -383,7 +383,7 @@ static inline alignment get_alignment_unused(

// Stitch together
sam_aln.ref_id = n.ref_id;
sam_aln.cigar = sam_aln_segm_left.cigar + sam_aln_segm_right.cigar;
sam_aln.cigar = Cigar(sam_aln_segm_left.cigar.to_string() + sam_aln_segm_right.cigar.to_string());
sam_aln.ed = sam_aln_segm_left.ed + sam_aln_segm_right.ed;
sam_aln.sw_score = sam_aln_segm_left.sw_score + sam_aln_segm_right.sw_score;
sam_aln.ref_start = sam_aln_segm_left.ref_start;
Expand Down Expand Up @@ -568,9 +568,6 @@ static inline bool sort_scores(const std::tuple<double, alignment, alignment> &a
return std::get<0>(a) > std::get<0>(b);
}




static inline void get_best_scoring_pair(
const std::vector<alignment> &aln_scores1, const std::vector<alignment> &aln_scores2, std::vector<std::tuple<double,alignment,alignment>> &high_scores, float mu, float sigma
) {
Expand All @@ -585,7 +582,7 @@ static inline void get_best_scoring_pair(
// 10 corresponds to a value of log(normal_pdf(dist, mu, sigma)) of more than 4 stddevs away
score -= 10;
}
high_scores.push_back(std::make_tuple(score, a1, a2));
high_scores.emplace_back(std::make_tuple(score, a1, a2));
}
}
std::sort(high_scores.begin(), high_scores.end(), sort_scores); // Sorting by highest score first
Expand Down Expand Up @@ -742,7 +739,7 @@ static inline void rescue_mate(
ref_end = std::min(ref_len, std::max(0, b));

if (ref_end < ref_start + k){
sam_aln.cigar = "*";
sam_aln.cigar = Cigar();
sam_aln.ed = read_len;
sam_aln.sw_score = 0;
sam_aln.aln_score = 0;
Expand All @@ -756,7 +753,7 @@ static inline void rescue_mate(
std::string ref_segm = references.sequences[n.ref_id].substr(ref_start, ref_end - ref_start);

if (!has_shared_substring(r_tmp, ref_segm, k)){
sam_aln.cigar = "*";
sam_aln.cigar = Cigar();
sam_aln.ed = read_len;
sam_aln.sw_score = 0;
sam_aln.aln_score = 0;
Expand Down Expand Up @@ -786,7 +783,7 @@ static inline void rescue_mate(
// info = ksw_align(ref_ptr, ref_segm.size(), read_ptr, r_tmp.size(), 1, 4, 6, 1, ez);
// std::cerr << "Cigar: " << info.cigar << std::endl;

sam_aln.cigar = info.cigar.to_string();
sam_aln.cigar = info.cigar;
sam_aln.ed = info.edit_distance;
sam_aln.sw_score = info.sw_score;
sam_aln.aln_score = sam_aln.sw_score;
Expand Down Expand Up @@ -837,13 +834,13 @@ void rescue_read(
statistics.did_not_fit++;
}
alignment a1 = get_alignment(aligner, n, references, read1, fits);
aln_scores1.push_back(a1);
aln_scores1.emplace_back(a1);

//////// Force SW alignment to rescue mate /////////
alignment a2;
// std::cerr << query_acc2 << " force rescue" << std::endl;
rescue_mate(aligner, n, references, read1, read2, a2, mu, sigma, statistics.tot_rescued, k);
aln_scores2.push_back(a2);
aln_scores2.emplace_back(a2);

cnt1 ++;
statistics.tot_all_tried ++;
Expand Down
1 change: 1 addition & 0 deletions src/aln.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ struct mapping_params {
int maxTries { 20 };
int rescue_cutoff;
bool is_sam_out { true };
bool cigar_eqx { true };
bool output_unmapped { true };
};

Expand Down
14 changes: 14 additions & 0 deletions src/cigar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,20 @@
#include <sstream>
#include "cigar.hpp"

/* Return a new Cigar that uses M operations instead of =/X */
Cigar Cigar::to_m() const {
Cigar cigar;
for (auto op_len : m_ops) {
auto op = op_len & 0xf;
auto len = op_len >> 4;
if (op == CIGAR_EQ || op == CIGAR_X) {
cigar.push(CIGAR_MATCH, len);
} else {
cigar.push(op, len);
}
}
return cigar;
}

Cigar Cigar::to_eqx(const std::string& query, const std::string& ref) const {
size_t i = 0, j = 0;
Expand Down
7 changes: 6 additions & 1 deletion src/cigar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class Cigar {

explicit Cigar(std::vector<uint32_t> ops) : m_ops(std::move(ops)) { }

Cigar(Cigar& other) : m_ops(other.m_ops) { }
Cigar(const Cigar& other) : m_ops(other.m_ops) { }

explicit Cigar(uint32_t* ops, size_t n) {
m_ops.assign(ops, ops + n);
Expand All @@ -38,6 +38,8 @@ class Cigar {
*this = std::move(other);
}

Cigar& operator=(const Cigar&) = default;

Cigar& operator=(Cigar&& other) {
if (this != &other) {
m_ops = std::move(other.m_ops);
Expand Down Expand Up @@ -79,6 +81,9 @@ class Cigar {
std::reverse(m_ops.begin(), m_ops.end());
}

/* Return a new Cigar that uses M instead of =/X */
Cigar to_m() const;

/* Return a new Cigar that uses =/X instead of M */
Cigar to_eqx(const std::string& query, const std::string& ref) const;

Expand Down
2 changes: 2 additions & 0 deletions src/cmdline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
args::ValueFlag<std::string> o(parser, "PATH", "redirect output to file [stdout]", {'o'});
args::Flag v(parser, "v", "Verbose output", {'v'});
args::Flag no_progress(parser, "no-progress", "Disable progress report (enabled by default if output is a terminal)", {"no-progress"});
args::Flag eqx(parser, "eqx", "Emit =/X instead of M CIGAR operations", {"eqx"});
args::Flag x(parser, "x", "Only map reads, no base level alignment (produces PAF file)", {'x'});
args::Flag U(parser, "U", "Suppress output of unmapped reads", {'U'});
args::Flag interleaved(parser, "interleaved", "Interleaved input", {"interleaved"});
Expand Down Expand Up @@ -88,6 +89,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
if (o) { opt.output_file_name = args::get(o); opt.write_to_stdout = false; }
if (v) { opt.verbose = true; }
if (no_progress) { opt.show_progress = false; }
if (eqx) { opt.cigar_eqx = true; }
if (x) { opt.is_sam_out = false; }
if (U) { opt.output_unmapped = false; }
if (rgid) { opt.read_group_id = args::get(rgid); }
Expand Down
1 change: 1 addition & 0 deletions src/cmdline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ struct CommandLineOptions {
bool write_to_stdout { true };
bool verbose { false };
bool show_progress { true };
bool cigar_eqx { false };
std::string read_group_id { "" };
std::vector<std::string> read_group_fields;
std::string logfile_name { "" };
Expand Down
1 change: 1 addition & 0 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ int run_strobealign(int argc, char **argv) {
map_param.R = opt.R;
map_param.maxTries = opt.maxTries;
map_param.is_sam_out = opt.is_sam_out;
map_param.cigar_eqx = opt.cigar_eqx;
map_param.output_unmapped = opt.output_unmapped;

log_parameters(index_parameters, map_param, aln_params);
Expand Down
3 changes: 2 additions & 1 deletion src/pc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ void perform_task(

std::string sam_out;
sam_out.reserve(7*map_param.r * (records1.size() + records3.size()));
Sam sam{sam_out, references, read_group_id, map_param.output_unmapped};
CigarOps cigar_ops = map_param.cigar_eqx ? CigarOps::EQX : CigarOps::M;
Sam sam{sam_out, references, cigar_ops, read_group_id, map_param.output_unmapped};
for (size_t i = 0; i < records1.size(); ++i) {
auto record1 = records1[i];
auto record2 = records2[i];
Expand Down
18 changes: 16 additions & 2 deletions src/sam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,20 @@ void Sam::append_tail() {
sam_string.append(tail);
}

std::string Sam::cigar_string(const Cigar& cigar) const {
if (cigar.empty()) {
// This case should normally not occur because
// unmapped reads would be added with add_unmapped,
// which hardcodes the "*"
return "*";
}
if (cigar_ops == CigarOps::EQX) {
return cigar.to_string();
} else {
return cigar.to_m().to_string();
}
}

void Sam::add_unmapped(const KSeq& record, int flags) {
if (!output_unmapped) {
return;
Expand Down Expand Up @@ -104,7 +118,7 @@ void Sam::add_record(
const std::string& reference_name,
int pos,
int mapq,
const std::string& cigar,
const Cigar& cigar,
const std::string& mate_name,
int mate_ref_start,
int template_len,
Expand All @@ -124,7 +138,7 @@ void Sam::add_record(
sam_string.append("\t");
sam_string.append(std::to_string(mapq));
sam_string.append("\t");
sam_string.append(cigar);
sam_string.append(cigar_string(cigar));
sam_string.append("\t");

sam_string.append(mate_name);
Expand Down
26 changes: 21 additions & 5 deletions src/sam.hpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
#ifndef SAM_HPP
#define SAM_HPP
#ifndef STROBEALIGN_SAM_HPP
#define STROBEALIGN_SAM_HPP

#include <string>
#include "kseq++.hpp"
#include "refs.hpp"
#include "cigar.hpp"


struct alignment {
std::string cigar;
Cigar cigar;
int ref_start;
int ed;
int global_ed;
Expand Down Expand Up @@ -34,12 +36,24 @@ enum SamFlags {
SUPPLEMENTARY = 0x800,
};

enum struct CigarOps {
EQX = 0, // use = and X CIGAR operations
M = 1, // use M CIGAR operations
};

class Sam {

public:
Sam(std::string& sam_string, const References& references, const std::string& read_group_id = "", bool output_unmapped = true)
Sam(
std::string& sam_string,
const References& references,
CigarOps cigar_ops = CigarOps::EQX,
const std::string& read_group_id = "",
bool output_unmapped = true
)
: sam_string(sam_string)
, references(references)
, cigar_ops(cigar_ops)
, output_unmapped(output_unmapped) {
if (read_group_id.empty()) {
tail = "\n";
Expand All @@ -55,12 +69,14 @@ class Sam {
void add_unmapped_pair(const klibpp::KSeq& r1, const klibpp::KSeq& r2);
void add_unmapped_mate(const klibpp::KSeq& record, int flags, const std::string& mate_rname, int mate_pos);

void add_record(const std::string& query_name, int flags, const std::string& reference_name, int pos, int mapq, const std::string& cigar, const std::string& mate_name, int mate_ref_start, int template_len, const std::string& query_sequence, const std::string& query_sequence_rc, const std::string& qual, int ed, int aln_score);
void add_record(const std::string& query_name, int flags, const std::string& reference_name, int pos, int mapq, const Cigar& cigar, const std::string& mate_name, int mate_ref_start, int template_len, const std::string& query_sequence, const std::string& query_sequence_rc, const std::string& qual, int ed, int aln_score);

private:
void append_tail();
std::string cigar_string(const Cigar& cigar) const;
std::string& sam_string;
const References& references;
const CigarOps cigar_ops;
std::string tail;
bool output_unmapped;
};
Expand Down
10 changes: 8 additions & 2 deletions tests/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,18 @@ if strobealign -G > /dev/null 2> /dev/null; then false; fi
strobealign -h > /dev/null

# Single-end SAM
strobealign --chunk-size 3 --rg-id 1 --rg SM:sample --rg LB:library -v tests/phix.fasta tests/phix.1.fastq | grep -v '^@PG' > phix.se.sam
strobealign --eqx --chunk-size 3 --rg-id 1 --rg SM:sample --rg LB:library -v tests/phix.fasta tests/phix.1.fastq | grep -v '^@PG' > phix.se.sam
diff tests/phix.se.sam phix.se.sam
rm phix.se.sam

# Single-end SAM, M CIGAR operators
strobealign tests/phix.fasta tests/phix.1.fastq | grep -v '^@PG' > phix.se.m.sam
if samtools view phix.se.m.sam | cut -f6 | grep -q '[X=]'; then false; fi

rm phix.se.m.sam

# Paired-end SAM
strobealign --chunk-size 3 --rg-id 1 --rg SM:sample --rg LB:library tests/phix.fasta tests/phix.1.fastq tests/phix.2.fastq | grep -v '^@PG' > phix.pe.sam
strobealign --eqx --chunk-size 3 --rg-id 1 --rg SM:sample --rg LB:library tests/phix.fasta tests/phix.1.fastq tests/phix.2.fastq | grep -v '^@PG' > phix.pe.sam
diff tests/phix.pe.sam phix.pe.sam
rm phix.pe.sam

Expand Down
2 changes: 1 addition & 1 deletion tests/test_aligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ TEST_CASE("hamming_align") {
"", "",
7, 5, 0
);
CHECK(info.cigar.to_string() == "");
CHECK(info.cigar.empty());
CHECK(info.edit_distance == 0);
CHECK(info.sw_score == 0);
CHECK(info.ref_start == 0);
Expand Down
Loading

0 comments on commit f041502

Please sign in to comment.