Skip to content

Commit

Permalink
Merge pull request #16 from vpc-ccg/dev-clean
Browse files Browse the repository at this point in the history
Fixed truncation bug with negative strands
  • Loading branch information
f0t1h authored Jul 25, 2023
2 parents 6cb8360 + 77d50e7 commit e21ec1e
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 93 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ bin
*.pyc
test/build
output/
compile_flags.txt
76 changes: 6 additions & 70 deletions src/interval.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,13 @@ class interval {
}
return 0;
}
friend int larger_interval(const interval &i1, const interval &i2) {
return std::max(i1.end - i1.start, i2.end - i2.start);
}
friend int larger_interval(const interval &i1, const interval &i2) { return std::max(i1.size(), i2.size()); }
double reciprocal(const interval &other) const {
return static_cast<double>(overlap(other)) / larger_interval(*this, other);
}
bool contains(int pos) const { return pos > start && pos < end; }
constexpr bool contains(int pos) const { return pos > start && pos < end; }

int size() const { return end - start; }
constexpr int size() const { return end - start; }
};

class contig_str : public string {
Expand Down Expand Up @@ -857,10 +855,9 @@ struct molecule_descriptor {
return this;
}

size_t size() const {
return std::accumulate(
_segments.begin(), _segments.end(), 0L,
[](size_t sum_so_far, const ginterval &g) -> size_t { return sum_so_far + g.end - g.start; });
constexpr size_t size() const {
return std::accumulate(_segments.begin(), _segments.end(), 0L,
[](size_t sum_so_far, const ginterval &g) -> size_t { return sum_so_far + g.size(); });
}

string dump_comment() const {
Expand Down Expand Up @@ -903,66 +900,5 @@ flip_molecule(const molecule_descriptor &md) {

return flipped_md;
}
/*
// pcr copy structure that tracks pcr errors introduced
struct pcr_copy {
string id;
vector<ginterval> segments;
vector<std::pair<int, char>> errors_so_far;
bool reversed;
int depth;
string comment;
pcr_copy() {}
pcr_copy(const string &id) : id(id), depth(1) {}
pcr_copy(const string &id, const isoform &iso) : id(id), depth(iso.depth) {
for (const exon &e : iso.segments) {
segments.push_back(e);
}
}
pcr_copy(const isoform &iso) : id(iso.transcript_id), depth(iso.depth) {
for (const exon &e : iso.segments) {
segments.push_back(e);
}
}
pcr_copy(const string &id, const vector<ginterval> &segments, const vector<std::pair<int, char>> &errors_so_far)
: id(id), segments(segments), errors_so_far(errors_so_far), depth(1) {}
pcr_copy(const string &id, const vector<ginterval> &segments, const vector<std::pair<int, char>> &errors_so_far,
int depth)
: id(id), segments(segments), errors_so_far(errors_so_far), depth(depth) {}
pcr_copy(const vector<ginterval> &segments, const vector<std::pair<int, char>> &errors_so_far)
: id("copy"), segments(segments), errors_so_far(errors_so_far), depth(1) {}
pcr_copy(const vector<ginterval> &segments) : id("copy"), segments(segments), depth(1) {}

void append(const ginterval &g) { segments.push_back(g); }
void prepend(const ginterval &g) {
segments.insert(segments.begin(), g);
for (std::pair<int, char> &errs : errors_so_far) {
errs = std::make_pair(errs.first + g.end - g.start, errs.second);
}
}
size_t size() const {
return std::accumulate(
segments.begin(), segments.end(), 0L,
[](size_t sum_so_far, const ginterval &g) -> size_t { return sum_so_far + g.end - g.start; });
}
};
// pcr molecule structure that can model paired molecules
struct pcr_molecule {
vector<molecule_descriptor> paired;
pcr_molecule() {}
pcr_molecule(const pcr_molecule &other) : paired(other.paired) {}
pcr_molecule(const molecule_descriptor &other) { paired.push_back(other); }
pcr_molecule(const string &id, const isoform &other) { paired.push_back(*molecule_descriptor{other}.id(id)); }
pcr_molecule(const pcr_molecule &first, const pcr_molecule &second) : paired(first.paired) {
paired.reserve(first.paired.size() + second.paired.size());
paired.insert(paired.end(), second.paired.begin(), second.paired.end());
}
};
*/
#endif
51 changes: 28 additions & 23 deletions src/truncate.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#include "truncate.h"

#include <fmt/compile.h>
#include <fmt/core.h>

#include <cxxopts.hpp>
#include <npy/npy.hpp>
#include <random>
Expand All @@ -11,40 +14,44 @@
#include "pimpl.h"

inline void
truncate(molecule_descriptor &md, int truncated_length, int min_val = 100) {
if (min_val > truncated_length) {
truncated_length = min_val;
truncate(molecule_descriptor &md, int post_truncation_length, int min_val = 100) {
if (min_val > post_truncation_length) {
post_truncation_length = min_val;
}
size_t i = 0;
int len_so_far = 0;
auto &segments = md.get_segments();
size_t i = 0;
int kept_so_far = 0;
auto &segments = md.get_segments();
for (const ginterval &g : segments) {
len_so_far += (g.end - g.start);
if (len_so_far >= truncated_length) {
if (kept_so_far + g.size() >= post_truncation_length) {
break;
}
kept_so_far += g.size();
++i;
}
if (i != segments.size()) {
std::stringstream ss;
ss << segments[i].chr << ':' << segments[i].end - (len_so_far - truncated_length) << '-' << segments[i].end;
md.add_comment("truncated", ss.str());
segments[i].truncate(0, segments[i].end - segments[i].start - (len_so_far - truncated_length));
for (size_t j = i + 1; j < segments.size(); ++j) {
std::stringstream ss;
ss << segments[j];
int segment_kept_len = (post_truncation_length - kept_so_far);

int trunc_start, trunc_end;
if (segments[i].plus_strand) {
trunc_start = segments[i].start + segment_kept_len;
trunc_end = segments[i].end;
segments[i].truncate(0, segment_kept_len);
}
else {
trunc_start = segments[i].start;
trunc_end = segments[i].end - segment_kept_len;
segments[i].truncate(segments[i].size() - segment_kept_len, segments[i].size());
}

md.add_comment("truncated", ss.str());
md.add_comment("truncated", fmt::format(FMT_COMPILE("{}:{}-{}"), segments[i].chr, trunc_start, trunc_end));
for (size_t j = i + 1; j < segments.size(); ++j) {
md.add_comment("truncated",
fmt::format(FMT_COMPILE("{}:{}-{}"), segments[j].chr, segments[j].start, segments[j].end));
}
logd("Before resize: {}", md.cget_segments().size());
segments.resize(i + 1);
logd("After resize: {}", md.cget_segments().size());
}
int sum = 0;
for (const auto &g : md.cget_segments()) {
sum += g.end - g.start;
}
logd("sum: {} truncated_len: {}", sum, truncated_length);
}

template <class RealType = double, class IndexType = long>
Expand Down Expand Up @@ -288,8 +295,6 @@ class Truncate_module::impl : public tksm_module {
std::ofstream output_file{output_file_path};
auto disko = get_dist();

// std::ranges::copy( stream_mdf(args["input"].as<string>(), true) | truncate_transformer(disko),
// std::ostream_iterator<molecule_descriptor>{output_file});
for (const auto &md : stream_mdf(args["input"].as<string>(), true) | truncate_transformer(disko, rand_gen)) {
output_file << md;
}
Expand Down

0 comments on commit e21ec1e

Please sign in to comment.