Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add inline comments #46

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions neusomatic/include/bam_variant.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#ifndef BAM_VARIANT_HPP
#define BAM_VARIANT_HPP

// For an alignment record, get the variants based on the MD tags.

#include <vector>
#include <string>
#include "variant.hpp"
Expand Down
3 changes: 3 additions & 0 deletions neusomatic/include/change_coordinates.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#ifndef CHANGE_COORDINATES_HPP
#define CHANGE_COORDINATES_HPP

// Given a reference string in gapped space (reference bases seperated by 0 or more gaps),
// This class can calculate the gapped postion based on a ungapped postion, or vice versa.

#include "SeqanUtils.h"

namespace neusomatic {
Expand Down
19 changes: 18 additions & 1 deletion neusomatic/include/msa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,11 @@ class ReadSeqGap {

template<typename BamRecord, typename GInv>
class MSABuilder {
/*
* Building a MSA from pairwise bam alignment.
* This requires reading a pileup of alignment, usually in BAM format
* Each MSABuilder contains intrinsiclly a convertion from nongapped space to gapped space.
*/
const std::vector<BamRecord>& bam_records_;
//std::vector<size_t> rids_;
GappedSeq<char, GInv> ref_gaps_;
Expand Down Expand Up @@ -217,6 +222,9 @@ class MSABuilder {


auto GetGappedSeqAndQual(const BamRecord& r) const {
//Initialize a seq of MSA row length with ~(missing char). Replace ~ with -(gap char) between the begin and
//end pos (in gapped space) of a alignment. Iterate through the cigars, fill in the seq with read bases based
//on the cigars. Some idea for qual.

std::string msa_bases(ncol(), missing_chr_);
std::string msa_bquals(ncol(), 33);
Expand Down Expand Up @@ -279,7 +287,7 @@ class MSABuilder {
}
read_pos += c->Length();
}
if (c->Type() == 'D') {
if (c->Type() == 'D') { // for deletion qual, always use one base before.
const int l = std::max(ref_pos, ref_gaps_.left());
const int r = std::min(int32_t(ref_pos + c->Length()), ref_gaps_.right());
if (l < r) {
Expand Down Expand Up @@ -308,6 +316,9 @@ class MSABuilder {
}

MSABuilder(const GInv& ginv, const std::vector<BamRecord>& bams, const std::string& refstr):
//Given a reference seq (nongapped) and a set of bam records, set up the MSA
//reference in gapped space. For each ref pos, set up the gap length as the longest
//insertion length of the pileup.
bam_records_(bams), ref_gaps_(refstr, ginv) {
BuildRefGap(bams, ref_gaps_);
ncol_ = ref_gaps_.length();
Expand All @@ -329,7 +340,12 @@ class MSABuilder {
}
}

//Deprecated
std::vector<std::string> GetMSA() const {
// For each bam record, first get all variants. And then use the variants to
// To get the variants, it uses the MD tags which may or may not exist depends on the aligner.
// replace the reference bases if applies.
// Return a MSA matrix as vector of strings
std::vector<std::string> result(bam_records_.size());
for (size_t i = 0; i < bam_records_.size(); ++i) {
auto const& r = bam_records_[i];
Expand All @@ -341,6 +357,7 @@ class MSABuilder {
}

auto GetClipping(const BamRecord& r) const {
// Return the clipping positions. If no clipping return -1.
int lsc = -1, rsc = -1;
auto const& cigar = r.GetCigar();
auto front_cigar=cigar.front().Type();
Expand Down
30 changes: 21 additions & 9 deletions neusomatic/include/msa_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,18 @@
namespace neusomatic{

class Col{
/*
* Each Col is a reduced representation of a MSA column,
* i.e., counts of A,T,C,G,-,~, with averaged base qualities and etc..
* A,C,G,T-(gap),~(missing) are coded as 0,1,2,3,4,5
* Therefore, we can use a vector to store the mean or the freq for different base, the accessed by the indices.
* For instance, base_freq_[2] is the occurance (maybe normalized by depth) of G in a MSA col.
*/
private:
using Idx = unsigned;
using Base = int;
std::vector<Base> bases_;
std::vector<int> bquals_;
std::vector<Base> bases_; // deprecated, no longer storing individual base
std::vector<int> bquals_; // deprecated, no longer storing individual qual

public:
static const int ALPHABET_SIZE = 6; // a, t, c, g, gap and missing_char
Expand All @@ -34,32 +41,33 @@ class Col{
std::vector<int> lsc_mean;
std::vector<int> rsc_mean;
std::vector<std::vector<float>> tag_mean;
decltype(auto) bases() const {return (bases_);}
decltype(auto) bquals() const {return (bquals_);}
decltype(auto) bases() const {return (bases_);} // deprecated, see above
decltype(auto) bquals() const {return (bquals_);} // deprecated, see above

void emplace(const Idx& id, const Base& b) {
void emplace(const Idx& id, const Base& b) { // deprecated, see above
bases_[id] = b;
}

void emplace_qual(const Idx& id, const int& b) {
void emplace_qual(const Idx& id, const int& b) {// deprecated, see above
bquals_[id] = b;
}

decltype(auto) at(Idx i) const {
decltype(auto) at(Idx i) const {// deprecated, see above
return bases_.at(i);
}

decltype(auto) size() const {
decltype(auto) size() const {// deprecated, see above
return bases_.size();
}

decltype(auto) operator[](Idx i) {
decltype(auto) operator[](Idx i) {// deprecated, see above
return (bases_[i]);
}
};

template<typename Base>
class CondensedArray{
//CondenseArray is an array of Col (ColSpace).
public:
using Idx = unsigned;
static const unsigned char missing_chr_ = '~';
Expand Down Expand Up @@ -130,8 +138,10 @@ class CondensedArray{

CondensedArray() : nrow_(0) {}

//deprecated.
template<typename GInv>
explicit CondensedArray(const std::vector<std::string>& msa, const int& total_cov, const GInv& ginv, const std::string& refgap) :
//construct a condensed array without qualities and other info from a vector of base strings.
nrow_(msa.size()),
cspace_(msa[0].size(),
Col(msa.size())),
Expand Down Expand Up @@ -205,8 +215,10 @@ class CondensedArray{
}


//deprecated
template<typename GInv>
explicit CondensedArray(const std::vector<std::string>& msa, const std::vector<std::string>& bqual,
//construct a condensed array with qualities and other info from multiple inputs.
const std::vector<int>& mqual, const std::vector<int>& strand,
const std::vector<int>& lsc, const std::vector<int>& rsc,
const std::vector<std::vector<int>>& tags,
Expand Down