Skip to content

Commit

Permalink
compute raw activity profile
Browse files Browse the repository at this point in the history
  • Loading branch information
joergi-w committed Sep 23, 2021
1 parent f31f991 commit 487cdd8
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 13 deletions.
1 change: 1 addition & 0 deletions include/iGenVar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ struct cmd_arguments
// Input:
/* -i */ std::filesystem::path alignment_short_reads_file_path{""};
/* -j */ std::filesystem::path alignment_long_reads_file_path{""};
/* -g */ std::filesystem::path genome_file_path{""};
// Output:
/* -o */ std::filesystem::path output_file_path{};
/* -s */ std::string vcf_sample_name{"MYSAMPLE"};
Expand Down
9 changes: 9 additions & 0 deletions include/variant_detection/snp_indel_detection.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#pragma once

#include <seqan3/std/filesystem>

/*!
* \brief Detect Single Nucleotide Polymorphisms (SNPs) and short deletions and insertions.
* \param reads_filename The file path where to find the sequenced reads.
*/
void detect_snp_and_indel(std::filesystem::path const & reads_filename);
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ add_library ("${PROJECT_NAME}_lib" STATIC modules/clustering/hierarchical_cluste
structures/cluster.cpp
structures/junction.cpp
variant_detection/method_enums.cpp
variant_detection/snp_indel_detection.cpp
variant_detection/variant_detection.cpp
variant_detection/variant_output.cpp)

Expand Down
35 changes: 26 additions & 9 deletions src/iGenVar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "modules/clustering/hierarchical_clustering_method.hpp" // for the hierarchical clustering method
#include "modules/clustering/simple_clustering_method.hpp" // for the simple clustering method
#include "structures/cluster.hpp" // for class Cluster
#include "variant_detection/snp_indel_detection.hpp" // for detect_snp_and_indel
#include "variant_detection/variant_detection.hpp" // for detect_junctions_in_long_reads_sam_file()
#include "variant_detection/variant_output.hpp" // for find_and_output_variants()

Expand Down Expand Up @@ -35,6 +36,11 @@ void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments
"Input long read alignments in SAM or BAM format (PacBio, Oxford Nanopore, ...).",
seqan3::option_spec::standard,
seqan3::input_file_validator{{"sam", "bam"}} );
parser.add_option(args.genome_file_path,
'g', "input_genome",
"Input the reference genome in FASTA or FASTQ format.",
seqan3::option_spec::standard,
seqan3::input_file_validator{{"fasta", "fa", "fastq"}} );
parser.add_option(args.output_file_path, 'o', "output",
"The path of the vcf output file. If no path is given, will output to standard output.",
seqan3::option_spec::standard,
Expand Down Expand Up @@ -124,18 +130,26 @@ void detect_variants_in_alignment_file(cmd_arguments const & args)
std::vector<Junction> junctions{};
std::map<std::string, int32_t> references_lengths{};

// short reads
if (args.alignment_short_reads_file_path != "")
if (args.genome_file_path.empty())
{
seqan3::debug_stream << "Detect junctions in short reads...\n";
detect_junctions_in_short_reads_sam_file(junctions, references_lengths, args);
}
// short reads
if (!args.alignment_short_reads_file_path.empty())
{
seqan3::debug_stream << "Detect junctions in short reads...\n";
detect_junctions_in_short_reads_sam_file(junctions, references_lengths, args);
}

// long reads
if (args.alignment_long_reads_file_path != "")
// long reads
if (!args.alignment_long_reads_file_path.empty())
{
seqan3::debug_stream << "Detect junctions in long reads...\n";
detect_junctions_in_long_reads_sam_file(junctions, references_lengths, args);
}
}
else
{
seqan3::debug_stream << "Detect junctions in long reads...\n";
detect_junctions_in_long_reads_sam_file(junctions, references_lengths, args);
seqan3::debug_stream << "Detect SNPs, insertions and deletions in short reads...\n";
detect_snp_and_indel(args.alignment_long_reads_file_path);
}

std::sort(junctions.begin(), junctions.end());
Expand Down Expand Up @@ -233,6 +247,9 @@ int main(int argc, char ** argv)
return -1;
}

// Set the number of decompression threads
seqan3::contrib::bgzf_thread_count = args.threads;

// Check that method selection contains no duplicates.
std::vector<detection_methods> unique_methods{args.methods};
std::ranges::sort(unique_methods);
Expand Down
94 changes: 94 additions & 0 deletions src/variant_detection/snp_indel_detection.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#include <iostream>

#include <seqan3/alphabet/cigar/cigar.hpp>
#include <seqan3/core/debug_stream.hpp>
#include <seqan3/io/sam_file/input.hpp>

#include "iGenVar.hpp"
#include "variant_detection/bam_functions.hpp" // check the sam flags
#include "variant_detection/snp_indel_detection.hpp" // detect_snp_and_indel

void detect_snp_and_indel(std::filesystem::path const & reads_filename)
{
// Get the header information and set the necessary fields.
using sam_fields = seqan3::fields<seqan3::field::flag, // 2: FLAG
seqan3::field::ref_id, // 3: RNAME
seqan3::field::ref_offset, // 4: POS
seqan3::field::mapq, // 5: MAPQ
seqan3::field::cigar>; // 6: CIGAR

seqan3::sam_file_input reads_file{reads_filename, sam_fields{}};
auto const & header = reads_file.header();

// Check that the file is sorted.
if (header.sorting != "coordinate")
throw seqan3::format_error{"ERROR: Input file must be sorted by coordinate (e.g. samtools sort)"};

// Prepare one activity vector per reference sequence.
std::vector<std::vector<unsigned>> activity{};
activity.resize(header.ref_id_info.size()); // allocate the number of reference sequences
for (size_t idx = 0; idx < activity.size(); ++idx)
activity[idx].resize(std::get<0>(header.ref_id_info[idx])); // allocate the length of the reference sequence

for (auto && record : reads_file)
{
seqan3::sam_flag const flag = record.flag(); // 2: FLAG
int32_t const ref_id = record.reference_id().value_or(-1); // 3: RNAME
int32_t ref_pos = record.reference_position().value_or(-1); // 4: POS

// Skip reads with certain properties.
if (hasFlagUnmapped(flag) ||
hasFlagSecondary(flag) ||
hasFlagDuplicate(flag) ||
record.mapping_quality() < 20 ||
ref_id < 0 ||
ref_pos < 0)
{
continue;
}

// Track the current position within the read.
unsigned read_pos = 0;

// Step through the CIGAR string.
for (auto && [length, operation] : record.cigar_sequence())
{
// Skip long variants.
if (length >= 30) // TODO: use min_length parameter
{
read_pos += length;
ref_pos += length;
continue;
}

// Case distinction for cigar elements.
seqan3::cigar::operation const op = operation;
switch (op.to_char())
{
case 'I':
case 'S': // insertion or soft clip
{
activity[ref_id][ref_pos] += length;
read_pos += length;
break;
}
case 'D': // deletion
{
for (unsigned idx = 0; idx < length; ++idx)
++activity[ref_id][ref_pos + idx];
ref_pos += length;
break;
}
default: // match or mismatch
{ // TODO: ATTENTION, mismatches are currently ignored and should still be verified with the reference
ref_pos += length;
read_pos += length;
}
}
}
}

// Print the raw activity profiles.
for (auto const & av : activity)
seqan3::debug_stream << av << std::endl;
}
4 changes: 0 additions & 4 deletions src/variant_detection/variant_detection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@ void detect_junctions_in_short_reads_sam_file(std::vector<Junction> & junctions,
seqan3::field::ref_offset, // 4: POS
seqan3::field::mapq>; // 5: MAPQ

// Set number of decompression threads
seqan3::contrib::bgzf_thread_count = args.threads;
seqan3::sam_file_input alignment_short_reads_file{args.alignment_short_reads_file_path, my_fields{}};

std::deque<std::string> const ref_ids = read_header_information(alignment_short_reads_file, references_lengths);
Expand Down Expand Up @@ -115,8 +113,6 @@ void detect_junctions_in_long_reads_sam_file(std::vector<Junction> & junctions,
seqan3::field::seq, // 10:SEQ
seqan3::field::tags>;

// Set number of decompression threads
seqan3::contrib::bgzf_thread_count = args.threads;
seqan3::sam_file_input alignment_long_reads_file{args.alignment_long_reads_file_path, my_fields{}};

std::deque<std::string> const ref_ids = read_header_information(alignment_long_reads_file, references_lengths);
Expand Down

0 comments on commit 487cdd8

Please sign in to comment.