Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compute raw activity profile #162

Merged
merged 4 commits into from
Oct 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/iGenVar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ struct cmd_arguments
// Input:
/* -i */ std::filesystem::path alignment_short_reads_file_path{""};
/* -j */ std::filesystem::path alignment_long_reads_file_path{""};
/* -g */ std::filesystem::path genome_file_path{""};
// Output:
/* -o */ std::filesystem::path output_file_path{};
/* -s */ std::string vcf_sample_name{"MYSAMPLE"};
Expand Down
9 changes: 9 additions & 0 deletions include/variant_detection/snp_indel_detection.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#pragma once

#include <seqan3/std/filesystem>

/*!
* \brief Detect Single Nucleotide Polymorphisms (SNPs) and short deletions and insertions.
* \param[in] reads_filename - The file path where to find the sequenced reads.
*/
void detect_snp_and_indel(std::filesystem::path const & reads_filename);
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ add_library ("${PROJECT_NAME}_lib" STATIC modules/clustering/hierarchical_cluste
structures/cluster.cpp
structures/junction.cpp
variant_detection/method_enums.cpp
variant_detection/snp_indel_detection.cpp
variant_detection/variant_detection.cpp
variant_detection/variant_output.cpp)

Expand Down
37 changes: 35 additions & 2 deletions src/iGenVar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,15 @@

#include <seqan3/contrib/stream/bgzf_stream_util.hpp> // for bgzf_thread_count
#include <seqan3/core/debug_stream.hpp> // for seqan3::debug_stream
#include <seqan3/io/sequence_file/format_embl.hpp> // for embl sequence file extensions
#include <seqan3/io/sequence_file/format_fasta.hpp> // for fasta sequence file extensions
#include <seqan3/io/sequence_file/format_fastq.hpp> // for fastq sequence file extensions
#include <seqan3/io/sequence_file/format_genbank.hpp> // for genbank sequence file extensions

#include "modules/clustering/hierarchical_clustering_method.hpp" // for the hierarchical clustering method
#include "modules/clustering/simple_clustering_method.hpp" // for the simple clustering method
#include "structures/cluster.hpp" // for class Cluster
#include "variant_detection/snp_indel_detection.hpp" // for detect_snp_and_indel
#include "variant_detection/variant_detection.hpp" // for detect_junctions_in_long_reads_sam_file()
#include "variant_detection/variant_output.hpp" // for find_and_output_variants()

Expand All @@ -24,6 +29,18 @@ void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments
parser.info.short_copyright = "short_copyright";
parser.info.url = "https://github.com/seqan/iGenVar/";

// merge the vectors of all sequence file extensions
std::vector<std::string> seq_file_extensions = seqan3::format_fasta::file_extensions;
seq_file_extensions.insert(seq_file_extensions.end(),
seqan3::format_fastq::file_extensions.begin(),
seqan3::format_fastq::file_extensions.end());
seq_file_extensions.insert(seq_file_extensions.end(),
seqan3::format_embl::file_extensions.begin(),
seqan3::format_embl::file_extensions.end());
seq_file_extensions.insert(seq_file_extensions.end(),
seqan3::format_genbank::file_extensions.begin(),
seqan3::format_genbank::file_extensions.end());

// Options - Input / Output:
parser.add_option(args.alignment_short_reads_file_path,
'i', "input_short_reads",
Expand All @@ -35,6 +52,11 @@ void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments
"Input long read alignments in SAM or BAM format (PacBio, Oxford Nanopore, ...).",
seqan3::option_spec::standard,
seqan3::input_file_validator{{"sam", "bam"}} );
parser.add_option(args.genome_file_path,
'g', "input_genome",
"Input the reference genome in FASTA or FASTQ format.",
seqan3::option_spec::standard,
seqan3::input_file_validator{seq_file_extensions} );
parser.add_option(args.output_file_path, 'o', "output",
"The path of the vcf output file. If no path is given, will output to standard output.",
seqan3::option_spec::standard,
Expand Down Expand Up @@ -125,19 +147,27 @@ void detect_variants_in_alignment_file(cmd_arguments const & args)
std::map<std::string, int32_t> references_lengths{};

// short reads
if (args.alignment_short_reads_file_path != "")
// TODO (joergi-w 30.09.2021) Control the selection with the 'method' parameter, not the availability of a genome.
if (!args.alignment_short_reads_file_path.empty() && args.genome_file_path.empty())
{
seqan3::debug_stream << "Detect junctions in short reads...\n";
detect_junctions_in_short_reads_sam_file(junctions, references_lengths, args);
}

// long reads
if (args.alignment_long_reads_file_path != "")
if (!args.alignment_long_reads_file_path.empty())
{
seqan3::debug_stream << "Detect junctions in long reads...\n";
detect_junctions_in_long_reads_sam_file(junctions, references_lengths, args);
}

// SNPs and indels for short reads; genome must be given
if (!args.alignment_short_reads_file_path.empty() && !args.genome_file_path.empty())
{
seqan3::debug_stream << "Detect SNPs and indels in short reads...\n";
detect_snp_and_indel(args.alignment_short_reads_file_path);
}

std::sort(junctions.begin(), junctions.end());

if (args.junctions_file_path != "")
Expand Down Expand Up @@ -233,6 +263,9 @@ int main(int argc, char ** argv)
return -1;
}

// Set the number of decompression threads
seqan3::contrib::bgzf_thread_count = args.threads;

// Check that method selection contains no duplicates.
std::vector<detection_methods> unique_methods{args.methods};
std::ranges::sort(unique_methods);
Expand Down
94 changes: 94 additions & 0 deletions src/variant_detection/snp_indel_detection.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#include <iostream>

#include <seqan3/alphabet/cigar/cigar.hpp>
#include <seqan3/core/debug_stream.hpp>
#include <seqan3/io/sam_file/input.hpp>

#include "iGenVar.hpp"
#include "variant_detection/bam_functions.hpp" // check the sam flags
#include "variant_detection/snp_indel_detection.hpp" // detect_snp_and_indel

void detect_snp_and_indel(std::filesystem::path const & reads_filename)
{
// Get the header information and set the necessary fields.
using sam_fields = seqan3::fields<seqan3::field::flag, // 2: FLAG
seqan3::field::ref_id, // 3: RNAME
seqan3::field::ref_offset, // 4: POS
seqan3::field::mapq, // 5: MAPQ
seqan3::field::cigar>; // 6: CIGAR

seqan3::sam_file_input reads_file{reads_filename, sam_fields{}};
auto const & header = reads_file.header();

// Check that the file is sorted.
if (header.sorting != "coordinate")
throw seqan3::format_error{"ERROR: Input file must be sorted by coordinate (e.g. samtools sort)"};

// Prepare one activity vector per reference sequence.
Irallia marked this conversation as resolved.
Show resolved Hide resolved
std::vector<std::vector<unsigned>> activity{};
activity.resize(header.ref_id_info.size()); // allocate the number of reference sequences
for (size_t idx = 0; idx < activity.size(); ++idx)
activity[idx].resize(std::get<0>(header.ref_id_info[idx])); // allocate the length of the reference sequence

for (auto && record : reads_file)
{
seqan3::sam_flag const flag = record.flag(); // 2: FLAG
int32_t const ref_id = record.reference_id().value_or(-1); // 3: RNAME
int32_t ref_pos = record.reference_position().value_or(-1); // 4: POS

// Skip reads with certain properties.
if (hasFlagUnmapped(flag) ||
hasFlagSecondary(flag) ||
hasFlagDuplicate(flag) ||
record.mapping_quality() < 20 ||
ref_id < 0 ||
ref_pos < 0)
{
continue;
}

// Track the current position within the read.
unsigned read_pos = 0;

// Step through the CIGAR string.
for (auto && [length, operation] : record.cigar_sequence())
{
// Skip long variants.
if (length >= 30) // TODO (joergi-w 30.09.2021) Use the min_length parameter here.
{
read_pos += length;
ref_pos += length;
continue;
}

// Case distinction for cigar elements.
seqan3::cigar::operation const op = operation;
switch (op.to_char())
{
case 'I':
case 'S': // insertion or soft clip
{
activity[ref_id][ref_pos] += length;
read_pos += length;
break;
}
case 'D': // deletion
{
for (unsigned idx = 0; idx < length; ++idx)
++activity[ref_id][ref_pos + idx];
ref_pos += length;
break;
}
default: // ignore match, mismatch, hard clipping, padding, and skipped region
{ // TODO (joergi-w 30.09.2021) mismatches are ignored but should be verified with the reference
ref_pos += length;
read_pos += length;
}
}
}
}

// Print the raw activity profiles.
for (auto const & av : activity)
seqan3::debug_stream << av << std::endl;
}
4 changes: 0 additions & 4 deletions src/variant_detection/variant_detection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@ void detect_junctions_in_short_reads_sam_file([[maybe_unused]] std::vector<Junct
seqan3::field::ref_offset, // 4: POS
seqan3::field::mapq>; // 5: MAPQ

// Set number of decompression threads
seqan3::contrib::bgzf_thread_count = args.threads;
seqan3::sam_file_input alignment_short_reads_file{args.alignment_short_reads_file_path, my_fields{}};

std::deque<std::string> const ref_ids = read_header_information(alignment_short_reads_file, references_lengths);
Expand Down Expand Up @@ -115,8 +113,6 @@ void detect_junctions_in_long_reads_sam_file(std::vector<Junction> & junctions,
seqan3::field::seq, // 10:SEQ
seqan3::field::tags>;

// Set number of decompression threads
seqan3::contrib::bgzf_thread_count = args.threads;
seqan3::sam_file_input alignment_long_reads_file{args.alignment_long_reads_file_path, my_fields{}};

std::deque<std::string> const ref_ids = read_header_information(alignment_long_reads_file, references_lengths);
Expand Down
1 change: 1 addition & 0 deletions test/api/detection_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,7 @@ TEST(junction_detection, analyze_sa_tag)
// Args
cmd_arguments args{std::filesystem::path{}, // alignment_short_reads_file_path
std::filesystem::path{}, // alignment_long_reads_file_path
std::filesystem::path{}, // genome_file_path
std::filesystem::path{}, // output_file_path
"MYSAMPLE", // vcf_sample_name
std::filesystem::path{}, // junctions_file_path
Expand Down
4 changes: 4 additions & 0 deletions test/api/input_file_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ TEST(input_file, detect_junctions_in_short_read_sam_file)

cmd_arguments args{default_alignment_short_reads_file_path,
"",
empty_path, // empty genome path,
empty_path, // empty output path,
default_vcf_sample_name,
empty_path, // empty junctions path,
Expand Down Expand Up @@ -100,6 +101,7 @@ TEST(input_file, detect_junctions_in_long_reads_sam_file)

cmd_arguments args{"",
default_alignment_long_reads_file_path,
empty_path, // empty genome path,
empty_path, // empty output path,
default_vcf_sample_name,
empty_path, // empty junctions path,
Expand Down Expand Up @@ -216,6 +218,7 @@ TEST(input_file, long_read_sam_file_unsorted)

cmd_arguments args{"",
unsorted_sam_path,
empty_path, // empty genome path,
empty_path, // empty output path,
default_vcf_sample_name,
empty_path, // empty junctions path,
Expand Down Expand Up @@ -295,6 +298,7 @@ TEST(input_file, short_and_long_read_sam_file_with_different_references_lengths)

cmd_arguments args{short_sam_path,
long_sam_path,
empty_path, // empty genome path
empty_path, // empty output path
default_vcf_sample_name,
empty_path, // empty junctions path
Expand Down
1 change: 1 addition & 0 deletions test/cli/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ cmake_minimum_required (VERSION 3.11)
add_cli_test(iGenVar_cli_test.cpp)
add_dependencies (iGenVar_cli_test "iGenVar")
target_use_datasources (iGenVar_cli_test FILES simulated.minimap2.hg19.coordsorted_cutoff.sam)
target_use_datasources (iGenVar_cli_test FILES mini_example_reference.fasta)
Irallia marked this conversation as resolved.
Show resolved Hide resolved
target_use_datasources (iGenVar_cli_test FILES paired_end_mini_example.sam)
target_use_datasources (iGenVar_cli_test FILES single_end_mini_example.sam)
target_use_datasources (iGenVar_cli_test FILES output_res.txt)
Expand Down
48 changes: 35 additions & 13 deletions test/cli/iGenVar_cli_test.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
#include <seqan3/alphabet/nucleotide/dna5.hpp>
#include <seqan3/io/sequence_file/input.hpp>
#include <seqan3/core/debug_stream.hpp>

#include "cli_test.hpp"
#include <fstream>
#include <sstream>

#include "cli_test.hpp"

std::string const default_alignment_long_reads_file_path = "simulated.minimap2.hg19.coordsorted_cutoff.sam";
std::string const default_genome_file_path = "mini_example_reference.fasta";
std::string const vcf_out_file_path = "variants_file_out.vcf";
std::string const junctions_out_file_path = "junctions_file_out.txt";
std::string const clusters_out_file_path = "clusters_file_out.txt";
Expand Down Expand Up @@ -39,6 +37,11 @@ std::string const help_page_part_1
" Input long read alignments in SAM or BAM format (PacBio, Oxford\n"
" Nanopore, ...). Default: \"\". The input file must exist and read\n"
" permissions must be granted. Valid file extensions are: [sam, bam].\n"
" -g, --input_genome (std::filesystem::path)\n"
" Input the reference genome in FASTA or FASTQ format. Default: \"\".\n"
" The input file must exist and read permissions must be granted.\n"
" Valid file extensions are: [fasta, fa, fna, ffn, faa, frn, fas,\n"
" fastq, fq, embl, genbank, gb, gbk].\n"
" -o, --output (std::filesystem::path)\n"
" The path of the vcf output file. If no path is given, will output to\n"
" standard output. Default: \"\". Write permissions must be granted.\n"
Expand Down Expand Up @@ -293,6 +296,33 @@ TEST_F(iGenVar_cli_test, test_intermediate_result_output)
EXPECT_NE(buffer2.str(), "");
}

TEST_F(iGenVar_cli_test, test_genome_input)
{
cli_test_result result = execute_app("iGenVar",
"-g", data(default_genome_file_path),
"-i", data("single_end_mini_example.sam"));
std::string const expected_err = "Detect SNPs and indels in short reads...\n"
"[0,0,0,0,0,0,0,0,0,0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,"
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,9,9,9,9,9,9,9,9,9,9,9,9,1,0,0,0,0,0,0,0,"
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"
"0,0,0,0,0,0,0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,33,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,"
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,2,2,2,2,2,2,2,2,"
"2,2,2,2,2,2,2,67,3,3,3,67,1,1,1,1,1,1,1,1,1,1,1,1,43,0,0,0,0,0,0,0,0,0,0,0,0,0,"
"0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4,4,4,4,4,4,4,4,4,4,4,4,4,0,0,0,"
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,"
"1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,52,"
"0,0,0,0,0,0,0,0,0,0]\n"
"Start clustering...\n"
"Done with clustering. Found 0 junction clusters.\n"
"No refinement was selected.\n";
Comment on lines +318 to +320
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question for the next meeting: Do we have our own clustering method here or do we use the existing one? Do we need the refinement for SNPs at all?

EXPECT_EQ(result.exit_code, 0);
EXPECT_EQ(result.out, std::string{"##fileformat=VCFv4.3\n##source=iGenVarCaller\n"} + general_header_lines);
EXPECT_EQ(result.err, expected_err);
}

// SV specifications:

TEST_F(iGenVar_cli_test, fail_negative_min_var_length)
Expand Down Expand Up @@ -612,12 +642,9 @@ TEST_F(iGenVar_cli_test, dataset_paired_end_mini_example)
"--method read_pairs");

// Check the output of junctions:
seqan3::debug_stream << "Check the output of junctions... " << '\n';
EXPECT_EQ(result.out, expected_res_empty + general_header_lines);
seqan3::debug_stream << "done. " << '\n';

// Check the debug output of junctions:
seqan3::debug_stream << "Check the debug output of junctions... " << '\n';
std::string expected_err
{
"Detect junctions in short reads...\n"
Expand Down Expand Up @@ -662,7 +689,6 @@ TEST_F(iGenVar_cli_test, dataset_paired_end_mini_example)
"No refinement was selected.\n"
};
EXPECT_EQ(result.err, expected_err);
seqan3::debug_stream << "done. " << '\n';
}

TEST_F(iGenVar_cli_test, dataset_single_end_mini_example)
Expand All @@ -675,18 +701,14 @@ TEST_F(iGenVar_cli_test, dataset_single_end_mini_example)
"--hierarchical_clustering_cutoff 0.1");

// Check the output of junctions:
seqan3::debug_stream << "Check the output of junctions... " << '\n';
std::ifstream output_res_file("../../data/output_res.txt");
std::string output_res_str((std::istreambuf_iterator<char>(output_res_file)),
std::istreambuf_iterator<char>());
EXPECT_EQ(result.out, output_res_str);
seqan3::debug_stream << "done. " << '\n';

// Check the debug output of junctions:
seqan3::debug_stream << "Check the debug output of junctions... " << '\n';
std::ifstream output_err_file("../../data/output_err.txt");
std::string output_err_str((std::istreambuf_iterator<char>(output_err_file)),
std::istreambuf_iterator<char>());
EXPECT_EQ(result.err, output_err_str);
seqan3::debug_stream << "done. " << '\n';
}
Loading