From 487cdd8e807aecc314591f7b8639acfe4ca4c201 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Winkler?= Date: Thu, 23 Sep 2021 12:25:11 +0200 Subject: [PATCH] compute raw activity profile --- include/iGenVar.hpp | 1 + .../variant_detection/snp_indel_detection.hpp | 9 ++ src/CMakeLists.txt | 1 + src/iGenVar.cpp | 35 +++++-- src/variant_detection/snp_indel_detection.cpp | 94 +++++++++++++++++++ src/variant_detection/variant_detection.cpp | 4 - 6 files changed, 131 insertions(+), 13 deletions(-) create mode 100644 include/variant_detection/snp_indel_detection.hpp create mode 100644 src/variant_detection/snp_indel_detection.cpp diff --git a/include/iGenVar.hpp b/include/iGenVar.hpp index c5d1352d..e1420726 100644 --- a/include/iGenVar.hpp +++ b/include/iGenVar.hpp @@ -11,6 +11,7 @@ struct cmd_arguments // Input: /* -i */ std::filesystem::path alignment_short_reads_file_path{""}; /* -j */ std::filesystem::path alignment_long_reads_file_path{""}; + /* -g */ std::filesystem::path genome_file_path{""}; // Output: /* -o */ std::filesystem::path output_file_path{}; /* -s */ std::string vcf_sample_name{"MYSAMPLE"}; diff --git a/include/variant_detection/snp_indel_detection.hpp b/include/variant_detection/snp_indel_detection.hpp new file mode 100644 index 00000000..c1ddbd10 --- /dev/null +++ b/include/variant_detection/snp_indel_detection.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include + +/*! + * \brief Detect Single Nucleotide Polymorphisms (SNPs) and short deletions and insertions. + * \param reads_filename The file path where to find the sequenced reads. + */ +void detect_snp_and_indel(std::filesystem::path const & reads_filename); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8701a396..be351ffd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,6 +11,7 @@ add_library ("${PROJECT_NAME}_lib" STATIC modules/clustering/hierarchical_cluste structures/cluster.cpp structures/junction.cpp variant_detection/method_enums.cpp + variant_detection/snp_indel_detection.cpp variant_detection/variant_detection.cpp variant_detection/variant_output.cpp) diff --git a/src/iGenVar.cpp b/src/iGenVar.cpp index a434f624..017d1056 100644 --- a/src/iGenVar.cpp +++ b/src/iGenVar.cpp @@ -8,6 +8,7 @@ #include "modules/clustering/hierarchical_clustering_method.hpp" // for the hierarchical clustering method #include "modules/clustering/simple_clustering_method.hpp" // for the simple clustering method #include "structures/cluster.hpp" // for class Cluster +#include "variant_detection/snp_indel_detection.hpp" // for detect_snp_and_indel #include "variant_detection/variant_detection.hpp" // for detect_junctions_in_long_reads_sam_file() #include "variant_detection/variant_output.hpp" // for find_and_output_variants() @@ -35,6 +36,11 @@ void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments "Input long read alignments in SAM or BAM format (PacBio, Oxford Nanopore, ...).", seqan3::option_spec::standard, seqan3::input_file_validator{{"sam", "bam"}} ); + parser.add_option(args.genome_file_path, + 'g', "input_genome", + "Input the reference genome in FASTA or FASTQ format.", + seqan3::option_spec::standard, + seqan3::input_file_validator{{"fasta", "fa", "fastq"}} ); parser.add_option(args.output_file_path, 'o', "output", "The path of the vcf output file. If no path is given, will output to standard output.", seqan3::option_spec::standard, @@ -124,18 +130,26 @@ void detect_variants_in_alignment_file(cmd_arguments const & args) std::vector junctions{}; std::map references_lengths{}; - // short reads - if (args.alignment_short_reads_file_path != "") + if (args.genome_file_path.empty()) { - seqan3::debug_stream << "Detect junctions in short reads...\n"; - detect_junctions_in_short_reads_sam_file(junctions, references_lengths, args); - } + // short reads + if (!args.alignment_short_reads_file_path.empty()) + { + seqan3::debug_stream << "Detect junctions in short reads...\n"; + detect_junctions_in_short_reads_sam_file(junctions, references_lengths, args); + } - // long reads - if (args.alignment_long_reads_file_path != "") + // long reads + if (!args.alignment_long_reads_file_path.empty()) + { + seqan3::debug_stream << "Detect junctions in long reads...\n"; + detect_junctions_in_long_reads_sam_file(junctions, references_lengths, args); + } + } + else { - seqan3::debug_stream << "Detect junctions in long reads...\n"; - detect_junctions_in_long_reads_sam_file(junctions, references_lengths, args); + seqan3::debug_stream << "Detect SNPs, insertions and deletions in short reads...\n"; + detect_snp_and_indel(args.alignment_long_reads_file_path); } std::sort(junctions.begin(), junctions.end()); @@ -233,6 +247,9 @@ int main(int argc, char ** argv) return -1; } + // Set the number of decompression threads + seqan3::contrib::bgzf_thread_count = args.threads; + // Check that method selection contains no duplicates. std::vector unique_methods{args.methods}; std::ranges::sort(unique_methods); diff --git a/src/variant_detection/snp_indel_detection.cpp b/src/variant_detection/snp_indel_detection.cpp new file mode 100644 index 00000000..3d7ece99 --- /dev/null +++ b/src/variant_detection/snp_indel_detection.cpp @@ -0,0 +1,94 @@ +#include + +#include +#include +#include + +#include "iGenVar.hpp" +#include "variant_detection/bam_functions.hpp" // check the sam flags +#include "variant_detection/snp_indel_detection.hpp" // detect_snp_and_indel + +void detect_snp_and_indel(std::filesystem::path const & reads_filename) +{ + // Get the header information and set the necessary fields. + using sam_fields = seqan3::fields; // 6: CIGAR + + seqan3::sam_file_input reads_file{reads_filename, sam_fields{}}; + auto const & header = reads_file.header(); + + // Check that the file is sorted. + if (header.sorting != "coordinate") + throw seqan3::format_error{"ERROR: Input file must be sorted by coordinate (e.g. samtools sort)"}; + + // Prepare one activity vector per reference sequence. + std::vector> activity{}; + activity.resize(header.ref_id_info.size()); // allocate the number of reference sequences + for (size_t idx = 0; idx < activity.size(); ++idx) + activity[idx].resize(std::get<0>(header.ref_id_info[idx])); // allocate the length of the reference sequence + + for (auto && record : reads_file) + { + seqan3::sam_flag const flag = record.flag(); // 2: FLAG + int32_t const ref_id = record.reference_id().value_or(-1); // 3: RNAME + int32_t ref_pos = record.reference_position().value_or(-1); // 4: POS + + // Skip reads with certain properties. + if (hasFlagUnmapped(flag) || + hasFlagSecondary(flag) || + hasFlagDuplicate(flag) || + record.mapping_quality() < 20 || + ref_id < 0 || + ref_pos < 0) + { + continue; + } + + // Track the current position within the read. + unsigned read_pos = 0; + + // Step through the CIGAR string. + for (auto && [length, operation] : record.cigar_sequence()) + { + // Skip long variants. + if (length >= 30) // TODO: use min_length parameter + { + read_pos += length; + ref_pos += length; + continue; + } + + // Case distinction for cigar elements. + seqan3::cigar::operation const op = operation; + switch (op.to_char()) + { + case 'I': + case 'S': // insertion or soft clip + { + activity[ref_id][ref_pos] += length; + read_pos += length; + break; + } + case 'D': // deletion + { + for (unsigned idx = 0; idx < length; ++idx) + ++activity[ref_id][ref_pos + idx]; + ref_pos += length; + break; + } + default: // match or mismatch + { // TODO: ATTENTION, mismatches are currently ignored and should still be verified with the reference + ref_pos += length; + read_pos += length; + } + } + } + } + + // Print the raw activity profiles. + for (auto const & av : activity) + seqan3::debug_stream << av << std::endl; +} diff --git a/src/variant_detection/variant_detection.cpp b/src/variant_detection/variant_detection.cpp index 74dd9986..f46c52e5 100644 --- a/src/variant_detection/variant_detection.cpp +++ b/src/variant_detection/variant_detection.cpp @@ -53,8 +53,6 @@ void detect_junctions_in_short_reads_sam_file(std::vector & junctions, seqan3::field::ref_offset, // 4: POS seqan3::field::mapq>; // 5: MAPQ - // Set number of decompression threads - seqan3::contrib::bgzf_thread_count = args.threads; seqan3::sam_file_input alignment_short_reads_file{args.alignment_short_reads_file_path, my_fields{}}; std::deque const ref_ids = read_header_information(alignment_short_reads_file, references_lengths); @@ -115,8 +113,6 @@ void detect_junctions_in_long_reads_sam_file(std::vector & junctions, seqan3::field::seq, // 10:SEQ seqan3::field::tags>; - // Set number of decompression threads - seqan3::contrib::bgzf_thread_count = args.threads; seqan3::sam_file_input alignment_long_reads_file{args.alignment_long_reads_file_path, my_fields{}}; std::deque const ref_ids = read_header_information(alignment_long_reads_file, references_lengths);