From ff4ecdc43fcd0933264ee77d36ba62e6378cae53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Winkler?= Date: Mon, 27 Sep 2021 16:58:25 +0200 Subject: [PATCH 1/4] [FEATURE] add new option for reading a genome --- include/iGenVar.hpp | 1 + src/iGenVar.cpp | 30 +++++++++++++++++++++--------- test/api/detection_test.cpp | 1 + test/api/input_file_test.cpp | 4 ++++ test/cli/iGenVar_cli_test.cpp | 4 ++++ 5 files changed, 31 insertions(+), 9 deletions(-) diff --git a/include/iGenVar.hpp b/include/iGenVar.hpp index 8dd64874..07cb5bff 100644 --- a/include/iGenVar.hpp +++ b/include/iGenVar.hpp @@ -11,6 +11,7 @@ struct cmd_arguments // Input: /* -i */ std::filesystem::path alignment_short_reads_file_path{""}; /* -j */ std::filesystem::path alignment_long_reads_file_path{""}; + /* -g */ std::filesystem::path genome_file_path{""}; // Output: /* -o */ std::filesystem::path output_file_path{}; /* -s */ std::string vcf_sample_name{"MYSAMPLE"}; diff --git a/src/iGenVar.cpp b/src/iGenVar.cpp index 3a3d537c..bdec0ffd 100644 --- a/src/iGenVar.cpp +++ b/src/iGenVar.cpp @@ -35,6 +35,11 @@ void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments "Input long read alignments in SAM or BAM format (PacBio, Oxford Nanopore, ...).", seqan3::option_spec::standard, seqan3::input_file_validator{{"sam", "bam"}} ); + parser.add_option(args.genome_file_path, + 'g', "input_genome", + "Input the reference genome in FASTA or FASTQ format.", + seqan3::option_spec::standard, + seqan3::input_file_validator{{"fasta", "fa", "fastq"}} ); parser.add_option(args.output_file_path, 'o', "output", "The path of the vcf output file. If no path is given, will output to standard output.", seqan3::option_spec::standard, @@ -124,18 +129,25 @@ void detect_variants_in_alignment_file(cmd_arguments const & args) std::vector junctions{}; std::map references_lengths{}; - // short reads - if (args.alignment_short_reads_file_path != "") + if (args.genome_file_path.empty()) { - seqan3::debug_stream << "Detect junctions in short reads...\n"; - detect_junctions_in_short_reads_sam_file(junctions, references_lengths, args); - } + // short reads + if (!args.alignment_short_reads_file_path.empty()) + { + seqan3::debug_stream << "Detect junctions in short reads...\n"; + detect_junctions_in_short_reads_sam_file(junctions, references_lengths, args); + } - // long reads - if (args.alignment_long_reads_file_path != "") + // long reads + if (!args.alignment_long_reads_file_path.empty()) + { + seqan3::debug_stream << "Detect junctions in long reads...\n"; + detect_junctions_in_long_reads_sam_file(junctions, references_lengths, args); + } + } + else { - seqan3::debug_stream << "Detect junctions in long reads...\n"; - detect_junctions_in_long_reads_sam_file(junctions, references_lengths, args); + seqan3::debug_stream << "Detect SNPs, insertions and deletions in short reads...\n"; } std::sort(junctions.begin(), junctions.end()); diff --git a/test/api/detection_test.cpp b/test/api/detection_test.cpp index de1dd33d..114f49b8 100644 --- a/test/api/detection_test.cpp +++ b/test/api/detection_test.cpp @@ -408,6 +408,7 @@ TEST(junction_detection, analyze_sa_tag) // Args cmd_arguments args{std::filesystem::path{}, // alignment_short_reads_file_path std::filesystem::path{}, // alignment_long_reads_file_path + std::filesystem::path{}, // genome_file_path std::filesystem::path{}, // output_file_path "MYSAMPLE", // vcf_sample_name std::filesystem::path{}, // junctions_file_path diff --git a/test/api/input_file_test.cpp b/test/api/input_file_test.cpp index 829a3c11..b1e75c69 100644 --- a/test/api/input_file_test.cpp +++ b/test/api/input_file_test.cpp @@ -58,6 +58,7 @@ TEST(input_file, detect_junctions_in_short_read_sam_file) cmd_arguments args{default_alignment_short_reads_file_path, "", + empty_path, // empty genome path, empty_path, // empty output path, default_vcf_sample_name, empty_path, // empty junctions path, @@ -100,6 +101,7 @@ TEST(input_file, detect_junctions_in_long_reads_sam_file) cmd_arguments args{"", default_alignment_long_reads_file_path, + empty_path, // empty genome path, empty_path, // empty output path, default_vcf_sample_name, empty_path, // empty junctions path, @@ -216,6 +218,7 @@ TEST(input_file, long_read_sam_file_unsorted) cmd_arguments args{"", unsorted_sam_path, + empty_path, // empty genome path, empty_path, // empty output path, default_vcf_sample_name, empty_path, // empty junctions path, @@ -295,6 +298,7 @@ TEST(input_file, short_and_long_read_sam_file_with_different_references_lengths) cmd_arguments args{short_sam_path, long_sam_path, + empty_path, // empty genome path empty_path, // empty output path default_vcf_sample_name, empty_path, // empty junctions path diff --git a/test/cli/iGenVar_cli_test.cpp b/test/cli/iGenVar_cli_test.cpp index f87e65eb..48cba037 100644 --- a/test/cli/iGenVar_cli_test.cpp +++ b/test/cli/iGenVar_cli_test.cpp @@ -39,6 +39,10 @@ std::string const help_page_part_1 " Input long read alignments in SAM or BAM format (PacBio, Oxford\n" " Nanopore, ...). Default: \"\". The input file must exist and read\n" " permissions must be granted. Valid file extensions are: [sam, bam].\n" + " -g, --input_genome (std::filesystem::path)\n" + " Input the reference genome in FASTA or FASTQ format. Default: \"\".\n" + " The input file must exist and read permissions must be granted.\n" + " Valid file extensions are: [fasta, fa, fastq].\n" " -o, --output (std::filesystem::path)\n" " The path of the vcf output file. If no path is given, will output to\n" " standard output. Default: \"\". Write permissions must be granted.\n" From dc2bb92ff1d46b6515c777032a860b5db20d40c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Winkler?= Date: Thu, 23 Sep 2021 12:25:11 +0200 Subject: [PATCH 2/4] compute raw activity profile --- .../variant_detection/snp_indel_detection.hpp | 9 ++ src/CMakeLists.txt | 1 + src/iGenVar.cpp | 5 + src/variant_detection/snp_indel_detection.cpp | 94 +++++++++++++++++++ src/variant_detection/variant_detection.cpp | 4 - 5 files changed, 109 insertions(+), 4 deletions(-) create mode 100644 include/variant_detection/snp_indel_detection.hpp create mode 100644 src/variant_detection/snp_indel_detection.cpp diff --git a/include/variant_detection/snp_indel_detection.hpp b/include/variant_detection/snp_indel_detection.hpp new file mode 100644 index 00000000..c1ddbd10 --- /dev/null +++ b/include/variant_detection/snp_indel_detection.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include + +/*! + * \brief Detect Single Nucleotide Polymorphisms (SNPs) and short deletions and insertions. + * \param reads_filename The file path where to find the sequenced reads. + */ +void detect_snp_and_indel(std::filesystem::path const & reads_filename); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8701a396..be351ffd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,6 +11,7 @@ add_library ("${PROJECT_NAME}_lib" STATIC modules/clustering/hierarchical_cluste structures/cluster.cpp structures/junction.cpp variant_detection/method_enums.cpp + variant_detection/snp_indel_detection.cpp variant_detection/variant_detection.cpp variant_detection/variant_output.cpp) diff --git a/src/iGenVar.cpp b/src/iGenVar.cpp index bdec0ffd..d79a2d6b 100644 --- a/src/iGenVar.cpp +++ b/src/iGenVar.cpp @@ -8,6 +8,7 @@ #include "modules/clustering/hierarchical_clustering_method.hpp" // for the hierarchical clustering method #include "modules/clustering/simple_clustering_method.hpp" // for the simple clustering method #include "structures/cluster.hpp" // for class Cluster +#include "variant_detection/snp_indel_detection.hpp" // for detect_snp_and_indel #include "variant_detection/variant_detection.hpp" // for detect_junctions_in_long_reads_sam_file() #include "variant_detection/variant_output.hpp" // for find_and_output_variants() @@ -148,6 +149,7 @@ void detect_variants_in_alignment_file(cmd_arguments const & args) else { seqan3::debug_stream << "Detect SNPs, insertions and deletions in short reads...\n"; + detect_snp_and_indel(args.alignment_long_reads_file_path); } std::sort(junctions.begin(), junctions.end()); @@ -245,6 +247,9 @@ int main(int argc, char ** argv) return -1; } + // Set the number of decompression threads + seqan3::contrib::bgzf_thread_count = args.threads; + // Check that method selection contains no duplicates. std::vector unique_methods{args.methods}; std::ranges::sort(unique_methods); diff --git a/src/variant_detection/snp_indel_detection.cpp b/src/variant_detection/snp_indel_detection.cpp new file mode 100644 index 00000000..3d7ece99 --- /dev/null +++ b/src/variant_detection/snp_indel_detection.cpp @@ -0,0 +1,94 @@ +#include + +#include +#include +#include + +#include "iGenVar.hpp" +#include "variant_detection/bam_functions.hpp" // check the sam flags +#include "variant_detection/snp_indel_detection.hpp" // detect_snp_and_indel + +void detect_snp_and_indel(std::filesystem::path const & reads_filename) +{ + // Get the header information and set the necessary fields. + using sam_fields = seqan3::fields; // 6: CIGAR + + seqan3::sam_file_input reads_file{reads_filename, sam_fields{}}; + auto const & header = reads_file.header(); + + // Check that the file is sorted. + if (header.sorting != "coordinate") + throw seqan3::format_error{"ERROR: Input file must be sorted by coordinate (e.g. samtools sort)"}; + + // Prepare one activity vector per reference sequence. + std::vector> activity{}; + activity.resize(header.ref_id_info.size()); // allocate the number of reference sequences + for (size_t idx = 0; idx < activity.size(); ++idx) + activity[idx].resize(std::get<0>(header.ref_id_info[idx])); // allocate the length of the reference sequence + + for (auto && record : reads_file) + { + seqan3::sam_flag const flag = record.flag(); // 2: FLAG + int32_t const ref_id = record.reference_id().value_or(-1); // 3: RNAME + int32_t ref_pos = record.reference_position().value_or(-1); // 4: POS + + // Skip reads with certain properties. + if (hasFlagUnmapped(flag) || + hasFlagSecondary(flag) || + hasFlagDuplicate(flag) || + record.mapping_quality() < 20 || + ref_id < 0 || + ref_pos < 0) + { + continue; + } + + // Track the current position within the read. + unsigned read_pos = 0; + + // Step through the CIGAR string. + for (auto && [length, operation] : record.cigar_sequence()) + { + // Skip long variants. + if (length >= 30) // TODO: use min_length parameter + { + read_pos += length; + ref_pos += length; + continue; + } + + // Case distinction for cigar elements. + seqan3::cigar::operation const op = operation; + switch (op.to_char()) + { + case 'I': + case 'S': // insertion or soft clip + { + activity[ref_id][ref_pos] += length; + read_pos += length; + break; + } + case 'D': // deletion + { + for (unsigned idx = 0; idx < length; ++idx) + ++activity[ref_id][ref_pos + idx]; + ref_pos += length; + break; + } + default: // match or mismatch + { // TODO: ATTENTION, mismatches are currently ignored and should still be verified with the reference + ref_pos += length; + read_pos += length; + } + } + } + } + + // Print the raw activity profiles. + for (auto const & av : activity) + seqan3::debug_stream << av << std::endl; +} diff --git a/src/variant_detection/variant_detection.cpp b/src/variant_detection/variant_detection.cpp index dceed565..e0942a00 100644 --- a/src/variant_detection/variant_detection.cpp +++ b/src/variant_detection/variant_detection.cpp @@ -53,8 +53,6 @@ void detect_junctions_in_short_reads_sam_file([[maybe_unused]] std::vector; // 5: MAPQ - // Set number of decompression threads - seqan3::contrib::bgzf_thread_count = args.threads; seqan3::sam_file_input alignment_short_reads_file{args.alignment_short_reads_file_path, my_fields{}}; std::deque const ref_ids = read_header_information(alignment_short_reads_file, references_lengths); @@ -115,8 +113,6 @@ void detect_junctions_in_long_reads_sam_file(std::vector & junctions, seqan3::field::seq, // 10:SEQ seqan3::field::tags>; - // Set number of decompression threads - seqan3::contrib::bgzf_thread_count = args.threads; seqan3::sam_file_input alignment_long_reads_file{args.alignment_long_reads_file_path, my_fields{}}; std::deque const ref_ids = read_header_information(alignment_long_reads_file, references_lengths); From 541e1f496682c4b4e8d2b8f3ce45b11a16efb7aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Winkler?= Date: Wed, 29 Sep 2021 11:14:14 +0200 Subject: [PATCH 3/4] [TEST] create CLI test for the genome file parameter --- test/cli/CMakeLists.txt | 1 + test/cli/iGenVar_cli_test.cpp | 43 ++++++++++++++++++++++++----------- test/data/datasources.cmake | 5 ++++ 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/test/cli/CMakeLists.txt b/test/cli/CMakeLists.txt index d4382c8e..6fa15475 100755 --- a/test/cli/CMakeLists.txt +++ b/test/cli/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required (VERSION 3.11) add_cli_test(iGenVar_cli_test.cpp) add_dependencies (iGenVar_cli_test "iGenVar") target_use_datasources (iGenVar_cli_test FILES simulated.minimap2.hg19.coordsorted_cutoff.sam) +target_use_datasources (iGenVar_cli_test FILES mini_example_reference.fasta) target_use_datasources (iGenVar_cli_test FILES paired_end_mini_example.sam) target_use_datasources (iGenVar_cli_test FILES single_end_mini_example.sam) target_use_datasources (iGenVar_cli_test FILES output_res.txt) diff --git a/test/cli/iGenVar_cli_test.cpp b/test/cli/iGenVar_cli_test.cpp index 48cba037..6b6b169a 100644 --- a/test/cli/iGenVar_cli_test.cpp +++ b/test/cli/iGenVar_cli_test.cpp @@ -1,12 +1,10 @@ -#include -#include -#include - -#include "cli_test.hpp" #include #include +#include "cli_test.hpp" + std::string const default_alignment_long_reads_file_path = "simulated.minimap2.hg19.coordsorted_cutoff.sam"; +std::string const default_genome_file_path = "mini_example_reference.fasta"; std::string const vcf_out_file_path = "variants_file_out.vcf"; std::string const junctions_out_file_path = "junctions_file_out.txt"; std::string const clusters_out_file_path = "clusters_file_out.txt"; @@ -297,6 +295,33 @@ TEST_F(iGenVar_cli_test, test_intermediate_result_output) EXPECT_NE(buffer2.str(), ""); } +TEST_F(iGenVar_cli_test, test_genome_input) +{ + cli_test_result result = execute_app("iGenVar", + "-g", data(default_genome_file_path), + "-j", data("single_end_mini_example.sam")); + std::string const expected_err = "Detect SNPs, insertions and deletions in short reads...\n" + "[0,0,0,0,0,0,0,0,0,0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0," + "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,9,9,9,9,9,9,9,9,9,9,9,9,1,0,0,0,0,0,0,0," + "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0," + "0,0,0,0,0,0,0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0," + "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,33,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0," + "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0," + "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,2,2,2,2,2,2,2,2," + "2,2,2,2,2,2,2,67,3,3,3,67,1,1,1,1,1,1,1,1,1,1,1,1,43,0,0,0,0,0,0,0,0,0,0,0,0,0," + "0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4,4,4,4,4,4,4,4,4,4,4,4,4,0,0,0," + "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1," + "1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0," + "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,52," + "0,0,0,0,0,0,0,0,0,0]\n" + "Start clustering...\n" + "Done with clustering. Found 0 junction clusters.\n" + "No refinement was selected.\n"; + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{"##fileformat=VCFv4.3\n##source=iGenVarCaller\n"} + general_header_lines); + EXPECT_EQ(result.err, expected_err); +} + // SV specifications: TEST_F(iGenVar_cli_test, fail_negative_min_var_length) @@ -616,12 +641,9 @@ TEST_F(iGenVar_cli_test, dataset_paired_end_mini_example) "--method read_pairs"); // Check the output of junctions: - seqan3::debug_stream << "Check the output of junctions... " << '\n'; EXPECT_EQ(result.out, expected_res_empty + general_header_lines); - seqan3::debug_stream << "done. " << '\n'; // Check the debug output of junctions: - seqan3::debug_stream << "Check the debug output of junctions... " << '\n'; std::string expected_err { "Detect junctions in short reads...\n" @@ -666,7 +688,6 @@ TEST_F(iGenVar_cli_test, dataset_paired_end_mini_example) "No refinement was selected.\n" }; EXPECT_EQ(result.err, expected_err); - seqan3::debug_stream << "done. " << '\n'; } TEST_F(iGenVar_cli_test, dataset_single_end_mini_example) @@ -679,18 +700,14 @@ TEST_F(iGenVar_cli_test, dataset_single_end_mini_example) "--hierarchical_clustering_cutoff 0.1"); // Check the output of junctions: - seqan3::debug_stream << "Check the output of junctions... " << '\n'; std::ifstream output_res_file("../../data/output_res.txt"); std::string output_res_str((std::istreambuf_iterator(output_res_file)), std::istreambuf_iterator()); EXPECT_EQ(result.out, output_res_str); - seqan3::debug_stream << "done. " << '\n'; // Check the debug output of junctions: - seqan3::debug_stream << "Check the debug output of junctions... " << '\n'; std::ifstream output_err_file("../../data/output_err.txt"); std::string output_err_str((std::istreambuf_iterator(output_err_file)), std::istreambuf_iterator()); EXPECT_EQ(result.err, output_err_str); - seqan3::debug_stream << "done. " << '\n'; } diff --git a/test/data/datasources.cmake b/test/data/datasources.cmake index a3a4f162..c62d3d71 100644 --- a/test/data/datasources.cmake +++ b/test/data/datasources.cmake @@ -11,6 +11,11 @@ declare_datasource (FILE simulated.minimap2.hg19.coordsorted_cutoff.sam URL ${CMAKE_SOURCE_DIR}/test/data/simulated.minimap2.hg19.coordsorted_cutoff.sam URL_HASH SHA256=e59b42c85ed309faf8b3d2f1a8e64a9ccd0a47becd1cb291144efd56be0aa4f9) +# creates the link /data/mini_example_reference.fasta +declare_datasource (FILE mini_example_reference.fasta + URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/mini_example_reference.fasta + URL_HASH SHA256=a45fed19a4ef58bcd1ac2d48e4a30d82dd74d7cadc64e468da2487946ce41afc) + # copies file to /data/paired_end_mini_example.sam declare_datasource (FILE paired_end_mini_example.sam URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/paired_end_mini_example.sam From b2587a5d3fab350c49b9bd29ee521f2ff6011715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Winkler?= Date: Thu, 30 Sep 2021 17:16:04 +0200 Subject: [PATCH 4/4] apply review suggestions --- .../variant_detection/snp_indel_detection.hpp | 2 +- src/iGenVar.cpp | 50 ++++++++++++------- src/variant_detection/snp_indel_detection.cpp | 6 +-- test/cli/iGenVar_cli_test.cpp | 7 +-- 4 files changed, 41 insertions(+), 24 deletions(-) diff --git a/include/variant_detection/snp_indel_detection.hpp b/include/variant_detection/snp_indel_detection.hpp index c1ddbd10..3fd9c702 100644 --- a/include/variant_detection/snp_indel_detection.hpp +++ b/include/variant_detection/snp_indel_detection.hpp @@ -4,6 +4,6 @@ /*! * \brief Detect Single Nucleotide Polymorphisms (SNPs) and short deletions and insertions. - * \param reads_filename The file path where to find the sequenced reads. + * \param[in] reads_filename - The file path where to find the sequenced reads. */ void detect_snp_and_indel(std::filesystem::path const & reads_filename); diff --git a/src/iGenVar.cpp b/src/iGenVar.cpp index d79a2d6b..0da74800 100644 --- a/src/iGenVar.cpp +++ b/src/iGenVar.cpp @@ -4,6 +4,10 @@ #include // for bgzf_thread_count #include // for seqan3::debug_stream +#include // for embl sequence file extensions +#include // for fasta sequence file extensions +#include // for fastq sequence file extensions +#include // for genbank sequence file extensions #include "modules/clustering/hierarchical_clustering_method.hpp" // for the hierarchical clustering method #include "modules/clustering/simple_clustering_method.hpp" // for the simple clustering method @@ -25,6 +29,18 @@ void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments parser.info.short_copyright = "short_copyright"; parser.info.url = "https://github.com/seqan/iGenVar/"; + // merge the vectors of all sequence file extensions + std::vector seq_file_extensions = seqan3::format_fasta::file_extensions; + seq_file_extensions.insert(seq_file_extensions.end(), + seqan3::format_fastq::file_extensions.begin(), + seqan3::format_fastq::file_extensions.end()); + seq_file_extensions.insert(seq_file_extensions.end(), + seqan3::format_embl::file_extensions.begin(), + seqan3::format_embl::file_extensions.end()); + seq_file_extensions.insert(seq_file_extensions.end(), + seqan3::format_genbank::file_extensions.begin(), + seqan3::format_genbank::file_extensions.end()); + // Options - Input / Output: parser.add_option(args.alignment_short_reads_file_path, 'i', "input_short_reads", @@ -40,7 +56,7 @@ void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments 'g', "input_genome", "Input the reference genome in FASTA or FASTQ format.", seqan3::option_spec::standard, - seqan3::input_file_validator{{"fasta", "fa", "fastq"}} ); + seqan3::input_file_validator{seq_file_extensions} ); parser.add_option(args.output_file_path, 'o', "output", "The path of the vcf output file. If no path is given, will output to standard output.", seqan3::option_spec::standard, @@ -130,26 +146,26 @@ void detect_variants_in_alignment_file(cmd_arguments const & args) std::vector junctions{}; std::map references_lengths{}; - if (args.genome_file_path.empty()) + // short reads + // TODO (joergi-w 30.09.2021) Control the selection with the 'method' parameter, not the availability of a genome. + if (!args.alignment_short_reads_file_path.empty() && args.genome_file_path.empty()) { - // short reads - if (!args.alignment_short_reads_file_path.empty()) - { - seqan3::debug_stream << "Detect junctions in short reads...\n"; - detect_junctions_in_short_reads_sam_file(junctions, references_lengths, args); - } + seqan3::debug_stream << "Detect junctions in short reads...\n"; + detect_junctions_in_short_reads_sam_file(junctions, references_lengths, args); + } - // long reads - if (!args.alignment_long_reads_file_path.empty()) - { - seqan3::debug_stream << "Detect junctions in long reads...\n"; - detect_junctions_in_long_reads_sam_file(junctions, references_lengths, args); - } + // long reads + if (!args.alignment_long_reads_file_path.empty()) + { + seqan3::debug_stream << "Detect junctions in long reads...\n"; + detect_junctions_in_long_reads_sam_file(junctions, references_lengths, args); } - else + + // SNPs and indels for short reads; genome must be given + if (!args.alignment_short_reads_file_path.empty() && !args.genome_file_path.empty()) { - seqan3::debug_stream << "Detect SNPs, insertions and deletions in short reads...\n"; - detect_snp_and_indel(args.alignment_long_reads_file_path); + seqan3::debug_stream << "Detect SNPs and indels in short reads...\n"; + detect_snp_and_indel(args.alignment_short_reads_file_path); } std::sort(junctions.begin(), junctions.end()); diff --git a/src/variant_detection/snp_indel_detection.cpp b/src/variant_detection/snp_indel_detection.cpp index 3d7ece99..e8e974ed 100644 --- a/src/variant_detection/snp_indel_detection.cpp +++ b/src/variant_detection/snp_indel_detection.cpp @@ -54,7 +54,7 @@ void detect_snp_and_indel(std::filesystem::path const & reads_filename) for (auto && [length, operation] : record.cigar_sequence()) { // Skip long variants. - if (length >= 30) // TODO: use min_length parameter + if (length >= 30) // TODO (joergi-w 30.09.2021) Use the min_length parameter here. { read_pos += length; ref_pos += length; @@ -79,8 +79,8 @@ void detect_snp_and_indel(std::filesystem::path const & reads_filename) ref_pos += length; break; } - default: // match or mismatch - { // TODO: ATTENTION, mismatches are currently ignored and should still be verified with the reference + default: // ignore match, mismatch, hard clipping, padding, and skipped region + { // TODO (joergi-w 30.09.2021) mismatches are ignored but should be verified with the reference ref_pos += length; read_pos += length; } diff --git a/test/cli/iGenVar_cli_test.cpp b/test/cli/iGenVar_cli_test.cpp index 6b6b169a..ad8fa675 100644 --- a/test/cli/iGenVar_cli_test.cpp +++ b/test/cli/iGenVar_cli_test.cpp @@ -40,7 +40,8 @@ std::string const help_page_part_1 " -g, --input_genome (std::filesystem::path)\n" " Input the reference genome in FASTA or FASTQ format. Default: \"\".\n" " The input file must exist and read permissions must be granted.\n" - " Valid file extensions are: [fasta, fa, fastq].\n" + " Valid file extensions are: [fasta, fa, fna, ffn, faa, frn, fas,\n" + " fastq, fq, embl, genbank, gb, gbk].\n" " -o, --output (std::filesystem::path)\n" " The path of the vcf output file. If no path is given, will output to\n" " standard output. Default: \"\". Write permissions must be granted.\n" @@ -299,8 +300,8 @@ TEST_F(iGenVar_cli_test, test_genome_input) { cli_test_result result = execute_app("iGenVar", "-g", data(default_genome_file_path), - "-j", data("single_end_mini_example.sam")); - std::string const expected_err = "Detect SNPs, insertions and deletions in short reads...\n" + "-i", data("single_end_mini_example.sam")); + std::string const expected_err = "Detect SNPs and indels in short reads...\n" "[0,0,0,0,0,0,0,0,0,0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0," "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,9,9,9,9,9,9,9,9,9,9,9,9,1,0,0,0,0,0,0,0," "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"