From 61e47136268b035dc4756549d8721d958260c798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 26 Aug 2016 21:59:20 +0200 Subject: [PATCH 01/32] Remove unnecessary debug messages --- .gitignore | 1 + src/ococo_caller.h | 43 ------------------------------------------- 2 files changed, 1 insertion(+), 43 deletions(-) diff --git a/.gitignore b/.gitignore index e539513..44daa44 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ a.fa Debug/ *~ +*.swp *.anjuta .anjuta* diff --git a/src/ococo_caller.h b/src/ococo_caller.h index 90078f7..f71a925 100644 --- a/src/ococo_caller.h +++ b/src/ococo_caller.h @@ -296,10 +296,6 @@ namespace ococo{ */ ococo::info("Starting the main loop.\n"); - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "Starting the main loop."; - } - int32_t r; b = bam_init1(); while ((r = sam_read1(params->sam_file, header, b)) >= 0) { @@ -338,53 +334,19 @@ namespace ococo{ const int32_t bq = qual[read_pos]; if (bq != 0xff && bq < (stats->params->min_baseq)) { - if(debugging){ - - BOOST_LOG_TRIVIAL(trace) - << "Omitting base (too low base quality): chrom=" - << seqid << ", pos=" << ref_pos - << ", nucl=" << nt256 << ", quality=" << bq << "."; - } continue; } if (nt4 == 0x4) { - if(debugging){ - BOOST_LOG_TRIVIAL(trace) - << "Omitting base (ambiguous nucleotide): chrom=" - << seqid << ", pos=" << ref_pos - << ", nucl=" << nt256 << ", quality=" << bq << "."; - } continue; } - if(debugging){ - - BOOST_LOG_TRIVIAL(trace) - << "Incrementing counter: chrom=" << seqid - << ", pos=" << ref_pos << ", nucl=" << nt256 - << ", quality=" << bq << ". 
Old state: " - << stats->debug_str_counters(seqid, ref_pos) << ","; - } - stats->seq_stats[seqid][ref_pos] = stats->increment(stats->seq_stats[seqid][ref_pos], nt4); - if(debugging){ - BOOST_LOG_TRIVIAL(trace) - << " ...new state: " - << stats->debug_str_counters(seqid, ref_pos) << "."; - } - if (stats->params->mode == ococo::mode_t::REALTIME) { stats->call_consensus_position(params->vcf_file, params->pileup_file, seqid, ref_pos); - if(debugging){ - - BOOST_LOG_TRIVIAL(trace) - << "Consensus called. New state: " - << stats->debug_str_counters(seqid, ref_pos) << "."; - } } } @@ -412,11 +374,6 @@ namespace ococo{ break; } } - - if(debugging) { - BOOST_LOG_TRIVIAL(debug) << "Alignment of '" << rname - << "' incorporated into statistics."; - } } /* From 3e5f523ca7b930d79eff440fae1d3e6f25b0368d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 26 Aug 2016 22:25:25 +0200 Subject: [PATCH 02/32] Remove other unnecessary debugging code --- src/main.cpp | 42 +------------- src/ococo.h | 2 - src/ococo_caller.h | 141 +++++---------------------------------------- src/ococo_misc.cpp | 15 +---- 4 files changed, 17 insertions(+), 183 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index d0163c6..e11bc5b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,54 +1,18 @@ #include "ococo.h" -//#ifdef DEBUGGING_MODE #define BOOST_LOG_DYN_LINK -//#endif - -//#include #include #include #include -#include -#include -#include - -#ifndef DEBUGGING_SEVERITY -#define DEBUGGING_SEVERITY trace -#endif - -namespace logging = boost::log; - -#ifdef DEBUGGING_MODE -const bool debugging = true; -#else -const bool debugging = false; -#endif /* -------------------------- */ - int main(int argc, const char *argv[]) { - if(debugging){ - logging::core::get()->set_filter(logging::trivial::severity >= - logging::trivial::DEBUGGING_SEVERITY); - - /* - BOOST_LOG_TRIVIAL(trace) << "A trace severity message"; - BOOST_LOG_TRIVIAL(debug) << "A debug severity message"; - 
BOOST_LOG_TRIVIAL(info) << "An informational severity message"; - BOOST_LOG_TRIVIAL(warning) << "A warning severity message"; - BOOST_LOG_TRIVIAL(error) << "An error severity message"; - BOOST_LOG_TRIVIAL(fatal) << "A fatal severity message"; - */ - - BOOST_LOG_TRIVIAL(info) << "Ococo started."; - } - /* * Default configuration. */ @@ -61,7 +25,7 @@ int main(int argc, const char *argv[]) { case ococo::OCOCO16: { - ococo::caller_t caller(¶ms); + ococo::caller_t caller(¶ms); if (!caller.correctly_initialized){ return EXIT_FAILURE ; } @@ -71,7 +35,7 @@ int main(int argc, const char *argv[]) { case ococo::OCOCO32: { - ococo::caller_t caller(¶ms); + ococo::caller_t caller(¶ms); if (!caller.correctly_initialized){ return EXIT_FAILURE ; } @@ -81,7 +45,7 @@ int main(int argc, const char *argv[]) { case ococo::OCOCO64: { - ococo::caller_t caller(¶ms); + ococo::caller_t caller(¶ms); if (!caller.correctly_initialized){ return EXIT_FAILURE ; } diff --git a/src/ococo.h b/src/ococo.h index 6efa140..76eb22c 100644 --- a/src/ococo.h +++ b/src/ococo.h @@ -1,7 +1,5 @@ #pragma once -#define BOOST_LOG_DYN_LINK - #include "ococo_misc.h" #include "ococo_params.h" #include "consensus_functions.h" diff --git a/src/ococo_caller.h b/src/ococo_caller.h index f71a925..588941c 100644 --- a/src/ococo_caller.h +++ b/src/ococo_caller.h @@ -5,27 +5,9 @@ #include "ococo_stats.h" #include "ococo_params.h" - -//#ifndef DEBUGGING_SEVERITY -//#define DEBUGGING_SEVERITY trace - -#include -#include -#include - -namespace logging = boost::log; - -//void boost_logging_init() { -// logging::core::get()->set_filter(logging::trivial::severity >= -// logging::trivial::DEBUGGING_SEVERITY); -//} - -//#endif - - namespace ococo{ - template + template struct caller_t { bool correctly_initialized; int return_code; @@ -46,12 +28,10 @@ namespace ococo{ void run(); }; - template - caller_t::caller_t(params_t *params_): + template + caller_t::caller_t(params_t *params_): params(params_) { - - /* * Read SAM headers. 
*/ @@ -66,11 +46,6 @@ namespace ococo{ header = nullptr; stats = nullptr; - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "SAM/BAM reader initialization: reading '" - << params->sam_fn.c_str() << "'."; - } - params->sam_file = sam_open(params->sam_fn.c_str(), "r"); if (params->sam_file == nullptr) { ococo::fatal_error("Problem with opening SAM/BAM file ('%s').\n", @@ -105,10 +80,6 @@ namespace ococo{ if (!params->stats_in_fn.empty()) { ococo::info("Loading statistics ('%s').\n", params->stats_in_fn.c_str()); - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "Importing statistics: '" << params->stats_in_fn - << "'."; - } int error_code = stats->import_stats(params->stats_in_fn); if (error_code != 0) { @@ -118,19 +89,10 @@ namespace ococo{ return; } } else { - if(debugging){ - - BOOST_LOG_TRIVIAL(info) << "No file with statistics provided."; - } if (!params->fasta_in_fn.empty()) { ococo::info("Loading reference ('%s').\n", params->fasta_in_fn.c_str()); - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "Loading FASTA: '" << params->fasta_in_fn - << "'."; - } - int error_code = stats->load_fasta(params->fasta_in_fn); if (error_code != 0) { ococo::fatal_error("Loading of FASTA failed (file '%s').\n", @@ -153,10 +115,6 @@ namespace ococo{ if (params->vcf_fn.size() > 0) { ococo::info("Opening VCF stream ('%s').\n", params->vcf_fn.c_str()); - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "Open VCF: '" << params->vcf_fn << "'."; - } - if (params->vcf_fn == std::string("-")) { params->vcf_file = stdout; } else { @@ -179,10 +137,6 @@ namespace ococo{ } stats->print_vcf_header(params->vcf_file, params->command, fasta_full_path); - } else { - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "No VCF file required."; - } } /* @@ -192,10 +146,6 @@ namespace ococo{ if (params->pileup_fn.size() > 0) { ococo::info("Opening pileup stream ('%s').\n", params->pileup_fn.c_str()); - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "Open pileup: '" << params->pileup_fn << "'."; - } - if (params->pileup_fn == 
std::string("-")) { params->pileup_file = stdout; } else { @@ -208,10 +158,6 @@ namespace ococo{ } } - } else { - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "No pileup file required."; - } } /* @@ -221,13 +167,7 @@ namespace ococo{ if (params->fasta_out_fn.size() > 0) { params->fasta_out_file = fopen(params->fasta_out_fn.c_str(), "w+"); - ococo::info("Opening consensus file ('%s').\n", params->fasta_out_fn.c_str()); - - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "Open FASTA for consensus: '" << params->fasta_out_fn - << "'."; - } - + ococo::info("Opening consensus file ('%s').\n", params->fasta_out_fn.c_str()); params->fasta_out_file = fopen(params->fasta_out_fn.c_str(), "w+"); if (params->fasta_out_file == nullptr) { @@ -237,9 +177,6 @@ namespace ococo{ correctly_initialized=false; return; } - } else { - if(debugging) - BOOST_LOG_TRIVIAL(info) << "No FASTA file for consensus required."; } } @@ -249,37 +186,17 @@ namespace ococo{ ////////////////////////////////////////////////////// */ - template - bool caller_t::check_read(int32_t seqid, int32_t flags, int32_t mapq) { - /* TODO: return back - if(debugging){ - - BOOST_LOG_TRIVIAL(debug) - << "Reading alignment: rname='" << rname << ", chrom=" << seqid - << ", pos=" << mappping_pos << ", mapq=" << mapq - << ", flags=" << flags; - }*/ - + template + bool caller_t::check_read(int32_t seqid, int32_t flags, int32_t mapq) { if ((flags & BAM_FUNMAP) != 0) { - if(debugging){ - BOOST_LOG_TRIVIAL(debug) << "Discarded: read is not aligned."; - } return false; } if (!stats->seq_active[seqid]) { - if(debugging){ - BOOST_LOG_TRIVIAL(debug) - << "Discarded: consensus calling is off for this chromosome."; - } return false; } if (mapq < stats->params->min_mapq) { - if(debugging){ - BOOST_LOG_TRIVIAL(debug) - << "Discarded: mapping quality is too low."; - } return false; } @@ -288,8 +205,8 @@ namespace ococo{ - template - void caller_t::run() { + template + void caller_t::run() { /* * Process alignments. 
@@ -377,43 +294,23 @@ namespace ococo{ } /* - * Calling final consensus and export stats. + * Call final consensus and export stats. */ if (stats->params->mode == ococo::mode_t::BATCH) { - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "Calling consensus for the entire reference " - "sequence (batch mode)."; - } - stats->call_consensus(params->vcf_file, params->pileup_file); if (params->fasta_out_fn.size() > 0) { - if(debugging){ - - BOOST_LOG_TRIVIAL(info) << "Saving FASTA: '" << params->fasta_out_fn - << "'."; - } - int error_code = stats->save_fasta(params->fasta_out_fn); if (error_code != 0) { ococo::error("FASTA '%s' could not be saved.\n", params->fasta_out_fn.c_str()); return_code = EXIT_FAILURE ; } - } else { - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "FASTA not saved."; - } } } if (params->stats_out_fn.size() > 0) { - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "Saving statistics: '" << params->stats_out_fn - << "'."; - } - ococo::info("Saving statistics ('%s').\n", params->stats_out_fn.c_str()); int error_code = stats->export_stats(params->stats_out_fn); @@ -422,10 +319,6 @@ namespace ococo{ params->stats_out_fn.c_str()); return_code = EXIT_FAILURE; } - } else { - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "Statistics not saved."; - } } } @@ -439,14 +332,10 @@ namespace ococo{ */ - template - caller_t::~caller_t(){ - - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "Freeing memory."; - } - - hts_itr_destroy(iter); + template + caller_t::~caller_t(){ + + hts_itr_destroy(iter); bam_destroy1(b); bam_hdr_destroy(header); @@ -458,10 +347,6 @@ namespace ococo{ ococo::info("Ococo successfully finished. Bye.\n"); } - if(debugging){ - BOOST_LOG_TRIVIAL(info) << "Ococo finished."; - } - } } diff --git a/src/ococo_misc.cpp b/src/ococo_misc.cpp index 5cc75f2..1f4343e 100644 --- a/src/ococo_misc.cpp +++ b/src/ococo_misc.cpp @@ -4,19 +4,6 @@ void ococo::print_version(){ std::cout << "Program: Ococo (online consensus caller, " << "http://github.com/karel-brinda/ococo)." 
<< std::endl; std::cout << "Version: " << OCOCO_VERSION << std::endl; - /* " (" - << 8 * sizeof(OCOCO_BASIC_TYPE) << "bit variant" - << ", counter size " << BITS_PER_COUNTER << "bits"; - -#ifdef DEBUGGING_MODE - std::cerr << ", debugging mode"; -#endif - -#ifdef VERBOSE_VCF - std::cerr << ", verbose VCF"; -#endif - - std::cerr << ")." << std::endl;*/ } void ococo::fatal_error(const char *format, ...) { @@ -60,4 +47,4 @@ bool ococo::file_exists(const std::string &fn) { return true; } return false; -} \ No newline at end of file +} From 49c12defc53a62c66a40991866d505d8fd8ab479 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 26 Aug 2016 22:32:17 +0200 Subject: [PATCH 03/32] Remove Boost log from CMake --- CMakeLists.txt | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a181b0..0cec68a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ set(Boost_USE_STATIC_LIBS 1) if(NOT CMAKE_BUILD_TYPE) message(STATUS "Setting build type to 'RELEASE' as none was specified.") set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "Choose the type of build." FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "DEBUG" "RELEASE" "RELWITHDEBINFO") + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "DEBUG" "RELEASE") endif() if(CMAKE_BUILD_TYPE MATCHES DEBUG) @@ -22,13 +22,6 @@ if(CMAKE_BUILD_TYPE MATCHES RELEASE) message("RELEASE mode") endif(CMAKE_BUILD_TYPE MATCHES RELEASE) -if(CMAKE_BUILD_TYPE MATCHES RELWITHDEBINFO) - message("RELWITHDEBINFO mode") -endif(CMAKE_BUILD_TYPE MATCHES RELWITHDEBINFO) - -#option(DEBUG "Build in the debugging mode" OFF) -#option(OCOCO32 "Increase size of stats per position from 16b to 32b." OFF) -#option(VERBOSE_VCF "Print also VCF records without updates." OFF) option(INSTALL_DEBUG_SCRIPTS "Install debugging scripts." 
OFF) set(DEBUGGING_SEVERITY "trace" CACHE STRING "Verbosity of debugging mode.") @@ -39,11 +32,7 @@ option(BUILD_TESTS "Build tests." OFF) # BOOST # ######### -find_package( Boost 1.46 COMPONENTS program_options system log log_setup thread REQUIRED ) -if(CMAKE_BUILD_TYPE MATCHES DEBUG) - add_definitions(-DDEBUGGING_MODE) - add_definitions(-DDEBUGGING_SEVERITY=${DEBUGGING_SEVERITY}) -endif(CMAKE_BUILD_TYPE MATCHES DEBUG) +find_package( Boost 1.46 COMPONENTS program_options REQUIRED ) include_directories( ${Boost_INCLUDE_DIR} ) @@ -94,10 +83,9 @@ find_package (Threads) include_directories( "src" ) add_subdirectory(tests/unit_tests) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -Wno-missing-field-initializers -Wshadow ") -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native -DNDEBUG") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 -march=native -DNDEBUG -g") -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g -pedantic") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -Wno-missing-field-initializers -Wshadow -g ") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native -DNDEBUG") +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -pedantic") add_executable(ococo src/main.cpp) add_dependencies(ococo htslib ococo_core) @@ -124,7 +112,6 @@ install(TARGETS ococo DESTINATION bin) if(INSTALL_DEBUG_SCRIPTS) install(PROGRAMS scripts/ococo_test_bam.sh DESTINATION bin) - #install(PROGRAMS scripts/ococo_test_bam.sh DESTINATION bin) endif(INSTALL_DEBUG_SCRIPTS) From d92c034d6aeb43dda252111353175bb96d635842 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 26 Aug 2016 22:37:11 +0200 Subject: [PATCH 04/32] Get rid of some warnings --- src/ococo_caller.h | 5 ++--- src/ococo_stats.h | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ococo_caller.h b/src/ococo_caller.h index 588941c..99ef140 100644 --- 
a/src/ococo_caller.h +++ b/src/ococo_caller.h @@ -216,12 +216,11 @@ namespace ococo{ int32_t r; b = bam_init1(); while ((r = sam_read1(params->sam_file, header, b)) >= 0) { - const char *rname = bam_get_qname(b); + //const char *rname = bam_get_qname(b); const uint8_t *seq = bam_get_seq(b); const uint8_t *qual = bam_get_qual(b); const uint32_t *cigar = bam_get_cigar(b); const int32_t n_cigar = b->core.n_cigar; - //+b->core.l_qname const int32_t seqid = b->core.tid; const int64_t mappping_pos = b->core.pos; const int32_t mapq = b->core.qual; @@ -247,7 +246,7 @@ namespace ococo{ for (; read_pos < next_read_pos; read_pos++, ref_pos++) { const uint8_t nt16 = bam_seqi(seq, read_pos); const uint8_t nt4 = ococo::nt16_nt4[nt16]; - const char nt256 = ococo::nt16_nt256[nt16]; + //const char nt256 = ococo::nt16_nt256[nt16]; const int32_t bq = qual[read_pos]; if (bq != 0xff && bq < (stats->params->min_baseq)) { diff --git a/src/ococo_stats.h b/src/ococo_stats.h index 99f980e..722beca 100644 --- a/src/ococo_stats.h +++ b/src/ococo_stats.h @@ -496,7 +496,7 @@ namespace ococo { assert(check_allocation()); assert(vcf_file != nullptr); - float alt_freq=1.0*psu.counters[nt256_nt4[new_base]]/psu.sum; + float alt_freq=1.0*psu.counters[nt256_nt4[static_cast(new_base)]]/psu.sum; fprintf(vcf_file, "%s\t%" PRId64 "\t.\t%c\t%c\t100\tPASS\tAF=%.2f;CS=%" PRId32 ",%" PRId32 From b8629e09be358d72dca0671b0c6d358b0312abe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 26 Aug 2016 22:48:22 +0200 Subject: [PATCH 05/32] Spaces => tabs --- CMakeLists.txt | 56 +- src/consensus_functions.h | 241 ++++---- src/main.cpp | 88 +-- src/ococo.h | 1 + src/ococo_caller.h | 684 ++++++++++----------- src/ococo_misc.cpp | 60 +- src/ococo_misc.h | 36 +- src/ococo_params.cpp | 580 +++++++++--------- src/ococo_params.h | 204 +++---- src/ococo_stats.h | 1214 +++++++++++++++++++------------------ src/ococo_types.h | 163 ++--- src/version.h | 5 +- 12 files changed, 1668 insertions(+), 1664 
deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0cec68a..2272e51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,17 +9,17 @@ project (ococo C CXX) set(Boost_USE_STATIC_LIBS 1) if(NOT CMAKE_BUILD_TYPE) - message(STATUS "Setting build type to 'RELEASE' as none was specified.") - set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "Choose the type of build." FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "DEBUG" "RELEASE") + message(STATUS "Setting build type to 'RELEASE' as none was specified.") + set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "Choose the type of build." FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "DEBUG" "RELEASE") endif() if(CMAKE_BUILD_TYPE MATCHES DEBUG) - message("DEBUG mode") + message("DEBUG mode") endif(CMAKE_BUILD_TYPE MATCHES DEBUG) if(CMAKE_BUILD_TYPE MATCHES RELEASE) - message("RELEASE mode") + message("RELEASE mode") endif(CMAKE_BUILD_TYPE MATCHES RELEASE) option(INSTALL_DEBUG_SCRIPTS "Install debugging scripts." OFF) @@ -42,21 +42,21 @@ include_directories( ${Boost_INCLUDE_DIR} ) ########## if (CMAKE_GENERATOR STREQUAL "Unix Makefiles") - set(MAKE_COMMAND "$(MAKE)") + set(MAKE_COMMAND "$(MAKE)") else() - find_program(MAKE_COMMAND NAMES make gmake) + find_program(MAKE_COMMAND NAMES make gmake) endif() include(ExternalProject) ExternalProject_Add(htslib - PREFIX ${CMAKE_BINARY_DIR}/ext/htslib.tmp - SOURCE_DIR "${CMAKE_BINARY_DIR}/ext/htslib" - BUILD_IN_SOURCE 1 - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND ${MAKE_COMMAND} lib-static - INSTALL_COMMAND "" -) + PREFIX ${CMAKE_BINARY_DIR}/ext/htslib.tmp + SOURCE_DIR "${CMAKE_BINARY_DIR}/ext/htslib" + BUILD_IN_SOURCE 1 + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND ${MAKE_COMMAND} lib-static + INSTALL_COMMAND "" + ) include_directories(${CMAKE_BINARY_DIR}/ext/htslib) @@ -67,9 +67,9 @@ include_directories(${CMAKE_BINARY_DIR}/ext/htslib) find_package(ZLIB REQUIRED) if (ZLIB_FOUND) - include_directories(${ZLIB_INCLUDE_DIRS}) + 
include_directories(${ZLIB_INCLUDE_DIRS}) else() - message (FATAL_ERROR "zlib not found.") + message (FATAL_ERROR "zlib not found.") endif(ZLIB_FOUND) include_directories(${htslib_INSTALL}/include) @@ -90,16 +90,16 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -pedantic") add_executable(ococo src/main.cpp) add_dependencies(ococo htslib ococo_core) add_library(ococo_core - src/ococo_caller.h - src/ococo_params.cpp - src/ococo_params.h - src/ococo_misc.cpp - src/ococo_misc.h - src/ococo.h - src/ococo_types.h - src/ococo_stats.h - src/version.h - ) + src/ococo_caller.h + src/ococo_params.cpp + src/ococo_params.h + src/ococo_misc.cpp + src/ococo_misc.h + src/ococo.h + src/ococo_types.h + src/ococo_stats.h + src/version.h + ) target_link_libraries(ococo_core ${Boost_LIBRARIES} ${CMAKE_BINARY_DIR}/ext/htslib/libhts.a ${ZLIB_LIBRARIES}) target_link_libraries(ococo ${CMAKE_THREAD_LIBS_INIT} ${ZLIB_LIBRARIES} ococo_core) @@ -111,7 +111,7 @@ target_link_libraries(ococo ${CMAKE_THREAD_LIBS_INIT} ${ZLIB_LIBRARIES} ococo_co install(TARGETS ococo DESTINATION bin) if(INSTALL_DEBUG_SCRIPTS) - install(PROGRAMS scripts/ococo_test_bam.sh DESTINATION bin) + install(PROGRAMS scripts/ococo_test_bam.sh DESTINATION bin) endif(INSTALL_DEBUG_SCRIPTS) diff --git a/src/consensus_functions.h b/src/consensus_functions.h index 4ac3c08..1756d91 100644 --- a/src/consensus_functions.h +++ b/src/consensus_functions.h @@ -8,126 +8,127 @@ namespace ococo { -inline char cons_call_no_updates(const pos_stats_uncompr_t &psu, - const params_t ¶ms) { - return nt16_nt256[psu.nt16]; -} + inline char cons_call_no_updates(const pos_stats_uncompr_t &psu, + const params_t ¶ms) { + return nt16_nt256[psu.nt16]; + } + + inline char cons_call_stoch(const pos_stats_uncompr_t &psu, + const params_t ¶ms) { + if (psu.sum == 0) { + return nt16_nt256[psu.nt16]; + } + + if (psu.nt16 != nt256_nt16[static_cast('N')]) { + if (psu.sum < params.min_coverage + params.init_ref_weight) { + return nt16_nt256[psu.nt16]; 
+ } + } + + const int32_t prefsum[] = { + psu.counters[0], psu.counters[0] + psu.counters[1], + psu.counters[0] + psu.counters[1] + psu.counters[2], + psu.counters[0] + psu.counters[1] + psu.counters[2] + psu.counters[3]}; + + assert(prefsum[3] == psu.sum); + + const int32_t rn = rand() % psu.sum; + for (int32_t i = 0; i < 4; i++) { + if (rn < prefsum[i]) { + return nt4_nt256[i]; + } + } + + return 'n'; + } + + inline char cons_call_stoch_amb(const pos_stats_uncompr_t &psu, + const params_t ¶ms) { + if (psu.sum == 0) { + return nt16_nt256[psu.nt16]; + } + + if (psu.nt16 != nt256_nt16[static_cast('N')]) { + if (psu.sum < params.min_coverage + params.init_ref_weight) { + return nt16_nt256[psu.nt16]; + } + } + + nt16_t nucl_nt16 = nt256_nt16[static_cast('N')]; + + while (nucl_nt16 == nt256_nt16[static_cast('N')]) { + nucl_nt16 = 0; + for (int32_t i = 0; i < 4; i++) { + const int32_t rn = rand() % psu.sum; + + if (rn < psu.counters[i]) { + nucl_nt16 |= nt4_nt16[i]; + } + } + } + + return nt16_nt256[nucl_nt16]; + } + + inline char cons_call_maj(const pos_stats_uncompr_t &psu, + const params_t ¶ms) { + if (psu.sum == 0) { + return nt16_nt256[psu.nt16]; + } + + if (psu.nt16 != nt256_nt16[static_cast('N')]) { + if (psu.sum < params.min_coverage + params.init_ref_weight) { + return nt16_nt256[psu.nt16]; + } + } + + char nucl_nt256 = nt16_nt256[psu.nt16]; + + int32_t required_min = + static_cast(ceil(params.majority_threshold * psu.sum)); + int32_t max = 0; + for (int32_t i = 0; i < 4; i++) { + if (psu.counters[i] >= required_min) { + if (psu.counters[i] > max) { + max = psu.counters[i]; + nucl_nt256 = nt4_nt256[i]; + } + } + } + + return nucl_nt256; + } + + inline char cons_call_maj_amb(const pos_stats_uncompr_t &psu, + const params_t ¶ms) { + if (psu.sum == 0) { + return nt16_nt256[psu.nt16]; + } + + if (psu.nt16 != nt256_nt16[static_cast('N')]) { + if (psu.sum < params.min_coverage + params.init_ref_weight) { + return nt16_nt256[psu.nt16]; + } + } + + char nucl_nt16 = 
psu.nt16; + + int32_t required_min = + static_cast(round(params.majority_threshold * psu.sum)); + int32_t max = 0; + for (int32_t i = 0; i < 4; i++) { + if (psu.counters[i] >= required_min) { + if (psu.counters[i] > max) { + max = psu.counters[i]; + nucl_nt16 = nt4_nt16[i]; + } else if (psu.counters[i] >= max) { + nucl_nt16 |= nt4_nt16[i]; + } + } + } + + return nt16_nt256[static_cast(nucl_nt16)]; + } -inline char cons_call_stoch(const pos_stats_uncompr_t &psu, - const params_t ¶ms) { - if (psu.sum == 0) { - return nt16_nt256[psu.nt16]; - } - - if (psu.nt16 != nt256_nt16[static_cast('N')]) { - if (psu.sum < params.min_coverage + params.init_ref_weight) { - return nt16_nt256[psu.nt16]; - } - } - - const int32_t prefsum[] = { - psu.counters[0], psu.counters[0] + psu.counters[1], - psu.counters[0] + psu.counters[1] + psu.counters[2], - psu.counters[0] + psu.counters[1] + psu.counters[2] + psu.counters[3]}; - - assert(prefsum[3] == psu.sum); - - const int32_t rn = rand() % psu.sum; - for (int32_t i = 0; i < 4; i++) { - if (rn < prefsum[i]) { - return nt4_nt256[i]; - } - } - - return 'n'; } -inline char cons_call_stoch_amb(const pos_stats_uncompr_t &psu, - const params_t ¶ms) { - if (psu.sum == 0) { - return nt16_nt256[psu.nt16]; - } - - if (psu.nt16 != nt256_nt16[static_cast('N')]) { - if (psu.sum < params.min_coverage + params.init_ref_weight) { - return nt16_nt256[psu.nt16]; - } - } - - nt16_t nucl_nt16 = nt256_nt16[static_cast('N')]; - - while (nucl_nt16 == nt256_nt16[static_cast('N')]) { - nucl_nt16 = 0; - for (int32_t i = 0; i < 4; i++) { - const int32_t rn = rand() % psu.sum; - - if (rn < psu.counters[i]) { - nucl_nt16 |= nt4_nt16[i]; - } - } - } - - return nt16_nt256[nucl_nt16]; -} - -inline char cons_call_maj(const pos_stats_uncompr_t &psu, - const params_t ¶ms) { - if (psu.sum == 0) { - return nt16_nt256[psu.nt16]; - } - - if (psu.nt16 != nt256_nt16[static_cast('N')]) { - if (psu.sum < params.min_coverage + params.init_ref_weight) { - return 
nt16_nt256[psu.nt16]; - } - } - - char nucl_nt256 = nt16_nt256[psu.nt16]; - - int32_t required_min = - static_cast(ceil(params.majority_threshold * psu.sum)); - int32_t max = 0; - for (int32_t i = 0; i < 4; i++) { - if (psu.counters[i] >= required_min) { - if (psu.counters[i] > max) { - max = psu.counters[i]; - nucl_nt256 = nt4_nt256[i]; - } - } - } - - return nucl_nt256; -} - -inline char cons_call_maj_amb(const pos_stats_uncompr_t &psu, - const params_t ¶ms) { - if (psu.sum == 0) { - return nt16_nt256[psu.nt16]; - } - - if (psu.nt16 != nt256_nt16[static_cast('N')]) { - if (psu.sum < params.min_coverage + params.init_ref_weight) { - return nt16_nt256[psu.nt16]; - } - } - - char nucl_nt16 = psu.nt16; - - int32_t required_min = - static_cast(round(params.majority_threshold * psu.sum)); - int32_t max = 0; - for (int32_t i = 0; i < 4; i++) { - if (psu.counters[i] >= required_min) { - if (psu.counters[i] > max) { - max = psu.counters[i]; - nucl_nt16 = nt4_nt16[i]; - } else if (psu.counters[i] >= max) { - nucl_nt16 |= nt4_nt16[i]; - } - } - } - - return nt16_nt256[static_cast(nucl_nt16)]; -} - -} diff --git a/src/main.cpp b/src/main.cpp index e11bc5b..f9648bf 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -8,51 +8,51 @@ /* - -------------------------- + * -------------------------- */ int main(int argc, const char *argv[]) { - - /* - * Default configuration. 
- */ - ococo::params_t params = ococo::params_t(argc, argv); - if (!params.correctly_initialized){ - return EXIT_FAILURE; - } - - switch (params.counter_configuration){ - - case ococo::OCOCO16: - { - ococo::caller_t caller(¶ms); - if (!caller.correctly_initialized){ - return EXIT_FAILURE ; - } - caller.run(); - return caller.return_code; - } - - case ococo::OCOCO32: - { - ococo::caller_t caller(¶ms); - if (!caller.correctly_initialized){ - return EXIT_FAILURE ; - } - caller.run(); - return caller.return_code; - } - - case ococo::OCOCO64: - { - ococo::caller_t caller(¶ms); - if (!caller.correctly_initialized){ - return EXIT_FAILURE ; - } - caller.run(); - return caller.return_code; - } - } - - return EXIT_FAILURE; + + /* + * Default configuration. + */ + ococo::params_t params = ococo::params_t(argc, argv); + if (!params.correctly_initialized){ + return EXIT_FAILURE; + } + + switch (params.counter_configuration){ + + case ococo::OCOCO16: + { + ococo::caller_t caller(¶ms); + if (!caller.correctly_initialized){ + return EXIT_FAILURE ; + } + caller.run(); + return caller.return_code; + } + + case ococo::OCOCO32: + { + ococo::caller_t caller(¶ms); + if (!caller.correctly_initialized){ + return EXIT_FAILURE ; + } + caller.run(); + return caller.return_code; + } + + case ococo::OCOCO64: + { + ococo::caller_t caller(¶ms); + if (!caller.correctly_initialized){ + return EXIT_FAILURE ; + } + caller.run(); + return caller.return_code; + } + } + + return EXIT_FAILURE; } diff --git a/src/ococo.h b/src/ococo.h index 76eb22c..fe66fbd 100644 --- a/src/ococo.h +++ b/src/ococo.h @@ -17,3 +17,4 @@ #include #include #include + diff --git a/src/ococo_caller.h b/src/ococo_caller.h index 99ef140..4ceae18 100644 --- a/src/ococo_caller.h +++ b/src/ococo_caller.h @@ -6,346 +6,346 @@ #include "ococo_params.h" namespace ococo{ - - template - struct caller_t { - bool correctly_initialized; - int return_code; - - hts_itr_t *iter; - - bam1_t *b; - bam_hdr_t *header; - - stats_t *stats; - - 
params_t *params; - - caller_t(params_t *params_); - ~caller_t(); - - bool check_read(int32_t seqid, int32_t flags, int32_t mapq); - void run(); - }; - - template - caller_t::caller_t(params_t *params_): - params(params_) - { - /* - * Read SAM headers. - */ - - ococo::info("Initialing SAM/BAM reader.\n"); - - correctly_initialized=true; - return_code=EXIT_SUCCESS; - - iter = nullptr; - b = nullptr; - header = nullptr; - stats = nullptr; - - params->sam_file = sam_open(params->sam_fn.c_str(), "r"); - if (params->sam_file == nullptr) { - ococo::fatal_error("Problem with opening SAM/BAM file ('%s').\n", - params->sam_fn.c_str()); - correctly_initialized=false; - return; - } - - if ((header = sam_hdr_read(params->sam_file)) == 0) { - ococo::fatal_error("SAM/BAM headers are missing or corrupted.\n"); - correctly_initialized=false; - return; - } - - stats = new (std::nothrow) stats_t(params, *header); - if (stats == nullptr || !stats->check_allocation()) { - ococo::fatal_error("Allocation of the main structure failed.\n"); - correctly_initialized=false; - return; - } - - /* - * Load FASTA and stats. 
- */ - - if (!params->stats_in_fn.empty() && !params->fasta_in_fn.empty()) { - ococo::fatal_error("Initial FASTA reference and input statistics " - "cannot be used at the same time.\n"); - correctly_initialized=false; - return; - } - - if (!params->stats_in_fn.empty()) { - ococo::info("Loading statistics ('%s').\n", params->stats_in_fn.c_str()); - - int error_code = stats->import_stats(params->stats_in_fn); - if (error_code != 0) { - ococo::fatal_error("Import of statistics failed (file '%s').\n", - params->stats_in_fn.c_str()); - correctly_initialized=false; - return; - } - } else { - - if (!params->fasta_in_fn.empty()) { - ococo::info("Loading reference ('%s').\n", params->fasta_in_fn.c_str()); - - int error_code = stats->load_fasta(params->fasta_in_fn); - if (error_code != 0) { - ococo::fatal_error("Loading of FASTA failed (file '%s').\n", - params->fasta_in_fn.c_str()); - correctly_initialized=false; - return; - } - } - - else { - ococo::info("Neither reference, nor statistics provided. Going to " - "consider sequence of N's as a reference.\n"); - } - } - - /* - * Open VCF file. - */ - - if (params->vcf_fn.size() > 0) { - ococo::info("Opening VCF stream ('%s').\n", params->vcf_fn.c_str()); - - if (params->vcf_fn == std::string("-")) { - params->vcf_file = stdout; - } else { - params->vcf_file = fopen(params->vcf_fn.c_str(), "w+"); - if (params->vcf_file == nullptr) { - ococo::fatal_error("Problem with opening VCF file '%s'.\n", - params->vcf_fn.c_str()); - correctly_initialized=false; - return; - } - } - - char buf[PATH_MAX + 1]; - char *res = realpath(params->fasta_in_fn.c_str(), buf); - std::string fasta_full_path; - if (res) { - fasta_full_path = std::string(buf); - } else { - fasta_full_path = params->fasta_in_fn; - } - - stats->print_vcf_header(params->vcf_file, params->command, fasta_full_path); - } - - /* - * Open pileup file. 
- */ - - if (params->pileup_fn.size() > 0) { - ococo::info("Opening pileup stream ('%s').\n", params->pileup_fn.c_str()); - - if (params->pileup_fn == std::string("-")) { - params->pileup_file = stdout; - } else { - params->pileup_file = fopen(params->pileup_fn.c_str(), "w+"); - if (params->pileup_file == nullptr) { - ococo::fatal_error("Problem with opening pileup file '%s'.\n", - params->pileup_fn.c_str()); - correctly_initialized=false; - return; - } - } - - } - - /* - * Open consensus FASTA file. - */ - - if (params->fasta_out_fn.size() > 0) { - params->fasta_out_file = fopen(params->fasta_out_fn.c_str(), "w+"); - - ococo::info("Opening consensus file ('%s').\n", params->fasta_out_fn.c_str()); - params->fasta_out_file = fopen(params->fasta_out_fn.c_str(), "w+"); - - if (params->fasta_out_file == nullptr) { - ococo::fatal_error( - "Problem with opening FASTA for consensus: '%s'.\n", - params->fasta_out_fn.c_str()); - correctly_initialized=false; - return; - } - } - } - - /* - ////////////////////////////////////////////////////// - ////////////////////////////////////////////////////// - ////////////////////////////////////////////////////// - */ - - template - bool caller_t::check_read(int32_t seqid, int32_t flags, int32_t mapq) { - if ((flags & BAM_FUNMAP) != 0) { - return false; - } - - if (!stats->seq_active[seqid]) { - return false; - } - - if (mapq < stats->params->min_mapq) { - return false; - } - - return true; - } - - - - template - void caller_t::run() { - - /* - * Process alignments. 
- */ - ococo::info("Starting the main loop.\n"); - - int32_t r; - b = bam_init1(); - while ((r = sam_read1(params->sam_file, header, b)) >= 0) { - //const char *rname = bam_get_qname(b); - const uint8_t *seq = bam_get_seq(b); - const uint8_t *qual = bam_get_qual(b); - const uint32_t *cigar = bam_get_cigar(b); - const int32_t n_cigar = b->core.n_cigar; - const int32_t seqid = b->core.tid; - const int64_t mappping_pos = b->core.pos; - const int32_t mapq = b->core.qual; - const int32_t flags = b->core.flag; - - bool read_ok = check_read(seqid, flags, mapq); - if(!read_ok){ - continue; - } - - int32_t ref_pos = mappping_pos; - for (int32_t cigar_grp = 0, read_pos = 0; cigar_grp < n_cigar; - cigar_grp++) { - const int32_t op = bam_cigar_op(cigar[cigar_grp]); - const int32_t ol = bam_cigar_oplen(cigar[cigar_grp]); - - const int32_t next_read_pos = read_pos + ol; - switch (op) { - case BAM_CMATCH: - case BAM_CDIFF: - case BAM_CEQUAL: - - for (; read_pos < next_read_pos; read_pos++, ref_pos++) { - const uint8_t nt16 = bam_seqi(seq, read_pos); - const uint8_t nt4 = ococo::nt16_nt4[nt16]; - //const char nt256 = ococo::nt16_nt256[nt16]; - const int32_t bq = qual[read_pos]; - - if (bq != 0xff && bq < (stats->params->min_baseq)) { - continue; - } - - if (nt4 == 0x4) { - continue; - } - - stats->seq_stats[seqid][ref_pos] = - stats->increment(stats->seq_stats[seqid][ref_pos], nt4); - - if (stats->params->mode == ococo::mode_t::REALTIME) { - stats->call_consensus_position(params->vcf_file, params->pileup_file, - seqid, ref_pos); - } - } - - break; - - case BAM_CDEL: - case BAM_CREF_SKIP: - ref_pos += ol; - break; - - case BAM_CSOFT_CLIP: - read_pos += ol; - break; - - case BAM_CBACK: - ref_pos -= ol; - break; - - case BAM_CINS: - read_pos += ol; - break; - - case BAM_CPAD: - case BAM_CHARD_CLIP: - break; - } - } - } - - /* - * Call final consensus and export stats. 
- */ - - if (stats->params->mode == ococo::mode_t::BATCH) { - stats->call_consensus(params->vcf_file, params->pileup_file); - - if (params->fasta_out_fn.size() > 0) { - int error_code = stats->save_fasta(params->fasta_out_fn); - if (error_code != 0) { - ococo::error("FASTA '%s' could not be saved.\n", - params->fasta_out_fn.c_str()); - return_code = EXIT_FAILURE ; - } - } - } - - if (params->stats_out_fn.size() > 0) { - ococo::info("Saving statistics ('%s').\n", params->stats_out_fn.c_str()); - - int error_code = stats->export_stats(params->stats_out_fn); - if (error_code != 0) { - ococo::error("Statistics could not be saved ('%s').\n", - params->stats_out_fn.c_str()); - return_code = EXIT_FAILURE; - } - } - } - - - - - /* - ////////////////////////////////////////////////////// - ////////////////////////////////////////////////////// - ////////////////////////////////////////////////////// - */ - - - template - caller_t::~caller_t(){ - - hts_itr_destroy(iter); - bam_destroy1(b); - bam_hdr_destroy(header); - - if (stats != nullptr) { - delete stats; - } - - if (return_code==EXIT_SUCCESS && correctly_initialized==true) { - ococo::info("Ococo successfully finished. Bye.\n"); - } - - } - + + template + struct caller_t { + bool correctly_initialized; + int return_code; + + hts_itr_t *iter; + + bam1_t *b; + bam_hdr_t *header; + + stats_t *stats; + + params_t *params; + + caller_t(params_t *params_); + ~caller_t(); + + bool check_read(int32_t seqid, int32_t flags, int32_t mapq); + void run(); + }; + + template + caller_t::caller_t(params_t *params_): + params(params_) + { + /* + * Read SAM headers. 
+ */ + + ococo::info("Initialing SAM/BAM reader.\n"); + + correctly_initialized=true; + return_code=EXIT_SUCCESS; + + iter = nullptr; + b = nullptr; + header = nullptr; + stats = nullptr; + + params->sam_file = sam_open(params->sam_fn.c_str(), "r"); + if (params->sam_file == nullptr) { + ococo::fatal_error("Problem with opening SAM/BAM file ('%s').\n", + params->sam_fn.c_str()); + correctly_initialized=false; + return; + } + + if ((header = sam_hdr_read(params->sam_file)) == 0) { + ococo::fatal_error("SAM/BAM headers are missing or corrupted.\n"); + correctly_initialized=false; + return; + } + + stats = new (std::nothrow) stats_t(params, *header); + if (stats == nullptr || !stats->check_allocation()) { + ococo::fatal_error("Allocation of the main structure failed.\n"); + correctly_initialized=false; + return; + } + + /* + * Load FASTA and stats. + */ + + if (!params->stats_in_fn.empty() && !params->fasta_in_fn.empty()) { + ococo::fatal_error("Initial FASTA reference and input statistics " + "cannot be used at the same time.\n"); + correctly_initialized=false; + return; + } + + if (!params->stats_in_fn.empty()) { + ococo::info("Loading statistics ('%s').\n", params->stats_in_fn.c_str()); + + int error_code = stats->import_stats(params->stats_in_fn); + if (error_code != 0) { + ococo::fatal_error("Import of statistics failed (file '%s').\n", + params->stats_in_fn.c_str()); + correctly_initialized=false; + return; + } + } else { + + if (!params->fasta_in_fn.empty()) { + ococo::info("Loading reference ('%s').\n", params->fasta_in_fn.c_str()); + + int error_code = stats->load_fasta(params->fasta_in_fn); + if (error_code != 0) { + ococo::fatal_error("Loading of FASTA failed (file '%s').\n", + params->fasta_in_fn.c_str()); + correctly_initialized=false; + return; + } + } + + else { + ococo::info("Neither reference, nor statistics provided. Going to " + "consider sequence of N's as a reference.\n"); + } + } + + /* + * Open VCF file. 
+ */ + + if (params->vcf_fn.size() > 0) { + ococo::info("Opening VCF stream ('%s').\n", params->vcf_fn.c_str()); + + if (params->vcf_fn == std::string("-")) { + params->vcf_file = stdout; + } else { + params->vcf_file = fopen(params->vcf_fn.c_str(), "w+"); + if (params->vcf_file == nullptr) { + ococo::fatal_error("Problem with opening VCF file '%s'.\n", + params->vcf_fn.c_str()); + correctly_initialized=false; + return; + } + } + + char buf[PATH_MAX + 1]; + char *res = realpath(params->fasta_in_fn.c_str(), buf); + std::string fasta_full_path; + if (res) { + fasta_full_path = std::string(buf); + } else { + fasta_full_path = params->fasta_in_fn; + } + + stats->print_vcf_header(params->vcf_file, params->command, fasta_full_path); + } + + /* + * Open pileup file. + */ + + if (params->pileup_fn.size() > 0) { + ococo::info("Opening pileup stream ('%s').\n", params->pileup_fn.c_str()); + + if (params->pileup_fn == std::string("-")) { + params->pileup_file = stdout; + } else { + params->pileup_file = fopen(params->pileup_fn.c_str(), "w+"); + if (params->pileup_file == nullptr) { + ococo::fatal_error("Problem with opening pileup file '%s'.\n", + params->pileup_fn.c_str()); + correctly_initialized=false; + return; + } + } + + } + + /* + * Open consensus FASTA file. 
+ */ + + if (params->fasta_out_fn.size() > 0) { + params->fasta_out_file = fopen(params->fasta_out_fn.c_str(), "w+"); + + ococo::info("Opening consensus file ('%s').\n", params->fasta_out_fn.c_str()); + params->fasta_out_file = fopen(params->fasta_out_fn.c_str(), "w+"); + + if (params->fasta_out_file == nullptr) { + ococo::fatal_error( + "Problem with opening FASTA for consensus: '%s'.\n", + params->fasta_out_fn.c_str()); + correctly_initialized=false; + return; + } + } + } + + /* + ////////////////////////////////////////////////////// + ////////////////////////////////////////////////////// + ////////////////////////////////////////////////////// + */ + + template + bool caller_t::check_read(int32_t seqid, int32_t flags, int32_t mapq) { + if ((flags & BAM_FUNMAP) != 0) { + return false; + } + + if (!stats->seq_active[seqid]) { + return false; + } + + if (mapq < stats->params->min_mapq) { + return false; + } + + return true; + } + + + + template + void caller_t::run() { + + /* + * Process alignments. 
+ */ + ococo::info("Starting the main loop.\n"); + + int32_t r; + b = bam_init1(); + while ((r = sam_read1(params->sam_file, header, b)) >= 0) { + //const char *rname = bam_get_qname(b); + const uint8_t *seq = bam_get_seq(b); + const uint8_t *qual = bam_get_qual(b); + const uint32_t *cigar = bam_get_cigar(b); + const int32_t n_cigar = b->core.n_cigar; + const int32_t seqid = b->core.tid; + const int64_t mappping_pos = b->core.pos; + const int32_t mapq = b->core.qual; + const int32_t flags = b->core.flag; + + bool read_ok = check_read(seqid, flags, mapq); + if(!read_ok){ + continue; + } + + int32_t ref_pos = mappping_pos; + for (int32_t cigar_grp = 0, read_pos = 0; cigar_grp < n_cigar; + cigar_grp++) { + const int32_t op = bam_cigar_op(cigar[cigar_grp]); + const int32_t ol = bam_cigar_oplen(cigar[cigar_grp]); + + const int32_t next_read_pos = read_pos + ol; + switch (op) { + case BAM_CMATCH: + case BAM_CDIFF: + case BAM_CEQUAL: + + for (; read_pos < next_read_pos; read_pos++, ref_pos++) { + const uint8_t nt16 = bam_seqi(seq, read_pos); + const uint8_t nt4 = ococo::nt16_nt4[nt16]; + //const char nt256 = ococo::nt16_nt256[nt16]; + const int32_t bq = qual[read_pos]; + + if (bq != 0xff && bq < (stats->params->min_baseq)) { + continue; + } + + if (nt4 == 0x4) { + continue; + } + + stats->seq_stats[seqid][ref_pos] = + stats->increment(stats->seq_stats[seqid][ref_pos], nt4); + + if (stats->params->mode == ococo::mode_t::REALTIME) { + stats->call_consensus_position(params->vcf_file, params->pileup_file, + seqid, ref_pos); + } + } + + break; + + case BAM_CDEL: + case BAM_CREF_SKIP: + ref_pos += ol; + break; + + case BAM_CSOFT_CLIP: + read_pos += ol; + break; + + case BAM_CBACK: + ref_pos -= ol; + break; + + case BAM_CINS: + read_pos += ol; + break; + + case BAM_CPAD: + case BAM_CHARD_CLIP: + break; + } + } + } + + /* + * Call final consensus and export stats. 
+ */ + + if (stats->params->mode == ococo::mode_t::BATCH) { + stats->call_consensus(params->vcf_file, params->pileup_file); + + if (params->fasta_out_fn.size() > 0) { + int error_code = stats->save_fasta(params->fasta_out_fn); + if (error_code != 0) { + ococo::error("FASTA '%s' could not be saved.\n", + params->fasta_out_fn.c_str()); + return_code = EXIT_FAILURE ; + } + } + } + + if (params->stats_out_fn.size() > 0) { + ococo::info("Saving statistics ('%s').\n", params->stats_out_fn.c_str()); + + int error_code = stats->export_stats(params->stats_out_fn); + if (error_code != 0) { + ococo::error("Statistics could not be saved ('%s').\n", + params->stats_out_fn.c_str()); + return_code = EXIT_FAILURE; + } + } + } + + + + + /* + ////////////////////////////////////////////////////// + ////////////////////////////////////////////////////// + ////////////////////////////////////////////////////// + */ + + + template + caller_t::~caller_t(){ + + hts_itr_destroy(iter); + bam_destroy1(b); + bam_hdr_destroy(header); + + if (stats != nullptr) { + delete stats; + } + + if (return_code==EXIT_SUCCESS && correctly_initialized==true) { + ococo::info("Ococo successfully finished. Bye.\n"); + } + + } + } diff --git a/src/ococo_misc.cpp b/src/ococo_misc.cpp index 1f4343e..2ff51f5 100644 --- a/src/ococo_misc.cpp +++ b/src/ococo_misc.cpp @@ -1,50 +1,50 @@ #include "ococo_misc.h" void ococo::print_version(){ - std::cout << "Program: Ococo (online consensus caller, " - << "http://github.com/karel-brinda/ococo)." << std::endl; - std::cout << "Version: " << OCOCO_VERSION << std::endl; + std::cout << "Program: Ococo (online consensus caller, " + << "http://github.com/karel-brinda/ococo)." << std::endl; + std::cout << "Version: " << OCOCO_VERSION << std::endl; } void ococo::fatal_error(const char *format, ...) 
{ - va_list args; - va_start(args, format); - fprintf(stderr, "[ococo:fatal-error]: "); - vfprintf(stderr, format, args); - va_end(args); + va_list args; + va_start(args, format); + fprintf(stderr, "[ococo:fatal-error]: "); + vfprintf(stderr, format, args); + va_end(args); } void ococo::error(const char *format, ...) { - va_list args; - va_start(args, format); - fprintf(stderr, "[ococo:error]: "); - vfprintf(stderr, format, args); - va_end(args); + va_list args; + va_start(args, format); + fprintf(stderr, "[ococo:error]: "); + vfprintf(stderr, format, args); + va_end(args); } void ococo::warning(const char *format, ...) { - va_list args; - va_start(args, format); - fprintf(stderr, "[ococo:warning]: "); - vfprintf(stderr, format, args); - va_end(args); + va_list args; + va_start(args, format); + fprintf(stderr, "[ococo:warning]: "); + vfprintf(stderr, format, args); + va_end(args); } void ococo::info(const char *format, ...) { - va_list args; - va_start(args, format); - fprintf(stderr, "[ococo]: "); - vfprintf(stderr, format, args); - va_end(args); + va_list args; + va_start(args, format); + fprintf(stderr, "[ococo]: "); + vfprintf(stderr, format, args); + va_end(args); } bool ococo::file_exists(const std::string &fn) { - FILE *file; + FILE *file; - file = fopen(fn.c_str(), "r"); - if (file) { - fclose(file); - return true; - } - return false; + file = fopen(fn.c_str(), "r"); + if (file) { + fclose(file); + return true; + } + return false; } diff --git a/src/ococo_misc.h b/src/ococo_misc.h index 4f8c15a..0076ce4 100644 --- a/src/ococo_misc.h +++ b/src/ococo_misc.h @@ -10,29 +10,29 @@ namespace ococo { -void print_version(); + void print_version(); -void fatal_error(const char *format, ...); + void fatal_error(const char *format, ...); -void error(const char *format, ...); + void error(const char *format, ...); -void warning(const char *format, ...); + void warning(const char *format, ...); -void info(const char *format, ...); + void info(const char *format, ...); 
-bool file_exists(const std::string &fn); + bool file_exists(const std::string &fn); -/* - * Get a right full mask (right n bits set to 1) - * - * T - type - * size - number of 1's - */ -template -constexpr T right_full_mask() { - static_assert(size <= 8 * sizeof(T), "Exceeding data type borders."); - return (size == 0) ? 0 - : (((static_cast(0x1) << (size - 1)) - 1) << 1) | 1; -} + /* + * Get a right full mask (right n bits set to 1) + * + * T - type + * size - number of 1's + */ + template + constexpr T right_full_mask() { + static_assert(size <= 8 * sizeof(T), "Exceeding data type borders."); + return (size == 0) ? 0 + : (((static_cast(0x1) << (size - 1)) - 1) << 1) | 1; + } } diff --git a/src/ococo_params.cpp b/src/ococo_params.cpp index 10b6a61..55648d4 100644 --- a/src/ococo_params.cpp +++ b/src/ococo_params.cpp @@ -6,305 +6,305 @@ ****************************/ void ococo::params_t::init_default_values() { - verbose=false; - counters_str="ococo16"; - counter_configuration=OCOCO16; - mode=BATCH; - mode_str="batch"; - strategy=MAJORITY; - strategy_str="majority"; - min_mapq=1; - min_baseq=13; - init_ref_weight=0; - min_coverage=2; - majority_threshold=0.60; - - cons_alg[strategy_t::NO_UPDATES] = &cons_call_no_updates; - cons_alg[strategy_t::STOCHASTIC] = &cons_call_stoch; - cons_alg[strategy_t::STOCHASTIC_AMB] = &cons_call_stoch_amb; - cons_alg[strategy_t::MAJORITY] = &cons_call_maj; - cons_alg[strategy_t::MAJORITY_AMB] = &cons_call_maj_amb; - - vcf_file = nullptr; - pileup_file = nullptr; - fasta_out_file = nullptr; - sam_file = nullptr; - - correctly_initialized=true; - return_code=0; + verbose=false; + counters_str="ococo16"; + counter_configuration=OCOCO16; + mode=BATCH; + mode_str="batch"; + strategy=MAJORITY; + strategy_str="majority"; + min_mapq=1; + min_baseq=13; + init_ref_weight=0; + min_coverage=2; + majority_threshold=0.60; + + cons_alg[strategy_t::NO_UPDATES] = &cons_call_no_updates; + cons_alg[strategy_t::STOCHASTIC] = &cons_call_stoch; + 
cons_alg[strategy_t::STOCHASTIC_AMB] = &cons_call_stoch_amb; + cons_alg[strategy_t::MAJORITY] = &cons_call_maj; + cons_alg[strategy_t::MAJORITY_AMB] = &cons_call_maj_amb; + + vcf_file = nullptr; + pileup_file = nullptr; + fasta_out_file = nullptr; + sam_file = nullptr; + + correctly_initialized=true; + return_code=0; } ococo::params_t::params_t(){ - init_default_values(); + init_default_values(); } ococo::params_t::params_t(int argc, const char *argv[]){ - init_default_values(); - parse_commandline(argc, argv); + init_default_values(); + parse_commandline(argc, argv); } ococo::params_t::~params_t(){ - /* - * Close files. - */ - - if (sam_file != nullptr) { - int error_code = sam_close(sam_file); - if (error_code != 0) { - ococo::error("SAM file could not be closed.\n"); - return_code=-1; - } - } - - if (vcf_file != nullptr) { - int error_code = fclose(vcf_file); - if (error_code != 0) { - ococo::error("VCF file could not be closed.\n"); - return_code=-1; - } - } - - if (pileup_file != nullptr) { - int error_code = fclose(pileup_file); - if (error_code != 0) { - return_code=error_code; - ococo::error("Pileup file could not be closed.\n"); - return_code=-1; - } - } - - if (fasta_out_file != nullptr) { - int error_code = fclose(fasta_out_file); - if (error_code != 0) { - ococo::error("FASTA consensus file could not be closed.\n"); - return_code=-1; - } - } + /* + * Close files. 
+ */ + + if (sam_file != nullptr) { + int error_code = sam_close(sam_file); + if (error_code != 0) { + ococo::error("SAM file could not be closed.\n"); + return_code=-1; + } + } + + if (vcf_file != nullptr) { + int error_code = fclose(vcf_file); + if (error_code != 0) { + ococo::error("VCF file could not be closed.\n"); + return_code=-1; + } + } + + if (pileup_file != nullptr) { + int error_code = fclose(pileup_file); + if (error_code != 0) { + return_code=error_code; + ococo::error("Pileup file could not be closed.\n"); + return_code=-1; + } + } + + if (fasta_out_file != nullptr) { + int error_code = fclose(fasta_out_file); + if (error_code != 0) { + ococo::error("FASTA consensus file could not be closed.\n"); + return_code=-1; + } + } } void ococo::params_t::parse_commandline(int argc, const char *argv[]){ - - /* Save cmd parameters */ - - std::stringstream cmd; - for (int32_t i = 0; i < argc; i++) { - cmd << argv[i]; - if (i != argc - 1) { - cmd << " "; - } - } - command=cmd.str(); - - - /* Parse cmd parameters*/ - - try { - - namespace po = boost::program_options; - - po::options_description options_generic("Generic options"); - options_generic.add_options() - // - ("version,v", - "Print version and exit.") - // - ("help,h", - "Print this message and exit.") - // - ; - - po::options_description options_input("Input options"); - options_input.add_options() - // - ("input,i", - po::value(&sam_fn)->required(), - "Input SAM/BAM file (- for standard input).") - // - ( - "fasta-ref,f", po::value(&fasta_in_fn), - "Initial FASTA reference (if not provided, sequence of N's is " - "considered as the reference).") - // - ("stats-in,s",po::value(&stats_in_fn), - "Input statistics.") - // - ; - - po::options_description options_output("Output options"); - options_output.add_options() - // - ( - "fasta-cons,F", po::value(&fasta_out_fn), - "FASTA file with consensus.") - // - ( - "stats-out,S", po::value(&stats_out_fn), - "Outputs statistics.") - // - ( - "vcf-cons,V", 
po::value(&vcf_fn), - "VCF file with updates of consensus (- for standard output)." - ) - // - ( - "pileup,P", po::value(&pileup_fn), - "Truncated pileup (- for standard output).") - // - ("verbose", - "Verbose mode.") - // - ; - - po::options_description options_consensus("Parameters of consensus calling"); - options_consensus.add_options() - // - ( - "counters,x", po::value(&counters_str)->default_value(counters_str), - "Counters configuration: \n - ococo16 (3 bits per counter)\n - ococo32 (7 bits per counter)\n - ococo64 (15 bits per counter)") - // - ( - "mode,m", po::value(&mode_str)->default_value(mode_str), - "Mode: real-time / batch.") - // - ( - "strategy,t", po::value(&strategy_str)->default_value(strategy_str), - "Strategy for updates: no-updates / majority / stochastic." - ) - // - ("allow-amb,a", "Allow updates to ambiguous nucleotides.") - // - ( - "min-MQ,q", po::value(&min_mapq)->default_value(min_mapq), - "Skip alignments with mapping quality smaller than INT." - ) - // - ( - "min-BQ,Q", po::value(&min_baseq)->default_value(min_baseq), - "Skip bases with base quality smaller than INT." - ) - // - ( - "ref-weight,w", po::value(&init_ref_weight)->default_value(init_ref_weight), - "Initial counter value for nucleotides from the reference." - ) - // - ( - "min-coverage,c", - po::value(&min_coverage)->default_value(min_coverage), - "Minimum coverage required for update." - ) - // - ( - "majority-threshold,M", - po::value(&majority_threshold)->default_value(majority_threshold), - "Majority threshold." 
- ) - // - ; - - po::options_description options_all; - options_all.add(options_generic).add(options_input).add(options_output).add(options_consensus); - - po::variables_map vm; - try { - - po::store(po::command_line_parser(argc, argv) - .options(options_all) - .run(), - vm); // can throw - - if (vm.count("version")) { - std::cout<(&sam_fn)->required(), + "Input SAM/BAM file (- for standard input).") + // + ( + "fasta-ref,f", po::value(&fasta_in_fn), + "Initial FASTA reference (if not provided, sequence of N's is " + "considered as the reference).") + // + ("stats-in,s",po::value(&stats_in_fn), + "Input statistics.") + // + ; + + po::options_description options_output("Output options"); + options_output.add_options() + // + ( + "fasta-cons,F", po::value(&fasta_out_fn), + "FASTA file with consensus.") + // + ( + "stats-out,S", po::value(&stats_out_fn), + "Outputs statistics.") + // + ( + "vcf-cons,V", po::value(&vcf_fn), + "VCF file with updates of consensus (- for standard output)." + ) + // + ( + "pileup,P", po::value(&pileup_fn), + "Truncated pileup (- for standard output).") + // + ("verbose", + "Verbose mode.") + // + ; + + po::options_description options_consensus("Parameters of consensus calling"); + options_consensus.add_options() + // + ( + "counters,x", po::value(&counters_str)->default_value(counters_str), + "Counters configuration: \n - ococo16 (3 bits per counter)\n - ococo32 (7 bits per counter)\n - ococo64 (15 bits per counter)") + // + ( + "mode,m", po::value(&mode_str)->default_value(mode_str), + "Mode: real-time / batch.") + // + ( + "strategy,t", po::value(&strategy_str)->default_value(strategy_str), + "Strategy for updates: no-updates / majority / stochastic." + ) + // + ("allow-amb,a", "Allow updates to ambiguous nucleotides.") + // + ( + "min-MQ,q", po::value(&min_mapq)->default_value(min_mapq), + "Skip alignments with mapping quality smaller than INT." 
+ ) + // + ( + "min-BQ,Q", po::value(&min_baseq)->default_value(min_baseq), + "Skip bases with base quality smaller than INT." + ) + // + ( + "ref-weight,w", po::value(&init_ref_weight)->default_value(init_ref_weight), + "Initial counter value for nucleotides from the reference." + ) + // + ( + "min-coverage,c", + po::value(&min_coverage)->default_value(min_coverage), + "Minimum coverage required for update." + ) + // + ( + "majority-threshold,M", + po::value(&majority_threshold)->default_value(majority_threshold), + "Majority threshold." + ) + // + ; + + po::options_description options_all; + options_all.add(options_generic).add(options_input).add(options_output).add(options_consensus); + + po::variables_map vm; + try { + + po::store(po::command_line_parser(argc, argv) + .options(options_all) + .run(), + vm); // can throw + + if (vm.count("version")) { + std::cout< - struct stats_t { - static_assert(8 * sizeof(T) >= 4 * counter_size + refbase_size, - "Too large counter size (does not fit into the main type)."); - - int32_t n_seqs; - bool *seq_active; - int64_t *seq_len; - std::string *seq_name; - std::string *seq_comment; - T **seq_stats; - - params_t *params; - - // stats_t(); - stats_t(params_t *params, bam_hdr_t &h); - ~stats_t(); - - /******* - * I/O * - *******/ - - int import_stats(const std::string &stats_fn); - int export_stats(const std::string &stats_fn) const; - - // Call consensus probabilistically. - int call_consensus(FILE *vcf_file, FILE *pileup_file); - int call_consensus_position(FILE *vcf_file, FILE *pileup_file, - int32_t seqid, int64_t pos); - - // Loader header from a BAM. - int load_headers_bam_hdr(const bam_hdr_t &h); - // Load header and data from a FASTA file and initialize statistics. 
- int load_fasta(const std::string &fasta_fn); - int save_fasta(const std::string &fasta_fn) const; - - int print_vcf_header(FILE *vcf_file, std::string cmd, - std::string fasta) const; - int print_vcf_substitution(FILE *vcf_file, int32_t seqid, int64_t pos, - char old_base, char new_base, - const pos_stats_uncompr_t &psu) const; - - int print_pileup_line(FILE *pileup_file, int32_t seqid, int64_t pos, - const pos_stats_uncompr_t &psu) const; - - /************************* - * Statistics & counters * - *************************/ - - inline int set_nucl_nt256(int32_t seqid, int64_t pos, const char &nt256); - inline int get_nucl_nt256(int32_t seqid, int64_t pos, char &nt256) const; - - static T compress_position_stats(const pos_stats_uncompr_t &psu); - static void decompress_position_stats(T psc, pos_stats_uncompr_t &psu); - static T increment(T psc, nt4_t nt4); - - /*********************** - * Debuging & checking * - ***********************/ - - // Check if everything was initialized. - bool check_allocation() const; - - // Check if a BAM header corresponds to the stats. 
- bool check_headers_bam_hdr(const bam_hdr_t &h) const; - - void debug_print_counters() const; - - std::string debug_str_counters(int32_t seqid, int64_t pos) const; - }; - - template - stats_t::stats_t( - ococo::params_t *params, bam_hdr_t &h) - : n_seqs(h.n_targets), seq_active(new (std::nothrow) bool[n_seqs]()), - seq_len(new (std::nothrow) int64_t[n_seqs]()), - seq_name(new (std::nothrow) std::string[n_seqs]()), - seq_comment(new (std::nothrow) std::string[n_seqs]()), - seq_stats(new (std::nothrow) T *[n_seqs]()), params(params) { - for (int seqid = 0; seqid < n_seqs; seqid++) { - seq_len[seqid] = h.target_len[seqid]; - seq_active[seqid] = true; - seq_name[seqid] = std::string(h.target_name[seqid]); - - seq_stats[seqid] = new (std::nothrow) T[seq_len[seqid]](); - } - } - - template - stats_t::~stats_t() { - if (seq_stats != nullptr) { - for (int32_t seqid = 0; seqid < n_seqs; seqid++) { - delete[] seq_stats[seqid]; - } - } - delete[] seq_active; - delete[] seq_len; - delete[] seq_name; - delete[] seq_comment; - delete[] seq_stats; - } - - template - int stats_t::load_fasta( - const std::string &fasta_fn) { - gzFile fp; - kseq_t *seq; - int l; - fp = gzopen(fasta_fn.c_str(), "r"); - seq = kseq_init(fp); - - constexpr int32_t max_counter_value = - ococo::right_full_mask(); - - if(errno!=0 || fp==nullptr){ - ococo::error("File '%s' could not be opened.\n", - fasta_fn.c_str()); - return -1; - - } - - for (int seqid = 0; (l = kseq_read(seq)) >= 0; seqid++) { - if (seq_name[seqid].compare(seq->name.s) != 0) { - error("Sequence names in BAM/SAM and in FASTA do not correspond " - "('%s'!='%s').\n", - seq_name[seqid].c_str(), seq->name.s); - return -1; - } - - if (seq_len[seqid] != static_cast(seq->seq.l)) { - error("Sequence lengths in BAM/SAM and in FASTA do not correspond " - "(%" PRId64 "!=%" PRId64 ").\n", - static_cast(seq->seq.l), - static_cast(seq_len[seqid])); - return -1; - } - - if (seq->comment.l && seq_comment[seqid].empty()) { - seq_comment[seqid] = 
std::string(seq->comment.s); - } - - for (int64_t pos = 0; pos < static_cast(seq->seq.l); pos++) { - assert(seq_stats[seqid][pos] == 0); - - pos_stats_uncompr_t psu = {0, {0, 0, 0, 0}, 0}; - psu.nt16 = nt256_nt16[static_cast(seq->seq.s[pos])]; - - if (psu.nt16 != nt256_nt16[static_cast('N')]) { - for (int32_t i = 0; i < 4; i++) { - psu.counters[i] = ((0x1 << i) & psu.nt16) - ? std::min(params->init_ref_weight, - max_counter_value) - : 0; - } - } - - psu.sum = psu.counters[0] + psu.counters[1] + psu.counters[2] + - psu.counters[3]; - seq_stats[seqid][pos] = compress_position_stats(psu); - } - } - kseq_destroy(seq); // STEP 5: destroy seq - gzclose(fp); // STEP 6: close the file handler - return 0; - } - - template - int stats_t::save_fasta( - const std::string &fasta_fn) const { - assert(check_allocation()); - - FILE *fasta_file = nullptr; - fasta_file = fopen(fasta_fn.c_str(), "w+"); - - char fasta_buffer[fasta_line_l]; - for (int s = 0; s < n_seqs; s++) { - // printf("%s\n",seq_name[s]); - if (!seq_comment[s].empty()) { - fprintf(fasta_file, ">%s %s\n", seq_name[s].c_str(), - seq_comment[s].c_str()); - } else { - fprintf(fasta_file, ">%s\n", seq_name[s].c_str()); - } - - for (int64_t i = 0, j = 0; i < seq_len[s]; i++, j++) { - get_nucl_nt256(s, i, fasta_buffer[j]); - - if (j == fasta_line_l - 1 || i == seq_len[s] - 1) { - fwrite(fasta_buffer, 1, j + 1, fasta_file); - fwrite("\n", 1, 1, fasta_file); - j = -1; - } - } - } - - fclose(fasta_file); - - return 0; - } - - template - bool stats_t::check_allocation() const { - if (seq_active == nullptr || seq_len == nullptr || seq_stats == nullptr || - seq_name == nullptr || seq_comment == nullptr) { - return false; - } - - for (int i = 0; i < n_seqs; i++) { - if (seq_stats[i] == nullptr) { - return false; - } - } - - return true; - } - - template - bool stats_t::check_headers_bam_hdr( - const bam_hdr_t &h) const { - if (!check_allocation()) { - return false; - } - - for (int32_t seqid = 0; seqid < n_seqs; seqid++) { - if 
(seq_len[seqid] != static_cast(h.target_len[seqid])) { - return false; - } - if (seq_name[seqid].compare(h.target_name[seqid]) != 0) { - return false; - } - } - - return true; - } - - template - int stats_t::import_stats( - const std::string &stats_fn) { - assert(check_allocation()); - - int error_code = 0; - - FILE *fo = fopen(stats_fn.c_str(), "r"); - if (fo == nullptr) { - ococo::error("File with statistics could not be opened ('%s').\n", - stats_fn.c_str()); - return -1; - } - - /* number of seqs */ - int32_t n_seqs_loaded; - fread(&n_seqs_loaded, sizeof(int32_t), 1, fo); - - if (n_seqs_loaded != n_seqs) { - error("Numbers of sequences in stats and SAM/BAM do not correspond " - "%" PRId32 "!=%" PRId32 ").\n", - n_seqs_loaded, n_seqs); - return -1; - } - - for (int seqid = 0; seqid < n_seqs; seqid++) { - /* sequence */ - - single_seq_serial_t seq_ser; - fread(&seq_ser, sizeof(single_seq_serial_t), 1, fo); - - if (seq_ser.seq_active != seq_active[seqid]) { - error("Active sequences in stats and SAM/BAM do not correspond " - "(seqid %" PRId32 ").\n", - seqid); - return -1; - } - - if (seq_ser.seq_len != seq_len[seqid]) { - error("Sequence lengths in stats and SAM/BAM do not correspond " - "(seqid %" PRId32 ", %" PRId64 "!=%" PRId64 ").\n", - seqid, seq_ser.seq_len, seq_len[seqid]); - return -1; - } - - if (seq_name[seqid].compare(seq_ser.seq_name) != 0) { - error("Sequence names in stats and SAM/BAM do not correspond " - "(seqid %" PRId32 ", '%s'!='%s').\n", - seqid, seq_ser.seq_name, seq_name[seqid].c_str()); - return -1; - } - - fread(seq_stats[seqid], sizeof(T), seq_len[seqid], fo); - } - - error_code = fclose(fo); - if (error_code != 0) { - ococo::error("File with statistics could not be closed ('%s').\n", - stats_fn.c_str()); - return -1; - } - - return 0; - } - - template - int stats_t::export_stats( - const std::string &stats_fn) const { - assert(check_allocation()); - - int error_code = 0; - - FILE *fo = fopen(stats_fn.c_str(), "w+"); - if (fo == nullptr) 
{ - ococo::error("File with statistics could not be opened ('%s').\n", - stats_fn.c_str()); - return -1; - } - - /* number of seqs */ - fwrite(&n_seqs, sizeof(int32_t), 1, fo); - - for (int seqid = 0; seqid < n_seqs; seqid++) { - /* sequence */ - single_seq_serial_t seq_ser = {0}; - seq_ser.seq_active = seq_active[seqid]; - seq_ser.seq_len = seq_len[seqid]; - strncpy(seq_ser.seq_name, seq_name[seqid].c_str(), 999); - strncpy(seq_ser.seq_name, seq_name[seqid].c_str(), 999); - uint64_t written = 0; - written += fwrite(&seq_ser, sizeof(single_seq_serial_t), 1, fo); - written += fwrite(seq_stats[seqid], sizeof(T), seq_len[seqid], fo); - if (written != 1 + static_cast(seq_len[seqid])) { - ococo::error( - "Problem with writting to the file with statistics ('%s').\n", - stats_fn.c_str()); - return -1; - } - } - - error_code = fclose(fo); - if (error_code != 0) { - ococo::error("File with statistics could not be closed ('%s').\n", - stats_fn.c_str()); - return -1; - } - return 0; - } - - template - int stats_t::call_consensus( - FILE *vcf_file, FILE *pileup_file) { - assert(check_allocation()); - - for (int32_t seqid = 0; seqid < n_seqs; seqid++) { - for (int64_t pos = 0; pos < seq_len[seqid]; pos++) { - call_consensus_position(vcf_file, pileup_file, seqid, pos); - } - } - - return 0; - } - - template - int stats_t::call_consensus_position( - FILE *vcf_file, FILE *pileup_file, int32_t seqid, int64_t pos) { - pos_stats_uncompr_t psu; - decompress_position_stats(seq_stats[seqid][pos], psu); - - char old_base_nt256; - get_nucl_nt256(seqid, pos, old_base_nt256); - // const char new_base_nt256=cons_call_maj(psu); - const char new_base_nt256 = (params->cons_alg[params->strategy])(psu, *params); - - if (old_base_nt256 != new_base_nt256) { - if (vcf_file != nullptr) { - print_vcf_substitution(vcf_file, seqid, pos, old_base_nt256, - new_base_nt256, psu); - } - - set_nucl_nt256(seqid, pos, new_base_nt256); - } - - if(params->verbose){ - if (old_base_nt256 == new_base_nt256) { - if 
(vcf_file != nullptr) { - print_vcf_substitution(vcf_file, seqid, pos, old_base_nt256, - new_base_nt256, psu); - } - } - } - - if (pileup_file != nullptr) { - print_pileup_line(pileup_file, seqid, pos, psu); - } - - return 0; - } - - template - T stats_t::compress_position_stats( - const pos_stats_uncompr_t &psu) { - T psc = 0; - - for (int32_t i = 0; i < 4; i++) { - psc <<= counter_size; - psc |= psu.counters[i] & right_full_mask(); - } - - psc <<= refbase_size; - psc |= psu.nt16; - - return psc; - } - - template - void stats_t::decompress_position_stats( - T psc, pos_stats_uncompr_t &psu) { - psu.nt16 = psc & right_full_mask(); - psc >>= refbase_size; - - psu.sum = 0; - for (int32_t i = 3; i >= 0; i--) { - psu.counters[i] = psc & right_full_mask(); - psu.sum += psu.counters[i]; - psc >>= counter_size; - } - } - - template - int stats_t::print_vcf_header( - FILE *vcf_file, std::string cmd, std::string fasta) const { - assert(check_allocation()); - assert(vcf_file != nullptr); - - std::time_t tt = std::time(nullptr); - tm *tm = localtime(&tt); - - fprintf(vcf_file, "##fileformat=VCFv4.3\n" - "##fileDate=%04d%02d%02d\n" - "##source=Ococo\n", - tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday); - - if (!cmd.empty()) { - fprintf(vcf_file, "##ococo_command=%s\n", cmd.c_str()); - } - fprintf(vcf_file, "##ococo_stats_datatype_size=%zubits\n", 8 * sizeof(T)); - fprintf(vcf_file, "##ococo_counter_size=%dbits\n", counter_size); - - if (!fasta.empty()) { - fprintf(vcf_file, "##reference=%s\n", fasta.c_str()); - } - - for (int seqid = 0; seqid < n_seqs; seqid++) { - fprintf(vcf_file, "##contig=\n", - seq_name[seqid].c_str(), seq_len[seqid]); - } - - - fprintf(vcf_file, "##INFO=\n"); - fprintf(vcf_file, "##INFO=\n"); - fprintf(vcf_file, "##INFO=\n"); - fprintf(vcf_file, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); - - return 0; - } - - template - int stats_t::print_vcf_substitution( - FILE *vcf_file, int32_t seqid, int64_t pos, char old_base, char new_base, - const 
pos_stats_uncompr_t &psu) const { - assert(check_allocation()); - assert(vcf_file != nullptr); - - float alt_freq=1.0*psu.counters[nt256_nt4[static_cast(new_base)]]/psu.sum; - - fprintf(vcf_file, - "%s\t%" PRId64 "\t.\t%c\t%c\t100\tPASS\tAF=%.2f;CS=%" PRId32 ",%" PRId32 - ",%" PRId32 ",%" PRId32 ";COV=%" PRId32 "\n", - seq_name[seqid].c_str(), pos + 1, old_base, new_base, - round(alt_freq*100.0)/100, - psu.counters[0], psu.counters[1], psu.counters[2], psu.counters[3], - psu.sum); - - return 0; - } - - template - int stats_t::print_pileup_line( - FILE *pileup_file, int32_t seqid, int64_t pos, - const pos_stats_uncompr_t &psu) const { - assert(check_allocation()); - assert(pileup_file != nullptr); - - // todo: fix situation when depth is larger (use the printing buffer more - // timess) - - const int32_t max_depth = 1000; - - assert(psu.sum < max_depth); - char bases[max_depth]; - char qualities[max_depth]; - - char ref_nt256 = nt16_nt256[psu.nt16]; - - if (psu.sum == 0) { - return 0; - } - - if (ref_nt256 == '=') { - ref_nt256 = 'N'; - } - - int32_t j = 0; - - for (int32_t nt4 = 0; nt4 < 4; nt4++) { - const char filling_char = - nt4_nt16[nt4] == psu.nt16 ? '.' : nt4_nt256[nt4]; - for (int32_t i = 0; i < psu.counters[nt4]; i++, j++) { - bases[j] = filling_char; - qualities[j] = '~'; - } - } - - if (psu.sum >= max_depth) { - ococo::error("Too high coverage at position %" PRId64 - ". 
Pileup does not support coverage higher than %" PRId32 - ".", - pos, max_depth); - return -1; - } - - bases[j] = '\0'; - qualities[j] = '\0'; - - fprintf(pileup_file, "%s\t%" PRId64 "\t%c\t%" PRId32 "\t%s\t%s\n", - seq_name[seqid].c_str(), pos + 1, ref_nt256, psu.sum, bases, - qualities); - - return 0; - } - - template - std::string stats_t::debug_str_counters( - int32_t seqid, int64_t pos) const { - pos_stats_uncompr_t psu; - decompress_position_stats(seq_stats[seqid][pos], psu); - std::stringstream ss; - ss << "[" << nt16_nt256[psu.nt16] << "]" - << "(" << psu.counters[0] << "," << psu.counters[1] << "," - << psu.counters[2] << "," << psu.counters[3] << ")"; - return ss.str(); - } - - template - void stats_t::debug_print_counters() - const { - for (int seqid = 0; seqid < n_seqs; seqid++) { - fprintf(stderr, "%s\n", seq_name[seqid]); - for (int64_t pos = 0; pos < seq_len[seqid]; pos++) { - fprintf(stderr, "%8" PRId64 " %04x \n", pos, seq_stats[seqid][pos]); - } - } - } - - template - inline int stats_t::set_nucl_nt256( - int32_t seqid, int64_t pos, const char &nt256) { - nt16_t nt16 = nt256_nt16[static_cast(nt256)]; - T n_psc = seq_stats[seqid][pos]; - n_psc >>= refbase_size; - n_psc <<= refbase_size; - n_psc |= nt16 & right_full_mask(); - seq_stats[seqid][pos] = n_psc; - return 0; - } - - template - inline int stats_t::get_nucl_nt256( - int32_t seqid, int64_t pos, char &nt256) const { - nt256 = nt16_nt256[seq_stats[seqid][pos] & right_full_mask()]; - if (nt256=='='){ - nt256='N'; - } - return 0; - } - - template - T stats_t::increment(T psc, nt4_t nt4) { - assert(0 <= nt4 && nt4 < 4); - - pos_stats_uncompr_t psu; - decompress_position_stats(psc, psu); - - if (psu.counters[nt4] == right_full_mask()) { - psu.counters[0] >>= 1; - psu.counters[1] >>= 1; - psu.counters[2] >>= 1; - psu.counters[3] >>= 1; - } - - psu.counters[nt4]++; - - return compress_position_stats(psu); - } + KSEQ_INIT(gzFile, gzread) + + {} + + template + struct stats_t { + static_assert(8 * 
sizeof(T) >= 4 * counter_size + refbase_size, + "Too large counter size (does not fit into the main type)."); + + int32_t n_seqs; + bool *seq_active; + int64_t *seq_len; + std::string *seq_name; + std::string *seq_comment; + T **seq_stats; + + params_t *params; + + // stats_t(); + stats_t(params_t *params, bam_hdr_t &h); + ~stats_t(); + + /******* + * I/O * + *******/ + + int import_stats(const std::string &stats_fn); + int export_stats(const std::string &stats_fn) const; + + // Call consensus probabilistically. + int call_consensus(FILE *vcf_file, FILE *pileup_file); + int call_consensus_position(FILE *vcf_file, FILE *pileup_file, + int32_t seqid, int64_t pos); + + // Loader header from a BAM. + int load_headers_bam_hdr(const bam_hdr_t &h); + // Load header and data from a FASTA file and initialize statistics. + int load_fasta(const std::string &fasta_fn); + int save_fasta(const std::string &fasta_fn) const; + + int print_vcf_header(FILE *vcf_file, std::string cmd, + std::string fasta) const; + int print_vcf_substitution(FILE *vcf_file, int32_t seqid, int64_t pos, + char old_base, char new_base, + const pos_stats_uncompr_t &psu) const; + + int print_pileup_line(FILE *pileup_file, int32_t seqid, int64_t pos, + const pos_stats_uncompr_t &psu) const; + + /************************* + * Statistics & counters * + *************************/ + + inline int set_nucl_nt256(int32_t seqid, int64_t pos, const char &nt256); + inline int get_nucl_nt256(int32_t seqid, int64_t pos, char &nt256) const; + + static T compress_position_stats(const pos_stats_uncompr_t &psu); + static void decompress_position_stats(T psc, pos_stats_uncompr_t &psu); + static T increment(T psc, nt4_t nt4); + + /*********************** + * Debuging & checking * + ***********************/ + + // Check if everything was initialized. + bool check_allocation() const; + + // Check if a BAM header corresponds to the stats. 
+ bool check_headers_bam_hdr(const bam_hdr_t &h) const; + + void debug_print_counters() const; + + std::string debug_str_counters(int32_t seqid, int64_t pos) const; + }; + + template + stats_t::stats_t( + ococo::params_t *params, bam_hdr_t &h) + : n_seqs(h.n_targets), seq_active(new (std::nothrow) bool[n_seqs]()), + seq_len(new (std::nothrow) int64_t[n_seqs]()), + seq_name(new (std::nothrow) std::string[n_seqs]()), + seq_comment(new (std::nothrow) std::string[n_seqs]()), + seq_stats(new (std::nothrow) T *[n_seqs]()), params(params) { + for (int seqid = 0; seqid < n_seqs; seqid++) { + seq_len[seqid] = h.target_len[seqid]; + seq_active[seqid] = true; + seq_name[seqid] = std::string(h.target_name[seqid]); + + seq_stats[seqid] = new (std::nothrow) T[seq_len[seqid]](); + } + } + + template + stats_t::~stats_t() { + if (seq_stats != nullptr) { + for (int32_t seqid = 0; seqid < n_seqs; seqid++) { + delete[] seq_stats[seqid]; + } + } + delete[] seq_active; + delete[] seq_len; + delete[] seq_name; + delete[] seq_comment; + delete[] seq_stats; + } + + template + int stats_t::load_fasta( + const std::string &fasta_fn) { + gzFile fp; + kseq_t *seq; + int l; + fp = gzopen(fasta_fn.c_str(), "r"); + seq = kseq_init(fp); + + constexpr int32_t max_counter_value = + ococo::right_full_mask(); + + if(errno!=0 || fp==nullptr){ + ococo::error("File '%s' could not be opened.\n", + fasta_fn.c_str()); + return -1; + + } + + for (int seqid = 0; (l = kseq_read(seq)) >= 0; seqid++) { + if (seq_name[seqid].compare(seq->name.s) != 0) { + error("Sequence names in BAM/SAM and in FASTA do not correspond " + "('%s'!='%s').\n", + seq_name[seqid].c_str(), seq->name.s); + return -1; + } + + if (seq_len[seqid] != static_cast(seq->seq.l)) { + error("Sequence lengths in BAM/SAM and in FASTA do not correspond " + "(%" PRId64 "!=%" PRId64 ").\n", + static_cast(seq->seq.l), + static_cast(seq_len[seqid])); + return -1; + } + + if (seq->comment.l && seq_comment[seqid].empty()) { + seq_comment[seqid] = 
std::string(seq->comment.s); + } + + for (int64_t pos = 0; pos < static_cast(seq->seq.l); pos++) { + assert(seq_stats[seqid][pos] == 0); + + pos_stats_uncompr_t psu = {0, {0, 0, 0, 0}, 0}; + psu.nt16 = nt256_nt16[static_cast(seq->seq.s[pos])]; + + if (psu.nt16 != nt256_nt16[static_cast('N')]) { + for (int32_t i = 0; i < 4; i++) { + psu.counters[i] = ((0x1 << i) & psu.nt16) + ? std::min(params->init_ref_weight, + max_counter_value) + : 0; + } + } + + psu.sum = psu.counters[0] + psu.counters[1] + psu.counters[2] + + psu.counters[3]; + seq_stats[seqid][pos] = compress_position_stats(psu); + } + } + kseq_destroy(seq); // STEP 5: destroy seq + gzclose(fp); // STEP 6: close the file handler + return 0; + } + + template + int stats_t::save_fasta( + const std::string &fasta_fn) const { + assert(check_allocation()); + + FILE *fasta_file = nullptr; + fasta_file = fopen(fasta_fn.c_str(), "w+"); + + char fasta_buffer[fasta_line_l]; + for (int s = 0; s < n_seqs; s++) { + // printf("%s\n",seq_name[s]); + if (!seq_comment[s].empty()) { + fprintf(fasta_file, ">%s %s\n", seq_name[s].c_str(), + seq_comment[s].c_str()); + } else { + fprintf(fasta_file, ">%s\n", seq_name[s].c_str()); + } + + for (int64_t i = 0, j = 0; i < seq_len[s]; i++, j++) { + get_nucl_nt256(s, i, fasta_buffer[j]); + + if (j == fasta_line_l - 1 || i == seq_len[s] - 1) { + fwrite(fasta_buffer, 1, j + 1, fasta_file); + fwrite("\n", 1, 1, fasta_file); + j = -1; + } + } + } + + fclose(fasta_file); + + return 0; + } + + template + bool stats_t::check_allocation() const { + if (seq_active == nullptr || seq_len == nullptr || seq_stats == nullptr || + seq_name == nullptr || seq_comment == nullptr) { + return false; + } + + for (int i = 0; i < n_seqs; i++) { + if (seq_stats[i] == nullptr) { + return false; + } + } + + return true; + } + + template + bool stats_t::check_headers_bam_hdr( + const bam_hdr_t &h) const { + if (!check_allocation()) { + return false; + } + + for (int32_t seqid = 0; seqid < n_seqs; seqid++) { + if 
(seq_len[seqid] != static_cast(h.target_len[seqid])) { + return false; + } + if (seq_name[seqid].compare(h.target_name[seqid]) != 0) { + return false; + } + } + + return true; + } + + template + int stats_t::import_stats( + const std::string &stats_fn) { + assert(check_allocation()); + + int error_code = 0; + + FILE *fo = fopen(stats_fn.c_str(), "r"); + if (fo == nullptr) { + ococo::error("File with statistics could not be opened ('%s').\n", + stats_fn.c_str()); + return -1; + } + + /* number of seqs */ + int32_t n_seqs_loaded; + fread(&n_seqs_loaded, sizeof(int32_t), 1, fo); + + if (n_seqs_loaded != n_seqs) { + error("Numbers of sequences in stats and SAM/BAM do not correspond " + "%" PRId32 "!=%" PRId32 ").\n", + n_seqs_loaded, n_seqs); + return -1; + } + + for (int seqid = 0; seqid < n_seqs; seqid++) { + /* sequence */ + + single_seq_serial_t seq_ser; + fread(&seq_ser, sizeof(single_seq_serial_t), 1, fo); + + if (seq_ser.seq_active != seq_active[seqid]) { + error("Active sequences in stats and SAM/BAM do not correspond " + "(seqid %" PRId32 ").\n", + seqid); + return -1; + } + + if (seq_ser.seq_len != seq_len[seqid]) { + error("Sequence lengths in stats and SAM/BAM do not correspond " + "(seqid %" PRId32 ", %" PRId64 "!=%" PRId64 ").\n", + seqid, seq_ser.seq_len, seq_len[seqid]); + return -1; + } + + if (seq_name[seqid].compare(seq_ser.seq_name) != 0) { + error("Sequence names in stats and SAM/BAM do not correspond " + "(seqid %" PRId32 ", '%s'!='%s').\n", + seqid, seq_ser.seq_name, seq_name[seqid].c_str()); + return -1; + } + + fread(seq_stats[seqid], sizeof(T), seq_len[seqid], fo); + } + + error_code = fclose(fo); + if (error_code != 0) { + ococo::error("File with statistics could not be closed ('%s').\n", + stats_fn.c_str()); + return -1; + } + + return 0; + } + + template + int stats_t::export_stats( + const std::string &stats_fn) const { + assert(check_allocation()); + + int error_code = 0; + + FILE *fo = fopen(stats_fn.c_str(), "w+"); + if (fo == nullptr) 
{ + ococo::error("File with statistics could not be opened ('%s').\n", + stats_fn.c_str()); + return -1; + } + + /* number of seqs */ + fwrite(&n_seqs, sizeof(int32_t), 1, fo); + + for (int seqid = 0; seqid < n_seqs; seqid++) { + /* sequence */ + single_seq_serial_t seq_ser = {0}; + seq_ser.seq_active = seq_active[seqid]; + seq_ser.seq_len = seq_len[seqid]; + strncpy(seq_ser.seq_name, seq_name[seqid].c_str(), 999); + strncpy(seq_ser.seq_name, seq_name[seqid].c_str(), 999); + uint64_t written = 0; + written += fwrite(&seq_ser, sizeof(single_seq_serial_t), 1, fo); + written += fwrite(seq_stats[seqid], sizeof(T), seq_len[seqid], fo); + if (written != 1 + static_cast(seq_len[seqid])) { + ococo::error( + "Problem with writting to the file with statistics ('%s').\n", + stats_fn.c_str()); + return -1; + } + } + + error_code = fclose(fo); + if (error_code != 0) { + ococo::error("File with statistics could not be closed ('%s').\n", + stats_fn.c_str()); + return -1; + } + return 0; + } + + template + int stats_t::call_consensus( + FILE *vcf_file, FILE *pileup_file) { + assert(check_allocation()); + + for (int32_t seqid = 0; seqid < n_seqs; seqid++) { + for (int64_t pos = 0; pos < seq_len[seqid]; pos++) { + call_consensus_position(vcf_file, pileup_file, seqid, pos); + } + } + + return 0; + } + + template + int stats_t::call_consensus_position( + FILE *vcf_file, FILE *pileup_file, int32_t seqid, int64_t pos) { + pos_stats_uncompr_t psu; + decompress_position_stats(seq_stats[seqid][pos], psu); + + char old_base_nt256; + get_nucl_nt256(seqid, pos, old_base_nt256); + // const char new_base_nt256=cons_call_maj(psu); + const char new_base_nt256 = (params->cons_alg[params->strategy])(psu, *params); + + if (old_base_nt256 != new_base_nt256) { + if (vcf_file != nullptr) { + print_vcf_substitution(vcf_file, seqid, pos, old_base_nt256, + new_base_nt256, psu); + } + + set_nucl_nt256(seqid, pos, new_base_nt256); + } + + if(params->verbose){ + if (old_base_nt256 == new_base_nt256) { + if 
(vcf_file != nullptr) { + print_vcf_substitution(vcf_file, seqid, pos, old_base_nt256, + new_base_nt256, psu); + } + } + } + + if (pileup_file != nullptr) { + print_pileup_line(pileup_file, seqid, pos, psu); + } + + return 0; + } + + template + T stats_t::compress_position_stats( + const pos_stats_uncompr_t &psu) { + T psc = 0; + + for (int32_t i = 0; i < 4; i++) { + psc <<= counter_size; + psc |= psu.counters[i] & right_full_mask(); + } + + psc <<= refbase_size; + psc |= psu.nt16; + + return psc; + } + + template + void stats_t::decompress_position_stats( + T psc, pos_stats_uncompr_t &psu) { + psu.nt16 = psc & right_full_mask(); + psc >>= refbase_size; + + psu.sum = 0; + for (int32_t i = 3; i >= 0; i--) { + psu.counters[i] = psc & right_full_mask(); + psu.sum += psu.counters[i]; + psc >>= counter_size; + } + } + + template + int stats_t::print_vcf_header( + FILE *vcf_file, std::string cmd, std::string fasta) const { + assert(check_allocation()); + assert(vcf_file != nullptr); + + std::time_t tt = std::time(nullptr); + tm *tm = localtime(&tt); + + fprintf(vcf_file, "##fileformat=VCFv4.3\n" + "##fileDate=%04d%02d%02d\n" + "##source=Ococo\n", + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday); + + if (!cmd.empty()) { + fprintf(vcf_file, "##ococo_command=%s\n", cmd.c_str()); + } + fprintf(vcf_file, "##ococo_stats_datatype_size=%zubits\n", 8 * sizeof(T)); + fprintf(vcf_file, "##ococo_counter_size=%dbits\n", counter_size); + + if (!fasta.empty()) { + fprintf(vcf_file, "##reference=%s\n", fasta.c_str()); + } + + for (int seqid = 0; seqid < n_seqs; seqid++) { + fprintf(vcf_file, "##contig=\n", + seq_name[seqid].c_str(), seq_len[seqid]); + } + + + fprintf(vcf_file, "##INFO=\n"); + fprintf(vcf_file, "##INFO=\n"); + fprintf(vcf_file, "##INFO=\n"); + fprintf(vcf_file, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); + + return 0; + } + + template + int stats_t::print_vcf_substitution( + FILE *vcf_file, int32_t seqid, int64_t pos, char old_base, char new_base, + const 
pos_stats_uncompr_t &psu) const { + assert(check_allocation()); + assert(vcf_file != nullptr); + + float alt_freq=1.0*psu.counters[nt256_nt4[static_cast(new_base)]]/psu.sum; + + fprintf(vcf_file, + "%s\t%" PRId64 "\t.\t%c\t%c\t100\tPASS\tAF=%.2f;CS=%" PRId32 ",%" PRId32 + ",%" PRId32 ",%" PRId32 ";COV=%" PRId32 "\n", + seq_name[seqid].c_str(), pos + 1, old_base, new_base, + round(alt_freq*100.0)/100, + psu.counters[0], psu.counters[1], psu.counters[2], psu.counters[3], + psu.sum); + + return 0; + } + + template + int stats_t::print_pileup_line( + FILE *pileup_file, int32_t seqid, int64_t pos, + const pos_stats_uncompr_t &psu) const { + assert(check_allocation()); + assert(pileup_file != nullptr); + + // todo: fix situation when depth is larger (use the printing buffer more + // timess) + + const int32_t max_depth = 1000; + + assert(psu.sum < max_depth); + char bases[max_depth]; + char qualities[max_depth]; + + char ref_nt256 = nt16_nt256[psu.nt16]; + + if (psu.sum == 0) { + return 0; + } + + if (ref_nt256 == '=') { + ref_nt256 = 'N'; + } + + int32_t j = 0; + + for (int32_t nt4 = 0; nt4 < 4; nt4++) { + const char filling_char = + nt4_nt16[nt4] == psu.nt16 ? '.' : nt4_nt256[nt4]; + for (int32_t i = 0; i < psu.counters[nt4]; i++, j++) { + bases[j] = filling_char; + qualities[j] = '~'; + } + } + + if (psu.sum >= max_depth) { + ococo::error("Too high coverage at position %" PRId64 + ". 
Pileup does not support coverage higher than %" PRId32 + ".", + pos, max_depth); + return -1; + } + + bases[j] = '\0'; + qualities[j] = '\0'; + + fprintf(pileup_file, "%s\t%" PRId64 "\t%c\t%" PRId32 "\t%s\t%s\n", + seq_name[seqid].c_str(), pos + 1, ref_nt256, psu.sum, bases, + qualities); + + return 0; + } + + template + std::string stats_t::debug_str_counters( + int32_t seqid, int64_t pos) const { + pos_stats_uncompr_t psu; + decompress_position_stats(seq_stats[seqid][pos], psu); + std::stringstream ss; + ss << "[" << nt16_nt256[psu.nt16] << "]" + << "(" << psu.counters[0] << "," << psu.counters[1] << "," + << psu.counters[2] << "," << psu.counters[3] << ")"; + return ss.str(); + } + + template + void stats_t::debug_print_counters() + const { + for (int seqid = 0; seqid < n_seqs; seqid++) { + fprintf(stderr, "%s\n", seq_name[seqid]); + for (int64_t pos = 0; pos < seq_len[seqid]; pos++) { + fprintf(stderr, "%8" PRId64 " %04x \n", pos, seq_stats[seqid][pos]); + } + } + } + + template + inline int stats_t::set_nucl_nt256( + int32_t seqid, int64_t pos, const char &nt256) { + nt16_t nt16 = nt256_nt16[static_cast(nt256)]; + T n_psc = seq_stats[seqid][pos]; + n_psc >>= refbase_size; + n_psc <<= refbase_size; + n_psc |= nt16 & right_full_mask(); + seq_stats[seqid][pos] = n_psc; + return 0; + } + + template + inline int stats_t::get_nucl_nt256( + int32_t seqid, int64_t pos, char &nt256) const { + nt256 = nt16_nt256[seq_stats[seqid][pos] & right_full_mask()]; + if (nt256=='='){ + nt256='N'; + } + return 0; + } + + template + T stats_t::increment(T psc, nt4_t nt4) { + assert(0 <= nt4 && nt4 < 4); + + pos_stats_uncompr_t psu; + decompress_position_stats(psc, psu); + + if (psu.counters[nt4] == right_full_mask()) { + psu.counters[0] >>= 1; + psu.counters[1] >>= 1; + psu.counters[2] >>= 1; + psu.counters[3] >>= 1; + } + + psu.counters[nt4]++; + + return compress_position_stats(psu); + } } diff --git a/src/ococo_types.h b/src/ococo_types.h index 31295b8..e161c5a 100644 --- 
a/src/ococo_types.h +++ b/src/ococo_types.h @@ -5,84 +5,85 @@ namespace ococo { -const int fasta_line_l = 50; -const int stats_delim_l = 10; - -typedef uint8_t nt4_t; -typedef uint8_t nt16_t; -typedef uint8_t nt256_t; - -/****************** - * * - * Structures * - * * - ******************/ - -/***************** - *** Auxiliary *** - *****************/ - -struct single_seq_serial_t { - bool seq_active; - int64_t seq_len; - char seq_name[1000]; - char seq_comment[1000]; -}; - -/************************************ - *** Single position uncompressed *** - ************************************/ - -struct pos_stats_uncompr_t { - nt16_t nt16; - - int32_t counters[4]; - int32_t sum; -}; - -/************************** - *** Translation tables *** - **************************/ - -static const uint8_t nt256_nt4[] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; - -static const uint8_t nt16_nt4[] = {4, 0, 1, 4, 2, 4, 4, 4, - 3, 4, 4, 4, 4, 4, 4, 4}; - -static const uint8_t nt256_nt16[] = { - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 1, 2, 4, 8, 15, 15, 15, 15, 15, 15, 15, 15, 15, 0 /*=*/, 
15, 15, - 15, 1, 14, 2, 13, 15, 15, 4, 11, 15, 15, 12, 15, 3, 15, 15, - 15, 15, 5, 6, 8, 15, 7, 9, 15, 10, 15, 15, 15, 15, 15, 15, - 15, 1, 14, 2, 13, 15, 15, 4, 11, 15, 15, 12, 15, 3, 15, 15, - 15, 15, 5, 6, 8, 15, 7, 9, 15, 10, 15, 15, 15, 15, 15, 15, - - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; - -static const uint8_t nt16_nt256[] = "=ACMGRSVTWYHKDBN"; - -static const uint8_t nt4_nt256[] = "ACGTN"; - -static const uint8_t nt4_nt16[] = {1, 2, 4, 8, 15}; - -} \ No newline at end of file + const int fasta_line_l = 50; + const int stats_delim_l = 10; + + typedef uint8_t nt4_t; + typedef uint8_t nt16_t; + typedef uint8_t nt256_t; + + /****************** + * * + * Structures * + * * + ******************/ + + /***************** + *** Auxiliary *** + *****************/ + + struct single_seq_serial_t { + bool seq_active; + int64_t seq_len; + char seq_name[1000]; + char seq_comment[1000]; + }; + + /************************************ + *** Single position uncompressed *** + ************************************/ + + struct pos_stats_uncompr_t { + nt16_t nt16; + + int32_t counters[4]; + int32_t sum; + }; + + /************************** + *** Translation tables *** + **************************/ + + static const uint8_t nt256_nt4[] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; + + static const uint8_t nt16_nt4[] = {4, 0, 1, 4, 2, 4, 4, 4, + 3, 4, 4, 4, 4, 4, 4, 4}; + + static const uint8_t nt256_nt16[] = { + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 1, 2, 4, 8, 15, 15, 15, 15, 15, 15, 15, 15, 15, 0 /*=*/, 15, 15, + 15, 1, 14, 2, 13, 15, 15, 4, 11, 15, 15, 12, 15, 3, 15, 15, + 15, 15, 5, 6, 8, 15, 7, 9, 15, 10, 15, 15, 15, 15, 15, 15, + 15, 1, 14, 2, 13, 15, 15, 4, 11, 15, 15, 12, 15, 3, 15, 15, + 15, 15, 5, 6, 8, 15, 7, 9, 15, 10, 15, 15, 15, 15, 15, 15, + + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; + + static const uint8_t nt16_nt256[] = "=ACMGRSVTWYHKDBN"; + + static const uint8_t nt4_nt256[] = "ACGTN"; + + static const uint8_t nt4_nt16[] = {1, 2, 4, 8, 15}; + +} + diff --git a/src/version.h b/src/version.h index 1e00310..a3ec639 100644 --- a/src/version.h +++ b/src/version.h @@ -1,7 +1,6 @@ 
#pragma once namespace ococo{ - -static const char *OCOCO_VERSION = "0.1.1"; - + static const char *OCOCO_VERSION = "0.1.2"; } + From 01fab0f69f4f5fc2edb249001437170c32c89b70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 26 Aug 2016 23:48:20 +0200 Subject: [PATCH 06/32] Add support for log of updates in real-time mode --- CMakeLists.txt | 14 +- src/{ococo_caller.h => caller.h} | 34 +- src/{consensus_functions.h => consensus.h} | 4 +- src/{ococo_misc.cpp => misc.cpp} | 3 +- src/{ococo_misc.h => misc.h} | 0 src/ococo.h | 12 +- src/ococo_stats.h | 633 -------------------- src/{ococo_params.cpp => params.cpp} | 17 +- src/{ococo_params.h => params.h} | 31 +- src/stats.h | 657 +++++++++++++++++++++ src/{ococo_types.h => types.h} | 0 11 files changed, 736 insertions(+), 669 deletions(-) rename src/{ococo_caller.h => caller.h} (91%) rename src/{consensus_functions.h => consensus.h} (98%) rename src/{ococo_misc.cpp => misc.cpp} (97%) rename src/{ococo_misc.h => misc.h} (100%) delete mode 100644 src/ococo_stats.h rename src/{ococo_params.cpp => params.cpp} (95%) rename src/{ococo_params.h => params.h} (87%) create mode 100644 src/stats.h rename src/{ococo_types.h => types.h} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2272e51..6cd5fc8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,14 +90,14 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -pedantic") add_executable(ococo src/main.cpp) add_dependencies(ococo htslib ococo_core) add_library(ococo_core - src/ococo_caller.h - src/ococo_params.cpp - src/ococo_params.h - src/ococo_misc.cpp - src/ococo_misc.h + src/caller.h + src/params.cpp + src/params.h + src/misc.cpp + src/misc.h src/ococo.h - src/ococo_types.h - src/ococo_stats.h + src/types.h + src/stats.h src/version.h ) target_link_libraries(ococo_core ${Boost_LIBRARIES} ${CMAKE_BINARY_DIR}/ext/htslib/libhts.a ${ZLIB_LIBRARIES}) diff --git a/src/ococo_caller.h b/src/caller.h similarity index 91% rename from 
src/ococo_caller.h rename to src/caller.h index 4ceae18..971bb41 100644 --- a/src/ococo_caller.h +++ b/src/caller.h @@ -2,8 +2,8 @@ #include -#include "ococo_stats.h" -#include "ococo_params.h" +#include "stats.h" +#include "params.h" namespace ococo{ @@ -178,6 +178,25 @@ namespace ococo{ return; } } + + /* + * Open log file. + */ + + if (params->log_fn.size() > 0) { + params->log_file = fopen(params->log_fn.c_str(), "w+"); + + ococo::info("Opening log file ('%s').\n", params->log_fn.c_str()); + params->log_file = fopen(params->log_fn.c_str(), "w+"); + + if (params->log_file == nullptr) { + ococo::fatal_error( + "Problem with opening log file: '%s'.\n", + params->log_fn.c_str()); + correctly_initialized=false; + return; + } + } } /* @@ -215,8 +234,10 @@ namespace ococo{ int32_t r; b = bam_init1(); + int64_t n_upd0=0; + int64_t i_read=0; while ((r = sam_read1(params->sam_file, header, b)) >= 0) { - //const char *rname = bam_get_qname(b); + const char *rname = bam_get_qname(b); const uint8_t *seq = bam_get_seq(b); const uint8_t *qual = bam_get_qual(b); const uint32_t *cigar = bam_get_cigar(b); @@ -289,7 +310,14 @@ namespace ococo{ case BAM_CHARD_CLIP: break; } + + if (stats->params->log_file != nullptr){ + fprintf(stats->params->log_file, "%" PRIu64 "\t%s\t%" PRIu64 "\n", i_read, rname, stats->params->n_upd - n_upd0); + n_upd0=stats->params->n_upd; + } } + + i_read+=1; } /* diff --git a/src/consensus_functions.h b/src/consensus.h similarity index 98% rename from src/consensus_functions.h rename to src/consensus.h index 1756d91..477033a 100644 --- a/src/consensus_functions.h +++ b/src/consensus.h @@ -1,7 +1,7 @@ #pragma once -#include "ococo_types.h" -#include "ococo_params.h" +#include "types.h" +#include "params.h" #include "cassert" #include "cmath" diff --git a/src/ococo_misc.cpp b/src/misc.cpp similarity index 97% rename from src/ococo_misc.cpp rename to src/misc.cpp index 2ff51f5..a4adbd3 100644 --- a/src/ococo_misc.cpp +++ b/src/misc.cpp @@ -1,4 +1,4 @@ 
-#include "ococo_misc.h" +#include "misc.h" void ococo::print_version(){ std::cout << "Program: Ococo (online consensus caller, " @@ -48,3 +48,4 @@ bool ococo::file_exists(const std::string &fn) { } return false; } + diff --git a/src/ococo_misc.h b/src/misc.h similarity index 100% rename from src/ococo_misc.h rename to src/misc.h diff --git a/src/ococo.h b/src/ococo.h index fe66fbd..fc9471e 100644 --- a/src/ococo.h +++ b/src/ococo.h @@ -1,11 +1,11 @@ #pragma once -#include "ococo_misc.h" -#include "ococo_params.h" -#include "consensus_functions.h" -#include "ococo_types.h" -#include "ococo_stats.h" -#include "ococo_caller.h" +#include "misc.h" +#include "params.h" +#include "consensus.h" +#include "types.h" +#include "stats.h" +#include "caller.h" #include "version.h" diff --git a/src/ococo_stats.h b/src/ococo_stats.h deleted file mode 100644 index 8cfe0f8..0000000 --- a/src/ococo_stats.h +++ /dev/null @@ -1,633 +0,0 @@ -#pragma once - -#include - -#include "ococo.h" - -#include -#include -#include -#include -#include -#include - -#include -#include - - -/*********************** - *** Main statistics *** - ***********************/ - -namespace ococo { - - KSEQ_INIT(gzFile, gzread) - - {} - - template - struct stats_t { - static_assert(8 * sizeof(T) >= 4 * counter_size + refbase_size, - "Too large counter size (does not fit into the main type)."); - - int32_t n_seqs; - bool *seq_active; - int64_t *seq_len; - std::string *seq_name; - std::string *seq_comment; - T **seq_stats; - - params_t *params; - - // stats_t(); - stats_t(params_t *params, bam_hdr_t &h); - ~stats_t(); - - /******* - * I/O * - *******/ - - int import_stats(const std::string &stats_fn); - int export_stats(const std::string &stats_fn) const; - - // Call consensus probabilistically. - int call_consensus(FILE *vcf_file, FILE *pileup_file); - int call_consensus_position(FILE *vcf_file, FILE *pileup_file, - int32_t seqid, int64_t pos); - - // Loader header from a BAM. 
- int load_headers_bam_hdr(const bam_hdr_t &h); - // Load header and data from a FASTA file and initialize statistics. - int load_fasta(const std::string &fasta_fn); - int save_fasta(const std::string &fasta_fn) const; - - int print_vcf_header(FILE *vcf_file, std::string cmd, - std::string fasta) const; - int print_vcf_substitution(FILE *vcf_file, int32_t seqid, int64_t pos, - char old_base, char new_base, - const pos_stats_uncompr_t &psu) const; - - int print_pileup_line(FILE *pileup_file, int32_t seqid, int64_t pos, - const pos_stats_uncompr_t &psu) const; - - /************************* - * Statistics & counters * - *************************/ - - inline int set_nucl_nt256(int32_t seqid, int64_t pos, const char &nt256); - inline int get_nucl_nt256(int32_t seqid, int64_t pos, char &nt256) const; - - static T compress_position_stats(const pos_stats_uncompr_t &psu); - static void decompress_position_stats(T psc, pos_stats_uncompr_t &psu); - static T increment(T psc, nt4_t nt4); - - /*********************** - * Debuging & checking * - ***********************/ - - // Check if everything was initialized. - bool check_allocation() const; - - // Check if a BAM header corresponds to the stats. 
- bool check_headers_bam_hdr(const bam_hdr_t &h) const; - - void debug_print_counters() const; - - std::string debug_str_counters(int32_t seqid, int64_t pos) const; - }; - - template - stats_t::stats_t( - ococo::params_t *params, bam_hdr_t &h) - : n_seqs(h.n_targets), seq_active(new (std::nothrow) bool[n_seqs]()), - seq_len(new (std::nothrow) int64_t[n_seqs]()), - seq_name(new (std::nothrow) std::string[n_seqs]()), - seq_comment(new (std::nothrow) std::string[n_seqs]()), - seq_stats(new (std::nothrow) T *[n_seqs]()), params(params) { - for (int seqid = 0; seqid < n_seqs; seqid++) { - seq_len[seqid] = h.target_len[seqid]; - seq_active[seqid] = true; - seq_name[seqid] = std::string(h.target_name[seqid]); - - seq_stats[seqid] = new (std::nothrow) T[seq_len[seqid]](); - } - } - - template - stats_t::~stats_t() { - if (seq_stats != nullptr) { - for (int32_t seqid = 0; seqid < n_seqs; seqid++) { - delete[] seq_stats[seqid]; - } - } - delete[] seq_active; - delete[] seq_len; - delete[] seq_name; - delete[] seq_comment; - delete[] seq_stats; - } - - template - int stats_t::load_fasta( - const std::string &fasta_fn) { - gzFile fp; - kseq_t *seq; - int l; - fp = gzopen(fasta_fn.c_str(), "r"); - seq = kseq_init(fp); - - constexpr int32_t max_counter_value = - ococo::right_full_mask(); - - if(errno!=0 || fp==nullptr){ - ococo::error("File '%s' could not be opened.\n", - fasta_fn.c_str()); - return -1; - - } - - for (int seqid = 0; (l = kseq_read(seq)) >= 0; seqid++) { - if (seq_name[seqid].compare(seq->name.s) != 0) { - error("Sequence names in BAM/SAM and in FASTA do not correspond " - "('%s'!='%s').\n", - seq_name[seqid].c_str(), seq->name.s); - return -1; - } - - if (seq_len[seqid] != static_cast(seq->seq.l)) { - error("Sequence lengths in BAM/SAM and in FASTA do not correspond " - "(%" PRId64 "!=%" PRId64 ").\n", - static_cast(seq->seq.l), - static_cast(seq_len[seqid])); - return -1; - } - - if (seq->comment.l && seq_comment[seqid].empty()) { - seq_comment[seqid] = 
std::string(seq->comment.s); - } - - for (int64_t pos = 0; pos < static_cast(seq->seq.l); pos++) { - assert(seq_stats[seqid][pos] == 0); - - pos_stats_uncompr_t psu = {0, {0, 0, 0, 0}, 0}; - psu.nt16 = nt256_nt16[static_cast(seq->seq.s[pos])]; - - if (psu.nt16 != nt256_nt16[static_cast('N')]) { - for (int32_t i = 0; i < 4; i++) { - psu.counters[i] = ((0x1 << i) & psu.nt16) - ? std::min(params->init_ref_weight, - max_counter_value) - : 0; - } - } - - psu.sum = psu.counters[0] + psu.counters[1] + psu.counters[2] + - psu.counters[3]; - seq_stats[seqid][pos] = compress_position_stats(psu); - } - } - kseq_destroy(seq); // STEP 5: destroy seq - gzclose(fp); // STEP 6: close the file handler - return 0; - } - - template - int stats_t::save_fasta( - const std::string &fasta_fn) const { - assert(check_allocation()); - - FILE *fasta_file = nullptr; - fasta_file = fopen(fasta_fn.c_str(), "w+"); - - char fasta_buffer[fasta_line_l]; - for (int s = 0; s < n_seqs; s++) { - // printf("%s\n",seq_name[s]); - if (!seq_comment[s].empty()) { - fprintf(fasta_file, ">%s %s\n", seq_name[s].c_str(), - seq_comment[s].c_str()); - } else { - fprintf(fasta_file, ">%s\n", seq_name[s].c_str()); - } - - for (int64_t i = 0, j = 0; i < seq_len[s]; i++, j++) { - get_nucl_nt256(s, i, fasta_buffer[j]); - - if (j == fasta_line_l - 1 || i == seq_len[s] - 1) { - fwrite(fasta_buffer, 1, j + 1, fasta_file); - fwrite("\n", 1, 1, fasta_file); - j = -1; - } - } - } - - fclose(fasta_file); - - return 0; - } - - template - bool stats_t::check_allocation() const { - if (seq_active == nullptr || seq_len == nullptr || seq_stats == nullptr || - seq_name == nullptr || seq_comment == nullptr) { - return false; - } - - for (int i = 0; i < n_seqs; i++) { - if (seq_stats[i] == nullptr) { - return false; - } - } - - return true; - } - - template - bool stats_t::check_headers_bam_hdr( - const bam_hdr_t &h) const { - if (!check_allocation()) { - return false; - } - - for (int32_t seqid = 0; seqid < n_seqs; seqid++) { - if 
(seq_len[seqid] != static_cast(h.target_len[seqid])) { - return false; - } - if (seq_name[seqid].compare(h.target_name[seqid]) != 0) { - return false; - } - } - - return true; - } - - template - int stats_t::import_stats( - const std::string &stats_fn) { - assert(check_allocation()); - - int error_code = 0; - - FILE *fo = fopen(stats_fn.c_str(), "r"); - if (fo == nullptr) { - ococo::error("File with statistics could not be opened ('%s').\n", - stats_fn.c_str()); - return -1; - } - - /* number of seqs */ - int32_t n_seqs_loaded; - fread(&n_seqs_loaded, sizeof(int32_t), 1, fo); - - if (n_seqs_loaded != n_seqs) { - error("Numbers of sequences in stats and SAM/BAM do not correspond " - "%" PRId32 "!=%" PRId32 ").\n", - n_seqs_loaded, n_seqs); - return -1; - } - - for (int seqid = 0; seqid < n_seqs; seqid++) { - /* sequence */ - - single_seq_serial_t seq_ser; - fread(&seq_ser, sizeof(single_seq_serial_t), 1, fo); - - if (seq_ser.seq_active != seq_active[seqid]) { - error("Active sequences in stats and SAM/BAM do not correspond " - "(seqid %" PRId32 ").\n", - seqid); - return -1; - } - - if (seq_ser.seq_len != seq_len[seqid]) { - error("Sequence lengths in stats and SAM/BAM do not correspond " - "(seqid %" PRId32 ", %" PRId64 "!=%" PRId64 ").\n", - seqid, seq_ser.seq_len, seq_len[seqid]); - return -1; - } - - if (seq_name[seqid].compare(seq_ser.seq_name) != 0) { - error("Sequence names in stats and SAM/BAM do not correspond " - "(seqid %" PRId32 ", '%s'!='%s').\n", - seqid, seq_ser.seq_name, seq_name[seqid].c_str()); - return -1; - } - - fread(seq_stats[seqid], sizeof(T), seq_len[seqid], fo); - } - - error_code = fclose(fo); - if (error_code != 0) { - ococo::error("File with statistics could not be closed ('%s').\n", - stats_fn.c_str()); - return -1; - } - - return 0; - } - - template - int stats_t::export_stats( - const std::string &stats_fn) const { - assert(check_allocation()); - - int error_code = 0; - - FILE *fo = fopen(stats_fn.c_str(), "w+"); - if (fo == nullptr) 
{ - ococo::error("File with statistics could not be opened ('%s').\n", - stats_fn.c_str()); - return -1; - } - - /* number of seqs */ - fwrite(&n_seqs, sizeof(int32_t), 1, fo); - - for (int seqid = 0; seqid < n_seqs; seqid++) { - /* sequence */ - single_seq_serial_t seq_ser = {0}; - seq_ser.seq_active = seq_active[seqid]; - seq_ser.seq_len = seq_len[seqid]; - strncpy(seq_ser.seq_name, seq_name[seqid].c_str(), 999); - strncpy(seq_ser.seq_name, seq_name[seqid].c_str(), 999); - uint64_t written = 0; - written += fwrite(&seq_ser, sizeof(single_seq_serial_t), 1, fo); - written += fwrite(seq_stats[seqid], sizeof(T), seq_len[seqid], fo); - if (written != 1 + static_cast(seq_len[seqid])) { - ococo::error( - "Problem with writting to the file with statistics ('%s').\n", - stats_fn.c_str()); - return -1; - } - } - - error_code = fclose(fo); - if (error_code != 0) { - ococo::error("File with statistics could not be closed ('%s').\n", - stats_fn.c_str()); - return -1; - } - return 0; - } - - template - int stats_t::call_consensus( - FILE *vcf_file, FILE *pileup_file) { - assert(check_allocation()); - - for (int32_t seqid = 0; seqid < n_seqs; seqid++) { - for (int64_t pos = 0; pos < seq_len[seqid]; pos++) { - call_consensus_position(vcf_file, pileup_file, seqid, pos); - } - } - - return 0; - } - - template - int stats_t::call_consensus_position( - FILE *vcf_file, FILE *pileup_file, int32_t seqid, int64_t pos) { - pos_stats_uncompr_t psu; - decompress_position_stats(seq_stats[seqid][pos], psu); - - char old_base_nt256; - get_nucl_nt256(seqid, pos, old_base_nt256); - // const char new_base_nt256=cons_call_maj(psu); - const char new_base_nt256 = (params->cons_alg[params->strategy])(psu, *params); - - if (old_base_nt256 != new_base_nt256) { - if (vcf_file != nullptr) { - print_vcf_substitution(vcf_file, seqid, pos, old_base_nt256, - new_base_nt256, psu); - } - - set_nucl_nt256(seqid, pos, new_base_nt256); - } - - if(params->verbose){ - if (old_base_nt256 == new_base_nt256) { - if 
(vcf_file != nullptr) { - print_vcf_substitution(vcf_file, seqid, pos, old_base_nt256, - new_base_nt256, psu); - } - } - } - - if (pileup_file != nullptr) { - print_pileup_line(pileup_file, seqid, pos, psu); - } - - return 0; - } - - template - T stats_t::compress_position_stats( - const pos_stats_uncompr_t &psu) { - T psc = 0; - - for (int32_t i = 0; i < 4; i++) { - psc <<= counter_size; - psc |= psu.counters[i] & right_full_mask(); - } - - psc <<= refbase_size; - psc |= psu.nt16; - - return psc; - } - - template - void stats_t::decompress_position_stats( - T psc, pos_stats_uncompr_t &psu) { - psu.nt16 = psc & right_full_mask(); - psc >>= refbase_size; - - psu.sum = 0; - for (int32_t i = 3; i >= 0; i--) { - psu.counters[i] = psc & right_full_mask(); - psu.sum += psu.counters[i]; - psc >>= counter_size; - } - } - - template - int stats_t::print_vcf_header( - FILE *vcf_file, std::string cmd, std::string fasta) const { - assert(check_allocation()); - assert(vcf_file != nullptr); - - std::time_t tt = std::time(nullptr); - tm *tm = localtime(&tt); - - fprintf(vcf_file, "##fileformat=VCFv4.3\n" - "##fileDate=%04d%02d%02d\n" - "##source=Ococo\n", - tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday); - - if (!cmd.empty()) { - fprintf(vcf_file, "##ococo_command=%s\n", cmd.c_str()); - } - fprintf(vcf_file, "##ococo_stats_datatype_size=%zubits\n", 8 * sizeof(T)); - fprintf(vcf_file, "##ococo_counter_size=%dbits\n", counter_size); - - if (!fasta.empty()) { - fprintf(vcf_file, "##reference=%s\n", fasta.c_str()); - } - - for (int seqid = 0; seqid < n_seqs; seqid++) { - fprintf(vcf_file, "##contig=\n", - seq_name[seqid].c_str(), seq_len[seqid]); - } - - - fprintf(vcf_file, "##INFO=\n"); - fprintf(vcf_file, "##INFO=\n"); - fprintf(vcf_file, "##INFO=\n"); - fprintf(vcf_file, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); - - return 0; - } - - template - int stats_t::print_vcf_substitution( - FILE *vcf_file, int32_t seqid, int64_t pos, char old_base, char new_base, - const 
pos_stats_uncompr_t &psu) const { - assert(check_allocation()); - assert(vcf_file != nullptr); - - float alt_freq=1.0*psu.counters[nt256_nt4[static_cast(new_base)]]/psu.sum; - - fprintf(vcf_file, - "%s\t%" PRId64 "\t.\t%c\t%c\t100\tPASS\tAF=%.2f;CS=%" PRId32 ",%" PRId32 - ",%" PRId32 ",%" PRId32 ";COV=%" PRId32 "\n", - seq_name[seqid].c_str(), pos + 1, old_base, new_base, - round(alt_freq*100.0)/100, - psu.counters[0], psu.counters[1], psu.counters[2], psu.counters[3], - psu.sum); - - return 0; - } - - template - int stats_t::print_pileup_line( - FILE *pileup_file, int32_t seqid, int64_t pos, - const pos_stats_uncompr_t &psu) const { - assert(check_allocation()); - assert(pileup_file != nullptr); - - // todo: fix situation when depth is larger (use the printing buffer more - // timess) - - const int32_t max_depth = 1000; - - assert(psu.sum < max_depth); - char bases[max_depth]; - char qualities[max_depth]; - - char ref_nt256 = nt16_nt256[psu.nt16]; - - if (psu.sum == 0) { - return 0; - } - - if (ref_nt256 == '=') { - ref_nt256 = 'N'; - } - - int32_t j = 0; - - for (int32_t nt4 = 0; nt4 < 4; nt4++) { - const char filling_char = - nt4_nt16[nt4] == psu.nt16 ? '.' : nt4_nt256[nt4]; - for (int32_t i = 0; i < psu.counters[nt4]; i++, j++) { - bases[j] = filling_char; - qualities[j] = '~'; - } - } - - if (psu.sum >= max_depth) { - ococo::error("Too high coverage at position %" PRId64 - ". 
Pileup does not support coverage higher than %" PRId32 - ".", - pos, max_depth); - return -1; - } - - bases[j] = '\0'; - qualities[j] = '\0'; - - fprintf(pileup_file, "%s\t%" PRId64 "\t%c\t%" PRId32 "\t%s\t%s\n", - seq_name[seqid].c_str(), pos + 1, ref_nt256, psu.sum, bases, - qualities); - - return 0; - } - - template - std::string stats_t::debug_str_counters( - int32_t seqid, int64_t pos) const { - pos_stats_uncompr_t psu; - decompress_position_stats(seq_stats[seqid][pos], psu); - std::stringstream ss; - ss << "[" << nt16_nt256[psu.nt16] << "]" - << "(" << psu.counters[0] << "," << psu.counters[1] << "," - << psu.counters[2] << "," << psu.counters[3] << ")"; - return ss.str(); - } - - template - void stats_t::debug_print_counters() - const { - for (int seqid = 0; seqid < n_seqs; seqid++) { - fprintf(stderr, "%s\n", seq_name[seqid]); - for (int64_t pos = 0; pos < seq_len[seqid]; pos++) { - fprintf(stderr, "%8" PRId64 " %04x \n", pos, seq_stats[seqid][pos]); - } - } - } - - template - inline int stats_t::set_nucl_nt256( - int32_t seqid, int64_t pos, const char &nt256) { - nt16_t nt16 = nt256_nt16[static_cast(nt256)]; - T n_psc = seq_stats[seqid][pos]; - n_psc >>= refbase_size; - n_psc <<= refbase_size; - n_psc |= nt16 & right_full_mask(); - seq_stats[seqid][pos] = n_psc; - return 0; - } - - template - inline int stats_t::get_nucl_nt256( - int32_t seqid, int64_t pos, char &nt256) const { - nt256 = nt16_nt256[seq_stats[seqid][pos] & right_full_mask()]; - if (nt256=='='){ - nt256='N'; - } - return 0; - } - - template - T stats_t::increment(T psc, nt4_t nt4) { - assert(0 <= nt4 && nt4 < 4); - - pos_stats_uncompr_t psu; - decompress_position_stats(psc, psu); - - if (psu.counters[nt4] == right_full_mask()) { - psu.counters[0] >>= 1; - psu.counters[1] >>= 1; - psu.counters[2] >>= 1; - psu.counters[3] >>= 1; - } - - psu.counters[nt4]++; - - return compress_position_stats(psu); - } - -} diff --git a/src/ococo_params.cpp b/src/params.cpp similarity index 95% rename from 
src/ococo_params.cpp rename to src/params.cpp index 55648d4..d808864 100644 --- a/src/ococo_params.cpp +++ b/src/params.cpp @@ -1,5 +1,5 @@ -#include "ococo_params.h" -#include "consensus_functions.h" +#include "params.h" +#include "consensus.h" /**************************** *** Consensus parameters *** @@ -29,6 +29,9 @@ void ococo::params_t::init_default_values() { pileup_file = nullptr; fasta_out_file = nullptr; sam_file = nullptr; + log_file = nullptr; + + n_upd=0; correctly_initialized=true; return_code=0; @@ -80,6 +83,10 @@ ococo::params_t::~params_t(){ return_code=-1; } } + + if (log_file != nullptr) { + int error_code = fclose(log_file); + } } void ococo::params_t::parse_commandline(int argc, const char *argv[]){ @@ -150,6 +157,10 @@ void ococo::params_t::parse_commandline(int argc, const char *argv[]){ "pileup,P", po::value(&pileup_fn), "Truncated pileup (- for standard output).") // + ( + "log", po::value(&log_fn), + "Auxiliary log file.") + // ("verbose", "Verbose mode.") // @@ -160,7 +171,7 @@ void ococo::params_t::parse_commandline(int argc, const char *argv[]){ // ( "counters,x", po::value(&counters_str)->default_value(counters_str), - "Counters configuration: \n - ococo16 (3 bits per counter)\n - ococo32 (7 bits per counter)\n - ococo64 (15 bits per counter)") + "Counters configuration: \n - ococo16 (3b/counter, 16b/position)\n - ococo32 (7b/counter, 32b/position)\n - ococo64 (15b/counter, 64b/position)") // ( "mode,m", po::value(&mode_str)->default_value(mode_str), diff --git a/src/ococo_params.h b/src/params.h similarity index 87% rename from src/ococo_params.h rename to src/params.h index 1db8529..93c8566 100644 --- a/src/ococo_params.h +++ b/src/params.h @@ -1,7 +1,7 @@ #pragma once -#include "ococo_types.h" -#include "ococo_misc.h" +#include "types.h" +#include "misc.h" #include #include @@ -49,8 +49,8 @@ namespace ococo { std::string command; /* - Counter parameters - */ + * Counter parameters + */ counter_configuration_t counter_configuration; 
std::string counters_str; std::string counters_str_descr; @@ -58,36 +58,38 @@ namespace ococo { int32_t stats_bits_per_nucleotide; /* - Input parameters - */ + * Input parameters + */ std::string sam_fn; std::string fasta_in_fn; std::string stats_in_fn; /* - Output parameters - */ + * Output parameters + */ bool verbose; std::string vcf_fn; std::string fasta_out_fn; std::string stats_out_fn; std::string pileup_fn; + std::string log_fn; /* - Files - */ + * Files + */ FILE *vcf_file; FILE *pileup_file; FILE *fasta_out_file; samFile *sam_file; + FILE *log_file; /* - Consensus calling parameters - */ + * Consensus calling parameters + */ mode_t mode; strategy_t strategy; @@ -110,10 +112,11 @@ namespace ococo { /* auxiliary */ std::string strategy_str; std::string mode_str; + int64_t n_upd; /* - Array of consensus calling functions - */ + * Array of consensus calling functions + */ char (*cons_alg[strategy_t::count])(const pos_stats_uncompr_t &psu, const params_t ¶ms); diff --git a/src/stats.h b/src/stats.h new file mode 100644 index 0000000..390ed44 --- /dev/null +++ b/src/stats.h @@ -0,0 +1,657 @@ +#pragma once + +#include + +#include "ococo.h" + +#include +#include +#include +#include +#include +#include + +#include +#include + + +/*********************** + *** Main statistics *** + ***********************/ + +namespace ococo { + + KSEQ_INIT(gzFile, gzread); + + template + struct stats_t { + static_assert(8 * sizeof(T) >= 4 * counter_size + refbase_size, + "Too large counter size (does not fit into the main type)."); + + int32_t n_seqs; + bool *seq_active; + int64_t *seq_len; + std::string *seq_name; + std::string *seq_comment; + T **seq_stats; + + params_t *params; + + // stats_t(); + stats_t(params_t *params, bam_hdr_t &h); + ~stats_t(); + + /******* + * I/O * + *******/ + + int import_stats(const std::string &stats_fn); + int export_stats(const std::string &stats_fn) const; + + // Call consensus probabilistically. 
+ int call_consensus(FILE *vcf_file, FILE *pileup_file); + int call_consensus_position(FILE *vcf_file, FILE *pileup_file, + int32_t seqid, int64_t pos); + + // Loader header from a BAM. + int load_headers_bam_hdr(const bam_hdr_t &h); + // Load header and data from a FASTA file and initialize statistics. + int load_fasta(const std::string &fasta_fn); + int save_fasta(const std::string &fasta_fn) const; + + int print_vcf_header(FILE *vcf_file, std::string cmd, + std::string fasta) const; + int print_vcf_substitution(FILE *vcf_file, int32_t seqid, int64_t pos, + char old_base, char new_base, + const pos_stats_uncompr_t &psu) const; + + int print_pileup_line(FILE *pileup_file, int32_t seqid, int64_t pos, + const pos_stats_uncompr_t &psu) const; + + /************************* + * Statistics & counters * + *************************/ + + inline int set_nucl_nt256(int32_t seqid, int64_t pos, const char &nt256); + inline int get_nucl_nt256(int32_t seqid, int64_t pos, char &nt256) const; + + static T compress_position_stats(const pos_stats_uncompr_t &psu); + static void decompress_position_stats(T psc, pos_stats_uncompr_t &psu); + static T increment(T psc, nt4_t nt4); + + /*********************** + * Debuging & checking * + ***********************/ + + // Check if everything was initialized. + bool check_allocation() const; + + // Check if a BAM header corresponds to the stats. 
+ bool check_headers_bam_hdr(const bam_hdr_t &h) const; + + void debug_print_counters() const; + + std::string debug_str_counters(int32_t seqid, int64_t pos) const; + }; + + template + stats_t:: + stats_t(ococo::params_t *params, bam_hdr_t &h) + : n_seqs(h.n_targets), + seq_active(new (std::nothrow) bool[n_seqs]()), + seq_len(new (std::nothrow) int64_t[n_seqs]()), + seq_name(new (std::nothrow) std::string[n_seqs]()), + seq_comment(new (std::nothrow) std::string[n_seqs]()), + seq_stats(new (std::nothrow) T *[n_seqs]()), + params(params) { + + for (int seqid = 0; seqid < n_seqs; seqid++) { + seq_len[seqid] = h.target_len[seqid]; + seq_active[seqid] = true; + seq_name[seqid] = std::string(h.target_name[seqid]); + + seq_stats[seqid] = new (std::nothrow) T[seq_len[seqid]](); + } + } + + template + stats_t:: + ~stats_t() { + + if (seq_stats != nullptr) { + for (int32_t seqid = 0; seqid < n_seqs; seqid++) { + delete[] seq_stats[seqid]; + } + } + delete[] seq_active; + delete[] seq_len; + delete[] seq_name; + delete[] seq_comment; + delete[] seq_stats; + } + + template + int stats_t:: + load_fasta(const std::string &fasta_fn) { + + gzFile fp; + kseq_t *seq; + int l; + fp = gzopen(fasta_fn.c_str(), "r"); + seq = kseq_init(fp); + + constexpr int32_t max_counter_value = + ococo::right_full_mask(); + + if(errno!=0 || fp==nullptr){ + ococo::error("File '%s' could not be opened.\n", + fasta_fn.c_str()); + return -1; + + } + + for (int seqid = 0; (l = kseq_read(seq)) >= 0; seqid++) { + if (seq_name[seqid].compare(seq->name.s) != 0) { + error("Sequence names in BAM/SAM and in FASTA do not correspond " + "('%s'!='%s').\n", + seq_name[seqid].c_str(), seq->name.s); + return -1; + } + + if (seq_len[seqid] != static_cast(seq->seq.l)) { + error("Sequence lengths in BAM/SAM and in FASTA do not correspond " + "(%" PRId64 "!=%" PRId64 ").\n", + static_cast(seq->seq.l), + static_cast(seq_len[seqid])); + return -1; + } + + if (seq->comment.l && seq_comment[seqid].empty()) { + 
seq_comment[seqid] = std::string(seq->comment.s); + } + + for (int64_t pos = 0; pos < static_cast(seq->seq.l); pos++) { + assert(seq_stats[seqid][pos] == 0); + + pos_stats_uncompr_t psu = {0, {0, 0, 0, 0}, 0}; + psu.nt16 = nt256_nt16[static_cast(seq->seq.s[pos])]; + + if (psu.nt16 != nt256_nt16[static_cast('N')]) { + for (int32_t i = 0; i < 4; i++) { + psu.counters[i] = ((0x1 << i) & psu.nt16) + ? std::min(params->init_ref_weight, + max_counter_value) + : 0; + } + } + + psu.sum = psu.counters[0] + psu.counters[1] + psu.counters[2] + + psu.counters[3]; + seq_stats[seqid][pos] = compress_position_stats(psu); + } + } + kseq_destroy(seq); // STEP 5: destroy seq + gzclose(fp); // STEP 6: close the file handler + return 0; + } + + template + int stats_t:: + save_fasta( const std::string &fasta_fn) const { + + assert(check_allocation()); + + FILE *fasta_file = nullptr; + fasta_file = fopen(fasta_fn.c_str(), "w+"); + + char fasta_buffer[fasta_line_l]; + for (int s = 0; s < n_seqs; s++) { + // printf("%s\n",seq_name[s]); + if (!seq_comment[s].empty()) { + fprintf(fasta_file, ">%s %s\n", seq_name[s].c_str(), + seq_comment[s].c_str()); + } else { + fprintf(fasta_file, ">%s\n", seq_name[s].c_str()); + } + + for (int64_t i = 0, j = 0; i < seq_len[s]; i++, j++) { + get_nucl_nt256(s, i, fasta_buffer[j]); + + if (j == fasta_line_l - 1 || i == seq_len[s] - 1) { + fwrite(fasta_buffer, 1, j + 1, fasta_file); + fwrite("\n", 1, 1, fasta_file); + j = -1; + } + } + } + + fclose(fasta_file); + + return 0; + } + + template + bool stats_t:: + check_allocation() const { + + if (seq_active == nullptr || seq_len == nullptr || seq_stats == nullptr || + seq_name == nullptr || seq_comment == nullptr) { + return false; + } + + for (int i = 0; i < n_seqs; i++) { + if (seq_stats[i] == nullptr) { + return false; + } + } + + return true; + } + + template + bool stats_t:: + check_headers_bam_hdr(const bam_hdr_t &h) const { + + if (!check_allocation()) { + return false; + } + + for (int32_t seqid = 0; 
seqid < n_seqs; seqid++) { + if (seq_len[seqid] != static_cast(h.target_len[seqid])) { + return false; + } + if (seq_name[seqid].compare(h.target_name[seqid]) != 0) { + return false; + } + } + + return true; + } + + template + int stats_t:: + import_stats(const std::string &stats_fn) { + + assert(check_allocation()); + + int error_code = 0; + + FILE *fo = fopen(stats_fn.c_str(), "r"); + if (fo == nullptr) { + ococo::error("File with statistics could not be opened ('%s').\n", + stats_fn.c_str()); + return -1; + } + + /* number of seqs */ + int32_t n_seqs_loaded; + fread(&n_seqs_loaded, sizeof(int32_t), 1, fo); + + if (n_seqs_loaded != n_seqs) { + error("Numbers of sequences in stats and SAM/BAM do not correspond " + "%" PRId32 "!=%" PRId32 ").\n", + n_seqs_loaded, n_seqs); + return -1; + } + + for (int seqid = 0; seqid < n_seqs; seqid++) { + /* sequence */ + + single_seq_serial_t seq_ser; + fread(&seq_ser, sizeof(single_seq_serial_t), 1, fo); + + if (seq_ser.seq_active != seq_active[seqid]) { + error("Active sequences in stats and SAM/BAM do not correspond " + "(seqid %" PRId32 ").\n", + seqid); + return -1; + } + + if (seq_ser.seq_len != seq_len[seqid]) { + error("Sequence lengths in stats and SAM/BAM do not correspond " + "(seqid %" PRId32 ", %" PRId64 "!=%" PRId64 ").\n", + seqid, seq_ser.seq_len, seq_len[seqid]); + return -1; + } + + if (seq_name[seqid].compare(seq_ser.seq_name) != 0) { + error("Sequence names in stats and SAM/BAM do not correspond " + "(seqid %" PRId32 ", '%s'!='%s').\n", + seqid, seq_ser.seq_name, seq_name[seqid].c_str()); + return -1; + } + + fread(seq_stats[seqid], sizeof(T), seq_len[seqid], fo); + } + + error_code = fclose(fo); + if (error_code != 0) { + ococo::error("File with statistics could not be closed ('%s').\n", + stats_fn.c_str()); + return -1; + } + + return 0; + } + + template + int stats_t:: + export_stats(const std::string &stats_fn) const { + + assert(check_allocation()); + + int error_code = 0; + + FILE *fo = 
fopen(stats_fn.c_str(), "w+"); + if (fo == nullptr) { + ococo::error("File with statistics could not be opened ('%s').\n", + stats_fn.c_str()); + return -1; + } + + /* number of seqs */ + fwrite(&n_seqs, sizeof(int32_t), 1, fo); + + for (int seqid = 0; seqid < n_seqs; seqid++) { + /* sequence */ + single_seq_serial_t seq_ser = {0}; + seq_ser.seq_active = seq_active[seqid]; + seq_ser.seq_len = seq_len[seqid]; + strncpy(seq_ser.seq_name, seq_name[seqid].c_str(), 999); + strncpy(seq_ser.seq_name, seq_name[seqid].c_str(), 999); + uint64_t written = 0; + written += fwrite(&seq_ser, sizeof(single_seq_serial_t), 1, fo); + written += fwrite(seq_stats[seqid], sizeof(T), seq_len[seqid], fo); + if (written != 1 + static_cast(seq_len[seqid])) { + ococo::error( + "Problem with writting to the file with statistics ('%s').\n", + stats_fn.c_str()); + return -1; + } + } + + error_code = fclose(fo); + if (error_code != 0) { + ococo::error("File with statistics could not be closed ('%s').\n", + stats_fn.c_str()); + return -1; + } + return 0; + } + + template + int stats_t:: + call_consensus(FILE *vcf_file, FILE *pileup_file) { + assert(check_allocation()); + + for (int32_t seqid = 0; seqid < n_seqs; seqid++) { + for (int64_t pos = 0; pos < seq_len[seqid]; pos++) { + call_consensus_position(vcf_file, pileup_file, seqid, pos); + } + } + + return 0; + } + + template + int stats_t:: + call_consensus_position(FILE *vcf_file, FILE *pileup_file, int32_t seqid, int64_t pos) { + + pos_stats_uncompr_t psu; + decompress_position_stats(seq_stats[seqid][pos], psu); + + char old_base_nt256; + get_nucl_nt256(seqid, pos, old_base_nt256); + // const char new_base_nt256=cons_call_maj(psu); + const char new_base_nt256 = (params->cons_alg[params->strategy])(psu, *params); + + if (old_base_nt256 != new_base_nt256) { + if (vcf_file != nullptr) { + print_vcf_substitution(vcf_file, seqid, pos, old_base_nt256, + new_base_nt256, psu); + } + params->n_upd+=1; + + set_nucl_nt256(seqid, pos, new_base_nt256); + } 
+ + if(params->verbose){ + if (old_base_nt256 == new_base_nt256) { + if (vcf_file != nullptr) { + print_vcf_substitution(vcf_file, seqid, pos, old_base_nt256, + new_base_nt256, psu); + } + } + } + + if (pileup_file != nullptr) { + print_pileup_line(pileup_file, seqid, pos, psu); + } + + return 0; + } + + template + T stats_t:: + compress_position_stats(const pos_stats_uncompr_t &psu) { + + T psc = 0; + + for (int32_t i = 0; i < 4; i++) { + psc <<= counter_size; + psc |= psu.counters[i] & right_full_mask(); + } + + psc <<= refbase_size; + psc |= psu.nt16; + + return psc; + } + + template + void stats_t:: + decompress_position_stats(T psc, pos_stats_uncompr_t &psu) { + + psu.nt16 = psc & right_full_mask(); + psc >>= refbase_size; + + psu.sum = 0; + for (int32_t i = 3; i >= 0; i--) { + psu.counters[i] = psc & right_full_mask(); + psu.sum += psu.counters[i]; + psc >>= counter_size; + } + } + + template + int stats_t:: + print_vcf_header(FILE *vcf_file, std::string cmd, std::string fasta) const { + + assert(check_allocation()); + assert(vcf_file != nullptr); + + std::time_t tt = std::time(nullptr); + tm *tm = localtime(&tt); + + fprintf(vcf_file, "##fileformat=VCFv4.3\n" + "##fileDate=%04d%02d%02d\n" + "##source=Ococo\n", + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday); + + if (!cmd.empty()) { + fprintf(vcf_file, "##ococo_command=%s\n", cmd.c_str()); + } + fprintf(vcf_file, "##ococo_stats_datatype_size=%zubits\n", 8 * sizeof(T)); + fprintf(vcf_file, "##ococo_counter_size=%dbits\n", counter_size); + + if (!fasta.empty()) { + fprintf(vcf_file, "##reference=%s\n", fasta.c_str()); + } + + for (int seqid = 0; seqid < n_seqs; seqid++) { + fprintf(vcf_file, "##contig=\n", + seq_name[seqid].c_str(), seq_len[seqid]); + } + + + fprintf(vcf_file, "##INFO=\n"); + fprintf(vcf_file, "##INFO=\n"); + fprintf(vcf_file, "##INFO=\n"); + fprintf(vcf_file, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); + + return 0; + } + + template + int stats_t:: + print_vcf_substitution( FILE 
*vcf_file, int32_t seqid, + int64_t pos, char old_base, char new_base, + const pos_stats_uncompr_t &psu) const { + + assert(check_allocation()); + assert(vcf_file != nullptr); + + float alt_freq=1.0*psu.counters[nt256_nt4[static_cast(new_base)]]/psu.sum; + + fprintf(vcf_file, + "%s\t%" PRId64 "\t.\t%c\t%c\t100\tPASS\tAF=%.2f;CS=%" PRId32 ",%" PRId32 + ",%" PRId32 ",%" PRId32 ";COV=%" PRId32 "\n", + seq_name[seqid].c_str(), pos + 1, old_base, new_base, + round(alt_freq*100.0)/100, + psu.counters[0], psu.counters[1], psu.counters[2], psu.counters[3], + psu.sum); + + return 0; + } + + template + int stats_t:: + print_pileup_line(FILE *pileup_file, int32_t seqid, int64_t pos, + const pos_stats_uncompr_t &psu) const { + + assert(check_allocation()); + assert(pileup_file != nullptr); + + // todo: fix situation when depth is larger (use the printing buffer more + // timess) + + const int32_t max_depth = 1000; + + assert(psu.sum < max_depth); + char bases[max_depth]; + char qualities[max_depth]; + + char ref_nt256 = nt16_nt256[psu.nt16]; + + if (psu.sum == 0) { + return 0; + } + + if (ref_nt256 == '=') { + ref_nt256 = 'N'; + } + + int32_t j = 0; + + for (int32_t nt4 = 0; nt4 < 4; nt4++) { + const char filling_char = + nt4_nt16[nt4] == psu.nt16 ? '.' : nt4_nt256[nt4]; + for (int32_t i = 0; i < psu.counters[nt4]; i++, j++) { + bases[j] = filling_char; + qualities[j] = '~'; + } + } + + if (psu.sum >= max_depth) { + ococo::error("Too high coverage at position %" PRId64 + ". 
Pileup does not support coverage higher than %" PRId32 + ".", + pos, max_depth); + return -1; + } + + bases[j] = '\0'; + qualities[j] = '\0'; + + fprintf(pileup_file, "%s\t%" PRId64 "\t%c\t%" PRId32 "\t%s\t%s\n", + seq_name[seqid].c_str(), pos + 1, ref_nt256, psu.sum, bases, + qualities); + + return 0; + } + + template + std::string stats_t:: + debug_str_counters(int32_t seqid, int64_t pos) const { + + pos_stats_uncompr_t psu; + decompress_position_stats(seq_stats[seqid][pos], psu); + std::stringstream ss; + ss << "[" << nt16_nt256[psu.nt16] << "]" + << "(" << psu.counters[0] << "," << psu.counters[1] << "," + << psu.counters[2] << "," << psu.counters[3] << ")"; + return ss.str(); + } + + template + void stats_t:: + debug_print_counters() const { + + for (int seqid = 0; seqid < n_seqs; seqid++) { + fprintf(stderr, "%s\n", seq_name[seqid]); + for (int64_t pos = 0; pos < seq_len[seqid]; pos++) { + fprintf(stderr, "%8" PRId64 " %04x \n", pos, seq_stats[seqid][pos]); + } + } + } + + template + inline int stats_t:: + set_nucl_nt256(int32_t seqid, int64_t pos, const char &nt256) { + + nt16_t nt16 = nt256_nt16[static_cast(nt256)]; + T n_psc = seq_stats[seqid][pos]; + n_psc >>= refbase_size; + n_psc <<= refbase_size; + n_psc |= nt16 & right_full_mask(); + seq_stats[seqid][pos] = n_psc; + return 0; + } + + template + inline int stats_t:: + get_nucl_nt256(int32_t seqid, int64_t pos, char &nt256) const { + + nt256 = nt16_nt256[seq_stats[seqid][pos] & right_full_mask()]; + if (nt256=='='){ + nt256='N'; + } + return 0; + } + + template + T stats_t:: + increment(T psc, nt4_t nt4) { + + assert(0 <= nt4 && nt4 < 4); + + pos_stats_uncompr_t psu; + decompress_position_stats(psc, psu); + + if (psu.counters[nt4] == right_full_mask()) { + psu.counters[0] >>= 1; + psu.counters[1] >>= 1; + psu.counters[2] >>= 1; + psu.counters[3] >>= 1; + } + + psu.counters[nt4]++; + + return compress_position_stats(psu); + } + +} diff --git a/src/ococo_types.h b/src/types.h similarity index 100% rename 
from src/ococo_types.h rename to src/types.h From 9a51005020a19649d381e2185f8f124579857c95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 26 Aug 2016 23:49:53 +0200 Subject: [PATCH 07/32] Update readme --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 25645b5..0d38049 100644 --- a/README.md +++ b/README.md @@ -40,17 +40,18 @@ Input options: Output options: -F [ --fasta-cons ] arg FASTA file with consensus. -S [ --stats-out ] arg Outputs statistics. - -V [ --vcf-cons ] arg VCF file with updates of consensus (- + -V [ --vcf-cons ] arg VCF file with updates of consensus (- for standard output). - -P [ --pileup ] arg Truncated pileup (- for standard + -P [ --pileup ] arg Truncated pileup (- for standard output). + --log arg Auxiliary log file. --verbose Verbose mode. Parameters of consensus calling: - -x [ --counters ] arg (=ococo16) Counters configuration: - - ococo16 (3 bits per counter) - - ococo32 (7 bits per counter) - - ococo64 (15 bits per counter) + -x [ --counters ] arg (=ococo16) Counters configuration: + - ococo16 (3b/counter, 16b/position) + - ococo32 (7b/counter, 32b/position) + - ococo64 (15b/counter, 64b/position) -m [ --mode ] arg (=batch) Mode: real-time / batch. -t [ --strategy ] arg (=majority) Strategy for updates: no-updates / majority / stochastic. 
From a9864eceb642b622c5d8daa865fa6568e7ee6e76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 11:37:32 +0200 Subject: [PATCH 08/32] Add mscode configuration --- .gitignore | 7 +------ .vscode/c_cpp_properties.json | 33 +++++++++++++++++++++++++++++++++ .vscode/tasks.json | 9 +++++++++ 3 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 .vscode/c_cpp_properties.json create mode 100644 .vscode/tasks.json diff --git a/.gitignore b/.gitignore index 44daa44..88a8686 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,8 @@ ococo -ococo16 -ococo32 -a.fa - -Debug/ +.vscode/.* *~ *.swp - *.anjuta .anjuta* diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..417c7b8 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,33 @@ +{ + "configurations": [ + { + "name": "Mac", + "includePath": ["/usr/include"], + "browse" : { + "limitSymbolsToIncludedHeaders" : true, + "databaseFilename" : "" + } + }, + { + "name": "Linux", + "includePath": ["/usr/include"], + "browse" : { + "limitSymbolsToIncludedHeaders" : true, + "databaseFilename" : "" + } + }, + { + "name": "Win32", + "includePath": ["c:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/include"], + "browse" : { + "limitSymbolsToIncludedHeaders" : true, + "databaseFilename" : "" + } + } + ], + "clang_format" : { + "style" : "file", + "fallback-style" : "LLVM", + "sort-includes" : false + } +} diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..295b1d8 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,9 @@ +{ + // See https://go.microsoft.com/fwlink/?LinkId=733558 + // for the documentation about the tasks.json format + "version": "0.1.0", + "command": "make", + "isShellCommand": true, + "args": [], + "showOutput": "always" +} \ No newline at end of file From af5bfcfa9b7a6554a96dc7f92321c963b233d95c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: 
Sat, 27 Aug 2016 12:44:39 +0200 Subject: [PATCH 09/32] Add clang formatting conf. file --- .clang-format | 9 +++++++++ .vscode/settings.json | 4 ++++ 2 files changed, 13 insertions(+) create mode 100644 .clang-format create mode 100644 .vscode/settings.json diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..89eccd2 --- /dev/null +++ b/.clang-format @@ -0,0 +1,9 @@ +BasedOnStyle: Google +Language: Cpp +Standard: Cpp11 +AlignConsecutiveAssignments: true +SortIncludes: true +UseTab: Never +IndentWidth: 4 +IndentCaseLabels: true +NamespaceIndentation: None diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..b6c21ef --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,4 @@ +// Place your settings in this file to overwrite default and user settings. +{ + "editor.tabSize": 4 +} \ No newline at end of file From c4b35cbfff872e83c1193160ff626816e6041c72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 12:45:28 +0200 Subject: [PATCH 10/32] Reformat with respect to specified style --- src/caller.h | 741 ++++++++++++++-------------- src/consensus.h | 242 +++++---- src/main.cpp | 84 ++-- src/misc.cpp | 65 ++- src/misc.h | 39 +- src/ococo.h | 8 +- src/params.cpp | 597 +++++++++++----------- src/params.h | 211 ++++---- src/stats.h | 1251 +++++++++++++++++++++++------------------------ src/types.h | 162 +++--- src/version.h | 5 +- 11 files changed, 1676 insertions(+), 1729 deletions(-) diff --git a/src/caller.h b/src/caller.h index 971bb41..f4680af 100644 --- a/src/caller.h +++ b/src/caller.h @@ -1,379 +1,380 @@ #pragma once #include - -#include "stats.h" #include "params.h" +#include "stats.h" -namespace ococo{ - - template - struct caller_t { - bool correctly_initialized; - int return_code; - - hts_itr_t *iter; - - bam1_t *b; - bam_hdr_t *header; - - stats_t *stats; - - params_t *params; - - caller_t(params_t *params_); - ~caller_t(); - - bool check_read(int32_t seqid, 
int32_t flags, int32_t mapq); - void run(); - }; - - template - caller_t::caller_t(params_t *params_): - params(params_) - { - /* - * Read SAM headers. - */ - - ococo::info("Initialing SAM/BAM reader.\n"); - - correctly_initialized=true; - return_code=EXIT_SUCCESS; - - iter = nullptr; - b = nullptr; - header = nullptr; - stats = nullptr; - - params->sam_file = sam_open(params->sam_fn.c_str(), "r"); - if (params->sam_file == nullptr) { - ococo::fatal_error("Problem with opening SAM/BAM file ('%s').\n", - params->sam_fn.c_str()); - correctly_initialized=false; - return; - } - - if ((header = sam_hdr_read(params->sam_file)) == 0) { - ococo::fatal_error("SAM/BAM headers are missing or corrupted.\n"); - correctly_initialized=false; - return; - } - - stats = new (std::nothrow) stats_t(params, *header); - if (stats == nullptr || !stats->check_allocation()) { - ococo::fatal_error("Allocation of the main structure failed.\n"); - correctly_initialized=false; - return; - } - - /* - * Load FASTA and stats. 
- */ - - if (!params->stats_in_fn.empty() && !params->fasta_in_fn.empty()) { - ococo::fatal_error("Initial FASTA reference and input statistics " - "cannot be used at the same time.\n"); - correctly_initialized=false; - return; - } - - if (!params->stats_in_fn.empty()) { - ococo::info("Loading statistics ('%s').\n", params->stats_in_fn.c_str()); - - int error_code = stats->import_stats(params->stats_in_fn); - if (error_code != 0) { - ococo::fatal_error("Import of statistics failed (file '%s').\n", - params->stats_in_fn.c_str()); - correctly_initialized=false; - return; - } - } else { - - if (!params->fasta_in_fn.empty()) { - ococo::info("Loading reference ('%s').\n", params->fasta_in_fn.c_str()); - - int error_code = stats->load_fasta(params->fasta_in_fn); - if (error_code != 0) { - ococo::fatal_error("Loading of FASTA failed (file '%s').\n", - params->fasta_in_fn.c_str()); - correctly_initialized=false; - return; - } - } - - else { - ococo::info("Neither reference, nor statistics provided. Going to " - "consider sequence of N's as a reference.\n"); - } - } - - /* - * Open VCF file. - */ - - if (params->vcf_fn.size() > 0) { - ococo::info("Opening VCF stream ('%s').\n", params->vcf_fn.c_str()); - - if (params->vcf_fn == std::string("-")) { - params->vcf_file = stdout; - } else { - params->vcf_file = fopen(params->vcf_fn.c_str(), "w+"); - if (params->vcf_file == nullptr) { - ococo::fatal_error("Problem with opening VCF file '%s'.\n", - params->vcf_fn.c_str()); - correctly_initialized=false; - return; - } - } - - char buf[PATH_MAX + 1]; - char *res = realpath(params->fasta_in_fn.c_str(), buf); - std::string fasta_full_path; - if (res) { - fasta_full_path = std::string(buf); - } else { - fasta_full_path = params->fasta_in_fn; - } - - stats->print_vcf_header(params->vcf_file, params->command, fasta_full_path); - } - - /* - * Open pileup file. 
- */ - - if (params->pileup_fn.size() > 0) { - ococo::info("Opening pileup stream ('%s').\n", params->pileup_fn.c_str()); - - if (params->pileup_fn == std::string("-")) { - params->pileup_file = stdout; - } else { - params->pileup_file = fopen(params->pileup_fn.c_str(), "w+"); - if (params->pileup_file == nullptr) { - ococo::fatal_error("Problem with opening pileup file '%s'.\n", - params->pileup_fn.c_str()); - correctly_initialized=false; - return; - } - } - - } - - /* - * Open consensus FASTA file. - */ - - if (params->fasta_out_fn.size() > 0) { - params->fasta_out_file = fopen(params->fasta_out_fn.c_str(), "w+"); - - ococo::info("Opening consensus file ('%s').\n", params->fasta_out_fn.c_str()); - params->fasta_out_file = fopen(params->fasta_out_fn.c_str(), "w+"); - - if (params->fasta_out_file == nullptr) { - ococo::fatal_error( - "Problem with opening FASTA for consensus: '%s'.\n", - params->fasta_out_fn.c_str()); - correctly_initialized=false; - return; - } - } - - /* - * Open log file. - */ - - if (params->log_fn.size() > 0) { - params->log_file = fopen(params->log_fn.c_str(), "w+"); - - ococo::info("Opening log file ('%s').\n", params->log_fn.c_str()); - params->log_file = fopen(params->log_fn.c_str(), "w+"); - - if (params->log_file == nullptr) { - ococo::fatal_error( - "Problem with opening log file: '%s'.\n", - params->log_fn.c_str()); - correctly_initialized=false; - return; - } - } - } - - /* - ////////////////////////////////////////////////////// - ////////////////////////////////////////////////////// - ////////////////////////////////////////////////////// - */ - - template - bool caller_t::check_read(int32_t seqid, int32_t flags, int32_t mapq) { - if ((flags & BAM_FUNMAP) != 0) { - return false; - } - - if (!stats->seq_active[seqid]) { - return false; - } - - if (mapq < stats->params->min_mapq) { - return false; - } - - return true; - } - - - - template - void caller_t::run() { - - /* - * Process alignments. 
- */ - ococo::info("Starting the main loop.\n"); - - int32_t r; - b = bam_init1(); - int64_t n_upd0=0; - int64_t i_read=0; - while ((r = sam_read1(params->sam_file, header, b)) >= 0) { - const char *rname = bam_get_qname(b); - const uint8_t *seq = bam_get_seq(b); - const uint8_t *qual = bam_get_qual(b); - const uint32_t *cigar = bam_get_cigar(b); - const int32_t n_cigar = b->core.n_cigar; - const int32_t seqid = b->core.tid; - const int64_t mappping_pos = b->core.pos; - const int32_t mapq = b->core.qual; - const int32_t flags = b->core.flag; - - bool read_ok = check_read(seqid, flags, mapq); - if(!read_ok){ - continue; - } - - int32_t ref_pos = mappping_pos; - for (int32_t cigar_grp = 0, read_pos = 0; cigar_grp < n_cigar; - cigar_grp++) { - const int32_t op = bam_cigar_op(cigar[cigar_grp]); - const int32_t ol = bam_cigar_oplen(cigar[cigar_grp]); - - const int32_t next_read_pos = read_pos + ol; - switch (op) { - case BAM_CMATCH: - case BAM_CDIFF: - case BAM_CEQUAL: - - for (; read_pos < next_read_pos; read_pos++, ref_pos++) { - const uint8_t nt16 = bam_seqi(seq, read_pos); - const uint8_t nt4 = ococo::nt16_nt4[nt16]; - //const char nt256 = ococo::nt16_nt256[nt16]; - const int32_t bq = qual[read_pos]; - - if (bq != 0xff && bq < (stats->params->min_baseq)) { - continue; - } - - if (nt4 == 0x4) { - continue; - } - - stats->seq_stats[seqid][ref_pos] = - stats->increment(stats->seq_stats[seqid][ref_pos], nt4); - - if (stats->params->mode == ococo::mode_t::REALTIME) { - stats->call_consensus_position(params->vcf_file, params->pileup_file, - seqid, ref_pos); - } - } - - break; - - case BAM_CDEL: - case BAM_CREF_SKIP: - ref_pos += ol; - break; - - case BAM_CSOFT_CLIP: - read_pos += ol; - break; - - case BAM_CBACK: - ref_pos -= ol; - break; - - case BAM_CINS: - read_pos += ol; - break; - - case BAM_CPAD: - case BAM_CHARD_CLIP: - break; - } - - if (stats->params->log_file != nullptr){ - fprintf(stats->params->log_file, "%" PRIu64 "\t%s\t%" PRIu64 "\n", i_read, rname, 
stats->params->n_upd - n_upd0); - n_upd0=stats->params->n_upd; - } - } - - i_read+=1; - } - - /* - * Call final consensus and export stats. - */ - - if (stats->params->mode == ococo::mode_t::BATCH) { - stats->call_consensus(params->vcf_file, params->pileup_file); - - if (params->fasta_out_fn.size() > 0) { - int error_code = stats->save_fasta(params->fasta_out_fn); - if (error_code != 0) { - ococo::error("FASTA '%s' could not be saved.\n", - params->fasta_out_fn.c_str()); - return_code = EXIT_FAILURE ; - } - } - } - - if (params->stats_out_fn.size() > 0) { - ococo::info("Saving statistics ('%s').\n", params->stats_out_fn.c_str()); - - int error_code = stats->export_stats(params->stats_out_fn); - if (error_code != 0) { - ococo::error("Statistics could not be saved ('%s').\n", - params->stats_out_fn.c_str()); - return_code = EXIT_FAILURE; - } - } - } - - - - - /* - ////////////////////////////////////////////////////// - ////////////////////////////////////////////////////// - ////////////////////////////////////////////////////// - */ - - - template - caller_t::~caller_t(){ - - hts_itr_destroy(iter); - bam_destroy1(b); - bam_hdr_destroy(header); - - if (stats != nullptr) { - delete stats; - } +namespace ococo { + +template +struct caller_t { + bool correctly_initialized; + int return_code; + + hts_itr_t *iter; + + bam1_t *b; + bam_hdr_t *header; + + stats_t *stats; + + params_t *params; + + caller_t(params_t *params_); + ~caller_t(); + + bool check_read(int32_t seqid, int32_t flags, int32_t mapq); + void run(); +}; + +template +caller_t::caller_t(params_t *params_) + : params(params_) { + /* + * Read SAM headers. 
+ */ + + ococo::info("Initialing SAM/BAM reader.\n"); + + correctly_initialized = true; + return_code = EXIT_SUCCESS; + + iter = nullptr; + b = nullptr; + header = nullptr; + stats = nullptr; + + params->sam_file = sam_open(params->sam_fn.c_str(), "r"); + if (params->sam_file == nullptr) { + ococo::fatal_error("Problem with opening SAM/BAM file ('%s').\n", + params->sam_fn.c_str()); + correctly_initialized = false; + return; + } + + if ((header = sam_hdr_read(params->sam_file)) == 0) { + ococo::fatal_error("SAM/BAM headers are missing or corrupted.\n"); + correctly_initialized = false; + return; + } + + stats = new (std::nothrow) + stats_t(params, *header); + if (stats == nullptr || !stats->check_allocation()) { + ococo::fatal_error("Allocation of the main structure failed.\n"); + correctly_initialized = false; + return; + } + + /* + * Load FASTA and stats. + */ + + if (!params->stats_in_fn.empty() && !params->fasta_in_fn.empty()) { + ococo::fatal_error( + "Initial FASTA reference and input statistics " + "cannot be used at the same time.\n"); + correctly_initialized = false; + return; + } + + if (!params->stats_in_fn.empty()) { + ococo::info("Loading statistics ('%s').\n", + params->stats_in_fn.c_str()); + + int error_code = stats->import_stats(params->stats_in_fn); + if (error_code != 0) { + ococo::fatal_error("Import of statistics failed (file '%s').\n", + params->stats_in_fn.c_str()); + correctly_initialized = false; + return; + } + } else { + if (!params->fasta_in_fn.empty()) { + ococo::info("Loading reference ('%s').\n", + params->fasta_in_fn.c_str()); + + int error_code = stats->load_fasta(params->fasta_in_fn); + if (error_code != 0) { + ococo::fatal_error("Loading of FASTA failed (file '%s').\n", + params->fasta_in_fn.c_str()); + correctly_initialized = false; + return; + } + } + + else { + ococo::info( + "Neither reference, nor statistics provided. Going " + "to " + "consider sequence of N's as a reference.\n"); + } + } + + /* + * Open VCF file. 
+ */ + + if (params->vcf_fn.size() > 0) { + ococo::info("Opening VCF stream ('%s').\n", params->vcf_fn.c_str()); + + if (params->vcf_fn == std::string("-")) { + params->vcf_file = stdout; + } else { + params->vcf_file = fopen(params->vcf_fn.c_str(), "w+"); + if (params->vcf_file == nullptr) { + ococo::fatal_error("Problem with opening VCF file '%s'.\n", + params->vcf_fn.c_str()); + correctly_initialized = false; + return; + } + } + + char buf[PATH_MAX + 1]; + char *res = realpath(params->fasta_in_fn.c_str(), buf); + std::string fasta_full_path; + if (res) { + fasta_full_path = std::string(buf); + } else { + fasta_full_path = params->fasta_in_fn; + } + + stats->print_vcf_header(params->vcf_file, params->command, + fasta_full_path); + } + + /* + * Open pileup file. + */ + + if (params->pileup_fn.size() > 0) { + ococo::info("Opening pileup stream ('%s').\n", + params->pileup_fn.c_str()); + + if (params->pileup_fn == std::string("-")) { + params->pileup_file = stdout; + } else { + params->pileup_file = fopen(params->pileup_fn.c_str(), "w+"); + if (params->pileup_file == nullptr) { + ococo::fatal_error("Problem with opening pileup file '%s'.\n", + params->pileup_fn.c_str()); + correctly_initialized = false; + return; + } + } + } + + /* + * Open consensus FASTA file. + */ + + if (params->fasta_out_fn.size() > 0) { + params->fasta_out_file = fopen(params->fasta_out_fn.c_str(), "w+"); + + ococo::info("Opening consensus file ('%s').\n", + params->fasta_out_fn.c_str()); + params->fasta_out_file = fopen(params->fasta_out_fn.c_str(), "w+"); + + if (params->fasta_out_file == nullptr) { + ococo::fatal_error( + "Problem with opening FASTA for consensus: '%s'.\n", + params->fasta_out_fn.c_str()); + correctly_initialized = false; + return; + } + } + + /* + * Open log file. 
+ */ + + if (params->log_fn.size() > 0) { + params->log_file = fopen(params->log_fn.c_str(), "w+"); + + ococo::info("Opening log file ('%s').\n", params->log_fn.c_str()); + params->log_file = fopen(params->log_fn.c_str(), "w+"); + + if (params->log_file == nullptr) { + ococo::fatal_error("Problem with opening log file: '%s'.\n", + params->log_fn.c_str()); + correctly_initialized = false; + return; + } + } +} - if (return_code==EXIT_SUCCESS && correctly_initialized==true) { - ococo::info("Ococo successfully finished. Bye.\n"); - } +/* +////////////////////////////////////////////////////// +////////////////////////////////////////////////////// +////////////////////////////////////////////////////// +*/ + +template +bool caller_t::check_read(int32_t seqid, + int32_t flags, + int32_t mapq) { + if ((flags & BAM_FUNMAP) != 0) { + return false; + } + + if (!stats->seq_active[seqid]) { + return false; + } + + if (mapq < stats->params->min_mapq) { + return false; + } + + return true; +} - } +template +void caller_t::run() { + /* + * Process alignments. 
+ */ + ococo::info("Starting the main loop.\n"); + + int32_t r; + b = bam_init1(); + int64_t n_upd0 = 0; + int64_t i_read = 0; + while ((r = sam_read1(params->sam_file, header, b)) >= 0) { + const char *rname = bam_get_qname(b); + const uint8_t *seq = bam_get_seq(b); + const uint8_t *qual = bam_get_qual(b); + const uint32_t *cigar = bam_get_cigar(b); + const int32_t n_cigar = b->core.n_cigar; + const int32_t seqid = b->core.tid; + const int64_t mappping_pos = b->core.pos; + const int32_t mapq = b->core.qual; + const int32_t flags = b->core.flag; + + bool read_ok = check_read(seqid, flags, mapq); + if (!read_ok) { + continue; + } + + int32_t ref_pos = mappping_pos; + for (int32_t cigar_grp = 0, read_pos = 0; cigar_grp < n_cigar; + cigar_grp++) { + const int32_t op = bam_cigar_op(cigar[cigar_grp]); + const int32_t ol = bam_cigar_oplen(cigar[cigar_grp]); + + const int32_t next_read_pos = read_pos + ol; + switch (op) { + case BAM_CMATCH: + case BAM_CDIFF: + case BAM_CEQUAL: + + for (; read_pos < next_read_pos; read_pos++, ref_pos++) { + const uint8_t nt16 = bam_seqi(seq, read_pos); + const uint8_t nt4 = ococo::nt16_nt4[nt16]; + // const char nt256 = + // ococo::nt16_nt256[nt16]; + const int32_t bq = qual[read_pos]; + + if (bq != 0xff && bq < (stats->params->min_baseq)) { + continue; + } + + if (nt4 == 0x4) { + continue; + } + + stats->seq_stats[seqid][ref_pos] = stats->increment( + stats->seq_stats[seqid][ref_pos], nt4); + + if (stats->params->mode == ococo::mode_t::REALTIME) { + stats->call_consensus_position(params->vcf_file, + params->pileup_file, + seqid, ref_pos); + } + } + + break; + + case BAM_CDEL: + case BAM_CREF_SKIP: + ref_pos += ol; + break; + + case BAM_CSOFT_CLIP: + read_pos += ol; + break; + + case BAM_CBACK: + ref_pos -= ol; + break; + + case BAM_CINS: + read_pos += ol; + break; + + case BAM_CPAD: + case BAM_CHARD_CLIP: + break; + } + + if (stats->params->log_file != nullptr) { + fprintf(stats->params->log_file, + "%" PRIu64 "\t%s\t%" PRIu64 "\n", 
i_read, rname, + stats->params->n_upd - n_upd0); + n_upd0 = stats->params->n_upd; + } + } + + i_read += 1; + } + + /* + * Call final consensus and export stats. + */ + + if (stats->params->mode == ococo::mode_t::BATCH) { + stats->call_consensus(params->vcf_file, params->pileup_file); + + if (params->fasta_out_fn.size() > 0) { + int error_code = stats->save_fasta(params->fasta_out_fn); + if (error_code != 0) { + ococo::error("FASTA '%s' could not be saved.\n", + params->fasta_out_fn.c_str()); + return_code = EXIT_FAILURE; + } + } + } + + if (params->stats_out_fn.size() > 0) { + ococo::info("Saving statistics ('%s').\n", + params->stats_out_fn.c_str()); + + int error_code = stats->export_stats(params->stats_out_fn); + if (error_code != 0) { + ococo::error("Statistics could not be saved ('%s').\n", + params->stats_out_fn.c_str()); + return_code = EXIT_FAILURE; + } + } +} +/* +////////////////////////////////////////////////////// +////////////////////////////////////////////////////// +////////////////////////////////////////////////////// +*/ + +template +caller_t::~caller_t() { + hts_itr_destroy(iter); + bam_destroy1(b); + bam_hdr_destroy(header); + + if (stats != nullptr) { + delete stats; + } + + if (return_code == EXIT_SUCCESS && correctly_initialized == true) { + ococo::info("Ococo successfully finished. 
Bye.\n"); + } +} } diff --git a/src/consensus.h b/src/consensus.h index 477033a..e4e8e2b 100644 --- a/src/consensus.h +++ b/src/consensus.h @@ -1,134 +1,132 @@ #pragma once -#include "types.h" #include "params.h" +#include "types.h" #include "cassert" #include "cmath" namespace ococo { - inline char cons_call_no_updates(const pos_stats_uncompr_t &psu, - const params_t ¶ms) { - return nt16_nt256[psu.nt16]; - } - - inline char cons_call_stoch(const pos_stats_uncompr_t &psu, - const params_t ¶ms) { - if (psu.sum == 0) { - return nt16_nt256[psu.nt16]; - } - - if (psu.nt16 != nt256_nt16[static_cast('N')]) { - if (psu.sum < params.min_coverage + params.init_ref_weight) { - return nt16_nt256[psu.nt16]; - } - } - - const int32_t prefsum[] = { - psu.counters[0], psu.counters[0] + psu.counters[1], - psu.counters[0] + psu.counters[1] + psu.counters[2], - psu.counters[0] + psu.counters[1] + psu.counters[2] + psu.counters[3]}; - - assert(prefsum[3] == psu.sum); - - const int32_t rn = rand() % psu.sum; - for (int32_t i = 0; i < 4; i++) { - if (rn < prefsum[i]) { - return nt4_nt256[i]; - } - } - - return 'n'; - } - - inline char cons_call_stoch_amb(const pos_stats_uncompr_t &psu, - const params_t ¶ms) { - if (psu.sum == 0) { - return nt16_nt256[psu.nt16]; - } - - if (psu.nt16 != nt256_nt16[static_cast('N')]) { - if (psu.sum < params.min_coverage + params.init_ref_weight) { - return nt16_nt256[psu.nt16]; - } - } - - nt16_t nucl_nt16 = nt256_nt16[static_cast('N')]; - - while (nucl_nt16 == nt256_nt16[static_cast('N')]) { - nucl_nt16 = 0; - for (int32_t i = 0; i < 4; i++) { - const int32_t rn = rand() % psu.sum; - - if (rn < psu.counters[i]) { - nucl_nt16 |= nt4_nt16[i]; - } - } - } - - return nt16_nt256[nucl_nt16]; - } - - inline char cons_call_maj(const pos_stats_uncompr_t &psu, - const params_t ¶ms) { - if (psu.sum == 0) { - return nt16_nt256[psu.nt16]; - } - - if (psu.nt16 != nt256_nt16[static_cast('N')]) { - if (psu.sum < params.min_coverage + params.init_ref_weight) { - return 
nt16_nt256[psu.nt16]; - } - } - - char nucl_nt256 = nt16_nt256[psu.nt16]; - - int32_t required_min = - static_cast(ceil(params.majority_threshold * psu.sum)); - int32_t max = 0; - for (int32_t i = 0; i < 4; i++) { - if (psu.counters[i] >= required_min) { - if (psu.counters[i] > max) { - max = psu.counters[i]; - nucl_nt256 = nt4_nt256[i]; - } - } - } - - return nucl_nt256; - } - - inline char cons_call_maj_amb(const pos_stats_uncompr_t &psu, - const params_t ¶ms) { - if (psu.sum == 0) { - return nt16_nt256[psu.nt16]; - } - - if (psu.nt16 != nt256_nt16[static_cast('N')]) { - if (psu.sum < params.min_coverage + params.init_ref_weight) { - return nt16_nt256[psu.nt16]; - } - } - - char nucl_nt16 = psu.nt16; - - int32_t required_min = - static_cast(round(params.majority_threshold * psu.sum)); - int32_t max = 0; - for (int32_t i = 0; i < 4; i++) { - if (psu.counters[i] >= required_min) { - if (psu.counters[i] > max) { - max = psu.counters[i]; - nucl_nt16 = nt4_nt16[i]; - } else if (psu.counters[i] >= max) { - nucl_nt16 |= nt4_nt16[i]; - } - } - } - - return nt16_nt256[static_cast(nucl_nt16)]; - } +inline char cons_call_no_updates(const pos_stats_uncompr_t &psu, + const params_t ¶ms) { + return nt16_nt256[psu.nt16]; +} + +inline char cons_call_stoch(const pos_stats_uncompr_t &psu, + const params_t ¶ms) { + if (psu.sum == 0) { + return nt16_nt256[psu.nt16]; + } + + if (psu.nt16 != nt256_nt16[static_cast('N')]) { + if (psu.sum < params.min_coverage + params.init_ref_weight) { + return nt16_nt256[psu.nt16]; + } + } + + const int32_t prefsum[] = { + psu.counters[0], psu.counters[0] + psu.counters[1], + psu.counters[0] + psu.counters[1] + psu.counters[2], + psu.counters[0] + psu.counters[1] + psu.counters[2] + psu.counters[3]}; + + assert(prefsum[3] == psu.sum); + + const int32_t rn = rand() % psu.sum; + for (int32_t i = 0; i < 4; i++) { + if (rn < prefsum[i]) { + return nt4_nt256[i]; + } + } + + return 'n'; +} + +inline char cons_call_stoch_amb(const pos_stats_uncompr_t &psu, 
+ const params_t ¶ms) { + if (psu.sum == 0) { + return nt16_nt256[psu.nt16]; + } + + if (psu.nt16 != nt256_nt16[static_cast('N')]) { + if (psu.sum < params.min_coverage + params.init_ref_weight) { + return nt16_nt256[psu.nt16]; + } + } + nt16_t nucl_nt16 = nt256_nt16[static_cast('N')]; + + while (nucl_nt16 == nt256_nt16[static_cast('N')]) { + nucl_nt16 = 0; + for (int32_t i = 0; i < 4; i++) { + const int32_t rn = rand() % psu.sum; + + if (rn < psu.counters[i]) { + nucl_nt16 |= nt4_nt16[i]; + } + } + } + + return nt16_nt256[nucl_nt16]; } +inline char cons_call_maj(const pos_stats_uncompr_t &psu, + const params_t ¶ms) { + if (psu.sum == 0) { + return nt16_nt256[psu.nt16]; + } + + if (psu.nt16 != nt256_nt16[static_cast('N')]) { + if (psu.sum < params.min_coverage + params.init_ref_weight) { + return nt16_nt256[psu.nt16]; + } + } + + char nucl_nt256 = nt16_nt256[psu.nt16]; + + int32_t required_min = + static_cast(ceil(params.majority_threshold * psu.sum)); + int32_t max = 0; + for (int32_t i = 0; i < 4; i++) { + if (psu.counters[i] >= required_min) { + if (psu.counters[i] > max) { + max = psu.counters[i]; + nucl_nt256 = nt4_nt256[i]; + } + } + } + + return nucl_nt256; +} + +inline char cons_call_maj_amb(const pos_stats_uncompr_t &psu, + const params_t ¶ms) { + if (psu.sum == 0) { + return nt16_nt256[psu.nt16]; + } + + if (psu.nt16 != nt256_nt16[static_cast('N')]) { + if (psu.sum < params.min_coverage + params.init_ref_weight) { + return nt16_nt256[psu.nt16]; + } + } + + char nucl_nt16 = psu.nt16; + + int32_t required_min = + static_cast(round(params.majority_threshold * psu.sum)); + int32_t max = 0; + for (int32_t i = 0; i < 4; i++) { + if (psu.counters[i] >= required_min) { + if (psu.counters[i] > max) { + max = psu.counters[i]; + nucl_nt16 = nt4_nt16[i]; + } else if (psu.counters[i] >= max) { + nucl_nt16 |= nt4_nt16[i]; + } + } + } + + return nt16_nt256[static_cast(nucl_nt16)]; +} +} diff --git a/src/main.cpp b/src/main.cpp index f9648bf..ef6d3a2 100644 --- 
a/src/main.cpp +++ b/src/main.cpp @@ -6,53 +6,41 @@ #include #include - -/* - * -------------------------- - */ - int main(int argc, const char *argv[]) { - - /* - * Default configuration. - */ - ococo::params_t params = ococo::params_t(argc, argv); - if (!params.correctly_initialized){ - return EXIT_FAILURE; - } - - switch (params.counter_configuration){ - - case ococo::OCOCO16: - { - ococo::caller_t caller(¶ms); - if (!caller.correctly_initialized){ - return EXIT_FAILURE ; - } - caller.run(); - return caller.return_code; - } - - case ococo::OCOCO32: - { - ococo::caller_t caller(¶ms); - if (!caller.correctly_initialized){ - return EXIT_FAILURE ; - } - caller.run(); - return caller.return_code; - } - - case ococo::OCOCO64: - { - ococo::caller_t caller(¶ms); - if (!caller.correctly_initialized){ - return EXIT_FAILURE ; - } - caller.run(); - return caller.return_code; - } - } - - return EXIT_FAILURE; + /* Use default configuration */ + ococo::params_t params = ococo::params_t(argc, argv); + if (!params.correctly_initialized) { + return EXIT_FAILURE; + } + + switch (params.counter_configuration) { + case ococo::OCOCO16: { + ococo::caller_t caller(¶ms); + if (!caller.correctly_initialized) { + return EXIT_FAILURE; + } + caller.run(); + return caller.return_code; + } + + case ococo::OCOCO32: { + ococo::caller_t caller(¶ms); + if (!caller.correctly_initialized) { + return EXIT_FAILURE; + } + caller.run(); + return caller.return_code; + } + + case ococo::OCOCO64: { + ococo::caller_t caller(¶ms); + if (!caller.correctly_initialized) { + return EXIT_FAILURE; + } + caller.run(); + return caller.return_code; + } + } + + return EXIT_FAILURE; } diff --git a/src/misc.cpp b/src/misc.cpp index a4adbd3..03864a1 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -1,51 +1,50 @@ #include "misc.h" -void ococo::print_version(){ - std::cout << "Program: Ococo (online consensus caller, " - << "http://github.com/karel-brinda/ococo)." 
<< std::endl; - std::cout << "Version: " << OCOCO_VERSION << std::endl; +void ococo::print_version() { + std::cout << "Program: Ococo (online consensus caller, " + << "http://github.com/karel-brinda/ococo)." << std::endl; + std::cout << "Version: " << OCOCO_VERSION << std::endl; } void ococo::fatal_error(const char *format, ...) { - va_list args; - va_start(args, format); - fprintf(stderr, "[ococo:fatal-error]: "); - vfprintf(stderr, format, args); - va_end(args); + va_list args; + va_start(args, format); + fprintf(stderr, "[ococo:fatal-error]: "); + vfprintf(stderr, format, args); + va_end(args); } void ococo::error(const char *format, ...) { - va_list args; - va_start(args, format); - fprintf(stderr, "[ococo:error]: "); - vfprintf(stderr, format, args); - va_end(args); + va_list args; + va_start(args, format); + fprintf(stderr, "[ococo:error]: "); + vfprintf(stderr, format, args); + va_end(args); } void ococo::warning(const char *format, ...) { - va_list args; - va_start(args, format); - fprintf(stderr, "[ococo:warning]: "); - vfprintf(stderr, format, args); - va_end(args); + va_list args; + va_start(args, format); + fprintf(stderr, "[ococo:warning]: "); + vfprintf(stderr, format, args); + va_end(args); } void ococo::info(const char *format, ...) 
{ - va_list args; - va_start(args, format); - fprintf(stderr, "[ococo]: "); - vfprintf(stderr, format, args); - va_end(args); + va_list args; + va_start(args, format); + fprintf(stderr, "[ococo]: "); + vfprintf(stderr, format, args); + va_end(args); } bool ococo::file_exists(const std::string &fn) { - FILE *file; - - file = fopen(fn.c_str(), "r"); - if (file) { - fclose(file); - return true; - } - return false; + FILE *file; + + file = fopen(fn.c_str(), "r"); + if (file) { + fclose(file); + return true; + } + return false; } - diff --git a/src/misc.h b/src/misc.h index 0076ce4..5624ddd 100644 --- a/src/misc.h +++ b/src/misc.h @@ -2,37 +2,36 @@ #include #include -#include #include +#include #include #include "version.h" namespace ococo { - void print_version(); +void print_version(); - void fatal_error(const char *format, ...); +void fatal_error(const char *format, ...); - void error(const char *format, ...); +void error(const char *format, ...); - void warning(const char *format, ...); +void warning(const char *format, ...); - void info(const char *format, ...); +void info(const char *format, ...); - bool file_exists(const std::string &fn); - - /* - * Get a right full mask (right n bits set to 1) - * - * T - type - * size - number of 1's - */ - template - constexpr T right_full_mask() { - static_assert(size <= 8 * sizeof(T), "Exceeding data type borders."); - return (size == 0) ? 0 - : (((static_cast(0x1) << (size - 1)) - 1) << 1) | 1; - } +bool file_exists(const std::string &fn); +/* + * Get a right full mask (right n bits set to 1) + * + * T - type + * size - number of 1's + */ +template +constexpr T right_full_mask() { + static_assert(size <= 8 * sizeof(T), "Exceeding data type borders."); + return (size == 0) ? 
0 + : (((static_cast(0x1) << (size - 1)) - 1) << 1) | 1; +} } diff --git a/src/ococo.h b/src/ococo.h index fc9471e..4bf0da0 100644 --- a/src/ococo.h +++ b/src/ococo.h @@ -1,14 +1,13 @@ #pragma once +#include "caller.h" +#include "consensus.h" #include "misc.h" #include "params.h" -#include "consensus.h" -#include "types.h" #include "stats.h" -#include "caller.h" +#include "types.h" #include "version.h" - #include #include #include @@ -17,4 +16,3 @@ #include #include #include - diff --git a/src/params.cpp b/src/params.cpp index d808864..6cabac8 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -6,316 +6,303 @@ ****************************/ void ococo::params_t::init_default_values() { - verbose=false; - counters_str="ococo16"; - counter_configuration=OCOCO16; - mode=BATCH; - mode_str="batch"; - strategy=MAJORITY; - strategy_str="majority"; - min_mapq=1; - min_baseq=13; - init_ref_weight=0; - min_coverage=2; - majority_threshold=0.60; - - cons_alg[strategy_t::NO_UPDATES] = &cons_call_no_updates; - cons_alg[strategy_t::STOCHASTIC] = &cons_call_stoch; - cons_alg[strategy_t::STOCHASTIC_AMB] = &cons_call_stoch_amb; - cons_alg[strategy_t::MAJORITY] = &cons_call_maj; - cons_alg[strategy_t::MAJORITY_AMB] = &cons_call_maj_amb; - - vcf_file = nullptr; - pileup_file = nullptr; - fasta_out_file = nullptr; - sam_file = nullptr; - log_file = nullptr; - - n_upd=0; - - correctly_initialized=true; - return_code=0; + verbose = false; + counters_str = "ococo16"; + counter_configuration = OCOCO16; + mode = BATCH; + mode_str = "batch"; + strategy = MAJORITY; + strategy_str = "majority"; + min_mapq = 1; + min_baseq = 13; + init_ref_weight = 0; + min_coverage = 2; + majority_threshold = 0.60; + + cons_alg[strategy_t::NO_UPDATES] = &cons_call_no_updates; + cons_alg[strategy_t::STOCHASTIC] = &cons_call_stoch; + cons_alg[strategy_t::STOCHASTIC_AMB] = &cons_call_stoch_amb; + cons_alg[strategy_t::MAJORITY] = &cons_call_maj; + cons_alg[strategy_t::MAJORITY_AMB] = &cons_call_maj_amb; + + 
vcf_file = nullptr; + pileup_file = nullptr; + fasta_out_file = nullptr; + sam_file = nullptr; + log_file = nullptr; + + n_upd = 0; + + correctly_initialized = true; + return_code = 0; } -ococo::params_t::params_t(){ - init_default_values(); -} +ococo::params_t::params_t() { init_default_values(); } -ococo::params_t::params_t(int argc, const char *argv[]){ - init_default_values(); - parse_commandline(argc, argv); +ococo::params_t::params_t(int argc, const char *argv[]) { + init_default_values(); + parse_commandline(argc, argv); } -ococo::params_t::~params_t(){ - /* - * Close files. - */ - - if (sam_file != nullptr) { - int error_code = sam_close(sam_file); - if (error_code != 0) { - ococo::error("SAM file could not be closed.\n"); - return_code=-1; - } - } - - if (vcf_file != nullptr) { - int error_code = fclose(vcf_file); - if (error_code != 0) { - ococo::error("VCF file could not be closed.\n"); - return_code=-1; - } - } - - if (pileup_file != nullptr) { - int error_code = fclose(pileup_file); - if (error_code != 0) { - return_code=error_code; - ococo::error("Pileup file could not be closed.\n"); - return_code=-1; - } - } - - if (fasta_out_file != nullptr) { - int error_code = fclose(fasta_out_file); - if (error_code != 0) { - ococo::error("FASTA consensus file could not be closed.\n"); - return_code=-1; - } - } - - if (log_file != nullptr) { - int error_code = fclose(log_file); - } +ococo::params_t::~params_t() { + /* + * Close files. 
+ */ + + if (sam_file != nullptr) { + int error_code = sam_close(sam_file); + if (error_code != 0) { + ococo::error("SAM file could not be closed.\n"); + return_code = -1; + } + } + + if (vcf_file != nullptr) { + int error_code = fclose(vcf_file); + if (error_code != 0) { + ococo::error("VCF file could not be closed.\n"); + return_code = -1; + } + } + + if (pileup_file != nullptr) { + int error_code = fclose(pileup_file); + if (error_code != 0) { + return_code = error_code; + ococo::error("Pileup file could not be closed.\n"); + return_code = -1; + } + } + + if (fasta_out_file != nullptr) { + int error_code = fclose(fasta_out_file); + if (error_code != 0) { + ococo::error("FASTA consensus file could not be closed.\n"); + return_code = -1; + } + } + + if (log_file != nullptr) { + int error_code = fclose(log_file); + } } -void ococo::params_t::parse_commandline(int argc, const char *argv[]){ - - /* Save cmd parameters */ - - std::stringstream cmd; - for (int32_t i = 0; i < argc; i++) { - cmd << argv[i]; - if (i != argc - 1) { - cmd << " "; - } - } - command=cmd.str(); - - - /* Parse cmd parameters*/ - - try { - - namespace po = boost::program_options; - - po::options_description options_generic("Generic options"); - options_generic.add_options() - // - ("version,v", - "Print version and exit.") - // - ("help,h", - "Print this message and exit.") - // - ; - - po::options_description options_input("Input options"); - options_input.add_options() - // - ("input,i", - po::value(&sam_fn)->required(), - "Input SAM/BAM file (- for standard input).") - // - ( - "fasta-ref,f", po::value(&fasta_in_fn), - "Initial FASTA reference (if not provided, sequence of N's is " - "considered as the reference).") - // - ("stats-in,s",po::value(&stats_in_fn), - "Input statistics.") - // - ; - - po::options_description options_output("Output options"); - options_output.add_options() - // - ( - "fasta-cons,F", po::value(&fasta_out_fn), - "FASTA file with consensus.") - // - ( - "stats-out,S", 
po::value(&stats_out_fn), - "Outputs statistics.") - // - ( - "vcf-cons,V", po::value(&vcf_fn), - "VCF file with updates of consensus (- for standard output)." - ) - // - ( - "pileup,P", po::value(&pileup_fn), - "Truncated pileup (- for standard output).") - // - ( - "log", po::value(&log_fn), - "Auxiliary log file.") - // - ("verbose", - "Verbose mode.") - // - ; - - po::options_description options_consensus("Parameters of consensus calling"); - options_consensus.add_options() - // - ( - "counters,x", po::value(&counters_str)->default_value(counters_str), - "Counters configuration: \n - ococo16 (3b/counter, 16b/position)\n - ococo32 (7b/counter, 32b/position)\n - ococo64 (15b/counter, 64b/position)") - // - ( - "mode,m", po::value(&mode_str)->default_value(mode_str), - "Mode: real-time / batch.") - // - ( - "strategy,t", po::value(&strategy_str)->default_value(strategy_str), - "Strategy for updates: no-updates / majority / stochastic." - ) - // - ("allow-amb,a", "Allow updates to ambiguous nucleotides.") - // - ( - "min-MQ,q", po::value(&min_mapq)->default_value(min_mapq), - "Skip alignments with mapping quality smaller than INT." - ) - // - ( - "min-BQ,Q", po::value(&min_baseq)->default_value(min_baseq), - "Skip bases with base quality smaller than INT." - ) - // - ( - "ref-weight,w", po::value(&init_ref_weight)->default_value(init_ref_weight), - "Initial counter value for nucleotides from the reference." - ) - // - ( - "min-coverage,c", - po::value(&min_coverage)->default_value(min_coverage), - "Minimum coverage required for update." - ) - // - ( - "majority-threshold,M", - po::value(&majority_threshold)->default_value(majority_threshold), - "Majority threshold." 
- ) - // - ; - - po::options_description options_all; - options_all.add(options_generic).add(options_input).add(options_output).add(options_consensus); - - po::variables_map vm; - try { - - po::store(po::command_line_parser(argc, argv) - .options(options_all) - .run(), - vm); // can throw - - if (vm.count("version")) { - std::cout<(&sam_fn)->required(), + "Input SAM/BAM file (- for standard input).") + // + ("fasta-ref,f", po::value(&fasta_in_fn), + "Initial FASTA reference (if not provided, sequence of N's is " + "considered as the reference).") + // + ("stats-in,s", po::value(&stats_in_fn), + "Input statistics.") + // + ; + + po::options_description options_output("Output options"); + options_output.add_options() + // + ("fasta-cons,F", po::value(&fasta_out_fn), + "FASTA file with consensus.") + // + ("stats-out,S", po::value(&stats_out_fn), + "Outputs statistics.") + // + ("vcf-cons,V", po::value(&vcf_fn), + "VCF file with updates of consensus (- for standard output).") + // + ("pileup,P", po::value(&pileup_fn), + "Truncated pileup (- for standard output).") + // + ("log", po::value(&log_fn), "Auxiliary log file.") + // + ("verbose", "Verbose mode.") + // + ; + + po::options_description options_consensus( + "Parameters of consensus calling"); + options_consensus.add_options() + // + ("counters,x", + po::value(&counters_str)->default_value(counters_str), + "Counters configuration: \n - ococo16 (3b/counter, " + "16b/position)\n - ococo32 (7b/counter, 32b/position)\n - ococo64 " + "(15b/counter, 64b/position)") + // + ("mode,m", + po::value(&mode_str)->default_value(mode_str), + "Mode: real-time / batch.") + // + ("strategy,t", + po::value(&strategy_str)->default_value(strategy_str), + "Strategy for updates: no-updates / majority / stochastic.") + // + ("allow-amb,a", "Allow updates to ambiguous nucleotides.") + // + ("min-MQ,q", po::value(&min_mapq)->default_value(min_mapq), + "Skip alignments with mapping quality smaller than INT.") + // + ("min-BQ,Q", + 
po::value(&min_baseq)->default_value(min_baseq), + "Skip bases with base quality smaller than INT.") + // + ("ref-weight,w", po::value(&init_ref_weight) + ->default_value(init_ref_weight), + "Initial counter value for nucleotides from the reference.") + // + ("min-coverage,c", + po::value(&min_coverage)->default_value(min_coverage), + "Minimum coverage required for update.") + // + ("majority-threshold,M", po::value(&majority_threshold) + ->default_value(majority_threshold), + "Majority threshold.") + // + ; + + po::options_description options_all; + options_all.add(options_generic) + .add(options_input) + .add(options_output) + .add(options_consensus); + + po::variables_map vm; + try { + po::store( + po::command_line_parser(argc, argv).options(options_all).run(), + vm); // can throw + + if (vm.count("version")) { + std::cout << std::endl; + print_version(); + std::cout << std::endl; + exit(0); + } + + if (vm.count("help")) { + std::cout << options_all << "\n"; + exit(0); + } + + po::notify(vm); + // throws on error, so do after help in case there + // are any problems + if (vm.count("strategy")) { + if (strategy_str.compare("no-updates") == 0) { + strategy = ococo::strategy_t::NO_UPDATES; + } else if (strategy_str.compare("majority") == 0) { + if (vm.count("allow-amb") == 0) { + strategy = ococo::strategy_t::MAJORITY; + } else { + strategy = ococo::strategy_t::MAJORITY_AMB; + } + } else if (strategy_str.compare("stochastic") == 0) { + if (vm.count("allow-amb") == 0) { + strategy = ococo::strategy_t::STOCHASTIC; + } else { + strategy = ococo::strategy_t::STOCHASTIC_AMB; + } + } else { + ococo::error( + "Unknown strategy '%s'. 
Possible strategies " + "are 'majority' and 'stochastic'.\n", + strategy_str.c_str()); + correctly_initialized = false; + return_code = -1; + return; + } + } + + if (vm.count("mode")) { + if (mode_str.compare("batch") == 0) { + mode = ococo::mode_t::BATCH; + } else if (mode_str.compare("real-time") == 0) { + mode = ococo::mode_t::REALTIME; + } else { + ococo::error( + "Unknown mode '%s'. Possible modes are " + "'batch' and 'real-time'.\n", + mode_str.c_str()); + correctly_initialized = false; + return_code = -1; + return; + } + } + + if (vm.count("verbose")) { + verbose = true; + } + + if (vm.count("counters")) { + if (counters_str.compare("ococo16") == 0) { + counter_configuration = OCOCO16; + counters_str_descr = + "ococo16 (16 bits per position, 3bits per nucleotide " + "counter)"; + } else if (counters_str.compare("ococo32") == 0) { + counter_configuration = OCOCO32; + counters_str_descr = + "ococo32 (32 bits per position, 7bits per nucleotide " + "counter)"; + } else if (counters_str.compare("ococo64") == 0) { + counter_configuration = OCOCO64; + counters_str_descr = + "ococo64 (64 bits per position, 15bits per nucleotide " + "counter)"; + } else { + ococo::error( + "Unknown counter configuration '%s'. 
Possible modes " + "are 'ococo16', 'ococo32', and 'ococo64'.\n", + counters_str.c_str()); + correctly_initialized = false; + return_code = -1; + return; + } + } + ococo::info("Ococo starting: %s\n", counters_str_descr.c_str()); + + } catch (po::error &e) { + std::cout << options_all << "\n"; + ococo::error("%s.\n", e.what()); + correctly_initialized = false; + return_code = -1; + return; + } + + } catch (std::exception &e) { + ococo::error("Unhandled Exception: %s.\n", e.what()); + correctly_initialized = false; + return_code = -1; + return; + } } diff --git a/src/params.h b/src/params.h index 93c8566..2ef7ed0 100644 --- a/src/params.h +++ b/src/params.h @@ -1,7 +1,7 @@ #pragma once -#include "types.h" #include "misc.h" +#include "types.h" #include #include @@ -15,9 +15,9 @@ #include #include #include +#include #include #include -#include /**************************** *** Consensus parameters *** @@ -25,110 +25,107 @@ namespace ococo { - enum mode_t { BATCH, REALTIME }; - - enum strategy_t { - NO_UPDATES, - STOCHASTIC, - STOCHASTIC_AMB, - MAJORITY, - MAJORITY_AMB, - count - }; - - enum counter_configuration_t { - OCOCO16, - OCOCO32, - OCOCO64, - }; - - struct params_t { - bool correctly_initialized; - int return_code; - - std::string command; - - /* - * Counter parameters - */ - counter_configuration_t counter_configuration; - std::string counters_str; - std::string counters_str_descr; - int32_t stats_bits_per_position; - int32_t stats_bits_per_nucleotide; - - /* - * Input parameters - */ - std::string sam_fn; - std::string fasta_in_fn; - std::string stats_in_fn; - - - /* - * Output parameters - */ - bool verbose; - - std::string vcf_fn; - std::string fasta_out_fn; - std::string stats_out_fn; - std::string pileup_fn; - std::string log_fn; - - /* - * Files - */ - - FILE *vcf_file; - FILE *pileup_file; - FILE *fasta_out_file; - samFile *sam_file; - FILE *log_file; - - - /* - * Consensus calling parameters - */ - - mode_t mode; - strategy_t strategy; - - /* minimum 
mapping quality for update */ - int32_t min_mapq; - - /* minimum base quality for update */ - int32_t min_baseq; - - /* initial values for counters corresponding to ref */ - int32_t init_ref_weight; - - /* minimum coverage for update (does not include init_ref_weight */ - int32_t min_coverage; - - /* threshold for having majority */ - double majority_threshold; - - /* auxiliary */ - std::string strategy_str; - std::string mode_str; - int64_t n_upd; - - /* - * Array of consensus calling functions - */ - char (*cons_alg[strategy_t::count])(const pos_stats_uncompr_t &psu, - const params_t ¶ms); - - params_t(); - - params_t(int argc, const char *argv[]); - - ~params_t(); - - void parse_commandline(int argc, const char *argv[]); - - void init_default_values(); - }; +enum mode_t { BATCH, REALTIME }; +enum strategy_t { + NO_UPDATES, + STOCHASTIC, + STOCHASTIC_AMB, + MAJORITY, + MAJORITY_AMB, + count +}; + +enum counter_configuration_t { + OCOCO16, + OCOCO32, + OCOCO64, +}; + +struct params_t { + bool correctly_initialized; + int return_code; + + std::string command; + + /* + * Counter parameters + */ + counter_configuration_t counter_configuration; + std::string counters_str; + std::string counters_str_descr; + int32_t stats_bits_per_position; + int32_t stats_bits_per_nucleotide; + + /* + * Input parameters + */ + std::string sam_fn; + std::string fasta_in_fn; + std::string stats_in_fn; + + /* + * Output parameters + */ + bool verbose; + + std::string vcf_fn; + std::string fasta_out_fn; + std::string stats_out_fn; + std::string pileup_fn; + std::string log_fn; + + /* + * Files + */ + + FILE *vcf_file; + FILE *pileup_file; + FILE *fasta_out_file; + samFile *sam_file; + FILE *log_file; + + /* + * Consensus calling parameters + */ + + mode_t mode; + strategy_t strategy; + + /* minimum mapping quality for update */ + int32_t min_mapq; + + /* minimum base quality for update */ + int32_t min_baseq; + + /* initial values for counters corresponding to ref */ + int32_t 
init_ref_weight; + + /* minimum coverage for update (does not include init_ref_weight */ + int32_t min_coverage; + + /* threshold for having majority */ + double majority_threshold; + + /* auxiliary */ + std::string strategy_str; + std::string mode_str; + int64_t n_upd; + + /* + * Array of consensus calling functions + */ + char (*cons_alg[strategy_t::count])(const pos_stats_uncompr_t &psu, + const params_t ¶ms); + + params_t(); + + params_t(int argc, const char *argv[]); + + ~params_t(); + + void parse_commandline(int argc, const char *argv[]); + + void init_default_values(); +}; } diff --git a/src/stats.h b/src/stats.h index 390ed44..b47a8ae 100644 --- a/src/stats.h +++ b/src/stats.h @@ -7,651 +7,634 @@ #include #include #include +#include #include #include -#include #include #include - /*********************** *** Main statistics *** ***********************/ namespace ococo { - KSEQ_INIT(gzFile, gzread); - - template - struct stats_t { - static_assert(8 * sizeof(T) >= 4 * counter_size + refbase_size, - "Too large counter size (does not fit into the main type)."); - - int32_t n_seqs; - bool *seq_active; - int64_t *seq_len; - std::string *seq_name; - std::string *seq_comment; - T **seq_stats; - - params_t *params; - - // stats_t(); - stats_t(params_t *params, bam_hdr_t &h); - ~stats_t(); - - /******* - * I/O * - *******/ - - int import_stats(const std::string &stats_fn); - int export_stats(const std::string &stats_fn) const; - - // Call consensus probabilistically. - int call_consensus(FILE *vcf_file, FILE *pileup_file); - int call_consensus_position(FILE *vcf_file, FILE *pileup_file, - int32_t seqid, int64_t pos); - - // Loader header from a BAM. - int load_headers_bam_hdr(const bam_hdr_t &h); - // Load header and data from a FASTA file and initialize statistics. 
- int load_fasta(const std::string &fasta_fn); - int save_fasta(const std::string &fasta_fn) const; - - int print_vcf_header(FILE *vcf_file, std::string cmd, - std::string fasta) const; - int print_vcf_substitution(FILE *vcf_file, int32_t seqid, int64_t pos, - char old_base, char new_base, - const pos_stats_uncompr_t &psu) const; - - int print_pileup_line(FILE *pileup_file, int32_t seqid, int64_t pos, - const pos_stats_uncompr_t &psu) const; - - /************************* - * Statistics & counters * - *************************/ - - inline int set_nucl_nt256(int32_t seqid, int64_t pos, const char &nt256); - inline int get_nucl_nt256(int32_t seqid, int64_t pos, char &nt256) const; - - static T compress_position_stats(const pos_stats_uncompr_t &psu); - static void decompress_position_stats(T psc, pos_stats_uncompr_t &psu); - static T increment(T psc, nt4_t nt4); - - /*********************** - * Debuging & checking * - ***********************/ - - // Check if everything was initialized. - bool check_allocation() const; - - // Check if a BAM header corresponds to the stats. 
- bool check_headers_bam_hdr(const bam_hdr_t &h) const; - - void debug_print_counters() const; - - std::string debug_str_counters(int32_t seqid, int64_t pos) const; - }; - - template - stats_t:: - stats_t(ococo::params_t *params, bam_hdr_t &h) - : n_seqs(h.n_targets), - seq_active(new (std::nothrow) bool[n_seqs]()), - seq_len(new (std::nothrow) int64_t[n_seqs]()), - seq_name(new (std::nothrow) std::string[n_seqs]()), - seq_comment(new (std::nothrow) std::string[n_seqs]()), - seq_stats(new (std::nothrow) T *[n_seqs]()), - params(params) { - - for (int seqid = 0; seqid < n_seqs; seqid++) { - seq_len[seqid] = h.target_len[seqid]; - seq_active[seqid] = true; - seq_name[seqid] = std::string(h.target_name[seqid]); - - seq_stats[seqid] = new (std::nothrow) T[seq_len[seqid]](); - } - } - - template - stats_t:: - ~stats_t() { - - if (seq_stats != nullptr) { - for (int32_t seqid = 0; seqid < n_seqs; seqid++) { - delete[] seq_stats[seqid]; - } - } - delete[] seq_active; - delete[] seq_len; - delete[] seq_name; - delete[] seq_comment; - delete[] seq_stats; - } - - template - int stats_t:: - load_fasta(const std::string &fasta_fn) { - - gzFile fp; - kseq_t *seq; - int l; - fp = gzopen(fasta_fn.c_str(), "r"); - seq = kseq_init(fp); - - constexpr int32_t max_counter_value = - ococo::right_full_mask(); - - if(errno!=0 || fp==nullptr){ - ococo::error("File '%s' could not be opened.\n", - fasta_fn.c_str()); - return -1; - - } - - for (int seqid = 0; (l = kseq_read(seq)) >= 0; seqid++) { - if (seq_name[seqid].compare(seq->name.s) != 0) { - error("Sequence names in BAM/SAM and in FASTA do not correspond " - "('%s'!='%s').\n", - seq_name[seqid].c_str(), seq->name.s); - return -1; - } - - if (seq_len[seqid] != static_cast(seq->seq.l)) { - error("Sequence lengths in BAM/SAM and in FASTA do not correspond " - "(%" PRId64 "!=%" PRId64 ").\n", - static_cast(seq->seq.l), - static_cast(seq_len[seqid])); - return -1; - } - - if (seq->comment.l && seq_comment[seqid].empty()) { - 
seq_comment[seqid] = std::string(seq->comment.s); - } - - for (int64_t pos = 0; pos < static_cast(seq->seq.l); pos++) { - assert(seq_stats[seqid][pos] == 0); - - pos_stats_uncompr_t psu = {0, {0, 0, 0, 0}, 0}; - psu.nt16 = nt256_nt16[static_cast(seq->seq.s[pos])]; - - if (psu.nt16 != nt256_nt16[static_cast('N')]) { - for (int32_t i = 0; i < 4; i++) { - psu.counters[i] = ((0x1 << i) & psu.nt16) - ? std::min(params->init_ref_weight, - max_counter_value) - : 0; - } - } - - psu.sum = psu.counters[0] + psu.counters[1] + psu.counters[2] + - psu.counters[3]; - seq_stats[seqid][pos] = compress_position_stats(psu); - } - } - kseq_destroy(seq); // STEP 5: destroy seq - gzclose(fp); // STEP 6: close the file handler - return 0; - } - - template - int stats_t:: - save_fasta( const std::string &fasta_fn) const { - - assert(check_allocation()); - - FILE *fasta_file = nullptr; - fasta_file = fopen(fasta_fn.c_str(), "w+"); - - char fasta_buffer[fasta_line_l]; - for (int s = 0; s < n_seqs; s++) { - // printf("%s\n",seq_name[s]); - if (!seq_comment[s].empty()) { - fprintf(fasta_file, ">%s %s\n", seq_name[s].c_str(), - seq_comment[s].c_str()); - } else { - fprintf(fasta_file, ">%s\n", seq_name[s].c_str()); - } - - for (int64_t i = 0, j = 0; i < seq_len[s]; i++, j++) { - get_nucl_nt256(s, i, fasta_buffer[j]); - - if (j == fasta_line_l - 1 || i == seq_len[s] - 1) { - fwrite(fasta_buffer, 1, j + 1, fasta_file); - fwrite("\n", 1, 1, fasta_file); - j = -1; - } - } - } - - fclose(fasta_file); - - return 0; - } - - template - bool stats_t:: - check_allocation() const { - - if (seq_active == nullptr || seq_len == nullptr || seq_stats == nullptr || - seq_name == nullptr || seq_comment == nullptr) { - return false; - } - - for (int i = 0; i < n_seqs; i++) { - if (seq_stats[i] == nullptr) { - return false; - } - } - - return true; - } - - template - bool stats_t:: - check_headers_bam_hdr(const bam_hdr_t &h) const { - - if (!check_allocation()) { - return false; - } - - for (int32_t seqid = 0; 
seqid < n_seqs; seqid++) { - if (seq_len[seqid] != static_cast(h.target_len[seqid])) { - return false; - } - if (seq_name[seqid].compare(h.target_name[seqid]) != 0) { - return false; - } - } - - return true; - } - - template - int stats_t:: - import_stats(const std::string &stats_fn) { - - assert(check_allocation()); - - int error_code = 0; - - FILE *fo = fopen(stats_fn.c_str(), "r"); - if (fo == nullptr) { - ococo::error("File with statistics could not be opened ('%s').\n", - stats_fn.c_str()); - return -1; - } - - /* number of seqs */ - int32_t n_seqs_loaded; - fread(&n_seqs_loaded, sizeof(int32_t), 1, fo); - - if (n_seqs_loaded != n_seqs) { - error("Numbers of sequences in stats and SAM/BAM do not correspond " - "%" PRId32 "!=%" PRId32 ").\n", - n_seqs_loaded, n_seqs); - return -1; - } - - for (int seqid = 0; seqid < n_seqs; seqid++) { - /* sequence */ - - single_seq_serial_t seq_ser; - fread(&seq_ser, sizeof(single_seq_serial_t), 1, fo); - - if (seq_ser.seq_active != seq_active[seqid]) { - error("Active sequences in stats and SAM/BAM do not correspond " - "(seqid %" PRId32 ").\n", - seqid); - return -1; - } - - if (seq_ser.seq_len != seq_len[seqid]) { - error("Sequence lengths in stats and SAM/BAM do not correspond " - "(seqid %" PRId32 ", %" PRId64 "!=%" PRId64 ").\n", - seqid, seq_ser.seq_len, seq_len[seqid]); - return -1; - } - - if (seq_name[seqid].compare(seq_ser.seq_name) != 0) { - error("Sequence names in stats and SAM/BAM do not correspond " - "(seqid %" PRId32 ", '%s'!='%s').\n", - seqid, seq_ser.seq_name, seq_name[seqid].c_str()); - return -1; - } - - fread(seq_stats[seqid], sizeof(T), seq_len[seqid], fo); - } - - error_code = fclose(fo); - if (error_code != 0) { - ococo::error("File with statistics could not be closed ('%s').\n", - stats_fn.c_str()); - return -1; - } - - return 0; - } - - template - int stats_t:: - export_stats(const std::string &stats_fn) const { - - assert(check_allocation()); - - int error_code = 0; - - FILE *fo = 
fopen(stats_fn.c_str(), "w+"); - if (fo == nullptr) { - ococo::error("File with statistics could not be opened ('%s').\n", - stats_fn.c_str()); - return -1; - } - - /* number of seqs */ - fwrite(&n_seqs, sizeof(int32_t), 1, fo); - - for (int seqid = 0; seqid < n_seqs; seqid++) { - /* sequence */ - single_seq_serial_t seq_ser = {0}; - seq_ser.seq_active = seq_active[seqid]; - seq_ser.seq_len = seq_len[seqid]; - strncpy(seq_ser.seq_name, seq_name[seqid].c_str(), 999); - strncpy(seq_ser.seq_name, seq_name[seqid].c_str(), 999); - uint64_t written = 0; - written += fwrite(&seq_ser, sizeof(single_seq_serial_t), 1, fo); - written += fwrite(seq_stats[seqid], sizeof(T), seq_len[seqid], fo); - if (written != 1 + static_cast(seq_len[seqid])) { - ococo::error( - "Problem with writting to the file with statistics ('%s').\n", - stats_fn.c_str()); - return -1; - } - } - - error_code = fclose(fo); - if (error_code != 0) { - ococo::error("File with statistics could not be closed ('%s').\n", - stats_fn.c_str()); - return -1; - } - return 0; - } - - template - int stats_t:: - call_consensus(FILE *vcf_file, FILE *pileup_file) { - assert(check_allocation()); - - for (int32_t seqid = 0; seqid < n_seqs; seqid++) { - for (int64_t pos = 0; pos < seq_len[seqid]; pos++) { - call_consensus_position(vcf_file, pileup_file, seqid, pos); - } - } - - return 0; - } - - template - int stats_t:: - call_consensus_position(FILE *vcf_file, FILE *pileup_file, int32_t seqid, int64_t pos) { - - pos_stats_uncompr_t psu; - decompress_position_stats(seq_stats[seqid][pos], psu); - - char old_base_nt256; - get_nucl_nt256(seqid, pos, old_base_nt256); - // const char new_base_nt256=cons_call_maj(psu); - const char new_base_nt256 = (params->cons_alg[params->strategy])(psu, *params); - - if (old_base_nt256 != new_base_nt256) { - if (vcf_file != nullptr) { - print_vcf_substitution(vcf_file, seqid, pos, old_base_nt256, - new_base_nt256, psu); - } - params->n_upd+=1; - - set_nucl_nt256(seqid, pos, new_base_nt256); - } 
- - if(params->verbose){ - if (old_base_nt256 == new_base_nt256) { - if (vcf_file != nullptr) { - print_vcf_substitution(vcf_file, seqid, pos, old_base_nt256, - new_base_nt256, psu); - } - } - } - - if (pileup_file != nullptr) { - print_pileup_line(pileup_file, seqid, pos, psu); - } - - return 0; - } - - template - T stats_t:: - compress_position_stats(const pos_stats_uncompr_t &psu) { - - T psc = 0; - - for (int32_t i = 0; i < 4; i++) { - psc <<= counter_size; - psc |= psu.counters[i] & right_full_mask(); - } - - psc <<= refbase_size; - psc |= psu.nt16; - - return psc; - } - - template - void stats_t:: - decompress_position_stats(T psc, pos_stats_uncompr_t &psu) { - - psu.nt16 = psc & right_full_mask(); - psc >>= refbase_size; - - psu.sum = 0; - for (int32_t i = 3; i >= 0; i--) { - psu.counters[i] = psc & right_full_mask(); - psu.sum += psu.counters[i]; - psc >>= counter_size; - } - } - - template - int stats_t:: - print_vcf_header(FILE *vcf_file, std::string cmd, std::string fasta) const { - - assert(check_allocation()); - assert(vcf_file != nullptr); - - std::time_t tt = std::time(nullptr); - tm *tm = localtime(&tt); - - fprintf(vcf_file, "##fileformat=VCFv4.3\n" - "##fileDate=%04d%02d%02d\n" - "##source=Ococo\n", - tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday); - - if (!cmd.empty()) { - fprintf(vcf_file, "##ococo_command=%s\n", cmd.c_str()); - } - fprintf(vcf_file, "##ococo_stats_datatype_size=%zubits\n", 8 * sizeof(T)); - fprintf(vcf_file, "##ococo_counter_size=%dbits\n", counter_size); - - if (!fasta.empty()) { - fprintf(vcf_file, "##reference=%s\n", fasta.c_str()); - } - - for (int seqid = 0; seqid < n_seqs; seqid++) { - fprintf(vcf_file, "##contig=\n", - seq_name[seqid].c_str(), seq_len[seqid]); - } - - - fprintf(vcf_file, "##INFO=\n"); - fprintf(vcf_file, "##INFO=\n"); - fprintf(vcf_file, "##INFO=\n"); - fprintf(vcf_file, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); - - return 0; - } - - template - int stats_t:: - print_vcf_substitution( FILE 
*vcf_file, int32_t seqid, - int64_t pos, char old_base, char new_base, - const pos_stats_uncompr_t &psu) const { - - assert(check_allocation()); - assert(vcf_file != nullptr); - - float alt_freq=1.0*psu.counters[nt256_nt4[static_cast(new_base)]]/psu.sum; - - fprintf(vcf_file, - "%s\t%" PRId64 "\t.\t%c\t%c\t100\tPASS\tAF=%.2f;CS=%" PRId32 ",%" PRId32 - ",%" PRId32 ",%" PRId32 ";COV=%" PRId32 "\n", - seq_name[seqid].c_str(), pos + 1, old_base, new_base, - round(alt_freq*100.0)/100, - psu.counters[0], psu.counters[1], psu.counters[2], psu.counters[3], - psu.sum); - - return 0; - } - - template - int stats_t:: - print_pileup_line(FILE *pileup_file, int32_t seqid, int64_t pos, - const pos_stats_uncompr_t &psu) const { - - assert(check_allocation()); - assert(pileup_file != nullptr); - - // todo: fix situation when depth is larger (use the printing buffer more - // timess) - - const int32_t max_depth = 1000; - - assert(psu.sum < max_depth); - char bases[max_depth]; - char qualities[max_depth]; - - char ref_nt256 = nt16_nt256[psu.nt16]; - - if (psu.sum == 0) { - return 0; - } - - if (ref_nt256 == '=') { - ref_nt256 = 'N'; - } - - int32_t j = 0; - - for (int32_t nt4 = 0; nt4 < 4; nt4++) { - const char filling_char = - nt4_nt16[nt4] == psu.nt16 ? '.' : nt4_nt256[nt4]; - for (int32_t i = 0; i < psu.counters[nt4]; i++, j++) { - bases[j] = filling_char; - qualities[j] = '~'; - } - } - - if (psu.sum >= max_depth) { - ococo::error("Too high coverage at position %" PRId64 - ". 
Pileup does not support coverage higher than %" PRId32 - ".", - pos, max_depth); - return -1; - } - - bases[j] = '\0'; - qualities[j] = '\0'; - - fprintf(pileup_file, "%s\t%" PRId64 "\t%c\t%" PRId32 "\t%s\t%s\n", - seq_name[seqid].c_str(), pos + 1, ref_nt256, psu.sum, bases, - qualities); - - return 0; - } - - template - std::string stats_t:: - debug_str_counters(int32_t seqid, int64_t pos) const { - - pos_stats_uncompr_t psu; - decompress_position_stats(seq_stats[seqid][pos], psu); - std::stringstream ss; - ss << "[" << nt16_nt256[psu.nt16] << "]" - << "(" << psu.counters[0] << "," << psu.counters[1] << "," - << psu.counters[2] << "," << psu.counters[3] << ")"; - return ss.str(); - } - - template - void stats_t:: - debug_print_counters() const { - - for (int seqid = 0; seqid < n_seqs; seqid++) { - fprintf(stderr, "%s\n", seq_name[seqid]); - for (int64_t pos = 0; pos < seq_len[seqid]; pos++) { - fprintf(stderr, "%8" PRId64 " %04x \n", pos, seq_stats[seqid][pos]); - } - } - } - - template - inline int stats_t:: - set_nucl_nt256(int32_t seqid, int64_t pos, const char &nt256) { - - nt16_t nt16 = nt256_nt16[static_cast(nt256)]; - T n_psc = seq_stats[seqid][pos]; - n_psc >>= refbase_size; - n_psc <<= refbase_size; - n_psc |= nt16 & right_full_mask(); - seq_stats[seqid][pos] = n_psc; - return 0; - } - - template - inline int stats_t:: - get_nucl_nt256(int32_t seqid, int64_t pos, char &nt256) const { - - nt256 = nt16_nt256[seq_stats[seqid][pos] & right_full_mask()]; - if (nt256=='='){ - nt256='N'; - } - return 0; - } - - template - T stats_t:: - increment(T psc, nt4_t nt4) { - - assert(0 <= nt4 && nt4 < 4); - - pos_stats_uncompr_t psu; - decompress_position_stats(psc, psu); - - if (psu.counters[nt4] == right_full_mask()) { - psu.counters[0] >>= 1; - psu.counters[1] >>= 1; - psu.counters[2] >>= 1; - psu.counters[3] >>= 1; - } - - psu.counters[nt4]++; - - return compress_position_stats(psu); - } +KSEQ_INIT(gzFile, gzread); + +template +struct stats_t { + static_assert(8 * 
sizeof(T) >= 4 * counter_size + refbase_size, + "Too large counter size (does not fit into the main type)."); + + int32_t n_seqs; + bool *seq_active; + int64_t *seq_len; + std::string *seq_name; + std::string *seq_comment; + T **seq_stats; + + params_t *params; + + // stats_t(); + stats_t(params_t *params, bam_hdr_t &h); + ~stats_t(); + + /******* + * I/O * + *******/ + + int import_stats(const std::string &stats_fn); + int export_stats(const std::string &stats_fn) const; + + // Call consensus probabilistically. + int call_consensus(FILE *vcf_file, FILE *pileup_file); + int call_consensus_position(FILE *vcf_file, FILE *pileup_file, + int32_t seqid, int64_t pos); + + // Loader header from a BAM. + int load_headers_bam_hdr(const bam_hdr_t &h); + // Load header and data from a FASTA file and initialize statistics. + int load_fasta(const std::string &fasta_fn); + int save_fasta(const std::string &fasta_fn) const; + + int print_vcf_header(FILE *vcf_file, std::string cmd, + std::string fasta) const; + int print_vcf_substitution(FILE *vcf_file, int32_t seqid, int64_t pos, + char old_base, char new_base, + const pos_stats_uncompr_t &psu) const; + + int print_pileup_line(FILE *pileup_file, int32_t seqid, int64_t pos, + const pos_stats_uncompr_t &psu) const; + + /************************* + * Statistics & counters * + *************************/ + + inline int set_nucl_nt256(int32_t seqid, int64_t pos, const char &nt256); + inline int get_nucl_nt256(int32_t seqid, int64_t pos, char &nt256) const; + + static T compress_position_stats(const pos_stats_uncompr_t &psu); + static void decompress_position_stats(T psc, pos_stats_uncompr_t &psu); + static T increment(T psc, nt4_t nt4); + + /*********************** + * Debuging & checking * + ***********************/ + + // Check if everything was initialized. + bool check_allocation() const; + + // Check if a BAM header corresponds to the stats. 
+ bool check_headers_bam_hdr(const bam_hdr_t &h) const; + + void debug_print_counters() const; + + std::string debug_str_counters(int32_t seqid, int64_t pos) const; +}; + +template +stats_t::stats_t(ococo::params_t *params, + bam_hdr_t &h) + : n_seqs(h.n_targets), + seq_active(new (std::nothrow) bool[n_seqs]()), + seq_len(new (std::nothrow) int64_t[n_seqs]()), + seq_name(new (std::nothrow) std::string[n_seqs]()), + seq_comment(new (std::nothrow) std::string[n_seqs]()), + seq_stats(new (std::nothrow) T *[n_seqs]()), + params(params) { + for (int seqid = 0; seqid < n_seqs; seqid++) { + seq_len[seqid] = h.target_len[seqid]; + seq_active[seqid] = true; + seq_name[seqid] = std::string(h.target_name[seqid]); + + seq_stats[seqid] = new (std::nothrow) T[seq_len[seqid]](); + } +} + +template +stats_t::~stats_t() { + if (seq_stats != nullptr) { + for (int32_t seqid = 0; seqid < n_seqs; seqid++) { + delete[] seq_stats[seqid]; + } + } + delete[] seq_active; + delete[] seq_len; + delete[] seq_name; + delete[] seq_comment; + delete[] seq_stats; +} + +template +int stats_t::load_fasta( + const std::string &fasta_fn) { + gzFile fp; + kseq_t *seq; + int l; + fp = gzopen(fasta_fn.c_str(), "r"); + seq = kseq_init(fp); + + constexpr int32_t max_counter_value = + ococo::right_full_mask(); + + if (errno != 0 || fp == nullptr) { + ococo::error("File '%s' could not be opened.\n", fasta_fn.c_str()); + return -1; + } + + for (int seqid = 0; (l = kseq_read(seq)) >= 0; seqid++) { + if (seq_name[seqid].compare(seq->name.s) != 0) { + error( + "Sequence names in BAM/SAM and in FASTA do not correspond " + "('%s'!='%s').\n", + seq_name[seqid].c_str(), seq->name.s); + return -1; + } + + if (seq_len[seqid] != static_cast(seq->seq.l)) { + error( + "Sequence lengths in BAM/SAM and in FASTA do not correspond " + "(%" PRId64 "!=%" PRId64 ").\n", + static_cast(seq->seq.l), + static_cast(seq_len[seqid])); + return -1; + } + + if (seq->comment.l && seq_comment[seqid].empty()) { + seq_comment[seqid] = 
std::string(seq->comment.s); + } + + for (int64_t pos = 0; pos < static_cast(seq->seq.l); pos++) { + assert(seq_stats[seqid][pos] == 0); + + pos_stats_uncompr_t psu = {0, {0, 0, 0, 0}, 0}; + psu.nt16 = nt256_nt16[static_cast(seq->seq.s[pos])]; + + if (psu.nt16 != nt256_nt16[static_cast('N')]) { + for (int32_t i = 0; i < 4; i++) { + psu.counters[i] = ((0x1 << i) & psu.nt16) + ? std::min(params->init_ref_weight, + max_counter_value) + : 0; + } + } + + psu.sum = psu.counters[0] + psu.counters[1] + psu.counters[2] + + psu.counters[3]; + seq_stats[seqid][pos] = compress_position_stats(psu); + } + } + kseq_destroy(seq); // STEP 5: destroy seq + gzclose(fp); // STEP 6: close the file handler + return 0; +} + +template +int stats_t::save_fasta( + const std::string &fasta_fn) const { + assert(check_allocation()); + + FILE *fasta_file = nullptr; + fasta_file = fopen(fasta_fn.c_str(), "w+"); + + char fasta_buffer[fasta_line_l]; + for (int s = 0; s < n_seqs; s++) { + // printf("%s\n",seq_name[s]); + if (!seq_comment[s].empty()) { + fprintf(fasta_file, ">%s %s\n", seq_name[s].c_str(), + seq_comment[s].c_str()); + } else { + fprintf(fasta_file, ">%s\n", seq_name[s].c_str()); + } + + for (int64_t i = 0, j = 0; i < seq_len[s]; i++, j++) { + get_nucl_nt256(s, i, fasta_buffer[j]); + + if (j == fasta_line_l - 1 || i == seq_len[s] - 1) { + fwrite(fasta_buffer, 1, j + 1, fasta_file); + fwrite("\n", 1, 1, fasta_file); + j = -1; + } + } + } + + fclose(fasta_file); + + return 0; +} + +template +bool stats_t::check_allocation() const { + if (seq_active == nullptr || seq_len == nullptr || seq_stats == nullptr || + seq_name == nullptr || seq_comment == nullptr) { + return false; + } + + for (int i = 0; i < n_seqs; i++) { + if (seq_stats[i] == nullptr) { + return false; + } + } + + return true; +} + +template +bool stats_t::check_headers_bam_hdr( + const bam_hdr_t &h) const { + if (!check_allocation()) { + return false; + } + + for (int32_t seqid = 0; seqid < n_seqs; seqid++) { + if 
(seq_len[seqid] != static_cast(h.target_len[seqid])) { + return false; + } + if (seq_name[seqid].compare(h.target_name[seqid]) != 0) { + return false; + } + } + + return true; +} + +template +int stats_t::import_stats( + const std::string &stats_fn) { + assert(check_allocation()); + + int error_code = 0; + + FILE *fo = fopen(stats_fn.c_str(), "r"); + if (fo == nullptr) { + ococo::error("File with statistics could not be opened ('%s').\n", + stats_fn.c_str()); + return -1; + } + + /* number of seqs */ + int32_t n_seqs_loaded; + fread(&n_seqs_loaded, sizeof(int32_t), 1, fo); + + if (n_seqs_loaded != n_seqs) { + error( + "Numbers of sequences in stats and SAM/BAM do not correspond " + "%" PRId32 "!=%" PRId32 ").\n", + n_seqs_loaded, n_seqs); + return -1; + } + + for (int seqid = 0; seqid < n_seqs; seqid++) { + /* sequence */ + + single_seq_serial_t seq_ser; + fread(&seq_ser, sizeof(single_seq_serial_t), 1, fo); + + if (seq_ser.seq_active != seq_active[seqid]) { + error( + "Active sequences in stats and SAM/BAM do not correspond " + "(seqid %" PRId32 ").\n", + seqid); + return -1; + } + + if (seq_ser.seq_len != seq_len[seqid]) { + error( + "Sequence lengths in stats and SAM/BAM do not correspond " + "(seqid %" PRId32 ", %" PRId64 "!=%" PRId64 ").\n", + seqid, seq_ser.seq_len, seq_len[seqid]); + return -1; + } + + if (seq_name[seqid].compare(seq_ser.seq_name) != 0) { + error( + "Sequence names in stats and SAM/BAM do not correspond " + "(seqid %" PRId32 ", '%s'!='%s').\n", + seqid, seq_ser.seq_name, seq_name[seqid].c_str()); + return -1; + } + + fread(seq_stats[seqid], sizeof(T), seq_len[seqid], fo); + } + + error_code = fclose(fo); + if (error_code != 0) { + ococo::error("File with statistics could not be closed ('%s').\n", + stats_fn.c_str()); + return -1; + } + + return 0; +} + +template +int stats_t::export_stats( + const std::string &stats_fn) const { + assert(check_allocation()); + + int error_code = 0; + + FILE *fo = fopen(stats_fn.c_str(), "w+"); + if (fo == 
nullptr) { + ococo::error("File with statistics could not be opened ('%s').\n", + stats_fn.c_str()); + return -1; + } + + /* number of seqs */ + fwrite(&n_seqs, sizeof(int32_t), 1, fo); + + for (int seqid = 0; seqid < n_seqs; seqid++) { + /* sequence */ + single_seq_serial_t seq_ser = {0}; + seq_ser.seq_active = seq_active[seqid]; + seq_ser.seq_len = seq_len[seqid]; + strncpy(seq_ser.seq_name, seq_name[seqid].c_str(), 999); + strncpy(seq_ser.seq_name, seq_name[seqid].c_str(), 999); + uint64_t written = 0; + written += fwrite(&seq_ser, sizeof(single_seq_serial_t), 1, fo); + written += fwrite(seq_stats[seqid], sizeof(T), seq_len[seqid], fo); + if (written != 1 + static_cast(seq_len[seqid])) { + ococo::error( + "Problem with writting to the file with statistics ('%s').\n", + stats_fn.c_str()); + return -1; + } + } + + error_code = fclose(fo); + if (error_code != 0) { + ococo::error("File with statistics could not be closed ('%s').\n", + stats_fn.c_str()); + return -1; + } + return 0; +} + +template +int stats_t::call_consensus(FILE *vcf_file, + FILE *pileup_file) { + assert(check_allocation()); + + for (int32_t seqid = 0; seqid < n_seqs; seqid++) { + for (int64_t pos = 0; pos < seq_len[seqid]; pos++) { + call_consensus_position(vcf_file, pileup_file, seqid, pos); + } + } + + return 0; +} + +template +int stats_t::call_consensus_position( + FILE *vcf_file, FILE *pileup_file, int32_t seqid, int64_t pos) { + pos_stats_uncompr_t psu; + decompress_position_stats(seq_stats[seqid][pos], psu); + + char old_base_nt256; + get_nucl_nt256(seqid, pos, old_base_nt256); + // const char new_base_nt256=cons_call_maj(psu); + const char new_base_nt256 = + (params->cons_alg[params->strategy])(psu, *params); + + if (old_base_nt256 != new_base_nt256) { + if (vcf_file != nullptr) { + print_vcf_substitution(vcf_file, seqid, pos, old_base_nt256, + new_base_nt256, psu); + } + params->n_upd += 1; + + set_nucl_nt256(seqid, pos, new_base_nt256); + } + + if (params->verbose) { + if (old_base_nt256 
== new_base_nt256) { + if (vcf_file != nullptr) { + print_vcf_substitution(vcf_file, seqid, pos, old_base_nt256, + new_base_nt256, psu); + } + } + } + + if (pileup_file != nullptr) { + print_pileup_line(pileup_file, seqid, pos, psu); + } + + return 0; +} +template +T stats_t::compress_position_stats( + const pos_stats_uncompr_t &psu) { + T psc = 0; + + for (int32_t i = 0; i < 4; i++) { + psc <<= counter_size; + psc |= psu.counters[i] & right_full_mask(); + } + + psc <<= refbase_size; + psc |= psu.nt16; + + return psc; +} + +template +void stats_t::decompress_position_stats( + T psc, pos_stats_uncompr_t &psu) { + psu.nt16 = psc & right_full_mask(); + psc >>= refbase_size; + + psu.sum = 0; + for (int32_t i = 3; i >= 0; i--) { + psu.counters[i] = psc & right_full_mask(); + psu.sum += psu.counters[i]; + psc >>= counter_size; + } +} + +template +int stats_t::print_vcf_header( + FILE *vcf_file, std::string cmd, std::string fasta) const { + assert(check_allocation()); + assert(vcf_file != nullptr); + + std::time_t tt = std::time(nullptr); + tm *tm = localtime(&tt); + + fprintf(vcf_file, + "##fileformat=VCFv4.3\n" + "##fileDate=%04d%02d%02d\n" + "##source=Ococo\n", + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday); + + if (!cmd.empty()) { + fprintf(vcf_file, "##ococo_command=%s\n", cmd.c_str()); + } + fprintf(vcf_file, "##ococo_stats_datatype_size=%zubits\n", 8 * sizeof(T)); + fprintf(vcf_file, "##ococo_counter_size=%dbits\n", counter_size); + + if (!fasta.empty()) { + fprintf(vcf_file, "##reference=%s\n", fasta.c_str()); + } + + for (int seqid = 0; seqid < n_seqs; seqid++) { + fprintf(vcf_file, "##contig=\n", + seq_name[seqid].c_str(), seq_len[seqid]); + } + + fprintf(vcf_file, + "##INFO=\n"); + fprintf(vcf_file, + "##INFO=\n"); + fprintf(vcf_file, + "##INFO=\n"); + fprintf(vcf_file, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); + + return 0; +} + +template +int stats_t::print_vcf_substitution( + FILE *vcf_file, int32_t seqid, int64_t pos, char old_base, char 
new_base, + const pos_stats_uncompr_t &psu) const { + assert(check_allocation()); + assert(vcf_file != nullptr); + + float alt_freq = + 1.0 * psu.counters[nt256_nt4[static_cast(new_base)]] / psu.sum; + + fprintf(vcf_file, + "%s\t%" PRId64 "\t.\t%c\t%c\t100\tPASS\tAF=%.2f;CS=%" PRId32 + ",%" PRId32 ",%" PRId32 ",%" PRId32 ";COV=%" PRId32 "\n", + seq_name[seqid].c_str(), pos + 1, old_base, new_base, + round(alt_freq * 100.0) / 100, psu.counters[0], psu.counters[1], + psu.counters[2], psu.counters[3], psu.sum); + + return 0; +} + +template +int stats_t::print_pileup_line( + FILE *pileup_file, int32_t seqid, int64_t pos, + const pos_stats_uncompr_t &psu) const { + assert(check_allocation()); + assert(pileup_file != nullptr); + + // todo: fix situation when depth is larger (use the printing buffer more + // timess) + + const int32_t max_depth = 1000; + + assert(psu.sum < max_depth); + char bases[max_depth]; + char qualities[max_depth]; + + char ref_nt256 = nt16_nt256[psu.nt16]; + + if (psu.sum == 0) { + return 0; + } + + if (ref_nt256 == '=') { + ref_nt256 = 'N'; + } + + int32_t j = 0; + + for (int32_t nt4 = 0; nt4 < 4; nt4++) { + const char filling_char = + nt4_nt16[nt4] == psu.nt16 ? '.' : nt4_nt256[nt4]; + for (int32_t i = 0; i < psu.counters[nt4]; i++, j++) { + bases[j] = filling_char; + qualities[j] = '~'; + } + } + + if (psu.sum >= max_depth) { + ococo::error("Too high coverage at position %" PRId64 + ". 
Pileup does not support coverage higher than %" PRId32 + ".", + pos, max_depth); + return -1; + } + + bases[j] = '\0'; + qualities[j] = '\0'; + + fprintf(pileup_file, "%s\t%" PRId64 "\t%c\t%" PRId32 "\t%s\t%s\n", + seq_name[seqid].c_str(), pos + 1, ref_nt256, psu.sum, bases, + qualities); + + return 0; +} + +template +std::string stats_t::debug_str_counters( + int32_t seqid, int64_t pos) const { + pos_stats_uncompr_t psu; + decompress_position_stats(seq_stats[seqid][pos], psu); + std::stringstream ss; + ss << "[" << nt16_nt256[psu.nt16] << "]" + << "(" << psu.counters[0] << "," << psu.counters[1] << "," + << psu.counters[2] << "," << psu.counters[3] << ")"; + return ss.str(); +} + +template +void stats_t::debug_print_counters() const { + for (int seqid = 0; seqid < n_seqs; seqid++) { + fprintf(stderr, "%s\n", seq_name[seqid]); + for (int64_t pos = 0; pos < seq_len[seqid]; pos++) { + fprintf(stderr, "%8" PRId64 " %04x \n", pos, seq_stats[seqid][pos]); + } + } +} + +template +inline int stats_t::set_nucl_nt256( + int32_t seqid, int64_t pos, const char &nt256) { + nt16_t nt16 = nt256_nt16[static_cast(nt256)]; + T n_psc = seq_stats[seqid][pos]; + n_psc >>= refbase_size; + n_psc <<= refbase_size; + n_psc |= nt16 & right_full_mask(); + seq_stats[seqid][pos] = n_psc; + return 0; +} + +template +inline int stats_t::get_nucl_nt256( + int32_t seqid, int64_t pos, char &nt256) const { + nt256 = + nt16_nt256[seq_stats[seqid][pos] & right_full_mask()]; + if (nt256 == '=') { + nt256 = 'N'; + } + return 0; +} + +template +T stats_t::increment(T psc, nt4_t nt4) { + assert(0 <= nt4 && nt4 < 4); + + pos_stats_uncompr_t psu; + decompress_position_stats(psc, psu); + + if (psu.counters[nt4] == right_full_mask()) { + psu.counters[0] >>= 1; + psu.counters[1] >>= 1; + psu.counters[2] >>= 1; + psu.counters[3] >>= 1; + } + + psu.counters[nt4]++; + + return compress_position_stats(psu); +} } diff --git a/src/types.h b/src/types.h index e161c5a..7893b42 100644 --- a/src/types.h +++ 
b/src/types.h @@ -1,89 +1,87 @@ #pragma once -#include #include +#include namespace ococo { - const int fasta_line_l = 50; - const int stats_delim_l = 10; - - typedef uint8_t nt4_t; - typedef uint8_t nt16_t; - typedef uint8_t nt256_t; - - /****************** - * * - * Structures * - * * - ******************/ - - /***************** - *** Auxiliary *** - *****************/ - - struct single_seq_serial_t { - bool seq_active; - int64_t seq_len; - char seq_name[1000]; - char seq_comment[1000]; - }; - - /************************************ - *** Single position uncompressed *** - ************************************/ - - struct pos_stats_uncompr_t { - nt16_t nt16; - - int32_t counters[4]; - int32_t sum; - }; - - /************************** - *** Translation tables *** - **************************/ - - static const uint8_t nt256_nt4[] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; - - static const uint8_t nt16_nt4[] = {4, 0, 1, 4, 2, 4, 4, 4, - 3, 4, 4, 4, 4, 4, 4, 4}; - - static const uint8_t nt256_nt16[] = { - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 1, 2, 4, 8, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 0 /*=*/, 15, 15, - 15, 1, 14, 2, 13, 15, 15, 4, 11, 15, 15, 12, 15, 3, 15, 15, - 15, 15, 5, 6, 8, 15, 7, 9, 15, 10, 15, 15, 15, 15, 15, 15, - 15, 1, 14, 2, 13, 15, 15, 4, 11, 15, 15, 12, 15, 3, 15, 15, - 15, 15, 5, 6, 8, 15, 7, 9, 15, 10, 15, 15, 15, 15, 15, 15, - - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; - - static const uint8_t nt16_nt256[] = "=ACMGRSVTWYHKDBN"; - - static const uint8_t nt4_nt256[] = "ACGTN"; - - static const uint8_t nt4_nt16[] = {1, 2, 4, 8, 15}; - +const int fasta_line_l = 50; +const int stats_delim_l = 10; + +typedef uint8_t nt4_t; +typedef uint8_t nt16_t; +typedef uint8_t nt256_t; + +/****************** + * * + * Structures * + * * + ******************/ + +/***************** + *** Auxiliary *** + *****************/ + +struct single_seq_serial_t { + bool seq_active; + int64_t seq_len; + char seq_name[1000]; + char seq_comment[1000]; +}; + +/************************************ + *** Single position uncompressed *** + ************************************/ + +struct pos_stats_uncompr_t { + nt16_t nt16; + + int32_t counters[4]; + int32_t sum; +}; + +/************************** + *** Translation tables *** + **************************/ + +static const uint8_t nt256_nt4[] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 
4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; + +static const uint8_t nt16_nt4[] = {4, 0, 1, 4, 2, 4, 4, 4, + 3, 4, 4, 4, 4, 4, 4, 4}; + +static const uint8_t nt256_nt16[] = { + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 1, 2, 4, 8, 15, 15, 15, 15, 15, 15, 15, 15, 15, 0 /*=*/, 15, 15, + 15, 1, 14, 2, 13, 15, 15, 4, 11, 15, 15, 12, 15, 3, 15, 15, + 15, 15, 5, 6, 8, 15, 7, 9, 15, 10, 15, 15, 15, 15, 15, 15, + 15, 1, 14, 2, 13, 15, 15, 4, 11, 15, 15, 12, 15, 3, 15, 15, + 15, 15, 5, 6, 8, 15, 7, 9, 15, 10, 15, 15, 15, 15, 15, 15, + + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; + +static const uint8_t nt16_nt256[] = "=ACMGRSVTWYHKDBN"; + +static const uint8_t nt4_nt256[] = "ACGTN"; + +static const uint8_t nt4_nt16[] = {1, 2, 4, 8, 15}; } - diff --git a/src/version.h b/src/version.h index a3ec639..fcf01ed 100644 --- a/src/version.h +++ b/src/version.h @@ -1,6 +1,5 @@ #pragma once -namespace 
ococo{ - static const char *OCOCO_VERSION = "0.1.2"; +namespace ococo { +static const char *OCOCO_VERSION = "0.1.2"; } - From ffd9c80a6b54632bf44940976694509de440405d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 16:55:39 +0200 Subject: [PATCH 11/32] Non-working options, compilable code --- src/main.cpp | 2 +- src/params.cpp | 62 ++++++++++++++++++++++++++++++++++++++++++++++++-- src/params.h | 4 ++-- 3 files changed, 63 insertions(+), 5 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index ef6d3a2..9fefc9a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,7 +6,7 @@ #include #include -int main(int argc, const char *argv[]) { +int main(int argc, const char **argv) { /* Use default configuration */ ococo::params_t params = ococo::params_t(argc, argv); if (!params.correctly_initialized) { diff --git a/src/params.cpp b/src/params.cpp index 6cabac8..b2c2d5e 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -1,6 +1,17 @@ #include "params.h" +#include #include "consensus.h" +#include +#include +#include +#include +#include + +#define no_argument 0 +#define required_argument 1 +#define optional_argument 2 + /**************************** *** Consensus parameters *** ****************************/ @@ -39,7 +50,7 @@ void ococo::params_t::init_default_values() { ococo::params_t::params_t() { init_default_values(); } -ococo::params_t::params_t(int argc, const char *argv[]) { +ococo::params_t::params_t(int argc, const char **argv) { init_default_values(); parse_commandline(argc, argv); } @@ -87,7 +98,7 @@ ococo::params_t::~params_t() { } } -void ococo::params_t::parse_commandline(int argc, const char *argv[]) { +void ococo::params_t::parse_commandline(int argc, const char **argv) { /* Save cmd parameters */ std::stringstream cmd; @@ -100,6 +111,53 @@ void ococo::params_t::parse_commandline(int argc, const char *argv[]) { command = cmd.str(); /* Parse cmd parameters */ + //int getopt_long(int argc, char *const *argv, const char 
*optstring, + // const struct option *longopts, int *longindex); + + const struct option lopts[] = { + {"version", no_argument, NULL, 'v'}, + {"help", no_argument, NULL, 'h'}, + // + {"input", required_argument, NULL, 'i'}, + {"fasta-ref", required_argument, NULL, 'f'}, + {"stats-in", required_argument, NULL, 's'}, + // + {"fasta-cons", required_argument, NULL, 'F'}, + {"stats-out", required_argument, NULL, 'S'}, + {"vcf-cons", required_argument, NULL, 'V'}, + {"pileup", required_argument, NULL, 'P'}, + {"log", required_argument, NULL, 'L'}, + {"verbose", required_argument, NULL, 'W'}, + // + {"counters", required_argument, NULL, 'x'}, // require flag + {"mode", required_argument, NULL, 'm'}, // filter flag + {"strategy", required_argument, NULL, 's'}, + {"min-MQ", required_argument, NULL, 'q'}, + {"min-BQ", required_argument, NULL, 'Q'}, + {"ref-weight", required_argument, NULL, 'w'}, + {"min-coverage", required_argument, NULL, 'c'}, + {"majority-threshold", required_argument, NULL, 'M'}, + // + {NULL, 0, NULL, 0}}; + + int c; + while ((c = getopt_long(argc,(char *const *) argv, "vhi:f:s:F:S:V:P:L:W:x:m:s:q:Q:w:c:M:", + lopts, NULL)) >= 0) { + switch (c) { + case 'h': + // print_help(); + break; + case 'v': + // print_help(); + break; + case 1: + break; + case '?': + correctly_initialized = false; + return_code = -1; + // return -1; + } + } try { namespace po = boost::program_options; diff --git a/src/params.h b/src/params.h index 2ef7ed0..23c4034 100644 --- a/src/params.h +++ b/src/params.h @@ -120,11 +120,11 @@ struct params_t { params_t(); - params_t(int argc, const char *argv[]); + params_t(int argc, const char **argv); ~params_t(); - void parse_commandline(int argc, const char *argv[]); + void parse_commandline(int argc, const char **argv); void init_default_values(); }; From 00b38f512e4f9d2b7fb053355ca9cccfb78e2bc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 17:43:54 +0200 Subject: [PATCH 12/32] Add help message --- 
src/params.cpp | 73 +++++++++++++++++++++++++++++++++++++------------- src/params.h | 2 ++ 2 files changed, 57 insertions(+), 18 deletions(-) diff --git a/src/params.cpp b/src/params.cpp index b2c2d5e..cbdb9e2 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -1,16 +1,7 @@ #include "params.h" -#include #include "consensus.h" -#include -#include -#include -#include -#include - -#define no_argument 0 -#define required_argument 1 -#define optional_argument 2 +#include /**************************** *** Consensus parameters *** @@ -98,6 +89,51 @@ ococo::params_t::~params_t() { } } +void ococo::params_t::print_help() { + std::cerr + << + + // clang-format off + // "---------------------------------------------------------------------------------" + "Generic options:\n" + " -v, --version print version and exit\n" + " -h, --help print this message and exit\n\n" + // "---------------------------------------------------------------------------------" + "Input options:\n" + " -i, --input FILE input SAM/BAM file (- for standard input)\n" + " -f, --fasta-ref FILE initial FASTA reference (otherwise sequence of N's \n" + " considered as the reference)\n" + " -s, --stats-in arg input statistics.\n\n" + // "---------------------------------------------------------------------------------" + "Output options:\n" + " -F, --fasta-cons FILE FASTA file with consensus\n" + " -S, --stats-out FILE outputs statistics\n" + " -V, --vcf-cons FILE VCF file with updates of consensus (- for standard output)\n" + " -P, --pileup FILE truncated pileup (- for standard output)\n" + " --log FILE auxiliary log file\n" + " --verbose verbose mode\n\n" + // "---------------------------------------------------------------------------------" + "Parameters of consensus calling:\n" + " -x, --counters STR counters configuration: [ococo16]\n" + " - ococo16 (3b/counter, 16b/position)\n" + " - ococo32 (7b/counter, 32b/position)\n" + " - ococo64 (15b/counter, 64b/position)\n" + " -m, --mode STR mode: [batch]\n" + 
" - real-time / batch\n" + " -t, --strategy STR strategy for updates: [majority]\n" + " - no-updates / majority / stochastic\n" + //" -a [ --allow-amb ] Allow updates to ambiguous " + //"nucleotides.\n" + " -q, --min-MQ INT skip alignments with mapping quality smaller than INT [1]\n" + " -Q, --min-BQ INT skip bases with base quality smaller than INT [13]\n" + " -w, --ref-weight INT initial counter value for nucleotides from ref [0]\n" + " -c, --min-cov INT minimum coverage required for update [2]\n" + " -M, --maj-thres FLOAT majority threshold [0.6]" + // "---------------------------------------------------------------------------------" + // clang-format on + << std::endl; +} + void ococo::params_t::parse_commandline(int argc, const char **argv) { /* Save cmd parameters */ @@ -111,8 +147,6 @@ void ococo::params_t::parse_commandline(int argc, const char **argv) { command = cmd.str(); /* Parse cmd parameters */ - //int getopt_long(int argc, char *const *argv, const char *optstring, - // const struct option *longopts, int *longindex); const struct option lopts[] = { {"version", no_argument, NULL, 'v'}, @@ -129,20 +163,23 @@ void ococo::params_t::parse_commandline(int argc, const char **argv) { {"log", required_argument, NULL, 'L'}, {"verbose", required_argument, NULL, 'W'}, // - {"counters", required_argument, NULL, 'x'}, // require flag - {"mode", required_argument, NULL, 'm'}, // filter flag + {"counters", required_argument, NULL, 'x'}, + {"mode", required_argument, NULL, 'm'}, {"strategy", required_argument, NULL, 's'}, {"min-MQ", required_argument, NULL, 'q'}, {"min-BQ", required_argument, NULL, 'Q'}, {"ref-weight", required_argument, NULL, 'w'}, - {"min-coverage", required_argument, NULL, 'c'}, - {"majority-threshold", required_argument, NULL, 'M'}, + {"min-cov", required_argument, NULL, 'c'}, + {"min-coverage", required_argument, NULL, 'c'}, // deprec + {"maj-thres", required_argument, NULL, 'M'}, + {"majority-threshold", required_argument, NULL, 'M'}, // 
deprec // {NULL, 0, NULL, 0}}; int c; - while ((c = getopt_long(argc,(char *const *) argv, "vhi:f:s:F:S:V:P:L:W:x:m:s:q:Q:w:c:M:", - lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, (char *const *)argv, + "vhi:f:s:F:S:V:P:L:W:x:m:s:q:Q:w:c:M:", lopts, + NULL)) >= 0) { switch (c) { case 'h': // print_help(); diff --git a/src/params.h b/src/params.h index 23c4034..7b1e549 100644 --- a/src/params.h +++ b/src/params.h @@ -126,6 +126,8 @@ struct params_t { void parse_commandline(int argc, const char **argv); + void print_help(); + void init_default_values(); }; } From beb3551fbb3ac72607e4bf87ae2c2e5c4d387e5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 18:42:24 +0200 Subject: [PATCH 13/32] Replace Boost by optarg, not working yet --- .travis.yml | 12 -- CMakeLists.txt | 17 +- src/params.cpp | 354 ++++++++++++-------------------- src/params.h | 5 - src/stats.h | 2 + tests/unit_tests/CMakeLists.txt | 2 +- 6 files changed, 141 insertions(+), 251 deletions(-) diff --git a/.travis.yml b/.travis.yml index 521b4bb..58d3d06 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,16 +3,6 @@ language: cpp cache: apt: true -env: - global: - # Maintenance note: to move to a new version - # of boost, update both BOOST_ROOT and BOOST_URL. - # Note that for simplicity, BOOST_ROOT's final - # namepart must match the folder name internal - # to boost's .tar.gz. - - BOOST_ROOT=$HOME/boost_1_60_0 - - BOOST_URL='http://downloads.sourceforge.net/project/boost/boost/1.60.0/boost_1_60_0.tar.gz?r=https%3A%2F%2Fsourceforge.net%2Fprojects%2Fboost%2Ffiles%2Fboost%2F1.60.0%2Fboost_1_60_0.tar.gz&ts=1460417589&use_mirror=netix' - matrix: include: @@ -29,7 +19,6 @@ matrix: - sudo apt-get install g++-4.8 cmake zlib1g-dev git-svn bats libgtest-dev - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90 - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 90 - - ./scripts/install-boost.sh - cd /usr/src/gtest && sudo cmake . 
&& sudo cmake --build . && sudo mv libg* /usr/local/lib/ ; cd - script: - cmake . -DBUILD_TESTS=1 && make && ./run.sh @@ -42,7 +31,6 @@ matrix: - sudo brew update - sudo brew tap homebrew/versions - sudo brew install llvm37 - - sudo brew install boost155 - sudo brew install bats - sudo brew unlink cmake && sudo brew install cmake31 - sudo pip install --upgrade jinja2 nose diff --git a/CMakeLists.txt b/CMakeLists.txt index 6cd5fc8..2885afc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,8 +6,6 @@ project (ococo C CXX) # COMPILATION MODE SWITCHERS # ############################## -set(Boost_USE_STATIC_LIBS 1) - if(NOT CMAKE_BUILD_TYPE) message(STATUS "Setting build type to 'RELEASE' as none was specified.") set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "Choose the type of build." FORCE) @@ -28,15 +26,6 @@ set(DEBUGGING_SEVERITY "trace" CACHE STRING "Verbosity of debugging mode.") option(BUILD_TESTS "Build tests." OFF) -######### -# BOOST # -######### - -find_package( Boost 1.46 COMPONENTS program_options REQUIRED ) - -include_directories( ${Boost_INCLUDE_DIR} ) - - ########## # HTSLIB # ########## @@ -73,8 +62,6 @@ else() endif(ZLIB_FOUND) include_directories(${htslib_INSTALL}/include) -find_package (Threads) - ################ # MAIN PROGRAM # @@ -100,8 +87,8 @@ add_library(ococo_core src/stats.h src/version.h ) -target_link_libraries(ococo_core ${Boost_LIBRARIES} ${CMAKE_BINARY_DIR}/ext/htslib/libhts.a ${ZLIB_LIBRARIES}) -target_link_libraries(ococo ${CMAKE_THREAD_LIBS_INIT} ${ZLIB_LIBRARIES} ococo_core) +target_link_libraries(ococo_core ${CMAKE_BINARY_DIR}/ext/htslib/libhts.a ${ZLIB_LIBRARIES}) +target_link_libraries(ococo ${ZLIB_LIBRARIES} ococo_core) ################ diff --git a/src/params.cpp b/src/params.cpp index cbdb9e2..128409a 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -111,17 +111,18 @@ void ococo::params_t::print_help() { " -V, --vcf-cons FILE VCF file with updates of consensus (- for standard output)\n" " -P, --pileup FILE truncated pileup 
(- for standard output)\n" " --log FILE auxiliary log file\n" - " --verbose verbose mode\n\n" + " --verbose verbose mode (report every counter update)\n\n" // "---------------------------------------------------------------------------------" "Parameters of consensus calling:\n" " -x, --counters STR counters configuration: [ococo16]\n" " - ococo16 (3b/counter, 16b/position)\n" " - ococo32 (7b/counter, 32b/position)\n" " - ococo64 (15b/counter, 64b/position)\n" - " -m, --mode STR mode: [batch]\n" - " - real-time / batch\n" + " -m, --mode STR mode: real-time / batch [batch]\n" " -t, --strategy STR strategy for updates: [majority]\n" - " - no-updates / majority / stochastic\n" + " - majority (update to majority base)\n" + " - stochastic (update to stochastically chosen base)\n" + " - no-updates (useful when only pileup is needed)\n" //" -a [ --allow-amb ] Allow updates to ambiguous " //"nucleotides.\n" " -q, --min-MQ INT skip alignments with mapping quality smaller than INT [1]\n" @@ -135,230 +136,92 @@ void ococo::params_t::print_help() { } void ococo::params_t::parse_commandline(int argc, const char **argv) { - /* Save cmd parameters */ - - std::stringstream cmd; - for (int32_t i = 0; i < argc; i++) { - cmd << argv[i]; - if (i != argc - 1) { - cmd << " "; - } - } - command = cmd.str(); /* Parse cmd parameters */ const struct option lopts[] = { - {"version", no_argument, NULL, 'v'}, - {"help", no_argument, NULL, 'h'}, + {"version", no_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, 'h'}, // - {"input", required_argument, NULL, 'i'}, - {"fasta-ref", required_argument, NULL, 'f'}, - {"stats-in", required_argument, NULL, 's'}, + {"input", required_argument, nullptr, 'i'}, + {"fasta-ref", required_argument, nullptr, 'f'}, + {"stats-in", required_argument, nullptr, 's'}, // - {"fasta-cons", required_argument, NULL, 'F'}, - {"stats-out", required_argument, NULL, 'S'}, - {"vcf-cons", required_argument, NULL, 'V'}, - {"pileup", required_argument, NULL, 'P'}, - 
{"log", required_argument, NULL, 'L'}, - {"verbose", required_argument, NULL, 'W'}, + {"fasta-cons", required_argument, nullptr, 'F'}, + {"stats-out", required_argument, nullptr, 'S'}, + {"vcf-cons", required_argument, nullptr, 'V'}, + {"pileup", required_argument, nullptr, 'P'}, + {"log", required_argument, nullptr, 'L'}, + {"verbose", required_argument, nullptr, 'W'}, // - {"counters", required_argument, NULL, 'x'}, - {"mode", required_argument, NULL, 'm'}, - {"strategy", required_argument, NULL, 's'}, - {"min-MQ", required_argument, NULL, 'q'}, - {"min-BQ", required_argument, NULL, 'Q'}, - {"ref-weight", required_argument, NULL, 'w'}, - {"min-cov", required_argument, NULL, 'c'}, - {"min-coverage", required_argument, NULL, 'c'}, // deprec - {"maj-thres", required_argument, NULL, 'M'}, - {"majority-threshold", required_argument, NULL, 'M'}, // deprec + {"counters", required_argument, nullptr, 'x'}, + {"mode", required_argument, nullptr, 'm'}, + {"strategy", required_argument, nullptr, 's'}, + {"min-MQ", required_argument, nullptr, 'q'}, + {"min-BQ", required_argument, nullptr, 'Q'}, + {"ref-weight", required_argument, nullptr, 'w'}, + {"min-cov", required_argument, nullptr, 'c'}, + {"min-coverage", required_argument, nullptr, 'c'}, // deprec + {"maj-thres", required_argument, nullptr, 'M'}, + {"majority-threshold", required_argument, nullptr, 'M'}, // deprec // - {NULL, 0, NULL, 0}}; + {nullptr, 0, nullptr, 0}}; int c; + using std::string; while ((c = getopt_long(argc, (char *const *)argv, "vhi:f:s:F:S:V:P:L:W:x:m:s:q:Q:w:c:M:", lopts, - NULL)) >= 0) { + nullptr)) >= 0) { switch (c) { - case 'h': - // print_help(); - break; - case 'v': - // print_help(); - break; - case 1: - break; - case '?': - correctly_initialized = false; - return_code = -1; - // return -1; - } - } - - try { - namespace po = boost::program_options; - - po::options_description options_generic("Generic options"); - options_generic.add_options() - // - ("version,v", "Print version and exit.") - 
// - ("help,h", "Print this message and exit.") - // - ; - - po::options_description options_input("Input options"); - options_input.add_options() - // - ("input,i", po::value(&sam_fn)->required(), - "Input SAM/BAM file (- for standard input).") - // - ("fasta-ref,f", po::value(&fasta_in_fn), - "Initial FASTA reference (if not provided, sequence of N's is " - "considered as the reference).") - // - ("stats-in,s", po::value(&stats_in_fn), - "Input statistics.") - // - ; - - po::options_description options_output("Output options"); - options_output.add_options() - // - ("fasta-cons,F", po::value(&fasta_out_fn), - "FASTA file with consensus.") - // - ("stats-out,S", po::value(&stats_out_fn), - "Outputs statistics.") - // - ("vcf-cons,V", po::value(&vcf_fn), - "VCF file with updates of consensus (- for standard output).") - // - ("pileup,P", po::value(&pileup_fn), - "Truncated pileup (- for standard output).") - // - ("log", po::value(&log_fn), "Auxiliary log file.") - // - ("verbose", "Verbose mode.") - // - ; - - po::options_description options_consensus( - "Parameters of consensus calling"); - options_consensus.add_options() - // - ("counters,x", - po::value(&counters_str)->default_value(counters_str), - "Counters configuration: \n - ococo16 (3b/counter, " - "16b/position)\n - ococo32 (7b/counter, 32b/position)\n - ococo64 " - "(15b/counter, 64b/position)") - // - ("mode,m", - po::value(&mode_str)->default_value(mode_str), - "Mode: real-time / batch.") - // - ("strategy,t", - po::value(&strategy_str)->default_value(strategy_str), - "Strategy for updates: no-updates / majority / stochastic.") - // - ("allow-amb,a", "Allow updates to ambiguous nucleotides.") - // - ("min-MQ,q", po::value(&min_mapq)->default_value(min_mapq), - "Skip alignments with mapping quality smaller than INT.") - // - ("min-BQ,Q", - po::value(&min_baseq)->default_value(min_baseq), - "Skip bases with base quality smaller than INT.") - // - ("ref-weight,w", po::value(&init_ref_weight) - 
->default_value(init_ref_weight), - "Initial counter value for nucleotides from the reference.") - // - ("min-coverage,c", - po::value(&min_coverage)->default_value(min_coverage), - "Minimum coverage required for update.") - // - ("majority-threshold,M", po::value(&majority_threshold) - ->default_value(majority_threshold), - "Majority threshold.") - // - ; - - po::options_description options_all; - options_all.add(options_generic) - .add(options_input) - .add(options_output) - .add(options_consensus); - - po::variables_map vm; - try { - po::store( - po::command_line_parser(argc, argv).options(options_all).run(), - vm); // can throw - - if (vm.count("version")) { - std::cout << std::endl; + case 'v': { print_version(); - std::cout << std::endl; exit(0); + break; } - - if (vm.count("help")) { - std::cout << options_all << "\n"; + case 'h': { + print_help(); exit(0); + break; } - - po::notify(vm); - // throws on error, so do after help in case there - // are any problems - if (vm.count("strategy")) { - if (strategy_str.compare("no-updates") == 0) { - strategy = ococo::strategy_t::NO_UPDATES; - } else if (strategy_str.compare("majority") == 0) { - if (vm.count("allow-amb") == 0) { - strategy = ococo::strategy_t::MAJORITY; - } else { - strategy = ococo::strategy_t::MAJORITY_AMB; - } - } else if (strategy_str.compare("stochastic") == 0) { - if (vm.count("allow-amb") == 0) { - strategy = ococo::strategy_t::STOCHASTIC; - } else { - strategy = ococo::strategy_t::STOCHASTIC_AMB; - } - } else { - ococo::error( - "Unknown strategy '%s'. Possible strategies " - "are 'majority' and 'stochastic'.\n", - strategy_str.c_str()); - correctly_initialized = false; - return_code = -1; - return; - } + case 'i': { + sam_fn = optarg; + break; } - - if (vm.count("mode")) { - if (mode_str.compare("batch") == 0) { - mode = ococo::mode_t::BATCH; - } else if (mode_str.compare("real-time") == 0) { - mode = ococo::mode_t::REALTIME; - } else { - ococo::error( - "Unknown mode '%s'. 
Possible modes are " - "'batch' and 'real-time'.\n", - mode_str.c_str()); - correctly_initialized = false; - return_code = -1; - return; - } + case 'f': { + fasta_in_fn = optarg; + break; } - - if (vm.count("verbose")) { + case 's': { + stats_in_fn = optarg; + break; + } + case 'F': { + fasta_out_fn = optarg; + break; + } + case 'S': { + stats_out_fn = optarg; + break; + } + case 'V': { + vcf_fn = optarg; + break; + } + case 'P': { + pileup_fn = optarg; + break; + } + case 'L': { + log_fn = optarg; + break; + } + case 'W': { verbose = true; + break; } + case 'x': { + counters_str = optarg; - if (vm.count("counters")) { if (counters_str.compare("ococo16") == 0) { counter_configuration = OCOCO16; counters_str_descr = @@ -383,21 +246,76 @@ void ococo::params_t::parse_commandline(int argc, const char **argv) { return_code = -1; return; } + + break; } - ococo::info("Ococo starting: %s\n", counters_str_descr.c_str()); + case 'm': { + mode_str = optarg; - } catch (po::error &e) { - std::cout << options_all << "\n"; - ococo::error("%s.\n", e.what()); - correctly_initialized = false; - return_code = -1; - return; - } + if (mode_str.compare("batch") == 0) { + mode = ococo::mode_t::BATCH; + } else if (mode_str.compare("real-time") == 0) { + mode = ococo::mode_t::REALTIME; + } else { + ococo::error( + "Unknown mode '%s'. Possible modes are 'batch' and " + "'real-time'.\n", + mode_str.c_str()); + correctly_initialized = false; + return_code = -1; + return; + } + + break; + } + case 't': { + strategy_str = optarg; + + if (strategy_str.compare("stochastic") == 0) { + strategy = ococo::strategy_t::STOCHASTIC; + } else if (strategy_str.compare("no-updates") == 0) { + strategy = ococo::strategy_t::NO_UPDATES; + } else if (strategy_str.compare("majority") == 0) { + strategy = ococo::strategy_t::MAJORITY; + } else { + ococo::error( + "Unknown strategy '%s'. 
Possible strategies are " + "'majority', 'stochastic' and 'no-updates'.\n", + strategy_str.c_str()); + correctly_initialized = false; + return_code = -1; + return; + } - } catch (std::exception &e) { - ococo::error("Unhandled Exception: %s.\n", e.what()); - correctly_initialized = false; - return_code = -1; - return; + break; + } + case 'q': { + min_mapq = atoi(optarg); + break; + } + case 'Q': { + min_baseq = atoi(optarg); + break; + } + case 'w': { + init_ref_weight = atoi(optarg); + break; + } + case 'c': { + min_coverage = atoi(optarg); + break; + } + case 'M': { + majority_threshold = atof(optarg); + exit(0); + break; + } + case '?': { + ococo::error("probably unknown option"); + exit(1); + break; + } + } } + ococo::info("Ococo starting: %s\n", counters_str_descr.c_str()); } diff --git a/src/params.h b/src/params.h index 7b1e549..0572c38 100644 --- a/src/params.h +++ b/src/params.h @@ -7,11 +7,6 @@ #include #include -#include -#include -#include -#include - #include #include #include diff --git a/src/stats.h b/src/stats.h index b47a8ae..466b922 100644 --- a/src/stats.h +++ b/src/stats.h @@ -12,7 +12,9 @@ #include #include +#include #include +#include /*********************** *** Main statistics *** diff --git a/tests/unit_tests/CMakeLists.txt b/tests/unit_tests/CMakeLists.txt index 7d07604..f476bec 100644 --- a/tests/unit_tests/CMakeLists.txt +++ b/tests/unit_tests/CMakeLists.txt @@ -12,6 +12,6 @@ if (BUILD_TESTS) find_package(GTest REQUIRED) include_directories(${GTEST_INCLUDE_DIRS}) - target_link_libraries(unittests ${GTEST_BOTH_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${Boost_LIBRARIES} ${htslib_LIB} ${zlib_LIB} ococo_core) + target_link_libraries(unittests ${GTEST_BOTH_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${htslib_LIB} ${zlib_LIB} ococo_core) endif (BUILD_TESTS) From 9a6571624f6f136a393de6edbc36accb9d13758e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 18:46:11 +0200 Subject: [PATCH 14/32] Working version --- .travis.yml 
| 5 +---- src/params.cpp | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 58d3d06..803a69e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -39,7 +39,4 @@ matrix: script: - cmake . && make && ./run.sh - (cd tests/integration_tests && bats ${CI:+--tap} integration_tests.bats) - -cache: - directories: - - $BOOST_ROOT + \ No newline at end of file diff --git a/src/params.cpp b/src/params.cpp index 128409a..60b9af9 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -156,7 +156,7 @@ void ococo::params_t::parse_commandline(int argc, const char **argv) { // {"counters", required_argument, nullptr, 'x'}, {"mode", required_argument, nullptr, 'm'}, - {"strategy", required_argument, nullptr, 's'}, + {"strategy", required_argument, nullptr, 't'}, {"min-MQ", required_argument, nullptr, 'q'}, {"min-BQ", required_argument, nullptr, 'Q'}, {"ref-weight", required_argument, nullptr, 'w'}, @@ -170,7 +170,7 @@ void ococo::params_t::parse_commandline(int argc, const char **argv) { int c; using std::string; while ((c = getopt_long(argc, (char *const *)argv, - "vhi:f:s:F:S:V:P:L:W:x:m:s:q:Q:w:c:M:", lopts, + "vhi:f:s:F:S:V:P:L:W:x:m:t:q:Q:w:c:M:", lopts, nullptr)) >= 0) { switch (c) { case 'v': { From 0c0916110a0fcabd5120c1e5e3414a37fc8cb6bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 18:52:06 +0200 Subject: [PATCH 15/32] Reorganize tests --- .travis.yml | 14 +-- .../data/alignment_AA_1.sam | 0 .../data/alignment_AA_unm.sam | 0 .../data/alignment_A_2.sam | 0 .../data/alignment_C_2.sam | 0 .../{integration_tests => }/data/fasta_AA.fa | 0 .../{integration_tests => }/data/fasta_NA.fa | 0 .../{integration_tests => }/data/fasta_NC.fa | 0 .../{integration_tests => }/data/fasta_NN.fa | 0 .../{integration_tests => }/output/.gitignore | 0 .../test_aligned_reads_len1.sh | 2 +- .../test_aligned_reads_len2.sh | 2 +- .../test_no_parameters.sh | 2 +- .../test_stats_import_export.sh | 6 +- 
.../test_unaligned_reads.sh | 2 +- .../integration_tests.bats => tests.bats} | 2 +- tests/unit_tests/.gitignore | 2 - tests/unit_tests/CMakeLists.txt | 17 --- tests/unit_tests/unittests.cpp | 107 ------------------ 19 files changed, 14 insertions(+), 142 deletions(-) rename tests/{integration_tests => }/data/alignment_AA_1.sam (100%) rename tests/{integration_tests => }/data/alignment_AA_unm.sam (100%) rename tests/{integration_tests => }/data/alignment_A_2.sam (100%) rename tests/{integration_tests => }/data/alignment_C_2.sam (100%) rename tests/{integration_tests => }/data/fasta_AA.fa (100%) rename tests/{integration_tests => }/data/fasta_NA.fa (100%) rename tests/{integration_tests => }/data/fasta_NC.fa (100%) rename tests/{integration_tests => }/data/fasta_NN.fa (100%) rename tests/{integration_tests => }/output/.gitignore (100%) rename tests/{integration_tests => }/test_aligned_reads_len1.sh (95%) rename tests/{integration_tests => }/test_aligned_reads_len2.sh (95%) rename tests/{integration_tests => }/test_no_parameters.sh (80%) rename tests/{integration_tests => }/test_stats_import_export.sh (94%) rename tests/{integration_tests => }/test_unaligned_reads.sh (95%) rename tests/{integration_tests/integration_tests.bats => tests.bats} (96%) delete mode 100644 tests/unit_tests/.gitignore delete mode 100644 tests/unit_tests/CMakeLists.txt delete mode 100644 tests/unit_tests/unittests.cpp diff --git a/.travis.yml b/.travis.yml index 803a69e..21e4349 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,14 +16,12 @@ matrix: - sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y - sudo add-apt-repository ppa:duggan/bats -y - sudo apt-get update - - sudo apt-get install g++-4.8 cmake zlib1g-dev git-svn bats libgtest-dev + - sudo apt-get install g++-4.8 cmake zlib1g-dev bats - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90 - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 90 - - cd /usr/src/gtest && sudo cmake . 
&& sudo cmake --build . && sudo mv libg* /usr/local/lib/ ; cd - script: - - cmake . -DBUILD_TESTS=1 && make && ./run.sh - - (cd tests/integration_tests && bats ${CI:+--tap} integration_tests.bats) - - ./tests/unit_tests/unittests + - cmake . && make && ./run.sh + - (cd tests && bats ${CI:+--tap} tests.bats) - os: osx compiler: clang-3.7 @@ -33,10 +31,10 @@ matrix: - sudo brew install llvm37 - sudo brew install bats - sudo brew unlink cmake && sudo brew install cmake31 - - sudo pip install --upgrade jinja2 nose + #- sudo pip install --upgrade jinja2 nose install: - export CXX="clang++-3.7" CC="clang-3.7" script: - cmake . && make && ./run.sh - - (cd tests/integration_tests && bats ${CI:+--tap} integration_tests.bats) - \ No newline at end of file + - (cd tests && bats ${CI:+--tap} tests.bats) + diff --git a/tests/integration_tests/data/alignment_AA_1.sam b/tests/data/alignment_AA_1.sam similarity index 100% rename from tests/integration_tests/data/alignment_AA_1.sam rename to tests/data/alignment_AA_1.sam diff --git a/tests/integration_tests/data/alignment_AA_unm.sam b/tests/data/alignment_AA_unm.sam similarity index 100% rename from tests/integration_tests/data/alignment_AA_unm.sam rename to tests/data/alignment_AA_unm.sam diff --git a/tests/integration_tests/data/alignment_A_2.sam b/tests/data/alignment_A_2.sam similarity index 100% rename from tests/integration_tests/data/alignment_A_2.sam rename to tests/data/alignment_A_2.sam diff --git a/tests/integration_tests/data/alignment_C_2.sam b/tests/data/alignment_C_2.sam similarity index 100% rename from tests/integration_tests/data/alignment_C_2.sam rename to tests/data/alignment_C_2.sam diff --git a/tests/integration_tests/data/fasta_AA.fa b/tests/data/fasta_AA.fa similarity index 100% rename from tests/integration_tests/data/fasta_AA.fa rename to tests/data/fasta_AA.fa diff --git a/tests/integration_tests/data/fasta_NA.fa b/tests/data/fasta_NA.fa similarity index 100% rename from 
tests/integration_tests/data/fasta_NA.fa rename to tests/data/fasta_NA.fa diff --git a/tests/integration_tests/data/fasta_NC.fa b/tests/data/fasta_NC.fa similarity index 100% rename from tests/integration_tests/data/fasta_NC.fa rename to tests/data/fasta_NC.fa diff --git a/tests/integration_tests/data/fasta_NN.fa b/tests/data/fasta_NN.fa similarity index 100% rename from tests/integration_tests/data/fasta_NN.fa rename to tests/data/fasta_NN.fa diff --git a/tests/integration_tests/output/.gitignore b/tests/output/.gitignore similarity index 100% rename from tests/integration_tests/output/.gitignore rename to tests/output/.gitignore diff --git a/tests/integration_tests/test_aligned_reads_len1.sh b/tests/test_aligned_reads_len1.sh similarity index 95% rename from tests/integration_tests/test_aligned_reads_len1.sh rename to tests/test_aligned_reads_len1.sh index 33489f6..c011ee0 100755 --- a/tests/integration_tests/test_aligned_reads_len1.sh +++ b/tests/test_aligned_reads_len1.sh @@ -6,7 +6,7 @@ set -o pipefail for strat in "majority" "stochastic"; do - ../../ococo \ + ococo \ -m batch \ -i data/alignment_A_2.sam \ -f data/fasta_NN.fa \ diff --git a/tests/integration_tests/test_aligned_reads_len2.sh b/tests/test_aligned_reads_len2.sh similarity index 95% rename from tests/integration_tests/test_aligned_reads_len2.sh rename to tests/test_aligned_reads_len2.sh index 867d8e7..1c1156b 100755 --- a/tests/integration_tests/test_aligned_reads_len2.sh +++ b/tests/test_aligned_reads_len2.sh @@ -6,7 +6,7 @@ set -o pipefail for strat in "majority" "stochastic"; do - ../../ococo \ + ococo \ -m batch \ -i data/alignment_AA_1.sam \ -f data/fasta_NN.fa \ diff --git a/tests/integration_tests/test_no_parameters.sh b/tests/test_no_parameters.sh similarity index 80% rename from tests/integration_tests/test_no_parameters.sh rename to tests/test_no_parameters.sh index 63b33dc..a981415 100755 --- a/tests/integration_tests/test_no_parameters.sh +++ b/tests/test_no_parameters.sh @@ -3,4 +3,4 
@@ set -eux set -o pipefail -../../ococo +ococo diff --git a/tests/integration_tests/test_stats_import_export.sh b/tests/test_stats_import_export.sh similarity index 94% rename from tests/integration_tests/test_stats_import_export.sh rename to tests/test_stats_import_export.sh index 77cbab0..22438a6 100755 --- a/tests/integration_tests/test_stats_import_export.sh +++ b/tests/test_stats_import_export.sh @@ -6,7 +6,7 @@ set -o xtrace rm -f output/stats.ococo -../../ococo \ +ococo \ -m batch \ -i data/alignment_C_2.sam \ -f data/fasta_NN.fa \ @@ -21,7 +21,7 @@ echo echo "===============================" echo -../../ococo \ +ococo \ -m batch \ -i data/alignment_A_2.sam \ -F output/fasta_NA.fa \ @@ -34,7 +34,7 @@ echo echo "===============================" echo -../../ococo \ +ococo \ -m batch \ -i data/alignment_A_2.sam \ -F output/fasta_NA.fa \ diff --git a/tests/integration_tests/test_unaligned_reads.sh b/tests/test_unaligned_reads.sh similarity index 95% rename from tests/integration_tests/test_unaligned_reads.sh rename to tests/test_unaligned_reads.sh index 85c84c4..647b4b5 100755 --- a/tests/integration_tests/test_unaligned_reads.sh +++ b/tests/test_unaligned_reads.sh @@ -6,7 +6,7 @@ set -o pipefail for strat in "majority" "stochastic"; do - ../../ococo \ + ococo \ -m batch \ -i data/alignment_AA_unm.sam \ -f data/fasta_NN.fa \ diff --git a/tests/integration_tests/integration_tests.bats b/tests/tests.bats similarity index 96% rename from tests/integration_tests/integration_tests.bats rename to tests/tests.bats index 8cc9095..accd858 100755 --- a/tests/integration_tests/integration_tests.bats +++ b/tests/tests.bats @@ -1,6 +1,6 @@ #!/usr/bin/env bats -export PATH=$PATH:../.. +export PATH=$PATH:.. 
@test "Test of ococo without parameters" { diff --git a/tests/unit_tests/.gitignore b/tests/unit_tests/.gitignore deleted file mode 100644 index 34be296..0000000 --- a/tests/unit_tests/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -unittests - diff --git a/tests/unit_tests/CMakeLists.txt b/tests/unit_tests/CMakeLists.txt deleted file mode 100644 index f476bec..0000000 --- a/tests/unit_tests/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -include_directories(../..) - -if (BUILD_TESTS) - - list(APPEND CMAKE_CXX_FLAGS "-Wall -std=c++11") - - add_executable(unittests unittests.cpp) - - find_package (Threads REQUIRED) - - enable_testing() - find_package(GTest REQUIRED) - include_directories(${GTEST_INCLUDE_DIRS}) - - target_link_libraries(unittests ${GTEST_BOTH_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${htslib_LIB} ${zlib_LIB} ococo_core) - -endif (BUILD_TESTS) diff --git a/tests/unit_tests/unittests.cpp b/tests/unit_tests/unittests.cpp deleted file mode 100644 index d9ceff9..0000000 --- a/tests/unit_tests/unittests.cpp +++ /dev/null @@ -1,107 +0,0 @@ -#include "ococo.h" -#include "gtest/gtest.h" - -#include - -using namespace ococo; -using namespace std; - -vector nucls { - nt256_nt16[(int)'A'], - nt256_nt16[(int)'C'], - nt256_nt16[(int)'G'], - nt256_nt16[(int)'T'] -}; - -namespace { - - class BitFunctionsTest : public ::testing::Test { - protected: - BitFunctionsTest() { - } - - virtual ~BitFunctionsTest() { - } - - virtual void SetUp() { - } - - virtual void TearDown() { - } - }; - - - class ConsensusTest : public ::testing::Test { - protected: - ConsensusTest() { - } - - virtual ~ConsensusTest() { - } - - virtual void SetUp() { - } - - virtual void TearDown() { - } - }; - - TEST_F(BitFunctionsTest, Basic) { - ASSERT_EQ( 0x00, (ococo::right_full_mask()) ); - ASSERT_EQ( 0x01, (ococo::right_full_mask()) ); - ASSERT_EQ( 0xff, (ococo::right_full_mask()) ); - ASSERT_EQ( 0xffff, (ococo::right_full_mask()) ); - - } - - TEST_F(ConsensusTest, EmptyStats) { - char nucl; - - params_t 
params=params_t(); - for(uint32_t i=0;i<4;i++){ - { - ococo::pos_stats_uncompr_t psu = {nt256_nt16[(int)'C'],{0,0,0,0},0}; - nucl=(params.cons_alg[i])(psu, params); - ASSERT_EQ('C',nucl); - } - - { - ococo::pos_stats_uncompr_t psu = {nt256_nt16[(int)'N'],{0,0,0,0},0}; - nucl=(params.cons_alg[i])(psu, params); - ASSERT_EQ('N',nucl); - } - } - - } - - TEST_F(ConsensusTest, Majority) { - char nucl; - - params_t params=params_t(); - params.majority_threshold=0.6; - - { - ococo::pos_stats_uncompr_t psu = {nt256_nt16[(int)'T'],{6,0,0,3},9}; - nucl=cons_call_maj(psu, params); - ASSERT_EQ('A',nucl); - } - - { - ococo::pos_stats_uncompr_t psu = {nt256_nt16[(int)'T'],{0,0,6,4},10}; - nucl=cons_call_maj(psu, params); - ASSERT_EQ('G',nucl); - } - - } - - -} // namespace - - - -int main(int argc, char** argv){ - cout << endl << "DEBUG INFO" << endl; - - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} From 425fe134034e1ca0ea69e3582227a5d42498f4c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 18:54:43 +0200 Subject: [PATCH 16/32] Remove links to gtests --- CMakeLists.txt | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2885afc..22212e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,9 +23,6 @@ endif(CMAKE_BUILD_TYPE MATCHES RELEASE) option(INSTALL_DEBUG_SCRIPTS "Install debugging scripts." OFF) set(DEBUGGING_SEVERITY "trace" CACHE STRING "Verbosity of debugging mode.") -option(BUILD_TESTS "Build tests." 
OFF) - - ########## # HTSLIB # ########## @@ -68,7 +65,6 @@ include_directories(${htslib_INSTALL}/include) ################ include_directories( "src" ) -add_subdirectory(tests/unit_tests) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -Wno-missing-field-initializers -Wshadow -g ") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native -DNDEBUG") @@ -100,11 +96,3 @@ install(TARGETS ococo DESTINATION bin) if(INSTALL_DEBUG_SCRIPTS) install(PROGRAMS scripts/ococo_test_bam.sh DESTINATION bin) endif(INSTALL_DEBUG_SCRIPTS) - - -######### -# TESTS # -######### - -add_test(NAME unit_tests COMMAND unittests) -#add_test(NAME integration_tests COMMAND "cd tests/integration_test/ && ./integration_tests.bats") From 203f89fa52f5c2841f4e1d1e4882eec579e94ca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 19:01:15 +0200 Subject: [PATCH 17/32] Link pthread (HTSlib requires it) --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 22212e1..09ce256 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,8 @@ else() endif(ZLIB_FOUND) include_directories(${htslib_INSTALL}/include) +find_package (Threads) + ################ # MAIN PROGRAM # @@ -84,7 +86,7 @@ add_library(ococo_core src/version.h ) target_link_libraries(ococo_core ${CMAKE_BINARY_DIR}/ext/htslib/libhts.a ${ZLIB_LIBRARIES}) -target_link_libraries(ococo ${ZLIB_LIBRARIES} ococo_core) +target_link_libraries(ococo ${ZLIB_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ococo_core) ################ From 8ed91f1fd182edac74038af334a34bf8a106d6ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 21:41:11 +0200 Subject: [PATCH 18/32] Improve help message --- src/misc.cpp | 11 +++++++--- src/params.cpp | 57 ++++++++++++++++++++++++++++---------------------- src/params.h | 6 ++++++ 3 files changed, 46 insertions(+), 28 deletions(-) diff --git a/src/misc.cpp 
b/src/misc.cpp index 03864a1..8a4f800 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -1,9 +1,14 @@ #include "misc.h" void ococo::print_version() { - std::cout << "Program: Ococo (online consensus caller, " - << "http://github.com/karel-brinda/ococo)." << std::endl; - std::cout << "Version: " << OCOCO_VERSION << std::endl; + // clang-format off + std::cerr << + "\n" + "Program: ococo (Online consensus caller, call cons. from unsorted SAM/BAM stream)\n" + "Version: " << OCOCO_VERSION << "\n" + "Contact: Karel Brinda \n"; + // clang-format on + std::cerr << std::endl; } void ococo::fatal_error(const char *format, ...) { diff --git a/src/params.cpp b/src/params.cpp index 60b9af9..fcae43f 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -15,11 +15,11 @@ void ococo::params_t::init_default_values() { mode_str = "batch"; strategy = MAJORITY; strategy_str = "majority"; - min_mapq = 1; - min_baseq = 13; - init_ref_weight = 0; - min_coverage = 2; - majority_threshold = 0.60; + min_mapq = default_q; + min_baseq = default_Q; + init_ref_weight = default_w; + min_coverage = default_c; + majority_threshold = default_M; cons_alg[strategy_t::NO_UPDATES] = &cons_call_no_updates; cons_alg[strategy_t::STOCHASTIC] = &cons_call_stoch; @@ -90,53 +90,60 @@ ococo::params_t::~params_t() { } void ococo::params_t::print_help() { + print_version(); + std::cerr << - // clang-format off + // "---------------------------------------------------------------------------------" - "Generic options:\n" - " -v, --version print version and exit\n" - " -h, --help print this message and exit\n\n" + "Usage: ococo -i [other options]\n\n" + // "---------------------------------------------------------------------------------" + //"Generic options:\n" + //" -h, --help print this message and exit\n\n" // "---------------------------------------------------------------------------------" "Input options:\n" " -i, --input FILE input SAM/BAM file (- for standard input)\n" - " -f, --fasta-ref FILE initial FASTA 
reference (otherwise sequence of N's \n" - " considered as the reference)\n" - " -s, --stats-in arg input statistics.\n\n" + " -f, --fasta-ref FILE initial FASTA reference (otherwise seq of N's is used)\n" + " -s, --stats-in FILE input statistics\n\n" // "---------------------------------------------------------------------------------" "Output options:\n" " -F, --fasta-cons FILE FASTA file with consensus\n" - " -S, --stats-out FILE outputs statistics\n" + " -S, --stats-out FILE output statistics\n" " -V, --vcf-cons FILE VCF file with updates of consensus (- for standard output)\n" " -P, --pileup FILE truncated pileup (- for standard output)\n" - " --log FILE auxiliary log file\n" - " --verbose verbose mode (report every counter update)\n\n" + //" --log FILE auxiliary log file\n" + " --verbose verbose mode (report every update of a counter)\n\n" // "---------------------------------------------------------------------------------" - "Parameters of consensus calling:\n" - " -x, --counters STR counters configuration: [ococo16]\n" + "Parameters for consensus calling:\n" + " -x, --counters STR counter configuration: [ococo16]\n" " - ococo16 (3b/counter, 16b/position)\n" " - ococo32 (7b/counter, 32b/position)\n" " - ococo64 (15b/counter, 64b/position)\n" - " -m, --mode STR mode: real-time / batch [batch]\n" + " -m, --mode STR mode: [batch]\n" + " - real-time (updates reported immediately)\n" + " - batch (updates reported after end of algn stream)\n" " -t, --strategy STR strategy for updates: [majority]\n" " - majority (update to majority base)\n" " - stochastic (update to stochastically chosen base)\n" - " - no-updates (useful when only pileup is needed)\n" + " - no-updates (no updates, only counters updated)\n" //" -a [ --allow-amb ] Allow updates to ambiguous " //"nucleotides.\n" - " -q, --min-MQ INT skip alignments with mapping quality smaller than INT [1]\n" - " -Q, --min-BQ INT skip bases with base quality smaller than INT [13]\n" - " -w, --ref-weight INT initial 
counter value for nucleotides from ref [0]\n" - " -c, --min-cov INT minimum coverage required for update [2]\n" - " -M, --maj-thres FLOAT majority threshold [0.6]" + " -q, --min-MQ INT skip alignments with mapping quality smaller than INT [" << default_q << "]\n" + " -Q, --min-BQ INT skip bases with base quality smaller than INT [" << default_Q <<"]\n" + " -w, --ref-weight INT initial counter value for nucleotides from ref ["<< default_w <<"]\n" + " -c, --min-cov INT minimum coverage required for update [" << default_c <<"]\n" + " -M, --maj-thres FLOAT majority threshold [" << default_M << "]\n\n" + // "---------------------------------------------------------------------------------" + "Examples:\n" + " ococo -i test.bam -f test.fa -m real-time -V -\n" + " ococo -x ococo64 -i test.bam -f test.fa -V - -P pileup.txt\n" // "---------------------------------------------------------------------------------" // clang-format on << std::endl; } void ococo::params_t::parse_commandline(int argc, const char **argv) { - /* Parse cmd parameters */ const struct option lopts[] = { diff --git a/src/params.h b/src/params.h index 0572c38..4495c34 100644 --- a/src/params.h +++ b/src/params.h @@ -20,6 +20,12 @@ namespace ococo { +const int default_c = 2; +const float default_M = 0.5; +const int default_w = 0; +const int default_q = 1; +const int default_Q = 13; + enum mode_t { BATCH, REALTIME }; enum strategy_t { From 8f2ae64da24b00fd4570ae78c0d9b9fd090cf2f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 22:41:31 +0200 Subject: [PATCH 19/32] Set majority to 0.51 --- src/params.cpp | 2 +- src/params.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/params.cpp b/src/params.cpp index fcae43f..42845c4 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -159,7 +159,7 @@ void ococo::params_t::parse_commandline(int argc, const char **argv) { {"vcf-cons", required_argument, nullptr, 'V'}, {"pileup", required_argument, nullptr, 
'P'}, {"log", required_argument, nullptr, 'L'}, - {"verbose", required_argument, nullptr, 'W'}, + {"verbose", no_argument, nullptr, 'W'}, // {"counters", required_argument, nullptr, 'x'}, {"mode", required_argument, nullptr, 'm'}, diff --git a/src/params.h b/src/params.h index 4495c34..56d6256 100644 --- a/src/params.h +++ b/src/params.h @@ -21,7 +21,7 @@ namespace ococo { const int default_c = 2; -const float default_M = 0.5; +const float default_M = 0.51; const int default_w = 0; const int default_q = 1; const int default_Q = 13; From 3907cbe0665c26049fee8a0c18c10c25eec5967f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 22:51:06 +0200 Subject: [PATCH 20/32] Update README --- README.md | 66 +++++++++++++++++++++++--------------------------- src/params.cpp | 4 +-- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 0d38049..4f02276 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,6 @@ Welcome to OCOCO, an online consensus caller. * GCC 4.8+ or equivalent * CMake (http://cmake.org/) -* Boost 1.46+ (http://www.boost.org/) * ZLib ## Getting started @@ -25,46 +24,41 @@ cd ococo && cmake . && make ## Command line parameters ``` -Generic options: - -v [ --version ] Print version and exit. - -h [ --help ] Print this message and exit. +Usage: ococo -i [other options] Input options: - -i [ --input ] arg Input SAM/BAM file (- for standard - input). - -f [ --fasta-ref ] arg Initial FASTA reference (if not - provided, sequence of N's is considered - as the reference). - -s [ --stats-in ] arg Input statistics. + -i, --input FILE input SAM/BAM file (- for standard input) + -f, --fasta-ref FILE initial FASTA reference (otherwise seq of N's is used) + -s, --stats-in FILE input statistics Output options: - -F [ --fasta-cons ] arg FASTA file with consensus. - -S [ --stats-out ] arg Outputs statistics. - -V [ --vcf-cons ] arg VCF file with updates of consensus (- - for standard output). 
- -P [ --pileup ] arg Truncated pileup (- for standard - output). - --log arg Auxiliary log file. - --verbose Verbose mode. + -F, --fasta-cons FILE FASTA file with consensus + -S, --stats-out FILE output statistics + -V, --vcf-cons FILE VCF file with updates of consensus (- for standard output) + -P, --pileup FILE truncated pileup (- for standard output) + --verbose verbose mode (report every update of a counter) -Parameters of consensus calling: - -x [ --counters ] arg (=ococo16) Counters configuration: - - ococo16 (3b/counter, 16b/position) - - ococo32 (7b/counter, 32b/position) - - ococo64 (15b/counter, 64b/position) - -m [ --mode ] arg (=batch) Mode: real-time / batch. - -t [ --strategy ] arg (=majority) Strategy for updates: no-updates / - majority / stochastic. - -a [ --allow-amb ] Allow updates to ambiguous nucleotides. - -q [ --min-MQ ] arg (=1) Skip alignments with mapping quality - smaller than INT. - -Q [ --min-BQ ] arg (=13) Skip bases with base quality smaller - than INT. - -w [ --ref-weight ] arg (=0) Initial counter value for nucleotides - from the reference. - -c [ --min-coverage ] arg (=2) Minimum coverage required for update. - -M [ --majority-threshold ] arg (=0.59999999999999998) - Majority threshold. 
+Parameters for consensus calling: + -x, --counters STR counter configuration: [ococo16] + - ococo16 (3b/counter, 16b/position) + - ococo32 (7b/counter, 32b/position) + - ococo64 (15b/counter, 64b/position) + -m, --mode STR mode: [batch] + - real-time (updates reported immediately) + - batch (updates reported after end of algn stream) + -t, --strategy STR strategy for updates: [majority] + - majority (update to majority base) + - stochastic (update to stochastically drawn base) + - no-updates (no updates, only counters updated) + -q, --min-MQ INT skip alignments with mapping quality smaller than INT [1] + -Q, --min-BQ INT skip bases with base quality smaller than INT [13] + -w, --ref-weight INT initial counter value for nucleotides from ref [0] + -c, --min-cov INT minimum coverage required for update [2] + -M, --maj-thres FLOAT majority threshold [0.51] + +Examples: + ococo -i test.bam -f test.fa -m real-time -V - + ococo -x ococo64 -i test.bam -f test.fa -P - -V variants.vcf ``` ## Citing OCOCO diff --git a/src/params.cpp b/src/params.cpp index 42845c4..283be0e 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -125,7 +125,7 @@ void ococo::params_t::print_help() { " - batch (updates reported after end of algn stream)\n" " -t, --strategy STR strategy for updates: [majority]\n" " - majority (update to majority base)\n" - " - stochastic (update to stochastically chosen base)\n" + " - stochastic (update to stochastically drawn base)\n" " - no-updates (no updates, only counters updated)\n" //" -a [ --allow-amb ] Allow updates to ambiguous " //"nucleotides.\n" @@ -137,7 +137,7 @@ void ococo::params_t::print_help() { // "---------------------------------------------------------------------------------" "Examples:\n" " ococo -i test.bam -f test.fa -m real-time -V -\n" - " ococo -x ococo64 -i test.bam -f test.fa -V - -P pileup.txt\n" + " ococo -x ococo64 -i test.bam -f test.fa -P - -V variants.vcf\n" // 
"---------------------------------------------------------------------------------" // clang-format on << std::endl; From 0378549e1e5c4a2380af55eb4c36f6c365134fad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 23:15:08 +0200 Subject: [PATCH 21/32] Report times --- src/caller.h | 3 +++ src/misc.cpp | 16 ++++++++++++++++ src/misc.h | 6 ++++++ 3 files changed, 25 insertions(+) diff --git a/src/caller.h b/src/caller.h index f4680af..a55cea8 100644 --- a/src/caller.h +++ b/src/caller.h @@ -19,6 +19,7 @@ struct caller_t { stats_t *stats; params_t *params; + double t_real; caller_t(params_t *params_); ~caller_t(); @@ -34,6 +35,7 @@ caller_t::caller_t(params_t *params_) * Read SAM headers. */ + t_real = realtime(); ococo::info("Initialing SAM/BAM reader.\n"); correctly_initialized = true; @@ -375,6 +377,7 @@ caller_t::~caller_t() { if (return_code == EXIT_SUCCESS && correctly_initialized == true) { ococo::info("Ococo successfully finished. Bye.\n"); + ococo::info("%.3f sec; CPU: %.3f sec\n", realtime() - t_real, cputime()); } } } diff --git a/src/misc.cpp b/src/misc.cpp index 8a4f800..eff6465 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -53,3 +53,19 @@ bool ococo::file_exists(const std::string &fn) { } return false; } + +double ococo::realtime() +{ + struct timeval tp; + //struct timezone tzp; + //gettimeofday(&tp, &tzp); + gettimeofday(&tp, nullptr); + return tp.tv_sec + tp.tv_usec * 1e-6; +} + +double ococo::cputime() +{ + struct rusage r; + getrusage(RUSAGE_SELF, &r); + return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec); +} diff --git a/src/misc.h b/src/misc.h index 5624ddd..9e27110 100644 --- a/src/misc.h +++ b/src/misc.h @@ -5,6 +5,7 @@ #include #include #include +#include #include "version.h" @@ -22,6 +23,11 @@ void info(const char *format, ...); bool file_exists(const std::string &fn); +double realtime(); + +double cputime(); + + /* * Get a right full mask (right n bits set to 1) 
* From cc025724d20f3769d3280b3c2b8f44b31bb13480 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 23:26:54 +0200 Subject: [PATCH 22/32] Fix required '-i' --- src/params.cpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/params.cpp b/src/params.cpp index 283be0e..a5b82c0 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -146,6 +146,11 @@ void ococo::params_t::print_help() { void ococo::params_t::parse_commandline(int argc, const char **argv) { /* Parse cmd parameters */ + if (argc == 1) { + print_help(); + exit(1); + } + const struct option lopts[] = { {"version", no_argument, nullptr, 'v'}, {"help", no_argument, nullptr, 'h'}, @@ -249,9 +254,7 @@ void ococo::params_t::parse_commandline(int argc, const char **argv) { "Unknown counter configuration '%s'. Possible modes " "are 'ococo16', 'ococo32', and 'ococo64'.\n", counters_str.c_str()); - correctly_initialized = false; - return_code = -1; - return; + exit(1); } break; @@ -268,9 +271,7 @@ void ococo::params_t::parse_commandline(int argc, const char **argv) { "Unknown mode '%s'. Possible modes are 'batch' and " "'real-time'.\n", mode_str.c_str()); - correctly_initialized = false; - return_code = -1; - return; + exit(1); } break; @@ -287,11 +288,9 @@ void ococo::params_t::parse_commandline(int argc, const char **argv) { } else { ococo::error( "Unknown strategy '%s'. 
Possible strategies are " - "'majority', 'stochastic' and 'no-updates'.\n", + "'majority', 'stochastic', and 'no-updates'.\n", strategy_str.c_str()); - correctly_initialized = false; - return_code = -1; - return; + exit(1); } break; @@ -318,11 +317,15 @@ void ococo::params_t::parse_commandline(int argc, const char **argv) { break; } case '?': { - ococo::error("probably unknown option"); + ococo::error("Unknown error"); exit(1); break; } } } + if (sam_fn.size()==0){ + ococo::error("SAM/BAM file must be specified (option '-i').\n"); + exit(1); + } ococo::info("Ococo starting: %s\n", counters_str_descr.c_str()); } From 27b7597b2e393784b8930ec7eb96f0a656a9d0f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 23:34:30 +0200 Subject: [PATCH 23/32] Fix command in VCF --- src/params.cpp | 9 +++++++++ src/params.h | 1 + 2 files changed, 10 insertions(+) diff --git a/src/params.cpp b/src/params.cpp index a5b82c0..764dec0 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -145,6 +145,15 @@ void ococo::params_t::print_help() { void ococo::params_t::parse_commandline(int argc, const char **argv) { /* Parse cmd parameters */ + std::stringstream cmd; + for (int32_t i = 0; i < argc; i++) { + cmd << argv[i]; + if (i != argc - 1) { + cmd << " "; + } + } + command=cmd.str(); + if (argc == 1) { print_help(); diff --git a/src/params.h b/src/params.h index 56d6256..88d55b1 100644 --- a/src/params.h +++ b/src/params.h @@ -5,6 +5,7 @@ #include #include +#include #include #include From 9acea1ba3e651c2cca9746f0e04b4a5d9c4ba4fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 23:42:19 +0200 Subject: [PATCH 24/32] Fix bug with equal files @closes #10 --- src/params.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/params.cpp b/src/params.cpp index 764dec0..4e50f00 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -336,5 +336,11 @@ void ococo::params_t::parse_commandline(int argc, const char **argv) { 
ococo::error("SAM/BAM file must be specified (option '-i').\n"); exit(1); } + + if(pileup_fn.size()!=0 && pileup_fn.compare(vcf_fn)==0){ + ococo::error("Pileup and VCF files cannot be the same (both currently '%s').\n", pileup_fn.c_str()); + exit(1); + } + ococo::info("Ococo starting: %s\n", counters_str_descr.c_str()); } From 60a889ab1c651e71fc156c5c4afb17bd05abf55d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 23:57:45 +0200 Subject: [PATCH 25/32] Fix missing header file for Linux --- src/misc.cpp | 3 +++ src/misc.h | 1 + 2 files changed, 4 insertions(+) diff --git a/src/misc.cpp b/src/misc.cpp index eff6465..22f44b5 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -67,5 +67,8 @@ double ococo::cputime() { struct rusage r; getrusage(RUSAGE_SELF, &r); + + //todo: check also memory + //std::cerr << r.ru_maxrss << std::endl; return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec); } diff --git a/src/misc.h b/src/misc.h index 9e27110..e94d049 100644 --- a/src/misc.h +++ b/src/misc.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "version.h" From d713e87bbe31ee5460b4bd4cfb9f5cc0607ce304 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sat, 27 Aug 2016 23:59:47 +0200 Subject: [PATCH 26/32] Bold title --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4f02276..17a9839 100644 --- a/README.md +++ b/README.md @@ -63,4 +63,4 @@ Examples: ## Citing OCOCO -* K. Brinda, V. Boeva, G. Kucherov. Dynamic read mapping and online consensus calling for better variant detection. arXiv:1605.09070v1 [q-bio.GN], 2016. http://arxiv.org/abs/1605.09070 +* K. Brinda, V. Boeva, G. Kucherov. **Dynamic read mapping and online consensus calling for better variant detection.** arXiv:1605.09070v1 [q-bio.GN], 2016. 
http://arxiv.org/abs/1605.09070 From 9c662797391654b6c719499600f441e07f2d6c32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 28 Aug 2016 11:11:16 +0200 Subject: [PATCH 27/32] Don't use CMake --- .gitignore | 7 ---- .travis.yml | 12 ++---- CMakeLists.txt | 100 ------------------------------------------------- Makefile | 24 ++++++++++++ src/Makefile | 9 +++++ 5 files changed, 36 insertions(+), 116 deletions(-) delete mode 100644 CMakeLists.txt create mode 100644 Makefile create mode 100644 src/Makefile diff --git a/.gitignore b/.gitignore index 88a8686..e454805 100644 --- a/.gitignore +++ b/.gitignore @@ -9,13 +9,6 @@ ococo CTestTestfile.cmake Testing -CMakeCache.txt -CMakeFiles -CMakeScripts -Makefile -cmake_install.cmake -install_manifest.txt - # Compiled Object files *.slo *.lo diff --git a/.travis.yml b/.travis.yml index 21e4349..ba9fbbd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,19 +8,15 @@ matrix: - os: linux compiler: gcc - addons: - apt: - sources: ['ubuntu-toolchain-r-test', 'george-edison55-precise-backports'] - packages: ['cmake', 'cmake-data'] install: - sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y - sudo add-apt-repository ppa:duggan/bats -y - sudo apt-get update - - sudo apt-get install g++-4.8 cmake zlib1g-dev bats + - sudo apt-get install g++-4.8 zlib1g-dev bats - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90 - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 90 script: - - cmake . && make && ./run.sh + - make -j && ./run.sh - (cd tests && bats ${CI:+--tap} tests.bats) - os: osx @@ -30,11 +26,9 @@ matrix: - sudo brew tap homebrew/versions - sudo brew install llvm37 - sudo brew install bats - - sudo brew unlink cmake && sudo brew install cmake31 - #- sudo pip install --upgrade jinja2 nose install: - export CXX="clang++-3.7" CC="clang-3.7" script: - - cmake . 
&& make && ./run.sh + - make -j && ./run.sh - (cd tests && bats ${CI:+--tap} tests.bats) diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100644 index 09ce256..0000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,100 +0,0 @@ -cmake_minimum_required (VERSION 2.6) -project (ococo C CXX) - - -############################## -# COMPILATION MODE SWITCHERS # -############################## - -if(NOT CMAKE_BUILD_TYPE) - message(STATUS "Setting build type to 'RELEASE' as none was specified.") - set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "Choose the type of build." FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "DEBUG" "RELEASE") -endif() - -if(CMAKE_BUILD_TYPE MATCHES DEBUG) - message("DEBUG mode") -endif(CMAKE_BUILD_TYPE MATCHES DEBUG) - -if(CMAKE_BUILD_TYPE MATCHES RELEASE) - message("RELEASE mode") -endif(CMAKE_BUILD_TYPE MATCHES RELEASE) - -option(INSTALL_DEBUG_SCRIPTS "Install debugging scripts." OFF) -set(DEBUGGING_SEVERITY "trace" CACHE STRING "Verbosity of debugging mode.") - -########## -# HTSLIB # -########## - -if (CMAKE_GENERATOR STREQUAL "Unix Makefiles") - set(MAKE_COMMAND "$(MAKE)") -else() - find_program(MAKE_COMMAND NAMES make gmake) -endif() - -include(ExternalProject) -ExternalProject_Add(htslib - PREFIX ${CMAKE_BINARY_DIR}/ext/htslib.tmp - SOURCE_DIR "${CMAKE_BINARY_DIR}/ext/htslib" - BUILD_IN_SOURCE 1 - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND ${MAKE_COMMAND} lib-static - INSTALL_COMMAND "" - ) - -include_directories(${CMAKE_BINARY_DIR}/ext/htslib) - - -######## -# ZLIB # -######## - -find_package(ZLIB REQUIRED) -if (ZLIB_FOUND) - include_directories(${ZLIB_INCLUDE_DIRS}) -else() - message (FATAL_ERROR "zlib not found.") -endif(ZLIB_FOUND) - -include_directories(${htslib_INSTALL}/include) -find_package (Threads) - - -################ -# MAIN PROGRAM # -################ - -include_directories( "src" ) - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -Wno-missing-field-initializers -Wshadow -g ") 
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native -DNDEBUG") -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -pedantic") - -add_executable(ococo src/main.cpp) -add_dependencies(ococo htslib ococo_core) -add_library(ococo_core - src/caller.h - src/params.cpp - src/params.h - src/misc.cpp - src/misc.h - src/ococo.h - src/types.h - src/stats.h - src/version.h - ) -target_link_libraries(ococo_core ${CMAKE_BINARY_DIR}/ext/htslib/libhts.a ${ZLIB_LIBRARIES}) -target_link_libraries(ococo ${ZLIB_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ococo_core) - - -################ -# INSTALLATION # -################ - -install(TARGETS ococo DESTINATION bin) - -if(INSTALL_DEBUG_SCRIPTS) - install(PROGRAMS scripts/ococo_test_bam.sh DESTINATION bin) -endif(INSTALL_DEBUG_SCRIPTS) diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a34e7b3 --- /dev/null +++ b/Makefile @@ -0,0 +1,24 @@ +CXX ?= g++ +CXXFLAGS = -std=c++11 -Wall -Wextra -Wno-missing-field-initializers -Wshadow -g -O2 +LIBS = -lm -lz -lpthread + +export CXX +export CXXFLAGS + +.PHONY: all clean + +all: ococo + +ococo: + $(MAKE) -C ./ext/htslib lib-static + $(MAKE) -C ./src + + $(CXX) $(CXXFLAGS) $(DFLAGS) ./src/*.o -o $@ -L. 
$(LIBS) ./ext/htslib/libhts.a + +ext/htslib/libhts.a: + $(MAKE) -C ext/htslib lib-static + +clean: + $(MAKE) -C ext/htslib clean + $(MAKE) -C src clean + rm ococo diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..e3766d3 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,9 @@ +.PHONY: all clean + +all: misc.o main.o params.o + +%.o: %.cpp + $(CXX) $(CXXFLAGS) $(DFLAGS) -c $< + +clean: + rm -f *.o From 21b399c9c0e0b3a18f1fa837ef97f39abadb3dd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 28 Aug 2016 11:17:30 +0200 Subject: [PATCH 28/32] Fix htslib including --- src/htslib | 1 + 1 file changed, 1 insertion(+) create mode 120000 src/htslib diff --git a/src/htslib b/src/htslib new file mode 120000 index 0000000..96c9c02 --- /dev/null +++ b/src/htslib @@ -0,0 +1 @@ +../ext/htslib/htslib \ No newline at end of file From f138ef01f8741455038cb48fafb0357aef987edf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 28 Aug 2016 11:28:41 +0200 Subject: [PATCH 29/32] Fix included dirs --- src/Makefile | 2 +- src/htslib | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 120000 src/htslib diff --git a/src/Makefile b/src/Makefile index e3766d3..ccfc525 100644 --- a/src/Makefile +++ b/src/Makefile @@ -3,7 +3,7 @@ all: misc.o main.o params.o %.o: %.cpp - $(CXX) $(CXXFLAGS) $(DFLAGS) -c $< + $(CXX) $(CXXFLAGS) $(DFLAGS) -c $< -I ../ext/htslib/ clean: rm -f *.o diff --git a/src/htslib b/src/htslib deleted file mode 120000 index 96c9c02..0000000 --- a/src/htslib +++ /dev/null @@ -1 +0,0 @@ -../ext/htslib/htslib \ No newline at end of file From 646f7d5b6fe723b6731cd4d01168f4f9f6a194c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 28 Aug 2016 12:38:11 +0200 Subject: [PATCH 30/32] Add manpage --- ococo.1 | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 ococo.1 diff --git a/ococo.1 b/ococo.1 new file mode 100644 
index 0000000..a69dc8b --- /dev/null +++ b/ococo.1 @@ -0,0 +1,105 @@ +.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.47.4. +.TH OCOCO "1" "August 2016" "ococo " "User Commands" +.SH NAME +ococo \- Online consensus caller +.SH SYNOPSIS +ococo -i [other options] +.SH DESCRIPTION +Ococo is a program to call genomic consensus directly from an unsorted +SAM/BAM stream. +.SS "Input options:" +.TP +\fB\-i\fR, \fB\-\-input\fR FILE +Input SAM/BAM file (\- for standard input). +.TP +\fB\-f\fR, \fB\-\-fasta\-ref\fR FILE +Initial FASTA reference (otherwise seq of N's is used). +.TP +\fB\-s\fR, \fB\-\-stats\-in\fR FILE +Input statistics. +.SS "Output options:" +.HP +\fB\-F\fR, \fB\-\-fasta\-cons\fR FILE FASTA file with consensus +.TP +\fB\-S\fR, \fB\-\-stats\-out\fR FILE +Output statistics. +.TP +\fB\-V\fR, \fB\-\-vcf\-cons\fR FILE +VCF file with updates of consensus (\- for standard output) +.TP +\fB\-P\fR, \fB\-\-pileup\fR FILE +Truncated pileup (\- for standard output). +.TP +\fB\-\-verbose\fR +Verbose mode (report every update of a counter). +.SS "Parameters for consensus calling:" +.TP +\fB\-x\fR, \fB\-\-counters\fR STR +Counter configuration [ococo16]. + +.TS +l l l . +.B +configuration bits/counter bits/position +ococo16 3 16 +ococo32 7 32 +ococo64 15 64 +.TE + +.TP +\fB\-m\fR, \fB\-\-mode\fR STR +Mode [batch]. + +.TS +l l . +.B +mode description +real\-time updates reported immediately +batch updates reported after end of algn stream +.TE + +.TP +\fB\-t\fR, \fB\-\-strategy\fR STR +Strategy for updates [majority]. + +.TS +l l l . +.B +strategy description +majority update to majority base +stochastic update to stochastically drawn base +no-updates no updates, only counters updated +.TE + +.TP +\fB\-q\fR, \fB\-\-min\-MQ\fR INT +Skip alignments with mapping quality smaller than INT [1]. +.TP +\fB\-Q\fR, \fB\-\-min\-BQ\fR INT +Skip bases with base quality smaller than INT [13]. 
+.TP +\fB\-w\fR, \fB\-\-ref\-weight\fR INT +Initial counter value for nucleotides from ref [0]. +.TP +\fB\-c\fR, \fB\-\-min\-cov\fR INT +Minimum coverage required for update [2]. +.TP +\fB\-M\fR, \fB\-\-maj\-thres\fR FLOAT +Majority threshold [0.51]. +.SH AUTHOR +.IP +Written by Karel Brinda (karel.brinda@gmail.com) at LIGM Universite Paris-Est Marne-la-Vallee, France +.SH REPORTING BUGS +Report bugs on https://github.com/karel-brinda/ococo/issues +.SH EXAMPLES +.IP +ococo \-i test.bam \-f test.fa \-m real\-time \-V \- + +ococo \-x ococo64 \-i test.bam \-f test.fa \-P \- \-V variants.vcf +.SH LICENSE AND CITATION +Ococo is distributed under the MIT license. If you use the program, +please cite the following paper: + +[1] K. Brinda, V. Boeva, G. Kucherov. +.B Dynamic read mapping and online consensus calling for better variant detection. +arXiv:1605.09070 [q-bio.GN]. From 8aafa8ce0c27fe928b071273ecbe95d1538e8882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 28 Aug 2016 12:58:06 +0200 Subject: [PATCH 31/32] Fix installation using Make --- Makefile | 11 ++++++++++- ococo.1 | 4 +--- src/Makefile | 8 +++++++- src/params.cpp | 5 ++++- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index a34e7b3..9755156 100644 --- a/Makefile +++ b/Makefile @@ -2,13 +2,22 @@ CXX ?= g++ CXXFLAGS = -std=c++11 -Wall -Wextra -Wno-missing-field-initializers -Wshadow -g -O2 LIBS = -lm -lz -lpthread +PREFIX = $(DESTDIR)/usr/local +BINDIR = $(PREFIX)/bin +MANDIR = $(PREFIX)/share/man/man1 +MANPAGE = ococo.1 + export CXX export CXXFLAGS -.PHONY: all clean +.PHONY: all clean install ococo all: ococo +install: ococo + install ococo $(BINDIR)/ococo + install $(MANPAGE) $(MANDIR)/$(MANPAGE) + ococo: $(MAKE) -C ./ext/htslib lib-static $(MAKE) -C ./src diff --git a/ococo.1 b/ococo.1 index a69dc8b..d2a8798 100644 --- a/ococo.1 +++ b/ococo.1 @@ -87,12 +87,10 @@ Minimum coverage required for update [2]. 
\fB\-M\fR, \fB\-\-maj\-thres\fR FLOAT Majority threshold [0.51]. .SH AUTHOR -.IP -Written by Karel Brinda (karel.brinda@gmail.com) at LIGM Universite Paris-Est Marne-la-Vallee, France +Written by Karel Brinda (karel.brinda@gmail.com) at LIGM Universite Paris-Est Marne-la-Vallee, France. .SH REPORTING BUGS Report bugs on https://github.com/karel-brinda/ococo/issues .SH EXAMPLES -.IP ococo \-i test.bam \-f test.fa \-m real\-time \-V \- ococo \-x ococo64 \-i test.bam \-f test.fa \-P \- \-V variants.vcf diff --git a/src/Makefile b/src/Makefile index ccfc525..7d63f39 100644 --- a/src/Makefile +++ b/src/Makefile @@ -2,7 +2,13 @@ all: misc.o main.o params.o -%.o: %.cpp +HEADERS=types.h stats.h version.h + +main.o: main.cpp consensus.h caller.h version.h $(HEADERS) + $(CXX) $(CXXFLAGS) $(DFLAGS) -c $< -I ../ext/htslib/ + + +%.o: %.cpp %.h $(HEADERS) $(CXX) $(CXXFLAGS) $(DFLAGS) -c $< -I ../ext/htslib/ clean: diff --git a/src/params.cpp b/src/params.cpp index 4e50f00..44f51ac 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -137,7 +137,10 @@ void ococo::params_t::print_help() { // "---------------------------------------------------------------------------------" "Examples:\n" " ococo -i test.bam -f test.fa -m real-time -V -\n" - " ococo -x ococo64 -i test.bam -f test.fa -P - -V variants.vcf\n" + " ococo -x ococo64 -i test.bam -f test.fa -P - -V variants.vcf\n\n" + // "---------------------------------------------------------------------------------" + "Note:\n" + " For more details, see the manual page 'man ./ococo.1'.\n" // "---------------------------------------------------------------------------------" // clang-format on << std::endl; From 82cfb0cb17286f0755ca06fb0cc54680659e5326 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Sun, 28 Aug 2016 13:16:36 +0200 Subject: [PATCH 32/32] Update --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 17a9839..f626480 100644 --- a/README.md +++ 
b/README.md @@ -8,14 +8,13 @@ Welcome to OCOCO, an online consensus caller. ## Prerequisities * GCC 4.8+ or equivalent -* CMake (http://cmake.org/) * ZLib ## Getting started ```bash git clone --recursive https://github.com/karel-brinda/ococo -cd ococo && cmake . && make +cd ococo && make -j ./ococo -i test.bam -f test.fa -m real-time --vcf-cons - ```