From dbd2e8399ca6718023e1d6b5c370c47bce83e2a6 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 27 Sep 2024 13:28:06 +0200 Subject: [PATCH 1/4] [MISC] Remove duplicate read_levels --- src/estimate.cpp | 39 ++------------------------------------- 1 file changed, 2 insertions(+), 37 deletions(-) diff --git a/src/estimate.cpp b/src/estimate.cpp index ff93f38..701c6d4 100644 --- a/src/estimate.cpp +++ b/src/estimate.cpp @@ -107,44 +107,9 @@ void check_ibf(min_arguments const & args, } // Reads the level file ibf creates +// Defined in ibf.cpp template -void read_levels(std::vector> & expressions, std::filesystem::path filename) -{ - std::ifstream fin; - fin.open(filename); - auto stream_view = seqan3::detail::istreambuf(fin); - auto stream_it = std::ranges::begin(stream_view); - size_t j{0}; - std::vector empty_vector{}; - - std::string buffer{}; - - // Read line = expression levels - do - { - if (j == expressions.size()) - expressions.push_back(empty_vector); - std::ranges::copy(stream_view | seqan3::detail::take_until_or_throw(seqan3::is_char<' '>), - std::back_inserter(buffer)); - if constexpr (std::same_as) - expressions[j].push_back((uint16_t)std::stoi(buffer)); - else - expressions[j].push_back((double)std::stod(buffer)); - buffer.clear(); - if (*stream_it != '/') - ++stream_it; - - if (*stream_it == '\n') - { - ++stream_it; - j++; - } - } - while (*stream_it != '/'); - ++stream_it; - - fin.close(); -} +void read_levels(std::vector> &, std::filesystem::path); /*! \brief Function to estimate expression value. * \param args The arguments. 
From 8a9c7107306d168f421572e13ba364c881578499 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 27 Sep 2024 13:29:53 +0200 Subject: [PATCH 2/4] [INFRA] Make datasources quiet Use FetchContent, CMake 3.30 makes FetchContent quiet by default: https://cmake.org/cmake/help/latest/policy/CMP0168.html --- CMakeLists.txt | 2 +- cmake/test/declare_datasource.cmake | 23 +++++++++-------------- doc/CMakeLists.txt | 2 +- src/CMakeLists.txt | 2 +- test/CMakeLists.txt | 2 +- test/api/CMakeLists.txt | 2 +- test/cli/CMakeLists.txt | 2 +- 7 files changed, 15 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 37bf56a..142dad4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.25) +cmake_minimum_required (VERSION 3.25...3.30) # Define the application name and version. project (needle diff --git a/cmake/test/declare_datasource.cmake b/cmake/test/declare_datasource.cmake index 7955c4b..a7a236d 100644 --- a/cmake/test/declare_datasource.cmake +++ b/cmake/test/declare_datasource.cmake @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik # SPDX-License-Identifier: CC0-1.0 -include (ExternalProject) +include (FetchContent) # Example call: # @@ -50,19 +50,14 @@ function (declare_datasource) # create data folder file (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/data) - ExternalProject_Add ("${datasource_name}" - URL "${ARG_URL}" - URL_HASH "${ARG_URL_HASH}" - DOWNLOAD_NAME "${ARG_FILE}" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND ${CMAKE_COMMAND} -E create_symlink - ${CMAKE_CURRENT_BINARY_DIR}/data/${ARG_FILE} - TEST_COMMAND "" - PREFIX "${CMAKE_CURRENT_BINARY_DIR}/_datasources" - DOWNLOAD_NO_EXTRACT TRUE # don't extract archive files like .tar.gz. 
- EXCLUDE_FROM_ALL TRUE - ${ARG_UNPARSED_ARGUMENTS} + FetchContent_Populate ("${datasource_name}" + URL "${ARG_URL}" + URL_HASH "${ARG_URL_HASH}" + DOWNLOAD_NAME "${ARG_FILE}" + DOWNLOAD_NO_EXTRACT TRUE # don't extract archive files like .tar.gz. + SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/data/" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/data/" + EXCLUDE_FROM_ALL TRUE ${ARG_UNPARSED_ARGUMENTS} ) add_dependencies (${PROJECT_NAME}_test "${datasource_name}") diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index fec0d08..2d660ad 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.25) +cmake_minimum_required (VERSION 3.25...3.30) # Find doxygen. find_package (Doxygen 1.9.4 QUIET) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 517f7ca..b9357d1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.25) +cmake_minimum_required (VERSION 3.25...3.30) find_package (OpenMP REQUIRED COMPONENTS CXX) add_library ("${PROJECT_NAME}_lib" STATIC estimate.cpp ibf.cpp) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 2b1048c..aedd0dd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik # SPDX-License-Identifier: CC0-1.0 -cmake_minimum_required (VERSION 3.25) +cmake_minimum_required (VERSION 3.25...3.30) # This includes `cmake/test/config.cmake` which takes care of setting up the test infrastructure. It also provides # the `add_app_test` macro, which is used to add tests to the CMake test suite. 
diff --git a/test/api/CMakeLists.txt b/test/api/CMakeLists.txt index 105e00b..26aea24 100644 --- a/test/api/CMakeLists.txt +++ b/test/api/CMakeLists.txt @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik # SPDX-License-Identifier: CC0-1.0 -cmake_minimum_required (VERSION 3.25) +cmake_minimum_required (VERSION 3.25...3.30) add_app_test (count_test.cpp) add_app_test (insert_delete_test.cpp) diff --git a/test/cli/CMakeLists.txt b/test/cli/CMakeLists.txt index 8c984d1..ca75ed5 100644 --- a/test/cli/CMakeLists.txt +++ b/test/cli/CMakeLists.txt @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik # SPDX-License-Identifier: CC0-1.0 -cmake_minimum_required (VERSION 3.25) +cmake_minimum_required (VERSION 3.25...3.30) add_app_test (needle_options_test.cpp) add_app_test (ibf_options_test.cpp) From 3a219ec902d20325bdd8fc462662bc34b9a8e740 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 27 Sep 2024 13:50:41 +0200 Subject: [PATCH 3/4] [DOC] Update README --- README.md | 129 +++++++++++++++++++++++----------------- cmake/test/config.cmake | 2 +- src/CMakeLists.txt | 1 + 3 files changed, 77 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index 69558e1..c0c0b83 100644 --- a/README.md +++ b/README.md @@ -10,65 +10,72 @@ ### A fast and space-efficient pre-filter for estimating the quantification of very large collections of nucleotide sequences Needle is a tool for semi-quantitative analysis of very large collections of nucleotide sequences. -Needle stores its data in multiple interleaved Bloom filter, a fast and space efficient probabilistic data structure and uses a windowing scheme (also called minimisers) to reduce the amount of data to store. How many interleaved Bloom filter are used is defined by the user. 
Each interleaved Bloom filter has a so called expression threshold and stores minimisers with an occurrence greater than or equal to its own expression threshold and smaller than the next biggest expression threshold (if there is no bigger expression threshold, all greater than or equal to the threshold are stored). These expression thresholds are then used during the query (called estimate) to approximate the expression values of given transcripts. -## Citation +Needle stores its data in multiple Interleaved Bloom Filters (IBFs), a fast and space efficient probabilistic data structure and uses a windowing scheme (also called minimisers) to reduce the amount of data to store. How many Interleaved Bloom Filters are used is defined by the user. + +Each IBF has a so-called expression threshold and stores minimisers with an occurrence greater than or equal to its own expression threshold and smaller than the next biggest expression threshold (if there is no bigger expression threshold, all greater than or equal to the threshold are stored). These expression thresholds are then used during the query (called estimate) to approximate the expression values of given transcripts. -Please cite: +## Citation -Mitra Darvish, Enrico Seiler, Svenja Mehringer, René Rahn, Knut Reinert, Needle: a fast and space-efficient prefilter for estimating the quantification of very large collections of expression experiments, Bioinformatics, Volume 38, Issue 17, 1 September 2022, Pages 4100–4108, https://doi.org/10.1093/bioinformatics/btac492 +In your academic works (also comparisons and pipelines) please cite: + * *Needle: a fast and space-efficient prefilter for estimating the quantification of very large collections of expression experiments*; + Mitra Darvish, Enrico Seiler, Svenja Mehringer, René Rahn, and Knut Reinert; Bioinformatics, Volume 38, Issue 17, 1 September 2022, Pages 4100–4108. 
+ doi: https://doi.org/10.1093/bioinformatics/btac492 -## Download, Install & Build +## Download and Installation -
Prerequisites (click to expand) +### From Source -* CMake >= 3.10 -* GCC 10, 11 or 12 (most recent minor version) +**Prerequisites**: +* CMake >= 3.25 +* GCC >= 12 +* LLVM Clang >= 17 +* Intel oneAPI DPC++/C++ Compiler >= 2024.2 (older versions may work, but are not tested) * git -Refer to the [Seqan3 Setup Tutorial](https://docs.seqan.de/seqan/3-master-user/setup.html) for more in depth +Refer to the [SeqAn3 Setup Tutorial](https://docs.seqan.de/seqan3/main_user/setup.html) for more in-depth information. -
- -### Install with [bioconda](https://bioconda.github.io/recipes/needle/README.html) (Linux) - -```bash -conda install -c bioconda -c conda-forge needle -``` - -### Install via github Needle can be built by following these commands: -``` -git clone --recurse-submodules https://github.com/seqan/needle.git +```bash +git clone https://github.com/seqan/needle.git mkdir build-needle && cd build-needle -cmake ../needle +cmake ../needle -DCMAKE_BUILD_TYPE=Release make ``` -Run test to check, if Needle is working as intended. All tests should pass. +Run tests to check if Needle is working as intended. All tests should pass. +```bash +make check ``` -make test + +### Install with [bioconda](https://bioconda.github.io/recipes/needle/README.html) (Linux) + +```bash +conda install -c bioconda -c conda-forge needle ``` -If you are interested in building the documentation, just use the command: `make doc` +## Usage + +### Build a Needle index +To build a Needle index, several sequence files have to be given. All sequence file formats supported by SeqAn3 are accepted as an input (fasta, fastq, embl,... and their compressed forms). + +The flag `--paired` in the example below indicates that the given sequence files are paired-end experiments. Furthermore, the false positive rate has to be specified with the parameter `f`. -## Build an Needle index -In order to build a Needle index a number of sequence files have to be given. All sequence file formats supported by seqan3 are accepted as an input (fasta, fastq, embl,... and their compressed forms). The flag `--paired` in the example below indicates that the given sequence files are paired-end experiments. Furthermore, the false positive rate has to be specified with the parameter `f`. Use -h/--help for more information and to see further parameters. The flag `-c` can be used to build a compressed Needle index. 
The following example creates a compressed Needle index for two paired-end experiments for the expression thresholds 4 and 32. -``` +```bash ./bin/needle ibf ../needle/test/data/exp_*.fasta --paired -e 16 -e 32 -f 0.3 -c -o example ``` -Although, this works. It is recommended to calculate the minimisers beforehand by using the option `minimisers`. It calculates the minimisers of given experiments and stores their hash values and their occurrences in a binary file named ".minimiser". +Even though this works, it is recommended to calculate the minimisers beforehand by using the option `minimisers`. It calculates the minimisers of given experiments and stores their hash values and their occurrences in a binary file named ".minimiser". The following command calculates the minimisers in the two experiments. -``` +```bash ./bin/needle minimiser ../needle/test/data/exp_*.fasta --paired ``` @@ -81,53 +88,67 @@ A minimiser file is a binary file containing the following data: - shape (uint64_t), if flag is false - all minimiser hashes (uint64_t) with their occurrences (uint16_t) -Based on the minimiser files the Needle index can be computed by using the following command: -``` +Based on the minimiser files, the Needle index can be computed by using the following command: +```bash ./bin/needle ibfmin exp*.minimiser -e 16 -e 32 -f 0.3 -c -o example ``` -## Estimate -To estimate the expression value of one transcript a sequence file has to be given. Use the parameter "-i" to define where the Needle index can be found (should be equal with "-o" in the previous commands). +### Estimate +To estimate the expression value of one transcript, a sequence file has to be given. + +Use the parameter "-i" to define where the Needle index can be found (should be equal with "-o" in the previous commands). + Use -h/--help for more information and to see further parameters. 
+ The following example searches for one gene, which is expressed in the first experiment with expression 6 and in the second with expression 37. Therefore, it should be found only in the second experiment but not the first when using expression levels of 16 and 32. -``` +```bash ./bin/needle estimate ../needle/test/data/gene.fasta -i example ``` The created file "expressions.out" (if you prefer a different name, use "-o") should contain the following: -``` +```text GeneA 0 32 ``` -## Insert into an existing Needle index -It is possible to insert new sequence files into an uncompressed Needle index. Similar to the build step this can be done by either using the sequence files as input directly or the minimiser files outputed by `needle minimiser`. Most options are the same as the ones from the build step, however as the Needle index already exist, neither the false positive rate nor the number of hash functions can be changed. It is necessary to specify `i` to the directory, where the existing Needle index can be found. +### Insert into an existing Needle index +It is possible to insert new sequence files into an uncompressed Needle index. + +Similar to the build step, this can be done by either using the sequence files as input directly or the minimiser files outputted by `needle minimiser`. + +Most options are the same as the ones from the build step, however as the Needle index already exists, neither the false positive rate nor the number of hash functions can be changed. + +It is necessary to specify `i` to the directory, where the existing Needle index can be found. The following example inserts into the Needle index build above for two paired-end experiments. 
-``` -./bin/needle ibf ../needle/test/data/exp_0*.fasta --paired -e 16 -e 32 -f 0.3 -c -o example // Create Index -./bin/needle insert ../needle/test/data/exp_1*.fasta --paired -i example // Insert into created index +```bash +# Create Index +./bin/needle ibf ../needle/test/data/exp_0*.fasta --paired -e 16 -e 32 -f 0.3 -c -o example +# Insert into created index +./bin/needle insert ../needle/test/data/exp_1*.fasta --paired -i example ``` -Based on minimiser files an insertion to the Needle index can be achieved by using the following command: -``` -./bin/needle ibf ../needle/test/data/exp_0*.fasta --paired -e 16 -e 32 -f 0.3 -c -o example // Create Index -./bin/needle insertmin exp*.minimiser -i example // Insert into created index +Based on minimiser files, an insertion to the Needle index can be achieved by using the following command: +```bash +# Create Index +./bin/needle ibf ../needle/test/data/exp_0*.fasta --paired -e 16 -e 32 -f 0.3 -c -o example +# Insert into created index +./bin/needle insertmin exp*.minimiser -i example ``` The insert methods based on minimiser or on sequence files is independent of the way the index was created. -## Delete experiments from an existing Needle index -It is possible to delete sequence files from an uncompressed Needle index by specifiying the position of the experiment, which should be deleted. -These deleted experiments won't change the size of the index as the space is kept for later insertions. -``` -./bin/needle ibf ../needle/test/data/exp_*.fasta --paired -e 16 -e 32 -f 0.3 -c -o example // Create Index -./bin/needle delete -i example 0 // Delete first experiment exp_0 (with position 0) from index -`` - +### Delete experiments from an existing Needle index +It is possible to delete sequence files from an uncompressed Needle index by specifying the position of the experiment, which should be deleted. +These deleted experiments won't change the size of the index, as the space is kept for later insertions. 
+```bash +# Create Index +./bin/needle ibf ../needle/test/data/exp_*.fasta --paired -e 16 -e 32 -f 0.3 -c -o example +# Delete first experiment exp_0 (with position 0) from index +./bin/needle delete -i example 0 +``` ## Note - -This app was created with the [seqan3 app-template](https://github.com/seqan/app-template). +This app was created with the [SeqAn app-template](https://github.com/seqan/app-template). diff --git a/cmake/test/config.cmake b/cmake/test/config.cmake index f411a7d..c29ac3f 100644 --- a/cmake/test/config.cmake +++ b/cmake/test/config.cmake @@ -13,7 +13,7 @@ enable_testing () file (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/output) add_definitions (-DOUTPUTDIR=\"${CMAKE_CURRENT_BINARY_DIR}/output/\") add_definitions (-DDATADIR=\"${CMAKE_CURRENT_BINARY_DIR}/data/\") -add_definitions (-DBINDIR=\"${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/\") +add_definitions (-DBINDIR=\"${CMAKE_BINARY_DIR}/bin/\") add_definitions (-DAPPNAME=\"${PROJECT_NAME}\") # Add the test interface library. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b9357d1..071777e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,3 +9,4 @@ target_include_directories ("${PROJECT_NAME}_lib" PUBLIC ../include) add_executable ("${PROJECT_NAME}" main.cpp) target_link_libraries ("${PROJECT_NAME}" PRIVATE "${PROJECT_NAME}_lib") +set_target_properties ("${PROJECT_NAME}" PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") From 67ae60898fc46a9ecaae7781ececcf829c0239de Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 27 Sep 2024 14:01:12 +0200 Subject: [PATCH 4/4] [MISC] Remove unused test header --- test/cli/cli_test.hpp | 90 ------------------------------------------- 1 file changed, 90 deletions(-) delete mode 100644 test/cli/cli_test.hpp diff --git a/test/cli/cli_test.hpp b/test/cli/cli_test.hpp deleted file mode 100644 index 80dcf03..0000000 --- a/test/cli/cli_test.hpp +++ /dev/null @@ -1,90 +0,0 @@ -#include - -#include // system calls -#include // test 
directory creation -#include // ostringstream -#include // strings - -// Include the EXPECT_RANGE_EQ macro for better information if range elements differ. -#include - -#pragma once - -// Provides functions for CLI test implementation. -struct cli_test : public ::testing::Test -{ -private: - // Holds the original work directory where Gtest has been started. - std::filesystem::path original_workdir{}; - -protected: - // Result struct for captured streams and exit code. - struct cli_test_result - { - std::string out{}; - std::string err{}; - int exit_code{}; - }; - - // Invoke the app execution. The command line call should be given as separate parameters. - template - cli_test_result execute_app(CommandItemTypes &&... command_items) - { - cli_test_result result{}; - - // Assemble the command string and disable version check. - std::ostringstream command{}; - command << "SEQAN3_NO_VERSION_CHECK=1 " << BINDIR; - int a[] = {0, ((void)(command << command_items << ' '), 0)...}; - (void)a; - - // Always capture the output streams. - testing::internal::CaptureStdout(); - testing::internal::CaptureStderr(); - - // Run the command and return results. - result.exit_code = std::system(command.str().c_str()); - result.out = testing::internal::GetCapturedStdout(); - result.err = testing::internal::GetCapturedStderr(); - return result; - } - - // Generate the full path of a test input file that is provided in the data directory. - static std::filesystem::path data(std::string const & filename) - { - return std::filesystem::path{std::string{DATADIR}}.concat(filename); - } - - // Create an individual work directory for the current test. - void SetUp() override - { - // Assemble the directory name. 
- ::testing::TestInfo const * const info = ::testing::UnitTest::GetInstance()->current_test_info(); - std::filesystem::path const test_dir{std::string{OUTPUTDIR} + std::string{info->test_case_name()} - + std::string{"."} + std::string{info->name()}}; - try - { - std::filesystem::remove_all(test_dir); // delete the directory if it exists - std::filesystem::create_directories(test_dir); // create the new empty directory - original_workdir = std::filesystem::current_path(); // store original work dir path - std::filesystem::current_path(test_dir); // change the work dir - } - catch (std::exception const & exc) - { - FAIL() << "Failed to set up the test directory " << test_dir << ":\n" << exc.what(); - } - } - - // Switch back to the initial work directory. - void TearDown() override - { - try - { - std::filesystem::current_path(original_workdir); // restore the original work dir - } - catch (std::exception const & exc) - { - FAIL() << "Failed to set the work directory to " << original_workdir << ":\n" << exc.what(); - } - } -};