diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml new file mode 100644 index 0000000..8277385 --- /dev/null +++ b/.github/workflows/docker-build.yml @@ -0,0 +1,74 @@ +name: Docker Build and Test + +on: + push: + branches: + - main + pull_request: + workflow_dispatch: + inputs: + use_cache: + description: 'Use cache for this run' + required: true + default: 'true' + num_build_jobs: + description: 'Number of build jobs' + required: true + default: '2' + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Delete huge unnecessary tools folder + run: rm -rf /opt/hostedtoolcache + + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Cache build directory + if: github.event_name != 'workflow_dispatch' || github.event.inputs.use_cache == 'true' + uses: actions/cache@v3 + with: + path: build + key: ${{ runner.os }}-build-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-build- + + - name: Set lowercase repository owner + id: repo_owner + run: echo "::set-output name=owner::${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]' + + - name: Build Docker image + uses: docker/build-push-action@v4 + with: + context: . + load: true + tags: ghcr.io/${{ steps.repo_owner.outputs.owner }}/build_test:latest + cache-from: type=gha + cache-to: type=gha,mode=max + build-args: | + BUILDKIT_INLINE_CACHE=1 + RAWHASH_NUM_BUILD_JOBS=${{ github.event.inputs.num_build_jobs || '2' }} + + - name: Extract build directory from Docker image + run: | + container_id=$(docker create ghcr.io/${{ steps.repo_owner.outputs.owner }}/build_test:latest) + docker cp $container_id:/rawhash2/build ./build + docker rm $container_id + + - name: Run Docker container + run: docker run --rm ghcr.io/${{ steps.repo_owner.outputs.owner }}/build_test:latest -h diff --git a/.gitignore b/.gitignore index 275a796..6de53e7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,9 @@ bin/ +build/ +run_dir/ +.venv/ +example_out/ +__pycache__/ test/data/d1_* test/data/d2_* @@ -8,7 +13,6 @@ test/data/d5_* test/data/d6_* test/data/d7_* test/data/d8_* -test/data/download_d8_* test/data/*/*.fa test/data/*/*.fasta test/data/*/*.tar @@ -73,13 +77,17 @@ test/eval/*.summary */.DS_Store test/*.time test/*.idx -src/*.o +#src/*.o extern/pod5* test/evaluation/rawsamble/ -extern/tensorflow/ +test/scripts/*test* +test/evaluation/read_mapping/*/parameters.txt +test/evaluation/read_mapping/*/results.txt +test/evaluation/read_mapping/*parameters*.txt +test/evaluation/read_mapping/*parameters/ +test/evaluation/read_mapping/s_modify.py -test/evaluation/read_mapping/*_parameters/ -test/evaluation/read_mapping/s_modify.py \ No newline at end of file +test/evaluation/read_mapping/*.features diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..3953cca --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,4 @@ +cmake_minimum_required(VERSION 3.10) +project(RawHash2Root) + +add_subdirectory(src) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..4792071 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,17 @@ +FROM gcc:latest + +RUN apt-get update && apt-get install -y \ + cmake make mold ccache \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /rawhash2 +COPY . /rawhash2 + +ARG RAWHASH_NUM_BUILD_JOBS +RUN mkdir -p build && cd build \ + && cmake .. \ + && make -j $RAWHASH_NUM_BUILD_JOBS + +ENTRYPOINT ["./build/bin/rawhash2"] + +LABEL Name=rawhash2 Version=0.0.1 diff --git a/Makefile b/Makefile deleted file mode 100644 index eeb1108..0000000 --- a/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -.PHONY: all subset clean help - -all:rawhash2 - -help: ##Show help - +$(MAKE) -C src help - -rawhash2: - @if [ ! -e bin ] ; then mkdir -p ./bin/ ; fi - +$(MAKE) -C src - mv ./src/rawhash2 ./bin/ - -subset: - @if [ ! -e bin ] ; then mkdir -p ./bin/ ; fi - +$(MAKE) -C src subset - mv ./src/rawhash2 ./bin/ - -clean: - rm -rf bin/ - +$(MAKE) clean -C ./src/ - diff --git a/README.md b/README.md index 901f276..109e506 100644 --- a/README.md +++ b/README.md @@ -40,46 +40,76 @@ RawHash performs real-time mapping of nanopore raw signals. When the prefix of r # Installation -* Clone the code from its GitHub repository (`--recursive` must be used): +* Clone the code from its GitHub repository and recursively initialize submodules: ```bash -git clone --recursive https://github.com/CMU-SAFARI/RawHash.git rawhash2 +git clone https://github.com/CMU-SAFARI/RawHash.git rawhash2 +cd rawhash2 && git submodule update --init --recursive ``` * Compile (Make sure you have a C++ compiler and GNU make): ```bash -cd rawhash2 && make +# if not doing a fresh clone, make sure that the submodules don't have anything built from previous makefile-based +# setup , i.e. delete extern directory, then initialize submodules as above +(mkdir -p build && cd build && cmake .. && make -j) +build/bin/rawhash2 -h ``` -If the compilation is successful, the path to the binary will be `bin/rawhash2`. +Troubleshooting: +- `makefile error 2`: rerun `make -j`, then the actual error is shown +- updating submodules: the current cmake setup may not correctly handle this, so the easiest solution is to delete the build directory + +If the compilation is successful, the default path to the binary will be `build/bin/rawhash2`. + +* Installation + +You can install RawHash2 into the CMake-provided platform-specific destination (e.g. `/usr/local/` on UNIX) with `make install`: + +```bash +make install +rawhash2 -h +``` + +Installation directory can be overridden by providing `-DCMAKE_INSTALL_PREFIX=...` argument to the `cmake ..` command, e.g. + +```bash +cmake -DCMAKE_INSTALL_PREFIX=./install .. +make -j +make install +./install/bin/rawhash2 -h +``` + +Note that `CMAKE_INSTALL_PREFIX` is a cached variable in CMake. ## Compiling with HDF5, SLOW5, and POD5 -We are aware that some of the pre-compiled libraries (e.g., POD5) may not work in your system and you may need to compile these libraries from scratch. Additionally, it may be possible that you may not want to compile any of the HDF5, SLOW5, or POD5 libraries if you are not going to use them. RawHash2 provides a flexible Makefile to enable custom compilation of these libraries. +We are aware that some of the pre-compiled libraries (e.g., POD5) may not work in your system and you may need to compile these libraries from scratch. Additionally, it may be possible that you may not want to compile any of the HDF5, SLOW5, or POD5 libraries if you are not going to use them. RawHash2 provides several CMake options to enable custom compilation of these libraries. -* It is possible to provide your own include and lib directories for *any* of the HDF5, SLOW5, and POD5 libraries, if you do not want to use the source code or the pre-compiled binaries that come with RawHash2. To use your own include and lib directories you should pass them to `make` when compiling as follows: +It is possible to provide your own include and lib directories for *any* of the HDF5, SLOW5, and POD5 libraries, if you do not want to use the source code or the pre-compiled binaries that come with RawHash2. To use your own include and lib directories you should pass them to `cmake` when compiling as follows: ```bash -#Provide the path to all of the HDF5/SLOW5/POD5 include and lib directories during compilation -make HDF5_INCLUDE_DIR=/path/to/hdf5/include HDF5_LIB_DIR=/path/to/hdf5/lib \ - SLOW5_INCLUDE_DIR=/path/to/slow5/include SLOW5_LIB_DIR=/path/to/slow5/lib \ - POD5_INCLUDE_DIR=/path/to/pod5/include POD5_LIB_DIR=/path/to/pod5/lib +# Provide the path to all of the HDF5/SLOW5/POD5 include and lib directories during compilation +cmake -DHDF5_DIR=/path/to/hdf5 -DSLOW5_DIR=/path/to/slow5 -DPOD5_DIR=/path/to/pod5 .. -#Provide the path to only POD5 include and lib directories during compilation -make POD5_INCLUDE_DIR=/path/to/pod5/include POD5_LIB_DIR=/path/to/pod5/lib +# Provide the path to only POD5 include and lib directories during compilation +cmake -DPOD5_DIR=/path/to/pod5 ``` -* It is possible to disable compiling *any* of the HDF5, SLOW5, and POD5 libraries. To disable them, you can use the following variables +Note that the provided path should generally contain _both_ `include/` and `lib/` folders with the corresponding project's include and library files. + +It is possible to disable compiling *any* of the HDF5, SLOW5, and POD5 libraries. To disable them, you can use the following variables ```bash -#Disables compiling HDF5 -make NOHDF5=1 +# Disables compiling HDF5 +cmake -DNOHDF5=1 .. -#Disables compiling SLOW5 and POD5 -make NOSLOW5=1 NOPOD5=1 +# Disables compiling SLOW5 and POD5 +cmake -DNOSLOW5=1 -DNOPOD5=1 .. ``` +The variables and paths will be stored in CMake cache, meaning that you would need to run `cmake` again with explicitly provided new values to change them. + # Usage ## Getting help diff --git a/cmake/SetupCCacheMold.cmake b/cmake/SetupCCacheMold.cmake new file mode 100644 index 0000000..e3e3635 --- /dev/null +++ b/cmake/SetupCCacheMold.cmake @@ -0,0 +1,57 @@ +function(enable_ccache) + # export PATH="/usr/lib/ccache:$PATH" + find_program(CCACHE_EXE ccache) + if(CCACHE_EXE) + message(STATUS "found ccache at ${CCACHE_EXE}, using it") + set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_EXE}" CACHE STRING "C compiler launcher") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXE}" CACHE STRING "C++ compiler launcher") + else() + message(STATUS "ccache not found, not using it") + endif() +endfunction() + +# mold is a much faster linker than ld, also see for mold: https://github.com/heavyai/heavydb/blob/master/CMakeLists.txt +# or try: https://gitlab.kitware.com/cmake/cmake/-/merge_requests/8861, can now use CMAKE_LINKER_TYPE +macro(set_alternate_linker linker) + find_program(LINKER_EXECUTABLE ld.${linker} ${linker}) + if(LINKER_EXECUTABLE) + message(STATUS "Found linker ${linker}: ${LINKER_EXECUTABLE}") + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" AND "${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 12.0.0) + add_link_options("-ld-path=${linker}") + elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND "${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 12.1.0 AND "${linker}" STREQUAL "mold") + # LINKER_EXECUTABLE will be a full path to ld.mold, so we replace the end of the path, resulting in the relative + # libexec/mold dir, and tell GCC to look there first for an override version of executables, in this case, ld + string(REPLACE "bin/ld.mold" "libexec/mold" PATH_TO_LIBEXEC_MOLD ${LINKER_EXECUTABLE}) + add_link_options("-B${PATH_TO_LIBEXEC_MOLD}") + else() + add_link_options("-fuse-ld=${linker}") + endif() + else() + message(FATAL_ERROR "Could not find linker ${linker}") + endif() +endmacro() + +# not working +# function(setup_ccache_mold) +# if(USE_CCACHE) +# find_program(CCACHE_PROGRAM ccache) +# if(CCACHE_PROGRAM) +# message(STATUS "ccache found: ${CCACHE_PROGRAM}") +# set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") +# set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") +# else() +# message(WARNING "ccache not found, using default compiler") +# endif() +# endif() + +# if(USE_MOLD) +# find_program(MOLD_PROGRAM mold) +# # todo: does not seem to work, for a working configuration, see https://github.com/ratschlab/readuntil_fake/blob/refactor/cmake/utils.cmake +# if(MOLD_PROGRAM) +# message(STATUS "mold found: ${MOLD_PROGRAM}") +# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${MOLD_PROGRAM}") +# else() +# message(WARNING "mold not found, using default linker") +# endif() +# endif() +# endfunction() diff --git a/cmake/SetupHDF5.cmake b/cmake/SetupHDF5.cmake new file mode 100644 index 0000000..dae4c68 --- /dev/null +++ b/cmake/SetupHDF5.cmake @@ -0,0 +1,41 @@ +include(${CMAKE_CURRENT_LIST_DIR}/Utils.cmake) + +function(add_hdf5_to_target TARGET_NAME) + if(NOHDF5) + target_compile_definitions(${TARGET_NAME} PRIVATE NHDF5RH=1) + else() + if(HDF5_COMPILE) + add_dependencies(${TARGET_NAME} hdf5_build) + endif() + add_imported_library(${TARGET_NAME} hdf5) + endif() +endfunction() + +function(setup_hdf5) + if(NOT NOHDF5) + # print HDF5_DIR + message(STATUS "EXTERNAL_PROJECTS_BUILD_DIR: ${EXTERNAL_PROJECTS_BUILD_DIR}") + message(STATUS "HDF5_DIR: ${HDF5_DIR}") + set(HDF5_SOURCE_DIR ${CMAKE_SOURCE_DIR}/extern/hdf5) + if(HDF5_COMPILE) + if(NOT HDF5_DIR) + override_cached(HDF5_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/hdf5) + endif() + set(HDF5_BUILD_DIR ${HDF5_DIR}/build) + ExternalProject_Add( + hdf5_build + BUILD_ALWAYS 1 # Rebuild if local checkout is updated + SOURCE_DIR ${HDF5_SOURCE_DIR} + BINARY_DIR ${HDF5_BUILD_DIR} + CONFIGURE_COMMAND ${HDF5_SOURCE_DIR}/configure --enable-threadsafe --disable-hl --prefix=${HDF5_BUILD_DIR} + # INSTALL_DIR and DCMAKE_INSTALL_PREFIX are ignored by hdf5 + INSTALL_COMMAND make install prefix=${HDF5_DIR} + ) + else() + if(NOT HDF5_DIR) + message(FATAL_ERROR "HDF5_COMPILE is OFF, but no dir provided") + endif() + endif() + define_imported_library(hdf5 ${HDF5_DIR} SHARED) + endif() +endfunction() diff --git a/cmake/SetupPOD5.cmake b/cmake/SetupPOD5.cmake new file mode 100644 index 0000000..9acc711 --- /dev/null +++ b/cmake/SetupPOD5.cmake @@ -0,0 +1,197 @@ +include(${CMAKE_CURRENT_LIST_DIR}/Utils.cmake) + +function(add_zstd_to_target TARGET_NAME) + set(ZSTD_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/zstd) + add_dependencies(${TARGET_NAME} zstd_build) + add_imported_library(${TARGET_NAME} zstd) +endfunction() + +function(setup_zstd) + set(ZSTD_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/zstd) + ExternalProject_Add( + zstd_build + BUILD_ALWAYS 1 # Rebuild if local checkout is updated + SOURCE_DIR ${CMAKE_SOURCE_DIR}/extern/zstd/build/cmake + BINARY_DIR ${ZSTD_DIR}/build + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZSTD_DIR} + ) + define_imported_library(zstd ${ZSTD_DIR} SHARED) +endfunction() + +function(add_pod5_to_target TARGET_NAME) + if(NOPOD5) + target_compile_definitions(${TARGET_NAME} PRIVATE NPOD5RH=1) + else() + add_zstd_to_target(${TARGET_NAME}) + + set(POD5_VERSION "0.2.2") + set(POD5_URLDIR "pod5-${POD5_VERSION}-${CMAKE_SYSTEM_NAME}") + set(POD5_REPO "https://github.com/nanoporetech/pod5-file-format") + + resolve_pod5_url() + + if(POD5_DOWNLOAD) + add_dependencies(${TARGET_NAME} pod5_download) + endif() + target_link_libraries(${TARGET_NAME} PRIVATE ${POD5_LIBRARIES}) + endif() +endfunction() + +function(setup_pod5) + if(NOT NOPOD5) + setup_zstd() + + set(POD5_VERSION "0.2.2") + set(POD5_URLDIR "pod5-${POD5_VERSION}-${CMAKE_SYSTEM_NAME}") + set(POD5_REPO "https://github.com/nanoporetech/pod5-file-format") + + resolve_pod5_url() + + if(POD5_DOWNLOAD) + if(NOT POD5_DIR) + override_cached(POD5_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/${POD5_URLDIR}) + endif() + ExternalProject_Add( + pod5_download + SOURCE_DIR ${POD5_DIR} + URL ${POD5_URL} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + # requires cmake >= 3.24 + # DOWNLOAD_EXTRACT_TIMESTAMP TRUE + ) + add_dependencies(${TARGET_NAME} pod5_download) + else() + if(NOT POD5_DIR) + message(FATAL_ERROR "POD5_DOWNLOAD is OFF, but no dir provided") + endif() + endif() + include_directories(${POD5_DIR}/include) + endif() +endfunction() + +# POD5_URL and POD5_LIBRARIES are set at PARENT_SCOPE +function(resolve_pod5_url) + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(POD5_LIB "lib64") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm)") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0) + set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-linux-gcc7-arm64.tar.gz") + set(POD5_LIB "lib") + else() + set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-linux-arm64.tar.gz" PARENT_SCOPE) + endif() + else() + set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-linux-x64.tar.gz" PARENT_SCOPE) + endif() + set(POD5_LIB_DIR "${EXTERNAL_PROJECTS_BUILD_DIR}/${POD5_URLDIR}/${POD5_LIB}") + set(POD5_LIBRARIES "${POD5_LIB_DIR}/libpod5_format.so" + "${POD5_LIB_DIR}/libarrow.so" + "${POD5_LIB_DIR}/libjemalloc_pic.so" PARENT_SCOPE) + elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-osx-11.0-arm64.tar.gz") + set(POD5_LIB_DIR "${EXTERNAL_PROJECTS_BUILD_DIR}/${POD5_URLDIR}/lib") + set(POD5_LIBRARIES "${POD5_LIB_DIR}/libpod5_format.so" + "${POD5_LIB_DIR}/libarrow.so" PARENT_SCOPE) + elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows_NT") + set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-win" PARENT_SCOPE) + endif() +endfunction() + + +# not working because of improper design, PARENT_SCOPE should not be used, rather define targets properly +# include(${CMAKE_CURRENT_LIST_DIR}/Utils.cmake) + +# set(ZSTD_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/zstd) + +# function(setup_zstd) +# ExternalProject_Add( +# zstd_build +# SOURCE_DIR ${CMAKE_SOURCE_DIR}/extern/zstd/build/cmake +# BINARY_DIR ${ZSTD_DIR}/build +# CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZSTD_DIR} +# ) +# endfunction() + +# function(add_zstd_to_target TARGET_NAME) +# add_dependencies(${TARGET_NAME} zstd_build) +# link_imported_library(${TARGET_NAME} zstd ${ZSTD_DIR}) +# target_link_libraries(${TARGET_NAME} PRIVATE zstd) +# endfunction() + +# function(setup_pod5) +# if(NOPOD5) +# return() +# endif() + +# setup_zstd() + +# set(POD5_VERSION "0.2.2") +# set(POD5_URLDIR "pod5-${POD5_VERSION}-${CMAKE_SYSTEM_NAME}") +# set(POD5_REPO "https://github.com/nanoporetech/pod5-file-format") + +# resolve_pod5_url() + +# if(POD5_DOWNLOAD) +# if(NOT POD5_DIR) +# override_cached(POD5_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/${POD5_URLDIR}) +# endif() +# ExternalProject_Add( +# pod5_download +# SOURCE_DIR ${POD5_DIR} +# URL ${POD5_URL} +# CONFIGURE_COMMAND "" +# BUILD_COMMAND "" +# INSTALL_COMMAND "" +# # requires cmake >= 3.24 +# # DOWNLOAD_EXTRACT_TIMESTAMP TRUE +# ) +# else() +# if(NOT POD5_DIR) +# message(FATAL_ERROR "POD5_DOWNLOAD is OFF, but no dir provided") +# endif() +# endif() +# endfunction() + +# function(add_pod5_to_target TARGET_NAME) +# if(NOPOD5) +# target_compile_definitions(${TARGET_NAME} PRIVATE NPOD5RH=1) +# else() +# add_zstd_to_target(${TARGET_NAME}) + +# add_dependencies(${TARGET_NAME} pod5_download) +# include_directories(${POD5_DIR}/include) +# message(STATUS "Adding include dir ${POD5_DIR}/include, POD5 libraries: ${POD5_LIBRARIES}") +# target_link_libraries(${TARGET_NAME} PRIVATE ${POD5_LIBRARIES}) +# endif() +# endfunction() + +# # POD5_URL and POD5_LIBRARIES are set at PARENT_SCOPE +# function(resolve_pod5_url) +# if(CMAKE_SYSTEM_NAME STREQUAL "Linux") +# set(POD5_LIB "lib64") +# if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm)") +# if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0) +# set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-linux-gcc7-arm64.tar.gz") +# set(POD5_LIB "lib") +# else() +# set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-linux-arm64.tar.gz" PARENT_SCOPE) +# endif() +# else() +# set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-linux-x64.tar.gz" PARENT_SCOPE) +# endif() +# set(POD5_LIB_DIR "${EXTERNAL_PROJECTS_BUILD_DIR}/${POD5_URLDIR}/${POD5_LIB}") +# set(POD5_LIBRARIES "${POD5_LIB_DIR}/libpod5_format.a" +# "${POD5_LIB_DIR}/libarrow.a" +# "${POD5_LIB_DIR}/libjemalloc_pic.a" PARENT_SCOPE) +# elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") +# set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-osx-11.0-arm64.tar.gz") +# set(POD5_LIB_DIR "${EXTERNAL_PROJECTS_BUILD_DIR}/${POD5_URLDIR}/lib") +# set(POD5_LIBRARIES "${POD5_LIB_DIR}/libpod5_format.a" +# "${POD5_LIB_DIR}/libarrow.a" PARENT_SCOPE) +# elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows_NT") +# set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-win" PARENT_SCOPE) +# # todo: not setting libraries! +# endif() +# endfunction() diff --git a/cmake/SetupRUClient.cmake b/cmake/SetupRUClient.cmake new file mode 100644 index 0000000..c817827 --- /dev/null +++ b/cmake/SetupRUClient.cmake @@ -0,0 +1,23 @@ +include(${CMAKE_CURRENT_LIST_DIR}/Utils.cmake) + +function(add_ruclient_to_target TARGET_NAME) + if(RUCLIENT_ENABLED) + set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 20) + target_compile_definitions(${TARGET_NAME} PRIVATE RUCLIENT_ENABLED) + target_sources(${TARGET_NAME} PRIVATE rawhash_ruclient.cpp) + target_link_libraries(${TARGET_NAME} PRIVATE ont_device_client_LIB ru_method_LIB) + endif() +endfunction() + +function(setup_ruclient) + if(RUCLIENT_ENABLED) + if(NOT RUCLIENT_DIR) + override_cached(RUCLIENT_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/ruclient) + endif() + set(RUCLIENT_SOURCE_DIR ${CMAKE_SOURCE_DIR}/extern/readuntil_fake) + add_subdirectory(${RUCLIENT_SOURCE_DIR} ${RUCLIENT_DIR}) + include_directories(${RUCLIENT_SOURCE_DIR}/include) + else() + message(STATUS "ruclient disabled") + endif() +endfunction() diff --git a/cmake/SetupRawHashLikeTarget.cmake b/cmake/SetupRawHashLikeTarget.cmake new file mode 100644 index 0000000..0ba1c9b --- /dev/null +++ b/cmake/SetupRawHashLikeTarget.cmake @@ -0,0 +1,88 @@ +include(${CMAKE_CURRENT_LIST_DIR}/SetupRUClient.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/SetupPOD5.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/SetupHDF5.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/SetupSLOW5.cmake) + +setup_pod5() +setup_ruclient() +setup_hdf5() +setup_slow5() + +function(setup_rawhashlike_target TARGET_NAME) + target_compile_options(${TARGET_NAME} PRIVATE -w) # disable all warnings, dangerous! + + set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 11) + set_target_properties(${TARGET_NAME} PROPERTIES C_STANDARD 11) + + find_package(Threads REQUIRED) + target_link_libraries(${TARGET_NAME} PRIVATE Threads::Threads m z dl) + target_compile_options(${TARGET_NAME} PRIVATE -Wall -fopenmp -march=native -O3) + target_compile_definitions(${TARGET_NAME} PRIVATE HAVE_KALLOC) + + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + target_compile_options(${TARGET_NAME} PRIVATE -D_FILE_OFFSET_BITS=64 -fsigned-char) + elseif(DEFINED ARM_NEON) + target_compile_options(${TARGET_NAME} PRIVATE -D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char) + endif() + + if(ENABLE_ASAN) + message(STATUS "AddressSanitizer enabled") + target_compile_options(${TARGET_NAME} PRIVATE -fsanitize=address) + target_link_libraries(${TARGET_NAME} PRIVATE -fsanitize=address) + endif() + + if(ENABLE_TSAN) + message(STATUS "ThreadSanitizer enabled") + target_compile_options(${TARGET_NAME} PRIVATE -fsanitize=thread) + target_link_libraries(${TARGET_NAME} PRIVATE -fsanitize=thread) + endif() + + target_sources(${TARGET_NAME} PRIVATE + bseq.c + dtw.cpp + kalloc.c + kthread.c + revent.c + rmap.cpp + roptions.c + rsketch.c + rutils.c + sequence_until.c + ) + # C files that rely on hdf5_tools.hpp + # Should be compiled as CXX for now + set(PSEUDO_C_SOURCES + hit.c + lchain.c + rindex.c + rseed.c + rsig.c + ) + foreach(source IN LISTS PSEUDO_C_SOURCES) + set_source_files_properties(${source} PROPERTIES LANGUAGE CXX) + endforeach() + target_sources(${TARGET_NAME} PRIVATE ${PSEUDO_C_SOURCES}) + + + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + target_compile_options(${TARGET_NAME} PRIVATE + -g + -fsanitize=address + ) + endif() + + if(PROFILE) + target_compile_options(${TARGET_NAME} PRIVATE + -g + -fno-omit-frame-pointer + ) + target_compile_definitions(${TARGET_NAME} PRIVATE + PROFILERH=1 + ) + endif() + + add_pod5_to_target(${TARGET_NAME}) + add_hdf5_to_target(${TARGET_NAME}) + add_slow5_to_target(${TARGET_NAME}) + add_ruclient_to_target(${TARGET_NAME}) +endfunction() diff --git a/cmake/SetupSLOW5.cmake b/cmake/SetupSLOW5.cmake new file mode 100644 index 0000000..f7b1803 --- /dev/null +++ b/cmake/SetupSLOW5.cmake @@ -0,0 +1,42 @@ +include(${CMAKE_CURRENT_LIST_DIR}/Utils.cmake) + +function(add_slow5_to_target TARGET_NAME) + if(NOSLOW5) + target_compile_definitions(${TARGET_NAME} PRIVATE NSLOW5RH=1) + else() + if(SLOW5_COMPILE) + add_dependencies(${TARGET_NAME} slow5_build) + endif() + add_imported_library(${TARGET_NAME} slow5) + endif() +endfunction() + +function(setup_slow5) + if(NOT NOSLOW5) + set(SLOW5_SOURCE_DIR ${CMAKE_SOURCE_DIR}/extern/slow5lib) + if(SLOW5_COMPILE) + if(NOT SLOW5_DIR) + override_cached(SLOW5_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/slow5lib) + endif() + message(STATUS "Compiling slow5 to ${SLOW5_DIR}") + ExternalProject_Add( + slow5_build + BUILD_ALWAYS 1 # Rebuild if local checkout is updated + BINARY_DIR ${SLOW5_DIR} + SOURCE_DIR ${SLOW5_SOURCE_DIR} + CMAKE_ARGS + -DSLOW5_ENABLE_MT=1 + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${SLOW5_SOURCE_DIR}/include ${SLOW5_DIR}/include + && ${CMAKE_COMMAND} -E make_directory ${SLOW5_DIR}/lib + && ${CMAKE_COMMAND} -E rename ${SLOW5_DIR}/libslow5.so ${SLOW5_DIR}/lib/libslow5.so + ) + message(STATUS "Current dir: ${CMAKE_CURRENT_BINARY_DIR}") + else() + if(NOT SLOW5_DIR) + message(FATAL_ERROR "SLOW5_COMPILE is OFF, but no dir provided") + endif() + endif() + message(STATUS "Using slow5 from ${SLOW5_DIR}") + define_imported_library(slow5 ${SLOW5_DIR} SHARED) + endif() +endfunction() diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake new file mode 100644 index 0000000..4c827db --- /dev/null +++ b/cmake/Utils.cmake @@ -0,0 +1,45 @@ +include(ExternalProject) + +function(override_cached name value) + message(STATUS "Overriding ${name} to ${value}") + get_property(doc_string CACHE ${name} PROPERTY HELPSTRING) + get_property(type CACHE ${name} PROPERTY TYPE) + set(${name} ${value} CACHE ${type} ${doc_string} FORCE) +endfunction() + + +function(add_imported_library TARGET_NAME LIB_NAME) + target_link_libraries(${TARGET_NAME} PRIVATE ${LIB_NAME}_RAWHASH_LIB) +endfunction() + +function(define_imported_library LIB_NAME LIB_DIR LIB_TYPE) + add_library(${LIB_NAME}_RAWHASH_LIB ${LIB_TYPE} IMPORTED) + set(EXTENSION ".a") + if(LIB_TYPE STREQUAL SHARED) + set(EXTENSION ".so") + endif() + set_target_properties(${LIB_NAME}_RAWHASH_LIB PROPERTIES + IMPORTED_LOCATION ${LIB_DIR}/lib/lib${LIB_NAME}${EXTENSION} + INTERFACE_INCLUDE_DIRECTORIES ${LIB_DIR}/include) + file(MAKE_DIRECTORY ${LIB_DIR}/include) + # Can't install(TARGETS ...) for external projects + # Also some .so are symlinks, so install all + install(DIRECTORY ${LIB_DIR}/lib/ DESTINATION lib + FILES_MATCHING PATTERN *${EXTENSION}*) + install(DIRECTORY ${LIB_DIR}/include/ + DESTINATION include/${PROJECT_NAME}) +endfunction() + + +function(check_directory_exists_and_non_empty DIR) + if(NOT EXISTS ${DIR}) + message(FATAL_ERROR "Directory ${DIR} does not exist.") + endif() + + file(GLOB DIRECTORY_CONTENTS "${DIR}/*") + if(DIRECTORY_CONTENTS STREQUAL "") + message(FATAL_ERROR "Directory ${DIR} is empty.") + endif() + + message(STATUS "Directory ${DIR} exists and is non-empty.") +endfunction() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..ba9d5df --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,61 @@ +cmake_minimum_required(VERSION 3.10) +project(RawHash2) + +# option(USE_CCACHE "Use ccache to speed up rebuilds" ON) +# option(USE_MOLD "Use mold linker for faster linking" ON) + +# option(PYBINDING "Build with Python bindings" OFF) # WIP + +# for debugging +option(ENABLE_ASAN "Enable AddressSanitizer" OFF) +option(ENABLE_TSAN "Enable ThreadSanitizer" OFF) +option(PROFILE "Enable profiling support" OFF) + +# whether to compile or instead assume it is prebuilt +option(POD5_DOWNLOAD "Whether to build POD5" ON) # todo: rename to POD5_COMPILE +option(HDF5_COMPILE "Compile HDF5" ON) +option(SLOW5_COMPILE "Compile slow5lib" ON) + +# features to enable/disable in RawHash +# option(NOPOD5 "Disable POD5 in build" OFF) # todo: figure out how to enable shared libs +option(NOPOD5 "Disable POD5 in build" ON) +option(NOHDF5 "Disable HDF5 in build" OFF) +option(NOSLOW5 "Disable SLOW5 in build" OFF) +option(RUCLIENT_ENABLED "Enable ReadUntil client" OFF) + +set(POD5_DIR "" CACHE PATH "Path to POD5 directory (already built or where it should be built)") +set(HDF5_DIR "" CACHE PATH "Path to HDF5 directory (already built or where it should be built)") +set(SLOW5_DIR "" CACHE PATH "Path to SLOW5 directory (already built or where it should be built)") +set(RUCLIENT_DIR "" CACHE PATH "Path to ReadUntil directory (where it should be built)") + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Directory where to put binaries") + +set(EXTERNAL_PROJECTS_BUILD_DIR ${CMAKE_BINARY_DIR}/extern) # default build directory for external projects +file(MAKE_DIRECTORY ${EXTERNAL_PROJECTS_BUILD_DIR}) +message(STATUS "External projects build directory: ${EXTERNAL_PROJECTS_BUILD_DIR}") + +include(${CMAKE_SOURCE_DIR}/cmake/SetupCCacheMold.cmake) +include(${CMAKE_SOURCE_DIR}/cmake/SetupRawHashLikeTarget.cmake) + +set(CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "verbose makefile" FORCE) # print compilation commands + +enable_ccache() +set_alternate_linker(mold) + +add_executable(rawhash2_builtin main.cpp) +setup_rawhashlike_target(rawhash2_builtin) + +add_library(rawhash2_lib SHARED rawhash_wrapper.cpp rawhash_wrapper.hpp) +setup_rawhashlike_target(rawhash2_lib) + +file(GLOB_RECURSE HEADER_FILES ${CMAKE_SOURCE_DIR}/src/*.h ${CMAKE_SOURCE_DIR}/src/*.hpp) +set_property(TARGET rawhash2_lib PROPERTY PUBLIC_HEADER ${HEADER_FILES}) +set_target_properties(rawhash2_lib PROPERTIES INSTALL_RPATH $ORIGIN OUTPUT_NAME rawhash2) +install(TARGETS rawhash2_lib + LIBRARY DESTINATION lib + PUBLIC_HEADER DESTINATION include/${PROJECT_NAME}) + +add_executable(rawhash2 wrapper_example.cpp) +target_link_libraries(rawhash2 PRIVATE rawhash2_lib) +set_target_properties(rawhash2 PROPERTIES INSTALL_RPATH $ORIGIN/../lib) +install(TARGETS rawhash2 RUNTIME DESTINATION bin) diff --git a/src/Makefile b/src/Makefile deleted file mode 100644 index ac9f02d..0000000 --- a/src/Makefile +++ /dev/null @@ -1,255 +0,0 @@ -#pass DEBUG=1 to make to enable debugging -ifdef DEBUG - CFLAGS=-Wall -O2 -fsanitize=address -g -Wc++-compat -march=native -pthread -DHAVE_KALLOC - CPPFLAGS=--std=c++11 -Wall -O2 -fsanitize=address -g -march=native -pthread -DHAVE_KALLOC -else - CFLAGS=-Wall -O3 -Wc++-compat -march=native -pthread -DHAVE_KALLOC - CPPFLAGS=-std=c++11 -Wall -O3 -march=native -pthread -DHAVE_KALLOC -endif - -#pass PROFILE=1 to make to enable profiling -ifdef PROFILE - CFLAGS+=-g -fno-omit-frame-pointer -march=native -DPROFILERH=1 - CPPFLAGS+=-g -fno-omit-frame-pointer -march=native -DPROFILERH=1 -endif - -OBJS= kthread.o kalloc.o bseq.o roptions.o sequence_until.o rutils.o rsig.o revent.o rsketch.o rindex.o lchain.o rseed.o rmap.o dtw.o hit.o main.o - -CXX_COMPILER_VERSION ?= $(shell $(CXX) -dumpversion) -SYSTEM_PROCESSOR ?= $(shell uname -m) -SYSTEM_NAME ?= $(shell uname -s) - -WORKDIR = $(shell pwd)/../extern - -POD5_VERSION = 0.3.10 -POD5_URLDIR = pod5-$(POD5_VERSION)-$(SYSTEM_NAME) -POD5_REPO = https://github.com/nanoporetech/pod5-file-format -ifdef POD5_INCLUDE_DIR - POD5_DOWNLOAD = 0 -endif -POD5_INCLUDE_DIR ?= $(WORKDIR)/$(POD5_URLDIR)/include - -ifeq ($(SYSTEM_NAME),Linux) - POD5_LIB ?= lib64 - ifeq ($(shell echo $(SYSTEM_PROCESSOR) | grep -E '^(aarch64|arm)'),) - POD5_URL = $(POD5_REPO)/releases/download/$(POD5_VERSION)/lib_pod5-$(POD5_VERSION)-linux-x64.tar.gz - else ifeq ($(shell expr $(CXX_COMPILER_VERSION) \< 9.0),1) - POD5_URLDIR = pod5-$(POD5_VERSION)-$(SYSTEM_NAME) - POD5_URL = $(POD5_REPO)/releases/download/$(POD5_VERSION)/lib_pod5-$(POD5_VERSION)-linux-gcc7-arm64.tar.gz - POD5_LIB ?= lib - else - POD5_URLDIR = pod5-$(POD5_VERSION)-$(SYSTEM_NAME) - POD5_URL = $(POD5_REPO)/releases/download/$(POD5_VERSION)/lib_pod5-$(POD5_VERSION)-linux-arm64.tar.gz - endif - POD5_LIB_DIR ?= $(WORKDIR)/$(POD5_URLDIR)/$(POD5_LIB) - POD5_LIBRARIES ?= $(POD5_LIB_DIR)/libpod5_format.a \ - $(POD5_LIB_DIR)/libarrow.a \ - $(POD5_LIB_DIR)/libjemalloc_pic.a -endif - -ifeq ($(SYSTEM_NAME),Darwin) - POD5_URL = $(POD5_REPO)/releases/download/$(POD5_VERSION)/lib_pod5-$(POD5_VERSION)-osx-11.0-arm64.tar.gz - POD5_LIB_DIR ?= $(WORKDIR)/$(POD5_URLDIR)/lib - POD5_LIBRARIES ?= $(POD5_LIB_DIR)/libpod5_format.a \ - $(POD5_LIB_DIR)/libarrow.a -endif - -ifeq ($(SYSTEM_NAME),Windows_NT) - POD5_URL = $(POD5_REPO)/releases/download/$(POD5_VERSION)/lib_pod5-$(POD5_VERSION)-win -endif - -HDF5_DIR ?= ${WORKDIR}/hdf5/ -HDF5_BUILD_DIR ?= ${HDF5_DIR}/build -ifdef HDF5_INCLUDE_DIR - HDF5_COMPILE = 0 -endif -HDF5_INCLUDE_DIR ?= ${HDF5_BUILD_DIR}/include -HDF5_LIB_DIR ?= ${HDF5_BUILD_DIR}/lib -HDF5_LIB ?= hdf5 - -SLOW5_DIR ?= ${WORKDIR}/slow5lib/ -ifdef SLOW5_INCLUDE_DIR - SLOW5_COMPILE = 0 -endif -SLOW5_INCLUDE_DIR ?= ${SLOW5_DIR}/include -SLOW5_LIB_DIR ?= ${SLOW5_DIR}/lib - -#pass NOPOD5=1 to make to disable compiling with POD5 -ifeq ($(NOPOD5),1) - CFLAGS+=-DNPOD5RH=1 - CPPFLAGS+=-DNPOD5RH=1 -else - INCLUDES+=-I${POD5_INCLUDE_DIR} - LIBS+=-L${WORKDIR}/zstd/lib/ ${POD5_LIBRARIES} -lzstd -endif - -#pass NOHDF5=1 to make to disable compiling with HDF5 -ifeq ($(NOHDF5),1) - CFLAGS+=-DNHDF5RH=1 - CPPFLAGS+=-DNHDF5RH=1 -else - INCLUDES+=-I${HDF5_INCLUDE_DIR} - LIBS+=${HDF5_LIB_DIR}/lib${HDF5_LIB}.a -endif - -#pass NOSLOW5=1 to make to disable compiling with SLOW5 -ifeq ($(NOSLOW5),1) - CFLAGS+=-DNSLOW5RH=1 - CPPFLAGS+=-DNSLOW5RH=1 -else - INCLUDES+=-I${SLOW5_INCLUDE_DIR} - LIBS+=${SLOW5_LIB_DIR}/libslow5.a -endif - -LIBS+=-lm -lz -ldl - -PROG=rawhash2 - -ifneq ($(aarch64),) - arm_neon=1 -endif - -ifneq ($(arm_neon),) # if arm_neon is defined -ifeq ($(aarch64),) #if aarch64 is not defined - CFLAGS+=-D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char -else #if aarch64 is defined - CFLAGS+=-D_FILE_OFFSET_BITS=64 -fsigned-char -endif -endif - -ifneq ($(asan),) - CFLAGS+=-fsanitize=address - LIBS+=-fsanitize=address -endif - -ifneq ($(tsan),) - CFLAGS+=-fsanitize=thread - LIBS+=-fsanitize=thread -endif - -.PHONY: all subset clean help - -all: zstd hdf5 slow5 pod5 check_hdf5 check_slow5 check_pod5 $(PROG) ## Build RawHash with HDF5, SLOW5, and POD5 -subset: check_zstd check_pod5 check_slow5 check_hdf5 $(PROG) ## Build RawHash without recompiling HDF5, SLOW5, and POD5 - -help: ##Show help - @echo "Set PROFILE=1 to enable profiling" - @echo "Set DEBUG=1 to enable debugging" - @echo - @echo "Set NOHDF5=1 to disable compiling with HDF5 (note that at least one of HDF5, SLOW5, and POD5 must be enabled)" - @echo "Set NOSLOW5=1 to disable compiling with SLOW5 (note that at least one of HDF5, SLOW5, and POD5 must be enabled)" - @echo "Set NOPOD5=1 to disable compiling with POD5 (note that at least one of HDF5, SLOW5, and POD5 must be enabled)" - @echo - @echo "use \"make subset\" to prevent compiling HDF5, SLOW5, and POD5 libraries from scratch if they are already built" - @echo "use \"make clean\" to remove all generated files" - -check_zstd: - @if [ "$(NOPOD5)" != "1" ]; then \ - [ -f "${WORKDIR}/zstd/lib/libzstd.so" ] || { echo "ZSTD library not found" >&2; exit 1; }; \ - fi - -check_hdf5: - @if [ "$(NOHDF5)" != "1" ]; then \ - [ -f "${HDF5_INCLUDE_DIR}/H5pubconf.h" ] || { echo "HDF5 headers not found" >&2; exit 1; }; \ - [ -f "${HDF5_LIB_DIR}/lib${HDF5_LIB}.so" ] || [ -f "${HDF5_LIB_DIR}/lib${HDF5_LIB}.a" ] || { echo "HDF5 library not found" >&2; exit 1; }; \ - fi - -check_slow5: - @if [ "$(NOSLOW5)" != "1" ]; then \ - [ -f "${SLOW5_INCLUDE_DIR}/slow5/slow5.h" ] || { echo "SLOW5 headers not found" >&2; exit 1; }; \ - [ -f "${SLOW5_LIB_DIR}/libslow5.so" ] || [ -f "${SLOW5_LIB_DIR}/libslow5.a" ] || { echo "SLOW5 library not found" >&2; exit 1; }; \ - fi - -check_pod5: - @if [ "$(NOPOD5)" != "1" ]; then \ - [ -f "${POD5_INCLUDE_DIR}/pod5_format/c_api.h" ] || { echo "POD5 headers not found" >&2; exit 1; }; \ - [ -f "$(POD5_LIB_DIR)/libpod5_format.a" ] || { echo "POD5 library not found" >&2; exit 1; }; \ - fi - -zstd: - @if [ "$(NOPOD5)" != "1" ]; then \ - cd ${WORKDIR}/zstd && make -j; \ - fi - -hdf5: - @if [ "$(NOHDF5)" != "1" ] && [ "$(HDF5_COMPILE)" != "0" ]; then \ - cd ${HDF5_DIR}; \ - mkdir -p build; \ - ./configure --enable-threadsafe --disable-hl --prefix="${HDF5_BUILD_DIR}"; \ - make -j; \ - make install; \ - fi - -slow5: - @if [ "$(NOSLOW5)" != "1" ] && [ "$(SLOW5_COMPILE)" != "0" ]; then \ - make -C ${SLOW5_DIR} slow5_mt=1 zstd=1; \ - fi - -pod5: check_zstd - @if [ "$(NOPOD5)" != "1" ] && [ "$(POD5_DOWNLOAD)" != "0" ]; then \ - cd $(WORKDIR) && mkdir -p $(POD5_URLDIR); cd $(POD5_URLDIR); wget -qO- $(POD5_URL) | tar -xzv; \ - fi - -# ifeq ($(NOHDF5),1) -# FASTDEPCOM=${CC} -# FASTDEPCOM_FLAGS=$(CFLAGS) -# else -FASTDEPCOM=${CXX} -FASTDEPCOM_FLAGS=$(CPPFLAGS) -# endif - -# $(PROG): $(OBJS) -# ${FASTDEPCOM} $(FASTDEPCOM_FLAGS) $(OBJS) -o $(PROG) $(LIBS) - -$(PROG): $(OBJS) - ${CXX} $(CPPFLAGS) $(OBJS) -o $(PROG) $(LIBS) -lstdc++ - -# rsig.o: rsig.c -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -# rindex.o: rindex.c -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -# main.o: main.cpp -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -# rmap.o: rmap.c -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -# lchain.o: lchain.c -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -# rseed.o: rseed.c -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -# hit.o: hit.c -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -.SUFFIXES: -.SUFFIXES:.cpp .o - -%.o: %.cpp - ${CXX} -c $(CPPFLAGS) $(INCLUDES) $< -o $@ - -.SUFFIXES: -.SUFFIXES:.c .o - -# %.o: %.c -# ${CC} -c $(CFLAGS) $(INCLUDES) $< -o $@ - -%.o: %.c - $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -clean: ## Remove all generated files - rm -fr *.o $(PROG) *~ - -rsketch.o: rutils.h rh_kvec.h -lchain.o: kalloc.h rutils.h rseed.h rsketch.h chain.h krmq.h -hit.o: chain.h kalloc.h khash.h -rsig.o: hdf5_tools.hpp rh_kvec.h -rseed.o: rsketch.h kalloc.h rutils.h rindex.h -hit.o: rmap.h kalloc.h khash.h -rmap.o: rindex.h rsig.h kthread.h rh_kvec.h rutils.h rsketch.h revent.h sequence_until.h dtw.h -revent.o: roptions.h kalloc.h -rindex.o: roptions.h rutils.h rsketch.h rsig.h bseq.h khash.h rh_kvec.h kthread.h -main:o rawhash.h ketopt.h rutils.h diff --git a/src/cli_parsing.cpp b/src/cli_parsing.cpp new file mode 100644 index 0000000..087d4cc --- /dev/null +++ b/src/cli_parsing.cpp @@ -0,0 +1,513 @@ +// parse rawhash options + +#include +#include + +#include "ketopt.h" +#include "rawhash.h" + +#define RH_VERSION "2.1" + +static ko_longopt_t long_options[] = { + { (char*)"level_column", ko_required_argument, 300 }, + { (char*)"q-mid-occ", ko_required_argument, 301 }, + { (char*)"mid_occ_frac", ko_required_argument, 302 }, + { (char*)"min-events", ko_required_argument, 303 }, + { (char*)"bw", ko_required_argument, 304 }, + { (char*)"max-target-gap", ko_required_argument, 305 }, + { (char*)"max-query-gap", ko_required_argument, 306 }, + { (char*)"min-anchors", ko_required_argument, 307 }, + { (char*)"min-score", ko_required_argument, 308 }, + { (char*)"chain-gap-scale", ko_required_argument, 309 }, + { (char*)"chain-skip-scale", ko_required_argument, 310 }, + { (char*)"best-chains", ko_required_argument, 311 }, + { (char*)"primary-ratio", ko_required_argument, 312 }, + { (char*)"primary-length", ko_required_argument, 313 }, + { (char*)"max-skips", ko_required_argument, 314 }, + { (char*)"max-iterations", ko_required_argument, 315 }, + { (char*)"rmq", ko_no_argument, 316 }, + { (char*)"rmq-inner-dist", ko_required_argument, 317 }, + { (char*)"rmq-size-cap", ko_required_argument, 318 }, + { (char*)"bw-long", ko_required_argument, 319 }, + { (char*)"max-chunks", ko_required_argument, 320 }, + { (char*)"min-mapq", ko_required_argument, 321 }, + { (char*)"alt-drop", ko_required_argument, 322 }, + { (char*)"w-besta", ko_required_argument, 323 }, + { (char*)"w-bestma", ko_required_argument, 324 }, + { (char*)"w-bestq", ko_required_argument, 325 }, + { (char*)"w-bestmq", ko_required_argument, 326 }, + { (char*)"w-bestmc", ko_required_argument, 327 }, + { (char*)"w-threshold", ko_required_argument, 328 }, + { (char*)"bp-per-sec", ko_required_argument, 329 }, + { (char*)"sample-rate", ko_required_argument, 330 }, + { (char*)"chunk-size", ko_required_argument, 331 }, + { (char*)"seg-window-length1", ko_required_argument, 332 }, + { (char*)"seg-window-length2", ko_required_argument, 333 }, + { (char*)"seg-threshold1", ko_required_argument, 334 }, + { (char*)"seg-threshold2", ko_required_argument, 335 }, + { (char*)"seg-peak-height", ko_required_argument, 336 }, + { (char*)"sequence-until", ko_no_argument, 337 }, + { (char*)"threshold", ko_required_argument, 338 }, + { (char*)"n-samples", ko_required_argument, 339 }, + { (char*)"test-frequency", ko_required_argument, 340 }, + { (char*)"min-reads", ko_required_argument, 341 }, + { (char*)"occ-frac", ko_required_argument, 342 }, + { (char*)"depletion", ko_no_argument, 343 }, + { (char*)"store-sig", ko_no_argument, 344 }, + { (char*)"sig-target", ko_no_argument, 345 }, + { (char*)"disable-adaptive", ko_no_argument, 346 }, + { (char*)"sig-diff", ko_required_argument, 347 }, + { (char*)"align", ko_no_argument, 348 }, + { (char*)"dtw-evaluate-chains", ko_no_argument, 349 }, + { (char*)"dtw-output-cigar", ko_no_argument, 350 }, + { (char*)"dtw-border-constraint", ko_required_argument, 351 }, + { (char*)"dtw-log-scores", ko_no_argument, 352 }, + { (char*)"no-chainingscore-filtering", ko_no_argument, 353 }, + { (char*)"dtw-match-bonus", ko_required_argument, 354 }, + { (char*)"output-chains", ko_no_argument, 355 }, + { (char*)"dtw-fill-method", ko_required_argument, 356 }, + { (char*)"dtw-min-score", ko_required_argument, 357 }, + { (char*)"log-anchors", ko_no_argument, 358 }, + { (char*)"log-num-anchors", ko_no_argument, 359 }, + // { (char*)"ava", ko_no_argument, 360 }, + { (char*)"r10", ko_no_argument, 361 }, + { (char*)"rev-query", ko_no_argument, 362 }, + // { (char*)"map-model", ko_required_argument, 363 }, + { (char*)"fine-min", ko_required_argument, 364 }, + { (char*)"fine-max", ko_required_argument, 365 }, + { (char*)"fine-range", ko_required_argument, 366 }, + { (char*)"ru-server-port", ko_required_argument, 367 }, // readuntil for real device + { (char*)"out-quantize", ko_no_argument, 368 }, + { (char*)"no-event-detection", ko_no_argument, 369 }, + { (char*)"version", ko_no_argument, 370 }, + { (char*)"no-revcomp-query", ko_no_argument, 371 }, + { 0, 0, 0 } +}; + +static inline int64_t mm_parse_num(const char *str) +{ + double x; + char *p; + x = strtod(str, &p); + if (*p == 'G' || *p == 'g') x *= 1e9, ++p; + else if (*p == 'M' || *p == 'm') x *= 1e6, ++p; + else if (*p == 'K' || *p == 'k') x *= 1e3, ++p; + return (int64_t)(x + .499); +} + +// static inline void yes_or_no(ri_mapopt_t *opt, int64_t flag, int long_idx, const char *arg, int yes_to_set) +// { +// if (yes_to_set) { +// if (strcmp(arg, "yes") == 0 || strcmp(arg, "y") == 0) opt->flag |= flag; +// else if (strcmp(arg, "no") == 0 || strcmp(arg, "n") == 0) opt->flag &= ~flag; +// else fprintf(stderr, "[WARNING]\033[1;31m option '--%s' only accepts 'yes' or 'no'.\033[0m\n", long_options[long_idx].name); +// } else { +// if (strcmp(arg, "yes") == 0 || strcmp(arg, "y") == 0) opt->flag &= ~flag; +// else if (strcmp(arg, "no") == 0 || strcmp(arg, "n") == 0) opt->flag |= flag; +// else fprintf(stderr, "[WARNING]\033[1;31m option '--%s' only accepts 'yes' or 'no'.\033[0m\n", long_options[long_idx].name); +// } +// } + +// set index and mapping options, initialize when preset is NULL +int ri_set_opt(const char *preset, ri_idxopt_t *io, ri_mapopt_t *mo) +{ + if (preset == 0) { + ri_idxopt_init(io); + ri_mapopt_init(mo); + } else if (strcmp(preset, "viral") == 0) { + io->e = 6; + mo->bw = 100; mo->max_target_gap_length = 500; mo->max_query_gap_length = 500; + mo->max_num_chunk = 5, mo->min_chaining_score = 10; mo->chain_gap_scale = 1.2f; mo->chain_skip_scale = 0.3f; + } else if (strcmp(preset, "sensitive") == 0) { + //default + } else if (strcmp(preset, "fast") == 0) { + io->fine_range = 0.6; + mo->min_mapq = 5, mo->min_chaining_score = 10, mo->chain_gap_scale = 0.6f; + } else if (strcmp(preset, "faster") == 0) { + io->e = 11; io->w = 3; + io->fine_range = 0.6; + mo->max_num_chunk = 5; mo->min_mapq = 5, mo->min_chaining_score = 10, mo->chain_gap_scale = 0.6f; + } else if (strcmp(preset, "ava-viral") == 0) { + io->e = 6; + mo->bw = 100; mo->max_target_gap_length = 500; mo->max_query_gap_length = 500; + mo->max_num_chunk = 5, mo->min_chaining_score = 10; mo->chain_gap_scale = 1.2f; mo->chain_skip_scale = 0.3f; + + io->flag |= RI_I_SIG_TARGET; + mo->flag |= RI_M_ALL_CHAINS; + mo->min_chaining_score = 80; + mo->flag |= RI_M_NO_ADAPTIVE; + } else if (strcmp(preset, "ava-small") == 0) { + //default + + io->flag |= RI_I_SIG_TARGET; + mo->flag |= RI_M_ALL_CHAINS; + mo->min_chaining_score = 100; + mo->flag |= RI_M_NO_ADAPTIVE; + + } else if (strcmp(preset, "ava-large") == 0) { + io->fine_range = 0.6; + mo->chain_gap_scale = 0.6f; + + io->flag |= RI_I_SIG_TARGET; + mo->flag |= RI_M_ALL_CHAINS; + mo->min_chaining_score = 70; + mo->flag |= RI_M_NO_ADAPTIVE; + + } else if (strcmp(preset, "ava-pro") == 0) { + io->flag |= RI_I_SIG_TARGET; + mo->flag |= RI_M_ALL_CHAINS; + mo->min_chaining_score = 50; + mo->flag |= RI_M_NO_ADAPTIVE; + } + + else if (strcmp(preset, "ava-faster") == 0) { + io->e = 11; io->w = 3; + io->fine_range = 0.6; + mo->max_num_chunk = 5; mo->min_mapq = 5, mo->chain_gap_scale = 0.6f; + + io->flag |= RI_I_SIG_TARGET; + mo->flag |= RI_M_ALL_CHAINS; + mo->min_chaining_score = 100; + mo->flag |= RI_M_NO_ADAPTIVE; + + } else if (strcmp(preset, "sequence-until") == 0) { + //default + } else return -1; + return 0; +} + +int ri_mapopt_parse_dtw_border_constraint(ri_mapopt_t *opt, char* arg){ + if(strcmp(arg, "global") == 0){ + opt->dtw_border_constraint = RI_M_DTW_BORDER_CONSTRAINT_GLOBAL; + } + else if(strcmp(arg, "sparse") == 0){ + opt->dtw_border_constraint = RI_M_DTW_BORDER_CONSTRAINT_SPARSE; + } + else if(strcmp(arg, "local") == 0){ + opt->dtw_border_constraint = RI_M_DTW_BORDER_CONSTRAINT_LOCAL; + } + else{ + return -1; + } + return 0; +} + +int ri_mapopt_parse_dtw_fill_method(ri_mapopt_t *opt, char* arg) { + if (strcmp(arg, "banded") == 0) { + opt->dtw_fill_method = RI_M_DTW_FILL_METHOD_BANDED; + } else if (strcmp(arg, "full") == 0) { + opt->dtw_fill_method = RI_M_DTW_FILL_METHOD_FULL; + } else if (strncmp(arg, "banded=", 7) == 0) { + opt->dtw_fill_method = RI_M_DTW_FILL_METHOD_BANDED; + opt->dtw_band_radius_frac = std::atof(arg + 7); + } else { + return -1; + } + return 0; +} + +const char* ri_maptopt_dtw_mode_to_string(uint32_t dtw_border_constraint){ + switch(dtw_border_constraint){ + case RI_M_DTW_BORDER_CONSTRAINT_GLOBAL: + return "full"; + case RI_M_DTW_BORDER_CONSTRAINT_SPARSE: + return "sparse"; + case RI_M_DTW_BORDER_CONSTRAINT_LOCAL: + return "window"; + default: + return "unknown"; + } +} + +struct CLIParsedArgs { + ri_mapopt_t opt; + ri_idxopt_t ipt; + int n_threads = 3; + char* idx_out_filename = 0; // where to dump index + char* fpore = 0; // pore model filename + FILE *fp_help = stderr; // where to print the help message + int ru_server_port = -1; // for connecting to a real device via ReadUntil + ketopt_t o; // for further parsing +}; +// parse rawhash args +CLIParsedArgs parse_args(int argc, char *argv[]) { + CLIParsedArgs parsed_args; + ri_mapopt_t& opt = parsed_args.opt; + ri_idxopt_t& ipt = parsed_args.ipt; + int& n_threads = parsed_args.n_threads; + char*& idx_out_filename = parsed_args.idx_out_filename; + char*& fpore = parsed_args.fpore; + FILE*& fp_help = parsed_args.fp_help; + int& ru_server_port = parsed_args.ru_server_port; + ketopt_t& o = parsed_args.o; + + ri_set_opt(0, &ipt, &opt); + + fprintf(stderr, "Using RawHash version 2.1\n"); + // print args + fprintf(stderr, "Received the following args: "); + for (int i = 0; i < argc; i++) { + fprintf(stderr, "%s ", argv[i]); + } + fprintf(stderr, "\n"); + + int c; // for parsing options, char or int + char *s; + const char *opt_str = "k:d:p:e:q:w:n:o:t:K:x:h"; + + o = KETOPT_INIT; + // test command line options and apply option -x/preset first + while ((c = ketopt(&o, argc, argv, 1, opt_str, long_options)) >= 0) { + if (c == 'x') { + if (ri_set_opt(o.arg, &ipt, &opt) < 0) { + fprintf(stderr, "[ERROR] unknown preset '%s'\n", o.arg); + exit(EXIT_FAILURE); + } + } else if (c == ':') { + fprintf(stderr, "[ERROR] missing option argument\n"); + exit(EXIT_FAILURE); + } else if (c == '?') { + fprintf(stderr, "[ERROR] unknown option in \"%s\"\n", argv[o.i - 1]); + exit(EXIT_FAILURE); + } + } + o = KETOPT_INIT; + + while ((c = ketopt(&o, argc, argv, 1, opt_str, long_options)) >= 0) { + if (c == 'd') idx_out_filename = o.arg; + else if (c == 'p') fpore = o.arg; + else if (c == 'k') ipt.k = atoi(o.arg); + else if (c == 'e') ipt.e = atoi(o.arg); + else if (c == 'q') ipt.q = atoi(o.arg); + else if (c == 'w') ipt.w = atoi(o.arg); + else if (c == 'n') ipt.n = atoi(o.arg); + else if (c == 't') n_threads = atoi(o.arg); + else if (c == 'v') ri_verbose = atoi(o.arg); + else if (c == 'K') {opt.mini_batch_size = mm_parse_num(o.arg);} + else if (c == 'h') fp_help = stdout; + else if (c == 'o') { + if (strcmp(o.arg, "-") != 0) { + if (freopen(o.arg, "wb", stdout) == NULL) { + fprintf(stderr, "[ERROR]\033[1;31m failed to write the output to file '%s'\033[0m: %s\n", o.arg, strerror(errno)); + exit(1); + } + } + } + else if (c == 300) ipt.lev_col = atoi(o.arg);// --level_column + else if (c == 301) { //--q-mid-occ + opt.min_mid_occ = strtol(o.arg, &s, 10); // min + if (*s == ',') opt.max_mid_occ = strtol(s + 1, &s, 10); //max + // opt.q_mid_occ = atoi(o.arg);// --q-mid-occ + } + else if (c == 302) opt.mid_occ_frac = atof(o.arg);// --occ-frac + else if (c == 303) opt.min_events = (uint32_t)atoi(o.arg); // --min-events + else if (c == 304) opt.bw = atoi(o.arg);// --bw + else if (c == 305) opt.max_target_gap_length = atoi(o.arg);// --max-target-gap + else if (c == 306) opt.max_query_gap_length = atoi(o.arg);// --max-query-gap + else if (c == 307) opt.min_num_anchors = atoi(o.arg);// --min-anchors + else if (c == 308) opt.min_chaining_score = atoi(o.arg);// --min-score + else if (c == 309) opt.chain_gap_scale = atof(o.arg);// --chain-gap-scale + else if (c == 310) opt.chain_skip_scale = atof(o.arg);// --chain-skip-scale + else if (c == 311) opt.best_n = atoi(o.arg);// --best-chains + else if (c == 312) opt.mask_level = atof(o.arg);// --primary-ratio + else if (c == 313) opt.mask_len = atoi(o.arg);// --primary-length + else if (c == 314) opt.max_num_skips = atoi(o.arg);// --max-skips + else if (c == 315) opt.max_chain_iter = atoi(o.arg);// --max-iterations + else if (c == 316) opt.flag |= RI_M_RMQ; // --rmq + else if (c == 317) opt.rmq_inner_dist = atoi(o.arg); // --rmq-inner-dist + else if (c == 318) opt.rmq_size_cap = atoi(o.arg); // --rmq-size-cap + else if (c == 319) opt.bw_long = atoi(o.arg);// --bw-long + else if (c == 320) opt.max_num_chunk = atoi(o.arg);// --max-chunks + else if (c == 321) opt.min_mapq = atoi(o.arg);// --min-mapq + else if (c == 322) opt.alt_drop = atof(o.arg);// --alt-drop + else if (c == 323) opt.w_besta = atof(o.arg);// --w-besta + else if (c == 324) opt.w_bestma = atof(o.arg);// --w-bestma + else if (c == 325) opt.w_bestq = atof(o.arg);// --w-bestq + else if (c == 326) opt.w_bestmq = atof(o.arg);// --w-bestmq + else if (c == 327) opt.w_bestmc = atof(o.arg);// --w-bestmc + else if (c == 328) opt.w_threshold = atof(o.arg);// --w-threshold + else if (c == 329) { + opt.bp_per_sec = atoi(o.arg); opt.sample_per_base = (float)opt.sample_rate / opt.bp_per_sec; + ipt.bp_per_sec = atoi(o.arg); ipt.sample_per_base = (float)ipt.sample_rate / ipt.bp_per_sec; + }// --bp-per-sec + else if (c == 330) { + opt.sample_rate = atoi(o.arg); opt.sample_per_base = (float)opt.sample_rate / opt.bp_per_sec; + ipt.sample_rate = atoi(o.arg); ipt.sample_per_base = (float)ipt.sample_rate / ipt.bp_per_sec; + }// --sample-rate + else if (c == 331) opt.chunk_size = atoi(o.arg);// --chunk-size + else if (c == 332) {opt.window_length1 = atoi(o.arg); ipt.window_length1 = atoi(o.arg);}// --seg-window-length1 + else if (c == 333) {opt.window_length2 = atoi(o.arg); ipt.window_length2 = atoi(o.arg);}// --seg-window-length2 + else if (c == 334) {opt.threshold1 = atof(o.arg); ipt.threshold1 = atof(o.arg);}// --seg-threshold1 + else if (c == 335) {opt.threshold2 = atof(o.arg); ipt.threshold2 = atof(o.arg);}// --seg-threshold2 + else if (c == 336) {opt.peak_height = atof(o.arg); ipt.peak_height = atof(o.arg);}// --seg-peak-height + else if (c == 337) opt.flag |= RI_M_SEQUENCEUNTIL;// --sequence-until + else if (c == 338) opt.t_threshold = atof(o.arg);// --threshold + else if (c == 339) opt.tn_samples = atoi(o.arg);// --n-samples + else if (c == 340) opt.ttest_freq = atoi(o.arg);// --test-frequency + else if (c == 341) opt.tmin_reads = atoi(o.arg);// --min-reads + else if (c == 342) opt.mid_occ_frac = atof(o.arg);// --occ-frac + else if (c == 343) { // --depletion + opt.best_n = 5; opt.min_mapq = 10; opt.w_threshold = 0.50f; + opt.min_num_anchors = 2; opt.min_chaining_score = 15; opt.chain_skip_scale = 0.0f; + } + else if (c == 344) {ipt.flag |= RI_I_STORE_SIG;} // --store-sig + else if (c == 345) {ipt.flag |= RI_I_SIG_TARGET;} // --sig-target + else if (c == 346) {opt.flag |= RI_M_NO_ADAPTIVE;} // --disable-adaptive + else if (c == 347) {ipt.diff = atof(o.arg);} // --sig-diff + else if (c == 348) {opt.flag |= RI_M_ALIGN;} // --align + else if (c == 349) opt.flag |= RI_M_DTW_EVALUATE_CHAINS; // --dtw-evaluate-chains + else if (c == 350) opt.flag |= RI_M_DTW_OUTPUT_CIGAR; // --dtw-output-cigar + else if (c == 351) { //--dtw-border-constraint + if(ri_mapopt_parse_dtw_border_constraint(&opt, o.arg) != 0){ + fprintf(stderr, "[ERROR] unknown DTW border constraint in \"%s\"\n", argv[o.i - 1]); + exit(EXIT_FAILURE); + } + } + else if (c == 352) opt.flag |= RI_M_DTW_LOG_SCORES; // --dtw-log-scores + else if (c == 353) opt.flag |= RI_M_DISABLE_CHAININGSCORE_FILTERING; // --no-chainingscore-filtering + else if (c == 354) opt.dtw_match_bonus = atof(o.arg); // --dtw-match-bonus + else if (c == 355) opt.flag |= RI_M_OUTPUT_CHAINS; // --output-chains + else if (c == 356) { //dtw-fill-method + if(ri_mapopt_parse_dtw_fill_method(&opt, o.arg) != 0){ + fprintf(stderr, "[ERROR] unknown DTW fill method in \"%s\"\n", argv[o.i - 1]); + exit(EXIT_FAILURE); + } + } + else if (c == 357) opt.dtw_min_score = atof(o.arg); // --dtw-min-score + else if (c == 358) opt.flag |= RI_M_LOG_ANCHORS; // --log-anchors + else if (c == 359) opt.flag |= RI_M_LOG_NUM_ANCHORS; // --log-num-anchors + else if (c == 361) { // --r10 + ipt.k = 9; + + ipt.window_length1 = 3; ipt.window_length2 = 6; + ipt.threshold1 = 6.5f; ipt.threshold2 = 4.0f; + ipt.peak_height = 0.2f; + + opt.window_length1 = 3; opt.window_length2 = 6; + opt.threshold1 = 6.5f; opt.threshold2 = 4.0f; + opt.peak_height = 0.2f; + + opt.chain_gap_scale = 1.2f; + } + else if (c == 362) {ipt.flag |= RI_I_REV_QUERY;}// --rev-query, better name: --idx-dont-store-rev, or negate + // else if (c == 363) {opt.model_map_path = o.arg;}// --map-model + else if (c == 364) {ipt.fine_min = atof(o.arg);}// --fine-min + else if (c == 365) {ipt.fine_max = atof(o.arg);}// --fine-max + else if (c == 366) {ipt.fine_range = atof(o.arg);}// --fine-range + else if (c == 367) {ru_server_port = atoi(o.arg); assert(ru_server_port >= 0); }// --ru-server-port + else if (c == 368) {ipt.flag |= RI_I_OUT_QUANTIZE; ipt.flag |= RI_I_SIG_TARGET;}// --out-quantize + else if (c == 369) {ipt.flag |= RI_I_NO_EVENT_DETECTION;}// --no-event-detection + else if (c == 370) {puts(RH_VERSION); exit(EXIT_SUCCESS);}// --version + else if (c == 371) {opt.flag |= RI_M_DONT_QUERY_REVCOMP;}// --no-revcomp-query, only has an effect when "--rev-query" is used, to avoid querying the index with the revcomp + else if (c == 'V') {puts(RH_VERSION); exit(EXIT_SUCCESS);} + } + + if (argc == o.ind || fp_help == stdout) { + fprintf(fp_help, "Usage: rawhash [options] | [query.fast5] [...]\n"); + fprintf(fp_help, "Options:\n"); + + fprintf(fp_help, " K-mer (pore) Model:\n"); + fprintf(fp_help, " -p FILE pore model FILE [].\n"); + fprintf(fp_help, " -k INT size of the k-mers in the pore model [%d]. This is usually 6 for R9.4 and 9 for R10.\n", ipt.k); + fprintf(fp_help, " --level_column INT 0-based column index where the mean values are stored in the pore file [%d]. This is usually 1 for both R9.4 and R10.\n", ipt.lev_col); + + fprintf(fp_help, "\n Indexing:\n"); + fprintf(fp_help, " -d FILE [Strongly recommended to create before mapping] dump index to FILE [].\n"); + fprintf(fp_help, " -e INT number of events concatanated in a single hash (usually no larger than 10). Also applies during mapping [%d].\n", ipt.e); + fprintf(fp_help, " -q INT Number of bits to use for quantization [%d]. Number of quantized buckets are created accordingly (2^INT).\n", ipt.q); + fprintf(fp_help, " -w INT minimizer window size [%d]. Enables minimizer-based seeding in indexing and mapping (may reduce accuracy but improves the performance and memory space efficiency).\n", ipt.w); + fprintf(fp_help, " --store-sig Stores the target signal in the index file.\n"); + fprintf(fp_help, " --sig-target The target sequence (reference) contains signals rather than base characters.\n"); + fprintf(fp_help, " --sig-diff FLOAT [Advanced] Signal value (FLOAT) difference between two consecutive events to be packed together in a single hash value [%g].\n", ipt.diff); + + // fprintf(fp_help, " -n NUM number of consecutive seeds to use for BLEND-based seeding [%d]. Enables the BLEND mechanism (may improve accuracy but reduces the performance at the moment)\n", ipt.n); + + fprintf(fp_help, "\n Seeding:\n"); + fprintf(fp_help, " --q-mid-occ INT1[,INT2] Lower and upper bounds of k-mer occurrences [%d, %d]. The final k-mer occurrence threshold is max{INT1, min{INT2, --occ-frac}}. This option prevents excessively small or large -f estimated from the input reference.\n", opt.min_mid_occ, opt.max_mid_occ); + fprintf(fp_help, " --occ-frac FLOAT Discard a query seed if its occurrence is higher than FLOAT fraction of all query seeds [%g]. Set 0 to disable. [Note: Both --q-mid-occ and --occ-frac should be met for a seed to be discarded].\n", opt.q_occ_frac); + + fprintf(fp_help, "\n Chaining Parameters:\n"); + fprintf(fp_help, " --min-events INT minimum number of INT events in a chunk to start chain elongation [%u].\n", opt.min_events); + fprintf(fp_help, " --bw INT maximum INT gap length in a chain [%d].\n", opt.bw); + fprintf(fp_help, " --max-target-gap INT maximum INT target gap length in a chain [%d].\n", opt.max_target_gap_length); + fprintf(fp_help, " --max-query-gap INT maximum INT query gap length in a chain [%d].\n", opt.max_query_gap_length); + fprintf(fp_help, " --min-anchors INT chain is discarded if it contains less than INT number of anchors [%d].\n", opt.min_num_anchors); + fprintf(fp_help, " --best-chains INT best INT secondary chains to keep with their primary chains when making the mapping decisions [%d]\n", opt.best_n); + fprintf(fp_help, " --min-score INT chain is discarded if its score is < INT [%d]\n", opt.min_chaining_score); + fprintf(fp_help, " --chain-gap-scale FLOAT [Advanced] Determines [chain gap penalty] = FLOAT * 0.01 * e [%g]\n", opt.chain_gap_scale); + fprintf(fp_help, " --chain-skip-scale FLOAT [Advanced] Determines [chain skip penalty] = FLOAT * 0.01 * e [%g]\n", opt.chain_skip_scale); + // fprintf(fp_help, " --chain-match-score INT [Advanced] Match score (Used in MAPQ and Primary chain identification) [%d]\n", opt.a); + fprintf(fp_help, " --primary-ratio FLOAT [Advanced] The chain is primary if its region ratio uncovered by other chains is larger than FLOAT [%g]\n", opt.mask_level); + fprintf(fp_help, " --primary-length INT [Advanced] The chain is primary if its region length uncovered by other chains is larger than INT [%d]\n", opt.mask_len); + fprintf(fp_help, " --max-skips INT [Advanced] stop looking for a predecessor for an anchor if the best predecessor is not updated after INT many iterations [%d]\n", opt.max_num_skips); + fprintf(fp_help, " --max-iterations INT [Advanced] maximum INT number predecessor anchors to check to calculate the best score for an anchor [%d]\n", opt.max_chain_iter); + fprintf(fp_help, " --rmq [Advanced] Uses RMQ-based chaining. Faster but less accurate than default (DP)\n"); + fprintf(fp_help, " --rmq-inner-dist INT [Advanced] RMQ inner distance [%d]\n", opt.rmq_inner_dist); + fprintf(fp_help, " --rmq-size-cap INT [Advanced] RMQ cap size [%d]\n", opt.rmq_size_cap); + fprintf(fp_help, " --bw-long INT [Advanced] maximum long INT gap length to re-chain the chains. Disabled by default. To enable, set it to larger than --bw [%d]\n", opt.bw_long); + + fprintf(fp_help, "\n DTW Parameters (as introduced in RawAlign):\n"); + fprintf(fp_help, " --dtw-evaluate-chains evaluate chains using DTW. Note, the index must be built using --store-sig for this functionality to work [%s]\n", opt.flag & RI_M_DTW_EVALUATE_CHAINS? "yes" : "no"); + fprintf(fp_help, " --dtw-output-cigar output CIGAR string for DTW [%s]\n", opt.flag & RI_M_DTW_OUTPUT_CIGAR? "yes" : "no"); + fprintf(fp_help, " --dtw-border-constraint STR DTW border constraint: 'global', 'sparse' (i.e., align only between anchors), 'local' [%s]\n", ri_maptopt_dtw_mode_to_string(opt.dtw_border_constraint)); + fprintf(fp_help, " --dtw-match-bonus FLOAT DTW match bonus FLOAT [%g]\n", opt.dtw_match_bonus); + + fprintf(fp_help, "\n Mapping Decisions (Mapping and sequencing is stopped after taking any of these decisions):\n"); + fprintf(fp_help, " --max-chunks INT stop mapping (read not mapped) after sequencing INT number of chunks [%u]\n", opt.max_num_chunk); + fprintf(fp_help, " --min-mapq INT map the read if there is only one chain and its MAPQ > INT [%d]\n", opt.min_mapq); + fprintf(fp_help, " --disable-adaptive Disables stopping the read early and rather lets the read to be sequenced fully to make the analysis. This is not activated by default.\n"); + + fprintf(fp_help, "\n Nanopore Parameters:\n"); + fprintf(fp_help, " --bp-per-sec INT DNA molecules transiting through the pore (bp per second) [%u]\n", opt.bp_per_sec); + fprintf(fp_help, " --sample-rate INT current sample rate in Hz [%u]\n", opt.sample_rate); + fprintf(fp_help, " --chunk-size INT current samples in a single chunk (by default set to the amount of signals sampled in 1 second) [%u]\n", opt.chunk_size); + + fprintf(fp_help, " --seg-window-length1 INT [Advanced] First window length in segmentation [%u]\n", opt.window_length1); + fprintf(fp_help, " --seg-window-length2 INT [Advanced] Second window length in segmentation [%u]\n", opt.window_length2); + fprintf(fp_help, " --seg-threshold1 FLOAT [Advanced] Peak value threshold for the first window in segmentation [%g]\n", opt.threshold1); + fprintf(fp_help, " --seg-threshold2 FLOAT [Advanced] Peak value threshold for the first window in segmentation [%g]\n", opt.threshold2); + fprintf(fp_help, " --seg-peak-height FLOAT [Advanced] Peak height than the current signal to confirm the peak point in segmentation [%g]\n", opt.peak_height); + + fprintf(fp_help, "\n Sequence Until Parameters:\n"); + fprintf(fp_help, " --sequence-until Activates Sequence Until and performs real-time relative abundance calculations. The computation will stop as soon as an estimation with high confidence is reached without processing further reads from the set.\n"); + fprintf(fp_help, " --threshold FLOAT outliers are determined if cross-correlation distance > FLOAT [%g]. Sequencing will stop if there are no outliers in the sample of estimations.\n", opt.t_threshold); + fprintf(fp_help, " --n-samples INT New estimation is tested against INT many previous estimations [%u]\n", opt.tn_samples); + fprintf(fp_help, " --test-frequency INT Make a new estimation after every INT reads [%u]\n", opt.ttest_freq); + fprintf(fp_help, " --min-reads INT Minimum number of reads to sequence before making the first estimation [%u]\n", opt.tmin_reads); + + fprintf(fp_help, "\n Input/Output:\n"); + fprintf(fp_help, " -o FILE output mappings to FILE [stdout]\n"); + fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads); + fprintf(fp_help, " -K NUM minibatch size for mapping [500M]. Increasing this value may increase thread utilization. If there are many larger FAST5 files, it is recommended to keep this value between 500M - 5G to use less memory while utilizing threads nicely.\n"); +// fprintf(fp_help, " -v INT verbose level [%d]\n", ri_verbose); + fprintf(fp_help, " --version show version number\n"); + + fprintf(fp_help, "\n Experimental/Debugging Parameters:\n"); + fprintf(fp_help, " --out-quantize Output the quantized values from raw signals provided as input. Mapping is not performed and the index file is not needed.\n"); + fprintf(fp_help, " --no-event-detection Do not perform event detection. This can be set if your raw signal is already segmented.\n"); + + fprintf(fp_help, "\n Presets:\n"); + fprintf(fp_help, " --depletion Should be used for quickly depleting organisms for use cases that require high precision (e.g., for contamination analysis or relative abundance estimation). Can be used with or without the -x preset\n"); + fprintf(fp_help, " --r10 Sets the segmentation parameters for R10.4.1. Can be used with or without the -x preset\n"); + fprintf(fp_help, " -x STR preset (always applied before other options) []\n"); + fprintf(fp_help, " - viral Enables accurate mapping to very small genomes such as viral genomes.\n"); + fprintf(fp_help, " - sensitive Enables sensitive mapping. Suitable when working with small genomes of size < 500M.\n"); + fprintf(fp_help, " - fast Enables fast mapping with slightly reduced accuracy. Suitable when reads are mapped to large genomes of size > 500M and < 5Gb\n"); + fprintf(fp_help, " - faster Enables faster mapping than '-x fast' and reduced memory space usage for indexing with slightly reduced accuracy. This mechanism uses the minimizer sketching technique and should be used when '-x fast' cannot meet the real-time requirements for a particular genome (e.g., for very large genomes > 5Gb)\n"); + fprintf(fp_help, " - ava-viral All-vs-all overlapping for very small genomes such as viral genomes.\n"); + fprintf(fp_help, " - ava-sensitive All-vs-all overlapping for small genomes of size < 500M.\n"); + fprintf(fp_help, " - ava-fast All-vs-all overlapping for large genomes of size > 500M and < 5Gb\n"); + fprintf(fp_help, " - ava-faster All-vs-all overlapping for very large genomes > 5Gb)\n"); + + // fprintf(fp_help, "\nSee `man ./rawhash.1' for detailed description of these and other advanced command-line options.\n"); + if (fp_help == stdout) exit(EXIT_SUCCESS); + else exit(EXIT_FAILURE); + } + + if(ipt.w && ipt.n){ + fprintf(stderr, "[ERROR] minimizer window 'w' ('%d') and BLEND 'neighbor' ('%d') values cannot be set together. At least one of them must be zero to enable one of the seeding options: %s\n", ipt.w, ipt.n, strerror(errno)); + exit(EXIT_FAILURE); + } + + return parsed_args; +} diff --git a/src/hdf5_tools.hpp b/src/hdf5_tools.hpp index d9fb715..56aa546 100644 --- a/src/hdf5_tools.hpp +++ b/src/hdf5_tools.hpp @@ -8,6 +8,7 @@ #ifndef __HDF5_TOOLS_HPP #define __HDF5_TOOLS_HPP +#include #include #include #include diff --git a/src/rawhash_wrapper.cpp b/src/rawhash_wrapper.cpp new file mode 100644 index 0000000..3ab707a --- /dev/null +++ b/src/rawhash_wrapper.cpp @@ -0,0 +1,174 @@ + +#include +#include + +#include "rawhash_wrapper.hpp" +#include "rawhash.h" +#include "rmap.h" +#include "cli_parsing.cpp" + +RawHashMapper::RawHashMapper(int argc, char *argv[]) { + std::cout << "RawHashMapper constructor" << std::endl; + + // copied from main() + liftrlimit(); + ri_realtime0 = ri_realtime(); + + CLIParsedArgs parsed_args = parse_args(argc, argv); + ri_mapopt_t& opt = parsed_args.opt; + ri_idxopt_t& ipt = parsed_args.ipt; + int& n_threads = parsed_args.n_threads; + // int n_parts; + char*& idx_out_filename = parsed_args.idx_out_filename; + char*& fpore = parsed_args.fpore; + // FILE*& fp_help = parsed_args.fp_help; + int& ru_server_port = parsed_args.ru_server_port; + ketopt_t& o = parsed_args.o; + + ri_idx_reader_t *idx_rdr; + idx_rdr = ri_idx_reader_open(argv[o.ind], &ipt, idx_out_filename); + if (idx_rdr == 0) { + fprintf(stderr, "[ERROR] failed to open file '%s': %s\n", argv[o.ind], strerror(errno)); + exit(EXIT_FAILURE); + } + + if (!idx_rdr->is_idx && idx_out_filename == 0 && argc - o.ind < 2 && !(ipt.flag&RI_I_OUT_QUANTIZE)) { + fprintf(stderr, "[ERROR] missing input: please specify a query FAST5/SLOW5/POD5 file(s) to map or option -d to store the index in a file before running the mapping\n"); + ri_idx_reader_close(idx_rdr); + exit(EXIT_FAILURE); + } + + // load the pore model + ri_pore_t pore; + pore.pore_vals = NULL; + pore.pore_inds = NULL; + pore.max_val = -5000.0; + pore.min_val = 5000.0; + if(!(ipt.flag&RI_I_OUT_QUANTIZE)) { + if(!idx_rdr->is_idx && fpore == 0) { + fprintf(stderr, "[ERROR] missing input: please specify a pore model file with -p when generating the index from a sequence file\n"); + ri_idx_reader_close(idx_rdr); + exit(EXIT_FAILURE); + } else if(!idx_rdr->is_idx && fpore) { + load_pore(fpore, ipt.k, ipt.lev_col, &pore); + if(!pore.pore_vals){ + fprintf(stderr, "[ERROR] cannot parse the k-mer pore model file. Please see the example k-mer model files provided in the RawHash repository.\n"); + ri_idx_reader_close(idx_rdr); + exit(EXIT_FAILURE); + } + } + } + + + ri_idx_t* ri = ri_idx_reader_read(idx_rdr, &pore, n_threads); + if (ri == 0) { + fprintf(stderr, "[ERROR] failed to read the index\n"); + ri_idx_reader_close(idx_rdr); + exit(EXIT_FAILURE); + } + + if (ri_verbose >= 3) + fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", + __func__, ri_realtime() - ri_realtime0, ri_cputime() / (ri_realtime() - ri_realtime0), ri->n_seq); + + ri_mapopt_update(&opt, ri); + if (ri_verbose >= 3) ri_idx_stat(ri); + + _ri = ri; + _opt = new ri_mapopt_t; + *((ri_mapopt_t*)_opt) = opt; // copy constructor +} + +RawHashMapper::~RawHashMapper() { + fprintf(stderr, "[M::%s] closing the index\n", __func__); + ri_idx_destroy((ri_idx_t*)_ri); + delete _opt; +} + +void RawHashMapper::print_idx_info() const { + fprintf(stderr, "[M::%s] index info\n", __func__); // todo + ri_idx_stat((ri_idx_t*)_ri); +} + +std::vector RawHashMapper::map(float* raw_signal, int signal_length) { + // print + // fprintf(stderr, "[M::%s] mapping\n", __func__); + + if (ri_verbose >= 3) { + for (int i = 0; (i < signal_length) && (i <= 10); i++) { + fprintf(stderr, "%f ", raw_signal[i]); + } + fprintf(stderr, "..."); + } + + ri_idx_t* ri = (ri_idx_t*)_ri; + pipeline_mt pipeline = { + // .n_processed = 0, + // .n_threads = 1, + // .n_fp = 0, + // .cur_fp = 0, + // .n_f = 0, + // .cur_f = 0, + // .mini_batch_size = 0, + .opt = (ri_mapopt_t*)_opt, + // .f = NULL, + // .fp = NULL, + .ri = ri, + // .fn = NULL, + // .su_nreads = 0, + // .su_nestimations = 0, + // .ab_count = 0, + // .su_cur = 0, + // .su_estimations = NULL, + // .su_c_estimations = NULL, + // .su_stop = 0, + // .map_model = NULL + }; + ri_tbuf_t *buf = ri_tbuf_init(); + char* read_name = "read123"; + ri_sig_t signal = { + .rid = 123, + .l_sig = signal_length, + .name = read_name, + + .offset = 0, // todo + .sig = raw_signal, + }; + ri_sig_t* signal_ptr = &signal; // since &&signal not working since &signal is temporary + ri_reg1_t reg0_noptr; // will be initialized + ri_reg1_t* reg0 = ®0_noptr; + step_mt data = { + .p = &pipeline, + .n_sig = 1, + .sig = (ri_sig_t**)(&signal_ptr), + .reg = (ri_reg1_t**)(®0), + .buf = &buf + }; + map_worker_for(&data, 0, 0); + + // write out + if (ri_verbose >= 3) { + write_out_mappings_to_stdout(reg0, ri); + } + std::vector alignments; + for (int m = 0; m < reg0->n_maps; ++m) { + if (reg0->maps[m].ref_id < ri->n_seq) { + // mapped + alignments.push_back(Alignment { + .contig = (ri->flag & RI_I_SIG_TARGET) ? ri->sig[reg0->maps[m].ref_id].name : ri->seq[reg0->maps[m].ref_id].name, + .ref_start = reg0->maps[m].fragment_start_position, + .ref_end = reg0->maps[m].fragment_start_position + reg0->maps[m].fragment_length, + .is_pos_strand = not reg0->maps[m].rev + }); + } + } + // dummy alignment + // alignments.push_back(Alignment { + // .contig = "fakealignment_chr123", .ref_start = 123, .ref_end = 456, .is_pos_strand = true + // }); + free_mappings_ri_reg1_t(reg0); + + ri_tbuf_destroy(buf); + + return alignments; +} diff --git a/src/rawhash_wrapper.hpp b/src/rawhash_wrapper.hpp new file mode 100644 index 0000000..e489ac9 --- /dev/null +++ b/src/rawhash_wrapper.hpp @@ -0,0 +1,53 @@ +#pragma once + +#include +#include + +struct Alignment { + const char* contig; + uint32_t ref_start; + uint32_t ref_end; // exclusive + bool is_pos_strand; +}; + +class RawHashMapper { + /** + * @brief Wrapper that can be used from other code (as an alternative to the CLI) + * + * This API is inefficient (no threading) and only for experimenting + * with RawHash from Python without having to load the index repeatedly. + * + * Also mixes malloc/free and new/delete + */ +public: + /** + * @brief Use the same arguments as the CLI + * + * Including the leading program name (which will be ignored) + * Note that the option "-d index" (to dump the index) is ignored, so the index should be + * dumped with the rawhash CLI, then it can be used for mapping with this function. + * + * @param argc + * @param argv + */ + RawHashMapper(int argc, char *argv[]); + ~RawHashMapper(); + + /* + * Map the raw read, returning the alignments + * + * Same as ri_map_file_frag -> map_worker_pipeline -> map_worker_for, + * except it reads from memory rather than the file + * and does not perform pipeling or threading + */ + std::vector map(float* raw_signal, int signal_length); + + /** + * Get info about the index + */ + void print_idx_info() const; + +private: + void *_ri; // ri_idx_t* + void* _opt; // ri_mapopt_t* +}; diff --git a/src/rmap.cpp b/src/rmap.cpp index ac3c97d..084857f 100644 --- a/src/rmap.cpp +++ b/src/rmap.cpp @@ -25,7 +25,7 @@ double ri_seedsorttime = 0.0; double ri_chainsorttime = 0.0; #endif -static ri_tbuf_t *ri_tbuf_init(void) +ri_tbuf_t *ri_tbuf_init(void) { ri_tbuf_t *b; b = (ri_tbuf_t*)calloc(1, sizeof(ri_tbuf_t)); @@ -33,7 +33,7 @@ static ri_tbuf_t *ri_tbuf_init(void) return b; } -static void ri_tbuf_destroy(ri_tbuf_t *b) +void ri_tbuf_destroy(ri_tbuf_t *b) { if (b == 0) return; ri_km_destroy(b->km); @@ -386,9 +386,9 @@ void ri_map_frag(const ri_idx_t *ri, reg->offset += n_events; } -static void map_worker_for(void *_data, - long i, - int tid) // kt_for() callback +void map_worker_for(void *_data, + long i, + int tid) // kt_for() callback { step_mt *s = (step_mt*)_data; //s->sig and s->n_sig (signals read in this step and num of them) const ri_mapopt_t *opt = s->p->opt; @@ -745,39 +745,15 @@ static void *map_worker_pipeline(void *shared, // fprintf(stderr, "%s %d %d\n", reg0->read_name, reg0->n_maps, reg0->maps[0].mapped); if(reg0->read_name){ - if(reg0->n_maps > 0 && (!p->su_stop || k < p->su_stop)){ - for(uint32_t m = 0; m < reg0->n_maps; ++m){ - if(reg0->maps[m].ref_id < ri->n_seq) - fprintf(stdout, "%s\t%u\t%u\t%u\t%c\t%s\t%u\t%u\t%u\t%u\t%u\t%u\t%s\n", - reg0->read_name, - reg0->maps[m].read_length, - reg0->maps[m].read_start_position, - reg0->maps[m].read_end_position, - reg0->maps[m].rev?'-':'+', - (ri->flag&RI_I_SIG_TARGET)?ri->sig[reg0->maps[m].ref_id].name:ri->seq[reg0->maps[m].ref_id].name, - (ri->flag&RI_I_SIG_TARGET)?ri->sig[reg0->maps[m].ref_id].l_sig:ri->seq[reg0->maps[m].ref_id].len, - reg0->maps[m].fragment_start_position, - reg0->maps[m].fragment_start_position + reg0->maps[m].fragment_length, - reg0->maps[m].read_end_position-reg0->maps[m].read_start_position-1, - reg0->maps[m].fragment_length, - reg0->maps[m].mapq, - reg0->maps[m].tags); - if(reg0->maps[m].tags) {free(reg0->maps[m].tags); reg0->maps[m].tags = NULL;} - } + if(!p->su_stop || k < p->su_stop){ + write_out_mappings_to_stdout(reg0, ri); }else{ - fprintf(stdout, "%s\t%u\t*\t*\t*\t*\t*\t*\t*\t*\t*\t%u\t%s\n", - reg0->read_name, - reg0->maps[0].read_length, - reg0->maps[0].mapq, - reg0->maps[0].tags); - - if(reg0->maps[0].tags) {free(reg0->maps[0].tags); reg0->maps[0].tags = NULL;} + write_out_mappings_to_stdout(reg0, ri, false); } + free_mappings_ri_reg1_t(reg0); + free(reg0); } - - // if(reg0->tags) {free(reg0->tags); reg0->tags = NULL;} - if(reg0->maps){free(reg0->maps); reg0->maps = NULL;} - free(reg0); s->reg[k] = NULL; + s->reg[k] = NULL; // was freed fflush(stdout); } } @@ -799,6 +775,56 @@ static void *map_worker_pipeline(void *shared, return 0; } +void free_mappings_ri_reg1_t(ri_reg1_t *reg0) { + for (int m = 0; m < reg0->n_maps; ++m) { + if (reg0->maps[m].tags) { + free(reg0->maps[m].tags); + reg0->maps[m].tags = NULL; + } + } + if(reg0->maps){free(reg0->maps); reg0->maps = NULL;} +} + +// todo4, todo1: refactor out reg0->maps,read_name into new structure so reg0 can be freed +void write_out_mappings_to_stdout(ri_reg1_t *reg0, const ri_idx_t *ri, bool was_mapped) { + was_mapped = was_mapped && (reg0->n_maps > 0); + if (was_mapped) { + for (int m = 0; m < reg0->n_maps; ++m) { + if (reg0->maps[m].ref_id < ri->n_seq) + fprintf(stdout, "%s\t%u\t%u\t%u\t%c\t%s\t%u\t%u\t%u\t%u\t%u\t%u\t%s\n", + reg0->read_name, + reg0->maps[m].read_length, + reg0->maps[m].read_start_position, + reg0->maps[m].read_end_position, + reg0->maps[m].rev ? '-' : '+', + (ri->flag & RI_I_SIG_TARGET) ? ri->sig[reg0->maps[m].ref_id].name : ri->seq[reg0->maps[m].ref_id].name, + (ri->flag & RI_I_SIG_TARGET) ? ri->sig[reg0->maps[m].ref_id].l_sig : ri->seq[reg0->maps[m].ref_id].len, + reg0->maps[m].fragment_start_position, + reg0->maps[m].fragment_start_position + reg0->maps[m].fragment_length, + reg0->maps[m].read_end_position - reg0->maps[m].read_start_position - 1, + reg0->maps[m].fragment_length, + reg0->maps[m].mapq, + reg0->maps[m].tags); + // if (reg0->maps[m].tags) { + // free(reg0->maps[m].tags); + // reg0->maps[m].tags = NULL; + // } + } + } else { + // maps[0] contains all necessary information + fprintf(stdout, "%s\t%u\t*\t*\t*\t*\t*\t*\t*\t*\t*\t%u\t%s\n", + reg0->read_name, + reg0->maps[0].read_length, + reg0->maps[0].mapq, + reg0->maps[0].tags); + + // if(reg0->maps[0].tags) {free(reg0->maps[0].tags); reg0->maps[0].tags = NULL;} + } + +// if(reg0->maps){free(reg0->maps); reg0->maps = NULL;} + // free(reg0); +} + int ri_map_file(const ri_idx_t *idx, const char *fn, const ri_mapopt_t *opt, diff --git a/src/rmap.h b/src/rmap.h index 5dcce6f..9964776 100644 --- a/src/rmap.h +++ b/src/rmap.h @@ -93,6 +93,27 @@ int ri_map_file(const ri_idx_t *idx, const char *fn, const ri_mapopt_t *opt, int */ int ri_map_file_frag(const ri_idx_t *idx, int n_segs, const char **fn, const ri_mapopt_t *opt, int n_threads, int io_n_threads = 1); + +// functions used in rmap.cpp and rawhash_wrapper.cpp + +// init thread-local buffer +ri_tbuf_t *ri_tbuf_init(void); +// destroy thread-local buffer +void ri_tbuf_destroy(ri_tbuf_t *b); + +// write out relevant mapping results to stdout, then free reg0 +// was_mapped: if false, write out "*" for mapping info even when read was mapped, e.g. when sequence until terminates the pipeline) +void write_out_mappings_to_stdout(ri_reg1_t *reg0, const ri_idx_t *ri, bool was_mapped=true); + +// frees mappings (works even if no mappings were set), which is not freed by "free_most_of_ri_reg1_t" and reg0 itself +void free_mappings_ri_reg1_t(ri_reg1_t* reg0); +// finally need to call free(reg0) + +// map a single read to a reference, called in parallel +void map_worker_for(void *_data, + long i, + int tid); + #ifdef __cplusplus } #endif diff --git a/src/roptions.h b/src/roptions.h index a6b6f36..16dc780 100644 --- a/src/roptions.h +++ b/src/roptions.h @@ -11,6 +11,7 @@ #define RI_I_SYNCMER 0x8 #define RI_I_STORE_SIG 0x10 #define RI_I_SIG_TARGET 0x20 +#define RI_I_REV_QUERY 0x40 // whether the index should NOT store the reverse strand, this assumes that we query the index with the revcomp of the read as well #define RI_I_NO_REV_TARGET 0x40 #define RI_I_OUT_QUANTIZE 0x80 #define RI_I_NO_EVENT_DETECTION 0x100 @@ -31,6 +32,7 @@ #define RI_M_LOG_NUM_ANCHORS 0x1000 //Overlapping related #define RI_M_ALL_CHAINS 0x2000 +#define RI_M_DONT_QUERY_REVCOMP 0x4000 // whether to transform the read to its revcomp and query index (defaults to RI_I_REV_QUERY) //Characterization related #define RI_M_OUT_ALL_CHAINS 0x4000 diff --git a/src/rseed.c b/src/rseed.c index 5bd2a6f..9af347b 100644 --- a/src/rseed.c +++ b/src/rseed.c @@ -7,8 +7,6 @@ void ri_seed_select(int32_t n, ri_seed_t *a, uint32_t len, int max_occ, int max_max_occ, int dist) { // for high-occ minimizers, choose up to max_high_occ in each high-occ streak - extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*); - extern void ks_heapmake_uint64_t(size_t n, uint64_t*); int32_t i, last0, m; uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation diff --git a/src/rutils.h b/src/rutils.h index 6c0fbf9..d9e8e42 100644 --- a/src/rutils.h +++ b/src/rutils.h @@ -38,6 +38,10 @@ void liftrlimit(void); void load_pore(const char* fpore, const short k, const short lev_col, ri_pore_t* pore); +// Explicit definition for rseed.c +extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*); +extern void ks_heapmake_uint64_t(size_t n, uint64_t*); + #ifdef __cplusplus } #endif diff --git a/src/wrapper_example.cpp b/src/wrapper_example.cpp new file mode 100644 index 0000000..99fc3eb --- /dev/null +++ b/src/wrapper_example.cpp @@ -0,0 +1,7 @@ +#include "rawhash_wrapper.hpp" + +int main(int argc, char *argv[]) { + RawHashMapper mapper(argc, argv); + mapper.print_idx_info(); + return 0; +}