From dc3c15150544e5750e9a8bf9fefe880a4b5d3dca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Gudy=C5=9B?= Date: Tue, 20 Aug 2024 14:22:14 +0200 Subject: [PATCH] Above and below options working correctly in all2all-sp mode. --- .github/workflows/self-hosted.yml | 45 +++++++++--- README.md | 72 ++++++++++--------- src/console_all2all_sparse.cpp | 8 ++- src/params.cpp | 57 ++++++++++----- src/version.h | 7 +- ...above-below.csv => a2a.sparse.above-below} | 0 ...above-below.csv => n2a.sparse.above-below} | 0 7 files changed, 123 insertions(+), 66 deletions(-) rename test/synth/{a2a.sparse.above-below.csv => a2a.sparse.above-below} (100%) rename test/synth/{n2a.sparse.above-below.csv => n2a.sparse.above-below} (100%) diff --git a/.github/workflows/self-hosted.yml b/.github/workflows/self-hosted.yml index f20b486..aa98b3b 100644 --- a/.github/workflows/self-hosted.yml +++ b/.github/workflows/self-hosted.yml @@ -85,21 +85,40 @@ jobs: run: | ${EXEC} build -multisample-fasta -k 21 ${INPUT_DIR}/synth.list synth.db - - name: all2all (dense and sparse) + - name: all2all (dense) run: | ${EXEC} all2all synth.db a2a - ${EXEC} all2all -sparse synth.db a2a-sparse - ${EXEC} all2all -sparse -below 40 -above 30 synth.db a2a.sparse.above-below.csv - ${EXEC} all2all -sparse -below_eq 39 -above_eq 31 synth.db a2a.sparse.above_eq-below_eq.csv cmp ${INPUT_DIR}/a2a a2a + + - name: all2all (sparse) + run: | + ${EXEC} all2all -sparse synth.db a2a-sparse cmp ${INPUT_DIR}/a2a-sparse a2a-sparse - cmp ${INPUT_DIR}/a2a.sparse.above-below.csv a2a.sparse.above-below.csv - cmp ${INPUT_DIR}/a2a.sparse.above-below.csv a2a.sparse.above_eq-below_eq.csv + + - name: all2all (sparse + above + below) + run: | + ${EXEC} all2all -sparse -below 40 -above 30 synth.db a2a.sparse.above-below + cmp ${INPUT_DIR}/a2a.sparse.above-below a2a.sparse.above-below + + - name: all2all (sparse + above_eq + below_eq) + run: | + ${EXEC} all2all -sparse -below_eq 39 -above_eq 31 synth.db a2a.sparse.above_eq-below_eq + cmp 
${INPUT_DIR}/a2a.sparse.above-below a2a.sparse.above_eq-below_eq - name: all2all-sp run: | ${EXEC} all2all-sp synth.db a2a-sp cmp ${INPUT_DIR}/a2a-sparse a2a-sp + + - name: all2all-sp (sparse + above + below) + run: | + ${EXEC} all2all-sp -below 40 -above 30 synth.db a2a-sp.above-below + cmp ${INPUT_DIR}/a2a.sparse.above-below a2a-sp.above-below + + - name: all2all-sp (sparse + above_eq + below_eq) + run: | + ${EXEC} all2all-sp -below_eq 39 -above_eq 31 synth.db a2a-sp.above_eq-below_eq + cmp ${INPUT_DIR}/a2a.sparse.above-below a2a-sp.above_eq-below_eq - name: all2all distances (dense -> dense) run: | @@ -126,14 +145,20 @@ jobs: cmp ${INPUT_DIR}/a2a.mash.above-below a2a-sparse.mash - - name: new2all (dense and sparse) + - name: new2all (dense) run: | ${EXEC} new2all -multisample-fasta synth.db ${INPUT_DIR}/synth.list n2a - ${EXEC} new2all -multisample-fasta -sparse synth.db ${INPUT_DIR}/synth.list n2a-sparse - ${EXEC} new2all -multisample-fasta -sparse -below 70 -above 20 synth.db ${INPUT_DIR}/synth.list n2a.sparse.above-below.csv cmp ${INPUT_DIR}/n2a n2a + + - name: new2all (sparse) + run: | + ${EXEC} new2all -multisample-fasta -sparse synth.db ${INPUT_DIR}/synth.list n2a-sparse cmp ${INPUT_DIR}/n2a-sparse n2a-sparse - cmp ${INPUT_DIR}/n2a.sparse.above-below.csv n2a.sparse.above-below.csv + + - name: new2all (sparse + above + below) + run: | + ${EXEC} new2all -multisample-fasta -sparse -below 70 -above 20 synth.db ${INPUT_DIR}/synth.list n2a.sparse.above-below + cmp ${INPUT_DIR}/n2a.sparse.above-below n2a.sparse.above-below - name: new2all distances (dense -> dense) run: | diff --git a/README.md b/README.md index 58a7d46..a4f6228 100644 --- a/README.md +++ b/README.md @@ -107,12 +107,12 @@ Kmer-db operates in one of the following modes: * `build` - building a database from samples, * `all2all` - counting common k-mers - all samples in the database, -* `all2all-sp` - counting common k-mers - all samples in the database (sparse computation) -* `all2all-parts` - 
counting common k-mers - all samples in the database parts (sparse computation) +* `all2all-sp` - counting common k-mers - all samples in the database (sparse computation), +* `all2all-parts` - counting common k-mers - all samples in the database parts (sparse computation), * `new2all` - counting common k-mers - set of new samples versus database, * `one2all` - counting common k-mers - single sample versus database, * `distance` - calculating similarities/distances, -* `minhash` - storing minhashed k-mers, +* `minhash` - storing minhashed k-mers. Common options: * `-t ` - number of threads (default: number of available cores), @@ -142,9 +142,9 @@ Parameters: ... ``` By default, the tool requires uncompressed or compressed FASTA files for each sample. If a file on the list cannot be found, the package tries adding the following extensions: *fna*, *fasta*, *gz*, *fna.gz*, *fasta.gz* . When `-from-kmers` switch is specified, corresponding [KMC-generated](https://github.com/refresh-bio/KMC) k-mer files (*.kmc_pre* and *.kmc_suf*) are required. If `-from-minhash` switch is present, minhashed k-mer files (*.minhash*) must be generated by `minhash` command [prior to the database construction](#24-storing-minhashed-k-mers). Note, that minhashing may be also done during the database construction by specyfying `-f` option. -* `database` (output) - file with generated k-mer database. -* `-k ` - length of k-mers (default: 18); ignored when `-from-kmers` or `-from-minhash` switch is specified. -* `-f ` - fraction of all k-mers to be accepted by the minhash filter during database construction (default: 1); ignored when `-from-minhash` switch is present. 
+* `database` (output) - file with generated k-mer database, +* `-k ` - length of k-mers (default: 18); ignored when `-from-kmers` or `-from-minhash` switch is specified, +* `-f ` - fraction of all k-mers to be accepted by the minhash filter during database construction (default: 1); ignored when `-from-minhash` switch is present, * `-multisample-fasta` - each sequence in a FASTA file is treated as a separate sample, * `-extend` - extend the existing database with new samples, * `-t ` - number of threads (default: number of available cores). @@ -155,7 +155,7 @@ Parameters: Dense computations - recomended when the distance matrix contains few zeros. Output can be stored in the dense or sparse form (`-sparse` switch). -`kmer-db all2all [-buffer ] [-sparse] [-t ] [-above ] [-below ] [-above_eq ] [-below_eq ] ` +`kmer-db all2all [-buffer ] [-t ] [-sparse [-above ] [-below ] [-above_eq ] [-below_eq ]] ` Sparse computations - recommended when the distance matrix contains many zeros. Output matrix is always in the sparse form: @@ -168,30 +168,31 @@ Sparse computations, partial databases - use when the distance matrix contains m Parameters: * `database` (input) - k-mer database file created by `build` mode, * `db_list` (input) - file containing list of databases files created by `build` mode, -* `common_table` (output) - file containing table with common k-mer counts. -* `-buffer ` - size of cache buffer in megabytes; use L3 size for Intel CPUs and L2 for AMD for best performance; default: 8 -* `-sparse` - stores output matrix in a sparse form, -* `-above ` - retains elements greater then `` -* `-below ` - retains elements less then `` -* `-above_eq ` - retains elements greater or equal `` -* `-below_eq ` - retains elements less or equal `` -* `-t ` - number of threads (default: number of available cores). 
+* `common_table` (output) - file containing table with common k-mer counts, +* `-buffer ` - size of cache buffer in megabytes; use L3 size for Intel CPUs and L2 for AMD for best performance; default: 8, +* `-t ` - number of threads (default: number of available cores), +* `-sparse` - stores output matrix in a sparse form (always on in `all2all-sp` and `all2all-parts` modes), +* `-above ` - retains elements greater than ``, +* `-below ` - retains elements less than ``, +* `-above_eq ` - retains elements greater than or equal to ``, +* `-below_eq ` - retains elements less than or equal to ``. + ### New samples against the database: -`kmer-db new2all [-multisample-fasta | -from-kmers | -from-minhash] [-sparse] [-t ] [-above ] [-below ] [-above_eq ] [-below_eq ] ` +`kmer-db new2all [-multisample-fasta | -from-kmers | -from-minhash] [-t ] [-sparse [-above ] [-below ] [-above_eq ] [-below_eq ]] ` Parameters: -* `database` (input) - k-mer database file created by `build` mode. -* `sample_list` (input) - file containing list of samples in one of the supported formats (see `build` mode); if samples are given as genomes (default) or k-mers (`-from-kmers` switch), the minhashing is done automatically with the same filter as in the database. -* `common_table` (output) - file containing table with common k-mer counts. -* `-multisample-fasta` / `-from-kmers` / `-from-minhash` - see `build` mode for details. 
+* `database` (input) - k-mer database file created by `build` mode, +* `sample_list` (input) - file containing list of samples in one of the supported formats (see `build` mode); if samples are given as genomes (default) or k-mers (`-from-kmers` switch), the minhashing is done automatically with the same filter as in the database, +* `common_table` (output) - file containing table with common k-mer counts, +* `-multisample-fasta` / `-from-kmers` / `-from-minhash` - see `build` mode for details, +* `-t ` - number of threads (default: number of available cores), * `-sparse` - stores output matrix in a sparse form, -* `-above ` - retains elements greater then `` -* `-below ` - retains elements less then `` -* `-above_eq ` - retains elements greater or equal `` -* `-below_eq ` - retains elements less or equal `` -* `-t ` - number of threads (default: number of available cores). +* `-above ` - retains elements greater than ``, +* `-below ` - retains elements less than ``, +* `-above_eq ` - retains elements greater than or equal to ``, +* `-below_eq ` - retains elements less than or equal to ``. ### Single sample against the database: @@ -239,20 +240,21 @@ When `-sparse` switch is specified, the table is stored in a sparse form. In par `kmer-db distance [] [-sparse] [-above ] [-below ] [-above_eq ] [-below_eq ] ` Parameters: -* `common_table` (input) - file containing table with numbers of common k-mers produced by `all2all`, `new2all`, or `one2all` mode (both, dense and sparse matrices are supported). 
+* `common_table` (input) - file containing table with numbers of common k-mers produced by `all2all`, `new2all`, or `one2all` mode (both, dense and sparse matrices are supported), * `measures` - names of the similarity/distance measures to be calculated, can be one or several of the following (is not specified, `jaccard` is used): * `jaccard`: $J(q,s) = |p \cap q| / |p \cup q|$, * `min`: $\min(q,s) = |p \cap q| / \min(|p|,|q|)$, * `max`: $\max(q,s) = |p \cap q| / \max(|p|,|q|)$, * `cosine`: $\cos(q,s) = |p \cap q| / \sqrt{|p| \cdot |q|}$, - * `mash` (Mash distance): $\textrm{Mash}(q,s) = -\frac{1}{k}ln\frac{2 \cdot J(q,s)}{1 + J(q,s)}$ - * `ani` (average nucleotide identity): $\textrm{ANI}(q,s) = 1 - \textrm{Mash}(p,q)$ + * `mash` (Mash distance): $\textrm{Mash}(q,s) = -\frac{1}{k}\ln\frac{2 \cdot J(q,s)}{1 + J(q,s)}$, + * `ani` (average nucleotide identity): $\textrm{ANI}(q,s) = 1 - \textrm{Mash}(p,q)$, + * `ani-shorter` - same as `ani` but with `min` used instead of `jaccard`. * `-phylip-out` - store output distance matrix in a Phylip format, * `-sparse` - outputs a sparse matrix (only for dense input matrices - sparse inputs always produce sparse outputs), -* `-above ` - retains elements greater then `` -* `-below ` - retains elements less then `` -* `-above_eq ` - retains elements greater or equal `` -* `-below_eq ` - retains elements less or equal `` +* `-above ` - retains elements greater than ``, +* `-below ` - retains elements less than ``, +* `-above_eq ` - retains elements greater than or equal to ``, +* `-below_eq ` - retains elements less than or equal to ``. This mode generates a file with similarity/distance table for each selected measure. Name of the output file is produced by adding to the input file an extension with a measure name. 
@@ -266,9 +268,9 @@ This is an optional analysis step which stores minhashed k-mers on the hard disk `kmer-db minhash -from-kmers [-f ] ` Parameters: - * `sample_list` (input) - file containing list of samples in one of the supported formats (see `build` mode). - * `-f ` - fraction of all k-mers to be accepted by the minhash filter (default: 0.01). - * `-k ` - length of k-mers (default: 18; maximum: 30); ignored when `-from-kmers` switch is specified. + * `sample_list` (input) - file containing list of samples in one of the supported formats (see `build` mode), + * `-f ` - fraction of all k-mers to be accepted by the minhash filter (default: 0.01), + * `-k ` - length of k-mers (default: 18; maximum: 30); ignored when `-from-kmers` switch is specified, * `-multisample-fasta` / `-from-kmers` - see `build` mode for details. For each sample from the list, a binary file with *.minhash* extension containing filtered k-mers is created. diff --git a/src/console_all2all_sparse.cpp b/src/console_all2all_sparse.cpp index 6bdf023..ab8952a 100644 --- a/src/console_all2all_sparse.cpp +++ b/src/console_all2all_sparse.cpp @@ -20,8 +20,8 @@ void All2AllSparseConsole::run(const Params& params) { const std::string& dbFilename = params.files[0]; const std::string& similarityFile = params.files[1]; - //uint32_t below = (uint32_t)lrint(params.below); - //uint32_t above = (uint32_t)std::max(0l, lrint(params.above)); + uint32_t below = (uint32_t)lrint(params.below); + uint32_t above = (uint32_t)std::max(0l, lrint(params.above)); std::ifstream dbFile(dbFilename, std::ios::binary); std::ofstream ofs(similarityFile, std::ios::binary); @@ -58,11 +58,13 @@ void All2AllSparseConsole::run(const Params& params) { *ptr++ = '\n'; ofs.write(row, ptr - row); + matrix.compact(below, above, params.numThreads); + for (size_t sid = 0; sid < db->getSamplesCount(); ++sid) { ptr = row; ptr += sprintf(ptr, "%s,%lu,", db->getSampleNames()[sid].c_str(), db->getSampleKmersCount()[sid]); - ptr += 
matrix.saveRowSparse(sid, ptr); + ptr += matrix.saveRowSparse(sid, ptr, 0); *ptr++ = '\n'; ofs.write(row, ptr - row); diff --git a/src/params.cpp b/src/params.cpp index f868ce1..425078c 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -235,8 +235,9 @@ void Params::showInstructions(Mode mode) const { << "Counting common k-mers for all the samples in the database:" << endl << " kmer-db " << MODE_ALL_2_ALL << " [" << OPTION_BUFFER << " ]" - << " [" << SWITCH_SPARSE << "]" - << " [" << OPTION_THREADS << " ] " << endl << endl + << " [" << OPTION_THREADS << " ]" + << " [" << SWITCH_SPARSE << " [" << OPTION_ABOVE << " ] [" << OPTION_BELOW << " ] [" << OPTION_ABOVE_EQ << " ] [" << OPTION_BELOW_EQ << " ]]" + << " " << endl << "Positional arguments:" << endl << " database (input) - k-mer database file" << endl @@ -245,16 +246,22 @@ void Params::showInstructions(Mode mode) const { << "Options:" << endl << " " << OPTION_BUFFER << " - size of cache buffer in megabytes" << endl << " (use L3 size for Intel CPUs and L2 for AMD to maximize performance; default: 8)" << endl + << " " << OPTION_THREADS << " - number of threads (default: number of available cores)" << endl << " " << SWITCH_SPARSE << " - produce sparse matrix as output" << endl - << " " << OPTION_THREADS << " - number of threads (default: number of available cores)" << endl << endl; + << " " << OPTION_ABOVE << " - retains elements larger then " << endl + << " " << OPTION_BELOW << " - retains elements smaller then " << endl + << " " << OPTION_ABOVE_EQ << " - retains elements larger or equal " << endl + << " " << OPTION_BELOW_EQ << " - retains elements smaller or equal " << endl + << endl; } else if (mode == Mode::all2all_sparse) { LOG_NORMAL << "Counting common k-mers for all the samples in the database (sparse computation):" << endl << " kmer-db " << MODE_ALL_2_ALL_SPARSE << " [" << OPTION_BUFFER << " ]" - << " [" << SWITCH_SPARSE << "]" - << " [" << OPTION_THREADS << " ] " << endl << endl + << " [" << OPTION_THREADS 
<< " ]" + << " [" << OPTION_ABOVE << " ] [" << OPTION_BELOW << " ] [" << OPTION_ABOVE_EQ << " ] [" << OPTION_BELOW_EQ << " ]" + << " " << endl << endl << "Positional arguments:" << endl << " database (input) - k-mer database file" << endl @@ -263,16 +270,21 @@ void Params::showInstructions(Mode mode) const { << "Options:" << endl << " " << OPTION_BUFFER << " - size of cache buffer in megabytes" << endl << " (use L3 size for Intel CPUs and L2 for AMD to maximize performance; default: 8)" << endl - << " " << SWITCH_SPARSE << " - produce sparse matrix as output" << endl - << " " << OPTION_THREADS << " - number of threads (default: number of available cores)" << endl << endl; + << " " << OPTION_THREADS << " - number of threads (default: number of available cores)" << endl + << " " << OPTION_ABOVE << " - retains elements larger then " << endl + << " " << OPTION_BELOW << " - retains elements smaller then " << endl + << " " << OPTION_ABOVE_EQ << " - retains elements larger or equal " << endl + << " " << OPTION_BELOW_EQ << " - retains elements smaller or equal " << endl + << endl; } else if (mode == Mode::all2all_parts) { LOG_NORMAL << "Counting common k-mers for all the samples in the database parts (sparse computation):" << endl << " kmer-db " << MODE_ALL_2_ALL_PARTS << " [" << OPTION_BUFFER << " ]" - << " [" << SWITCH_SPARSE << "]" - << " [" << OPTION_THREADS << " ] " << endl << endl + << " [" << OPTION_THREADS << " ]" + << " [" << OPTION_ABOVE << " ] [" << OPTION_BELOW << " ] [" << OPTION_ABOVE_EQ << " ] [" << OPTION_BELOW_EQ << " ]" + << " " << endl << endl << "Positional arguments:" << endl << " db_list (input) - file containing list of database file names" << endl @@ -281,8 +293,12 @@ void Params::showInstructions(Mode mode) const { << "Options:" << endl << " " << OPTION_BUFFER << " - size of cache buffer in megabytes" << endl << " (use L3 size for Intel CPUs and L2 for AMD to maximize performance; default: 8)" << endl - << " " << SWITCH_SPARSE << " - produce sparse 
matrix as output" << endl - << " " << OPTION_THREADS << " - number of threads (default: number of available cores)" << endl << endl; + << " " << OPTION_THREADS << " - number of threads (default: number of available cores)" << endl + << " " << OPTION_ABOVE << " - retains elements larger then " << endl + << " " << OPTION_BELOW << " - retains elements smaller then " << endl + << " " << OPTION_ABOVE_EQ << " - retains elements larger or equal " << endl + << " " << OPTION_BELOW_EQ << " - retains elements smaller or equal " << endl + << endl; } /* else if (mode == Mode::db2db) { @@ -310,7 +326,7 @@ void Params::showInstructions(Mode mode) const { << "Counting common kmers between set of new samples and all the samples in the database:" << endl << " kmer-db " << MODE_NEW_2_ALL << " [" << SWITCH_MULTISAMPLE_FASTA << " | " << SWITCH_KMC_SAMPLES << " | " << SWITCH_MINHASH_SAMPLES << "]" - << " [" << SWITCH_SPARSE << "]" + << " [" << SWITCH_SPARSE << " [" << OPTION_ABOVE << " ] [" << OPTION_BELOW << " ] [" << OPTION_ABOVE_EQ << " ] [" << OPTION_BELOW_EQ << " ]] " << " [" << OPTION_THREADS << " ] " << endl << endl << "Positional arguments:" << endl @@ -321,8 +337,13 @@ void Params::showInstructions(Mode mode) const { << "Options:" << endl << " " << SWITCH_MULTISAMPLE_FASTA << " - each sequence in a FASTA file is treated as a separate sample" << endl + << " " << OPTION_THREADS << " - number of threads (default: number of available cores)" << endl << " " << SWITCH_SPARSE << " - outputs a sparse matrix" << endl - << " " << OPTION_THREADS << " - number of threads (default: number of available cores)" << endl << endl; + << " " << OPTION_ABOVE << " - retains elements larger then " << endl + << " " << OPTION_BELOW << " - retains elements smaller then " << endl + << " " << OPTION_ABOVE_EQ << " - retains elements larger or equal " << endl + << " " << OPTION_BELOW_EQ << " - retains elements smaller or equal " << endl + << endl; } else if (mode == Mode::one2all) { LOG_NORMAL @@ -340,7 
+361,8 @@ void Params::showInstructions(Mode mode) const { LOG_NORMAL << "Calculating similarities/distances on the basis of common k-mers:" << endl << " kmer-db " << MODE_DISTANCE << " []" - << " [" << SWITCH_SPARSE << " [" << OPTION_ABOVE << " ] [" << OPTION_BELOW << " ]] " << endl << endl + << " [" << SWITCH_SPARSE << " [" << OPTION_ABOVE << " ] [" << OPTION_BELOW << " ] [" << OPTION_ABOVE_EQ << " ] [" << OPTION_BELOW_EQ << " ]] " + << "" << endl << endl << "Positional arguments:" << endl << " common_table (input) - comma-separated table with a number of common k-mers" << endl @@ -350,8 +372,11 @@ void Params::showInstructions(Mode mode) const { << " jaccard (default), min, max, cosine, mash, ani, ani-shorter." << endl << " " << SWITCH_PHYLIP_OUT << " - store output distance matrix in a Phylip format" << endl << " " << SWITCH_SPARSE << " - outputs a sparse matrix (independently of the input matrix format)" << endl - << " " << OPTION_ABOVE << " - retains elements larger then " << endl - << " " << OPTION_BELOW << " - retains elements smaller then " << endl << endl + << " " << OPTION_ABOVE << " - retains elements larger then " << endl + << " " << OPTION_BELOW << " - retains elements smaller then " << endl + << " " << OPTION_ABOVE_EQ << " - retains elements larger or equal " << endl + << " " << OPTION_BELOW_EQ << " - retains elements smaller or equal " << endl + << endl << "This mode generates a file with similarity/distance table for each selected measure." << endl << "Name of the output file is produced by adding to the input file an extension with a measure name." << endl << endl; } diff --git a/src/version.h b/src/version.h index 4be0210..b5b6e52 100644 --- a/src/version.h +++ b/src/version.h @@ -1,12 +1,15 @@ #pragma once -#define VERSION "2.0.3" -#define DATE "28.06.2024" +#define VERSION "2.0.4" +#define DATE "20.08.2024" /* Version history +2.0.4 (20.08.2024) +- Above and below options working correctly in all2all-sp mode. 
+ 2.0.3 (28.06.2024) - Fixed bug with empty sample. diff --git a/test/synth/a2a.sparse.above-below.csv b/test/synth/a2a.sparse.above-below similarity index 100% rename from test/synth/a2a.sparse.above-below.csv rename to test/synth/a2a.sparse.above-below diff --git a/test/synth/n2a.sparse.above-below.csv b/test/synth/n2a.sparse.above-below similarity index 100% rename from test/synth/n2a.sparse.above-below.csv rename to test/synth/n2a.sparse.above-below