Above and below options working correctly in all2all-sp mode.

refresh-bio · Aug 20, 2024 · dc3c151 · dc3c151
1 parent f093656
commit dc3c151
Show file tree

Hide file tree

Showing 7 changed files with 123 additions and 66 deletions.
diff --git a/.github/workflows/self-hosted.yml b/.github/workflows/self-hosted.yml
@@ -85,21 +85,40 @@ jobs:
       run: |
         ${EXEC} build -multisample-fasta -k 21 ${INPUT_DIR}/synth.list synth.db
     
-    - name: all2all (dense and sparse)
+    - name: all2all (dense)
       run: |     
         ${EXEC} all2all synth.db a2a
-        ${EXEC} all2all -sparse synth.db a2a-sparse
-        ${EXEC} all2all -sparse -below 40 -above 30 synth.db a2a.sparse.above-below.csv
-        ${EXEC} all2all -sparse -below_eq 39 -above_eq 31 synth.db a2a.sparse.above_eq-below_eq.csv
         cmp ${INPUT_DIR}/a2a a2a
+        
+    - name: all2all (sparse)
+      run: |
+        ${EXEC} all2all -sparse synth.db a2a-sparse
         cmp ${INPUT_DIR}/a2a-sparse a2a-sparse
-        cmp ${INPUT_DIR}/a2a.sparse.above-below.csv a2a.sparse.above-below.csv    
-        cmp ${INPUT_DIR}/a2a.sparse.above-below.csv a2a.sparse.above_eq-below_eq.csv    
+  
+    - name: all2all (sparse + above + below)
+      run: |
+        ${EXEC} all2all -sparse -below 40 -above 30 synth.db a2a.sparse.above-below
+        cmp ${INPUT_DIR}/a2a.sparse.above-below a2a.sparse.above-below
+
+    - name: all2all (sparse + above_eq + below_eq)
+      run: |
+        ${EXEC} all2all -sparse -below_eq 39 -above_eq 31 synth.db a2a.sparse.above_eq-below_eq
+        cmp ${INPUT_DIR}/a2a.sparse.above-below a2a.sparse.above_eq-below_eq 
         
     - name: all2all-sp
       run: | 
         ${EXEC} all2all-sp synth.db a2a-sp
         cmp ${INPUT_DIR}/a2a-sparse a2a-sp
+
+    - name: all2all-sp (sparse + above + below)
+      run: | 
+        ${EXEC} all2all-sp -below 40 -above 30 synth.db a2a-sp.above-below
+        cmp ${INPUT_DIR}/a2a.sparse.above-below a2a-sp.above-below    
+
+    - name: all2all-sp (sparse + above_eq + below_eq)
+      run: | 
+        ${EXEC} all2all-sp -below_eq 39 -above_eq 31 synth.db a2a-sp.above_eq-below_eq
+        cmp ${INPUT_DIR}/a2a.sparse.above-below a2a-sp.above_eq-below_eq   
     
     - name: all2all distances (dense -> dense)
       run: |
@@ -126,14 +145,20 @@ jobs:
         cmp ${INPUT_DIR}/a2a.mash.above-below a2a-sparse.mash
         
        
-    - name: new2all (dense and sparse)
+    - name: new2all (dense)
       run: |
         ${EXEC} new2all -multisample-fasta synth.db ${INPUT_DIR}/synth.list n2a
-        ${EXEC} new2all -multisample-fasta -sparse synth.db ${INPUT_DIR}/synth.list n2a-sparse 
-        ${EXEC} new2all -multisample-fasta -sparse -below 70 -above 20 synth.db ${INPUT_DIR}/synth.list n2a.sparse.above-below.csv
         cmp ${INPUT_DIR}/n2a n2a
+
+    - name: new2all (sparse)
+      run: |
+        ${EXEC} new2all -multisample-fasta -sparse synth.db ${INPUT_DIR}/synth.list n2a-sparse 
         cmp ${INPUT_DIR}/n2a-sparse n2a-sparse        
-        cmp ${INPUT_DIR}/n2a.sparse.above-below.csv n2a.sparse.above-below.csv
+
+    - name: new2all (sparse + above + below)
+      run: |
+        ${EXEC} new2all -multisample-fasta -sparse -below 70 -above 20 synth.db ${INPUT_DIR}/synth.list n2a.sparse.above-below
+        cmp ${INPUT_DIR}/n2a.sparse.above-below n2a.sparse.above-below
             
     - name: new2all distances (dense -> dense)
       run: |

diff --git a/README.md b/README.md
@@ -107,12 +107,12 @@ Kmer-db operates in one of the following modes:
 
 * `build` - building a database from samples,
 * `all2all` - counting common k-mers - all samples in the database,
-* `all2all-sp` - counting common k-mers - all samples in the database (sparse computation)
-* `all2all-parts` - counting common k-mers - all samples in the database parts (sparse computation)
+* `all2all-sp` - counting common k-mers - all samples in the database (sparse computation),
+* `all2all-parts` - counting common k-mers - all samples in the database parts (sparse computation),
 * `new2all` - counting common k-mers - set of new samples versus database,
 * `one2all` - counting common k-mers - single sample versus database,
 * `distance` - calculating similarities/distances,
-* `minhash` - storing minhashed k-mers,
+* `minhash` - storing minhashed k-mers.
 
 Common options:
 * `-t <threads>` - number of threads (default: number of available cores),
@@ -142,9 +142,9 @@ Parameters:
     ...
     ```
     By default, the tool requires uncompressed or compressed FASTA files for each sample. If a file on the list cannot be found, the package tries adding the following extensions: *fna*, *fasta*, *gz*, *fna.gz*, *fasta.gz*. When `-from-kmers` switch is specified, corresponding [KMC-generated](https://github.com/refresh-bio/KMC) k-mer files (*.kmc_pre* and *.kmc_suf*) are required. If `-from-minhash` switch is present, minhashed k-mer files (*.minhash*) must be generated by `minhash` command [prior to the database construction](#24-storing-minhashed-k-mers). Note that minhashing may also be done during the database construction by specifying the `-f` option.
-* `database` (output) - file with generated k-mer database. 
-* `-k <kmer-length>` - length of k-mers (default: 18); ignored when `-from-kmers` or `-from-minhash` switch is specified.
-* `-f <fraction>` - fraction of all k-mers to be accepted by the minhash filter during database construction (default: 1); ignored when `-from-minhash` switch is present.
+* `database` (output) - file with generated k-mer database, 
+* `-k <kmer-length>` - length of k-mers (default: 18); ignored when `-from-kmers` or `-from-minhash` switch is specified,
+* `-f <fraction>` - fraction of all k-mers to be accepted by the minhash filter during database construction (default: 1); ignored when `-from-minhash` switch is present,
 * `-multisample-fasta` - each sequence in a FASTA file is treated as a separate sample,
 * `-extend` - extend the existing database with new samples,
 * `-t <threads>` - number of threads (default: number of available cores).
@@ -155,7 +155,7 @@ Parameters:
 
 Dense computations - recommended when the distance matrix contains few zeros. Output can be stored in the dense or sparse form (`-sparse` switch).
 
-`kmer-db all2all [-buffer <size_mb>] [-sparse] [-t <threads>] [-above <v>] [-below <v>] [-above_eq <v>] [-below_eq <v>] <database> <common_table>`
+`kmer-db all2all [-buffer <size_mb>] [-t <threads>] [-sparse [-above <v>] [-below <v>] [-above_eq <v>] [-below_eq <v>]] <database> <common_table>`
 
 Sparse computations - recommended when the distance matrix contains many zeros. Output matrix is always in the sparse form:
 
@@ -168,30 +168,31 @@ Sparse computations, partial databases - use when the distance matrix contains m
 Parameters:
 * `database` (input) - k-mer database file created by `build` mode,
 * `db_list` (input) - file containing list of databases files created by `build` mode,
-* `common_table` (output) - file containing table with common k-mer counts.
-* `-buffer <size_mb>` - size of cache buffer in megabytes; use L3 size for Intel CPUs and L2 for AMD for best performance; default: 8
-* `-sparse` - stores output matrix in a sparse form,
-* `-above <v>` - retains elements greater then `<v>`
-* `-below <v>` - retains elements less then `<v>`
-* `-above_eq <v>` - retains elements greater or equal `<v>`
-* `-below_eq <v>` - retains elements less or equal `<v>`
-* `-t <threads>` - number of threads (default: number of available cores).
+* `common_table` (output) - file containing table with common k-mer counts,
+* `-buffer <size_mb>` - size of cache buffer in megabytes; use L3 size for Intel CPUs and L2 for AMD for best performance; default: 8,
+* `-t <threads>` - number of threads (default: number of available cores),
+* `-sparse` - stores output matrix in a sparse form (always on in `all2all-sp` and `all2all-parts` modes),
+* `-above <v>` - retains elements greater than `<v>`,
+* `-below <v>` - retains elements less than `<v>`,
+* `-above_eq <v>` - retains elements greater than or equal to `<v>`,
+* `-below_eq <v>` - retains elements less than or equal to `<v>`.
+
 
 ### New samples against the database:
 
-`kmer-db new2all [-multisample-fasta | -from-kmers | -from-minhash] [-sparse] [-t <threads>] [-above <v>] [-below <v>] [-above_eq <v>] [-below_eq <v>] <database> <sample_list> <common_table>`
+`kmer-db new2all [-multisample-fasta | -from-kmers | -from-minhash] [-t <threads>] [-sparse [-above <v>] [-below <v>] [-above_eq <v>] [-below_eq <v>]] <database> <sample_list> <common_table>`
 
 Parameters:
-* `database` (input) - k-mer database file created by `build` mode.
-* `sample_list` (input) - file containing list of samples in one of the supported formats (see `build` mode); if samples are given as genomes (default) or k-mers (`-from-kmers` switch), the minhashing is done automatically with the same filter as in the database.
-* `common_table` (output) - file containing table with common k-mer counts.
-* `-multisample-fasta` / `-from-kmers` / `-from-minhash` - see `build` mode for details.
+* `database` (input) - k-mer database file created by `build` mode,
+* `sample_list` (input) - file containing list of samples in one of the supported formats (see `build` mode); if samples are given as genomes (default) or k-mers (`-from-kmers` switch), the minhashing is done automatically with the same filter as in the database,
+* `common_table` (output) - file containing table with common k-mer counts,
+* `-multisample-fasta` / `-from-kmers` / `-from-minhash` - see `build` mode for details,
+* `-t <threads>` - number of threads (default: number of available cores),
 * `-sparse` - stores output matrix in a sparse form,
-* `-above <v>` - retains elements greater then `<v>`
-* `-below <v>` - retains elements less then `<v>`
-* `-above_eq <v>` - retains elements greater or equal `<v>`
-* `-below_eq <v>` - retains elements less or equal `<v>`
-* `-t <threads>` - number of threads (default: number of available cores).
+* `-above <v>` - retains elements greater than `<v>`,
+* `-below <v>` - retains elements less than `<v>`,
+* `-above_eq <v>` - retains elements greater than or equal to `<v>`,
+* `-below_eq <v>` - retains elements less than or equal to `<v>`.
 
 ### Single sample against the database:
 
@@ -239,20 +240,21 @@ When `-sparse` switch is specified, the table is stored in a sparse form. In par
 `kmer-db distance [<measures>] [-sparse] [-above <v>] [-below <v>] [-above_eq <v>] [-below_eq <v>] <common_table>`
 
 Parameters:
-* `common_table` (input) - file containing table with numbers of common k-mers produced by `all2all`, `new2all`, or `one2all` mode (both, dense and sparse matrices are supported). 
+* `common_table` (input) - file containing table with numbers of common k-mers produced by `all2all`, `new2all`, or `one2all` mode (both dense and sparse matrices are supported), 
 * `measures` - names of the similarity/distance measures to be calculated, can be one or several of the following (if not specified, `jaccard` is used): 
   * `jaccard`: $J(q,s) = |q \cap s| / |q \cup s|$, 
   * `min`: $\min(q,s) =  |q \cap s| / \min(|q|,|s|)$, 
   * `max`: $\max(q,s) =  |q \cap s| / \max(|q|,|s|)$, 
   * `cosine`: $\cos(q,s) = |q \cap s| / \sqrt{|q| \cdot |s|}$,  
-  * `mash` (Mash distance): $\textrm{Mash}(q,s) = -\frac{1}{k}ln\frac{2 \cdot J(q,s)}{1 + J(q,s)}$ 
-  * `ani` (average nucleotide identity): $\textrm{ANI}(q,s) = 1 - \textrm{Mash}(p,q)$
+  * `mash` (Mash distance): $\textrm{Mash}(q,s) = -\frac{1}{k}\ln\frac{2 \cdot J(q,s)}{1 + J(q,s)}$, 
+  * `ani` (average nucleotide identity): $\textrm{ANI}(q,s) = 1 - \textrm{Mash}(q,s)$,
+  * `ani-shorter` - same as `ani` but with `min` used instead of `jaccard`.
 * `-phylip-out` - store output distance matrix in a Phylip format,
 * `-sparse` - outputs a sparse matrix (only for dense input matrices - sparse inputs always produce sparse outputs),
-* `-above <v>` - retains elements greater then `<v>`
-* `-below <v>` - retains elements less then `<v>`
-* `-above_eq <v>` - retains elements greater or equal `<v>`
-* `-below_eq <v>` - retains elements less or equal `<v>`
+* `-above <v>` - retains elements greater than `<v>`,
+* `-below <v>` - retains elements less than `<v>`,
+* `-above_eq <v>` - retains elements greater than or equal to `<v>`,
+* `-below_eq <v>` - retains elements less than or equal to `<v>`.
 
 This mode generates a file with similarity/distance table for each selected measure. Name of the output file is produced by adding to the input file an extension with a measure name.
 
@@ -266,9 +268,9 @@ This is an optional analysis step which stores minhashed k-mers on the hard disk
 `kmer-db minhash -from-kmers [-f <fraction>] <sample_list>`
 
 Parameters:
- * `sample_list` (input) - file containing list of samples in one of the supported formats (see `build` mode). 
- * `-f <fraction>` - fraction of all k-mers to be accepted by the minhash filter (default: 0.01).
- * `-k <kmer-length>` - length of k-mers (default: 18; maximum: 30); ignored when `-from-kmers` switch is specified.
+ * `sample_list` (input) - file containing list of samples in one of the supported formats (see `build` mode), 
+ * `-f <fraction>` - fraction of all k-mers to be accepted by the minhash filter (default: 0.01),
+ * `-k <kmer-length>` - length of k-mers (default: 18; maximum: 30); ignored when `-from-kmers` switch is specified,
  * `-multisample-fasta` / `-from-kmers` - see `build` mode for details.
 
 For each sample from the list, a binary file with *.minhash* extension containing filtered k-mers is created.

diff --git a/src/console_all2all_sparse.cpp b/src/console_all2all_sparse.cpp
@@ -20,8 +20,8 @@ void All2AllSparseConsole::run(const Params& params) {
 	const std::string& dbFilename = params.files[0];
 	const std::string& similarityFile = params.files[1];
 
-	//uint32_t below = (uint32_t)lrint(params.below);
-	//uint32_t above = (uint32_t)std::max(0l, lrint(params.above));
+	uint32_t below = (uint32_t)lrint(params.below);
+	uint32_t above = (uint32_t)std::max(0l, lrint(params.above));
 
 	std::ifstream dbFile(dbFilename, std::ios::binary);
 	std::ofstream ofs(similarityFile, std::ios::binary);
@@ -58,11 +58,13 @@ void All2AllSparseConsole::run(const Params& params) {
 	*ptr++ = '\n';
 	ofs.write(row, ptr - row);
 
+	matrix.compact(below, above, params.numThreads);
+
 	for (size_t sid = 0; sid < db->getSamplesCount(); ++sid) {
 		ptr = row;
 		ptr += sprintf(ptr, "%s,%lu,", db->getSampleNames()[sid].c_str(), db->getSampleKmersCount()[sid]);
 
-		ptr += matrix.saveRowSparse(sid, ptr);
+		ptr += matrix.saveRowSparse(sid, ptr, 0);
 
 		*ptr++ = '\n';
 		ofs.write(row, ptr - row);