From 9c0def1209730363721ceff9a3353a0372335dc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Wed, 29 Jan 2020 13:50:55 -0500 Subject: [PATCH 01/22] Update version and license --- LICENSE | 2 +- README.md | 9 ++++----- src/prophasm.cpp | 7 +++---- src/version.h | 2 +- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/LICENSE b/LICENSE index 6173d0f..69d87ba 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License -Copyright (c) 2016-2017 Karel Brinda +Copyright (c) 2016-2020 Karel Brinda Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 6064073..05ee9a8 100644 --- a/README.md +++ b/README.md @@ -69,8 +69,8 @@ USAGE-BEGIN --> ``` Program: prophasm (a greedy assembler for k-mer set compression) -Version: 0.1.0 -Contact: Karel Brinda +Version: 0.1.1 +Contact: Karel Brinda Usage: prophasm [options] @@ -92,7 +92,6 @@ Command-line parameters: Note that '-' can be used for standard input/output. ``` - @@ -115,7 +114,7 @@ def extend_simplitig_forward (K, simplitig): S.remove (reverse_complement (kmer)) break return S, s - + def get_maximal_simplitig (K, initial_kmer): simplitig = initial_kmer K.remove (initial_kmer) @@ -124,7 +123,7 @@ def get_maximal_simplitig (K, initial_kmer): simplitig = reverse_completent (simplitig) K, simplitig = extend_simplitig_forward (K, simplitig) return K, simplitig - + def compute_simplitigs (kmers): K = set() for kmer in kmers: diff --git a/src/prophasm.cpp b/src/prophasm.cpp index 7518f1b..d201ff4 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -1,7 +1,7 @@ /* The MIT License - Copyright (c) 2016-2017 Karel Brinda + Copyright (c) 2016-2020 Karel Brinda Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -29,14 +29,13 @@ Description: Get k-mer sets from FASTA files, extract the intersection, and - assemble all the resulting k-mer sets into contigs. The assembly is + assemble all the resulting k-mer sets into simplitigs. The assembly is done by greedy enumeration of disjoint paths in the corresponding de-Bruijn graphs. Todo: * Find a library for sets of integers bigger than uint64_t (to support k-mers longer than 32). * Optimize loading FASTA files. - * Check memory consumption (and put it here). */ #include "kseq.h" #include "version.h" @@ -94,7 +93,7 @@ void print_help(){ "\n" << "Program: prophasm (a greedy assembler for k-mer set compression)\n" << "Version: " VERSION "\n" << - "Contact: Karel Brinda \n" << + "Contact: Karel Brinda \n" << "\n" << "Usage: prophasm [options]\n" << "\n" << diff --git a/src/version.h b/src/version.h index a3e6f45..080e36d 100644 --- a/src/version.h +++ b/src/version.h @@ -1,2 +1,2 @@ -#define VERSION "0.1.0" +#define VERSION "0.1.1" From 8673bcc4c8bed6d68447a53baf20f5b905da5f32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Wed, 29 Jan 2020 13:52:00 -0500 Subject: [PATCH 02/22] Rename contigs to simplitigs --- src/prophasm.cpp | 52 ++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index d201ff4..7161d9b 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -59,7 +59,7 @@ typedef uint64_t nkmer_t; typedef std::set set_t; const int32_t fasta_line_length=60; -const int32_t max_contig_length=10000000; +const int32_t max_simplitig_length=10000000; const int32_t max_allowed_kmer_length=sizeof(nkmer_t)*4; //const int32_t default_k=31; @@ -219,16 +219,16 @@ void debug_print_kmer_set(_set_T &set, int k, bool verbose){ } -struct contig_t{ +struct simplitig_t{ int32_t k; - /* contig buffer */ + /* simplitig buffer */ char *seq_buffer; - /* the first position of the contig */ + /* the first position of the simplitig */ char *l_ext; - /* the last position of the contig +1 (semiopen ) */ + /* the last position of the simplitig +1 (semiopen ) */ char *r_ext; /* min possible value of l_ext */ @@ -237,17 +237,17 @@ struct contig_t{ /* max possible value of l_ext */ char *r_ext_border; - contig_t(uint32_t _k){ + simplitig_t(uint32_t _k){ this->k=_k; - seq_buffer=new char[k+2*max_contig_length+1](); + seq_buffer=new char[k+2*max_simplitig_length+1](); l_ext_border=seq_buffer; - r_ext_border=seq_buffer+2*max_contig_length; + r_ext_border=seq_buffer+2*max_simplitig_length; } - int32_t new_contig(const char *base_kmer){ + int32_t new_simplitig(const char *base_kmer){ assert(static_cast(strlen(base_kmer))==k); - l_ext = r_ext = &seq_buffer[max_contig_length]; + l_ext = r_ext = &seq_buffer[max_simplitig_length]; *r_ext='\0'; for(int32_t i=0;i=r_ext_border) || (l_ext<=l_ext_border); } - int32_t print_to_fasta(FILE* fasta_file, const char* contig_name, const char *comment=nullptr) const { + int32_t print_to_fasta(FILE* fasta_file, const char* simplitig_name, const char *comment=nullptr) const { if (comment==nullptr){ - fprintf(fasta_file,">%s\n",contig_name); + fprintf(fasta_file,">%s\n",simplitig_name); } else { - fprintf(fasta_file,">%s %s\n",contig_name,comment); + fprintf(fasta_file,">%s %s\n",simplitig_name,comment); } char print_buffer[fasta_line_length+1]={0}; @@ -452,11 +452,11 @@ int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE* fstats, test_file(file, fasta_fn); } char kmer_str[max_allowed_kmer_length+1]; - contig_t contig(k); + simplitig_t simplitig(k); const std::vector nucls = {'A','C','G','T'}; //int32_t i=0; - int32_t contig_id=1; + int32_t simplitig_id=1; while(set.size()>0){ const auto central_nkmer=*(set.begin()); @@ -464,7 +464,7 @@ int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE* fstats, std::string central_kmer_string; decode_kmer(central_nkmer,k,central_kmer_string); - contig.new_contig(central_kmer_string.c_str()); + simplitig.new_simplitig(central_kmer_string.c_str()); typename _set_T::value_type nkmer; @@ -504,17 +504,17 @@ int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE* fstats, if(set.count( nkmer )){ //std::cerr << "extending " << c << std::endl; //debug_print_kmer_set(set,k); - //std::cerr << std::string(contig.l_ext) << c << std::endl; + //std::cerr << std::string(simplitig.l_ext) << c << std::endl; //std::cerr << std::endl; if(direction==0){ - contig.r_extend(c); + simplitig.r_extend(c); } else{ - contig.l_extend(c); + simplitig.l_extend(c); } set.erase(nkmer); - if(!contig.is_full()){ + if(!simplitig.is_full()){ extending=true; } break; @@ -526,16 +526,16 @@ int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE* fstats, //std::cerr << "====================" << std::endl; std::stringstream ss; - ss<<"c"< Date: Wed, 29 Jan 2020 13:55:10 -0500 Subject: [PATCH 03/22] Don't break fasta lines --- src/prophasm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index 7161d9b..ce925b3 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -58,8 +58,8 @@ typedef uint64_t nkmer_t; typedef std::set set_t; -const int32_t fasta_line_length=60; const int32_t max_simplitig_length=10000000; +const int32_t fasta_line_length=max_simplitig_length; // do not break fasta lines const int32_t max_allowed_kmer_length=sizeof(nkmer_t)*4; //const int32_t default_k=31; From e0edb18a47700ab9ad904e25f2c9249735f15ade Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Wed, 29 Jan 2020 13:57:18 -0500 Subject: [PATCH 04/22] Add make format --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a9acffd..fbc7fc6 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all help clean test prophasm readme +.PHONY: all help clean test prophasm readme format SHELL=/usr/bin/env bash -eo pipefail @@ -15,6 +15,9 @@ help: ## Print help message test: $(MAKE) -C tests +format: + clang-format -i src/*.cpp src/*.h + readme: f=$$(mktemp);\ echo $$f;\ From 9222b961d7bc7ed6f9a6bc1f4a00665cefeb9081 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Wed, 29 Jan 2020 14:04:48 -0500 Subject: [PATCH 05/22] Update cpp style --- .clang-format | 11 +++++++++++ Makefile | 2 +- src/prophasm.cpp | 2 ++ 3 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..f4cc551 --- /dev/null +++ b/.clang-format @@ -0,0 +1,11 @@ +BasedOnStyle: Google +Language: Cpp +Standard: Cpp11 +AlignConsecutiveAssignments: true +SortIncludes: true +UseTab: Never +IndentWidth: 4 +IndentCaseLabels: true +NamespaceIndentation: None +ColumnLimit: 100 + diff --git a/Makefile b/Makefile index fbc7fc6..0e61aab 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ test: $(MAKE) -C tests format: - clang-format -i src/*.cpp src/*.h + clang-format -i src/*.cpp #src/*.h readme: f=$$(mktemp);\ diff --git a/src/prophasm.cpp b/src/prophasm.cpp index ce925b3..03a8f1c 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -65,6 +65,7 @@ const int32_t max_allowed_kmer_length=sizeof(nkmer_t)*4; static const uint8_t nt4_nt256[] = "ACGTN"; +// clang-format off static const uint8_t nt256_nt4[] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -83,6 +84,7 @@ static const uint8_t nt256_nt4[] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; +// clang-format on KSEQ_INIT(gzFile, gzread) From 4fc7f6983902fa2a6e1703f63a191d0334288014 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Wed, 29 Jan 2020 14:05:11 -0500 Subject: [PATCH 06/22] Reformat using clang-format --- src/prophasm.cpp | 1179 ++++++++++++++++++++++------------------------ 1 file changed, 576 insertions(+), 603 deletions(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index 03a8f1c..ae637d1 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -1,67 +1,68 @@ /* - The MIT License - - Copyright (c) 2016-2020 Karel Brinda - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. + The MIT License + + Copyright (c) 2016-2020 Karel Brinda + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. */ /* Description: - Get k-mer sets from FASTA files, extract the intersection, and - assemble all the resulting k-mer sets into simplitigs. The assembly is - done by greedy enumeration of disjoint paths in the corresponding - de-Bruijn graphs. + Get k-mer sets from FASTA files, extract the intersection, and + assemble all the resulting k-mer sets into simplitigs. The assembly is + done by greedy enumeration of disjoint paths in the corresponding + de-Bruijn graphs. Todo: - * Find a library for sets of integers bigger than uint64_t (to support k-mers longer than 32). - * Optimize loading FASTA files. + * Find a library for sets of integers bigger than uint64_t (to support k-mers longer than +32). + * Optimize loading FASTA files. */ -#include "kseq.h" -#include "version.h" - +#include #include +#include +#include #include #include #include #include -#include -#include #include -#include #include #include -#include +#include + +#include "kseq.h" +#include "version.h" -//typedef __uint128_t nkmer_t; +// typedef __uint128_t nkmer_t; typedef uint64_t nkmer_t; typedef std::set set_t; -const int32_t max_simplitig_length=10000000; -const int32_t fasta_line_length=max_simplitig_length; // do not break fasta lines -const int32_t max_allowed_kmer_length=sizeof(nkmer_t)*4; -//const int32_t default_k=31; +const int32_t max_simplitig_length = 10000000; +const int32_t fasta_line_length = max_simplitig_length; // do not break fasta lines +const int32_t max_allowed_kmer_length = sizeof(nkmer_t) * 4; +// const int32_t default_k=31; static const uint8_t nt4_nt256[] = "ACGTN"; @@ -86,638 +87,610 @@ static const uint8_t nt256_nt4[] = { }; // clang-format on - KSEQ_INIT(gzFile, gzread) - -void print_help(){ - std::cerr << - "\n" << - "Program: prophasm (a greedy assembler for k-mer set compression)\n" << - "Version: " VERSION "\n" << - "Contact: Karel Brinda \n" << - "\n" << - "Usage: prophasm [options]\n" << - "\n" << - "Examples: prophasm -k 15 -i f1.fa -i f2.fa -x fx.fa\n" << - " - compute intersection of f1 and f2\n" << - " prophasm -k 15 -i f1.fa -i f2.fa -x fx.fa -o g1.fa -o g2.fa\n" << - " - compute intersection of f1 and f2, and subtract it from them\n" << - " prophasm -k 15 -i f1.fa -o g1.fa\n" << - " - re-assemble f1 to g1\n" << - "\n" << - "Command-line parameters:\n" << - " -k INT K-mer size.\n" << - " -i FILE Input FASTA file (can be used multiple times).\n" << - " -o FILE Output FASTA file (if used, must be used as many times as -i).\n" << - " -x FILE Compute intersection, subtract it, save it.\n" << - " -s FILE Output file with k-mer statistics.\n" << - //" -k INT K-mer size. [" << default_k << "]\n" << - " -S Silent mode.\n" << - "\n" << - "Note that '-' can be used for standard input/output. \n" << - std::endl; +void print_help() { + std::cerr << "\n" + << "Program: prophasm (a greedy assembler for k-mer set compression)\n" + << "Version: " VERSION "\n" + << "Contact: Karel Brinda \n" + << "\n" + << "Usage: prophasm [options]\n" + << "\n" + << "Examples: prophasm -k 15 -i f1.fa -i f2.fa -x fx.fa\n" + << " - compute intersection of f1 and f2\n" + << " prophasm -k 15 -i f1.fa -i f2.fa -x fx.fa -o g1.fa -o g2.fa\n" + << " - compute intersection of f1 and f2, and subtract it from them\n" + << " prophasm -k 15 -i f1.fa -o g1.fa\n" + << " - re-assemble f1 to g1\n" + << "\n" + << "Command-line parameters:\n" + << " -k INT K-mer size.\n" + << " -i FILE Input FASTA file (can be used multiple times).\n" + << " -o FILE Output FASTA file (if used, must be used as many times as -i).\n" + << " -x FILE Compute intersection, subtract it, save it.\n" + << " -s FILE Output file with k-mer statistics.\n" + << + //" -k INT K-mer size. [" << default_k << "]\n" << + " -S Silent mode.\n" + << "\n" + << "Note that '-' can be used for standard input/output. \n" + << std::endl; } -void test_file(FILE *fo, std::string fn){ - if(fo==nullptr){ - std::cerr << "Error: file '" << fn << "' could not be open (error " << errno << ", " << strerror(errno) << ")." << std::endl; - exit(1); - } +void test_file(FILE *fo, std::string fn) { + if (fo == nullptr) { + std::cerr << "Error: file '" << fn << "' could not be open (error " << errno << ", " + << strerror(errno) << ")." << std::endl; + exit(1); + } } -template -int32_t encode_forward(const char *kmers, const int32_t k, _nkmer_T &nkmer){ - nkmer=0; - for (int32_t i=0;i(kmers[i])]; - if (nt4==4){ - return -1; - } - - nkmer <<= 2; - nkmer |= nt4; - } - return 0; +template +int32_t encode_forward(const char *kmers, const int32_t k, _nkmer_T &nkmer) { + nkmer = 0; + for (int32_t i = 0; i < k; i++) { + uint8_t nt4 = nt256_nt4[static_cast(kmers[i])]; + if (nt4 == 4) { + return -1; + } + + nkmer <<= 2; + nkmer |= nt4; + } + return 0; } -template -int32_t encode_reverse(const char *kmers, const int32_t k, _nkmer_T &nkmer){ - nkmer=0; - for (int32_t i=0;i(kmers[k-i-1])]; - if (nt4==4){ - return -1; - } - - //complement - nt4 = 3-nt4; - - nkmer <<= 2; - nkmer |= nt4; - } - return 0; +template +int32_t encode_reverse(const char *kmers, const int32_t k, _nkmer_T &nkmer) { + nkmer = 0; + for (int32_t i = 0; i < k; i++) { + uint8_t nt4 = nt256_nt4[static_cast(kmers[k - i - 1])]; + if (nt4 == 4) { + return -1; + } + + // complement + nt4 = 3 - nt4; + + nkmer <<= 2; + nkmer |= nt4; + } + return 0; } -template -int32_t encode_canonical(const char *kmers, const int32_t k, _nkmer_T &nkmer){ - _nkmer_T nkmer_f; - _nkmer_T nkmer_r; +template +int32_t encode_canonical(const char *kmers, const int32_t k, _nkmer_T &nkmer) { + _nkmer_T nkmer_f; + _nkmer_T nkmer_r; - int32_t error_code; + int32_t error_code; - error_code=encode_forward(kmers, k, nkmer_f); - if(error_code!=0){ - return error_code; - } + error_code = encode_forward(kmers, k, nkmer_f); + if (error_code != 0) { + return error_code; + } - error_code=encode_reverse(kmers, k, nkmer_r); - if(error_code!=0){ - return error_code; - } + error_code = encode_reverse(kmers, k, nkmer_r); + if (error_code != 0) { + return error_code; + } - nkmer=std::min(nkmer_f,nkmer_r); + nkmer = std::min(nkmer_f, nkmer_r); - return 0; + return 0; } -template -int32_t decode_kmer(_nkmer_T nkmer, int32_t k, std::string &kmer){ - kmer.resize(k); - for(int32_t i=0;i>=2; - kmer[k-i-1]=nt4_nt256[nt4]; - } +template +int32_t decode_kmer(_nkmer_T nkmer, int32_t k, std::string &kmer) { + kmer.resize(k); + for (int32_t i = 0; i < k; i++) { + uint8_t nt4 = nkmer & 0x3; + nkmer >>= 2; + kmer[k - i - 1] = nt4_nt256[nt4]; + } - return 0; + return 0; } -void reverse_complement_in_place(std::string &kmer){ - //std::cerr << "before reverse complementing " << kmer << std::endl; +void reverse_complement_in_place(std::string &kmer) { + // std::cerr << "before reverse complementing " << kmer << std::endl; std::reverse(kmer.begin(), kmer.end()); - for (int32_t i=0;i(kmer.size());i++){ - char nt4=nt256_nt4[static_cast(kmer[i])]; - if (nt4<4){ - nt4=3-nt4; - } - kmer[i]=nt4_nt256[static_cast(nt4)]; - } - //std::cerr << "after reverse complementing " << kmer << std::endl; + for (int32_t i = 0; i < static_cast(kmer.size()); i++) { + char nt4 = nt256_nt4[static_cast(kmer[i])]; + if (nt4 < 4) { + nt4 = 3 - nt4; + } + kmer[i] = nt4_nt256[static_cast(nt4)]; + } + // std::cerr << "after reverse complementing " << kmer << std::endl; } -template -void debug_print_kmer_set(_set_T &set, int k, bool verbose){ - std::string kmer; - for(auto x: set){ - decode_kmer(x, k, kmer); - if(verbose){ - std::cerr << x << " " << kmer << "; "; - } - } - if(verbose){ - std::cerr< +void debug_print_kmer_set(_set_T &set, int k, bool verbose) { + std::string kmer; + for (auto x : set) { + decode_kmer(x, k, kmer); + if (verbose) { + std::cerr << x << " " << kmer << "; "; + } + } + if (verbose) { + std::cerr << std::endl; + } } +struct simplitig_t { + int32_t k; -struct simplitig_t{ - int32_t k; - - /* simplitig buffer */ - char *seq_buffer; + /* simplitig buffer */ + char *seq_buffer; - /* the first position of the simplitig */ - char *l_ext; + /* the first position of the simplitig */ + char *l_ext; - /* the last position of the simplitig +1 (semiopen ) */ - char *r_ext; + /* the last position of the simplitig +1 (semiopen ) */ + char *r_ext; - /* min possible value of l_ext */ - char *l_ext_border; + /* min possible value of l_ext */ + char *l_ext_border; - /* max possible value of l_ext */ - char *r_ext_border; + /* max possible value of l_ext */ + char *r_ext_border; - simplitig_t(uint32_t _k){ - this->k=_k; - seq_buffer=new char[k+2*max_simplitig_length+1](); - l_ext_border=seq_buffer; - r_ext_border=seq_buffer+2*max_simplitig_length; - } + simplitig_t(uint32_t _k) { + this->k = _k; + seq_buffer = new char[k + 2 * max_simplitig_length + 1](); + l_ext_border = seq_buffer; + r_ext_border = seq_buffer + 2 * max_simplitig_length; + } - int32_t new_simplitig(const char *base_kmer){ - assert(static_cast(strlen(base_kmer))==k); + int32_t new_simplitig(const char *base_kmer) { + assert(static_cast(strlen(base_kmer)) == k); - l_ext = r_ext = &seq_buffer[max_simplitig_length]; - *r_ext='\0'; + l_ext = r_ext = &seq_buffer[max_simplitig_length]; + *r_ext = '\0'; - for(int32_t i=0;i(c)]; + int32_t r_extend(char c) { + uint8_t nt4 = nt256_nt4[static_cast(c)]; - if (nt4==4){ - return -1; - } + if (nt4 == 4) { + return -1; + } - *r_ext=nt4_nt256[nt4]; - ++r_ext; - *r_ext='\0'; - return 0; - } + *r_ext = nt4_nt256[nt4]; + ++r_ext; + *r_ext = '\0'; + return 0; + } - int32_t l_extend(char c){ - uint8_t nt4 = nt256_nt4[static_cast(c)]; + int32_t l_extend(char c) { + uint8_t nt4 = nt256_nt4[static_cast(c)]; - if (nt4==4){ - return -1; - } + if (nt4 == 4) { + return -1; + } - --l_ext; - *l_ext=nt4_nt256[3-nt4]; - return 0; - } + --l_ext; + *l_ext = nt4_nt256[3 - nt4]; + return 0; + } - ~simplitig_t(){ - delete[] seq_buffer; - } + ~simplitig_t() { delete[] seq_buffer; } - bool is_full(){ - return (r_ext>=r_ext_border) || (l_ext<=l_ext_border); - } + bool is_full() { return (r_ext >= r_ext_border) || (l_ext <= l_ext_border); } - int32_t print_to_fasta(FILE* fasta_file, const char* simplitig_name, const char *comment=nullptr) const { - if (comment==nullptr){ - fprintf(fasta_file,">%s\n",simplitig_name); - } else { - fprintf(fasta_file,">%s %s\n",simplitig_name,comment); - } + int32_t print_to_fasta(FILE *fasta_file, const char *simplitig_name, + const char *comment = nullptr) const { + if (comment == nullptr) { + fprintf(fasta_file, ">%s\n", simplitig_name); + } else { + fprintf(fasta_file, ">%s %s\n", simplitig_name, comment); + } - char print_buffer[fasta_line_length+1]={0}; + char print_buffer[fasta_line_length + 1] = {0}; - for (char *p=l_ext;p -template -int kmers_from_fasta(const std::string &fasta_fn, _set_T &set, int32_t k, FILE* fstats,bool verbose){ - - if (verbose){ - std::cerr << "Loading " << fasta_fn << std::endl; - } - - set.clear(); - - kseq_t *seq; - int64_t l; - - FILE *instream = nullptr; - if(fasta_fn=="-"){ - instream = stdin; - } - else { - instream = fopen(fasta_fn.c_str(), "r"); - test_file(instream, fasta_fn); - } - gzFile fp = gzdopen(fileno(instream), "r"); - seq = kseq_init(fp); - - typename _set_T::value_type nkmer; - - for(int32_t seqid=0;(l = kseq_read(seq)) >= 0;seqid++) { - //std::cerr << "kmers from fasta" << std::endl; - - //std::cerr << "starting iterator" << std::endl; - for(char *kmer=seq->seq.s; kmer < (seq->seq.s) + (seq->seq.l) -k +1 ;kmer++){ - int c = encode_canonical(kmer, k, nkmer); - if (c==0){ - set.insert(nkmer); - } - else{ - //std::cerr << "problem" < +template +int kmers_from_fasta(const std::string &fasta_fn, _set_T &set, int32_t k, FILE *fstats, + bool verbose) { + if (verbose) { + std::cerr << "Loading " << fasta_fn << std::endl; + } + + set.clear(); + + kseq_t *seq; + int64_t l; + + FILE *instream = nullptr; + if (fasta_fn == "-") { + instream = stdin; + } else { + instream = fopen(fasta_fn.c_str(), "r"); + test_file(instream, fasta_fn); + } + gzFile fp = gzdopen(fileno(instream), "r"); + seq = kseq_init(fp); + + typename _set_T::value_type nkmer; + + for (int32_t seqid = 0; (l = kseq_read(seq)) >= 0; seqid++) { + // std::cerr << "kmers from fasta" << std::endl; + + // std::cerr << "starting iterator" << std::endl; + for (char *kmer = seq->seq.s; kmer < (seq->seq.s) + (seq->seq.l) - k + 1; kmer++) { + int c = encode_canonical(kmer, k, nkmer); + if (c == 0) { + set.insert(nkmer); + } else { + // std::cerr << "problem" < -int32_t find_intersection(const std::vector<_set_T> &sets, _set_T &intersection){ - assert(sets.size()>0); +template +int32_t find_intersection(const std::vector<_set_T> &sets, _set_T &intersection) { + assert(sets.size() > 0); - /* - 1) Find the smallest set from sets. - */ + /* + 1) Find the smallest set from sets. + */ - int32_t min=std::numeric_limits::max(); - int32_t i_min=-1; + int32_t min = std::numeric_limits::max(); + int32_t i_min = -1; - //std::cerr << "searching min" << std::endl; + // std::cerr << "searching min" << std::endl; - for(int32_t i=0;i(sets.size());i++){ - if (static_cast(sets[i].size())(sets.size()); i++) { + if (static_cast(sets[i].size()) < min) { + min = sets[i].size(); + i_min = i; + // std::cerr << "new min" << i << std::endl; + } + } - assert(i_min != std::numeric_limits::max() && i_min!=-1); + assert(i_min != std::numeric_limits::max() && i_min != -1); - /* - 2) Take it as the intersection. - */ + /* + 2) Take it as the intersection. + */ - //std::cerr << "2" << std::endl; + // std::cerr << "2" << std::endl; - intersection.clear(); - //std::cerr << "2.1" << std::endl; - std::copy( - sets[i_min].cbegin(), sets[i_min].cend(), - std::inserter(intersection,intersection.end()) - ); + intersection.clear(); + // std::cerr << "2.1" << std::endl; + std::copy(sets[i_min].cbegin(), sets[i_min].cend(), + std::inserter(intersection, intersection.end())); - /* - 3) Remove elements from intersection present in other sets. - */ + /* + 3) Remove elements from intersection present in other sets. + */ - for(const _set_T ¤t_set : sets) { + for (const _set_T ¤t_set : sets) { + for (auto it = intersection.begin(); it != intersection.end();) { + if (current_set.find(*it) == current_set.cend()) { + it = intersection.erase(it); + } else { + ++it; + } + } + } - for(auto it = intersection.begin(); it !=intersection.end();){ - - if(current_set.find(*it) == current_set.cend()){ - it=intersection.erase(it); - } - else{ - ++it; - } - } - } - - return 0; + return 0; } +template +int32_t remove_subset(std::vector<_set_T> &sets, const _subset_T &subset) { + for (int32_t i = 0; i < static_cast(sets.size()); i++) { + _set_T ¤t_set = sets[i]; -template -int32_t remove_subset(std::vector<_set_T> &sets, const _subset_T &subset){ - - for(int32_t i=0;i(sets.size());i++){ + for (const auto &nkmer : subset) { + current_set.erase(nkmer); + } + } - _set_T ¤t_set = sets[i]; - - for(const auto &nkmer : subset){ - current_set.erase(nkmer); - } - } - - return 0; + return 0; } - -template -int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE* fstats, bool verbose){ - if(fstats){ - fprintf(fstats,"%s\t%lu\n",fasta_fn.c_str(),set.size()); - } - - FILE *file=nullptr; - if(fasta_fn=="-"){ - file=stdout; - } - else{ - file=fopen(fasta_fn.c_str(),"w+"); - test_file(file, fasta_fn); - } - char kmer_str[max_allowed_kmer_length+1]; - simplitig_t simplitig(k); - const std::vector nucls = {'A','C','G','T'}; - - //int32_t i=0; - int32_t simplitig_id=1; - while(set.size()>0){ - - const auto central_nkmer=*(set.begin()); - set.erase(central_nkmer); - - std::string central_kmer_string; - decode_kmer(central_nkmer,k,central_kmer_string); - simplitig.new_simplitig(central_kmer_string.c_str()); - - typename _set_T::value_type nkmer; - - - for (int direction=0;direction<2;direction++){ - - //std::cerr << "direction " << direction << std::endl; - - if (direction==0){ - // forward - } - else{ - // reverse - reverse_complement_in_place(central_kmer_string); - } - - strncpy(kmer_str,central_kmer_string.c_str(),k); - kmer_str[k]='\0'; - - bool extending = true; - - //std::cerr << "central k-mer: " << central_kmer_string << std::endl; - - - while (extending){ - for(int32_t i=0;i +int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE *fstats, bool verbose) { + if (fstats) { + fprintf(fstats, "%s\t%lu\n", fasta_fn.c_str(), set.size()); + } + + FILE *file = nullptr; + if (fasta_fn == "-") { + file = stdout; + } else { + file = fopen(fasta_fn.c_str(), "w+"); + test_file(file, fasta_fn); + } + char kmer_str[max_allowed_kmer_length + 1]; + simplitig_t simplitig(k); + const std::vector nucls = {'A', 'C', 'G', 'T'}; + + // int32_t i=0; + int32_t simplitig_id = 1; + while (set.size() > 0) { + const auto central_nkmer = *(set.begin()); + set.erase(central_nkmer); + + std::string central_kmer_string; + decode_kmer(central_nkmer, k, central_kmer_string); + simplitig.new_simplitig(central_kmer_string.c_str()); + + typename _set_T::value_type nkmer; + + for (int direction = 0; direction < 2; direction++) { + // std::cerr << "direction " << direction << std::endl; + + if (direction == 0) { + // forward + } else { + // reverse + reverse_complement_in_place(central_kmer_string); + } + + strncpy(kmer_str, central_kmer_string.c_str(), k); + kmer_str[k] = '\0'; + + bool extending = true; + + // std::cerr << "central k-mer: " << central_kmer_string << std::endl; + + while (extending) { + for (int32_t i = 0; i < k; i++) { + kmer_str[i] = kmer_str[i + 1]; + } + kmer_str[k] = '\0'; + + extending = false; + for (const char &c : nucls) { + kmer_str[k - 1] = c; + + encode_canonical(kmer_str, k, nkmer); + + if (set.count(nkmer)) { + // std::cerr << "extending " << c << std::endl; + // debug_print_kmer_set(set,k); + // std::cerr << std::string(simplitig.l_ext) << c << std::endl; + // std::cerr << std::endl; + if (direction == 0) { + simplitig.r_extend(c); + } else { + simplitig.l_extend(c); + } + set.erase(nkmer); + + if (!simplitig.is_full()) { + extending = true; + } + break; + } + } + } + } + + // std::cerr << "====================" << std::endl; + + std::stringstream ss; + ss << "c" << simplitig_id; + const std::string simplitig_name(ss.str()); + simplitig.print_to_fasta(file, simplitig_name.c_str()); + simplitig_id++; + } + + fclose(file); + + if (verbose) { + std::cerr << " assembly finished (" << simplitig_id << " simplitigs)" << std::endl; + } + + return 0; } - -int main (int argc, char* argv[]) -{ - int32_t k=-1; - - std::string intersection_fn; - std::vector in_fns; - std::vector out_fns; - std::string stats_fn; - FILE *fstats=nullptr; - - if (argc<2){ - print_help(); - exit(1); - } - - bool compute_intersection=false; - bool compute_output=false; - bool verbose=true; - int32_t no_sets=0; - - int c; - while ((c = getopt(argc, (char *const *)argv, "hSi:o:x:s:k:")) >= 0) { - switch (c) { - case 'h': { - print_help(); - exit(0); - break; - } - case 'i': { - in_fns.push_back(std::string(optarg)); - no_sets+=1; - break; - } - case 'o': { - out_fns.push_back(std::string(optarg)); - compute_output=true; - break; - } - case 'x': { - intersection_fn=std::string(optarg); - compute_intersection=true; - break; - } - case 's': { - stats_fn=std::string(optarg); - if(stats_fn=="-"){ - fstats = stdin; - } - else { - fstats = fopen(stats_fn.c_str(), "w+"); - test_file(fstats, stats_fn); - } - - break; - } - case 'S': { - verbose=false; - - break; - } - case 'k': { - k = atoi(optarg); - break; - } - case '?': { - std::cerr<<"Unknown error"<(out_fns.size())!=no_sets)){ - std::cerr << "If -o is used, it must be used as many times as -i (" << no_sets << "!=" << out_fns.size() << ")." << std::endl; - return EXIT_FAILURE; - } - - if(fstats){ - fprintf(fstats,"# cmd: %s",argv[0]); - - for (int32_t i=1;i > full_sets(no_sets); - - if(verbose){ - std::cerr << "=====================" << std::endl; - std::cerr << "1) Loading references" << std::endl; - std::cerr << "=====================" << std::endl; - } - - - std::vector in_sizes; - std::vector out_sizes; - - for(int32_t i=0;i intersection; - - int32_t intersection_size = 0; - - if(compute_intersection){ - if (verbose){ - std::cerr << "2.1) Computing intersection" << std::endl; - } - - find_intersection(full_sets, intersection); - intersection_size = intersection.size(); - if (verbose){ - std::cerr << " intersection size: " << intersection_size << std::endl; - } - if(compute_output){ - if (verbose){ - std::cerr << "2.2) Removing this intersection from all kmer sets" << std::endl; - } - remove_subset(full_sets, intersection); - } - } - - if(compute_output){ - for (int32_t i=0;i(in_fns.size());i++){ - assemble(out_fns[i], full_sets[i], k, fstats, verbose); - } - } - if(compute_intersection){ - assemble(intersection_fn, intersection, k, fstats, verbose); - } - - if (fstats){ - fclose(fstats); - } - - return 0; +int main(int argc, char *argv[]) { + int32_t k = -1; + + std::string intersection_fn; + std::vector in_fns; + std::vector out_fns; + std::string stats_fn; + FILE *fstats = nullptr; + + if (argc < 2) { + print_help(); + exit(1); + } + + bool compute_intersection = false; + bool compute_output = false; + bool verbose = true; + int32_t no_sets = 0; + + int c; + while ((c = getopt(argc, (char *const *)argv, "hSi:o:x:s:k:")) >= 0) { + switch (c) { + case 'h': { + print_help(); + exit(0); + break; + } + case 'i': { + in_fns.push_back(std::string(optarg)); + no_sets += 1; + break; + } + case 'o': { + out_fns.push_back(std::string(optarg)); + compute_output = true; + break; + } + case 'x': { + intersection_fn = std::string(optarg); + compute_intersection = true; + break; + } + case 's': { + stats_fn = std::string(optarg); + if (stats_fn == "-") { + fstats = stdin; + } else { + fstats = fopen(stats_fn.c_str(), "w+"); + test_file(fstats, stats_fn); + } + + break; + } + case 'S': { + verbose = false; + + break; + } + case 'k': { + k = atoi(optarg); + break; + } + case '?': { + std::cerr << "Unknown error" << std::endl; + exit(1); + break; + } + } + } + + if (k == -1) { + print_help(); + std::cerr << "K-mer size (-k) is required." << std::endl; + return EXIT_FAILURE; + } + + if (k <= 0 || max_allowed_kmer_length < k) { + std::cerr << "K-mer size must satisfy 1 <= k <= " << max_allowed_kmer_length << "." + << std::endl; + return EXIT_FAILURE; + } + + if (compute_output && (static_cast(out_fns.size()) != no_sets)) { + std::cerr << "If -o is used, it must be used as many times as -i (" << no_sets + << "!=" << out_fns.size() << ")." << std::endl; + return EXIT_FAILURE; + } + + if (fstats) { + fprintf(fstats, "# cmd: %s", argv[0]); + + for (int32_t i = 1; i < argc; i++) { + fprintf(fstats, " %s", argv[i]); + } + fprintf(fstats, "\n"); + } + + std::vector> full_sets(no_sets); + + if (verbose) { + std::cerr << "=====================" << std::endl; + std::cerr << "1) Loading references" << std::endl; + std::cerr << "=====================" << std::endl; + } + + std::vector in_sizes; + std::vector out_sizes; + + for (int32_t i = 0; i < no_sets; i++) { + kmers_from_fasta(in_fns[i], full_sets[i], k, fstats, verbose); + // debug_print_kmer_set(full_sets[i],k); + in_sizes.insert(in_sizes.end(), full_sets[i].size()); + } + + if (verbose) { + std::cerr << "===============" << std::endl; + std::cerr << "2) Intersecting" << std::endl; + std::cerr << "===============" << std::endl; + } + + std::unordered_set intersection; + + int32_t intersection_size = 0; + + if (compute_intersection) { + if (verbose) { + std::cerr << "2.1) Computing intersection" << std::endl; + } + + find_intersection(full_sets, intersection); + intersection_size = intersection.size(); + if (verbose) { + std::cerr << " intersection size: " << intersection_size << std::endl; + } + if (compute_output) { + if (verbose) { + std::cerr << "2.2) Removing this intersection from all kmer sets" << std::endl; + } + remove_subset(full_sets, intersection); + } + } + + if (compute_output) { + for (int32_t i = 0; i < no_sets; i++) { + out_sizes.insert(out_sizes.end(), full_sets[i].size()); + assert(in_sizes[i] == out_sizes[i] + intersection_size); + if (verbose) { + std::cerr << in_sizes[i] << " " << out_sizes[i] << " ...inter:" << intersection_size + << std::endl; + } + } + } + + if (verbose) { + std::cerr << "=============" << std::endl; + std::cerr << "3) Assembling" << std::endl; + std::cerr << "=============" << std::endl; + } + + if (compute_output) { + for (int32_t i = 0; i < static_cast(in_fns.size()); i++) { + assemble(out_fns[i], full_sets[i], k, fstats, verbose); + } + } + if (compute_intersection) { + assemble(intersection_fn, intersection, k, fstats, verbose); + } + + if (fstats) { + fclose(fstats); + } + + return 0; } From d70da2f68fb5732444ee3bbb923bfd4bb3509075 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Wed, 29 Jan 2020 14:08:56 -0500 Subject: [PATCH 07/22] Cleanup --- src/prophasm.cpp | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index ae637d1..931ac19 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -34,8 +34,8 @@ de-Bruijn graphs. Todo: - * Find a library for sets of integers bigger than uint64_t (to support k-mers longer than -32). + * Find a library for sets of integers bigger than uint64_t + (to support k-mers longer than 32). * Optimize loading FASTA files. */ #include @@ -370,8 +370,6 @@ int32_t find_intersection(const std::vector<_set_T> &sets, _set_T &intersection) int32_t min = std::numeric_limits::max(); int32_t i_min = -1; - // std::cerr << "searching min" << std::endl; - for (int32_t i = 0; i < static_cast(sets.size()); i++) { if (static_cast(sets[i].size()) < min) { min = sets[i].size(); @@ -385,11 +383,7 @@ int32_t find_intersection(const std::vector<_set_T> &sets, _set_T &intersection) /* 2) Take it as the intersection. */ - - // std::cerr << "2" << std::endl; - intersection.clear(); - // std::cerr << "2.1" << std::endl; std::copy(sets[i_min].cbegin(), sets[i_min].cend(), std::inserter(intersection, intersection.end())); @@ -468,7 +462,6 @@ int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE *fstats, bool extending = true; // std::cerr << "central k-mer: " << central_kmer_string << std::endl; - while (extending) { for (int32_t i = 0; i < k; i++) { kmer_str[i] = kmer_str[i + 1]; @@ -502,8 +495,6 @@ int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE *fstats, } } - // std::cerr << "====================" << std::endl; - std::stringstream ss; ss << "c" << simplitig_id; const std::string simplitig_name(ss.str()); From cbe303af55048b3fd18ede39103dfb2e6becaa89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 12 May 2020 08:02:51 -0400 Subject: [PATCH 08/22] Print simplitig directly --- src/prophasm.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index 931ac19..8742c7b 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -60,7 +60,6 @@ typedef uint64_t nkmer_t; typedef std::set set_t; const int32_t max_simplitig_length = 10000000; -const int32_t fasta_line_length = max_simplitig_length; // do not break fasta lines const int32_t max_allowed_kmer_length = sizeof(nkmer_t) * 4; // const int32_t default_k=31; @@ -294,12 +293,7 @@ struct simplitig_t { fprintf(fasta_file, ">%s %s\n", simplitig_name, comment); } - char print_buffer[fasta_line_length + 1] = {0}; - - for (char *p = l_ext; p < r_ext; p += fasta_line_length) { - strncpy(print_buffer, p, fasta_line_length); - fprintf(fasta_file, "%s\n", print_buffer); - } + fprintf(fasta_file, "%s\n", l_ext); return 0; } From de1b7f486e38290c84bd2f492117043f00013262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 12 May 2020 08:26:30 -0400 Subject: [PATCH 09/22] Improve output messages --- src/prophasm.cpp | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index 8742c7b..f54ff8e 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -104,13 +104,13 @@ void print_help() { << " - re-assemble f1 to g1\n" << "\n" << "Command-line parameters:\n" - << " -k INT K-mer size.\n" + << " -k INT K-mer length.\n" << " -i FILE Input FASTA file (can be used multiple times).\n" << " -o FILE Output FASTA file (if used, must be used as many times as -i).\n" << " -x FILE Compute intersection, subtract it, save it.\n" << " -s FILE Output file with k-mer statistics.\n" << - //" -k INT K-mer size. [" << default_k << "]\n" << + //" -k INT K-mer length. [" << default_k << "]\n" << " -S Silent mode.\n" << "\n" << "Note that '-' can be used for standard input/output. \n" @@ -308,7 +308,7 @@ template int kmers_from_fasta(const std::string &fasta_fn, _set_T &set, int32_t k, FILE *fstats, bool verbose) { if (verbose) { - std::cerr << "Loading " << fasta_fn << std::endl; + std::cerr << " loading " << fasta_fn << std::endl; } set.clear(); @@ -429,7 +429,9 @@ int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE *fstats, const std::vector nucls = {'A', 'C', 'G', 'T'}; // int32_t i=0; - int32_t simplitig_id = 1; + int64_t simplitig_id = 1; + int64_t kmers = set.size(); + while (set.size() > 0) { const auto central_nkmer = *(set.begin()); set.erase(central_nkmer); @@ -498,8 +500,11 @@ int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE *fstats, fclose(file); + const int64_t ns = simplitig_id; + const int64_t cl = kmers + ns * (k - 1); if (verbose) { - std::cerr << " assembly finished (" << simplitig_id << " simplitigs)" << std::endl; + std::cerr << " simplitig computation finished (" << ns << " simplitigs, " + << cl / (1024.0 * 1024.0) << " Mbp)" << std::endl; } return 0; @@ -577,12 +582,12 @@ int main(int argc, char *argv[]) { if (k == -1) { print_help(); - std::cerr << "K-mer size (-k) is required." << std::endl; + std::cerr << "K-mer length (-k) is required." << std::endl; return EXIT_FAILURE; } if (k <= 0 || max_allowed_kmer_length < k) { - std::cerr << "K-mer size must satisfy 1 <= k <= " << max_allowed_kmer_length << "." + std::cerr << "K-mer length must satisfy 1 <= k <= " << max_allowed_kmer_length << "." << std::endl; return EXIT_FAILURE; } @@ -605,9 +610,9 @@ int main(int argc, char *argv[]) { std::vector> full_sets(no_sets); if (verbose) { - std::cerr << "=====================" << std::endl; - std::cerr << "1) Loading references" << std::endl; - std::cerr << "=====================" << std::endl; + std::cerr << "======================" << std::endl; + std::cerr << "1) Loading input files" << std::endl; + std::cerr << "======================" << std::endl; } std::vector in_sizes; @@ -631,17 +636,17 @@ int main(int argc, char *argv[]) { if (compute_intersection) { if (verbose) { - std::cerr << "2.1) Computing intersection" << std::endl; + std::cerr << "2.1) Computing the intersection" << std::endl; } find_intersection(full_sets, intersection); intersection_size = intersection.size(); if (verbose) { - std::cerr << " intersection size: " << intersection_size << std::endl; + std::cerr << " intersection size: " << intersection_size << " k-mers" << std::endl; } if (compute_output) { if (verbose) { - std::cerr << "2.2) Removing this intersection from all kmer sets" << std::endl; + std::cerr << "2.2) Computing set differences" << std::endl; } remove_subset(full_sets, intersection); } @@ -652,16 +657,18 @@ int main(int argc, char *argv[]) { out_sizes.insert(out_sizes.end(), full_sets[i].size()); assert(in_sizes[i] == out_sizes[i] + intersection_size); if (verbose) { - std::cerr << in_sizes[i] << " " << out_sizes[i] << " ...inter:" << intersection_size + std::cerr << " input size: " << in_sizes[i] + << " k-mers, output size: " << out_sizes[i] + << " k-mers, intersection size: " << intersection_size << " k-mers" << std::endl; } } } if (verbose) { - std::cerr << "=============" << std::endl; - std::cerr << "3) Assembling" << std::endl; - std::cerr << "=============" << std::endl; + std::cerr << "=======================" << std::endl; + std::cerr << "3) Computing simplitigs" << std::endl; + std::cerr << "=======================" << std::endl; } if (compute_output) { From 4d7e7aa4477befbd1f302b58058664798c3b775f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 12 May 2020 08:27:37 -0400 Subject: [PATCH 10/22] Fix ns and cl --- src/prophasm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index f54ff8e..42a9553 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -500,7 +500,7 @@ int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE *fstats, fclose(file); - const int64_t ns = simplitig_id; + const int64_t ns = simplitig_id - 1; const int64_t cl = kmers + ns * (k - 1); if (verbose) { std::cerr << " simplitig computation finished (" << ns << " simplitigs, " From edd99b3912f3c39fad6b449536cf8532f79ac721 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 12 May 2020 08:33:22 -0400 Subject: [PATCH 11/22] Update program description --- src/prophasm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index 42a9553..99c7882 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -90,7 +90,7 @@ KSEQ_INIT(gzFile, gzread) void print_help() { std::cerr << "\n" - << "Program: prophasm (a greedy assembler for k-mer set compression)\n" + << "Program: prophasm (computation of simplitigs and k-mer set operations)\n" << "Version: " VERSION "\n" << "Contact: Karel Brinda \n" << "\n" From e3b0b11e6e775357fd0a05d7e1be248e95290516 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 12 May 2020 09:46:05 -0400 Subject: [PATCH 12/22] Update the help message --- src/prophasm.cpp | 76 +++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 40 deletions(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index 99c7882..0cb536a 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -89,32 +89,31 @@ static const uint8_t nt256_nt4[] = { KSEQ_INIT(gzFile, gzread) void print_help() { - std::cerr << "\n" - << "Program: prophasm (computation of simplitigs and k-mer set operations)\n" - << "Version: " VERSION "\n" - << "Contact: Karel Brinda \n" - << "\n" - << "Usage: prophasm [options]\n" - << "\n" - << "Examples: prophasm -k 15 -i f1.fa -i f2.fa -x fx.fa\n" - << " - compute intersection of f1 and f2\n" - << " prophasm -k 15 -i f1.fa -i f2.fa -x fx.fa -o g1.fa -o g2.fa\n" - << " - compute intersection of f1 and f2, and subtract it from them\n" - << " prophasm -k 15 -i f1.fa -o g1.fa\n" - << " - re-assemble f1 to g1\n" - << "\n" - << "Command-line parameters:\n" - << " -k INT K-mer length.\n" - << " -i FILE Input FASTA file (can be used multiple times).\n" - << " -o FILE Output FASTA file (if used, must be used as many times as -i).\n" - << " -x FILE Compute intersection, subtract it, save it.\n" - << " -s FILE Output file with k-mer statistics.\n" - << - //" -k INT K-mer length. [" << default_k << "]\n" << - " -S Silent mode.\n" - << "\n" - << "Note that '-' can be used for standard input/output. \n" - << std::endl; + std::cerr + << "\n" + << "Program: prophasm (computation of simplitigs and k-mer set operations)\n" + << "Version: " VERSION "\n" + << "Contact: Karel Brinda \n" + << "\n" + << "Usage: prophasm [options]\n" + << "\n" + << "Examples: prophasm -k 31 -i ref.fa -o simplitigs.fa\n" + << " - compute simplitigs of ref.fa\n" + << " prophasm -k 31 -i ref1.fa -i ref2.fa -x inter.fa\n" + << " - intersect the k-mers sets of ref1 and ref2\n" + << " prophasm -k 31 -i ref1.fa -i ref2.fa -x inter.fa -o dif1.fa -o dif2.fa\n" + << " - intersect ref1 and ref2, and compute the set differences\n" + << "\n" + << "Command-line parameters:\n" + << " -k INT k-mer length (from [1, 32])\n" + << " -i FILE input FASTA file (can be used multiple times)\n" + << " -o FILE output FASTA file (if used, must be used as many times as -i)\n" + << " -x FILE compute intersection, subtract it, save it\n" + << " -s FILE output file with k-mer statistics\n" + << " -s silent mode\n" + << "\n" + << "Note that '-' can be used for standard input/output. \n" + << std::endl; } void test_file(FILE *fo, std::string fn) { @@ -524,8 +523,8 @@ int main(int argc, char *argv[]) { exit(1); } - bool compute_intersection = false; - bool compute_output = false; + bool compute_intersection = true; + bool compute_differences = false; bool verbose = true; int32_t no_sets = 0; @@ -544,12 +543,11 @@ int main(int argc, char *argv[]) { } case 'o': { out_fns.push_back(std::string(optarg)); - compute_output = true; + compute_differences = true; break; } case 'x': { - intersection_fn = std::string(optarg); - compute_intersection = true; + intersection_fn = std::string(optarg); break; } case 's': { @@ -592,7 +590,7 @@ int main(int argc, char *argv[]) { return EXIT_FAILURE; } - if (compute_output && (static_cast(out_fns.size()) != no_sets)) { + if (compute_differences && (static_cast(out_fns.size()) != no_sets)) { std::cerr << "If -o is used, it must be used as many times as -i (" << no_sets << "!=" << out_fns.size() << ")." << std::endl; return EXIT_FAILURE; @@ -608,6 +606,9 @@ int main(int argc, char *argv[]) { } std::vector> full_sets(no_sets); + std::unordered_set intersection; + std::vector in_sizes; + std::vector out_sizes; if (verbose) { std::cerr << "======================" << std::endl; @@ -615,9 +616,6 @@ int main(int argc, char *argv[]) { std::cerr << "======================" << std::endl; } - std::vector in_sizes; - std::vector out_sizes; - for (int32_t i = 0; i < no_sets; i++) { kmers_from_fasta(in_fns[i], full_sets[i], k, fstats, verbose); // debug_print_kmer_set(full_sets[i],k); @@ -630,8 +628,6 @@ int main(int argc, char *argv[]) { std::cerr << "===============" << std::endl; } - std::unordered_set intersection; - int32_t intersection_size = 0; if (compute_intersection) { @@ -644,7 +640,7 @@ int main(int argc, char *argv[]) { if (verbose) { std::cerr << " intersection size: " << intersection_size << " k-mers" << std::endl; } - if (compute_output) { + if (compute_differences) { if (verbose) { std::cerr << "2.2) Computing set differences" << std::endl; } @@ -652,7 +648,7 @@ int main(int argc, char *argv[]) { } } - if (compute_output) { + if (compute_differences) { for (int32_t i = 0; i < no_sets; i++) { out_sizes.insert(out_sizes.end(), full_sets[i].size()); assert(in_sizes[i] == out_sizes[i] + intersection_size); @@ -671,7 +667,7 @@ int main(int argc, char *argv[]) { std::cerr << "=======================" << std::endl; } - if (compute_output) { + if (compute_differences) { for (int32_t i = 0; i < static_cast(in_fns.size()); i++) { assemble(out_fns[i], full_sets[i], k, fstats, verbose); } From 3fcb41f3fa619ebc6b940be21ae7317229a8d017 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 12 May 2020 09:48:16 -0400 Subject: [PATCH 13/22] Fix a typo --- src/prophasm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index 0cb536a..c034437 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -110,7 +110,7 @@ void print_help() { << " -o FILE output FASTA file (if used, must be used as many times as -i)\n" << " -x FILE compute intersection, subtract it, save it\n" << " -s FILE output file with k-mer statistics\n" - << " -s silent mode\n" + << " -S silent mode\n" << "\n" << "Note that '-' can be used for standard input/output. \n" << std::endl; From 0d90eaa9e5e883672edb25033e92e7ca20079442 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 12 May 2020 09:57:13 -0400 Subject: [PATCH 14/22] Treat separately single sets --- src/prophasm.cpp | 113 +++++++++++++++++++++++++---------------------- 1 file changed, 60 insertions(+), 53 deletions(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index c034437..47018b9 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -605,75 +605,82 @@ int main(int argc, char *argv[]) { fprintf(fstats, "\n"); } - std::vector> full_sets(no_sets); - std::unordered_set intersection; - std::vector in_sizes; - std::vector out_sizes; - - if (verbose) { - std::cerr << "======================" << std::endl; - std::cerr << "1) Loading input files" << std::endl; - std::cerr << "======================" << std::endl; - } - - for (int32_t i = 0; i < no_sets; i++) { - kmers_from_fasta(in_fns[i], full_sets[i], k, fstats, verbose); - // debug_print_kmer_set(full_sets[i],k); - in_sizes.insert(in_sizes.end(), full_sets[i].size()); - } - - if (verbose) { - std::cerr << "===============" << std::endl; - std::cerr << "2) Intersecting" << std::endl; - std::cerr << "===============" << std::endl; - } - - int32_t intersection_size = 0; + if (no_sets == 1) { + std::unordered_set full_set; + kmers_from_fasta(in_fns[0], full_set, k, fstats, verbose); + assemble(out_fns[0], full_set, k, fstats, verbose); + } else { + std::vector> full_sets(no_sets); + std::unordered_set intersection; + std::vector in_sizes; + std::vector out_sizes; - if (compute_intersection) { if (verbose) { - std::cerr << "2.1) Computing the intersection" << std::endl; + std::cerr << "======================" << std::endl; + std::cerr << "1) Loading input files" << std::endl; + std::cerr << "======================" << std::endl; + } + + for (int32_t i = 0; i < no_sets; i++) { + kmers_from_fasta(in_fns[i], full_sets[i], k, fstats, verbose); + // debug_print_kmer_set(full_sets[i],k); + in_sizes.insert(in_sizes.end(), full_sets[i].size()); } - find_intersection(full_sets, intersection); - intersection_size = intersection.size(); if (verbose) { - std::cerr << " intersection size: " << intersection_size << " k-mers" << std::endl; + std::cerr << "===============" << std::endl; + std::cerr << "2) Intersecting" << std::endl; + std::cerr << "===============" << std::endl; } - if (compute_differences) { + + int32_t intersection_size = 0; + + if (compute_intersection) { if (verbose) { - std::cerr << "2.2) Computing set differences" << std::endl; + std::cerr << "2.1) Computing the intersection" << std::endl; } - remove_subset(full_sets, intersection); - } - } - if (compute_differences) { - for (int32_t i = 0; i < no_sets; i++) { - out_sizes.insert(out_sizes.end(), full_sets[i].size()); - assert(in_sizes[i] == out_sizes[i] + intersection_size); + find_intersection(full_sets, intersection); + intersection_size = intersection.size(); if (verbose) { - std::cerr << " input size: " << in_sizes[i] - << " k-mers, output size: " << out_sizes[i] - << " k-mers, intersection size: " << intersection_size << " k-mers" + std::cerr << " intersection size: " << intersection_size << " k-mers" << std::endl; } + if (compute_differences) { + if (verbose) { + std::cerr << "2.2) Computing set differences" << std::endl; + } + remove_subset(full_sets, intersection); + } } - } - if (verbose) { - std::cerr << "=======================" << std::endl; - std::cerr << "3) Computing simplitigs" << std::endl; - std::cerr << "=======================" << std::endl; - } + if (compute_differences) { + for (int32_t i = 0; i < no_sets; i++) { + out_sizes.insert(out_sizes.end(), full_sets[i].size()); + assert(in_sizes[i] == out_sizes[i] + intersection_size); + if (verbose) { + std::cerr << " input size: " << in_sizes[i] + << " k-mers, output size: " << out_sizes[i] + << " k-mers, intersection size: " << intersection_size << " k-mers" + << std::endl; + } + } + } - if (compute_differences) { - for (int32_t i = 0; i < static_cast(in_fns.size()); i++) { - assemble(out_fns[i], full_sets[i], k, fstats, verbose); + if (verbose) { + std::cerr << "=======================" << std::endl; + std::cerr << "3) Computing simplitigs" << std::endl; + std::cerr << "=======================" << std::endl; + } + + if (compute_differences) { + for (int32_t i = 0; i < static_cast(in_fns.size()); i++) { + assemble(out_fns[i], full_sets[i], k, fstats, verbose); + } + } + if (compute_intersection) { + assemble(intersection_fn, intersection, k, fstats, verbose); } - } - if (compute_intersection) { - assemble(intersection_fn, intersection, k, fstats, verbose); } if (fstats) { From 5a09d4af4f420c174f1fcee3431133e568b3c031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 30 Jul 2021 11:23:03 -0400 Subject: [PATCH 15/22] Update version --- README.md | 28 ++++++++++++++-------------- src/version.h | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 6d53f3f..f3949a3 100644 --- a/README.md +++ b/README.md @@ -90,26 +90,26 @@ Set operations: USAGE-BEGIN --> ``` -Program: prophasm (a greedy assembler for k-mer set compression) -Version: 0.1.1 +Program: prophasm (computation of simplitigs and k-mer set operations) +Version: 0.1.2 Contact: Karel Brinda Usage: prophasm [options] -Examples: prophasm -k 15 -i f1.fa -i f2.fa -x fx.fa - - compute intersection of f1 and f2 - prophasm -k 15 -i f1.fa -i f2.fa -x fx.fa -o g1.fa -o g2.fa - - compute intersection of f1 and f2, and subtract it from them - prophasm -k 15 -i f1.fa -o g1.fa - - re-assemble f1 to g1 +Examples: prophasm -k 31 -i ref.fa -o simplitigs.fa + - compute simplitigs of ref.fa + prophasm -k 31 -i ref1.fa -i ref2.fa -x inter.fa + - intersect the k-mers sets of ref1 and ref2 + prophasm -k 31 -i ref1.fa -i ref2.fa -x inter.fa -o dif1.fa -o dif2.fa + - intersect ref1 and ref2, and compute the set differences Command-line parameters: - -k INT K-mer size. - -i FILE Input FASTA file (can be used multiple times). - -o FILE Output FASTA file (if used, must be used as many times as -i). - -x FILE Compute intersection, subtract it, save it. - -s FILE Output file with k-mer statistics. - -S Silent mode. + -k INT k-mer length (from [1, 32]) + -i FILE input FASTA file (can be used multiple times) + -o FILE output FASTA file (if used, must be used as many times as -i) + -x FILE compute intersection, subtract it, save it + -s FILE output file with k-mer statistics + -S silent mode Note that '-' can be used for standard input/output. diff --git a/src/version.h b/src/version.h index 080e36d..287ffa6 100644 --- a/src/version.h +++ b/src/version.h @@ -1,2 +1,2 @@ -#define VERSION "0.1.1" +#define VERSION "0.1.2" From 2b85eb4af4aca6fa4b498a6957cd33aeb1c1f8ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 17 Feb 2023 11:50:11 +0100 Subject: [PATCH 16/22] Update LICENSE --- LICENSE | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 69d87ba..af3d5ea 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,9 @@ The MIT License -Copyright (c) 2016-2020 Karel Brinda +Copyright (c) 2022- Inria + 2017-2021 Harvard Medical School + 2017-2021 Harvard T.H. Chan School of Public Health + 2015-2016 Universite Paris-Est Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From b54fae4a4e0fe6e2016a8e9c166fcedd07db8575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 17 Feb 2023 13:51:26 +0100 Subject: [PATCH 17/22] Add CITATION.cff --- CITATION.cff | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..1886c2c --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,33 @@ +# vim: set ft=yaml ts=2 sts=2 sw=2 expandtab: +cff-version: 1.2.0 +title: ProphAsm +message: "If you use this software, please cite both the article from preferred-citation and the software itself." +type: software +authors: + - family-names: Břinda + given-names: Karel + email: karel.brinda@inria.fr + orcid: "https://orcid.org/0000-0003-0200-557X" +doi: 10.5281/zenodo.3887034 +repository-code: 'https://github.com/prophyle/prophasm' +license: MIT +preferred-citation: + authors: + - family-names: Břinda + given-names: Karel + email: karel.brinda@inria.fr + orcid: "https://orcid.org/0000-0003-0200-557X" + - family-names: Baym + given-names: Michael + email: baym@hms.harvard.edu + orcid: "https://orcid.org/0000-0003-1303-5598" + - family-names: Kucherov + given-names: Gregory + email: gregory.kucherov@univ-eiffel.fr + orcid: "https://orcid.org/0000-0001-5899-5424" + type: article + title: "Simplitigs as an efficient and scalable representation of de Bruijn graphs" + year: 2021 + journal: Genome Biology + volume: 22 + doi: 10.1186/s13059-021-02297-z From ef2506b43ca4d2c394babcd126a89469be124a93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 17 Feb 2023 13:57:03 +0100 Subject: [PATCH 18/22] Reformat --- CITATION.cff | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 1886c2c..101ca95 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,32 +1,37 @@ # vim: set ft=yaml ts=2 sts=2 sw=2 expandtab: +# This CITATION.cff file was generated with cffinit. +# Visit https://bit.ly/cffinit to generate yours today! + cff-version: 1.2.0 title: ProphAsm -message: "If you use this software, please cite both the article from preferred-citation and the software itself." +message: >- + If you use this software, please cite both the article + from preferred-citation and the software itself. type: software authors: - family-names: Břinda given-names: Karel email: karel.brinda@inria.fr - orcid: "https://orcid.org/0000-0003-0200-557X" -doi: 10.5281/zenodo.3887034 + orcid: 'https://orcid.org/0000-0003-0200-557X' repository-code: 'https://github.com/prophyle/prophasm' license: MIT +doi: 10.5281/zenodo.3887034 preferred-citation: authors: - family-names: Břinda given-names: Karel email: karel.brinda@inria.fr - orcid: "https://orcid.org/0000-0003-0200-557X" + orcid: 'https://orcid.org/0000-0003-0200-557X' - family-names: Baym given-names: Michael email: baym@hms.harvard.edu - orcid: "https://orcid.org/0000-0003-1303-5598" + orcid: 'https://orcid.org/0000-0003-1303-5598' - family-names: Kucherov given-names: Gregory email: gregory.kucherov@univ-eiffel.fr - orcid: "https://orcid.org/0000-0001-5899-5424" + orcid: 'https://orcid.org/0000-0001-5899-5424' type: article - title: "Simplitigs as an efficient and scalable representation of de Bruijn graphs" + title: Simplitigs as an efficient and scalable representation of de Bruijn graphs year: 2021 journal: Genome Biology volume: 22 From c2cdf26ccdffd46d6b5dce932fd33e1924d061f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 17 Feb 2023 18:06:11 +0100 Subject: [PATCH 19/22] Improve statistics --- src/prophasm.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index 47018b9..83a0461 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -502,8 +502,8 @@ int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE *fstats, const int64_t ns = simplitig_id - 1; const int64_t cl = kmers + ns * (k - 1); if (verbose) { - std::cerr << " simplitig computation finished (" << ns << " simplitigs, " - << cl / (1024.0 * 1024.0) << " Mbp)" << std::endl; + std::cerr << " simplitig computation finished; #kmers=" << kmers << ", NS=" << ns << ", CL=" + << cl << " bp (" << round(cl*1e-4)*1e-2 << " Mbp)" << std::endl; } return 0; @@ -643,7 +643,7 @@ int main(int argc, char *argv[]) { find_intersection(full_sets, intersection); intersection_size = intersection.size(); if (verbose) { - std::cerr << " intersection size: " << intersection_size << " k-mers" + std::cerr << " intersection: #kmers=" << intersection_size << std::endl; } if (compute_differences) { @@ -659,9 +659,9 @@ int main(int argc, char *argv[]) { out_sizes.insert(out_sizes.end(), full_sets[i].size()); assert(in_sizes[i] == out_sizes[i] + intersection_size); if (verbose) { - std::cerr << " input size: " << in_sizes[i] - << " k-mers, output size: " << out_sizes[i] - << " k-mers, intersection size: " << intersection_size << " k-mers" + std::cerr << " input: #kmers=" << in_sizes[i] + << ", output: #kmers=" << out_sizes[i] + //<< ", intersection size: " << intersection_size << " k-mers" << std::endl; } } From ee01cff137731334973af5990474dab4df8b3da0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 17 Feb 2023 18:17:19 +0100 Subject: [PATCH 20/22] Improve stats --- src/prophasm.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index 83a0461..f3faf9a 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -307,7 +307,7 @@ template int kmers_from_fasta(const std::string &fasta_fn, _set_T &set, int32_t k, FILE *fstats, bool verbose) { if (verbose) { - std::cerr << " loading " << fasta_fn << std::endl; + std::cerr << " fasta loading (" << fasta_fn << ")" << std::endl; } set.clear(); @@ -327,10 +327,11 @@ int kmers_from_fasta(const std::string &fasta_fn, _set_T &set, int32_t k, FILE * typename _set_T::value_type nkmer; + int64_t ns=0; + int64_t cl=0; for (int32_t seqid = 0; (l = kseq_read(seq)) >= 0; seqid++) { - // std::cerr << "kmers from fasta" << std::endl; - - // std::cerr << "starting iterator" << std::endl; + ns++; + cl+=seq->seq.l; for (char *kmer = seq->seq.s; kmer < (seq->seq.s) + (seq->seq.l) - k + 1; kmer++) { int c = encode_canonical(kmer, k, nkmer); if (c == 0) { @@ -341,6 +342,7 @@ int kmers_from_fasta(const std::string &fasta_fn, _set_T &set, int32_t k, FILE * } } + if (fstats) { fprintf(fstats, "%s\t%lu\n", fasta_fn.c_str(), set.size()); } @@ -349,6 +351,11 @@ int kmers_from_fasta(const std::string &fasta_fn, _set_T &set, int32_t k, FILE * kseq_destroy(seq); gzclose(fp); + if (verbose) { + std::cerr << " #kmers=" << set.size() << ", NS=" << ns << ", CL=" + << cl << " bp (" << round(cl*1e-4)*1e-2 << " Mbp)" << std::endl; + } + return 0; } @@ -502,7 +509,8 @@ int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE *fstats, const int64_t ns = simplitig_id - 1; const int64_t cl = kmers + ns * (k - 1); if (verbose) { - std::cerr << " simplitig computation finished; #kmers=" << kmers << ", NS=" << ns << ", CL=" + std::cerr << " simplitig computation finished (" << fasta_fn << ")" << std::endl; + std::cerr << " #kmers=" << kmers << ", NS=" << ns << ", CL=" << cl << " bp (" << round(cl*1e-4)*1e-2 << " Mbp)" << std::endl; } From 68a1ef900fafaf7bfec20b9d16a52b97002f9d0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 17 Feb 2023 18:17:46 +0100 Subject: [PATCH 21/22] Reformat --- src/prophasm.cpp | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/prophasm.cpp b/src/prophasm.cpp index f3faf9a..1251834 100644 --- a/src/prophasm.cpp +++ b/src/prophasm.cpp @@ -327,11 +327,11 @@ int kmers_from_fasta(const std::string &fasta_fn, _set_T &set, int32_t k, FILE * typename _set_T::value_type nkmer; - int64_t ns=0; - int64_t cl=0; + int64_t ns = 0; + int64_t cl = 0; for (int32_t seqid = 0; (l = kseq_read(seq)) >= 0; seqid++) { - ns++; - cl+=seq->seq.l; + ns++; + cl += seq->seq.l; for (char *kmer = seq->seq.s; kmer < (seq->seq.s) + (seq->seq.l) - k + 1; kmer++) { int c = encode_canonical(kmer, k, nkmer); if (c == 0) { @@ -342,7 +342,6 @@ int kmers_from_fasta(const std::string &fasta_fn, _set_T &set, int32_t k, FILE * } } - if (fstats) { fprintf(fstats, "%s\t%lu\n", fasta_fn.c_str(), set.size()); } @@ -352,8 +351,8 @@ int kmers_from_fasta(const std::string &fasta_fn, _set_T &set, int32_t k, FILE * gzclose(fp); if (verbose) { - std::cerr << " #kmers=" << set.size() << ", NS=" << ns << ", CL=" - << cl << " bp (" << round(cl*1e-4)*1e-2 << " Mbp)" << std::endl; + std::cerr << " #kmers=" << set.size() << ", NS=" << ns << ", CL=" << cl << " bp (" + << round(cl * 1e-4) * 1e-2 << " Mbp)" << std::endl; } return 0; @@ -510,8 +509,8 @@ int assemble(const std::string &fasta_fn, _set_T &set, int32_t k, FILE *fstats, const int64_t cl = kmers + ns * (k - 1); if (verbose) { std::cerr << " simplitig computation finished (" << fasta_fn << ")" << std::endl; - std::cerr << " #kmers=" << kmers << ", NS=" << ns << ", CL=" - << cl << " bp (" << round(cl*1e-4)*1e-2 << " Mbp)" << std::endl; + std::cerr << " #kmers=" << kmers << ", NS=" << ns << ", CL=" << cl << " bp (" + << round(cl * 1e-4) * 1e-2 << " Mbp)" << std::endl; } return 0; @@ -651,8 +650,7 @@ int main(int argc, char *argv[]) { find_intersection(full_sets, intersection); intersection_size = intersection.size(); if (verbose) { - std::cerr << " intersection: #kmers=" << intersection_size - << std::endl; + std::cerr << " intersection: #kmers=" << intersection_size << std::endl; } if (compute_differences) { if (verbose) { @@ -667,9 +665,9 @@ int main(int argc, char *argv[]) { out_sizes.insert(out_sizes.end(), full_sets[i].size()); assert(in_sizes[i] == out_sizes[i] + intersection_size); if (verbose) { - std::cerr << " input: #kmers=" << in_sizes[i] - << ", output: #kmers=" << out_sizes[i] - //<< ", intersection size: " << intersection_size << " k-mers" + std::cerr << " input: #kmers=" << in_sizes[i] << ", output: #kmers=" + << out_sizes[i] + //<< ", intersection size: " << intersection_size << " k-mers" << std::endl; } } From dfd92f4aebcc9eacca60ddda4366e159b359e2e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Fri, 17 Feb 2023 23:54:05 +0100 Subject: [PATCH 22/22] Fix contact --- README.md | 8 ++++++-- src/prophasm.cpp | 28 +--------------------------- tests/tools/fa_norm.py | 2 +- tests/tools/fa_to_kmers.py | 2 +- tests/tools/verify_output.py | 2 +- 5 files changed, 10 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index f3949a3..82c0630 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ * [Cite](#cite) * [Prerequisities](#prerequisities) * [Getting started](#getting-started) +* [How to use](#how-to-use) * [Links](#links) * [Issues](#issues) * [Changelog](#changelog) @@ -78,13 +79,16 @@ Compute simplitigs: ./prophasm -k 15 -i tests/test1.fa -o simplitigs.fa ``` + +## How to use + Set operations: ``` ./prophasm -k 15 -i tests/test1.fa -i tests/test2.fa -o _out1.fa -o _out2.fa -x _intersect.fa -s _stats.tsv ``` -## Command line parameters +## Command-line arguments