diff --git a/src/dumpstrobes.cpp b/src/dumpstrobes.cpp index cb9642d8..4a8a1d88 100644 --- a/src/dumpstrobes.cpp +++ b/src/dumpstrobes.cpp @@ -78,12 +78,17 @@ int run_dumpstrobes(int argc, char **argv) { args::HelpFlag help(parser, "help", "Print help and exit", {'h', "help"}); args::Flag syncmers(parser, "syncmers", "Dump syncmers instead of randstrobes", {"syncmers"}); + args::Flag seeds(parser, "seeds", "Dump sorted seed vector (format: hash, position, ref_index, strobe2_offset)", {"seeds"}); args::Flag count(parser, "count", "Count only", {"count"}); + args::ValueFlag threads(parser, "INT", "Number of threads [8]", {'t', "threads"}); auto seeding = SeedingArguments{parser}; args::Positional ref_filename(parser, "reference", "Reference in FASTA format", args::Options::Required); try { parser.ParseCLI(argc, argv); + if ((seeds && syncmers) || (seeds && count)) { + throw args::Error("Cannot combine --seeds with --syncmers or --count"); + } } catch (const args::Help&) { std::cout << parser; @@ -148,6 +153,16 @@ int run_dumpstrobes(int argc, char **argv) { } } std::cout << n << std::endl; + } else if (seeds) { + float top_filter_fraction = 0.0002; + int bits = -1; // autodetermine + int n_threads = threads ? args::get(threads) : 8; + StrobemerIndex index(references, index_parameters, bits); + index.populate(top_filter_fraction, n_threads); + for (size_t i = 0; i < index.size(); ++i) { + auto rs = index.get_randstrobe(i); + std::cout << rs.hash << "," << rs.position << "," << rs.reference_index() << "," << rs.strobe2_offset() << '\n'; + } } else { for (size_t i = 0; i < references.size(); ++i) { auto& seq = references.sequences[i]; diff --git a/src/index.cpp b/src/index.cpp index a6737f16..ceb4ffc3 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -22,6 +22,54 @@ static Logger& logger = Logger::get(); static const uint32_t STI_FILE_FORMAT_VERSION = 2; + +namespace { + +uint64_t count_randstrobes(const std::string& seq, const IndexParameters& parameters) { + uint64_t n_syncmers = 0; + SyncmerIterator syncmer_iterator(seq, parameters.syncmer); + Syncmer syncmer; + while (!(syncmer = syncmer_iterator.next()).is_end()) { + n_syncmers++; + } + // The last w_min syncmers do not result in a randstrobe + if (n_syncmers < parameters.randstrobe.w_min) { + return 0; + } else { + return n_syncmers - parameters.randstrobe.w_min; + } +} + +std::vector count_all_randstrobes(const References& references, const IndexParameters& parameters, size_t n_threads) { + std::vector workers; + std::atomic_size_t ref_index{0}; + + std::vector counts; + counts.assign(references.size(), 0); + + for (size_t i = 0; i < n_threads; ++i) { + workers.push_back( + std::thread( + [&ref_index](const References& references, const IndexParameters& parameters, std::vector& counts) { + while (true) { + size_t j = ref_index.fetch_add(1); + if (j >= references.size()) { + break; + } + counts[j] = count_randstrobes(references.sequences[j], parameters); + } + }, std::ref(references), std::ref(parameters), std::ref(counts)) + ); + } + for (auto& worker : workers) { + worker.join(); + } + + return counts; +} + +} + void StrobemerIndex::write(const std::string& filename) const { std::ofstream ofs(filename, std::ios::binary); @@ -90,49 +138,6 @@ int StrobemerIndex::pick_bits(size_t size) const { return std::clamp(static_cast(log2(estimated_number_of_randstrobes)) - 1, 8, 31); } -uint64_t count_randstrobes(const std::string& seq, const IndexParameters& parameters) { - uint64_t n_syncmers = 0; - SyncmerIterator syncmer_iterator(seq, parameters.syncmer); - Syncmer syncmer; - while (!(syncmer = syncmer_iterator.next()).is_end()) { - n_syncmers++; - } - // The last w_min syncmers do not result in a randstrobe - if (n_syncmers < parameters.randstrobe.w_min) { - return 0; - } else { - return n_syncmers - parameters.randstrobe.w_min; - } -} - -std::vector count_all_randstrobes(const References& references, const IndexParameters& parameters, size_t n_threads) { - std::vector workers; - std::atomic_size_t ref_index{0}; - - std::vector counts; - counts.assign(references.size(), 0); - - for (size_t i = 0; i < n_threads; ++i) { - workers.push_back( - std::thread( - [&ref_index](const References& references, const IndexParameters& parameters, std::vector& counts) { - while (true) { - size_t j = ref_index.fetch_add(1); - if (j >= references.size()) { - break; - } - counts[j] = count_randstrobes(references.sequences[j], parameters); - } - }, std::ref(references), std::ref(parameters), std::ref(counts)) - ); - } - for (auto& worker : workers) { - worker.join(); - } - - return counts; -} - void StrobemerIndex::populate(float f, size_t n_threads) { Timer count_hash; auto randstrobe_counts = count_all_randstrobes(references, parameters, n_threads); diff --git a/src/index.hpp b/src/index.hpp index b842e471..941db7de 100644 --- a/src/index.hpp +++ b/src/index.hpp @@ -104,6 +104,14 @@ struct StrobemerIndex { return randstrobes[position].reference_index(); } + RefRandstrobe get_randstrobe(bucket_index_t position) const { + return randstrobes[position]; + } + + size_t size() const { + return randstrobes.size(); + } + unsigned int get_count(bucket_index_t position) const { // For 95% of cases, the result will be small and a brute force search // is the best option. Once, we go over MAX_LINEAR_SEARCH, though, we