Skip to content

Commit

Permalink
Merge pull request #353 from ksahlin/dumpseeds
Browse files Browse the repository at this point in the history
Add dumpstrobes --seeds option for writing out the full seed vector
  • Loading branch information
marcelm authored Oct 2, 2023
2 parents 38a3a5d + 1a7d4ae commit 1587954
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 43 deletions.
15 changes: 15 additions & 0 deletions src/dumpstrobes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,17 @@ int run_dumpstrobes(int argc, char **argv) {

args::HelpFlag help(parser, "help", "Print help and exit", {'h', "help"});
args::Flag syncmers(parser, "syncmers", "Dump syncmers instead of randstrobes", {"syncmers"});
args::Flag seeds(parser, "seeds", "Dump sorted seed vector (format: hash, position, ref_index, strobe2_offset)", {"seeds"});
args::Flag count(parser, "count", "Count only", {"count"});
args::ValueFlag<int> threads(parser, "INT", "Number of threads [8]", {'t', "threads"});
auto seeding = SeedingArguments{parser};
args::Positional<std::string> ref_filename(parser, "reference", "Reference in FASTA format", args::Options::Required);

try {
parser.ParseCLI(argc, argv);
if ((seeds && syncmers) || (seeds && count)) {
throw args::Error("Cannot combine --seeds with --syncmers or --count");
}
}
catch (const args::Help&) {
std::cout << parser;
Expand Down Expand Up @@ -148,6 +153,16 @@ int run_dumpstrobes(int argc, char **argv) {
}
}
std::cout << n << std::endl;
} else if (seeds) {
float top_filter_fraction = 0.0002;
int bits = -1; // autodetermine
int n_threads = threads ? args::get(threads) : 8;
StrobemerIndex index(references, index_parameters, bits);
index.populate(top_filter_fraction, n_threads);
for (size_t i = 0; i < index.size(); ++i) {
auto rs = index.get_randstrobe(i);
std::cout << rs.hash << "," << rs.position << "," << rs.reference_index() << "," << rs.strobe2_offset() << '\n';
}
} else {
for (size_t i = 0; i < references.size(); ++i) {
auto& seq = references.sequences[i];
Expand Down
91 changes: 48 additions & 43 deletions src/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,54 @@
static Logger& logger = Logger::get();
static const uint32_t STI_FILE_FORMAT_VERSION = 2;


namespace {

uint64_t count_randstrobes(const std::string& seq, const IndexParameters& parameters) {
uint64_t n_syncmers = 0;
SyncmerIterator syncmer_iterator(seq, parameters.syncmer);
Syncmer syncmer;
while (!(syncmer = syncmer_iterator.next()).is_end()) {
n_syncmers++;
}
// The last w_min syncmers do not result in a randstrobe
if (n_syncmers < parameters.randstrobe.w_min) {
return 0;
} else {
return n_syncmers - parameters.randstrobe.w_min;
}
}

std::vector<uint64_t> count_all_randstrobes(const References& references, const IndexParameters& parameters, size_t n_threads) {
std::vector<std::thread> workers;
std::atomic_size_t ref_index{0};

std::vector<uint64_t> counts;
counts.assign(references.size(), 0);

for (size_t i = 0; i < n_threads; ++i) {
workers.push_back(
std::thread(
[&ref_index](const References& references, const IndexParameters& parameters, std::vector<uint64_t>& counts) {
while (true) {
size_t j = ref_index.fetch_add(1);
if (j >= references.size()) {
break;
}
counts[j] = count_randstrobes(references.sequences[j], parameters);
}
}, std::ref(references), std::ref(parameters), std::ref(counts))
);
}
for (auto& worker : workers) {
worker.join();
}

return counts;
}

}

void StrobemerIndex::write(const std::string& filename) const {
std::ofstream ofs(filename, std::ios::binary);

Expand Down Expand Up @@ -90,49 +138,6 @@ int StrobemerIndex::pick_bits(size_t size) const {
return std::clamp(static_cast<int>(log2(estimated_number_of_randstrobes)) - 1, 8, 31);
}

uint64_t count_randstrobes(const std::string& seq, const IndexParameters& parameters) {
uint64_t n_syncmers = 0;
SyncmerIterator syncmer_iterator(seq, parameters.syncmer);
Syncmer syncmer;
while (!(syncmer = syncmer_iterator.next()).is_end()) {
n_syncmers++;
}
// The last w_min syncmers do not result in a randstrobe
if (n_syncmers < parameters.randstrobe.w_min) {
return 0;
} else {
return n_syncmers - parameters.randstrobe.w_min;
}
}

std::vector<uint64_t> count_all_randstrobes(const References& references, const IndexParameters& parameters, size_t n_threads) {
std::vector<std::thread> workers;
std::atomic_size_t ref_index{0};

std::vector<uint64_t> counts;
counts.assign(references.size(), 0);

for (size_t i = 0; i < n_threads; ++i) {
workers.push_back(
std::thread(
[&ref_index](const References& references, const IndexParameters& parameters, std::vector<uint64_t>& counts) {
while (true) {
size_t j = ref_index.fetch_add(1);
if (j >= references.size()) {
break;
}
counts[j] = count_randstrobes(references.sequences[j], parameters);
}
}, std::ref(references), std::ref(parameters), std::ref(counts))
);
}
for (auto& worker : workers) {
worker.join();
}

return counts;
}

void StrobemerIndex::populate(float f, size_t n_threads) {
Timer count_hash;
auto randstrobe_counts = count_all_randstrobes(references, parameters, n_threads);
Expand Down
8 changes: 8 additions & 0 deletions src/index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,14 @@ struct StrobemerIndex {
return randstrobes[position].reference_index();
}

RefRandstrobe get_randstrobe(bucket_index_t position) const {
return randstrobes[position];
}

size_t size() const {
return randstrobes.size();
}

unsigned int get_count(bucket_index_t position) const {
// For 95% of cases, the result will be small and a brute force search
// is the best option. Once, we go over MAX_LINEAR_SEARCH, though, we
Expand Down

0 comments on commit 1587954

Please sign in to comment.