Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: add sig cat utility #394

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
13 changes: 13 additions & 0 deletions doc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
| `pairwise` | Multithreaded pairwise comparison of multiple sketches, in memory | [link](#Running-multisearch-and-pairwise)
| `cluster` | cluster sequences based on similarity data from `pairwise` or `multisearch` | [link](#Running-cluster)
| `index` | build a RocksDB inverted index for efficient containment queries | [link](#Running-index)
| `sigcat` | concatenate signatures to a zipfile | [link](#Running-sigcat) |

This repository implements multithreaded plugins for
[sourmash](https://sourmash.readthedocs.io/) that provide very fast
Expand Down Expand Up @@ -384,6 +385,18 @@ Note that RocksDB indexes are implemented in the core
used in downstream software packages (this plugin, and
[the branchwater application code](https://github.com/sourmash-bio/branchwater)). The above documentation applies to sourmash core v0.15.0.

### Running `sigcat`

The `sigcat` command combines signatures into a single sourmash zip file. Its equivalent `sourmash` command is `sourmash sig cat`.

For example:
```
sourmash scripts sigcat file1.sig file2.sig -o all.zip
```
will combine all signatures in `file1.sig` and `file2.sig` and put them in the file `all.zip`.

`sigcat` can also be used to select specific signatures by `ksize`, `moltype`, and `scaled`; when `scaled` is given, the selected signatures are downsampled to the requested `scaled` value.

## Notes on concurrency and efficiency

Each command does things somewhat differently, with implications for
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ check = "sourmash_plugin_branchwater:Branchwater_Check"
manysketch = "sourmash_plugin_branchwater:Branchwater_Manysketch"
pairwise = "sourmash_plugin_branchwater:Branchwater_Pairwise"
cluster = "sourmash_plugin_branchwater:Branchwater_Cluster"
sigcat = "sourmash_plugin_branchwater:Branchwater_SigCat"

[project.optional-dependencies]
test = [
Expand Down
3 changes: 3 additions & 0 deletions src/fastgather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@ pub fn fastgather(
prefetch_output: Option<String>,
allow_failed_sigpaths: bool,
) -> Result<()> {
let allow_empty_collection = false;
let query_collection = load_collection(
&query_filepath,
selection,
ReportType::Query,
allow_failed_sigpaths,
allow_empty_collection,
)?;

if query_collection.len() != 1 {
Expand All @@ -47,6 +49,7 @@ pub fn fastgather(
selection,
ReportType::Against,
allow_failed_sigpaths,
allow_empty_collection,
)?;

// calculate the minimum number of hashes based on desired threshold
Expand Down
3 changes: 3 additions & 0 deletions src/fastmultigather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,14 @@ pub fn fastmultigather(
save_matches: bool,
create_empty_results: bool,
) -> Result<()> {
let allow_empty_collection = false;
// load query collection
let query_collection = load_collection(
&query_filepath,
selection,
ReportType::Query,
allow_failed_sigpaths,
allow_empty_collection,
)?;

let threshold_hashes: u64 = {
Expand All @@ -61,6 +63,7 @@ pub fn fastmultigather(
selection,
ReportType::Against,
allow_failed_sigpaths,
allow_empty_collection,
)?;
// load against sketches into memory, downsampling on the way
let against = load_sketches(against_collection, selection, ReportType::Against).unwrap();
Expand Down
2 changes: 2 additions & 0 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@ pub fn index<P: AsRef<Path>>(
use_internal_storage: bool,
) -> Result<(), Box<dyn std::error::Error>> {
println!("Loading siglist");
let allow_empty_collection = false;

let collection = load_collection(
&siglist,
selection,
ReportType::General,
allow_failed_sigpaths,
allow_empty_collection,
)?;

let mut index = RevIndex::create(
Expand Down
41 changes: 35 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ mod mastiff_manygather;
mod mastiff_manysearch;
mod multisearch;
mod pairwise;
mod sigcat;

use camino::Utf8PathBuf as PathBuf;

Expand All @@ -33,7 +34,7 @@ fn do_manysearch(
output_path: Option<String>,
) -> anyhow::Result<u8> {
let againstfile_path: PathBuf = siglist_path.clone().into();
let selection = build_selection(ksize, scaled, &moltype);
let selection = build_selection(Some(ksize), Some(scaled), Some(&moltype));
eprintln!("selection scaled: {:?}", selection.scaled());
let allow_failed_sigpaths = true;

Expand Down Expand Up @@ -84,7 +85,7 @@ fn do_fastgather(
output_path_prefetch: Option<String>,
output_path_gather: Option<String>,
) -> anyhow::Result<u8> {
let selection = build_selection(ksize, scaled, &moltype);
let selection = build_selection(Some(ksize), Some(scaled), Some(&moltype));
let allow_failed_sigpaths = true;

match fastgather::fastgather(
Expand Down Expand Up @@ -119,7 +120,7 @@ fn do_fastmultigather(
create_empty_results: bool,
) -> anyhow::Result<u8> {
let againstfile_path: camino::Utf8PathBuf = siglist_path.clone().into();
let selection = build_selection(ksize, scaled, &moltype);
let selection = build_selection(Some(ksize), Some(scaled), Some(&moltype));
let allow_failed_sigpaths = true;

// if a siglist path is a revindex, run mastiff_manygather. If not, run multigather
Expand Down Expand Up @@ -188,7 +189,7 @@ fn do_index(
colors: bool,
use_internal_storage: bool,
) -> anyhow::Result<u8> {
let selection = build_selection(ksize, scaled, &moltype);
let selection = build_selection(Some(ksize), Some(scaled), Some(&moltype));
let allow_failed_sigpaths = false;
match index::index(
siglist,
Expand Down Expand Up @@ -231,7 +232,7 @@ fn do_multisearch(
estimate_ani: bool,
output_path: Option<String>,
) -> anyhow::Result<u8> {
let selection = build_selection(ksize, scaled, &moltype);
let selection = build_selection(Some(ksize), Some(scaled), Some(&moltype));
let allow_failed_sigpaths = true;

match multisearch::multisearch(
Expand Down Expand Up @@ -264,7 +265,7 @@ fn do_pairwise(
write_all: bool,
output_path: Option<String>,
) -> anyhow::Result<u8> {
let selection = build_selection(ksize, scaled, &moltype);
let selection = build_selection(Some(ksize), Some(scaled), Some(&moltype));
let allow_failed_sigpaths = true;
match pairwise::pairwise(
siglist_path,
Expand Down Expand Up @@ -324,6 +325,33 @@ fn do_cluster(
}
}

// Python-facing entry point for the `sigcat` command.
//
// Builds a selection from the optional `ksize` / `scaled` / `moltype`
// filters and hands everything to `sigcat::sig_cat`. The result is reported
// as an exit-status style integer rather than a propagated error:
//   * `Ok(0)` when `sig_cat` succeeds,
//   * `Ok(1)` when it fails; the error is printed to stderr first.
//
// NOTE: plain `//` comments are used on purpose so the Python-visible
// docstring generated for this function is left untouched.
#[pyfunction]
#[allow(clippy::too_many_arguments)]
fn do_sigcat(
    sigfiles: String,
    output_path: String,
    force: bool,
    ksize: Option<u8>,
    scaled: Option<usize>,
    moltype: Option<String>,
) -> anyhow::Result<u8> {
    // Paths that fail to load are always tolerated for this command.
    let allow_failed_sigpaths = true;
    // Each filter is optional; `None` is passed through to `build_selection`.
    let selection = build_selection(ksize, scaled, moltype.as_deref());

    let outcome = sigcat::sig_cat(
        sigfiles,
        &selection,
        allow_failed_sigpaths,
        output_path,
        force,
    );

    // Convert a failure into a non-zero status instead of raising in Python.
    if let Err(e) = outcome {
        eprintln!("Error: {e}");
        return Ok(1);
    }
    Ok(0)
}

#[pymodule]
fn sourmash_plugin_branchwater(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(do_manysearch, m)?)?;
Expand All @@ -336,5 +364,6 @@ fn sourmash_plugin_branchwater(_py: Python, m: &Bound<'_, PyModule>) -> PyResult
m.add_function(wrap_pyfunction!(do_multisearch, m)?)?;
m.add_function(wrap_pyfunction!(do_pairwise, m)?)?;
m.add_function(wrap_pyfunction!(do_cluster, m)?)?;
m.add_function(wrap_pyfunction!(do_sigcat, m)?)?;
Ok(())
}
3 changes: 3 additions & 0 deletions src/manysearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,14 @@ pub fn manysearch(
output: Option<String>,
allow_failed_sigpaths: bool,
) -> Result<()> {
let allow_empty_collection = false;
// Load query collection
let query_collection = load_collection(
&query_filepath,
selection,
ReportType::Query,
allow_failed_sigpaths,
allow_empty_collection,
)?;
// load all query sketches into memory, downsampling on the way
let query_sketchlist = load_sketches(query_collection, selection, ReportType::Query).unwrap();
Expand All @@ -38,6 +40,7 @@ pub fn manysearch(
selection,
ReportType::Against,
allow_failed_sigpaths,
allow_empty_collection,
)?;

// set up a multi-producer, single-consumer channel.
Expand Down
2 changes: 2 additions & 0 deletions src/mastiff_manygather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ pub fn mastiff_manygather(
output: Option<String>,
allow_failed_sigpaths: bool,
) -> Result<(), Box<dyn std::error::Error>> {
let allow_empty_collection = false;
if !is_revindex_database(&index) {
bail!("'{}' is not a valid RevIndex database", index);
}
Expand All @@ -32,6 +33,7 @@ pub fn mastiff_manygather(
selection,
ReportType::Query,
allow_failed_sigpaths,
allow_empty_collection,
)?;

// set up a multi-producer, single-consumer channel.
Expand Down
2 changes: 2 additions & 0 deletions src/mastiff_manysearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ pub fn mastiff_manysearch(
output: Option<String>,
allow_failed_sigpaths: bool,
) -> Result<(), Box<dyn std::error::Error>> {
let allow_empty_collection = false;
if !is_revindex_database(&index) {
bail!("'{}' is not a valid RevIndex database", index);
}
Expand All @@ -36,6 +37,7 @@ pub fn mastiff_manysearch(
selection,
ReportType::Query,
allow_failed_sigpaths,
allow_empty_collection,
)?;

// set up a multi-producer, single-consumer channel.
Expand Down
3 changes: 3 additions & 0 deletions src/multisearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,15 @@ pub fn multisearch(
estimate_ani: bool,
output: Option<String>,
) -> Result<(), Box<dyn std::error::Error>> {
let allow_empty_collection = false;
// Load all queries into memory at once.

let query_collection = load_collection(
&query_filepath,
selection,
ReportType::Query,
allow_failed_sigpaths,
allow_empty_collection,
)?;
let queries = load_sketches(query_collection, selection, ReportType::Query).unwrap();

Expand All @@ -41,6 +43,7 @@ pub fn multisearch(
selection,
ReportType::Against,
allow_failed_sigpaths,
allow_empty_collection,
)?;
let against = load_sketches(against_collection, selection, ReportType::Against).unwrap();

Expand Down
2 changes: 2 additions & 0 deletions src/pairwise.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,14 @@ pub fn pairwise(
write_all: bool,
output: Option<String>,
) -> Result<(), Box<dyn std::error::Error>> {
let allow_empty_collection = false;
// Load all sigs into memory at once.
let collection = load_collection(
&siglist,
selection,
ReportType::General,
allow_failed_sigpaths,
allow_empty_collection,
)?;

if collection.len() <= 1 {
Expand Down
38 changes: 38 additions & 0 deletions src/python/sourmash_plugin_branchwater/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,3 +422,41 @@ def main(self, args):
notify(f"...clustering is done! results in '{args.output}'")
notify(f" cluster counts in '{args.cluster_sizes}'")
return status


class Branchwater_SigCat(CommandLinePlugin):
    """Command-line plugin for ``sigcat``.

    Collects the signature files named on the command line and passes them,
    together with the optional ksize/scaled/moltype filters, to the Rust
    function ``do_sigcat``, which writes the output zip file.
    """
    command = 'sigcat'
    description = 'concatenate signatures into a single sourmash zip file'

    def __init__(self, p):
        """Register the ``sigcat`` arguments on the argument parser ``p``."""
        super().__init__(p)
        p.add_argument('signatures', nargs='+', help="sourmash signature files")
        p.add_argument('-o', '--output', required=True,
                       help='output zip file for final signatures')
        p.add_argument('-k', '--ksize', type=int,
                       help='k-mer size at which to select sketches; no default')
        # sigcat performs no comparisons: scaled is used to select sketches,
        # which are then downsampled to this value.
        p.add_argument('-s', '--scaled', type=int,
                       help='scaled factor to downsample selected sketches to; no default')
        p.add_argument('-m', '--moltype', choices=["DNA", "protein", "dayhoff", "hp"],
                       help='molecule type (DNA, protein, dayhoff, or hp; no default)')
        p.add_argument('-f', '--force', action='store_true',
                       help='force: allow input sig files to contain no signatures or only incompatible signatures')

    def main(self, args):
        """Run ``sigcat`` and return the status reported by ``do_sigcat``.

        A status of 0 is treated as success and triggers the completion
        message; any other value is returned unchanged.
        """
        print_version()

        # The Rust entry point takes a single string rather than a Python list.
        # NOTE(review): presumably the Rust side splits this on whitespace, in
        # which case input paths containing spaces would be mis-parsed - confirm.
        allsigs = " ".join(args.signatures)
        notify(f"concatenating signatures in '{allsigs}'")

        # Lower-case the moltype before handing it to Rust; it stays None
        # when -m/--moltype was not given.
        if args.moltype:
            args.moltype = args.moltype.lower()

        super().main(args)
        status = sourmash_plugin_branchwater.do_sigcat(allsigs,
                                                       args.output,
                                                       args.force,
                                                       args.ksize,
                                                       args.scaled,
                                                       args.moltype)
        if status == 0:
            notify(f"...cat is done! results in '{args.output}'")
        return status
Loading
Loading