Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add singlesketch command #459

Merged
merged 7 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions doc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
| command | functionality | docs |
| -------- | -------- | -------- |
| `manysketch` | Rapidly build sketches for many input files | [link](#Running-manysketch) |
| `singlesketch` | Sketch a single sequence file | [link](#Running-singlesketch)
| `fastgather` | Multithreaded `gather` of **one** metagenome against a database| [link](#Running-fastgather)
| `fastmultigather` | Multithreaded `gather` of **multiple** metagenomes against a database | [link](#Running-fastmultigather)
| `manysearch` | Multithreaded containment search for many queries in many large metagenomes | [link](#Running-manysearch)
Expand Down Expand Up @@ -212,6 +213,22 @@ sourmash sig summarize proteins.zip

In this case, three sketches of `protein`, `dayhoff`, and `hp` moltypes were made for each file in `proteins.csv` and saved to `proteins.zip`.


## Running `singlesketch`

The `singlesketch` command generates a sketch for a single sequence file.

### Basic Usage
```bash
sourmash scripts singlesketch input.fa -p k=21,scaled=1000,dna -o output.sig --name signature_name
```
### Using `stdin/stdout`
You can use `-` for `stdin` and output the result to `stdout`:
```bash
cat input.fa | sourmash scripts singlesketch - -o -
```


### Running `multisearch` and `pairwise`

The `multisearch` command compares one or more query genomes, and one or more subject genomes. It differs from `manysearch` by loading all genomes into memory.
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ check = "sourmash_plugin_branchwater:Branchwater_Check"
manysketch = "sourmash_plugin_branchwater:Branchwater_Manysketch"
pairwise = "sourmash_plugin_branchwater:Branchwater_Pairwise"
cluster = "sourmash_plugin_branchwater:Branchwater_Cluster"
singlesketch = "sourmash_plugin_branchwater:Branchwater_SingleSketch"

[project.optional-dependencies]
test = [
Expand Down
20 changes: 20 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/// Python interface Rust code for sourmash_plugin_branchwater.
use pyo3::prelude::*;
use singlesketch::singlesketch;

#[macro_use]
extern crate simple_error;
Expand All @@ -18,6 +19,7 @@ mod mastiff_manygather;
mod mastiff_manysearch;
mod multisearch;
mod pairwise;
mod singlesketch;

use camino::Utf8PathBuf as PathBuf;

Expand Down Expand Up @@ -300,6 +302,23 @@ fn do_manysketch(
}
}

#[pyfunction]
#[pyo3(signature = (input_filename, param_str, output, name))]
fn do_singlesketch(
input_filename: String,
param_str: String,
output: String,
name: String,
) -> anyhow::Result<u8> {
match singlesketch::singlesketch(input_filename, param_str, output, name) {
Ok(_) => Ok(0),
Err(e) => {
eprintln!("Error: {e}");
Ok(1)
}
}
}

#[pyfunction]
#[pyo3(signature = (pairwise_csv, output_clusters, similarity_column, similarity_threshold, cluster_sizes=None))]
fn do_cluster(
Expand Down Expand Up @@ -336,5 +355,6 @@ fn sourmash_plugin_branchwater(_py: Python, m: &Bound<'_, PyModule>) -> PyResult
m.add_function(wrap_pyfunction!(do_multisearch, m)?)?;
m.add_function(wrap_pyfunction!(do_pairwise, m)?)?;
m.add_function(wrap_pyfunction!(do_cluster, m)?)?;
m.add_function(wrap_pyfunction!(do_singlesketch, m)?)?;
Ok(())
}
88 changes: 2 additions & 86 deletions src/manysketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,99 +2,15 @@
use anyhow::{anyhow, Result};
use rayon::prelude::*;

use crate::utils::{load_fasta_fromfile, sigwriter, Params};
use crate::utils::{load_fasta_fromfile, parse_params_str, sigwriter, Params};
use camino::Utf8Path as Path;
use needletail::parse_fastx_file;
use sourmash::cmd::ComputeParameters;
use sourmash::signature::Signature;
use std::sync::atomic;
use std::sync::atomic::AtomicUsize;

fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> {
let mut unique_params: std::collections::HashSet<Params> = std::collections::HashSet::new();

// split params_strs by _ and iterate over each param
for p_str in params_strs.split('_').collect::<Vec<&str>>().iter() {
let items: Vec<&str> = p_str.split(',').collect();

let mut ksizes = Vec::new();
let mut track_abundance = false;
let mut num = 0;
let mut scaled = 1000;
let mut seed = 42;
let mut is_dna = false;
let mut is_protein = false;
let mut is_dayhoff = false;
let mut is_hp = false;

for item in items.iter() {
match *item {
_ if item.starts_with("k=") => {
let k_value = item[2..]
.parse()
.map_err(|_| format!("cannot parse k='{}' as a number", &item[2..]))?;
ksizes.push(k_value);
}
"abund" => track_abundance = true,
"noabund" => track_abundance = false,
_ if item.starts_with("num=") => {
num = item[4..]
.parse()
.map_err(|_| format!("cannot parse num='{}' as a number", &item[4..]))?;
}
_ if item.starts_with("scaled=") => {
scaled = item[7..]
.parse()
.map_err(|_| format!("cannot parse scaled='{}' as a number", &item[7..]))?;
}
_ if item.starts_with("seed=") => {
seed = item[5..]
.parse()
.map_err(|_| format!("cannot parse seed='{}' as a number", &item[5..]))?;
}
"protein" => {
is_protein = true;
}
"dna" => {
is_dna = true;
}
"dayhoff" => {
is_dayhoff = true;
}
"hp" => {
is_hp = true;
}
_ => return Err(format!("unknown component '{}' in params string", item)),
}
}

if !is_dna && !is_protein && !is_dayhoff && !is_hp {
return Err(format!("No moltype provided in params string {}", p_str));
}
if ksizes.is_empty() {
return Err(format!("No ksizes provided in params string {}", p_str));
}

for &k in &ksizes {
let param = Params {
ksize: k,
track_abundance,
num,
scaled,
seed,
is_protein,
is_dna,
is_dayhoff,
is_hp,
};
unique_params.insert(param);
}
}

Ok(unique_params.into_iter().collect())
}

fn build_siginfo(params: &[Params], input_moltype: &str) -> Vec<Signature> {
pub fn build_siginfo(params: &[Params], input_moltype: &str) -> Vec<Signature> {
let mut sigs = Vec::new();

for param in params.iter().cloned() {
Expand Down
47 changes: 47 additions & 0 deletions src/python/sourmash_plugin_branchwater/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,53 @@ def main(self, args):
notify(f"...pairwise is done! results in '{args.output}'")
return status

class Branchwater_SingleSketch(CommandLinePlugin):
command = 'singlesketch'
description = 'sketch a single sequence file'

def __init__(self, p):
super().__init__(p)
p.add_argument('input_filename', help="input FASTA file or '-' for stdin")
p.add_argument('-o', '--output', required=True,
help='output file for the signature or - for stdout')
p.add_argument('-p', '--param-string', action='append', type=str, default=[],
help='parameter string for sketching (default: k=31,scaled=1000)')
p.add_argument('-n', '--name', help="optional name for the signature, default is the basename of input path")

def main(self, args):
print_version()
if not args.param_string:
args.param_string = ["k=31,scaled=1000,dna"]

# Check and append 'dna' if no moltype is found in a param string
moltypes = ["dna", "protein", "dayhoff", "hp"]
updated_param_strings = []

for param in args.param_string:
# If no moltype is found in the current param string, append 'dna'
if not any(mt in param for mt in moltypes):
param += ",dna"
updated_param_strings.append(param)

notify(f"params: {updated_param_strings}")

# Convert the list of param strings to a single string
args.param_string = "_".join(updated_param_strings).lower()

# If --name is not provided, default to input_filename, but if the source file is -, set name to empty string
signature_name = args.name if args.name else os.path.basename(args.input_filename) if args.input_filename != "-" else ""

notify(f"sketching file '{args.input_filename}' with params '{args.param_string}' and name '{signature_name}'")

super().main(args)
status = sourmash_plugin_branchwater.do_singlesketch(args.input_filename,
args.param_string,
args.output,
signature_name) # Pass the name to Rust
if status == 0:
notify(f"...singlesketch is done! results in '{args.output}'")
return status


class Branchwater_Manysketch(CommandLinePlugin):
command = 'manysketch'
Expand Down
87 changes: 86 additions & 1 deletion src/python/tests/test_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pandas
import sourmash
from sourmash import index

import io
from . import sourmash_tst_utils as utils


Expand Down Expand Up @@ -894,3 +894,88 @@ def test_manysketch_prefix_duplicated_force(runtmp, capfd):
print(sigs)

assert len(sigs) == 3

def test_singlesketch_simple(runtmp):
"""Test basic single sketching with default parameters."""
fa1 = get_test_data('short.fa')
output = runtmp.output('short.sig')

# Run the singlesketch command
runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output)

# Check if the output exists and contains the expected data
assert os.path.exists(output)
sig = sourmash.load_one_signature(output)

assert sig.name == 'short.fa'
assert sig.minhash.ksize == 31
assert sig.minhash.is_dna
assert sig.minhash.scaled == 1000


def test_singlesketch_with_name(runtmp):
"""Test single sketching with a custom name."""
fa1 = get_test_data('short.fa')
output = runtmp.output('short_named.sig')

# Run the singlesketch command with the --name option
runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output, '-n', 'custom_name')

# Check if the output exists and contains the expected data
assert os.path.exists(output)
sig = sourmash.load_one_signature(output)

assert sig.name == 'custom_name'
assert sig.minhash.ksize == 31
assert sig.minhash.is_dna
assert sig.minhash.scaled == 1000


def test_singlesketch_mult_k(runtmp):
"""Test single sketching with multiple k-mer sizes."""
fa1 = get_test_data('short.fa')
output = runtmp.output('short_mult_k.sig')

# Run the singlesketch command with multiple k sizes
runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output, '-p', 'k=21,scaled=100', '-p', 'k=31,scaled=100')

# Check if the output exists and contains the expected data
assert os.path.exists(output)
sigs = list(sourmash.load_signatures(output))

# Verify that two signatures with different k-mer sizes exist
assert len(sigs) == 2
assert any(sig.minhash.ksize == 21 for sig in sigs)
assert any(sig.minhash.ksize == 31 for sig in sigs)


def test_singlesketch_mult_moltype(runtmp):
"""Test single sketching with different molecule types."""
fa1 = get_test_data('short-protein.fa')
output = runtmp.output('short_mult_moltype.sig')

# Run the singlesketch command with multiple molecule types
runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output, '-p', 'protein,k=10,scaled=100')

# Check if the output exists and contains the expected data
assert os.path.exists(output)
sig = sourmash.load_one_signature(output)

# Verify that the signature has the correct molecule type and parameters
assert sig.minhash.ksize == 10
assert sig.minhash.is_protein
assert sig.minhash.scaled == 100


def test_singlesketch_invalid_params(runtmp, capfd):
"""Test singlesketch command with invalid parameters."""
fa1 = get_test_data('short.fa')
output = runtmp.output('short_invalid.sig')

# Run the singlesketch command with an invalid parameter string
with pytest.raises(utils.SourmashCommandFailed):
runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output, '-p', 'invalid_param')

# Check that the error message is correct
captured = capfd.readouterr()
assert "Failed to parse params string" in captured.err
Loading
Loading