From f3d0b2f80ecc314a96956f74b261214280fef418 Mon Sep 17 00:00:00 2001 From: Mohamed Abuelanin Date: Sun, 29 Sep 2024 21:02:48 -0700 Subject: [PATCH 1/5] add singlesketch command --- pyproject.toml | 1 + src/lib.rs | 20 +++++ src/manysketch.rs | 88 +------------------ .../sourmash_plugin_branchwater/__init__.py | 34 +++++++ src/singlesketch.rs | 75 ++++++++++++++++ src/utils.rs | 84 ++++++++++++++++++ 6 files changed, 216 insertions(+), 86 deletions(-) create mode 100644 src/singlesketch.rs diff --git a/pyproject.toml b/pyproject.toml index 9c53d00c..be2ae2db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ check = "sourmash_plugin_branchwater:Branchwater_Check" manysketch = "sourmash_plugin_branchwater:Branchwater_Manysketch" pairwise = "sourmash_plugin_branchwater:Branchwater_Pairwise" cluster = "sourmash_plugin_branchwater:Branchwater_Cluster" +singlesketch = "sourmash_plugin_branchwater:Branchwater_SingleSketch" [project.optional-dependencies] test = [ diff --git a/src/lib.rs b/src/lib.rs index b4aee549..79355f59 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ /// Python interface Rust code for sourmash_plugin_branchwater. use pyo3::prelude::*; +use singlesketch::singlesketch; #[macro_use] extern crate simple_error; @@ -18,6 +19,7 @@ mod mastiff_manygather; mod mastiff_manysearch; mod multisearch; mod pairwise; +mod singlesketch; use camino::Utf8PathBuf as PathBuf; @@ -300,6 +302,23 @@ fn do_manysketch( } } +#[pyfunction] +#[pyo3(signature = (input_filename, moltype, param_str, output))] +fn do_singlesketch( + input_filename: String, + moltype: String, + param_str: String, + output: String, +) -> anyhow::Result { + match singlesketch::singlesketch(input_filename, moltype, param_str, output) { + Ok(_) => Ok(0), + Err(e) => { + eprintln!("Error: {e}"); + Ok(1) + } + } +} + #[pyfunction] #[pyo3(signature = (pairwise_csv, output_clusters, similarity_column, similarity_threshold, cluster_sizes=None))] fn do_cluster( @@ -336,5 +355,6 @@ fn sourmash_plugin_branchwater(_py: Python, m: &Bound<'_, PyModule>) -> PyResult m.add_function(wrap_pyfunction!(do_multisearch, m)?)?; m.add_function(wrap_pyfunction!(do_pairwise, m)?)?; m.add_function(wrap_pyfunction!(do_cluster, m)?)?; + m.add_function(wrap_pyfunction!(do_singlesketch, m)?)?; Ok(()) } diff --git a/src/manysketch.rs b/src/manysketch.rs index 7ee1d2b6..3eafa880 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -2,7 +2,7 @@ use anyhow::{anyhow, Result}; use rayon::prelude::*; -use crate::utils::{load_fasta_fromfile, sigwriter, Params}; +use crate::utils::{load_fasta_fromfile, parse_params_str, sigwriter, Params}; use camino::Utf8Path as Path; use needletail::parse_fastx_file; use sourmash::cmd::ComputeParameters; @@ -10,91 +10,7 @@ use sourmash::signature::Signature; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -fn parse_params_str(params_strs: String) -> Result, String> { - let mut unique_params: std::collections::HashSet = std::collections::HashSet::new(); - - // split params_strs by _ and iterate over each param - for p_str in params_strs.split('_').collect::>().iter() { - let items: Vec<&str> = p_str.split(',').collect(); - - let mut ksizes = Vec::new(); - let mut track_abundance = false; - let mut num = 0; - let mut scaled = 1000; - let mut seed = 42; - let mut is_dna = false; - let mut is_protein = false; - let mut is_dayhoff = false; - let mut is_hp = false; - - for item in items.iter() { - match *item { - _ if item.starts_with("k=") => { - let k_value = item[2..] - .parse() - .map_err(|_| format!("cannot parse k='{}' as a number", &item[2..]))?; - ksizes.push(k_value); - } - "abund" => track_abundance = true, - "noabund" => track_abundance = false, - _ if item.starts_with("num=") => { - num = item[4..] - .parse() - .map_err(|_| format!("cannot parse num='{}' as a number", &item[4..]))?; - } - _ if item.starts_with("scaled=") => { - scaled = item[7..] - .parse() - .map_err(|_| format!("cannot parse scaled='{}' as a number", &item[7..]))?; - } - _ if item.starts_with("seed=") => { - seed = item[5..] - .parse() - .map_err(|_| format!("cannot parse seed='{}' as a number", &item[5..]))?; - } - "protein" => { - is_protein = true; - } - "dna" => { - is_dna = true; - } - "dayhoff" => { - is_dayhoff = true; - } - "hp" => { - is_hp = true; - } - _ => return Err(format!("unknown component '{}' in params string", item)), - } - } - - if !is_dna && !is_protein && !is_dayhoff && !is_hp { - return Err(format!("No moltype provided in params string {}", p_str)); - } - if ksizes.is_empty() { - return Err(format!("No ksizes provided in params string {}", p_str)); - } - - for &k in &ksizes { - let param = Params { - ksize: k, - track_abundance, - num, - scaled, - seed, - is_protein, - is_dna, - is_dayhoff, - is_hp, - }; - unique_params.insert(param); - } - } - - Ok(unique_params.into_iter().collect()) -} - -fn build_siginfo(params: &[Params], input_moltype: &str) -> Vec { +pub fn build_siginfo(params: &[Params], input_moltype: &str) -> Vec { let mut sigs = Vec::new(); for param in params.iter().cloned() { diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 1983b37d..01306d52 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -341,6 +341,40 @@ def main(self, args): notify(f"...pairwise is done! results in '{args.output}'") return status +class Branchwater_SingleSketch(CommandLinePlugin): + command = 'singlesketch' + description = 'sketch a single sequence file' + + def __init__(self, p): + super().__init__(p) + p.add_argument('moltype', choices=["dna", "protein", "dayhoff", "hp"], + help='molecule type (dna, protein, dayhoff, or hp)') + p.add_argument('input_filename', help="input FASTA file or '-' for stdin") + p.add_argument('-p', '--param-string', action='append', type=str, default=[], + help='parameter string for sketching (default: k=31,scaled=1000)') + p.add_argument('-o', '--output', required=True, + help='output file for the signature') + + def main(self, args): + print_version() + if not args.param_string: + args.param_string = ["k=31,scaled=1000"] + notify(f"params: {args.param_string}") + + # Convert to a single string for easier Rust handling + args.param_string = "_".join(args.param_string) + # Lowercase the param string + args.param_string = args.param_string.lower() + + super().main(args) + status = sourmash_plugin_branchwater.do_singlesketch(args.input_filename, + args.moltype.lower(), + args.param_string, + args.output) + if status == 0: + notify(f"...sketch is done! results in '{args.output}'") + return status + class Branchwater_Manysketch(CommandLinePlugin): command = 'manysketch' diff --git a/src/singlesketch.rs b/src/singlesketch.rs new file mode 100644 index 00000000..4822f2db --- /dev/null +++ b/src/singlesketch.rs @@ -0,0 +1,75 @@ +use crate::utils::parse_params_str; +/// sketch: sketch a single sequence file. +use anyhow::{bail, Result}; +use camino::Utf8Path as Path; +use needletail::{parse_fastx_file, parse_fastx_reader}; +use std::fs::File; +use std::io::BufWriter; + +pub fn singlesketch( + input_filename: String, + moltype: String, + param_str: String, + output: String, +) -> Result<()> { + // Parse parameter string into params_vec + let param_result = parse_params_str(param_str); + let params_vec = match param_result { + Ok(params) => params, + Err(e) => { + eprintln!("Error parsing params string: {}", e); + bail!("Failed to parse params string"); + } + }; + + // Build signature templates based on parameters and molecule type + let sig_templates = crate::manysketch::build_siginfo(¶ms_vec, &moltype); + + if sig_templates.is_empty() { + bail!("No signatures to build for the given parameters."); + } + + let mut sigs = sig_templates.clone(); + + // Open FASTA file reader + let mut reader = if input_filename == "-" { + let stdin = std::io::stdin(); + parse_fastx_reader(stdin)? + } else { + parse_fastx_file(&input_filename)? + }; + + // Parse FASTA and add to signature + while let Some(record_result) = reader.next() { + match record_result { + Ok(record) => { + sigs.iter_mut().for_each(|sig| { + if moltype == "protein" { + sig.add_protein(&record.seq()) + .expect("Failed to add protein"); + } else { + sig.add_sequence(&record.seq(), true) + .expect("Failed to add sequence"); + } + }); + } + Err(err) => eprintln!("Error while processing record: {:?}", err), + } + } + + // Set name and filename for signatures + sigs.iter_mut().for_each(|sig| { + sig.set_name(&input_filename); + sig.set_filename(&input_filename); + }); + + // Write signatures to output file + let outpath = Path::new(&output); + let file = File::create(outpath)?; + let mut writer = BufWriter::new(file); + + // Write in JSON format + serde_json::to_writer(&mut writer, &sigs)?; + + Ok(()) +} diff --git a/src/utils.rs b/src/utils.rs index b74d53a4..33f78316 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1387,3 +1387,87 @@ pub fn write_signature( zip.start_file(sig_filename, zip_options).unwrap(); zip.write_all(&gzipped_buffer).unwrap(); } + +pub fn parse_params_str(params_strs: String) -> Result, String> { + let mut unique_params: std::collections::HashSet = std::collections::HashSet::new(); + + // split params_strs by _ and iterate over each param + for p_str in params_strs.split('_').collect::>().iter() { + let items: Vec<&str> = p_str.split(',').collect(); + + let mut ksizes = Vec::new(); + let mut track_abundance = false; + let mut num = 0; + let mut scaled = 1000; + let mut seed = 42; + let mut is_dna = false; + let mut is_protein = false; + let mut is_dayhoff = false; + let mut is_hp = false; + + for item in items.iter() { + match *item { + _ if item.starts_with("k=") => { + let k_value = item[2..] + .parse() + .map_err(|_| format!("cannot parse k='{}' as a number", &item[2..]))?; + ksizes.push(k_value); + } + "abund" => track_abundance = true, + "noabund" => track_abundance = false, + _ if item.starts_with("num=") => { + num = item[4..] + .parse() + .map_err(|_| format!("cannot parse num='{}' as a number", &item[4..]))?; + } + _ if item.starts_with("scaled=") => { + scaled = item[7..] + .parse() + .map_err(|_| format!("cannot parse scaled='{}' as a number", &item[7..]))?; + } + _ if item.starts_with("seed=") => { + seed = item[5..] + .parse() + .map_err(|_| format!("cannot parse seed='{}' as a number", &item[5..]))?; + } + "protein" => { + is_protein = true; + } + "dna" => { + is_dna = true; + } + "dayhoff" => { + is_dayhoff = true; + } + "hp" => { + is_hp = true; + } + _ => return Err(format!("unknown component '{}' in params string", item)), + } + } + + if !is_dna && !is_protein && !is_dayhoff && !is_hp { + return Err(format!("No moltype provided in params string {}", p_str)); + } + if ksizes.is_empty() { + return Err(format!("No ksizes provided in params string {}", p_str)); + } + + for &k in &ksizes { + let param = Params { + ksize: k, + track_abundance, + num, + scaled, + seed, + is_protein, + is_dna, + is_dayhoff, + is_hp, + }; + unique_params.insert(param); + } + } + + Ok(unique_params.into_iter().collect()) +} From 4b1c706cb394b0372b7d7e20a1b38140920ffbc0 Mon Sep 17 00:00:00 2001 From: Mohamed Abuelanin Date: Mon, 30 Sep 2024 09:13:01 -0700 Subject: [PATCH 2/5] standardizes params --- src/lib.rs | 5 ++- .../sourmash_plugin_branchwater/__init__.py | 34 ++++++++++++------- src/singlesketch.rs | 25 +++++++++----- 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 79355f59..88ec9a40 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -303,14 +303,13 @@ fn do_manysketch( } #[pyfunction] -#[pyo3(signature = (input_filename, moltype, param_str, output))] +#[pyo3(signature = (input_filename, param_str, output))] fn do_singlesketch( input_filename: String, - moltype: String, param_str: String, output: String, ) -> anyhow::Result { - match singlesketch::singlesketch(input_filename, moltype, param_str, output) { + match singlesketch::singlesketch(input_filename, param_str, output) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 01306d52..09d85db8 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -347,32 +347,40 @@ class Branchwater_SingleSketch(CommandLinePlugin): def __init__(self, p): super().__init__(p) - p.add_argument('moltype', choices=["dna", "protein", "dayhoff", "hp"], - help='molecule type (dna, protein, dayhoff, or hp)') p.add_argument('input_filename', help="input FASTA file or '-' for stdin") - p.add_argument('-p', '--param-string', action='append', type=str, default=[], - help='parameter string for sketching (default: k=31,scaled=1000)') p.add_argument('-o', '--output', required=True, help='output file for the signature') + p.add_argument('-p', '--param-string', action='append', type=str, default=[], + help='parameter string for sketching (default: k=31,scaled=1000)') def main(self, args): print_version() if not args.param_string: args.param_string = ["k=31,scaled=1000"] - notify(f"params: {args.param_string}") - # Convert to a single string for easier Rust handling - args.param_string = "_".join(args.param_string) - # Lowercase the param string - args.param_string = args.param_string.lower() + # Check and append 'dna' if no moltype is found in a param string + moltypes = ["dna", "protein", "dayhoff", "hp"] + updated_param_strings = [] + + for param in args.param_string: + # If no moltype is found in the current param string, append 'dna' + if not any(mt in param for mt in moltypes): + param += ",dna" + updated_param_strings.append(param) + + notify(f"params: {updated_param_strings}") + + # Convert the list of param strings to a single string + args.param_string = "_".join(updated_param_strings).lower() + + notify(f"sketching file '{args.input_filename}' with params '{args.param_string}'") super().main(args) status = sourmash_plugin_branchwater.do_singlesketch(args.input_filename, - args.moltype.lower(), - args.param_string, - args.output) + args.param_string, + args.output) if status == 0: - notify(f"...sketch is done! results in '{args.output}'") + notify(f"...singlesketch is done! results in '{args.output}'") return status diff --git a/src/singlesketch.rs b/src/singlesketch.rs index 4822f2db..2ca28c01 100644 --- a/src/singlesketch.rs +++ b/src/singlesketch.rs @@ -1,19 +1,13 @@ use crate::utils::parse_params_str; -/// sketch: sketch a single sequence file. use anyhow::{bail, Result}; use camino::Utf8Path as Path; use needletail::{parse_fastx_file, parse_fastx_reader}; use std::fs::File; use std::io::BufWriter; -pub fn singlesketch( - input_filename: String, - moltype: String, - param_str: String, - output: String, -) -> Result<()> { +pub fn singlesketch(input_filename: String, param_str: String, output: String) -> Result<()> { // Parse parameter string into params_vec - let param_result = parse_params_str(param_str); + let param_result = parse_params_str(param_str.clone()); let params_vec = match param_result { Ok(params) => params, Err(e) => { @@ -22,7 +16,20 @@ pub fn singlesketch( } }; - // Build signature templates based on parameters and molecule type + // Extract moltype from the param_str (assume it's always the first part) + let moltype = if param_str.contains("dna") { + "dna" + } else if param_str.contains("protein") { + "protein" + } else if param_str.contains("dayhoff") { + "dayhoff" + } else if param_str.contains("hp") { + "hp" + } else { + bail!("Unrecognized molecule type in params string"); + }; + + // Build signature templates based on parsed parameters and detected moltype let sig_templates = crate::manysketch::build_siginfo(¶ms_vec, &moltype); if sig_templates.is_empty() { From c8c3e1bfe5642676845d94f018face2a95ea9b2d Mon Sep 17 00:00:00 2001 From: Mohamed Abuelanin Date: Mon, 30 Sep 2024 10:43:42 -0700 Subject: [PATCH 3/5] add --name --- src/lib.rs | 5 +++-- .../sourmash_plugin_branchwater/__init__.py | 8 ++++++-- src/singlesketch.rs | 15 +++++++++------ 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 88ec9a40..40789191 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -303,13 +303,14 @@ fn do_manysketch( } #[pyfunction] -#[pyo3(signature = (input_filename, param_str, output))] +#[pyo3(signature = (input_filename, param_str, output, name))] fn do_singlesketch( input_filename: String, param_str: String, output: String, + name: String, ) -> anyhow::Result { - match singlesketch::singlesketch(input_filename, param_str, output) { + match singlesketch::singlesketch(input_filename, param_str, output, name) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 09d85db8..28fcf581 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -352,6 +352,7 @@ def __init__(self, p): help='output file for the signature') p.add_argument('-p', '--param-string', action='append', type=str, default=[], help='parameter string for sketching (default: k=31,scaled=1000)') + p.add_argument('-n', '--name', help="optional name for the signature") def main(self, args): print_version() @@ -373,12 +374,15 @@ def main(self, args): # Convert the list of param strings to a single string args.param_string = "_".join(updated_param_strings).lower() - notify(f"sketching file '{args.input_filename}' with params '{args.param_string}'") + # If --name is not provided, default to input_filename + signature_name = args.name if args.name else args.input_filename + notify(f"sketching file '{args.input_filename}' with params '{args.param_string}' and name '{signature_name}'") super().main(args) status = sourmash_plugin_branchwater.do_singlesketch(args.input_filename, args.param_string, - args.output) + args.output, + signature_name) # Pass the name to Rust if status == 0: notify(f"...singlesketch is done! results in '{args.output}'") return status diff --git a/src/singlesketch.rs b/src/singlesketch.rs index 2ca28c01..4ccc528e 100644 --- a/src/singlesketch.rs +++ b/src/singlesketch.rs @@ -5,7 +5,12 @@ use needletail::{parse_fastx_file, parse_fastx_reader}; use std::fs::File; use std::io::BufWriter; -pub fn singlesketch(input_filename: String, param_str: String, output: String) -> Result<()> { +pub fn singlesketch( + input_filename: String, + param_str: String, + output: String, + name: String, +) -> Result<()> { // Parse parameter string into params_vec let param_result = parse_params_str(param_str.clone()); let params_vec = match param_result { @@ -30,14 +35,12 @@ pub fn singlesketch(input_filename: String, param_str: String, output: String) - }; // Build signature templates based on parsed parameters and detected moltype - let sig_templates = crate::manysketch::build_siginfo(¶ms_vec, &moltype); + let mut sigs = crate::manysketch::build_siginfo(¶ms_vec, &moltype); - if sig_templates.is_empty() { + if sigs.is_empty() { bail!("No signatures to build for the given parameters."); } - let mut sigs = sig_templates.clone(); - // Open FASTA file reader let mut reader = if input_filename == "-" { let stdin = std::io::stdin(); @@ -66,7 +69,7 @@ pub fn singlesketch(input_filename: String, param_str: String, output: String) - // Set name and filename for signatures sigs.iter_mut().for_each(|sig| { - sig.set_name(&input_filename); + sig.set_name(&name); // Use the provided name sig.set_filename(&input_filename); }); From ddfc38231bcc7a197b06b7ba693e9b1fd704d170 Mon Sep 17 00:00:00 2001 From: Mohamed Abuelanin Date: Mon, 30 Sep 2024 11:47:34 -0700 Subject: [PATCH 4/5] add docs and stdout and tests --- doc/README.md | 17 ++++ .../sourmash_plugin_branchwater/__init__.py | 11 +-- src/python/tests/test_sketch.py | 87 ++++++++++++++++++- src/singlesketch.rs | 23 +++-- 4 files changed, 125 insertions(+), 13 deletions(-) diff --git a/doc/README.md b/doc/README.md index eb7c4825..b0c68edd 100644 --- a/doc/README.md +++ b/doc/README.md @@ -3,6 +3,7 @@ | command | functionality | docs | | -------- | -------- | -------- | | `manysketch` | Rapidly build sketches for many input files | [link](#Running-manysketch) | +| `singlesketch` | Sketch a single sequence file | [link](#Running-singlesketch) | `fastgather` | Multithreaded `gather` of **one** metagenome against a database| [link](#Running-fastgather) | `fastmultigather` | Multithreaded `gather` of **multiple** metagenomes against a database | [link](#Running-fastmultigather) | `manysearch` | Multithreaded containment search for many queries in many large metagenomes | [link](#Running-manysearch) @@ -212,6 +213,22 @@ sourmash sig summarize proteins.zip In this case, three sketches of `protein`, `dayhoff`, and `hp` moltypes were made for each file in `proteins.csv` and saved to `proteins.zip`. + +## Running `singlesketch` + +The `singlesketch` command generates a sketch for a single sequence file. + +### Basic Usage +```bash +sourmash scripts singlesketch input.fa -p k=21,scaled=1000,dna -o output.sig --name signature_name +``` +### Using `stdin/stdout` +You can use `-` for `stdin` and output the result to `stdout`: +```bash +cat input.fa | sourmash scripts singlesketch - -o - +``` + + ### Running `multisearch` and `pairwise` The `multisearch` command compares one or more query genomes, and one or more subject genomes. It differs from `manysearch` by loading all genomes into memory. diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 28fcf581..4280a257 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -349,15 +349,15 @@ def __init__(self, p): super().__init__(p) p.add_argument('input_filename', help="input FASTA file or '-' for stdin") p.add_argument('-o', '--output', required=True, - help='output file for the signature') + help='output file for the signature or - for stdout') p.add_argument('-p', '--param-string', action='append', type=str, default=[], help='parameter string for sketching (default: k=31,scaled=1000)') - p.add_argument('-n', '--name', help="optional name for the signature") + p.add_argument('-n', '--name', help="optional name for the signature, default is the basename of input path") def main(self, args): print_version() if not args.param_string: - args.param_string = ["k=31,scaled=1000"] + args.param_string = ["k=31,scaled=1000,dna"] # Check and append 'dna' if no moltype is found in a param string moltypes = ["dna", "protein", "dayhoff", "hp"] @@ -374,8 +374,9 @@ def main(self, args): # Convert the list of param strings to a single string args.param_string = "_".join(updated_param_strings).lower() - # If --name is not provided, default to input_filename - signature_name = args.name if args.name else args.input_filename + # If --name is not provided, default to input_filename, but if the source file is -, set name to empty string + signature_name = args.name if args.name else os.path.basename(args.input_filename) if args.input_filename != "-" else "" + notify(f"sketching file '{args.input_filename}' with params '{args.param_string}' and name '{signature_name}'") super().main(args) diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py index eabd8087..98f08058 100644 --- a/src/python/tests/test_sketch.py +++ b/src/python/tests/test_sketch.py @@ -3,7 +3,7 @@ import pandas import sourmash from sourmash import index - +import io from . import sourmash_tst_utils as utils @@ -894,3 +894,88 @@ def test_manysketch_prefix_duplicated_force(runtmp, capfd): print(sigs) assert len(sigs) == 3 + +def test_singlesketch_simple(runtmp): + """Test basic single sketching with default parameters.""" + fa1 = get_test_data('short.fa') + output = runtmp.output('short.sig') + + # Run the singlesketch command + runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output) + + # Check if the output exists and contains the expected data + assert os.path.exists(output) + sig = sourmash.load_one_signature(output) + + assert sig.name == 'short.fa' + assert sig.minhash.ksize == 31 + assert sig.minhash.is_dna + assert sig.minhash.scaled == 1000 + + +def test_singlesketch_with_name(runtmp): + """Test single sketching with a custom name.""" + fa1 = get_test_data('short.fa') + output = runtmp.output('short_named.sig') + + # Run the singlesketch command with the --name option + runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output, '-n', 'custom_name') + + # Check if the output exists and contains the expected data + assert os.path.exists(output) + sig = sourmash.load_one_signature(output) + + assert sig.name == 'custom_name' + assert sig.minhash.ksize == 31 + assert sig.minhash.is_dna + assert sig.minhash.scaled == 1000 + + +def test_singlesketch_mult_k(runtmp): + """Test single sketching with multiple k-mer sizes.""" + fa1 = get_test_data('short.fa') + output = runtmp.output('short_mult_k.sig') + + # Run the singlesketch command with multiple k sizes + runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output, '-p', 'k=21,scaled=100', '-p', 'k=31,scaled=100') + + # Check if the output exists and contains the expected data + assert os.path.exists(output) + sigs = list(sourmash.load_signatures(output)) + + # Verify that two signatures with different k-mer sizes exist + assert len(sigs) == 2 + assert any(sig.minhash.ksize == 21 for sig in sigs) + assert any(sig.minhash.ksize == 31 for sig in sigs) + + +def test_singlesketch_mult_moltype(runtmp): + """Test single sketching with different molecule types.""" + fa1 = get_test_data('short-protein.fa') + output = runtmp.output('short_mult_moltype.sig') + + # Run the singlesketch command with multiple molecule types + runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output, '-p', 'protein,k=10,scaled=100') + + # Check if the output exists and contains the expected data + assert os.path.exists(output) + sig = sourmash.load_one_signature(output) + + # Verify that the signature has the correct molecule type and parameters + assert sig.minhash.ksize == 10 + assert sig.minhash.is_protein + assert sig.minhash.scaled == 100 + + +def test_singlesketch_invalid_params(runtmp, capfd): + """Test singlesketch command with invalid parameters.""" + fa1 = get_test_data('short.fa') + output = runtmp.output('short_invalid.sig') + + # Run the singlesketch command with an invalid parameter string + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'singlesketch', fa1, '-o', output, '-p', 'invalid_param') + + # Check that the error message is correct + captured = capfd.readouterr() + assert "Failed to parse params string" in captured.err diff --git a/src/singlesketch.rs b/src/singlesketch.rs index 4ccc528e..35c296f5 100644 --- a/src/singlesketch.rs +++ b/src/singlesketch.rs @@ -3,7 +3,7 @@ use anyhow::{bail, Result}; use camino::Utf8Path as Path; use needletail::{parse_fastx_file, parse_fastx_reader}; use std::fs::File; -use std::io::BufWriter; +use std::io::{self, BufWriter, Write}; pub fn singlesketch( input_filename: String, @@ -73,13 +73,22 @@ pub fn singlesketch( sig.set_filename(&input_filename); }); - // Write signatures to output file - let outpath = Path::new(&output); - let file = File::create(outpath)?; - let mut writer = BufWriter::new(file); + // Check if the output is stdout or a file + if output == "-" { + // Write signatures to stdout + let stdout = io::stdout(); + let mut handle = stdout.lock(); + serde_json::to_writer(&mut handle, &sigs)?; + handle.flush()?; + } else { + // Write signatures to output file + let outpath = Path::new(&output); + let file = File::create(outpath)?; + let mut writer = BufWriter::new(file); - // Write in JSON format - serde_json::to_writer(&mut writer, &sigs)?; + // Write in JSON format + serde_json::to_writer(&mut writer, &sigs)?; + } Ok(()) } From 0203cc9d99665912618b5f6583caeca5a8e31b49 Mon Sep 17 00:00:00 2001 From: Mohamed Abuelanin Date: Mon, 30 Sep 2024 12:09:33 -0700 Subject: [PATCH 5/5] report number of sequences --- src/singlesketch.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/singlesketch.rs b/src/singlesketch.rs index 35c296f5..546665b9 100644 --- a/src/singlesketch.rs +++ b/src/singlesketch.rs @@ -21,7 +21,6 @@ pub fn singlesketch( } }; - // Extract moltype from the param_str (assume it's always the first part) let moltype = if param_str.contains("dna") { "dna" } else if param_str.contains("protein") { @@ -49,6 +48,9 @@ pub fn singlesketch( parse_fastx_file(&input_filename)? }; + // Counter for the number of sequences processed (u64) + let mut sequence_count: u64 = 0; + // Parse FASTA and add to signature while let Some(record_result) = reader.next() { match record_result { @@ -62,6 +64,7 @@ pub fn singlesketch( .expect("Failed to add sequence"); } }); + sequence_count += 1; } Err(err) => eprintln!("Error while processing record: {:?}", err), } @@ -90,5 +93,12 @@ pub fn singlesketch( serde_json::to_writer(&mut writer, &sigs)?; } + eprintln!( + "calculated {} signatures for {} sequences in {}", + sigs.len(), + sequence_count, + input_filename + ); + Ok(()) }