From bcd74e931f269fa51a3fc88c05000b6306024a2e Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 21 Mar 2024 12:17:46 -0500 Subject: [PATCH 1/5] That was easier than expected. Need to test --- src/main.rs | 20 +++++++++++++++++--- src/utils/config.rs | 24 +++++++++++++++++++++++- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index 74d8512..6d675d6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,6 +5,7 @@ extern crate simplelog; extern crate serde_yaml; extern crate rand_distr; extern crate itertools; +extern crate xorshift; mod utils; @@ -13,7 +14,8 @@ use std::fs::File; use clap::{Parser}; use log::*; use simplelog::*; -use rand::thread_rng; +use rand::SeedableRng; +use xorshift::thread_rng; use utils::cli; use utils::config::{read_config_yaml, build_config_from_args}; @@ -59,8 +61,6 @@ fn main() -> Result<(), std::fmt::Error> { ) ]).unwrap(); - let mut rng = thread_rng(); - // set up the config struct based on whether there was an input config. Input config // overrides any other inputs. let config = if args.config != "" { @@ -72,6 +72,20 @@ fn main() -> Result<(), std::fmt::Error> { Ok(build_config_from_args(args).expect("Problem reading configuration yaml file")) }.unwrap(); + + let mut seed: u64; + if config.rng_seed.exists() { + seed = config.rng_seed.unwrap(); + } else { + // We pick a very large random u64. + // This is based on a stack overflow article about ensuring actual randomness (but, it's + // probably overkill for this application). + let mut seed_rng = thread_rng(); + seed = seed_rng.gen::(2**52, 2**53); + } + info!("Generating random numbers using the seed: {}", seed); + let mut rng = SeedableRng::seed_from_u64(seed); + run_neat(config, &mut rng).unwrap(); Ok(()) } diff --git a/src/utils/config.rs b/src/utils/config.rs index 86ac94c..428b90d 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -50,6 +50,7 @@ pub struct RunConfiguration { pub produce_fasta: bool, pub produce_vcf: bool, pub produce_bam: bool, + pub rng_seed: Option, pub overwrite_output: bool, pub output_dir: String, pub output_prefix: String, @@ -77,6 +78,7 @@ pub struct ConfigBuilder { produce_fasta: bool, produce_vcf: bool, produce_bam: bool, + pub rng_seed: Option, overwrite_output: bool, output_dir: String, output_prefix: String, @@ -104,6 +106,7 @@ impl ConfigBuilder { produce_fasta: false, produce_vcf: false, produce_bam: false, + rng_seed: None, overwrite_output: false, output_dir: String::from(env::current_dir().unwrap().to_str().unwrap()), output_prefix: String::from("neat_out"), @@ -171,6 +174,11 @@ impl ConfigBuilder { self } + pub fn set_rng_seed(mut self, rng_seed: u64) -> ConfigBuilder { + self.rng_seed = Option::from(rng_seed); + self + } + pub fn set_overwrite_output(mut self) -> ConfigBuilder { self.overwrite_output = true; self @@ -228,6 +236,9 @@ impl ConfigBuilder { if self.produce_bam { debug!("Produce bam file: {}.bam", file_prefix) } + if self.rng_seed.is_some() { + debug!("Using rng seed: {}", self.rng_seed.unwrap()) + } Ok(()) } @@ -246,6 +257,7 @@ impl ConfigBuilder { produce_fasta: self.produce_fasta, produce_vcf: self.produce_vcf, produce_bam: self.produce_bam, + rng_seed: self.rng_seed, overwrite_output: self.overwrite_output, output_dir: self.output_dir, output_prefix: self.output_prefix, @@ -342,6 +354,12 @@ pub fn read_config_yaml(yaml: String) -> Result, ConfigErr config_builder = config_builder.set_produce_bam(true) } }, + "rng_seed" => { + // Optional rng seed to replicate results + if value != ".".to_string() { + config_builder = config_builder.set_rng_seed(value.parse::().unwrap()) + } + }, "overwrite_output" => { // overwrite_output is false by default, to prevent data loss, setting this flag // will instead enable rusty-neat to overwrite the output @@ -416,13 +434,13 @@ mod tests { produce_bam: true, produce_fasta: true, produce_vcf: true, + rng_seed: None, overwrite_output: true, output_dir: String::from("/my/my"), output_prefix: String::from("Hey.hey") }; println!("{:?}", test_configuration); - assert_eq!(test_configuration.reference, "Hello.world".to_string()); assert_eq!(test_configuration.read_len, 100); assert_eq!(test_configuration.coverage, 22); @@ -435,6 +453,7 @@ mod tests { assert_eq!(test_configuration.produce_vcf, true); assert_eq!(test_configuration.produce_bam, true); assert_eq!(test_configuration.produce_fasta, true); + assert_eq!(test_configuration.rng_seed, None); assert_eq!(test_configuration.overwrite_output, true); assert_eq!(test_configuration.output_dir, "/my/my".to_string()); assert_eq!(test_configuration.output_prefix, "Hey.hey".to_string()); @@ -481,6 +500,7 @@ mod tests { assert_eq!(config.produce_fastq, true); assert_eq!(config.produce_vcf, false); assert_eq!(config.produce_bam, false); + assert_eq!(config.rng_seed, None); config = config.set_mutation_rate(0.111) .set_ploidy(3) .set_paired_ended(true) @@ -490,6 +510,7 @@ mod tests { .set_produce_fasta(true) .set_produce_vcf(true) .set_produce_bam(true) + .set_rng_seed(11393) .set_overwrite_output(); assert_eq!(config.mutation_rate, 0.111); assert_eq!(config.ploidy, 3); @@ -500,6 +521,7 @@ mod tests { assert_eq!(config.produce_fastq, false); assert_eq!(config.produce_vcf, true); assert_eq!(config.produce_bam, true); + assert_eq!(config.rng_seed, 11393); } #[test] From 611f14b931382412d3f06b17153fc89d1cd429ab Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 21 Mar 2024 16:22:10 -0500 Subject: [PATCH 2/5] Updated error handling, need to test new errors --- Cargo.lock | 2 + Cargo.toml | 2 + src/main.rs | 22 ++++----- src/utils.rs | 2 +- src/utils/config.rs | 14 +++++- src/utils/fasta_tools.rs | 24 ++++++++-- src/utils/make_reads.rs | 46 +++++++++++++----- src/utils/mutate.rs | 14 +++--- src/utils/neat_rng.rs | 101 +++++++++++++++++++++++++++++++++++++++ src/utils/nucleotides.rs | 8 ++-- src/utils/runner.rs | 36 +++++++------- src/utils/vcf_tools.rs | 9 ++-- 12 files changed, 220 insertions(+), 60 deletions(-) create mode 100644 src/utils/neat_rng.rs diff --git a/Cargo.lock b/Cargo.lock index 4233824..a2e316e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -648,6 +648,8 @@ dependencies = [ "itertools", "log", "rand 0.9.0-alpha.0", + "rand_chacha", + "rand_core 0.9.0-alpha.0", "rand_distr", "serde_yaml", "simplelog", diff --git a/Cargo.toml b/Cargo.toml index 2f22a48..2fcc557 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,3 +14,5 @@ clap = { version = "4.5.1", features = ["derive"] } itertools = "0.12.1" assert_fs = "1.1.1" rand_distr = "0.5.0-alpha.0" +rand_core = "0.9.0-alpha.0" +rand_chacha = "0.9.0-alpha.0" diff --git a/src/main.rs b/src/main.rs index 6d675d6..453508e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,7 +5,8 @@ extern crate simplelog; extern crate serde_yaml; extern crate rand_distr; extern crate itertools; -extern crate xorshift; +extern crate rand_core; +extern crate rand_chacha; mod utils; @@ -14,13 +15,14 @@ use std::fs::File; use clap::{Parser}; use log::*; use simplelog::*; -use rand::SeedableRng; -use xorshift::thread_rng; - +use rand::{Rng, SeedableRng}; +use rand::thread_rng; +use rand_core::RngCore; use utils::cli; use utils::config::{read_config_yaml, build_config_from_args}; use utils::file_tools::check_parent; use utils::runner::run_neat; +use utils::neat_rng::NeatRng; fn main() -> Result<(), std::fmt::Error> { @@ -73,18 +75,16 @@ fn main() -> Result<(), std::fmt::Error> { }.unwrap(); - let mut seed: u64; - if config.rng_seed.exists() { + let seed: u64; + if !config.rng_seed.is_none() { seed = config.rng_seed.unwrap(); } else { - // We pick a very large random u64. - // This is based on a stack overflow article about ensuring actual randomness (but, it's - // probably overkill for this application). + // We pick a random u64 to use as a seed let mut seed_rng = thread_rng(); - seed = seed_rng.gen::(2**52, 2**53); + seed = seed_rng.next_u64(); } info!("Generating random numbers using the seed: {}", seed); - let mut rng = SeedableRng::seed_from_u64(seed); + let mut rng: NeatRng = SeedableRng::seed_from_u64(seed); run_neat(config, &mut rng).unwrap(); Ok(()) diff --git a/src/utils.rs b/src/utils.rs index ff415ec..b11d8c8 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -7,5 +7,5 @@ pub mod mutate; pub mod fastq_tools; pub mod vcf_tools; pub mod nucleotides; - +pub mod neat_rng; pub mod runner; \ No newline at end of file diff --git a/src/utils/config.rs b/src/utils/config.rs index 428b90d..abc648a 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -5,6 +5,7 @@ use std::string::String; use crate::utils::cli::Cli; use log::{debug, warn, error}; use std::env; +use std::fmt::{Display, Formatter}; /// This is the run configuration for this particular run, which holds the parameters needed by the /// various side functions. It is build with a ConfigurationBuilder, which can take either a @@ -78,7 +79,7 @@ pub struct ConfigBuilder { produce_fasta: bool, produce_vcf: bool, produce_bam: bool, - pub rng_seed: Option, + rng_seed: Option, overwrite_output: bool, output_dir: String, output_prefix: String, @@ -90,6 +91,15 @@ pub enum ConfigError { ConfigurationError, } +impl Display for ConfigError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match *self { + ConfigError::FileReadError => { write!(f, "Problem reading the configuration file.") }, + ConfigError::ConfigurationError => { write!(f, "Problem creating the configuration for this run.")}, + } + } +} + impl ConfigBuilder { pub fn new() -> ConfigBuilder { ConfigBuilder { @@ -521,7 +531,7 @@ mod tests { assert_eq!(config.produce_fastq, false); assert_eq!(config.produce_vcf, true); assert_eq!(config.produce_bam, true); - assert_eq!(config.rng_seed, 11393); + assert_eq!(config.rng_seed, Some(11393)); } #[test] diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs index d0b6c7f..d47d342 100644 --- a/src/utils/fasta_tools.rs +++ b/src/utils/fasta_tools.rs @@ -3,12 +3,28 @@ use std::io; use std::io::Write; use std::*; use HashMap; +use std::fmt::{Display, Formatter}; use utils::file_tools::read_lines; use utils::file_tools::open_file; use utils::nucleotides::{u8_to_base, base_to_u8}; /// This library contains tools needed to process fasta files as input and output. +#[derive(Debug)] +enum FastaError { + FastaReadError, + FastaWriteError, +} + +impl Display for FastaError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match *self { + FastaError::FastaReadError => { write!(f, "Error reading the fasta file") }, + FastaError::FastaWriteError => { write!(f, "Error writing a fasta file") }, + } + } +} + pub fn sequence_array_to_string(input_array: &Vec) -> String { // Converts a sequence vector into a string representing the DNA sequence let mut return_string = String::new(); @@ -20,7 +36,7 @@ pub fn sequence_array_to_string(input_array: &Vec) -> String { pub fn read_fasta( fasta_path: &str -) -> (Box>>, Vec) { +) -> Result<(Box>>, Vec), FastaError> { // Reads a fasta file and turns it into a HashMap and puts it in the heap info!("Reading fasta: {}", fasta_path); @@ -49,7 +65,7 @@ pub fn read_fasta( }); // Need to pick up the last one fasta_map.entry(current_key.clone()).or_insert(temp_seq.clone()); - (Box::new(fasta_map), fasta_order) + Ok((Box::new(fasta_map), fasta_order)) } pub fn write_fasta( @@ -112,7 +128,7 @@ mod tests { #[test] fn test_read_fasta() { let test_fasta = "data/H1N1.fa"; - let (_test_map, map_order) = read_fasta(test_fasta); + let (_test_map, map_order) = read_fasta(test_fasta).unwrap(); assert_eq!(map_order[0], "H1N1_HA".to_string()) } @@ -120,7 +136,7 @@ mod tests { #[should_panic] fn test_read_bad_fasta() { let test_fasta = "data/fake.fasta"; - read_fasta(test_fasta); + read_fasta(test_fasta).unwrap(); } #[test] diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs index fe8499f..5376f2c 100644 --- a/src/utils/make_reads.rs +++ b/src/utils/make_reads.rs @@ -1,10 +1,12 @@ use std::collections::{HashSet, VecDeque}; +use std::fmt::{Display, Formatter, write}; use rand::RngCore; -use rand::rngs::ThreadRng; use rand::seq::SliceRandom; use rand_distr::{Normal, Distribution}; +use utils::neat_rng::NeatRng; + /// This is the core functionality of NEAT. Generate reads turns a mutated fasta array into short reads. /// The idea of cover_dataset is we generate a set of coordinates /// That define reads and covers the dataset coverage number times, to give the contig the proper @@ -12,13 +14,28 @@ use rand_distr::{Normal, Distribution}; /// the mutated fasta file. These will either be read-length fragments or fragment model length /// fragments. +#[derive(Debug)] +enum GenerateReadsError { + CoverDatasetError, + GenerateReadsError, +} + +impl Display for GenerateReadsError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match *self { + GenerateReadsError::CoverDatasetError => { write!(f, "Error generating coordinates of fragments") } + GenerateReadsError::GenerateReadsError => { write!(f, "Error generating reads from coordinates") } + } + } +} + fn cover_dataset( span_length: usize, read_length: usize, mut fragment_pool: Vec, coverage: usize, - mut rng: &mut ThreadRng, -) -> Vec<(usize, usize)> { + mut rng: &mut NeatRng, +) -> Result, GenerateReadsError> { /* Takes: span_length: Total number of bases in the sequence @@ -100,7 +117,11 @@ fn cover_dataset( gap_size += wildcard; } } - read_set + if read_set.is_empty() { + Err(GenerateReadsError::CoverDatasetError) + } else { + Ok(read_set) + } } pub fn generate_reads( @@ -110,8 +131,8 @@ pub fn generate_reads( paired_ended: bool, mean: Option, st_dev: Option, - mut rng: &mut ThreadRng, -) -> Box>> { + mut rng: &mut NeatRng, +) -> Result>>, GenerateReadsError>{ /* Takes: mutated_sequence: a vector of u8's representing the mutated sequence. @@ -148,7 +169,7 @@ pub fn generate_reads( fragment_pool, *coverage, &mut rng, - ); + ).unwrap(); // Generate the reads from the read positions. for (start, end) in read_positions { @@ -161,7 +182,8 @@ pub fn generate_reads( #[cfg(test)] mod tests { use super::*; - use rand::thread_rng; + use rand::SeedableRng; + use utils::neat_rng::NeatRng; #[test] fn test_cover_dataset() { @@ -169,7 +191,7 @@ mod tests { let read_length = 10; let fragment_pool = vec![10]; let coverage = 1; - let mut rng = thread_rng(); + let mut rng = NeatRng::seed_from_u64(0); let cover = cover_dataset( span_length, @@ -187,7 +209,7 @@ mod tests { let read_length = 100; let fragment_pool = vec![300]; let coverage = 1; - let mut rng = thread_rng(); + let mut rng = NeatRng::seed_from_u64(0); let cover = cover_dataset( span_length, @@ -207,7 +229,7 @@ mod tests { let paired_ended = false; let mean = None; let st_dev = None; - let mut rng = thread_rng(); + let mut rng = NeatRng::seed_from_u64(0); let reads = generate_reads( &mutated_sequence, &read_length, @@ -229,7 +251,7 @@ mod tests { let paired_ended = true; let mean = Some(200.0); let st_dev = Some(1.0); - let mut rng = thread_rng(); + let mut rng = NeatRng::seed_from_u64(0); let reads = generate_reads( &mutated_sequence, &read_length, diff --git a/src/utils/mutate.rs b/src/utils/mutate.rs index 914e74d..0e7557d 100644 --- a/src/utils/mutate.rs +++ b/src/utils/mutate.rs @@ -1,10 +1,11 @@ use std::cmp::max; use std::collections::HashMap; -use rand::prelude::{ThreadRng, IndexedRandom}; +use rand::prelude::IndexedRandom; use rand::Rng; use log::debug; use itertools::izip; use utils::nucleotides::NucModel; +use utils::neat_rng::NeatRng; /// This is a basic mutation with SNPs using a basic mutation model. /// mutate_fasta takes a fasta Hashmap and returns a mutated version and the locations of the @@ -14,7 +15,7 @@ use utils::nucleotides::NucModel; pub fn mutate_fasta( file_struct: &HashMap>, - mut rng: &mut ThreadRng + mut rng: &mut NeatRng ) -> (Box>>, Box>>) { /* Takes: @@ -75,7 +76,7 @@ pub fn mutate_fasta( fn mutate_sequence( sequence: &Vec, num_positions: usize, - mut rng: &mut ThreadRng + mut rng: &mut NeatRng ) -> (Vec, Vec<(usize, u8, u8)>) { /* Takes: @@ -141,14 +142,15 @@ fn mutate_sequence( #[cfg(test)] mod tests { - use rand::thread_rng; + use rand::SeedableRng; + use utils::neat_rng::NeatRng; use super::*; #[test] fn test_mutate_sequence() { let seq1: Vec = vec![4, 4, 0, 0, 0, 1, 1, 2, 0, 3, 1, 1, 1]; let num_positions = 2; - let mut rng = thread_rng(); + let mut rng = NeatRng::seed_from_u64(0); let mutant = mutate_sequence(&seq1, num_positions, &mut rng); assert_eq!(mutant.0.len(), seq1.len()); assert!(!mutant.1.is_empty()); @@ -163,7 +165,7 @@ mod tests { let file_struct: HashMap> = HashMap::from([ ("chr1".to_string(), seq.clone()) ]); - let mut rng = thread_rng(); + let mut rng = NeatRng::seed_from_u64(0); let mutations = mutate_fasta( &file_struct, &mut rng, diff --git a/src/utils/neat_rng.rs b/src/utils/neat_rng.rs new file mode 100644 index 0000000..0357747 --- /dev/null +++ b/src/utils/neat_rng.rs @@ -0,0 +1,101 @@ +// Copyright 2018 Developers of the Rand project. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// This is basically a direct lift from rand::rngs::StdRng. Why did I do it this way? I want +// flexibility to change the rng behind the scenes, and was having trouble implementing RNGCore +// myself. I could have just stolen the techniques, but decided to give credit. +// I have borrowed this code for Rust because I am not 100% how to borrow it the "right" way. + +//! The standard RNG + +use rand_core::{CryptoRng, Error, RngCore, SeedableRng}; + +use rand_chacha::ChaCha12Rng as Rng; + +/// The standard RNG. The PRNG algorithm in `NeatRng` is chosen to be efficient +/// on the current platform, to be statistically strong and unpredictable +/// (meaning a cryptographically secure PRNG). +/// +/// The current algorithm used is the ChaCha block cipher with 12 rounds. Please +/// see this relevant [rand issue] for the discussion. This may change as new +/// evidence of cipher security and performance becomes available. +/// +/// The algorithm is deterministic but should not be considered reproducible +/// due to dependence on configuration and possible replacement in future +/// library versions. For a secure reproducible generator, we recommend use of +/// the [rand_chacha] crate directly. +/// +/// [rand_chacha]: https://crates.io/crates/rand_chacha +/// [rand issue]: https://github.com/rust-random/rand/issues/932 +#[cfg_attr(doc_cfg, doc(cfg(feature = "std_rng")))] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct NeatRng(Rng); + +impl RngCore for NeatRng { + #[inline(always)] + fn next_u32(&mut self) -> u32 { + self.0.next_u32() + } + + #[inline(always)] + fn next_u64(&mut self) -> u64 { + self.0.next_u64() + } + + #[inline(always)] + fn fill_bytes(&mut self, dest: &mut [u8]) { + self.0.fill_bytes(dest); + } + + #[inline(always)] + fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> { + self.0.try_fill_bytes(dest) + } +} + +impl SeedableRng for NeatRng { + type Seed = ::Seed; + + #[inline(always)] + fn from_seed(seed: Self::Seed) -> Self { + NeatRng(Rng::from_seed(seed)) + } + + #[inline(always)] + fn from_rng(rng: R) -> Result { + Rng::from_rng(rng).map(NeatRng) + } +} + +impl CryptoRng for NeatRng {} + + +#[cfg(test)] +mod test { + use super::*; + use rand::{RngCore, SeedableRng}; + + #[test] + fn test_neat_rng_construction() { + // Test value-stability of NeatRng. This is expected to break any time + // the algorithm is changed. + #[rustfmt::skip] + let seed = [1,0,0,0, 23,0,0,0, 200,1,0,0, 210,30,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0]; + + let target = [10719222850664546238, 14064965282130556830]; + + let mut rng0 = NeatRng::from_seed(seed); + let x0 = rng0.next_u64(); + + let mut rng1 = NeatRng::from_rng(rng0).unwrap(); + let x1 = rng1.next_u64(); + + assert_eq!([x0, x1], target); + } +} \ No newline at end of file diff --git a/src/utils/nucleotides.rs b/src/utils/nucleotides.rs index ad517b9..dc68091 100644 --- a/src/utils/nucleotides.rs +++ b/src/utils/nucleotides.rs @@ -1,6 +1,6 @@ -use rand::prelude::ThreadRng; use itertools::izip; use rand::seq::IndexedRandom; +use utils::neat_rng::NeatRng; /// Throughout this program, we are standardizing the use of a u8 representation of the nucleotides /// A = 0 @@ -86,7 +86,7 @@ impl NucModel { } } - pub fn choose_new_nuc(&self, base: u8, mut rng: &mut ThreadRng) -> u8 { + pub fn choose_new_nuc(&self, base: u8, mut rng: &mut NeatRng) -> u8 { // We'll handle the trivial case first: if base >= 4 { return 4; @@ -121,7 +121,7 @@ impl NucModel { #[cfg(test)] mod tests { - use rand::thread_rng; + use rand_core::SeedableRng; use super::*; #[test] @@ -150,7 +150,7 @@ mod tests { let c_weights = vec![20, 0, 1, 1]; let g_weights = vec![1, 1, 0, 20]; let t_weights = vec![20, 1, 20, 0]; - let mut rng = thread_rng(); + let mut rng = NeatRng::seed_from_u64(0); let test_model = NucModel::from(vec![a_weights, c_weights, g_weights, t_weights]); // It actually mutates the base assert_ne!(test_model.choose_new_nuc(0, &mut rng), 0); diff --git a/src/utils/runner.rs b/src/utils/runner.rs index 8393c89..4e310c7 100644 --- a/src/utils/runner.rs +++ b/src/utils/runner.rs @@ -1,35 +1,40 @@ use std::collections::HashSet; -use std::fmt::{Display, Formatter}; +use std::fmt::{Display, Formatter, write}; use log::info; use rand::prelude::SliceRandom; -use rand::rngs::ThreadRng; use utils::config::RunConfiguration; +use rand::SeedableRng; use utils::fasta_tools::{read_fasta, write_fasta}; use utils::fastq_tools::write_fastq; use utils::make_reads::generate_reads; use utils::mutate::mutate_fasta; +use utils::neat_rng::NeatRng; use utils::vcf_tools::write_vcf; #[derive(Debug)] pub enum NeatErrors { - FastaError, - FastqError, - VcfError, + FileError, + FastaReadError, } impl Display for NeatErrors { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - todo!() + match *self { + NeatErrors::FileError => { write!(f, "Neat encountered an error with a file.")} + NeatErrors::FastaReadError => { write!(f, "Error creating fasta data") } + } } } -pub fn run_neat(config: Box, mut rng: &mut ThreadRng) -> Result<(), NeatErrors>{ +pub fn run_neat(config: Box, mut rng: &mut NeatRng) -> Result<(), NeatErrors>{ // Create the prefix of the files to write let output_file = format!("{}/{}", config.output_dir, config.output_prefix); // Reading the reference file into memory info!("Mapping reference fasta file: {}", &config.reference); - let (fasta_map, fasta_order) = read_fasta(&config.reference); + let (fasta_map, fasta_order) = read_fasta(&config.reference) + .or(Err(NeatErrors::FastaReadError)) + .unwrap(); // Mutating the reference and recording the variant locations. info!("Mutating reference."); @@ -46,8 +51,8 @@ pub fn run_neat(config: Box, mut rng: &mut ThreadRng) -> Resul config.overwrite_output, &output_file, ) - .or(Err(NeatErrors::FastaError)) - .expect("Error writing fasta file"); + .or(Err(NeatErrors::FileError)) + .unwrap(); } if config.produce_vcf { @@ -61,8 +66,8 @@ pub fn run_neat(config: Box, mut rng: &mut ThreadRng) -> Resul &output_file, &mut rng ) - .or(Err(NeatErrors::VcfError)) - .expect("Error writing VCF file"); + .or(Err(NeatErrors::FileError)) + .unwrap(); } let mut read_sets: HashSet> = HashSet::new(); @@ -94,8 +99,8 @@ pub fn run_neat(config: Box, mut rng: &mut ThreadRng) -> Resul config.paired_ended, *outsets, ) - .or(Err(NeatErrors::FastqError)) - .expect("Problem writing fastq file"); + .or(Err(NeatErrors::FileError)) + .unwrap(); info!("Processing complete") } Ok(()) @@ -104,7 +109,6 @@ pub fn run_neat(config: Box, mut rng: &mut ThreadRng) -> Resul #[cfg(test)] mod tests { use std::fs; - use rand::thread_rng; use std::path::Path; use super::*; @@ -113,7 +117,7 @@ mod tests { let config = RunConfiguration::build().build(); run_neat( Box::new(config), - &mut thread_rng(), + &mut NeatRng::seed_from_u64(0), ).unwrap(); let fastq_file = Path::new("neat_out_r1.fastq").canonicalize().unwrap(); fs::remove_file(fastq_file).unwrap(); diff --git a/src/utils/vcf_tools.rs b/src/utils/vcf_tools.rs index 26b58af..2871ade 100644 --- a/src/utils/vcf_tools.rs +++ b/src/utils/vcf_tools.rs @@ -4,10 +4,10 @@ use std::collections::HashMap; use std::io; use std::io::Write; use rand::Rng; -use rand::rngs::ThreadRng; use rand::seq::IndexedRandom; use utils::nucleotides::u8_to_base; use utils::file_tools::open_file; +use utils::neat_rng::NeatRng; fn genotype_to_string(genotype: Vec) -> String { /* @@ -28,7 +28,7 @@ pub fn write_vcf( reference_path: &str, overwrite_output: bool, output_file_prefix: &str, - mut rng: &mut ThreadRng, + mut rng: &mut NeatRng, ) -> io::Result<()> { /* Takes: @@ -105,9 +105,10 @@ pub fn write_vcf( #[cfg(test)] mod tests { use std::fs; - use rand::thread_rng; + use rand::SeedableRng; use super::*; use std::path::Path; + use utils::neat_rng::NeatRng; #[test] fn test_genotype_to_string() { @@ -125,7 +126,7 @@ mod tests { let reference_path = "/fake/path/to/H1N1.fa"; let overwrite_output = false; let output_file_prefix = "test"; - let mut rng = thread_rng(); + let mut rng = NeatRng::seed_from_u64(0); write_vcf( &variant_locations, &fasta_order, From 355b0b2b6fe5edee1d3cc06fbaa615eb5023c983 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 21 Mar 2024 16:46:44 -0500 Subject: [PATCH 3/5] Running, but needs to test the errors --- src/main.rs | 2 +- src/utils/fasta_tools.rs | 2 +- src/utils/make_reads.rs | 18 +++++++++++------- src/utils/runner.rs | 17 +++++------------ 4 files changed, 18 insertions(+), 21 deletions(-) diff --git a/src/main.rs b/src/main.rs index 453508e..2e34a62 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,7 +15,7 @@ use std::fs::File; use clap::{Parser}; use log::*; use simplelog::*; -use rand::{Rng, SeedableRng}; +use rand::SeedableRng; use rand::thread_rng; use rand_core::RngCore; use utils::cli; diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs index d47d342..8cb43d5 100644 --- a/src/utils/fasta_tools.rs +++ b/src/utils/fasta_tools.rs @@ -11,7 +11,7 @@ use utils::nucleotides::{u8_to_base, base_to_u8}; /// This library contains tools needed to process fasta files as input and output. #[derive(Debug)] -enum FastaError { +pub enum FastaError { FastaReadError, FastaWriteError, } diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs index 5376f2c..d6846a9 100644 --- a/src/utils/make_reads.rs +++ b/src/utils/make_reads.rs @@ -1,5 +1,5 @@ use std::collections::{HashSet, VecDeque}; -use std::fmt::{Display, Formatter, write}; +use std::fmt::{Display, Formatter}; use rand::RngCore; use rand::seq::SliceRandom; @@ -15,7 +15,7 @@ use utils::neat_rng::NeatRng; /// fragments. #[derive(Debug)] -enum GenerateReadsError { +pub enum GenerateReadsError { CoverDatasetError, GenerateReadsError, } @@ -176,7 +176,11 @@ pub fn generate_reads( read_set.insert(mutated_sequence[start..end].into()); } // puts the reads in the heap. - Box::new(read_set) + if read_set.is_empty() { + Err(GenerateReadsError::GenerateReadsError) + } else { + Ok(Box::new(read_set)) + } } #[cfg(test)] @@ -200,7 +204,7 @@ mod tests { coverage, &mut rng, ); - assert_eq!(cover[0], (0,10)) + assert_eq!(cover.unwrap()[0], (0,10)) } #[test] @@ -218,7 +222,7 @@ mod tests { coverage, &mut rng, ); - assert_eq!(cover[0], (0, 300)) + assert_eq!(cover.unwrap()[0], (0, 300)) } #[test] @@ -238,7 +242,7 @@ mod tests { mean, st_dev, &mut rng, - ); + ).unwrap(); println!("{:?}", reads); assert!(reads.contains(&(vec![0, 0, 1, 0, 3, 3, 3, 3, 0, 0]))); } @@ -262,6 +266,6 @@ mod tests { &mut rng, ); println!("{:?}", reads); - assert!(!reads.is_empty()) + assert!(!reads.unwrap().is_empty()) } } \ No newline at end of file diff --git a/src/utils/runner.rs b/src/utils/runner.rs index 4e310c7..f0d9948 100644 --- a/src/utils/runner.rs +++ b/src/utils/runner.rs @@ -1,5 +1,5 @@ use std::collections::HashSet; -use std::fmt::{Display, Formatter, write}; +use std::fmt::{Display, Formatter}; use log::info; use rand::prelude::SliceRandom; use utils::config::RunConfiguration; @@ -33,7 +33,6 @@ pub fn run_neat(config: Box, mut rng: &mut NeatRng) -> Result< // Reading the reference file into memory info!("Mapping reference fasta file: {}", &config.reference); let (fasta_map, fasta_order) = read_fasta(&config.reference) - .or(Err(NeatErrors::FastaReadError)) .unwrap(); // Mutating the reference and recording the variant locations. @@ -50,9 +49,7 @@ pub fn run_neat(config: Box, mut rng: &mut NeatRng) -> Result< &fasta_order, config.overwrite_output, &output_file, - ) - .or(Err(NeatErrors::FileError)) - .unwrap(); + ).unwrap(); } if config.produce_vcf { @@ -65,9 +62,7 @@ pub fn run_neat(config: Box, mut rng: &mut NeatRng) -> Result< config.overwrite_output, &output_file, &mut rng - ) - .or(Err(NeatErrors::FileError)) - .unwrap(); + ).unwrap(); } let mut read_sets: HashSet> = HashSet::new(); @@ -82,7 +77,7 @@ pub fn run_neat(config: Box, mut rng: &mut NeatRng) -> Result< config.fragment_mean, config.fragment_st_dev, &mut rng - ); + ).unwrap(); read_sets.extend(*data_set); } @@ -98,9 +93,7 @@ pub fn run_neat(config: Box, mut rng: &mut NeatRng) -> Result< config.overwrite_output, config.paired_ended, *outsets, - ) - .or(Err(NeatErrors::FileError)) - .unwrap(); + ).unwrap(); info!("Processing complete") } Ok(()) From 807074258f0039a4cdc68ce2477374dc9afad00d Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 21 Mar 2024 23:16:49 -0500 Subject: [PATCH 4/5] I'm happy with error handling. I think it's simple but straightforward and introducing the complexity did not make things more clear, for me, so I went back to something relatively simple --- src/main.rs | 16 ++++--- src/utils/config.rs | 97 ++++++++++++++++++---------------------- src/utils/fasta_tools.rs | 24 ++-------- src/utils/make_reads.rs | 34 +++----------- src/utils/neat_rng.rs | 1 + src/utils/runner.rs | 25 +++-------- 6 files changed, 71 insertions(+), 126 deletions(-) diff --git a/src/main.rs b/src/main.rs index 2e34a62..b94db9e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,7 +24,7 @@ use utils::file_tools::check_parent; use utils::runner::run_neat; use utils::neat_rng::NeatRng; -fn main() -> Result<(), std::fmt::Error> { +fn main() { info!("Begin processing"); // parse the arguments from the command line @@ -71,10 +71,13 @@ fn main() -> Result<(), std::fmt::Error> { } else { info!("Using command line arguments."); debug!("Command line args: {:?}", &args); - Ok(build_config_from_args(args).expect("Problem reading configuration yaml file")) - }.unwrap(); - + build_config_from_args(args).unwrap_or_else(|error| { + panic!("Problem reading configuration yaml file {:?}", error) + }) + }; + // Generate the RNG used for this run. If one was given in the config file, use that, or else + // use thread_rng to generate a random seed, then seed using a SeedableRng based on StdRng let seed: u64; if !config.rng_seed.is_none() { seed = config.rng_seed.unwrap(); @@ -86,7 +89,8 @@ fn main() -> Result<(), std::fmt::Error> { info!("Generating random numbers using the seed: {}", seed); let mut rng: NeatRng = SeedableRng::seed_from_u64(seed); - run_neat(config, &mut rng).unwrap(); - Ok(()) + run_neat(config, &mut rng).unwrap_or_else(|error| { + panic!("Neat encountered a problem: {:?}", error) + }) } diff --git a/src/utils/config.rs b/src/utils/config.rs index abc648a..29e25bb 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -3,9 +3,9 @@ extern crate log; use std::collections::{HashMap}; use std::string::String; use crate::utils::cli::Cli; -use log::{debug, warn, error}; +use log::{debug, warn}; use std::env; -use std::fmt::{Display, Formatter}; +use std::path::Path; /// This is the run configuration for this particular run, which holds the parameters needed by the /// various side functions. It is build with a ConfigurationBuilder, which can take either a @@ -67,7 +67,7 @@ impl RunConfiguration { // The config builder allows us to construct a config in multiple different ways, depending // on the input. pub struct ConfigBuilder { - reference: String, + reference: Option, read_len: usize, coverage: usize, mutation_rate: f64, @@ -85,26 +85,11 @@ pub struct ConfigBuilder { output_prefix: String, } -#[derive(Debug)] -pub enum ConfigError { - FileReadError, - ConfigurationError, -} - -impl Display for ConfigError { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match *self { - ConfigError::FileReadError => { write!(f, "Problem reading the configuration file.") }, - ConfigError::ConfigurationError => { write!(f, "Problem creating the configuration for this run.")}, - } - } -} - impl ConfigBuilder { pub fn new() -> ConfigBuilder { ConfigBuilder { // Setting default values - reference: String::from("data/H1N1.fa"), + reference: None, read_len: 150, coverage: 10, mutation_rate: 0.001, @@ -125,7 +110,7 @@ impl ConfigBuilder { // Basically a bunch of setters pub fn set_reference(mut self, reference: String) -> ConfigBuilder { - self.reference = reference; + self.reference = Option::from(reference); self } @@ -204,8 +189,11 @@ impl ConfigBuilder { self } - pub fn check_and_print_config(&self) -> Result<(), ConfigError> { - debug!("Running rusty-neat to generate reads on {} with...", self.reference); + pub fn check_and_print_config(&self) { + if self.reference.is_none() { + panic!("No reference was specified.") + } + debug!("Running rusty-neat to generate reads on {} with...", self.reference.clone().unwrap()); debug!(" >read length: {}", self.read_len); debug!(" >coverage: {}", self.coverage); debug!(" >mutation rate: {}", self.mutation_rate); @@ -216,16 +204,14 @@ impl ConfigBuilder { } let file_prefix = format!("{}/{}", self.output_dir, self.output_prefix); if !(self.produce_fastq | self.produce_fasta | self.produce_vcf | self.produce_bam) { - error!("All file types set to false, no files would be produced."); - return Err(ConfigError::ConfigurationError); + panic!("All file types set to false, no files would be produced."); } if self.paired_ended { if self.fragment_mean.is_none() | self.fragment_st_dev.is_none() { - error!( + panic!( "Paired ended is set to true, but fragment mean \ and standard deviation were not set." ); - return Err(ConfigError::ConfigurationError); } if self.produce_fastq { debug!("\t> fragment mean: {}", self.fragment_mean.unwrap()); @@ -249,13 +235,12 @@ impl ConfigBuilder { if self.rng_seed.is_some() { debug!("Using rng seed: {}", self.rng_seed.unwrap()) } - Ok(()) } // Function to build the actual configuration. pub fn build(self) -> RunConfiguration { RunConfiguration { - reference: self.reference, + reference: self.reference.unwrap(), read_len: self.read_len, coverage: self.coverage, mutation_rate: self.mutation_rate, @@ -275,7 +260,7 @@ impl ConfigBuilder { } } -pub fn read_config_yaml(yaml: String) -> Result, ConfigError> { +pub fn read_config_yaml(yaml: String) -> Box { /* Reads an input configuration file from yaml using the serde package. Then sets the parameters based on the inputs. A "." value means to use the default value. @@ -285,7 +270,7 @@ pub fn read_config_yaml(yaml: String) -> Result, ConfigErr let f = std::fs::File::open(yaml); let file = match f { Ok(l) => l, - Err(_) => return Err(ConfigError::FileReadError), + Err(error) => panic!("Problem reading the cofig file: {}", error), }; // Uses serde_yaml to read the file into a HashMap let scrape_config: HashMap = serde_yaml::from_reader(file) @@ -297,11 +282,15 @@ pub fn read_config_yaml(yaml: String) -> Result, ConfigErr for (key, value) in scrape_config { match key.as_str() { "reference" => { - if !(value == ".".to_string() || (value == "REQUIRED".to_string())) { - config_builder = config_builder.set_reference(value) + if value != ".".to_string() && value != "".to_string() { + let reference_path = Path::new(&value); + if !reference_path.exists() { + panic!("Reference file not found: {}", value) + } else { + config_builder = config_builder.set_reference(value); + } } else { - error!("Reference is required."); - return Err(ConfigError::ConfigurationError) + panic!("Reference was not specified in config."); } }, "read_len" => { @@ -390,8 +379,8 @@ pub fn read_config_yaml(yaml: String) -> Result, ConfigErr _ => continue, } } - let _ = &config_builder.check_and_print_config().unwrap(); - Ok(Box::new(config_builder.build())) + let _ = &config_builder.check_and_print_config(); + Box::new(config_builder.build()) } pub fn build_config_from_args(args: Cli) -> Result, &'static str> { @@ -406,8 +395,7 @@ pub fn build_config_from_args(args: Cli) -> Result, &'stat if args.reference != "" { config_builder = config_builder.set_reference(args.reference) } else { - error!("No reference specified"); - return Err("Reference File Error."); + panic!("No reference specified"); } // The default value works directly for the config builder config_builder = config_builder.set_read_len(args.read_length); @@ -421,7 +409,7 @@ pub fn build_config_from_args(args: Cli) -> Result, &'stat config_builder = config_builder.set_output_prefix(args.output_file_prefix) }; // Wraps things in a Box to move this object to the heap - let _ = &config_builder.check_and_print_config().unwrap(); + let _ = &config_builder.check_and_print_config(); Ok(Box::new(config_builder.build())) } @@ -473,13 +461,13 @@ mod tests { fn test_build() { use super::*; let x = RunConfiguration::build(); - assert_eq!(x.reference, "data/H1N1.fa".to_string()) + assert!(x.reference.is_none()) } #[test] fn test_read_config_yaml() { let yaml = String::from("config/neat_test.yml"); - let test_config = read_config_yaml(yaml).unwrap(); + let test_config = read_config_yaml(yaml); assert_eq!(test_config.reference, "data/ecoli.fa".to_string()); assert_eq!(test_config.coverage, 3); } @@ -488,14 +476,14 @@ mod tests { #[should_panic] fn test_bad_yaml() { let yaml = String::from("fake_file.yml"); - read_config_yaml(yaml).unwrap(); + read_config_yaml(yaml); } #[test] #[should_panic] fn test_missing_ref() { let yaml = String::from("config/simple_template.yml"); - let _test_config = read_config_yaml(yaml).unwrap(); + read_config_yaml(yaml); } #[test] @@ -571,34 +559,37 @@ mod tests { #[test] fn test_overwrite_warn() { let mut config = ConfigBuilder::new(); + config = config.set_reference("data/H1N1.fa".to_string()); config = config.set_overwrite_output(); - config.check_and_print_config().unwrap(); + config.check_and_print_config(); } #[test] fn test_produce_fastq_messages() { let mut config = ConfigBuilder::new(); + config = config.set_reference("data/H1N1.fa".to_string()); config = config.set_paired_ended(true); config = config.set_fragment_mean(100.0); config = config.set_fragment_st_dev(10.0); // tests first branch of if statement for paired_ended & produce_fastq = true - config.check_and_print_config().unwrap(); + config.check_and_print_config(); // Checks the alternative pe = true, produce_fastq = false config = config.set_produce_fastq(false); // need to produce at least one file or check will panic config = config.set_produce_fasta(true); - config.check_and_print_config().unwrap(); + config.check_and_print_config(); } #[test] fn test_produce_fasta_messages() { let mut config = ConfigBuilder::new(); + config = config.set_reference("data/H1N1.fa".to_string()); config = config.set_produce_fasta(true); - config.check_and_print_config().unwrap(); + config.check_and_print_config(); config = config.set_produce_vcf(true); - config.check_and_print_config().unwrap(); + config.check_and_print_config(); config = config.set_produce_bam(true); - config.check_and_print_config().unwrap(); + config.check_and_print_config(); // If the unwraps all work we're good. } @@ -607,7 +598,7 @@ mod tests { fn test_no_files() { let mut config = ConfigBuilder::new(); config = config.set_produce_fastq(false); - config.check_and_print_config().unwrap(); + config.check_and_print_config(); } #[test] @@ -616,7 +607,7 @@ mod tests { let mut config = ConfigBuilder::new(); // paired end set to true, by default, fragment mean and st dev are None config = config.set_paired_ended(true); - config.check_and_print_config().unwrap(); + config.check_and_print_config(); } #[test] @@ -625,7 +616,7 @@ mod tests { let mut config = ConfigBuilder::new(); config = config.set_paired_ended(true); config = config.set_fragment_st_dev(10.0); - config.check_and_print_config().unwrap(); + config.check_and_print_config(); } #[test] @@ -634,7 +625,7 @@ mod tests { let mut config = ConfigBuilder::new(); config = config.set_paired_ended(true); config = config.set_fragment_mean(100.0); - config.check_and_print_config().unwrap(); + config.check_and_print_config(); } } \ No newline at end of file diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs index 8cb43d5..46030f0 100644 --- a/src/utils/fasta_tools.rs +++ b/src/utils/fasta_tools.rs @@ -3,28 +3,12 @@ use std::io; use std::io::Write; use std::*; use HashMap; -use std::fmt::{Display, Formatter}; use utils::file_tools::read_lines; use utils::file_tools::open_file; use utils::nucleotides::{u8_to_base, base_to_u8}; /// This library contains tools needed to process fasta files as input and output. -#[derive(Debug)] -pub enum FastaError { - FastaReadError, - FastaWriteError, -} - -impl Display for FastaError { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - match *self { - FastaError::FastaReadError => { write!(f, "Error reading the fasta file") }, - FastaError::FastaWriteError => { write!(f, "Error writing a fasta file") }, - } - } -} - pub fn sequence_array_to_string(input_array: &Vec) -> String { // Converts a sequence vector into a string representing the DNA sequence let mut return_string = String::new(); @@ -36,7 +20,7 @@ pub fn sequence_array_to_string(input_array: &Vec) -> String { pub fn read_fasta( fasta_path: &str -) -> Result<(Box>>, Vec), FastaError> { +) -> Result<(Box>>, Vec), io::Error> { // Reads a fasta file and turns it into a HashMap and puts it in the heap info!("Reading fasta: {}", fasta_path); @@ -61,7 +45,7 @@ pub fn read_fasta( } } }, - Err(error) => panic!("Error reading fasta file: {:?}", error), + Err(error) => panic!("Problem reading fasta file: {}", error) }); // Need to pick up the last one fasta_map.entry(current_key.clone()).or_insert(temp_seq.clone()); @@ -153,9 +137,9 @@ mod tests { &fasta_order, true, output_file - ); + ).unwrap(); let file_name = "test.fasta"; - assert_eq!(test_write.unwrap(), ()); + assert_eq!(test_write, ()); let attr = fs::metadata(file_name).unwrap(); assert!(attr.len() > 0); fs::remove_file(file_name)?; diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs index d6846a9..953a30c 100644 --- a/src/utils/make_reads.rs +++ b/src/utils/make_reads.rs @@ -1,5 +1,4 @@ use std::collections::{HashSet, VecDeque}; -use std::fmt::{Display, Formatter}; use rand::RngCore; use rand::seq::SliceRandom; @@ -14,28 +13,13 @@ use utils::neat_rng::NeatRng; /// the mutated fasta file. These will either be read-length fragments or fragment model length /// fragments. -#[derive(Debug)] -pub enum GenerateReadsError { - CoverDatasetError, - GenerateReadsError, -} - -impl Display for GenerateReadsError { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match *self { - GenerateReadsError::CoverDatasetError => { write!(f, "Error generating coordinates of fragments") } - GenerateReadsError::GenerateReadsError => { write!(f, "Error generating reads from coordinates") } - } - } -} - fn cover_dataset( span_length: usize, read_length: usize, mut fragment_pool: Vec, coverage: usize, mut rng: &mut NeatRng, -) -> Result, GenerateReadsError> { +) -> Vec<(usize, usize)> { /* Takes: span_length: Total number of bases in the sequence @@ -117,11 +101,7 @@ fn cover_dataset( gap_size += wildcard; } } - if read_set.is_empty() { - Err(GenerateReadsError::CoverDatasetError) - } else { - Ok(read_set) - } + read_set } pub fn generate_reads( @@ -132,7 +112,7 @@ pub fn generate_reads( mean: Option, st_dev: Option, mut rng: &mut NeatRng, -) -> Result>>, GenerateReadsError>{ +) -> Result>>, &'static str>{ /* Takes: mutated_sequence: a vector of u8's representing the mutated sequence. @@ -169,7 +149,7 @@ pub fn generate_reads( fragment_pool, *coverage, &mut rng, - ).unwrap(); + ); // Generate the reads from the read positions. for (start, end) in read_positions { @@ -177,7 +157,7 @@ pub fn generate_reads( } // puts the reads in the heap. if read_set.is_empty() { - Err(GenerateReadsError::GenerateReadsError) + Err("No reads generated") } else { Ok(Box::new(read_set)) } @@ -204,7 +184,7 @@ mod tests { coverage, &mut rng, ); - assert_eq!(cover.unwrap()[0], (0,10)) + assert_eq!(cover[0], (0,10)) } #[test] @@ -222,7 +202,7 @@ mod tests { coverage, &mut rng, ); - assert_eq!(cover.unwrap()[0], (0, 300)) + assert_eq!(cover[0], (0, 300)) } #[test] diff --git a/src/utils/neat_rng.rs b/src/utils/neat_rng.rs index 0357747..6a21b58 100644 --- a/src/utils/neat_rng.rs +++ b/src/utils/neat_rng.rs @@ -10,6 +10,7 @@ // flexibility to change the rng behind the scenes, and was having trouble implementing RNGCore // myself. I could have just stolen the techniques, but decided to give credit. // I have borrowed this code for Rust because I am not 100% how to borrow it the "right" way. +// And we may improve on this at some point, so this gives us more flexibility. //! The standard RNG diff --git a/src/utils/runner.rs b/src/utils/runner.rs index f0d9948..d87ac5f 100644 --- a/src/utils/runner.rs +++ b/src/utils/runner.rs @@ -1,5 +1,4 @@ use std::collections::HashSet; -use std::fmt::{Display, Formatter}; use log::info; use rand::prelude::SliceRandom; use utils::config::RunConfiguration; @@ -11,22 +10,7 @@ use utils::mutate::mutate_fasta; use utils::neat_rng::NeatRng; use utils::vcf_tools::write_vcf; -#[derive(Debug)] -pub enum NeatErrors { - FileError, - FastaReadError, -} - -impl Display for NeatErrors { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match *self { - NeatErrors::FileError => { write!(f, "Neat encountered an error with a file.")} - NeatErrors::FastaReadError => { write!(f, "Error creating fasta data") } - } - } -} - -pub fn run_neat(config: Box, mut rng: &mut NeatRng) -> Result<(), NeatErrors>{ +pub fn run_neat(config: Box, mut rng: &mut NeatRng) -> Result<(), &'static str>{ // Create the prefix of the files to write let output_file = format!("{}/{}", config.output_dir, config.output_prefix); @@ -106,14 +90,15 @@ mod tests { use super::*; #[test] - fn test_runner() -> Result<(), NeatErrors>{ - let config = RunConfiguration::build().build(); + fn test_runner() { + let mut config = RunConfiguration::build(); + config = config.set_reference("data/H1N1.fa".to_string()); + let config = config.build(); run_neat( Box::new(config), &mut NeatRng::seed_from_u64(0), ).unwrap(); let fastq_file = Path::new("neat_out_r1.fastq").canonicalize().unwrap(); fs::remove_file(fastq_file).unwrap(); - Ok(()) } } \ No newline at end of file From 5dd35f080439c689d053cceae0842dff404dabdc Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 21 Mar 2024 23:31:13 -0500 Subject: [PATCH 5/5] Changed style of comments --- src/utils/cli.rs | 12 ++++++------ src/utils/config.rs | 19 ++++++++++--------- src/utils/fasta_tools.rs | 4 ++-- src/utils/fastq_tools.rs | 4 ++-- src/utils/file_tools.rs | 4 ++-- src/utils/make_reads.rs | 14 +++++++------- src/utils/mutate.rs | 12 ++++++------ src/utils/neat_logger.rs | 0 src/utils/nucleotides.rs | 20 ++++++++++---------- 9 files changed, 45 insertions(+), 44 deletions(-) delete mode 100644 src/utils/neat_logger.rs diff --git a/src/utils/cli.rs b/src/utils/cli.rs index 2cb74fc..877d0f9 100644 --- a/src/utils/cli.rs +++ b/src/utils/cli.rs @@ -1,9 +1,9 @@ -/// This is a pretty basic implementation of Clap CLI -/// The idea of this interface and supporting code is that the user can enter an optional -/// config file that will take the place of the other command line options, except the logging -/// features, which are handled separately. Either way, these options are read into a configuration -/// struct that holds the variables for the run. Logging, meanwhile, is handled separately, -/// outside run configuration parsing. +// This is a pretty basic implementation of Clap CLI +// The idea of this interface and supporting code is that the user can enter an optional +// config file that will take the place of the other command line options, except the logging +// features, which are handled separately. Either way, these options are read into a configuration +// struct that holds the variables for the run. Logging, meanwhile, is handled separately, +// outside run configuration parsing. extern crate clap; diff --git a/src/utils/config.rs b/src/utils/config.rs index 29e25bb..7d8e65d 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -1,17 +1,14 @@ -extern crate log; +// This is the run configuration for this particular run, which holds the parameters needed by the +// various side functions. It is build with a ConfigurationBuilder, which can take either a +// config yaml file or command line arguments and turn them into the configuration. use std::collections::{HashMap}; use std::string::String; -use crate::utils::cli::Cli; +use utils::cli::Cli; use log::{debug, warn}; use std::env; use std::path::Path; -/// This is the run configuration for this particular run, which holds the parameters needed by the -/// various side functions. It is build with a ConfigurationBuilder, which can take either a -/// config yaml file or command line arguments and turn them into the configuration. - - #[derive(Debug)] pub struct RunConfiguration { /* @@ -489,6 +486,7 @@ mod tests { #[test] fn test_setters() { let mut config = ConfigBuilder::new(); + assert_eq!(config.reference, None); assert_eq!(config.mutation_rate, 0.001); assert_eq!(config.ploidy, 2); assert_eq!(config.paired_ended, false); @@ -499,7 +497,8 @@ mod tests { assert_eq!(config.produce_vcf, false); assert_eq!(config.produce_bam, false); assert_eq!(config.rng_seed, None); - config = config.set_mutation_rate(0.111) + config = config.set_reference("data/H1N1.fa".to_string()) + .set_mutation_rate(0.111) .set_ploidy(3) .set_paired_ended(true) .set_fragment_mean(111.0) @@ -510,6 +509,7 @@ mod tests { .set_produce_bam(true) .set_rng_seed(11393) .set_overwrite_output(); + assert_eq!(config.reference, Some("data/H1N1.fa".to_string())); assert_eq!(config.mutation_rate, 0.111); assert_eq!(config.ploidy, 3); assert_eq!(config.paired_ended, true); @@ -520,6 +520,7 @@ mod tests { assert_eq!(config.produce_vcf, true); assert_eq!(config.produce_bam, true); assert_eq!(config.rng_seed, Some(11393)); + config.check_and_print_config() } #[test] @@ -590,7 +591,7 @@ mod tests { config.check_and_print_config(); config = config.set_produce_bam(true); config.check_and_print_config(); - // If the unwraps all work we're good. + // If it passes all the checks, we're good. } #[test] diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs index 46030f0..91a3eda 100644 --- a/src/utils/fasta_tools.rs +++ b/src/utils/fasta_tools.rs @@ -1,3 +1,5 @@ +// This library contains tools needed to process fasta files as input and output. + use log::info; use std::io; use std::io::Write; @@ -7,8 +9,6 @@ use utils::file_tools::read_lines; use utils::file_tools::open_file; use utils::nucleotides::{u8_to_base, base_to_u8}; -/// This library contains tools needed to process fasta files as input and output. - pub fn sequence_array_to_string(input_array: &Vec) -> String { // Converts a sequence vector into a string representing the DNA sequence let mut return_string = String::new(); diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs index 0f1c957..63c08a2 100644 --- a/src/utils/fastq_tools.rs +++ b/src/utils/fastq_tools.rs @@ -1,11 +1,11 @@ +// This library writes either single ended or paired-ended fastq files. + use std::io::Write; use std::{fs, io}; use utils::fasta_tools::sequence_array_to_string; use utils::file_tools::open_file; -/// This library writes either single ended or paired-ended fastq files. - fn complement(nucleotide: u8) -> u8 { /* 0 = A, 1 = C, 2 = G, 3 = T, diff --git a/src/utils/file_tools.rs b/src/utils/file_tools.rs index 2e36fca..804f46a 100644 --- a/src/utils/file_tools.rs +++ b/src/utils/file_tools.rs @@ -1,10 +1,10 @@ +// Various file tools needed throughout the code. + use std::fs::File; use std::io; use std::io::{BufRead, Error}; use std::path::Path; -/// Various file tools needed throughout the code. - pub fn read_lines(filename: &str) -> io::Result>> { // This creates a buffer to read lines let file = File::open(filename)?; diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs index 953a30c..51abecb 100644 --- a/src/utils/make_reads.rs +++ b/src/utils/make_reads.rs @@ -1,3 +1,10 @@ +// This is the core functionality of NEAT. Generate reads turns a mutated fasta array into short reads. +// The idea of cover_dataset is we generate a set of coordinates +// That define reads and covers the dataset coverage number times, to give the contig the proper +// read coverage. Generate reads uses this to create a list of coordinates to take slices from +// the mutated fasta file. These will either be read-length fragments or fragment model length +// fragments. + use std::collections::{HashSet, VecDeque}; use rand::RngCore; @@ -6,13 +13,6 @@ use rand_distr::{Normal, Distribution}; use utils::neat_rng::NeatRng; -/// This is the core functionality of NEAT. Generate reads turns a mutated fasta array into short reads. -/// The idea of cover_dataset is we generate a set of coordinates -/// That define reads and covers the dataset coverage number times, to give the contig the proper -/// read coverage. Generate reads uses this to create a list of coordinates to take slices from -/// the mutated fasta file. These will either be read-length fragments or fragment model length -/// fragments. - fn cover_dataset( span_length: usize, read_length: usize, diff --git a/src/utils/mutate.rs b/src/utils/mutate.rs index 0e7557d..e8e471f 100644 --- a/src/utils/mutate.rs +++ b/src/utils/mutate.rs @@ -1,3 +1,9 @@ +// This is a basic mutation with SNPs using a basic mutation model. +// mutate_fasta takes a fasta Hashmap and returns a mutated version and the locations of the +// mutations introduced +// +// mutate_sequence adds actual mutations to the fasta sequence + use std::cmp::max; use std::collections::HashMap; use rand::prelude::IndexedRandom; @@ -7,12 +13,6 @@ use itertools::izip; use utils::nucleotides::NucModel; use utils::neat_rng::NeatRng; -/// This is a basic mutation with SNPs using a basic mutation model. -/// mutate_fasta takes a fasta Hashmap and returns a mutated version and the locations of the -/// mutations introduced -/// -/// mutate_sequence adds actual mutations to the fasta sequence - pub fn mutate_fasta( file_struct: &HashMap>, mut rng: &mut NeatRng diff --git a/src/utils/neat_logger.rs b/src/utils/neat_logger.rs deleted file mode 100644 index e69de29..0000000 diff --git a/src/utils/nucleotides.rs b/src/utils/nucleotides.rs index dc68091..a6038d3 100644 --- a/src/utils/nucleotides.rs +++ b/src/utils/nucleotides.rs @@ -1,17 +1,17 @@ +// Throughout this program, we are standardizing the use of a u8 representation of the nucleotides +// A = 0 +// C = 1 +// G = 2 +// T = 3 +// N = 4 +// This is intended to make it easier to store them. We thought about using the u8 representation +// of the character as built into Rust, but we'd then have to figure out the translations and keep +// track of extra numbers. So this is intended to simplify everything + use itertools::izip; use rand::seq::IndexedRandom; use utils::neat_rng::NeatRng; -/// Throughout this program, we are standardizing the use of a u8 representation of the nucleotides -/// A = 0 -/// C = 1 -/// G = 2 -/// T = 3 -/// N = 4 -/// This is intended to make it easier to store them. We thought about using the u8 representation -/// of the character as built into Rust, but we'd then have to figure out the translations and keep -/// track of extra numbers. So this is intended to simplify everything - pub fn base_to_u8(char_of_interest: char) -> u8 { // This defines the relationship between the 4 possible nucleotides in DNA and // a simple u8 numbering system. Everything that isn't a recognized base is a 4.