From 59063f593fef054d178c211d5f832286f8f6f6a0 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Tue, 19 Mar 2024 01:42:48 -0500 Subject: [PATCH 1/2] added tests. Overal coverage is at 64%, but making progress in utils at 74%. --- config/neat_test.yml | 45 ++++-------- config/simple_template.yml | 2 +- src/main.rs | 4 +- src/utils/config.rs | 146 +++++++++++++++++++++++++++++++------ src/utils/fasta_tools.rs | 81 +++++++++++--------- src/utils/fastq_tools.rs | 72 ++++++++++++++++++ src/utils/file_tools.rs | 28 +++++-- src/utils/make_reads.rs | 35 ++++++++- 8 files changed, 312 insertions(+), 101 deletions(-) diff --git a/config/neat_test.yml b/config/neat_test.yml index b230922..482b93e 100644 --- a/config/neat_test.yml +++ b/config/neat_test.yml @@ -1,34 +1,17 @@ reference: "data/ecoli.fa" -read_len: . -coverage: . -ploidy: . -paired_ended: . -fragment_mean: . -fragment_st_dev: . +read_len: 10 +coverage: 3 +ploidy: 9 +paired_ended: true +fragment_mean: 10.0 +fragment_st_dev: 11.0 +mutation_rate: 0.111 -produce_bam: . -produce_vcf: . -produce_fasta: . -produce_fastq: . +produce_bam: true +produce_vcf: true +produce_fasta: true +produce_fastq: false -error_model: . -mutation_model: . -fragment_model: . -gc_model: . - -partition_mode: . -threads: . -avg_seq_error: . -rescale_qualities: . -quality_offset: . -include_vcf: . -target_bed: . -off_target_scalar: . -discard_bed: . -mutation_rate: . -mutation_bed: . -no_coverage_bias: . -rng_seed: . -min_mutations: . -fasta_per_ploid: . -overwrite_output: . +overwrite_output: true +output_dir: /home/joshfactorial/code/neat_output +output_prefix: testing diff --git a/config/simple_template.yml b/config/simple_template.yml index 3af3026..c8ed324 100644 --- a/config/simple_template.yml +++ b/config/simple_template.yml @@ -1,4 +1,4 @@ -reference: REQUIRED +reference: . read_len: . coverage: 20 mutation_rate: . diff --git a/src/main.rs b/src/main.rs index 0785e57..f475e4f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,7 +22,7 @@ use utils::config::{read_config_yaml, build_config_from_args}; use utils::mutate::mutate_fasta; use utils::make_reads::generate_reads; use utils::fastq_tools::write_fastq; -use utils::file_tools::check_parent_and_create; +use utils::file_tools::check_parent; use utils::vcf_tools::write_vcf; fn main() { @@ -45,7 +45,7 @@ fn main() { }; // Check that the parent dir exists - let log_destination = check_parent_and_create(&args.log_dest).unwrap(); + let log_destination = check_parent(&args.log_dest).unwrap(); CombinedLogger::init(vec![ #[cfg(feature = "termcolor")] diff --git a/src/utils/config.rs b/src/utils/config.rs index 495202a..5a2d1ff 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -38,8 +38,8 @@ pub struct RunConfiguration { pub mutation_rate: f64, pub ploidy: usize, pub paired_ended: bool, - pub fragment_mean: f64, - pub fragment_st_dev: f64, + pub fragment_mean: Option, + pub fragment_st_dev: Option, pub produce_fastq: bool, pub produce_fasta: bool, pub produce_vcf: bool, @@ -58,7 +58,6 @@ impl RunConfiguration { // The config builder allows us to construct a config in multiple different ways, depending // on the input. -#[derive(Default)] pub struct ConfigBuilder { reference: String, read_len: usize, @@ -66,8 +65,8 @@ pub struct ConfigBuilder { mutation_rate: f64, ploidy: usize, paired_ended: bool, - fragment_mean: f64, - fragment_st_dev: f64, + fragment_mean: Option, + fragment_st_dev: Option, produce_fastq: bool, produce_fasta: bool, produce_vcf: bool, @@ -93,8 +92,8 @@ impl ConfigBuilder { mutation_rate: 0.001, ploidy: 2, paired_ended: false, - fragment_mean: 300.0, - fragment_st_dev: 30.0, + fragment_mean: None, + fragment_st_dev: None, produce_fastq: true, produce_fasta: false, produce_vcf: false, @@ -137,12 +136,12 @@ impl ConfigBuilder { } pub fn set_fragment_mean(mut self, fragment_mean: f64) -> ConfigBuilder { - self.fragment_mean = fragment_mean; + self.fragment_mean = Option::from(fragment_mean); self } pub fn set_fragment_st_dev(mut self, fragment_st_dev: f64) -> ConfigBuilder { - self.fragment_st_dev = fragment_st_dev; + self.fragment_st_dev = Option::from(fragment_st_dev); self } @@ -197,7 +196,7 @@ impl ConfigBuilder { return Err(ConfigError::ConfigurationError); } if self.paired_ended { - if self.fragment_mean.is_nan() | self.fragment_st_dev.is_nan() { + if self.fragment_mean.is_none() | self.fragment_st_dev.is_none() { error!( "Paired ended is set to true, but fragment mean \ and standard deviation were not set." @@ -205,8 +204,8 @@ impl ConfigBuilder { return Err(ConfigError::ConfigurationError); } if self.produce_fastq { - debug!("\t> fragment mean: {}", self.fragment_mean); - debug!("\t> fragment standard deviation: {}", self.fragment_st_dev); + debug!("\t> fragment mean: {}", self.fragment_mean.unwrap()); + debug!("\t> fragment standard deviation: {}", self.fragment_st_dev.unwrap()); debug!("Producing fastq files:\n\t> {}_r1.fastq\n\t {}_r2.fastq", file_prefix, file_prefix ) @@ -270,7 +269,7 @@ pub fn read_config_yaml(yaml: String) -> Result, ConfigErr for (key, value) in scrape_config { match key.as_str() { "reference" => { - if value != ".".to_string() { + if !(value == ".".to_string() || (value == "REQUIRED".to_string())) { config_builder = config_builder.set_reference(value) } else { error!("Reference is required."); @@ -405,8 +404,8 @@ mod tests { mutation_rate: 0.09, ploidy: 3, paired_ended: true, - fragment_mean: 333.0, - fragment_st_dev: 33.0, + fragment_mean: Option::from(333.0), + fragment_st_dev: Option::from(33.0), produce_fastq: false, produce_bam: true, produce_fasta: true, @@ -416,14 +415,16 @@ mod tests { output_prefix: String::from("Hey.hey") }; + println!("{:?}", test_configuration); + assert_eq!(test_configuration.reference, "Hello.world".to_string()); assert_eq!(test_configuration.read_len, 100); assert_eq!(test_configuration.coverage, 22); assert_eq!(test_configuration.mutation_rate, 0.09); assert_eq!(test_configuration.ploidy, 3); assert_eq!(test_configuration.paired_ended, true); - assert_eq!(test_configuration.fragment_mean, 333.0); - assert_eq!(test_configuration.fragment_st_dev, 33.0); + assert_eq!(test_configuration.fragment_mean.unwrap(), 333.0); + assert_eq!(test_configuration.fragment_st_dev.unwrap(), 33.0); assert_eq!(test_configuration.produce_fastq, false); assert_eq!(test_configuration.produce_vcf, true); assert_eq!(test_configuration.produce_bam, true); @@ -445,7 +446,21 @@ mod tests { let yaml = String::from("config/neat_test.yml"); let test_config = read_config_yaml(yaml).unwrap(); assert_eq!(test_config.reference, "data/ecoli.fa".to_string()); - assert_eq!(test_config.coverage, 10); + assert_eq!(test_config.coverage, 3); + } + + #[test] + #[should_panic] + fn test_bad_yaml() { + let yaml = String::from("fake_file.yml"); + read_config_yaml(yaml).unwrap(); + } + + #[test] + #[should_panic] + fn test_missing_ref() { + let yaml = String::from("config/simple_template.yml"); + let _test_config = read_config_yaml(yaml).unwrap(); } #[test] @@ -454,8 +469,8 @@ mod tests { assert_eq!(config.mutation_rate, 0.001); assert_eq!(config.ploidy, 2); assert_eq!(config.paired_ended, false); - assert_eq!(config.fragment_mean, 300.0); - assert_eq!(config.fragment_st_dev, 30.0); + assert_eq!(config.fragment_mean, None); + assert_eq!(config.fragment_st_dev, None); assert_eq!(config.produce_fasta, false); assert_eq!(config.produce_fastq, true); assert_eq!(config.produce_vcf, false); @@ -473,8 +488,8 @@ mod tests { assert_eq!(config.mutation_rate, 0.111); assert_eq!(config.ploidy, 3); assert_eq!(config.paired_ended, true); - assert_eq!(config.fragment_mean, 111.0); - assert_eq!(config.fragment_st_dev, 0.011); + assert_eq!(config.fragment_mean, Some(111.0)); + assert_eq!(config.fragment_st_dev, Some(0.011)); assert_eq!(config.produce_fasta, true); assert_eq!(config.produce_fastq, false); assert_eq!(config.produce_vcf, true); @@ -497,4 +512,91 @@ mod tests { let test_config = build_config_from_args(args).unwrap(); assert_eq!(test_config.reference, "data/ecoli.fa".to_string()) } + + #[test] + #[should_panic] + fn test_cl_missing_ref() { + let args: Cli = Cli{ + config: String::new(), + reference: String::from(""), + output_dir: String::from("test_dir"), + log_level: String::from("Trace"), + log_dest: String::new(), + output_file_prefix: String::from("test"), + read_length: 150, + coverage: 10, + }; + + build_config_from_args(args).unwrap(); + } + + #[test] + fn test_overwrite_warn() { + let mut config = ConfigBuilder::new(); + config = config.set_overwrite_output(); + config.check_and_print_config().unwrap(); + } + + #[test] + fn test_produce_fastq_messages() { + let mut config = ConfigBuilder::new(); + config = config.set_paired_ended(true); + config = config.set_fragment_mean(100.0); + config = config.set_fragment_st_dev(10.0); + // tests first branch of if statement for paired_ended & produce_fastq = true + config.check_and_print_config().unwrap(); + // Checks the alternative pe = true, produce_fastq = false + config = config.set_produce_fastq(false); + // need to produce at least one file or check will panic + config = config.set_produce_fasta(true); + config.check_and_print_config().unwrap(); + } + + #[test] + fn test_produce_fasta_messages() { + let mut config = ConfigBuilder::new(); + config = config.set_produce_fasta(true); + config.check_and_print_config().unwrap(); + config = config.set_produce_vcf(true); + config.check_and_print_config().unwrap(); + config = config.set_produce_bam(true); + config.check_and_print_config().unwrap(); + // If the unwraps all work we're good. + } + + #[test] + #[should_panic] + fn test_no_files() { + let mut config = ConfigBuilder::new(); + config = config.set_produce_fastq(false); + config.check_and_print_config().unwrap(); + } + + #[test] + #[should_panic] + fn test_no_frag_mean_or_stdev() { + let mut config = ConfigBuilder::new(); + // paired end set to true, by default, fragment mean and st dev are None + config = config.set_paired_ended(true); + config.check_and_print_config().unwrap(); + } + + #[test] + #[should_panic] + fn test_no_frag_mean() { + let mut config = ConfigBuilder::new(); + config = config.set_paired_ended(true); + config = config.set_fragment_st_dev(10.0); + config.check_and_print_config().unwrap(); + } + + #[test] + #[should_panic] + fn test_no_stdev() { + let mut config = ConfigBuilder::new(); + config = config.set_paired_ended(true); + config = config.set_fragment_mean(100.0); + config.check_and_print_config().unwrap(); + } + } \ No newline at end of file diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs index 5415bb5..3aebfcb 100644 --- a/src/utils/fasta_tools.rs +++ b/src/utils/fasta_tools.rs @@ -128,42 +128,53 @@ pub fn write_fasta( Ok(()) } -#[test] -fn test_conversions() { - let initial_sequence = "AAAANNNNGGGGCCCCTTTTAAAA"; - let test_map: Vec = vec![0,0,0,0,4,4,4,4,2,2,2,2,1,1,1,1,3,3,3,3,0,0,0,0]; - let remap: Vec = initial_sequence.chars().map(|x| char_to_num(x)).collect(); - assert_eq!(remap, test_map); - assert_eq!(sequence_array_to_string(&test_map), initial_sequence); -} +#[cfg(test)] +mod tests { + use super::*; -#[test] -fn test_read_fasta() { - let test_fasta = "data/H1N1.fa"; - let (_test_map, map_order) = read_fasta(test_fasta); - assert_eq!(map_order[0], "H1N1_HA".to_string()) -} + #[test] + fn test_conversions() { + let initial_sequence = "AAAANNNNGGGGCCCCTTTTAAAA"; + let test_map: Vec = vec![0, 0, 0, 0, 4, 4, 4, 4, 2, 2, 2, 2, 1, 1, 1, 1, 3, 3, 3, 3, 0, 0, 0, 0]; + let remap: Vec = initial_sequence.chars().map(|x| char_to_num(x)).collect(); + assert_eq!(remap, test_map); + assert_eq!(sequence_array_to_string(&test_map), initial_sequence); + } + + #[test] + fn test_read_fasta() { + let test_fasta = "data/H1N1.fa"; + let (_test_map, map_order) = read_fasta(test_fasta); + assert_eq!(map_order[0], "H1N1_HA".to_string()) + } -#[test] -fn test_write_fasta() -> Result<(), Box> { + #[test] + #[should_panic] + fn test_read_bad_fasta() { + let test_fasta = "data/fake.fasta"; + read_fasta(test_fasta); + } - let seq1: Vec = vec![0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]; - let fasta_output: HashMap> = HashMap::from([ - (String::from("H1N1_HA"), seq1) - ]); - let fasta_pointer = Box::new(fasta_output); - let fasta_order = vec![String::from("H1N1_HA")]; - let output_file = "test"; - let test_write = write_fasta( - &fasta_pointer, - &fasta_order, - true, - output_file - ); - let file_name = "test.fasta"; - assert_eq!(test_write.unwrap(), ()); - let attr = fs::metadata(file_name).unwrap(); - assert!(attr.len() > 0); - fs::remove_file(file_name)?; - Ok(()) + #[test] + fn test_write_fasta() -> Result<(), Box> { + let seq1: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]; + let fasta_output: HashMap> = HashMap::from([ + (String::from("H1N1_HA"), seq1) + ]); + let fasta_pointer = Box::new(fasta_output); + let fasta_order = vec![String::from("H1N1_HA")]; + let output_file = "test"; + let test_write = write_fasta( + &fasta_pointer, + &fasta_order, + true, + output_file + ); + let file_name = "test.fasta"; + assert_eq!(test_write.unwrap(), ()); + let attr = fs::metadata(file_name).unwrap(); + assert!(attr.len() > 0); + fs::remove_file(file_name)?; + Ok(()) + } } \ No newline at end of file diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs index 4d82443..49ab66e 100644 --- a/src/utils/fastq_tools.rs +++ b/src/utils/fastq_tools.rs @@ -87,4 +87,76 @@ pub fn write_fastq( fs::remove_file(filename2)?; } Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::Path; + + #[test] + fn test_complement() { + let nuc1 = 0; + let nuc2 = 1; + let nuc3 = 2; + let nuc4 = 3; + let nuc5 = 4; + + assert_eq!(complement(nuc1), 3); + assert_eq!(complement(nuc2), 2); + assert_eq!(complement(nuc3), 1); + assert_eq!(complement(nuc4), 0); + assert_eq!(complement(nuc5), 4); + } + + #[test] + fn test_reverse_complement() { + let read: Vec = vec![0, 0, 0, 0, 1, 1, 1, 1]; + let revcomp: Vec = vec![2, 2, 2, 2, 3, 3, 3, 3]; + assert_eq!(reverse_complement(&read), revcomp); + } + + #[test] + fn test_write_fastq_single() { + let fastq_filename = "test_single"; + let overwrite_output = true; + let paired_ended = false; + let seq1 = vec![0, 0, 0, 0, 1, 1, 1, 1]; + let seq2 = vec![2, 2, 2, 2, 3, 3, 3, 3,]; + let dataset = vec![&seq1, &seq2]; + write_fastq( + fastq_filename, + overwrite_output, + paired_ended, + dataset + ).unwrap(); + let outfile1 = Path::new("test_single_r1.fastq"); + let outfile2 = Path::new("test_single_r2.fastq"); + assert!(outfile1.exists()); + assert!(!outfile2.exists()); + fs::remove_file(outfile1).unwrap(); + } + + #[test] + fn test_write_fastq_paired() { + let fastq_filename = "test_paired"; + // might as well test the o_o function as well + let overwrite_output = false; + let paired_ended = true; + let seq1 = vec![0, 0, 0, 0, 1, 1, 1, 1]; + let seq2 = vec![2, 2, 2, 2, 3, 3, 3, 3,]; + let dataset = vec![&seq1, &seq2]; + write_fastq( + fastq_filename, + overwrite_output, + paired_ended, + dataset + ).unwrap(); + let outfile1 = Path::new("test_paired_r1.fastq"); + let outfile2 = Path::new("test_paired_r2.fastq"); + assert!(outfile1.exists()); + assert!(outfile2.exists()); + fs::remove_file(outfile1).unwrap(); + fs::remove_file(outfile2).unwrap(); + } } \ No newline at end of file diff --git a/src/utils/file_tools.rs b/src/utils/file_tools.rs index cc950fb..3aa2523 100644 --- a/src/utils/file_tools.rs +++ b/src/utils/file_tools.rs @@ -2,7 +2,6 @@ use std::fs::File; use std::io; use std::io::BufRead; use std::path::Path; -use log::error; pub fn read_lines(filename: &str) -> io::Result>> { // This creates a buffer to read lines @@ -18,13 +17,28 @@ pub fn open_file(mut filename: &mut str, overwrite_file: bool) -> File { } } -pub fn check_parent_and_create(filename: &str) -> io::Result<&Path> { - // checks that the parent dir exists and then if so creates the file object open +pub fn check_parent(filename: &str) -> io::Result<&Path> { + // checks that the parent dir exists and then if so creates the Path object open // and ready to write let file_path = Path::new(filename); - if !file_path.parent().unwrap().exists() { - error!("Path to log file not found!"); - assert!(file_path.parent().unwrap().exists()) - } + assert!(file_path.parent().unwrap().exists()); Ok(file_path) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_check_parent() { + let filename = "data/H1N1.fa"; + check_parent(filename).unwrap(); + } + + #[test] + #[should_panic] + fn test_check_parent_fail() { + let filename = "fake/test.fa"; + check_parent(filename).unwrap(); + } } \ No newline at end of file diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs index a4aba9a..a9aecc7 100644 --- a/src/utils/make_reads.rs +++ b/src/utils/make_reads.rs @@ -101,8 +101,8 @@ pub fn generate_reads( read_length: &usize, coverage: &usize, paired_ended: bool, - mean: f64, - st_dev: f64, + mean: Option, + st_dev: Option, mut rng: &mut ThreadRng, ) -> Box>> { /* @@ -122,7 +122,7 @@ pub fn generate_reads( let mut fragment_pool: Vec = Vec::new(); if paired_ended { let num_frags = (mutated_sequence.len() / read_length) * (coverage * 2); - let fragment_distribution = Normal::new(mean, st_dev).unwrap(); + let fragment_distribution = Normal::new(mean.unwrap(), st_dev.unwrap()).unwrap(); // add fragments to the fragment pool for _ in 0..num_frags { let frag = fragment_distribution.sample(&mut rng).round() as usize; @@ -149,4 +149,33 @@ pub fn generate_reads( } // puts the reads in the heap. Box::new(read_set) +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::thread_rng; + + #[test] + fn test_cover_dataset() { + let span_length = 100; + let read_length = 10; + let fragment_pool = vec![10]; + let coverage = 1; + let mut rng = thread_rng(); + + let cover = cover_dataset( + span_length, + read_length, + fragment_pool, + coverage, + &mut rng, + ); + assert_eq!(cover[0], (0,10)) + } + + #[test] + fn test_generate_reads() { + todo!() + } } \ No newline at end of file From 685ad32c231fd555be5e48ab0ea40f07dd73ae06 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Wed, 20 Mar 2024 22:56:34 -0500 Subject: [PATCH 2/2] Added extensive tests. Coverage is at 76% for the entire file, and 86% for the libraries. But, some of the 'uncovered code' is actually covered. May add more tests later. --- Cargo.toml | 2 +- config/neat_test.yml | 2 +- src/main.rs | 81 ++--------------- src/utils.rs | 5 +- src/utils/config.rs | 3 +- src/utils/fasta_tools.rs | 52 ++--------- src/utils/fastq_tools.rs | 6 +- src/utils/file_tools.rs | 8 +- src/utils/make_reads.rs | 61 ++++++++++++- src/utils/mutate.rs | 134 ++++++++++++++-------------- src/utils/nucleotides.rs | 184 +++++++++++++++++++++++++++++++++++++++ src/utils/runner.rs | 122 ++++++++++++++++++++++++++ src/utils/vcf_tools.rs | 58 ++++++++++-- 13 files changed, 509 insertions(+), 209 deletions(-) create mode 100644 src/utils/nucleotides.rs create mode 100644 src/utils/runner.rs diff --git a/Cargo.toml b/Cargo.toml index 10feb57..2f22a48 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,4 +13,4 @@ serde_yaml = "0.9.32" clap = { version = "4.5.1", features = ["derive"] } itertools = "0.12.1" assert_fs = "1.1.1" -rand_distr = "0.5.0-alpha.0" \ No newline at end of file +rand_distr = "0.5.0-alpha.0" diff --git a/config/neat_test.yml b/config/neat_test.yml index 482b93e..f14b3f5 100644 --- a/config/neat_test.yml +++ b/config/neat_test.yml @@ -13,5 +13,5 @@ produce_fasta: true produce_fastq: false overwrite_output: true -output_dir: /home/joshfactorial/code/neat_output +output_dir: . output_prefix: testing diff --git a/src/main.rs b/src/main.rs index f475e4f..74d8512 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,24 +8,19 @@ extern crate itertools; mod utils; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::fs::File; use clap::{Parser}; use log::*; use simplelog::*; use rand::thread_rng; -use rand::prelude::*; use utils::cli; -use utils::fasta_tools::{read_fasta, write_fasta}; use utils::config::{read_config_yaml, build_config_from_args}; -use utils::mutate::mutate_fasta; -use utils::make_reads::generate_reads; -use utils::fastq_tools::write_fastq; use utils::file_tools::check_parent; -use utils::vcf_tools::write_vcf; +use utils::runner::run_neat; -fn main() { +fn main() -> Result<(), std::fmt::Error> { info!("Begin processing"); // parse the arguments from the command line @@ -77,73 +72,7 @@ fn main() { Ok(build_config_from_args(args).expect("Problem reading configuration yaml file")) }.unwrap(); - // Create the prefix of the files to write - let output_file = format!("{}/{}", config.output_dir, config.output_prefix); - - // Reading the reference file into memory - info!("Mapping reference fasta file: {}", &config.reference); - let (fasta_map, fasta_order) = read_fasta(&config.reference); - - // Mutating the reference and recording the variant locations. - info!("Mutating reference."); - let (mutated_map, variant_locations) = mutate_fasta( - &fasta_map, - &mut rng - ); - - if config.produce_fasta { - info!("Outputting fasta file"); - write_fasta( - &mutated_map, - &fasta_order, - config.overwrite_output, - &output_file, - ).expect("Problem writing fasta file"); - } - - if config.produce_vcf { - info!("Writing vcf file"); - write_vcf( - &variant_locations, - &fasta_order, - config.ploidy, - &config.reference, - config.overwrite_output, - &output_file, - &mut rng).expect("Error writing vcf file") - } - - let mut read_sets: HashSet> = HashSet::new(); - for (_name, sequence) in mutated_map.iter() { - // defined as a set of read sequences that should cover - // the mutated sequence `coverage` number of times - let data_set = generate_reads( - sequence, - &config.read_len, - &config.coverage, - config.paired_ended, - config.fragment_mean, - config.fragment_st_dev, - &mut rng - ); - - read_sets.extend(*data_set); - } - - if config.produce_fastq { - info!("Shuffling output fastq data"); - let mut outsets: Box>> = Box::new(read_sets.iter().collect()); - outsets.shuffle(&mut rng); - - info!("Writing fastq"); - write_fastq( - &output_file, - config.overwrite_output, - config.paired_ended, - *outsets, - ).expect("Problem writing fastq file"); - info!("Processing complete") - } - + run_neat(config, &mut rng).unwrap(); + Ok(()) } diff --git a/src/utils.rs b/src/utils.rs index 04d2f1f..ff415ec 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -5,4 +5,7 @@ pub mod cli; pub mod make_reads; pub mod mutate; pub mod fastq_tools; -pub mod vcf_tools; \ No newline at end of file +pub mod vcf_tools; +pub mod nucleotides; + +pub mod runner; \ No newline at end of file diff --git a/src/utils/config.rs b/src/utils/config.rs index 5a2d1ff..ff637b6 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -4,6 +4,7 @@ use std::collections::{HashMap}; use std::string::String; use crate::utils::cli::Cli; use log::{debug, warn, error}; +use std::env; #[derive(Debug)] pub struct RunConfiguration { @@ -99,7 +100,7 @@ impl ConfigBuilder { produce_vcf: false, produce_bam: false, overwrite_output: false, - output_dir: String::from(""), + output_dir: String::from(env::current_dir().unwrap().to_str().unwrap()), output_prefix: String::from("neat_out"), } } diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs index 3aebfcb..a2504ce 100644 --- a/src/utils/fasta_tools.rs +++ b/src/utils/fasta_tools.rs @@ -1,52 +1,17 @@ -extern crate rand; -extern crate log; -extern crate assert_fs; - -use crate::utils::file_tools::read_lines; -use self::log::info; +use log::info; use std::io; use std::io::Write; use std::*; use HashMap; +use utils::file_tools::read_lines; use utils::file_tools::open_file; - -pub fn char_to_num(char_of_interest: char) -> u8 { - return match char_of_interest { - 'A' | 'a' => 0, - 'C' | 'c' => 1, - 'G' | 'g' => 2, - 'T' | 't' => 3, - _ => 4 - } -} - -pub fn num_to_char(nuc_num: u8) -> &'static str { - return match nuc_num { - 0 => "A", - 1 => "C", - 2 => "G", - 3 => "T", - _ => "N", - } -} - -#[repr(u8)] -#[allow(unused)] -pub enum NucRepr { - // I just discovered this repr type thing and need to investigate. - // todo figure out if we can use this idea to simplify things - A = 0, - C = 1, - G = 2, - T = 3, - N = 4, -} +use utils::nucleotides::{u8_to_base, base_to_u8}; pub fn sequence_array_to_string(input_array: &Vec) -> String { // Converts a sequence vector into a string representing the DNA sequence let mut return_string = String::new(); for num in input_array { - return_string += num_to_char(*num); + return_string += &(u8_to_base(*num).to_string()); } return_string } @@ -74,7 +39,7 @@ pub fn read_fasta( temp_seq = vec![]; } else { for char in l.chars() { - temp_seq.push(char_to_num(char)); + temp_seq.push(base_to_u8(char)); } } }, @@ -102,7 +67,8 @@ pub fn write_fasta( */ // writing fasta output to files let mut output_fasta = format!("{}.fasta", output_file); - let mut outfile = open_file(&mut output_fasta, overwrite_output); + let mut outfile = open_file(&mut output_fasta, overwrite_output) + .expect(&format!("Error opening {}", output_fasta)); for contig in fasta_order { let sequence = &fasta_output[contig]; // Write contig name @@ -119,7 +85,7 @@ pub fn write_fasta( max = this_length } for j in 0..max { - line += num_to_char(sequence_to_write[i + j]); + line += &(u8_to_base(sequence_to_write[i + j]).to_string()); } writeln!(&mut outfile, "{}", line)?; i += 70; @@ -136,7 +102,7 @@ mod tests { fn test_conversions() { let initial_sequence = "AAAANNNNGGGGCCCCTTTTAAAA"; let test_map: Vec = vec![0, 0, 0, 0, 4, 4, 4, 4, 2, 2, 2, 2, 1, 1, 1, 1, 3, 3, 3, 3, 0, 0, 0, 0]; - let remap: Vec = initial_sequence.chars().map(|x| char_to_num(x)).collect(); + let remap: Vec = initial_sequence.chars().map(|x| base_to_u8(x)).collect(); assert_eq!(remap, test_map); assert_eq!(sequence_array_to_string(&test_map), initial_sequence); } diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs index 49ab66e..7be5d1e 100644 --- a/src/utils/fastq_tools.rs +++ b/src/utils/fastq_tools.rs @@ -54,11 +54,13 @@ pub fn write_fastq( let name_prefix = "neat_generated_".to_string(); let mut filename1 = String::from(fastq_filename) + "_r1.fastq"; // open the file and append lines - let mut outfile1 = open_file(&mut filename1, overwrite_output); + let mut outfile1 = open_file(&mut filename1, overwrite_output) + .expect(&format!("Error opening output {}", filename1)); // setting up pairend ended reads For single ended reads, this will go unused. let mut filename2 = String::from(fastq_filename) + "_r2.fastq"; // open the second file and append lines - let mut outfile2 = open_file(&mut filename2, overwrite_output); + let mut outfile2 = open_file(&mut filename2, overwrite_output) + .expect(&format!("Error opening output {}", filename2)); // write out sequence by sequence using index to track the numbering let mut index = 1; for sequence in dataset { diff --git a/src/utils/file_tools.rs b/src/utils/file_tools.rs index 3aa2523..550fa08 100644 --- a/src/utils/file_tools.rs +++ b/src/utils/file_tools.rs @@ -1,6 +1,6 @@ use std::fs::File; use std::io; -use std::io::BufRead; +use std::io::{BufRead, Error}; use std::path::Path; pub fn read_lines(filename: &str) -> io::Result>> { @@ -9,11 +9,11 @@ pub fn read_lines(filename: &str) -> io::Result>> Ok(io::BufReader::new(file).lines()) } -pub fn open_file(mut filename: &mut str, overwrite_file: bool) -> File { +pub fn open_file(mut filename: &mut str, overwrite_file: bool) -> Result { if overwrite_file { - File::options().create(true).write(true).open(&mut filename).unwrap() + File::options().create(true).write(true).open(&mut filename) } else { - File::options().create_new(true).append(true).open(&mut filename).unwrap() + File::options().create_new(true).append(true).open(&mut filename) } } diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs index a9aecc7..88f9278 100644 --- a/src/utils/make_reads.rs +++ b/src/utils/make_reads.rs @@ -175,7 +175,64 @@ mod tests { } #[test] - fn test_generate_reads() { - todo!() + fn test_gap_function() { + let span_length = 100_000; + let read_length = 100; + let fragment_pool = vec![300]; + let coverage = 1; + let mut rng = thread_rng(); + + let cover = cover_dataset( + span_length, + read_length, + fragment_pool, + coverage, + &mut rng, + ); + assert_eq!(cover[0], (0, 300)) + } + + #[test] + fn test_generate_reads_single() { + let mutated_sequence = vec![0, 0, 1, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 2, 2, 2, 4, 4, 4, 4]; + let read_length = 10; + let coverage = 1; + let paired_ended = false; + let mean = None; + let st_dev = None; + let mut rng = thread_rng(); + let reads = generate_reads( + &mutated_sequence, + &read_length, + &coverage, + paired_ended, + mean, + st_dev, + &mut rng, + ); + println!("{:?}", reads); + assert!(reads.contains(&(vec![0, 0, 1, 0, 3, 3, 3, 3, 0, 0]))); + } + + #[test] + fn test_generate_reads_paired() { + let mutated_sequence: Vec = std::iter::repeat(1).take(100_000).collect(); + let read_length = 100; + let coverage = 1; + let paired_ended = true; + let mean = Some(200.0); + let st_dev = Some(1.0); + let mut rng = thread_rng(); + let reads = generate_reads( + &mutated_sequence, + &read_length, + &coverage, + paired_ended, + mean, + st_dev, + &mut rng, + ); + println!("{:?}", reads); + assert!(!reads.is_empty()) } } \ No newline at end of file diff --git a/src/utils/mutate.rs b/src/utils/mutate.rs index 20894ca..6084cbc 100644 --- a/src/utils/mutate.rs +++ b/src/utils/mutate.rs @@ -1,53 +1,10 @@ use std::cmp::max; use std::collections::HashMap; use rand::prelude::{ThreadRng, IndexedRandom}; -use rand::seq::SliceRandom; use rand::Rng; use log::debug; use itertools::izip; - -#[derive(Debug, Ord, PartialOrd, Eq, PartialEq)] -struct NucModel { - // This simple nucleotide model simple tracks the base to mutate from - // and the weights of each are turned into a vector. For example, if the weight count for "A" - // is 8, then the vector will be [0, 0, 0, 0, 0, 0, 0, 0]. - // 0: A, 1: C, 2: G, 3: T - base: u8, - // vector of 0s, the length of which is the weight of the A in the vector. - a: Vec, - // vector of 1s, the length of which is the weight of the C in the vector. - c: Vec, - // vector of 2s, the length of which is the weight of the G in the vector. - g: Vec, - // vector of 3s, the length of which is the weight of the T in the vector. - t: Vec, -} - -impl NucModel { - fn from(base: u8, weights: Vec) -> Self { - Self { - base, - a: vec![0; weights[0]], - c: vec![1; weights[1]], - g: vec![2; weights[2]], - t: vec![3; weights[3]], - } - } - - fn choose_new_nuc(&self, mut rng: &mut ThreadRng) -> u8 { - // concatenate the vectors together - let mut choices = [&self.a[..], &self.c[..], &self.g[..], &self.t[..]].concat(); - // shuffle the weighted list, based on whatever statistics we come up with. - choices.shuffle(&mut rng); - // start with the first choice - let mut i = 0; - // Look for the first nuc that isn't the old nuc. - while choices[i] == self.base { - i += 1; - } - choices[i] - } -} +use utils::nucleotides::NucModel; pub fn mutate_fasta( file_struct: &HashMap>, @@ -99,7 +56,7 @@ pub fn mutate_fasta( let num_positions = max(1, rough_num_positions.round() as usize); (mutated_record, contig_mutations) = mutate_sequence( - mutated_record, num_positions, &mut rng + &mutated_record, num_positions, &mut rng ); // Add to the return struct and variants map. return_struct.entry(name.clone()).or_insert(mutated_record.clone()); @@ -110,7 +67,7 @@ pub fn mutate_fasta( } fn mutate_sequence( - sequence: Vec, + sequence: &Vec, num_positions: usize, mut rng: &mut ThreadRng ) -> (Vec, Vec<(usize, u8, u8)>) { @@ -134,44 +91,83 @@ fn mutate_sequence( let weights = vec![1; mutated_record.len()]; // zip together weights and positions let mut weighted_positions: Vec<(usize, i32)> = Vec::new(); + // find all non n positions. + let non_n_positions: Vec = mutated_record + .iter() + .enumerate() + .filter(|&(_, y)| *y != 4u8) + .map(|(x, _)| x) + .collect(); // izip!() accepts iterators and/or values with IntoIterator. - for (x, y) in izip!(&mut (0..mutated_record.len()), &weights) { - weighted_positions.push((x, *y)) + for (x, y) in izip!(&non_n_positions, &weights) { + weighted_positions.push((*x, *y)) } // now choose a random selection of num_positions without replacement let mutated_elements: Vec<&(usize, i32)> = weighted_positions .choose_multiple_weighted(&mut rng, num_positions, |x| x.1) .unwrap() .collect::>(); - // Build mutation model - /* - Using NEAT's original mutation model of - [[0.0, 0.15, 0.7, 0.15], - [0.15, 0.0, 0.15, 0.7], - [0.7, 0.15, 0.0, 0.15], - [0.15, 0.7, 0.15, 0.0]] - - multiply each value by 20, which produces integer values - */ - let mut_a = NucModel::from(0, vec![0, 3, 14, 3]); - let mut_c = NucModel::from(1, vec![3, 0, 3, 14]); - let mut_g = NucModel::from(2, vec![14, 3, 0, 3]); - let mut_t = NucModel::from(3, vec![3, 14, 3, 0]); + // Build the default mutation model + // todo incorporate custom models + let nucleotide_mutation_model = NucModel::new(); // Will hold the variants added to this sequence let mut sequence_variants: Vec<(usize, u8, u8)> = Vec::new(); // for each index, picks a new base for (index, _weight) in mutated_elements { + // Skip the N's + if sequence[*index] == 4 { + continue + } + // remeber the reference for later. let reference_base = sequence[*index]; - mutated_record[*index] = match reference_base { - 0 => mut_a.choose_new_nuc(&mut rng), - 1 => mut_c.choose_new_nuc(&mut rng), - 2 => mut_g.choose_new_nuc(&mut rng), - 3 => mut_t.choose_new_nuc(&mut rng), - _ => 4 - }; + // pick a new base and assign the position to it. + mutated_record[*index] = nucleotide_mutation_model.choose_new_nuc(reference_base, &mut rng); + // This check simply ensures that our model actually mutated the base. + if mutated_record[*index] == reference_base { + panic!("BUG: Mutation model failed to mutate the base. This should not happen.") + } // add the location and alt base for the variant sequence_variants.push((*index, mutated_record[*index], reference_base)) } (mutated_record, sequence_variants) +} + +#[cfg(test)] +mod tests { + use rand::thread_rng; + use super::*; + + #[test] + fn test_mutate_sequence() { + let seq1: Vec = vec![4, 4, 0, 0, 0, 1, 1, 2, 0, 3, 1, 1, 1]; + let num_positions = 2; + let mut rng = thread_rng(); + let mutant = mutate_sequence(&seq1, num_positions, &mut rng); + assert_eq!(mutant.0.len(), seq1.len()); + assert!(!mutant.1.is_empty()); + // N's stay N's + assert_eq!(mutant.0[0], 4); + assert_eq!(mutant.0[1], 4); + } + + #[test] + fn test_mutate_fasta() { + let seq = vec![4, 4, 0, 0, 0, 1, 1, 2, 0, 3, 1, 1, 1]; + let file_struct: HashMap> = HashMap::from([ + ("chr1".to_string(), seq.clone()) + ]); + let mut rng = thread_rng(); + let mutations = mutate_fasta( + &file_struct, + &mut rng, + ); + assert!(mutations.0.contains_key("chr1")); + assert!(mutations.1.contains_key("chr1")); + let mutation_location = mutations.1["chr1"][0].0; + let mutation_alt = mutations.1["chr1"][0].1; + let mutation_ref = mutations.1["chr1"][0].2; + assert_eq!(mutation_ref, seq[mutation_location]); + assert_ne!(mutation_alt, mutation_ref) + } } \ No newline at end of file diff --git a/src/utils/nucleotides.rs b/src/utils/nucleotides.rs new file mode 100644 index 0000000..ad517b9 --- /dev/null +++ b/src/utils/nucleotides.rs @@ -0,0 +1,184 @@ +use rand::prelude::ThreadRng; +use itertools::izip; +use rand::seq::IndexedRandom; + +/// Throughout this program, we are standardizing the use of a u8 representation of the nucleotides +/// A = 0 +/// C = 1 +/// G = 2 +/// T = 3 +/// N = 4 +/// This is intended to make it easier to store them. We thought about using the u8 representation +/// of the character as built into Rust, but we'd then have to figure out the translations and keep +/// track of extra numbers. So this is intended to simplify everything + +pub fn base_to_u8(char_of_interest: char) -> u8 { + // This defines the relationship between the 4 possible nucleotides in DNA and + // a simple u8 numbering system. Everything that isn't a recognized base is a 4. + // Note that NEAT ignores soft masking. + return match char_of_interest { + 'A' | 'a' => 0, + 'C' | 'c' => 1, + 'G' | 'g' => 2, + 'T' | 't' => 3, + _ => 4, + } +} + +pub fn u8_to_base(nuc_num: u8) -> char { + // Canonical conversion from base u8 representation back into the character. + // We're returning a string instead of a char to facilitate. No attempt to preserve or display + // any soft masking. + return match nuc_num { + 0 => 'A', + 1 => 'C', + 2 => 'G', + 3 => 'T', + _ => 'N', + } +} + +#[derive(Debug, Clone)] +pub struct NucModel { + // Simple mutation model. The letter in the model represents + // the base we are mutating and the vector are the weights for mutating + // to another base (in the same a, c, g, t order) + // + // By definition, the model is a 4x4 matrix with zeros along the diagonal + // because, e.g., A can't "mutate" to A. + a: Vec, + c: Vec, + g: Vec, + t: Vec, +} + +impl NucModel { + + pub fn new() -> Self { + // Default mutation model based on the original from NEAT 2.0 + Self { + a: vec![0, 3, 14, 3], + c: vec![3, 0, 3, 14], + g: vec![14, 3, 0, 3], + t: vec![3, 14, 3, 0], + } + } + pub fn from(weights: Vec>) -> Self { + // Supply a vector of 4 vectors that define the mutation chance + // from the given base to the other 4 bases. + + // First some safety checks. This should be a 4x4 matrix defining mutation from + // ACGT (top -> down) to ACGT (left -> right) + if weights.len() != 4 { + panic!("Weights supplied to NucModel is wrong size"); + } + for weight_vec in &weights { + if weight_vec.len() != 4 { + panic!("Weights supplied to NucModel is wrong size"); + } + } + + Self { + a: weights[0].clone(), + c: weights[1].clone(), + g: weights[2].clone(), + t: weights[3].clone(), + } + } + + pub fn choose_new_nuc(&self, base: u8, mut rng: &mut ThreadRng) -> u8 { + // We'll handle the trivial case first: + if base >= 4 { + return 4; + } + + // the canonical choices for DNA, as defined above + let choices: [u8; 4] = [0, 1, 2, 3]; + // Pick the weights list for the base that was input + let weights: Vec = match base { + 0 => self.a.clone(), + 1 => self.c.clone(), + 2 => self.g.clone(), + 3 => self.t.clone(), + // we filtered out everything 4 and over above + _ => { panic!("This line should be unreachable") }, + }; + + let mut weighted_bases: Vec<(u8, usize)> = Vec::new(); + // This for loop pairs the base with its weight. + for (x, y) in izip!(choices, weights) { + weighted_bases.push((x, y)) + } + + // return a new base, based on the selection. Note that the weight for the input base + // will be always be zero (by design), so there is no chance of returning the original. + weighted_bases + .choose_weighted(&mut rng, |x| x.1) + .unwrap() + .0 + } +} + +#[cfg(test)] +mod tests { + use rand::thread_rng; + use super::*; + + #[test] + fn test_nuc_model_build() { + let a_weights = vec![0, 20, 1, 20]; + let c_weights = vec![20, 0, 1, 1]; + let g_weights = vec![1, 1, 0, 20]; + let t_weights = vec![20, 1, 20, 0]; + + let model = NucModel { + a: a_weights.clone(), + c: c_weights.clone(), + g: g_weights.clone(), + t: t_weights.clone(), + }; + + let str = format!("{:?}", model); + let str_repr = String::from("NucModel { a: [0, 20, 1, 20], c: [20, 0, 1, 1], g: [1, 1, 0, 20], t: [20, 1, 20, 0] }"); + assert_eq!(str, str_repr); + assert_eq!(model.a, a_weights); + } + + #[test] + fn test_nuc_model_from_weights() { + let a_weights = vec![0, 20, 1, 20]; + let c_weights = vec![20, 0, 1, 1]; + let g_weights = vec![1, 1, 0, 20]; + let t_weights = vec![20, 1, 20, 0]; + let mut rng = thread_rng(); + let test_model = NucModel::from(vec![a_weights, c_weights, g_weights, t_weights]); + // It actually mutates the base + assert_ne!(test_model.choose_new_nuc(0, &mut rng), 0); + assert_ne!(test_model.choose_new_nuc(1, &mut rng), 1); + assert_ne!(test_model.choose_new_nuc(2, &mut rng), 2); + assert_ne!(test_model.choose_new_nuc(3, &mut rng), 3); + // It gives back N when you give it N + assert_eq!(test_model.choose_new_nuc(4, &mut rng), 4); + } + + #[test] + #[should_panic] + fn test_nuc_model_too_many_vecs() { + let a_weights = vec![0, 20, 1, 20]; + let c_weights = vec![20, 0, 1, 1]; + let g_weights = vec![1, 1, 0, 20]; + let t_weights = vec![20, 1, 20, 0]; + let u_weights = vec![20, 1, 20, 0]; + NucModel::from(vec![a_weights, c_weights, g_weights, t_weights, u_weights]); + } + + #[test] + #[should_panic] + fn test_nuc_model_too_many_bases() { + let a_weights = vec![0, 20, 1, 20, 1]; + let c_weights = vec![20, 0, 1, 1]; + let g_weights = vec![1, 1, 0, 20]; + let t_weights = vec![20, 1, 20, 0]; + NucModel::from(vec![a_weights, c_weights, g_weights, t_weights]); + } +} \ No newline at end of file diff --git a/src/utils/runner.rs b/src/utils/runner.rs new file mode 100644 index 0000000..8393c89 --- /dev/null +++ b/src/utils/runner.rs @@ -0,0 +1,122 @@ +use std::collections::HashSet; +use std::fmt::{Display, Formatter}; +use log::info; +use rand::prelude::SliceRandom; +use rand::rngs::ThreadRng; +use utils::config::RunConfiguration; +use utils::fasta_tools::{read_fasta, write_fasta}; +use utils::fastq_tools::write_fastq; +use utils::make_reads::generate_reads; +use utils::mutate::mutate_fasta; +use utils::vcf_tools::write_vcf; + +#[derive(Debug)] +pub enum NeatErrors { + FastaError, + FastqError, + VcfError, +} + +impl Display for NeatErrors { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + todo!() + } +} + +pub fn run_neat(config: Box, mut rng: &mut ThreadRng) -> Result<(), NeatErrors>{ + // Create the prefix of the files to write + let output_file = format!("{}/{}", config.output_dir, config.output_prefix); + + // Reading the reference file into memory + info!("Mapping reference fasta file: {}", &config.reference); + let (fasta_map, fasta_order) = read_fasta(&config.reference); + + // Mutating the reference and recording the variant locations. + info!("Mutating reference."); + let (mutated_map, variant_locations) = mutate_fasta( + &fasta_map, + &mut rng + ); + + if config.produce_fasta { + info!("Outputting fasta file"); + write_fasta( + &mutated_map, + &fasta_order, + config.overwrite_output, + &output_file, + ) + .or(Err(NeatErrors::FastaError)) + .expect("Error writing fasta file"); + } + + if config.produce_vcf { + info!("Writing vcf file"); + write_vcf( + &variant_locations, + &fasta_order, + config.ploidy, + &config.reference, + config.overwrite_output, + &output_file, + &mut rng + ) + .or(Err(NeatErrors::VcfError)) + .expect("Error writing VCF file"); + } + + let mut read_sets: HashSet> = HashSet::new(); + for (_name, sequence) in mutated_map.iter() { + // defined as a set of read sequences that should cover + // the mutated sequence `coverage` number of times + let data_set = generate_reads( + sequence, + &config.read_len, + &config.coverage, + config.paired_ended, + config.fragment_mean, + config.fragment_st_dev, + &mut rng + ); + + read_sets.extend(*data_set); + } + + if config.produce_fastq { + info!("Shuffling output fastq data"); + let mut outsets: Box>> = Box::new(read_sets.iter().collect()); + outsets.shuffle(&mut rng); + + info!("Writing fastq"); + write_fastq( + &output_file, + config.overwrite_output, + config.paired_ended, + *outsets, + ) + .or(Err(NeatErrors::FastqError)) + .expect("Problem writing fastq file"); + info!("Processing complete") + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::fs; + use rand::thread_rng; + use std::path::Path; + use super::*; + + #[test] + fn test_runner() -> Result<(), NeatErrors>{ + let config = RunConfiguration::build().build(); + run_neat( + Box::new(config), + &mut thread_rng(), + ).unwrap(); + let fastq_file = Path::new("neat_out_r1.fastq").canonicalize().unwrap(); + fs::remove_file(fastq_file).unwrap(); + Ok(()) + } +} \ No newline at end of file diff --git a/src/utils/vcf_tools.rs b/src/utils/vcf_tools.rs index fa0fafc..26b58af 100644 --- a/src/utils/vcf_tools.rs +++ b/src/utils/vcf_tools.rs @@ -6,7 +6,7 @@ use std::io::Write; use rand::Rng; use rand::rngs::ThreadRng; use rand::seq::IndexedRandom; -use utils::fasta_tools::num_to_char; +use utils::nucleotides::u8_to_base; use utils::file_tools::open_file; fn genotype_to_string(genotype: Vec) -> String { @@ -25,7 +25,7 @@ pub fn write_vcf( variant_locations: &HashMap>, fasta_order: &Vec, ploidy: usize, - reference: &str, + reference_path: &str, overwrite_output: bool, output_file_prefix: &str, mut rng: &mut ThreadRng, @@ -36,6 +36,7 @@ pub fn write_vcf( consisting of a tuple of (position, alt base, ref base). fasta_order: A vector of contig names in the order of the reference fasta. ploidy: The number of copies of each chromosome present in the organism + reference_path: The location of the reference file this vcf is showing variants from. output_file_prefix: The path to the directory and the prefix to use for filenames rng: A random number generator for this run Result: @@ -45,10 +46,11 @@ pub fn write_vcf( let ploid_numbers: Vec = (1..ploidy+1).collect(); // set the filename of the output vcf let mut filename = format!("{}.vcf", output_file_prefix); - let mut outfile = open_file(&mut filename, overwrite_output); + let mut outfile = open_file(&mut filename, overwrite_output) + .expect(&format!("Problem opening {} for output.", filename)); // add the vcf header writeln!(&mut outfile, "##fileformat=VCFv4.1")?; - writeln!(&mut outfile, "##reference={}", reference)?; + writeln!(&mut outfile, "##reference={}", reference_path)?; writeln!(&mut outfile, "##Generated by rusty-neat")?; writeln!(&mut outfile, "##INFO=")?; writeln!(&mut outfile, "##INFO=")?; @@ -87,15 +89,53 @@ pub fn write_vcf( // Format the output line. Any fields without data will be a simple period. Quality // is set to 37 for all these variants. let line = format!("{}\t{}\t.\t{}\t{}\t37\tPASS\t.\tGT\t{}", - contig, - mutation.0 + 1, - num_to_char(mutation.2), - num_to_char(mutation.1), - genotype_to_string(genotype), + contig, + mutation.0 + 1, + u8_to_base(mutation.2), + u8_to_base(mutation.1), + genotype_to_string(genotype), ); writeln!(&mut outfile, "{}", line)?; } }; Ok(()) +} + +#[cfg(test)] +mod tests { + use std::fs; + use rand::thread_rng; + use super::*; + use std::path::Path; + + #[test] + fn test_genotype_to_string() { + let genotype = vec![0, 1, 0]; + assert_eq!(String::from("0/1/0"), genotype_to_string(genotype)); + } + + #[test] + fn test_write_vcf() { + let variant_locations = HashMap::from([ + ("chr1".to_string(), vec![(3, 0, 1), (7, 1, 2)]) + ]); + let fasta_order = vec!["chr1".to_string()]; + let ploidy = 2; + let reference_path = "/fake/path/to/H1N1.fa"; + let overwrite_output = false; + let output_file_prefix = "test"; + let mut rng = thread_rng(); + write_vcf( + &variant_locations, + &fasta_order, + ploidy, + reference_path, + overwrite_output, + output_file_prefix, + &mut rng, + ).unwrap(); + assert!(Path::new("test.vcf").exists()); + fs::remove_file("test.vcf").unwrap(); + } } \ No newline at end of file