From f1caa35f2ce407500cffca2def5e78faacb13e48 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Wed, 13 Mar 2024 00:45:32 -0500 Subject: [PATCH 01/23] Making some changes to make room for paired-ended reads. --- Cargo.toml | 2 +- src/main.rs | 35 +++++---- src/utils.rs | 3 +- src/utils/fasta_tools.rs | 14 ++-- src/utils/make_reads.rs | 96 +++++++++++++++++++------ src/utils/mutate.rs | 152 +++++++++++++++++++++++---------------- src/utils/vcf_tools.rs | 7 ++ 7 files changed, 201 insertions(+), 108 deletions(-) create mode 100644 src/utils/vcf_tools.rs diff --git a/Cargo.toml b/Cargo.toml index 8430db1..1efe611 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,4 +12,4 @@ simplelog = "0.12.1" serde_yaml = "0.9.32" clap = { version = "4.5.1", features = ["derive"] } itertools = "0.12.1" -assert_fs = "1.1.1" +assert_fs = "1.1.1" \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 43cba6f..bbe0f8d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -32,9 +32,11 @@ fn main() { let mut rng = thread_rng(); info!("Begin processing"); - + // parse the arguments from the command line let args = cli::Cli::parse(); + // set up the config struct based on whether there was an input config. Input config + // overrides any other inputs. let config = if args.config != "" { info!("Using Configuration file input: {}", &args.config); read_config_yaml(args.config) @@ -44,23 +46,28 @@ fn main() { Ok(build_config_from_args(args).expect("Problem reading configuration yaml file")) }.unwrap(); + // Create the prefix of the files to write let output_file = format!("{}/{}", config.output_dir, config.output_prefix); - info!("Mapping fasta file: {}", &config.reference); + // Reading the reference file into memory + info!("Mapping reference fasta file: {}", &config.reference); let (fasta_map, fasta_order) = read_fasta(&config.reference); - // todo: - // need to add this twice, produce two mutated fastas, or at least 2 separate mutation - // datasets, each with half the mutation rate. Going to mean twice as much memory needed for - // fasta creation, which isn't ideal - info!("Mutating fasta"); - let mutated_map: Box>>> = mutate_fasta( + + // Mutating the reference and recoring the variant locations. + info!("Mutating reference."); + let (mutated_map, variant_locations) = mutate_fasta( &fasta_map, - config.ploidy, &mut rng ); - info!("Outputting fasta files"); - if config.produce_fasta == true { + if config.produce_vcf { + info!("Writing vcf file"); + todo!(); + // write_vcf(&variant_locations, &fasta_order, &config.ploidy, &output_file, &mut rng) + } + + if config.produce_fasta { + info!("Outputting fasta file"); write_fasta( &mutated_map, &fasta_order, @@ -70,16 +77,18 @@ fn main() { } let mut read_sets: HashSet> = HashSet::new(); - for (_name, sequences) in mutated_map.iter() { + for (_name, sequence) in mutated_map.iter() { // defined as a set of read sequences that should cover // the mutated sequence `coverage` number of times let data_set = generate_reads( - &sequences, + sequence, &config.read_len, &config.coverage, + config.paired_ended, &mut rng ); + // todo: we need to keep track of these reads better for paired endedness read_sets.extend(*data_set); } diff --git a/src/utils.rs b/src/utils.rs index 9bd2c47..04d2f1f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,4 +4,5 @@ pub mod config; pub mod cli; pub mod make_reads; pub mod mutate; -pub mod fastq_tools; \ No newline at end of file +pub mod fastq_tools; +pub mod vcf_tools; \ No newline at end of file diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs index 6cfbfb6..58c9ddb 100644 --- a/src/utils/fasta_tools.rs +++ b/src/utils/fasta_tools.rs @@ -86,7 +86,7 @@ pub fn read_fasta( } pub fn write_fasta( - fasta_output: &Box>>>, + fasta_output: &Box>>, fasta_order: &Vec, output_file: &str, ploidy: usize @@ -99,12 +99,12 @@ pub fn write_fasta( info!("Writing file: {}", this_fasta); let mut outfile = File::options().create(true).append(true).open(this_fasta)?; for contig in fasta_order { - let sequences = &fasta_output[contig]; + let sequence = &fasta_output[contig]; // Write contig name writeln!(&mut outfile, ">{}", contig)?; // write sequences[ploid] to this_fasta let mut i = 0; - let sequence_to_write: &Vec = &sequences[ploid]; + let sequence_to_write: &Vec = &sequence; while i < sequence_to_write.len() { let mut line = String::new(); let mut max: usize = 70; @@ -144,11 +144,9 @@ fn test_read_fasta() { fn test_write_fasta() -> Result<(), Box> { let seq1: Vec = vec![0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]; - let seq2: Vec = vec![0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,2]; - let sequences = vec![seq1, seq2]; - let fasta_output: HashMap>> = HashMap::from([ - (String::from("H1N1_HA"), sequences) - ]); + let fasta_output: HashMap> = HashMap::from( + (String::from("H1N1_HA"), seq1) + ); let fasta_pointer = Box::new(fasta_output); let fasta_order = vec![String::from("H1N1_HA")]; let output_file = "test"; diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs index 4f70b23..2a67ab5 100644 --- a/src/utils/make_reads.rs +++ b/src/utils/make_reads.rs @@ -8,10 +8,19 @@ fn cover_dataset( coverage: usize, rng: &mut ThreadRng, ) -> Vec<(usize, usize)> { - // This function selects the positions of the reads. It starts at the beginning and goes out - // one read length, then picks a random jump between 0 and half the read length to move - // And picks those coordinates for a second read. Once the tail of the read is past the end, - // we start over again at 0. + /* + Takes: + span_length: Total number of bases in the sequence + read_length: The length of the reads for this run + coverage: The coverage depth for the reads + Returns: + A vector of tuples (usize, usize), denoting the start and end positions of the reads + + This function selects the positions of the reads. It starts at the beginning and goes out + one read length, then picks a random jump between 0 and half the read length to move + And picks those coordinates for a second read. Once the tail of the read is past the end, + we start over again at 0. + */ // todo currently the reads look a little light so we may need to improve this calculation. let mut read_set: Vec<(usize, usize)> = vec![]; let mut start: usize = 0; @@ -31,34 +40,75 @@ fn cover_dataset( read_set } +fn complement(nucleotide: u8) -> u8 { + /* + 0 = A, 1 = C, 2 = G, 3 = T, + matches with the complement of each nucleotide. + + Todo: make this part of a struct to standardize across the program. + */ + return match nucleotide { + 0 => 3, + 1 => 2, + 2 => 1, + 3 => 0, + _ => 4, + } +} + +fn reverse_complement(sequence: &Vec) -> Vec { + /* + Returns the reverse complement of a vector of u8's representing a DNA sequence. + */ + let length = sequence.len(); + let mut rev_comp = Vec::new(); + for i in (0..length).rev() { + rev_comp.push(complement(sequence[i])) + } + rev_comp +} + pub fn generate_reads( - mutated_sequences: &Vec>, + mutated_sequence: &Vec, read_length: &usize, coverage: &usize, - rng: &mut ThreadRng, + paired_ended: bool, + mut rng: &mut ThreadRng, ) -> Box>> { - // Infer ploidy - let ploidy = mutated_sequences.len(); - // Need X/ploidy reads per ploid, but probably fewer - let coverage_per_ploid = coverage / ploidy + 1; + /* + Takes: + mutated_sequence: a vector of u8's representing the mutated sequence. + read_length: the length ef the reads for this run + coverage: the average depth of coverage for this run + rng: the random number generator for the run + Returns: + HashSet of vectors representing the read sequences, stored on the heap in box. + + This takes a mutated sequence and produces a set of reads based on the mutated sequence. For + paired ended reads, this will generate a set of reads from each end, by taking the reverse + complement int the output + */ + + if paired_ended { + let rev_comp = reverse_complement(mutated_sequence); + todo!(); + } // set up some defaults and storage let mut read_set: HashSet> = HashSet::new(); - for i in 0..ploidy { - let seq_len = *(&mutated_sequences[i].len()); - // Generate a vector of read positions - let read_positions: Vec<(usize, usize)> = cover_dataset( - seq_len, - *read_length, - coverage_per_ploid, - rng, - ); + let seq_len = mutated_sequence.len(); + // Generate a vector of read positions + let read_positions: Vec<(usize, usize)> = cover_dataset( + seq_len, + *read_length, + *coverage, + &mut rng, + ); - // Generate the reads from the read positions. - for (start, end) in read_positions { - read_set.insert(mutated_sequences[i][start..end].into()); - } + // Generate the reads from the read positions. + for (start, end) in read_positions { + read_set.insert(mutated_sequence[start..end].into()); } // puts the reads in the heap. Box::new(read_set) diff --git a/src/utils/mutate.rs b/src/utils/mutate.rs index 6472dce..dd0d3ca 100644 --- a/src/utils/mutate.rs +++ b/src/utils/mutate.rs @@ -54,84 +54,98 @@ impl NucModel { pub fn mutate_fasta( file_struct: &HashMap>, - ploidy: usize, mut rng: &mut ThreadRng -) -> Box>>> { - // This takes a hashmap of contig names (keys) and a vector representing the reference sequence. - // It performs a basic calculation (length x mutation rate) and chooses that many positions - // along the sequence to mutate. It then builds a return string that represents the altered - // sequence. +) -> (Box>>, Box>>) { + /* + Takes: + file_struct: a hashmap of contig names (keys) and a vector + representing the reference sequence. + ploidy: The number of copies of the genome within an organism's cells + rng: random number generator for the run - // Todo: potentially re-write this to simple record the positions. Maybe position and length. - // and which ploid. Need to do this once for each ploid. - const MUT_RATE: f64 = 0.01; // will update this with something more elaborate later. - // Generate random proportions to give the ploids, to keep things interesting. - let mut proportions: Vec = Vec::with_capacity(ploidy); - // For ploidy = N, we generate N random values, then divide them by the sum of the values, this - // gives N random numbers whose sum = 1, to randomly assign proportion to the ploids. We may - // find there is a better model later. - for _ in 0..ploidy { - proportions.push((&mut rng).gen::()); - } - // Normalize my randos (divide by their total) - let total: f64 = proportions.iter().sum(); - proportions = proportions.iter().map(|x| x/total).collect(); - // This leaves a list of numbers that add up to 1, since these are proportions of 1, will - // ensure our mutation rate of each ploid adds up to the total MUT_RATE - let ploid_mut_rate: Vec = proportions.iter().map(|x| x*MUT_RATE).collect(); + Returns: + A tuple with pointers to: + A hashmap with keys that are contig names and a vector with the mutated sequence + A vector of tuples containing the location and alt of each variant - let mut return_struct: HashMap>> = HashMap::new(); // the mutated sequences - for (name, sequence) in file_struct { - let mut strands: Vec> = Vec::new(); - for ploid in 0..ploidy { - let sequence_length = sequence.len(); - debug!("Sequence {} is {} bp long", name, sequence_length); + This function performs a basic calculation (length x mutation rate +/- a random amount) + and chooses that many positions along the sequence to mutate. It then builds a return + string that represents the altered sequence and stores all the variants. + */ + const MUT_RATE: f64 = 0.01; // will update this with something more elaborate later. + let mut return_struct: HashMap> = HashMap::new(); // the mutated sequences - // Clone the reference - let mut mutated_record: Vec = sequence.clone(); + // hashmap with keys of the contig names with a list of positions and alts under the contig. + let mut all_variants: HashMap> = HashMap::new(); - // Calculate how many mutations to add - let num_positions = ( - sequence_length as f64 * ploid_mut_rate[ploid] - ).round() as usize; - if num_positions == 0 { - if rng.gen_bool(1.0/(ploidy as f64)) == true { - mutated_record = mutate_sequence( - mutated_record, 1, sequence_length, &mut rng - ); - } - } else { - mutated_record = mutate_sequence( - mutated_record, num_positions, sequence_length, &mut rng - ); - } - strands.push(mutated_record.clone()); + for (name, sequence) in file_struct { + // Mutations for this contig + let contig_mutations: Vec<(usize, u8)> = Vec::new(); + // The length of this sequence + let sequence_length = sequence.len(); + debug!("Sequence {} is {} bp long", name, sequence_length); + // Clone the reference to create mutations + let mut mutated_record: Vec = sequence.clone(); + // Calculate how many mutations to add + let mut num_positions = sequence_length as f64 * MUT_RATE; + // Add or subtract a few extra positions. + num_positions += { + // A random amount up to 10% of the reads + let factor = rng.gen() * 0.10; + // 25% of the time subtract, otherwise we'll add. + let sign = if rng.bool(0.25) { -1 } else { 1 }; + // add or subtract up to 10% of the reads. + num_positions * (sign * factor) + }; + // round the number of positions to the nearest usize. + num_positions = num_positions.round() as usize; + // If we somehow ended up with negative or no reads, we may still randomly add one mutation. + if num_positions <= 0 { + // we want to add at least 1 mutation to each contig + (mutated_record, contig_mutations) = mutate_sequence( + mutated_record, 1, &mut rng + ); + } else { + // else add num positions number of variants + (mutated_record, contig_mutations) = mutate_sequence( + mutated_record, num_positions, &mut rng + ); } - // Add to the new hashmap - return_struct.entry(name.clone()).or_insert(strands.clone()); + // Add to the return struct and variants map. + return_struct.entry(name.clone()).or_insert(mutated_record.clone()); + all_variants.entry(name.clone()).or_insert(contig_mutations); } - Box::new(return_struct) + (Box::new(return_struct), Box::new(all_variants)) } fn mutate_sequence( sequence: Vec, num_positions: usize, - length: usize, mut rng: &mut ThreadRng -) -> Vec { +) -> (Vec, Vec<(usize, u8)>) { /* - Takes a vector of u8's and mutate a few positions at random. + Takes: + sequence: A u8 vector representing a sequence of DNA + num_positions: The number of mutations to add to this sequence + rng: random number generator for the run + + returns a tuple with: + Vec = the sequence itself + Vec(usize, u8) = the position of the snp and the alt allele for that snp. + + Takes a vector of u8's and mutate a few positions at random. Returns the mutated sequence and + a list of tuples with the position and the alts of the SNPs. */ debug!("Adding {} mutations", num_positions); let mut mutated_record = sequence.clone(); // Randomly select num_positions from positions, weighted by gc bias and whatever. For now // all he weights are just equal. - let weights = vec![1; length]; + let weights = vec![1; mutated_record.len()]; // zip together weights and positions let mut weighted_positions: Vec<(usize, i32)> = Vec::new(); // izip!() accepts iterators and/or values with IntoIterator. - for (x, y) in izip!(&mut (0..length), &weights) { + for (x, y) in izip!(&mut (0..mutated_record.len()), &weights) { weighted_positions.push((x, *y)) } // now choose a random selection of num_positions without replacement @@ -140,19 +154,33 @@ fn mutate_sequence( .unwrap() .collect::>(); // Build mutation model - let mut_a = NucModel::from(0, vec![0, 1, 1, 1]); - let mut_c = NucModel::from(1, vec![1, 0, 1, 1]); - let mut_g = NucModel::from(2, vec![1, 1, 0, 1]); - let mut_t = NucModel::from(3, vec![1, 1, 1, 0]); + /* + Using NEAT's original mutation model of + [[0.0, 0.15, 0.7, 0.15], + [0.15, 0.0, 0.15, 0.7], + [0.7, 0.15, 0.0, 0.15], + [0.15, 0.7, 0.15, 0.0]] - for (index, _) in mutated_elements { + multiply each value by 20, which produces integer values + */ + let mut_a = NucModel::from(0, vec![0, 3, 14, 3]); + let mut_c = NucModel::from(1, vec![3, 0, 3, 14]); + let mut_g = NucModel::from(2, vec![14, 3, 0, 3]); + let mut_t = NucModel::from(3, vec![3, 14, 3, 0]); + + // Will hold the variants added to this sequence + let mut sequence_variants: Vec<(usize, u8)> = Vec::new(); + // for each index, picks a new base + for (index, _weight) in mutated_elements { mutated_record[*index] = match sequence[*index] { 0 => mut_a.choose_new_nuc(&mut rng), 1 => mut_c.choose_new_nuc(&mut rng), 2 => mut_g.choose_new_nuc(&mut rng), 3 => mut_t.choose_new_nuc(&mut rng), _ => 4 - } + }; + // add the location and alt base for the variant + sequence_variants.push((*index, mutated_record[index])) } - mutated_record + (mutated_record, sequence_variants) } \ No newline at end of file diff --git a/src/utils/vcf_tools.rs b/src/utils/vcf_tools.rs new file mode 100644 index 0000000..7df4bf0 --- /dev/null +++ b/src/utils/vcf_tools.rs @@ -0,0 +1,7 @@ +pub fn write_vcf( + variant_locations, + fasta_order, + ploidy, + output_file_prefix, + mut rng +) -> Result<(), > \ No newline at end of file From 24cd2781ffb0746b18b45dc30f533ce092c0e996 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Wed, 13 Mar 2024 14:07:39 -0500 Subject: [PATCH 02/23] Add vcf writer to rusty-neat --- src/main.rs | 14 +++-- src/utils/cli.rs | 2 +- src/utils/config.rs | 5 +- src/utils/fasta_tools.rs | 61 ++++++++++++---------- src/utils/mutate.rs | 16 +++--- src/utils/vcf_tools.rs | 107 ++++++++++++++++++++++++++++++++++++--- 6 files changed, 154 insertions(+), 51 deletions(-) diff --git a/src/main.rs b/src/main.rs index bbe0f8d..662fb55 100644 --- a/src/main.rs +++ b/src/main.rs @@ -19,6 +19,7 @@ use utils::config::{read_config_yaml, build_config_from_args}; use utils::mutate::mutate_fasta; use utils::make_reads::generate_reads; use utils::fastq_tools::write_fastq; +use utils::vcf_tools::write_vcf; fn main() { @@ -53,7 +54,7 @@ fn main() { info!("Mapping reference fasta file: {}", &config.reference); let (fasta_map, fasta_order) = read_fasta(&config.reference); - // Mutating the reference and recoring the variant locations. + // Mutating the reference and recording the variant locations. info!("Mutating reference."); let (mutated_map, variant_locations) = mutate_fasta( &fasta_map, @@ -62,8 +63,14 @@ fn main() { if config.produce_vcf { info!("Writing vcf file"); - todo!(); - // write_vcf(&variant_locations, &fasta_order, &config.ploidy, &output_file, &mut rng) + write_vcf( + &fasta_map, + &variant_locations, + &fasta_order, + config.ploidy, + &config.reference, + &output_file, + &mut rng).expect("Error writing vcf file") } if config.produce_fasta { @@ -72,7 +79,6 @@ fn main() { &mutated_map, &fasta_order, &output_file, - config.ploidy ).expect("Problem writing fasta file"); } diff --git a/src/utils/cli.rs b/src/utils/cli.rs index dfe4d77..e61d1ff 100644 --- a/src/utils/cli.rs +++ b/src/utils/cli.rs @@ -19,7 +19,7 @@ pub struct Cli { read_length = the length of the reads in the output fastq file. Default = 150 coverage = The average depth per read of the fastq files. Default = 10 */ - #[arg(short='y', long="configuration_yaml", default_value_t=String::new())] + #[arg(short='C', long="configuration_yaml", default_value_t=String::new())] pub config: String, #[arg(short='r', long="reference", default_value_t=String::from("data/H1N1.fa"))] pub reference: String, diff --git a/src/utils/config.rs b/src/utils/config.rs index d2272f6..12d2ac7 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -209,10 +209,7 @@ impl ConfigBuilder { } } if self.produce_fasta { - debug!("Producing fasta files:"); - for ploid in 0..self.ploidy { - debug!("\t> {}_p{}.fasta", file_prefix, ploid+1); - } + debug!("Producing fasta file: {}.fasta", file_prefix); } if self.produce_vcf { debug!("Producing vcf file: {}.vcf", file_prefix) diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs index 58c9ddb..70833e6 100644 --- a/src/utils/fasta_tools.rs +++ b/src/utils/fasta_tools.rs @@ -35,6 +35,7 @@ pub fn num_to_char(nuc_num: u8) -> &'static str { #[allow(unused)] pub enum NucRepr { // I just discovered this repr type thing and need to investigate. + // todo figure out if we can use this idea to simplify things A = 0, C = 1, G = 2, @@ -43,6 +44,7 @@ pub enum NucRepr { } pub fn sequence_array_to_string(input_array: &Vec) -> String { + // Converts a sequence vector into a string representing the DNA sequence let mut return_string = String::new(); for num in input_array { return_string += num_to_char(*num); @@ -53,7 +55,7 @@ pub fn sequence_array_to_string(input_array: &Vec) -> String { pub fn read_fasta( fasta_path: &str ) -> (Box>>, Vec) { - // Reads a fasta and turns it into a HashMap and puts it in the heap + // Reads a fasta file and turns it into a HashMap and puts it in the heap info!("Reading fasta: {}", fasta_path); let mut fasta_map: HashMap> = HashMap::new(); @@ -89,36 +91,39 @@ pub fn write_fasta( fasta_output: &Box>>, fasta_order: &Vec, output_file: &str, - ploidy: usize ) -> io::Result<()> { + /* + Takes: + fasta_output: the hashmap of mutated sequences with contig names + fasta_order: A vector with the proper order for the fasta elements + output_file: the prefix for the output file name + ploidy: the number of copies of each chromosome the simulation is using + Returns: + Errors if there is a problem writing the file, otherwise it returns nothing. + */ // writing fasta output to files - let mut fasta_files: Vec = Vec::new(); - for ploid in 0..ploidy { - let this_fasta = format!("{}_p{}.fasta", output_file, ploid+1); - fasta_files.push(this_fasta.clone()); - info!("Writing file: {}", this_fasta); - let mut outfile = File::options().create(true).append(true).open(this_fasta)?; - for contig in fasta_order { - let sequence = &fasta_output[contig]; - // Write contig name - writeln!(&mut outfile, ">{}", contig)?; - // write sequences[ploid] to this_fasta - let mut i = 0; - let sequence_to_write: &Vec = &sequence; - while i < sequence_to_write.len() { - let mut line = String::new(); - let mut max: usize = 70; - let this_length: usize = sequence_to_write[i..].len(); - // If we don't have 70 characters, write out whatever is left. - if this_length < 70 { - max = this_length - } - for j in 0..max { - line += num_to_char(sequence_to_write[i + j]); - } - writeln!(&mut outfile, "{}", line)?; - i += 70; + let output_fasta = format!("{}.fasta", output_file); + let mut outfile = File::options().create_new(true).append(true).open(output_fasta)?; + for contig in fasta_order { + let sequence = &fasta_output[contig]; + // Write contig name + writeln!(&mut outfile, ">{}", contig)?; + // write sequences[ploid] to this_fasta + let mut i = 0; + let sequence_to_write: &Vec = &sequence; + while i < sequence_to_write.len() { + let mut line = String::new(); + let mut max: usize = 70; + let this_length: usize = sequence_to_write[i..].len(); + // If we don't have 70 characters, write out whatever is left. + if this_length < 70 { + max = this_length + } + for j in 0..max { + line += num_to_char(sequence_to_write[i + j]); } + writeln!(&mut outfile, "{}", line)?; + i += 70; } }; Ok(()) diff --git a/src/utils/mutate.rs b/src/utils/mutate.rs index dd0d3ca..c3ec9a3 100644 --- a/src/utils/mutate.rs +++ b/src/utils/mutate.rs @@ -80,25 +80,25 @@ pub fn mutate_fasta( for (name, sequence) in file_struct { // Mutations for this contig - let contig_mutations: Vec<(usize, u8)> = Vec::new(); + let mut contig_mutations: Vec<(usize, u8)> = Vec::new(); // The length of this sequence let sequence_length = sequence.len(); debug!("Sequence {} is {} bp long", name, sequence_length); // Clone the reference to create mutations let mut mutated_record: Vec = sequence.clone(); // Calculate how many mutations to add - let mut num_positions = sequence_length as f64 * MUT_RATE; + let mut rough_num_positions: f64 = sequence_length as f64 * MUT_RATE; // Add or subtract a few extra positions. - num_positions += { + rough_num_positions += { // A random amount up to 10% of the reads - let factor = rng.gen() * 0.10; + let factor: f64 = rng.gen::() * 0.10; // 25% of the time subtract, otherwise we'll add. - let sign = if rng.bool(0.25) { -1 } else { 1 }; + let sign: f64 = if rng.gen_bool(0.25) { -1.0 } else { 1.0 }; // add or subtract up to 10% of the reads. - num_positions * (sign * factor) + rough_num_positions * (sign * factor) }; // round the number of positions to the nearest usize. - num_positions = num_positions.round() as usize; + let num_positions = rough_num_positions.round() as usize; // If we somehow ended up with negative or no reads, we may still randomly add one mutation. if num_positions <= 0 { // we want to add at least 1 mutation to each contig @@ -180,7 +180,7 @@ fn mutate_sequence( _ => 4 }; // add the location and alt base for the variant - sequence_variants.push((*index, mutated_record[index])) + sequence_variants.push((*index, mutated_record[*index])) } (mutated_record, sequence_variants) } \ No newline at end of file diff --git a/src/utils/vcf_tools.rs b/src/utils/vcf_tools.rs index 7df4bf0..6261920 100644 --- a/src/utils/vcf_tools.rs +++ b/src/utils/vcf_tools.rs @@ -1,7 +1,102 @@ +extern crate log; + +use std::collections::HashMap; +use std::io; +use std::io::Write; +use std::fs::File; +use rand::Rng; +use rand::rngs::ThreadRng; +use rand::seq::IndexedRandom; +use utils::fasta_tools::num_to_char; + +fn genotype_to_string(genotype: Vec) -> String { + /* + Converts a vector of 0s and 1s representing genotype to a standard + vcf genotype string. + */ + let mut geno_string = String::new(); + for ploid in genotype { + geno_string += &format!("{}/", ploid.to_string()) + } + geno_string.strip_suffix("/").unwrap().to_string() +} + pub fn write_vcf( - variant_locations, - fasta_order, - ploidy, - output_file_prefix, - mut rng -) -> Result<(), > \ No newline at end of file + reference_sequence: &HashMap>, + variant_locations: &HashMap>, + fasta_order: &Vec, + ploidy: usize, + reference: &str, + output_file_prefix: &str, + mut rng: &mut ThreadRng, +) -> io::Result<()> { + /* + Takes: + reference_sequence: A map of the contig names keyed to a u8 vector representing the sequence + variant_locations: A map of contig names keyed to lists of variants in that contig + consisting of a tuple of (position, alt base). + fasta_order: A vector of contig names in the order of the reference fasta. + ploidy: The number of copies of each chromosome present in the organism + output_file_prefix: The path to the directory and the prefix to use for filenames + rng: A random number generator for this run + Result: + Throws and error if there's a problem, or else returns nothing. + */ + // ploid numbers are used to pick a number of ploids to mutate + let ploid_numbers: Vec = (1..ploidy+1).collect(); + // set the filename of the output vcf + let mut filename = format!("{}.vcf", output_file_prefix); + let mut outfile = File::options().create_new(true).append(true).open(&mut filename)?; + // add the vcf header + writeln!(&mut outfile, "##fileformat=VCFv4.1")?; + writeln!(&mut outfile, "##reference={}", reference)?; + writeln!(&mut outfile, "##Generated by rusty-neat")?; + writeln!(&mut outfile, "##INFO=")?; + writeln!(&mut outfile, "##INFO=")?; + writeln!(&mut outfile, "##INFO=")?; + writeln!(&mut outfile, "##INFO=")?; + writeln!(&mut outfile, "##INFO=")?; + writeln!(&mut outfile, "##ALT=")?; + writeln!(&mut outfile, "##ALT=")?; + writeln!(&mut outfile, "##ALT=")?; + writeln!(&mut outfile, "##ALT=")?; + writeln!(&mut outfile, "##ALT=")?; + writeln!(&mut outfile, "##ALT=")?; + writeln!(&mut outfile, "##ALT=")?; + writeln!(&mut outfile, "##FORMAT=")?; + // Add a neat sample column + writeln!(&mut outfile, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNEAT_simulated_sample")?; + // insert mutations + for contig in fasta_order { + for mutation in &variant_locations[contig] { + // If we're going to mutate more than one ploid (i.e. homozygous + // for diploid organisms), we must add it to the list. + let mut genotype: Vec = vec![0; ploidy]; + // We need to enumerate the index list for the genotype + let ploid_index: Vec = (0..ploidy).collect(); + // By default we'll assume heterozygous (only on one ploid). + let mut num_ploids: usize = 1; + let is_multiploid = (&mut rng).gen_bool(0.001); + if is_multiploid { + num_ploids = *ploid_numbers.choose(&mut rng).unwrap(); + } + for _ in 0..num_ploids { + // for each ploid that has the mutation, change one random + // genotype to 1, indicating the mutation is on that copy. + genotype[*ploid_index.choose(&mut rng).unwrap()] = 1 + } + // Format the output line. Any fields without data will be a simple period. Quality + // is set to 37 for all these variants. + let line = format!("{}\t{}\t.\t{}\t{}\t37\tPASS\t.\tGT\t{}", + contig, + mutation.0 + 1, + num_to_char(reference_sequence[contig][mutation.0]), + num_to_char(mutation.1), + genotype_to_string(genotype), + ); + + writeln!(&mut outfile, "{}", line)?; + } + }; + Ok(()) +} \ No newline at end of file From e41a0aa5b1a4c4aad135f2feb3bcc1bc7c731d33 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Wed, 13 Mar 2024 14:26:20 -0500 Subject: [PATCH 03/23] adding fastq paired-ended option --- src/main.rs | 1 + src/utils/fastq_tools.rs | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index bbe0f8d..a703bb9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -99,6 +99,7 @@ fn main() { info!("Writing fastq"); write_fastq( &output_file, + config.paired_ended, *outsets, ).expect("Problem writing fastq file"); info!("Processing complete") diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs index ca3fc96..440febc 100644 --- a/src/utils/fastq_tools.rs +++ b/src/utils/fastq_tools.rs @@ -6,6 +6,7 @@ use utils::fasta_tools::sequence_array_to_string; pub fn write_fastq( fastq_filename: &str, + paired_ended: bool, dataset: Vec<&Vec>, ) -> io::Result<()> { /* @@ -17,8 +18,15 @@ pub fn write_fastq( // (Although this feature is currently untested and unknown). // (May need sorting.) let name_prefix = "neat_generated_".to_string(); - let fastq_filename = String::from(fastq_filename) + "_r1.fastq"; + // vector to hold the fastq filenames, since we don't know ahead of time if we need one or two + let mut fastq_files: Vec = Vec::new(); + fastq_files.push(String::from(fastq_filename) + "_r1.fastq"); + if paired_ended { + fastq_files.push(String::from(fastq_filename) + "_r1.fastq"); + } + // open the filename let mut outfile = File::options().create_new(true).append(true).open(fastq_filename)?; + // write out sequence by sequence using index to track the numbering let mut index = 1; for sequence in dataset { let sequence_len = sequence.len(); From b996dc373871e6659d2314e71b5ccb56fa5fafbe Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Wed, 13 Mar 2024 14:26:20 -0500 Subject: [PATCH 04/23] adding fastq paired-ended option --- src/main.rs | 1 + src/utils/fastq_tools.rs | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 662fb55..e7ad50e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -105,6 +105,7 @@ fn main() { info!("Writing fastq"); write_fastq( &output_file, + config.paired_ended, *outsets, ).expect("Problem writing fastq file"); info!("Processing complete") diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs index ca3fc96..440febc 100644 --- a/src/utils/fastq_tools.rs +++ b/src/utils/fastq_tools.rs @@ -6,6 +6,7 @@ use utils::fasta_tools::sequence_array_to_string; pub fn write_fastq( fastq_filename: &str, + paired_ended: bool, dataset: Vec<&Vec>, ) -> io::Result<()> { /* @@ -17,8 +18,15 @@ pub fn write_fastq( // (Although this feature is currently untested and unknown). // (May need sorting.) let name_prefix = "neat_generated_".to_string(); - let fastq_filename = String::from(fastq_filename) + "_r1.fastq"; + // vector to hold the fastq filenames, since we don't know ahead of time if we need one or two + let mut fastq_files: Vec = Vec::new(); + fastq_files.push(String::from(fastq_filename) + "_r1.fastq"); + if paired_ended { + fastq_files.push(String::from(fastq_filename) + "_r1.fastq"); + } + // open the filename let mut outfile = File::options().create_new(true).append(true).open(fastq_filename)?; + // write out sequence by sequence using index to track the numbering let mut index = 1; for sequence in dataset { let sequence_len = sequence.len(); From fbd9eb71cecef9343256242cbd99197475d7adad Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Wed, 13 Mar 2024 17:28:21 -0500 Subject: [PATCH 05/23] Working on adding paired-ended --- src/main.rs | 43 ++++++++++++++++++++-------------------- src/utils/fastq_tools.rs | 6 ++++++ src/utils/mutate.rs | 9 +++++---- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/src/main.rs b/src/main.rs index e7ad50e..1c3acf3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,7 +22,6 @@ use utils::fastq_tools::write_fastq; use utils::vcf_tools::write_vcf; fn main() { - TermLogger::init( LevelFilter::Trace, Config::default(), @@ -61,6 +60,15 @@ fn main() { &mut rng ); + if config.produce_fasta { + info!("Outputting fasta file"); + write_fasta( + &mutated_map, + &fasta_order, + &output_file, + ).expect("Problem writing fasta file"); + } + if config.produce_vcf { info!("Writing vcf file"); write_vcf( @@ -73,15 +81,6 @@ fn main() { &mut rng).expect("Error writing vcf file") } - if config.produce_fasta { - info!("Outputting fasta file"); - write_fasta( - &mutated_map, - &fasta_order, - &output_file, - ).expect("Problem writing fasta file"); - } - let mut read_sets: HashSet> = HashSet::new(); for (_name, sequence) in mutated_map.iter() { // defined as a set of read sequences that should cover @@ -98,16 +97,18 @@ fn main() { read_sets.extend(*data_set); } - info!("Shuffling output fastq data"); - let mut outsets: Box>> = Box::new(read_sets.iter().collect()); - outsets.shuffle(&mut rng); - - info!("Writing fastq"); - write_fastq( - &output_file, - config.paired_ended, - *outsets, - ).expect("Problem writing fastq file"); - info!("Processing complete") + if config.produce_fastq { + info!("Shuffling output fastq data"); + let mut outsets: Box>> = Box::new(read_sets.iter().collect()); + outsets.shuffle(&mut rng); + + info!("Writing fastq"); + write_fastq( + &output_file, + config.paired_ended, + *outsets, + ).expect("Problem writing fastq file"); + info!("Processing complete") + } } diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs index 440febc..f545ba2 100644 --- a/src/utils/fastq_tools.rs +++ b/src/utils/fastq_tools.rs @@ -10,6 +10,12 @@ pub fn write_fastq( dataset: Vec<&Vec>, ) -> io::Result<()> { /* + Takes: + fastq_filename: prefix for the output fastq files + dataset: List of u8 vectors representing dna sequences. + returns: + Error if there is a problem or else nothing. + Writes fastq files. At the moment, it only writes out single r1 file, but will eventually write out r1 and r2 files. */ diff --git a/src/utils/mutate.rs b/src/utils/mutate.rs index c3ec9a3..1b8b252 100644 --- a/src/utils/mutate.rs +++ b/src/utils/mutate.rs @@ -123,7 +123,7 @@ fn mutate_sequence( sequence: Vec, num_positions: usize, mut rng: &mut ThreadRng -) -> (Vec, Vec<(usize, u8)>) { +) -> (Vec, Vec<(usize, u8, u8)>) { /* Takes: sequence: A u8 vector representing a sequence of DNA @@ -169,10 +169,11 @@ fn mutate_sequence( let mut_t = NucModel::from(3, vec![3, 14, 3, 0]); // Will hold the variants added to this sequence - let mut sequence_variants: Vec<(usize, u8)> = Vec::new(); + let mut sequence_variants: Vec<(usize, u8, u8)> = Vec::new(); // for each index, picks a new base for (index, _weight) in mutated_elements { - mutated_record[*index] = match sequence[*index] { + let reference_base = sequence[*index]; + mutated_record[*index] = match reference_base { 0 => mut_a.choose_new_nuc(&mut rng), 1 => mut_c.choose_new_nuc(&mut rng), 2 => mut_g.choose_new_nuc(&mut rng), @@ -180,7 +181,7 @@ fn mutate_sequence( _ => 4 }; // add the location and alt base for the variant - sequence_variants.push((*index, mutated_record[*index])) + sequence_variants.push((*index, mutated_record[*index], *reference_base)) } (mutated_record, sequence_variants) } \ No newline at end of file From ebc81aa90e98aebe4d4f2ae271273d38b668a9bc Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 14 Mar 2024 13:20:05 -0500 Subject: [PATCH 06/23] Added paired ended fragment generater to generate reads --- Cargo.lock | 18 +++++++ Cargo.toml | 3 +- src/main.rs | 5 +- src/utils/config.rs | 8 ++-- src/utils/make_reads.rs | 101 +++++++++++++++++++++++++++++++++------- src/utils/mutate.rs | 15 +++--- src/utils/vcf_tools.rs | 8 ++-- 7 files changed, 121 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 15db81d..e40cda8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -407,6 +407,12 @@ version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + [[package]] name = "linux-raw-sys" version = "0.4.13" @@ -438,6 +444,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -582,6 +589,16 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "rand_distr" +version = "0.5.0-alpha.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc403db446a1a58a404b9944a91c4adf7d99b0397429c998f0394d751ac98eff" +dependencies = [ + "num-traits", + "rand 0.9.0-alpha.0", +] + [[package]] name = "rdrand" version = "0.4.0" @@ -631,6 +648,7 @@ dependencies = [ "itertools", "log", "rand 0.9.0-alpha.0", + "rand_distr", "serde_yaml", "simplelog", "xorshift", diff --git a/Cargo.toml b/Cargo.toml index 1efe611..6ef81fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,4 +12,5 @@ simplelog = "0.12.1" serde_yaml = "0.9.32" clap = { version = "4.5.1", features = ["derive"] } itertools = "0.12.1" -assert_fs = "1.1.1" \ No newline at end of file +assert_fs = "1.1.1" +rand_distr = "0.5.0-alpha.0" \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 1c3acf3..b1a380d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,6 +3,8 @@ extern crate clap; extern crate log; extern crate simplelog; extern crate serde_yaml; +extern crate rand_distr; +extern crate itertools; mod utils; @@ -72,7 +74,6 @@ fn main() { if config.produce_vcf { info!("Writing vcf file"); write_vcf( - &fasta_map, &variant_locations, &fasta_order, config.ploidy, @@ -90,6 +91,8 @@ fn main() { &config.read_len, &config.coverage, config.paired_ended, + config.fragment_mean, + config.fragment_st_dev, &mut rng ); diff --git a/src/utils/config.rs b/src/utils/config.rs index 12d2ac7..27befc3 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -48,7 +48,7 @@ pub struct RunConfiguration { pub output_prefix: String, } -#[warn(dead_code)] +#[allow(dead_code)] impl RunConfiguration { // The purpose of this function is to redirect you to the ConfigBuilder pub fn build() -> ConfigBuilder { @@ -99,7 +99,7 @@ impl ConfigBuilder { fragment_mean: 300.0, fragment_st_dev: 30.0, produce_fastq: true, - produce_fasta: true, + produce_fasta: false, produce_vcf: false, produce_bam: false, output_dir: current_dir, @@ -241,7 +241,7 @@ impl ConfigBuilder { } } -pub fn read_config_yaml(yaml: String) -> Result, &'static str> { +pub fn read_config_yaml(yaml: String) -> Result, ConfigError> { /* Reads an input configuration file from yaml using the serde package. Then sets the parameters based on the inputs. A "." value means to use the default value. @@ -251,7 +251,7 @@ pub fn read_config_yaml(yaml: String) -> Result, &'static let f = std::fs::File::open(yaml); let file = match f { Ok(l) => l, - Err(_) => { return Err("problem reading configuration file"); }, + Err(_) => return Err(ConfigError::FileReadError), }; // Uses serde_yaml to read the file into a HashMap let scrape_config: HashMap = serde_yaml::from_reader(file) diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs index 2a67ab5..efe351d 100644 --- a/src/utils/make_reads.rs +++ b/src/utils/make_reads.rs @@ -1,40 +1,98 @@ -use std::collections::HashSet; +use std::collections::{HashSet, VecDeque}; + use rand::RngCore; use rand::rngs::ThreadRng; +use rand::seq::SliceRandom; +use rand_distr::{Normal, Distribution}; fn cover_dataset( span_length: usize, read_length: usize, + mut fragment_pool: Vec, coverage: usize, - rng: &mut ThreadRng, + mut rng: &mut ThreadRng, ) -> Vec<(usize, usize)> { /* Takes: span_length: Total number of bases in the sequence read_length: The length of the reads for this run + fragment_pool: a vector of sizes for the fragments. If empty, it will instead be filled + by the read_length (single ended reads) + paired_ended: true or false if the run is paired ended mode or not. coverage: The coverage depth for the reads Returns: - A vector of tuples (usize, usize), denoting the start and end positions of the reads + A vector of tuples (usize, usize), denoting the start and end positions of the fragment of + DNA that was sequenced. This function selects the positions of the reads. It starts at the beginning and goes out one read length, then picks a random jump between 0 and half the read length to move And picks those coordinates for a second read. Once the tail of the read is past the end, we start over again at 0. */ - // todo currently the reads look a little light so we may need to improve this calculation. + // Reads that will be start and end of the fragment. let mut read_set: Vec<(usize, usize)> = vec![]; + + let mut cover_fragment_pool: VecDeque; + if fragment_pool.is_empty() { + // set the shuffled fragment pool just equal to an instance of read_length + cover_fragment_pool = VecDeque::from([read_length]); + } else { + // shuffle the fragment pool + fragment_pool.shuffle(&mut rng); + cover_fragment_pool = VecDeque::from(fragment_pool) + } + + // Gap size to keep track of how many uncovered bases we have per layer, to help decide if we + // need more layers + let mut gap_size: usize = 0; + let mut layer_count: usize = 0; + + // start this party off at zero. let mut start: usize = 0; - 'outer: for _ in 0..coverage { - while start < span_length { - let temp_end = start+read_length; - if temp_end > span_length { - start = 0; - continue 'outer; + + while layer_count <= coverage { + let fragment_length = cover_fragment_pool[0]; + cover_fragment_pool.push_back(fragment_length); + let temp_end = start+fragment_length; + if temp_end > span_length { + // TODO some variation on this modulo idea will work for bacterial reads + start = temp_end % span_length; + gap_size += start; + // + if gap_size >= span_length { + // if we have accumulated enough gap, then we need to run the same layer again. + // We'll reset gap size but not increment layer_count. + gap_size = gap_size % span_length; + continue; + } else { + layer_count += 1; + continue } - read_set.push((start, temp_end)); - // Picks a number between zero and half of a read length - let wildcard: usize = (rng.next_u32() % (read_length/4) as u32) as usize; - start += read_length + wildcard; // adds to the start to give it some spice + } + read_set.push((start, temp_end)); + + // insert size is the number of bases between reads in the fragment for paired ended reads + let insert_size = fragment_length - (read_length * 2); + // if these are singled ended reads, then the insert size will always be negative + if insert_size > 0 { + // if there's uncovered bases in between the reads on paired ended reads, we'll add + // that to the gap to ensure adequate coverage. + gap_size += insert_size; + } + + // Picks a number between zero and a quarter of a read length + let wildcard: usize = (rng.next_u32() % (read_length/4) as u32) as usize; + // adds to the start to give it some spice + start += temp_end + wildcard; + // sanity check. If we are already out of bounds, take the modulo + if start >= span_length { + // get us back in bounds + start = start % span_length; + // add the gap + gap_size += start; + } else { + // still in bounds, just add the gap + gap_size += wildcard; } } read_set @@ -44,8 +102,6 @@ fn complement(nucleotide: u8) -> u8 { /* 0 = A, 1 = C, 2 = G, 3 = T, matches with the complement of each nucleotide. - - Todo: make this part of a struct to standardize across the program. */ return match nucleotide { 0 => 3, @@ -73,6 +129,8 @@ pub fn generate_reads( read_length: &usize, coverage: &usize, paired_ended: bool, + mean: f64, + st_dev: f64, mut rng: &mut ThreadRng, ) -> Box>> { /* @@ -90,8 +148,16 @@ pub fn generate_reads( */ if paired_ended { + // generate the reverse complement of the fragment let rev_comp = reverse_complement(mutated_sequence); - todo!(); + } + let mut fragment_pool: Vec = Vec::new(); + let num_frags = (mutated_sequence.len() / read_length) * (coverage * 2); + let mut fragment_distribution = Normal::new(mean, st_dev).unwrap(); + // add fragments to the fragment pool + for _ in 0..num_frags { + let frag = fragment_distribution.sample(&mut rng).round() as usize; + fragment_pool.push(frag); } // set up some defaults and storage @@ -102,6 +168,7 @@ pub fn generate_reads( let read_positions: Vec<(usize, usize)> = cover_dataset( seq_len, *read_length, + fragment_pool, *coverage, &mut rng, ); diff --git a/src/utils/mutate.rs b/src/utils/mutate.rs index 1b8b252..762e64c 100644 --- a/src/utils/mutate.rs +++ b/src/utils/mutate.rs @@ -1,13 +1,10 @@ -extern crate log; -extern crate itertools; - use std::collections::HashMap; use rand::distributions::Distribution; use rand::prelude::{ThreadRng, IndexedRandom}; use rand::seq::SliceRandom; use rand::Rng; -use self::log::debug; -use self::itertools::izip; +use log::debug; +use itertools::izip; #[derive(Debug, Ord, PartialOrd, Eq, PartialEq)] struct NucModel { @@ -55,7 +52,7 @@ impl NucModel { pub fn mutate_fasta( file_struct: &HashMap>, mut rng: &mut ThreadRng -) -> (Box>>, Box>>) { +) -> (Box>>, Box>>) { /* Takes: file_struct: a hashmap of contig names (keys) and a vector @@ -76,11 +73,11 @@ pub fn mutate_fasta( let mut return_struct: HashMap> = HashMap::new(); // the mutated sequences // hashmap with keys of the contig names with a list of positions and alts under the contig. - let mut all_variants: HashMap> = HashMap::new(); + let mut all_variants: HashMap> = HashMap::new(); for (name, sequence) in file_struct { // Mutations for this contig - let mut contig_mutations: Vec<(usize, u8)> = Vec::new(); + let mut contig_mutations: Vec<(usize, u8, u8)> = Vec::new(); // The length of this sequence let sequence_length = sequence.len(); debug!("Sequence {} is {} bp long", name, sequence_length); @@ -181,7 +178,7 @@ fn mutate_sequence( _ => 4 }; // add the location and alt base for the variant - sequence_variants.push((*index, mutated_record[*index], *reference_base)) + sequence_variants.push((*index, mutated_record[*index], reference_base)) } (mutated_record, sequence_variants) } \ No newline at end of file diff --git a/src/utils/vcf_tools.rs b/src/utils/vcf_tools.rs index 6261920..e9b85d6 100644 --- a/src/utils/vcf_tools.rs +++ b/src/utils/vcf_tools.rs @@ -22,8 +22,7 @@ fn genotype_to_string(genotype: Vec) -> String { } pub fn write_vcf( - reference_sequence: &HashMap>, - variant_locations: &HashMap>, + variant_locations: &HashMap>, fasta_order: &Vec, ploidy: usize, reference: &str, @@ -32,9 +31,8 @@ pub fn write_vcf( ) -> io::Result<()> { /* Takes: - reference_sequence: A map of the contig names keyed to a u8 vector representing the sequence variant_locations: A map of contig names keyed to lists of variants in that contig - consisting of a tuple of (position, alt base). + consisting of a tuple of (position, alt base, ref base). fasta_order: A vector of contig names in the order of the reference fasta. ploidy: The number of copies of each chromosome present in the organism output_file_prefix: The path to the directory and the prefix to use for filenames @@ -90,7 +88,7 @@ pub fn write_vcf( let line = format!("{}\t{}\t.\t{}\t{}\t37\tPASS\t.\tGT\t{}", contig, mutation.0 + 1, - num_to_char(reference_sequence[contig][mutation.0]), + num_to_char(mutation.2), num_to_char(mutation.1), genotype_to_string(genotype), ); From 88d2ed41dc141293a8239dc84f665fef9c37e65d Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 14 Mar 2024 14:05:08 -0500 Subject: [PATCH 07/23] Finished paired ended. Need to run some tests --- src/utils/fastq_tools.rs | 60 ++++++++++++++++++++++++++++++++-------- src/utils/make_reads.rs | 46 ++++++------------------------ 2 files changed, 58 insertions(+), 48 deletions(-) diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs index f545ba2..b42df5b 100644 --- a/src/utils/fastq_tools.rs +++ b/src/utils/fastq_tools.rs @@ -4,6 +4,32 @@ use std::io; use utils::fasta_tools::sequence_array_to_string; +fn complement(nucleotide: u8) -> u8 { + /* + 0 = A, 1 = C, 2 = G, 3 = T, + matches with the complement of each nucleotide. + */ + return match nucleotide { + 0 => 3, + 1 => 2, + 2 => 1, + 3 => 0, + _ => 4, + } +} + +fn reverse_complement(sequence: &Vec) -> Vec { + /* + Returns the reverse complement of a vector of u8's representing a DNA sequence. + */ + let length = sequence.len(); + let mut rev_comp = Vec::new(); + for i in (0..length).rev() { + rev_comp.push(complement(sequence[i])) + } + rev_comp +} + pub fn write_fastq( fastq_filename: &str, paired_ended: bool, @@ -11,7 +37,8 @@ pub fn write_fastq( ) -> io::Result<()> { /* Takes: - fastq_filename: prefix for the output fastq files + fastq_filename: prefix for the output fastq files. + paired_ended: boolean to set paired ended mode on or off. dataset: List of u8 vectors representing dna sequences. returns: Error if there is a problem or else nothing. @@ -26,25 +53,36 @@ pub fn write_fastq( let name_prefix = "neat_generated_".to_string(); // vector to hold the fastq filenames, since we don't know ahead of time if we need one or two let mut fastq_files: Vec = Vec::new(); - fastq_files.push(String::from(fastq_filename) + "_r1.fastq"); - if paired_ended { - fastq_files.push(String::from(fastq_filename) + "_r1.fastq"); - } - // open the filename - let mut outfile = File::options().create_new(true).append(true).open(fastq_filename)?; + let filename1 = String::from(fastq_filename) + "_r1.fastq"; + // open the file and append lines + let mut outfile1 = File::options().create_new(true).append(true).open(&filename1)?; + // setting up pairend ended reads For single ended reads, this will go unused. + let filename2 = String::from(fastq_filename) + "_r2.fastq"; + // open the second file and append lines + let mut outfile2 = File::options().create_new(true).append(true).open(&filename2)?; // write out sequence by sequence using index to track the numbering let mut index = 1; for sequence in dataset { let sequence_len = sequence.len(); // sequence name - writeln!(&mut outfile, "@{}", name_prefix.clone() + &index.to_string())?; + writeln!(&mut outfile1, "@{}/1", name_prefix.clone() + &index.to_string())?; // Array as a string - writeln!(&mut outfile, "{}", sequence_array_to_string(&sequence))?; + writeln!(&mut outfile1, "{}", sequence_array_to_string(&sequence))?; // The stupid plus sign - writeln!(&mut outfile, "+")?; + writeln!(&mut outfile1, "+")?; // Qual score of all F's for the whole thing. - writeln!(&mut outfile, "{}", std::iter::repeat("F").take(sequence_len).collect::())?; + writeln!(&mut outfile1, "{}", std::iter::repeat("F").take(sequence_len).collect::())?; index += 1; + if paired_ended { + // sequence name + writeln!(&mut outfile2, "@{}/2", name_prefix.clone() + &index.to_string())?; + // Array as a string + writeln!(&mut outfile2, "{}", sequence_array_to_string(&reverse_complement(&sequence)))?; + // The stupid plus sign + writeln!(&mut outfile2, "+")?; + // Qual score of all F's for the whole thing. + writeln!(&mut outfile2, "{}", std::iter::repeat("F").take(sequence_len).collect::())?; + } }; Ok(()) } \ No newline at end of file diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs index efe351d..451061e 100644 --- a/src/utils/make_reads.rs +++ b/src/utils/make_reads.rs @@ -98,32 +98,6 @@ fn cover_dataset( read_set } -fn complement(nucleotide: u8) -> u8 { - /* - 0 = A, 1 = C, 2 = G, 3 = T, - matches with the complement of each nucleotide. - */ - return match nucleotide { - 0 => 3, - 1 => 2, - 2 => 1, - 3 => 0, - _ => 4, - } -} - -fn reverse_complement(sequence: &Vec) -> Vec { - /* - Returns the reverse complement of a vector of u8's representing a DNA sequence. - */ - let length = sequence.len(); - let mut rev_comp = Vec::new(); - for i in (0..length).rev() { - rev_comp.push(complement(sequence[i])) - } - rev_comp -} - pub fn generate_reads( mutated_sequence: &Vec, read_length: &usize, @@ -147,22 +121,20 @@ pub fn generate_reads( complement int the output */ - if paired_ended { - // generate the reverse complement of the fragment - let rev_comp = reverse_complement(mutated_sequence); - } let mut fragment_pool: Vec = Vec::new(); - let num_frags = (mutated_sequence.len() / read_length) * (coverage * 2); - let mut fragment_distribution = Normal::new(mean, st_dev).unwrap(); - // add fragments to the fragment pool - for _ in 0..num_frags { - let frag = fragment_distribution.sample(&mut rng).round() as usize; - fragment_pool.push(frag); + if paired_ended { + let num_frags = (mutated_sequence.len() / read_length) * (coverage * 2); + let mut fragment_distribution = Normal::new(mean, st_dev).unwrap(); + // add fragments to the fragment pool + for _ in 0..num_frags { + let frag = fragment_distribution.sample(&mut rng).round() as usize; + fragment_pool.push(frag); + } } // set up some defaults and storage let mut read_set: HashSet> = HashSet::new(); - + // length of the mutated sequence let seq_len = mutated_sequence.len(); // Generate a vector of read positions let read_positions: Vec<(usize, usize)> = cover_dataset( From 39510258f51edd1a1e58d9a604244d3531de8f43 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Fri, 15 Mar 2024 21:30:19 -0500 Subject: [PATCH 08/23] Can now create paired-ended reads! --- src/utils/fasta_tools.rs | 20 +++++++------------- src/utils/fastq_tools.rs | 7 ++++--- src/utils/make_reads.rs | 14 ++++++-------- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs index 70833e6..2bf7e17 100644 --- a/src/utils/fasta_tools.rs +++ b/src/utils/fasta_tools.rs @@ -2,7 +2,6 @@ extern crate rand; extern crate log; extern crate assert_fs; -use std::collections::HashMap; use crate::utils::file_tools::read_lines; use self::log::info; use std::fs::File; @@ -10,6 +9,7 @@ use std::{fs, io}; use std::io::Write; use std::*; use std::os::unix::fs::MetadataExt; +use HashMap; pub fn char_to_num(char_of_interest: char) -> u8 { return match char_of_interest { @@ -149,27 +149,21 @@ fn test_read_fasta() { fn test_write_fasta() -> Result<(), Box> { let seq1: Vec = vec![0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]; - let fasta_output: HashMap> = HashMap::from( + let fasta_output: HashMap> = HashMap::from([ (String::from("H1N1_HA"), seq1) - ); + ]); let fasta_pointer = Box::new(fasta_output); let fasta_order = vec![String::from("H1N1_HA")]; let output_file = "test"; - let ploidy: usize = 2; let test_write = write_fasta( &fasta_pointer, &fasta_order, - output_file, - ploidy + output_file ); - let file_name_p1 = "test_p1.fasta"; - let file_name_p2 = "test_p2.fasta"; + let file_name = "test.fasta"; assert_eq!(test_write.unwrap(), ()); - let attr = fs::metadata(file_name_p1).unwrap(); - assert!(attr.size() > 0); - let attr = fs::metadata(file_name_p2).unwrap(); + let attr = fs::metadata(file_name).unwrap(); assert!(attr.size() > 0); - fs::remove_file(file_name_p1)?; - fs::remove_file(file_name_p2)?; + fs::remove_file(file_name)?; Ok(()) } \ No newline at end of file diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs index b42df5b..450a06b 100644 --- a/src/utils/fastq_tools.rs +++ b/src/utils/fastq_tools.rs @@ -1,6 +1,6 @@ use std::io::Write; use std::fs::File; -use std::io; +use std::{fs, io}; use utils::fasta_tools::sequence_array_to_string; @@ -51,8 +51,6 @@ pub fn write_fastq( // (Although this feature is currently untested and unknown). // (May need sorting.) let name_prefix = "neat_generated_".to_string(); - // vector to hold the fastq filenames, since we don't know ahead of time if we need one or two - let mut fastq_files: Vec = Vec::new(); let filename1 = String::from(fastq_filename) + "_r1.fastq"; // open the file and append lines let mut outfile1 = File::options().create_new(true).append(true).open(&filename1)?; @@ -84,5 +82,8 @@ pub fn write_fastq( writeln!(&mut outfile2, "{}", std::iter::repeat("F").take(sequence_len).collect::())?; } }; + if !paired_ended { + fs::remove_file(filename2)?; + } Ok(()) } \ No newline at end of file diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs index 451061e..a4aba9a 100644 --- a/src/utils/make_reads.rs +++ b/src/utils/make_reads.rs @@ -72,14 +72,12 @@ fn cover_dataset( read_set.push((start, temp_end)); // insert size is the number of bases between reads in the fragment for paired ended reads - let insert_size = fragment_length - (read_length * 2); - // if these are singled ended reads, then the insert size will always be negative - if insert_size > 0 { - // if there's uncovered bases in between the reads on paired ended reads, we'll add + // if these are singled ended reads, then the insert size will always be -read_length + if fragment_length > (read_length * 2) { + // if there's any insert size on paired ended reads, we'll add // that to the gap to ensure adequate coverage. - gap_size += insert_size; - } - + gap_size += fragment_length - (read_length * 2) + }; // Picks a number between zero and a quarter of a read length let wildcard: usize = (rng.next_u32() % (read_length/4) as u32) as usize; // adds to the start to give it some spice @@ -124,7 +122,7 @@ pub fn generate_reads( let mut fragment_pool: Vec = Vec::new(); if paired_ended { let num_frags = (mutated_sequence.len() / read_length) * (coverage * 2); - let mut fragment_distribution = Normal::new(mean, st_dev).unwrap(); + let fragment_distribution = Normal::new(mean, st_dev).unwrap(); // add fragments to the fragment pool for _ in 0..num_frags { let frag = fragment_distribution.sample(&mut rng).round() as usize; From 9d0513f7d111647ba97df1390f710c286614bd63 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Sat, 16 Mar 2024 10:43:29 -0500 Subject: [PATCH 09/23] Updated version number, almost ready to merge to main --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e40cda8..4233824 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -640,7 +640,7 @@ dependencies = [ [[package]] name = "rusty-neat" -version = "0.1.0" +version = "0.2.0" dependencies = [ "assert_fs", "chrono", diff --git a/Cargo.toml b/Cargo.toml index 6ef81fe..10feb57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rusty-neat" -version = "0.1.0" +version = "0.2.0" authors = ["Joshua Allen "] [dependencies] From 1c38c3726a87724e03d52afe0d62129c39a572e6 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Sat, 16 Mar 2024 16:01:17 -0500 Subject: [PATCH 10/23] Added ability to overwrite the output with a config option --- config/simple_template.yml | 14 +++++------ src/main.rs | 49 +++++++++++++++++++++++++++++++------- src/utils/cli.rs | 7 ++++++ src/utils/config.rs | 22 ++++++++++++++++- src/utils/fasta_tools.rs | 6 +++-- src/utils/fastq_tools.rs | 10 ++++---- src/utils/file_tools.rs | 25 +++++++++++++++++-- src/utils/neat_logger.rs | 0 src/utils/vcf_tools.rs | 4 +++- 9 files changed, 112 insertions(+), 25 deletions(-) create mode 100644 src/utils/neat_logger.rs diff --git a/config/simple_template.yml b/config/simple_template.yml index 3c7063e..3af3026 100644 --- a/config/simple_template.yml +++ b/config/simple_template.yml @@ -1,10 +1,8 @@ reference: REQUIRED read_len: . coverage: 20 -output_dir: /home/joshfactorial/code/rust_neat_outputs/ -output_prefix: neat_test_1 +mutation_rate: . -# Below are not yet active ploidy: . paired_ended: . fragment_mean: . @@ -15,6 +13,11 @@ produce_vcf: . produce_fasta: . produce_fastq: . +overwrite_output: . +output_dir: . +output_prefix: . + +# Below are not yet active error_model: . mutation_model: . fragment_model: . @@ -29,10 +32,7 @@ include_vcf: . target_bed: . off_target_scalar: . discard_bed: . -mutation_rate: . mutation_bed: . no_coverage_bias: . rng_seed: . -min_mutations: . -fasta_per_ploid: . -overwrite_output: . +min_mutations: . \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index b1a380d..6620453 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,11 +9,13 @@ extern crate itertools; mod utils; use std::collections::{HashMap, HashSet}; +use std::fs::File; use clap::{Parser}; use log::*; use simplelog::*; use rand::thread_rng; use rand::prelude::*; +use std::path::Path; use utils::cli; use utils::fasta_tools::{read_fasta, write_fasta}; @@ -21,22 +23,50 @@ use utils::config::{read_config_yaml, build_config_from_args}; use utils::mutate::mutate_fasta; use utils::make_reads::generate_reads; use utils::fastq_tools::write_fastq; +use utils::file_tools::check_parent_and_create; use utils::vcf_tools::write_vcf; fn main() { - TermLogger::init( - LevelFilter::Trace, - Config::default(), - TerminalMode::Stdout, - ColorChoice::Auto, - ).unwrap(); - - let mut rng = thread_rng(); info!("Begin processing"); // parse the arguments from the command line let args = cli::Cli::parse(); + let level_filter = match args.log_level.to_lowercase().as_str() { + "trace" => LevelFilter::Trace, + "debug" => LevelFilter::Debug, + "info" => LevelFilter::Info, + "warn" => LevelFilter::Warn, + "error" => LevelFilter::Error, + "off" => LevelFilter::Off, + _ => panic!( + "Unknown log level, please set to one of \ + Trace, Debug, Info, Warn, Error, or Off (case insensitive)." + ) + }; + + // Check that the parent dir exists + let log_destination = check_parent_and_create(&args.log_dest).unwrap(); + + CombinedLogger::init(vec![ + #[cfg(feature = "termcolor")] + TermLogger::new( + level_filter, + Config::default(), + TerminalMode::Stdout, + ColorChoice::Always, + ), + #[cfg(not(feature = "termcolor"))] + SimpleLogger::new(LevelFilter::Trace, Config::default()), + WriteLogger::new( + level_filter, + Config::default(), + File::create(log_destination).unwrap(), + ) + ]).unwrap(); + + let mut rng = thread_rng(); + // set up the config struct based on whether there was an input config. Input config // overrides any other inputs. let config = if args.config != "" { @@ -67,6 +97,7 @@ fn main() { write_fasta( &mutated_map, &fasta_order, + config.overwrite_output, &output_file, ).expect("Problem writing fasta file"); } @@ -78,6 +109,7 @@ fn main() { &fasta_order, config.ploidy, &config.reference, + config.overwrite_output, &output_file, &mut rng).expect("Error writing vcf file") } @@ -108,6 +140,7 @@ fn main() { info!("Writing fastq"); write_fastq( &output_file, + config.overwrite_output, config.paired_ended, *outsets, ).expect("Problem writing fastq file"); diff --git a/src/utils/cli.rs b/src/utils/cli.rs index e61d1ff..5840416 100644 --- a/src/utils/cli.rs +++ b/src/utils/cli.rs @@ -1,5 +1,6 @@ extern crate clap; +use std::fs; use clap::Parser; #[derive(Parser, Debug)] @@ -31,4 +32,10 @@ pub struct Cli { pub read_length: usize, #[arg(short='c', long="coverage", default_value_t = 10)] pub coverage: usize, + + // These options relate to the logging features + #[arg(long="log-level", default_value_t=String::from("Trace"), help="Enter one of Trace, Debug, Info, Warn, Error, Off")] + pub log_level: String, + #[arg(long="log-dest", default_value_t=String::new(), help="Full path and name to log file")] + pub log_dest: String, } \ No newline at end of file diff --git a/src/utils/config.rs b/src/utils/config.rs index 27befc3..83584e9 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -3,6 +3,7 @@ extern crate log; use std::collections::{HashMap}; use std::env; use std::string::String; +use clap::builder::Str; use crate::utils::cli::Cli; use log::{debug, error}; use serde_yaml::Error; @@ -29,6 +30,8 @@ pub struct RunConfiguration { produce_vcf: True or false on whether to produce an output VCF file, with genotyped variants. produce_bam: True or false on whether to produce an output BAM file, which will be aligned to the reference. + overwrite_output: if true, will overwrite output. If false will error and exit you attempt to + overwrite files with the same name. output_dir: The directory, relative or absolute, path to the directory to place output. output_prefix: The name to use for the output files. */ @@ -44,10 +47,10 @@ pub struct RunConfiguration { pub produce_fasta: bool, pub produce_vcf: bool, pub produce_bam: bool, + pub overwrite_output: bool, pub output_dir: String, pub output_prefix: String, } - #[allow(dead_code)] impl RunConfiguration { // The purpose of this function is to redirect you to the ConfigBuilder @@ -72,6 +75,7 @@ pub struct ConfigBuilder { produce_fasta: bool, produce_vcf: bool, produce_bam: bool, + overwrite_output: bool, output_dir: String, output_prefix: String, } @@ -102,6 +106,7 @@ impl ConfigBuilder { produce_fasta: false, produce_vcf: false, produce_bam: false, + overwrite_output: false, output_dir: current_dir, output_prefix: String::from("neat_out"), } @@ -168,6 +173,11 @@ impl ConfigBuilder { self } + pub fn set_overwrite_output(mut self) -> ConfigBuilder { + self.overwrite_output = true; + self + } + pub fn set_output_dir(mut self, output_dir: String) -> ConfigBuilder { self.output_dir = String::from(output_dir); self @@ -235,6 +245,7 @@ impl ConfigBuilder { produce_fasta: self.produce_fasta, produce_vcf: self.produce_vcf, produce_bam: self.produce_bam, + overwrite_output: self.overwrite_output, output_dir: self.output_dir, output_prefix: self.output_prefix, } @@ -327,6 +338,13 @@ pub fn read_config_yaml(yaml: String) -> Result, ConfigErr config_builder = config_builder.set_produce_bam(true) } }, + "overwrite_output" => { + // overwrite_output is false by default, to prevent data loss, setting this flag + // will instead enable rusty-neat to overwrite the output + if value.to_lowercase() == "true" { + config_builder = config_builder.set_overwrite_output() + } + } "output_dir" => { if value != ".".to_string() { config_builder = config_builder.set_output_dir(value) @@ -386,6 +404,8 @@ fn test_command_line_inputs() { config: String::new(), reference: String::from("data/ecoli.fa"), output_dir: String::from("test_dir"), + log_level: String::from("Trace"), + log_dest: String::new(), output_file_prefix: String::from("test"), read_length: 150, coverage: 10, diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs index 2bf7e17..01d5f1d 100644 --- a/src/utils/fasta_tools.rs +++ b/src/utils/fasta_tools.rs @@ -10,6 +10,7 @@ use std::io::Write; use std::*; use std::os::unix::fs::MetadataExt; use HashMap; +use utils::file_tools::open_file; pub fn char_to_num(char_of_interest: char) -> u8 { return match char_of_interest { @@ -90,6 +91,7 @@ pub fn read_fasta( pub fn write_fasta( fasta_output: &Box>>, fasta_order: &Vec, + overwrite_output: bool, output_file: &str, ) -> io::Result<()> { /* @@ -102,8 +104,8 @@ pub fn write_fasta( Errors if there is a problem writing the file, otherwise it returns nothing. */ // writing fasta output to files - let output_fasta = format!("{}.fasta", output_file); - let mut outfile = File::options().create_new(true).append(true).open(output_fasta)?; + let mut output_fasta = format!("{}.fasta", output_file); + let mut outfile = open_file(&mut output_fasta, overwrite_output); for contig in fasta_order { let sequence = &fasta_output[contig]; // Write contig name diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs index 450a06b..32a5e65 100644 --- a/src/utils/fastq_tools.rs +++ b/src/utils/fastq_tools.rs @@ -3,6 +3,7 @@ use std::fs::File; use std::{fs, io}; use utils::fasta_tools::sequence_array_to_string; +use utils::file_tools::open_file; fn complement(nucleotide: u8) -> u8 { /* @@ -32,6 +33,7 @@ fn reverse_complement(sequence: &Vec) -> Vec { pub fn write_fastq( fastq_filename: &str, + overwrite_output: bool, paired_ended: bool, dataset: Vec<&Vec>, ) -> io::Result<()> { @@ -51,13 +53,13 @@ pub fn write_fastq( // (Although this feature is currently untested and unknown). // (May need sorting.) let name_prefix = "neat_generated_".to_string(); - let filename1 = String::from(fastq_filename) + "_r1.fastq"; + let mut filename1 = String::from(fastq_filename) + "_r1.fastq"; // open the file and append lines - let mut outfile1 = File::options().create_new(true).append(true).open(&filename1)?; + let mut outfile1 = open_file(&mut filename1, overwrite_output); // setting up pairend ended reads For single ended reads, this will go unused. - let filename2 = String::from(fastq_filename) + "_r2.fastq"; + let mut filename2 = String::from(fastq_filename) + "_r2.fastq"; // open the second file and append lines - let mut outfile2 = File::options().create_new(true).append(true).open(&filename2)?; + let mut outfile2 = open_file(&mut filename2, overwrite_output); // write out sequence by sequence using index to track the numbering let mut index = 1; for sequence in dataset { diff --git a/src/utils/file_tools.rs b/src/utils/file_tools.rs index e540756..eba4043 100644 --- a/src/utils/file_tools.rs +++ b/src/utils/file_tools.rs @@ -1,9 +1,30 @@ -use std::fs::File; +use std::fs::{File, OpenOptions}; use std::io; -use std::io::BufRead; +use std::io::{BufRead, Error}; +use std::path::Path; +use log::error; pub fn read_lines(filename: &str) -> io::Result>> { // This creates a buffer to read lines let file = File::open(filename)?; Ok(io::BufReader::new(file).lines()) +} + +pub fn open_file(mut filename: &mut str, overwrite_file: bool) -> File { + if overwrite_file { + File::options().create(true).write(true).open(&mut filename).unwrap() + } else { + File::options().create_new(true).append(true).open(&mut filename).unwrap() + } +} + +pub fn check_parent_and_create(filename: &str) -> io::Result<&Path> { + // checks that the parent dir exists and then if so creates the file object open + // and ready to write + let file_path = Path::new(filename); + if !file_path.parent().unwrap().exists() { + error!("Path to log file not found!"); + assert!(file_path.parent().unwrap().exists()) + } + Ok(file_path) } \ No newline at end of file diff --git a/src/utils/neat_logger.rs b/src/utils/neat_logger.rs new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/vcf_tools.rs b/src/utils/vcf_tools.rs index e9b85d6..9fc5e36 100644 --- a/src/utils/vcf_tools.rs +++ b/src/utils/vcf_tools.rs @@ -8,6 +8,7 @@ use rand::Rng; use rand::rngs::ThreadRng; use rand::seq::IndexedRandom; use utils::fasta_tools::num_to_char; +use utils::file_tools::open_file; fn genotype_to_string(genotype: Vec) -> String { /* @@ -26,6 +27,7 @@ pub fn write_vcf( fasta_order: &Vec, ploidy: usize, reference: &str, + overwrite_output: bool, output_file_prefix: &str, mut rng: &mut ThreadRng, ) -> io::Result<()> { @@ -44,7 +46,7 @@ pub fn write_vcf( let ploid_numbers: Vec = (1..ploidy+1).collect(); // set the filename of the output vcf let mut filename = format!("{}.vcf", output_file_prefix); - let mut outfile = File::options().create_new(true).append(true).open(&mut filename)?; + let mut outfile = open_file(&mut filename, overwrite_output); // add the vcf header writeln!(&mut outfile, "##fileformat=VCFv4.1")?; writeln!(&mut outfile, "##reference={}", reference)?; From a5f92ec1eeb68a847d6490f25399bf31906b297e Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Sat, 16 Mar 2024 22:31:57 -0500 Subject: [PATCH 11/23] Cleaned up the warnings and some of the code. Still need to add more unit tests --- src/main.rs | 2 -- src/utils/cli.rs | 22 ++++++++++++++++--- src/utils/config.rs | 7 +++--- src/utils/fasta_tools.rs | 46 +++++++++++++++++++--------------------- src/utils/fastq_tools.rs | 1 - src/utils/file_tools.rs | 4 ++-- src/utils/mutate.rs | 25 ++++++++-------------- src/utils/vcf_tools.rs | 1 - 8 files changed, 56 insertions(+), 52 deletions(-) diff --git a/src/main.rs b/src/main.rs index 6620453..70907ee 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,7 +15,6 @@ use log::*; use simplelog::*; use rand::thread_rng; use rand::prelude::*; -use std::path::Path; use utils::cli; use utils::fasta_tools::{read_fasta, write_fasta}; @@ -128,7 +127,6 @@ fn main() { &mut rng ); - // todo: we need to keep track of these reads better for paired endedness read_sets.extend(*data_set); } diff --git a/src/utils/cli.rs b/src/utils/cli.rs index 5840416..a6520e2 100644 --- a/src/utils/cli.rs +++ b/src/utils/cli.rs @@ -1,6 +1,10 @@ +/// This is a pretty basic implementation of Clap CLI +/// The idea of this interface and supporting code is that the user can enter an optional +/// config file that will take the place of the other command line options, except the logging +/// features, which are handled separately, outside of the run configuration parsing. + extern crate clap; -use std::fs; use clap::Parser; #[derive(Parser, Debug)] @@ -12,6 +16,7 @@ pub struct Cli { config = the full path to a configuration yaml file. If entered, it will override all other command line inputs. No default. + reference = The relative path or full path to the reference file. Must be in fasta file format. Default "data/H1N1.fa" output_dir = The directory where output files will be written. If nothing is entered, @@ -19,9 +24,20 @@ pub struct Cli { output_file_prefix = output files will start with this name. Default = neat_out read_length = the length of the reads in the output fastq file. Default = 150 coverage = The average depth per read of the fastq files. Default = 10 + + The following commands are independent of the config and not affected by it one way or another: + log_level = Set a log level for the run. Everything at and above the level chosen will + be displayed in both logs. See simplelog docs for more info: + https://docs.rs/simplelog/latest/simplelog/enum.Level.html + log_dest = Full path filename where to write the log. The default is current working + dir, filename "neat_out.log," which is set during config parsing. */ - #[arg(short='C', long="configuration_yaml", default_value_t=String::new())] + #[arg(short='C', long="configuration_yaml", default_value_t=String::new(), + help="Enter a full path and filename to a configuration file. \ + This will override most other options")] pub config: String, + + // All of these arguments are overridden by the config file #[arg(short='r', long="reference", default_value_t=String::from("data/H1N1.fa"))] pub reference: String, #[arg(short='o', long="output_dir", default_value_t=String::new())] @@ -33,7 +49,7 @@ pub struct Cli { #[arg(short='c', long="coverage", default_value_t = 10)] pub coverage: usize, - // These options relate to the logging features + // These options relate to the logging features and are not overridden by a config #[arg(long="log-level", default_value_t=String::from("Trace"), help="Enter one of Trace, Debug, Info, Warn, Error, Off")] pub log_level: String, #[arg(long="log-dest", default_value_t=String::new(), help="Full path and name to log file")] diff --git a/src/utils/config.rs b/src/utils/config.rs index 83584e9..2265b16 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -3,10 +3,8 @@ extern crate log; use std::collections::{HashMap}; use std::env; use std::string::String; -use clap::builder::Str; use crate::utils::cli::Cli; -use log::{debug, error}; -use serde_yaml::Error; +use log::{debug, warn, error}; #[derive(Debug)] pub struct RunConfiguration { @@ -195,6 +193,9 @@ impl ConfigBuilder { debug!("\t> mutation rate: {}", self.mutation_rate); debug!("\t> ploidy: {}", self.ploidy); debug!("\t> paired_ended: {}", self.paired_ended); + if self.overwrite_output { + warn!("Overwriting any existing files.") + } let file_prefix = format!("{}/{}", self.output_dir, self.output_prefix); if !(self.produce_fastq | self.produce_fasta | self.produce_vcf | self.produce_bam) { error!("All file types set to false, no files would be produced."); diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs index 01d5f1d..cc3f5ea 100644 --- a/src/utils/fasta_tools.rs +++ b/src/utils/fasta_tools.rs @@ -4,11 +4,9 @@ extern crate assert_fs; use crate::utils::file_tools::read_lines; use self::log::info; -use std::fs::File; -use std::{fs, io}; +use std::io; use std::io::Write; use std::*; -use std::os::unix::fs::MetadataExt; use HashMap; use utils::file_tools::open_file; @@ -63,28 +61,27 @@ pub fn read_fasta( let mut fasta_order: Vec = Vec::new(); let mut current_key = String::new(); - if let lines = read_lines(fasta_path).expect("Error reading fasta") { - let mut temp_seq: Vec = vec![]; - lines.for_each(|line| match line { - Ok(l) => { - if l.starts_with('>') { - if !current_key.is_empty() { - fasta_map.entry(current_key.clone()).or_insert(temp_seq.clone()); - } - current_key = String::from(l.strip_prefix('>').unwrap()); - fasta_order.push(current_key.clone()); - temp_seq = vec![]; - } else { - for char in l.chars() { - temp_seq.push(char_to_num(char)); - } + let lines = read_lines(fasta_path).unwrap(); + let mut temp_seq: Vec = vec![]; + lines.for_each(|line| match line { + Ok(l) => { + if l.starts_with('>') { + if !current_key.is_empty() { + fasta_map.entry(current_key.clone()).or_insert(temp_seq.clone()); } - }, - Err(error) => panic!("Error reading fasta file: {:?}", error), - }); - // Need to pick up the last one - fasta_map.entry(current_key.clone()).or_insert(temp_seq.clone()); - } + current_key = String::from(l.strip_prefix('>').unwrap()); + fasta_order.push(current_key.clone()); + temp_seq = vec![]; + } else { + for char in l.chars() { + temp_seq.push(char_to_num(char)); + } + } + }, + Err(error) => panic!("Error reading fasta file: {:?}", error), + }); + // Need to pick up the last one + fasta_map.entry(current_key.clone()).or_insert(temp_seq.clone()); (Box::new(fasta_map), fasta_order) } @@ -160,6 +157,7 @@ fn test_write_fasta() -> Result<(), Box> { let test_write = write_fasta( &fasta_pointer, &fasta_order, + true, output_file ); let file_name = "test.fasta"; diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs index 32a5e65..4d82443 100644 --- a/src/utils/fastq_tools.rs +++ b/src/utils/fastq_tools.rs @@ -1,5 +1,4 @@ use std::io::Write; -use std::fs::File; use std::{fs, io}; use utils::fasta_tools::sequence_array_to_string; diff --git a/src/utils/file_tools.rs b/src/utils/file_tools.rs index eba4043..cc950fb 100644 --- a/src/utils/file_tools.rs +++ b/src/utils/file_tools.rs @@ -1,6 +1,6 @@ -use std::fs::{File, OpenOptions}; +use std::fs::File; use std::io; -use std::io::{BufRead, Error}; +use std::io::BufRead; use std::path::Path; use log::error; diff --git a/src/utils/mutate.rs b/src/utils/mutate.rs index 762e64c..20894ca 100644 --- a/src/utils/mutate.rs +++ b/src/utils/mutate.rs @@ -1,5 +1,5 @@ +use std::cmp::max; use std::collections::HashMap; -use rand::distributions::Distribution; use rand::prelude::{ThreadRng, IndexedRandom}; use rand::seq::SliceRandom; use rand::Rng; @@ -77,7 +77,7 @@ pub fn mutate_fasta( for (name, sequence) in file_struct { // Mutations for this contig - let mut contig_mutations: Vec<(usize, u8, u8)> = Vec::new(); + let contig_mutations: Vec<(usize, u8, u8)>; // The length of this sequence let sequence_length = sequence.len(); debug!("Sequence {} is {} bp long", name, sequence_length); @@ -94,20 +94,13 @@ pub fn mutate_fasta( // add or subtract up to 10% of the reads. rough_num_positions * (sign * factor) }; - // round the number of positions to the nearest usize. - let num_positions = rough_num_positions.round() as usize; - // If we somehow ended up with negative or no reads, we may still randomly add one mutation. - if num_positions <= 0 { - // we want to add at least 1 mutation to each contig - (mutated_record, contig_mutations) = mutate_sequence( - mutated_record, 1, &mut rng - ); - } else { - // else add num positions number of variants - (mutated_record, contig_mutations) = mutate_sequence( - mutated_record, num_positions, &mut rng - ); - } + // Round the number of positions to the nearest usize. + // If negative or no reads, we still want at least 1 mutation per contig. + let num_positions = max(1, rough_num_positions.round() as usize); + + (mutated_record, contig_mutations) = mutate_sequence( + mutated_record, num_positions, &mut rng + ); // Add to the return struct and variants map. return_struct.entry(name.clone()).or_insert(mutated_record.clone()); all_variants.entry(name.clone()).or_insert(contig_mutations); diff --git a/src/utils/vcf_tools.rs b/src/utils/vcf_tools.rs index 9fc5e36..fa0fafc 100644 --- a/src/utils/vcf_tools.rs +++ b/src/utils/vcf_tools.rs @@ -3,7 +3,6 @@ extern crate log; use std::collections::HashMap; use std::io; use std::io::Write; -use std::fs::File; use rand::Rng; use rand::rngs::ThreadRng; use rand::seq::IndexedRandom; From 2010879fc863dc083d233bdea59ece3cbbdcd529 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Sun, 17 Mar 2024 00:09:51 -0500 Subject: [PATCH 12/23] Developing tests --- src/utils/cli.rs | 11 ++- src/utils/config.rs | 149 ++++++++++++++++++++++++++++++--------- src/utils/fasta_tools.rs | 2 +- 3 files changed, 125 insertions(+), 37 deletions(-) diff --git a/src/utils/cli.rs b/src/utils/cli.rs index a6520e2..2cb74fc 100644 --- a/src/utils/cli.rs +++ b/src/utils/cli.rs @@ -1,11 +1,14 @@ /// This is a pretty basic implementation of Clap CLI /// The idea of this interface and supporting code is that the user can enter an optional /// config file that will take the place of the other command line options, except the logging -/// features, which are handled separately, outside of the run configuration parsing. +/// features, which are handled separately. Either way, these options are read into a configuration +/// struct that holds the variables for the run. Logging, meanwhile, is handled separately, +/// outside run configuration parsing. extern crate clap; use clap::Parser; +use std::env; #[derive(Parser, Debug)] pub struct Cli { @@ -52,6 +55,8 @@ pub struct Cli { // These options relate to the logging features and are not overridden by a config #[arg(long="log-level", default_value_t=String::from("Trace"), help="Enter one of Trace, Debug, Info, Warn, Error, Off")] pub log_level: String, - #[arg(long="log-dest", default_value_t=String::new(), help="Full path and name to log file")] + #[arg(long="log-dest", default_value_t=env::current_dir().unwrap().display().to_string() + "neat_out.log", help="Full path and name to log file")] pub log_dest: String, -} \ No newline at end of file +} + +// Tests are handled in other places. \ No newline at end of file diff --git a/src/utils/config.rs b/src/utils/config.rs index 2265b16..495202a 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -1,7 +1,6 @@ extern crate log; use std::collections::{HashMap}; -use std::env; use std::string::String; use crate::utils::cli::Cli; use log::{debug, warn, error}; @@ -86,10 +85,6 @@ pub enum ConfigError { impl ConfigBuilder { pub fn new() -> ConfigBuilder { - let current_dir = env::current_dir() - .expect("Could not find current working directory") - .display() - .to_string(); ConfigBuilder { // Setting default values reference: String::from("data/H1N1.fa"), @@ -105,7 +100,7 @@ impl ConfigBuilder { produce_vcf: false, produce_bam: false, overwrite_output: false, - output_dir: current_dir, + output_dir: String::from(""), output_prefix: String::from("neat_out"), } } @@ -188,11 +183,11 @@ impl ConfigBuilder { pub fn check_and_print_config(&self) -> Result<(), ConfigError> { debug!("Running rusty-neat to generate reads on {} with...", self.reference); - debug!("\t> read length: {}", self.read_len); - debug!("\t> coverage: {}", self.coverage); - debug!("\t> mutation rate: {}", self.mutation_rate); - debug!("\t> ploidy: {}", self.ploidy); - debug!("\t> paired_ended: {}", self.paired_ended); + debug!(" >read length: {}", self.read_len); + debug!(" >coverage: {}", self.coverage); + debug!(" >mutation rate: {}", self.mutation_rate); + debug!(" >ploidy: {}", self.ploidy); + debug!(" >paired_ended: {}", self.paired_ended); if self.overwrite_output { warn!("Overwriting any existing files.") } @@ -277,6 +272,9 @@ pub fn read_config_yaml(yaml: String) -> Result, ConfigErr "reference" => { if value != ".".to_string() { config_builder = config_builder.set_reference(value) + } else { + error!("Reference is required."); + return Err(ConfigError::ConfigurationError) } }, "read_len" => { @@ -371,9 +369,12 @@ pub fn build_config_from_args(args: Cli) -> Result, &'stat // Create the ConfigBuilder object with default values let mut config_builder = ConfigBuilder::new(); - // The default assigned by CLI will keep it as data/H1N1.fa + // Can't do a run without a reference if args.reference != "" { config_builder = config_builder.set_reference(args.reference) + } else { + error!("No reference specified"); + return Err("Reference File Error."); } // The default value works directly for the config builder config_builder = config_builder.set_read_len(args.read_length); @@ -391,27 +392,109 @@ pub fn build_config_from_args(args: Cli) -> Result, &'stat Ok(Box::new(config_builder.build())) } -#[test] -fn test_read_config_yaml() { - let yaml = String::from("config/neat_test.yml"); - let test_config = read_config_yaml(yaml).unwrap(); - assert_eq!(test_config.reference, "data/ecoli.fa".to_string()); - assert_eq!(test_config.coverage, 10); -} +#[cfg(test)] +mod tests { + use super::*; -#[test] -fn test_command_line_inputs() { - let args: Cli = Cli{ - config: String::new(), - reference: String::from("data/ecoli.fa"), - output_dir: String::from("test_dir"), - log_level: String::from("Trace"), - log_dest: String::new(), - output_file_prefix: String::from("test"), - read_length: 150, - coverage: 10, - }; + #[test] + fn test_run_configuration() { + let test_configuration = RunConfiguration { + reference: String::from("Hello.world"), + read_len: 100, + coverage: 22, + mutation_rate: 0.09, + ploidy: 3, + paired_ended: true, + fragment_mean: 333.0, + fragment_st_dev: 33.0, + produce_fastq: false, + produce_bam: true, + produce_fasta: true, + produce_vcf: true, + overwrite_output: true, + output_dir: String::from("/my/my"), + output_prefix: String::from("Hey.hey") + }; + + assert_eq!(test_configuration.reference, "Hello.world".to_string()); + assert_eq!(test_configuration.read_len, 100); + assert_eq!(test_configuration.coverage, 22); + assert_eq!(test_configuration.mutation_rate, 0.09); + assert_eq!(test_configuration.ploidy, 3); + assert_eq!(test_configuration.paired_ended, true); + assert_eq!(test_configuration.fragment_mean, 333.0); + assert_eq!(test_configuration.fragment_st_dev, 33.0); + assert_eq!(test_configuration.produce_fastq, false); + assert_eq!(test_configuration.produce_vcf, true); + assert_eq!(test_configuration.produce_bam, true); + assert_eq!(test_configuration.produce_fasta, true); + assert_eq!(test_configuration.overwrite_output, true); + assert_eq!(test_configuration.output_dir, "/my/my".to_string()); + assert_eq!(test_configuration.output_prefix, "Hey.hey".to_string()); + } - let test_config = build_config_from_args(args).unwrap(); - assert_eq!(test_config.reference, "data/ecoli.fa".to_string()) + #[test] + fn test_build() { + use super::*; + let x = RunConfiguration::build(); + assert_eq!(x.reference, "data/H1N1.fa".to_string()) + } + + #[test] + fn test_read_config_yaml() { + let yaml = String::from("config/neat_test.yml"); + let test_config = read_config_yaml(yaml).unwrap(); + assert_eq!(test_config.reference, "data/ecoli.fa".to_string()); + assert_eq!(test_config.coverage, 10); + } + + #[test] + fn test_setters() { + let mut config = ConfigBuilder::new(); + assert_eq!(config.mutation_rate, 0.001); + assert_eq!(config.ploidy, 2); + assert_eq!(config.paired_ended, false); + assert_eq!(config.fragment_mean, 300.0); + assert_eq!(config.fragment_st_dev, 30.0); + assert_eq!(config.produce_fasta, false); + assert_eq!(config.produce_fastq, true); + assert_eq!(config.produce_vcf, false); + assert_eq!(config.produce_bam, false); + config = config.set_mutation_rate(0.111) + .set_ploidy(3) + .set_paired_ended(true) + .set_fragment_mean(111.0) + .set_fragment_st_dev(0.011) + .set_produce_fastq(false) + .set_produce_fasta(true) + .set_produce_vcf(true) + .set_produce_bam(true) + .set_overwrite_output(); + assert_eq!(config.mutation_rate, 0.111); + assert_eq!(config.ploidy, 3); + assert_eq!(config.paired_ended, true); + assert_eq!(config.fragment_mean, 111.0); + assert_eq!(config.fragment_st_dev, 0.011); + assert_eq!(config.produce_fasta, true); + assert_eq!(config.produce_fastq, false); + assert_eq!(config.produce_vcf, true); + assert_eq!(config.produce_bam, true); + } + + #[test] + fn test_command_line_inputs() { + let args: Cli = Cli{ + config: String::new(), + reference: String::from("data/ecoli.fa"), + output_dir: String::from("test_dir"), + log_level: String::from("Trace"), + log_dest: String::new(), + output_file_prefix: String::from("test"), + read_length: 150, + coverage: 10, + }; + + let test_config = build_config_from_args(args).unwrap(); + assert_eq!(test_config.reference, "data/ecoli.fa".to_string()) + } } \ No newline at end of file diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs index cc3f5ea..5415bb5 100644 --- a/src/utils/fasta_tools.rs +++ b/src/utils/fasta_tools.rs @@ -163,7 +163,7 @@ fn test_write_fasta() -> Result<(), Box> { let file_name = "test.fasta"; assert_eq!(test_write.unwrap(), ()); let attr = fs::metadata(file_name).unwrap(); - assert!(attr.size() > 0); + assert!(attr.len() > 0); fs::remove_file(file_name)?; Ok(()) } \ No newline at end of file From c305090f4aa0c0438af291c5ff405daf19f323b7 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 18 Mar 2024 23:07:15 -0500 Subject: [PATCH 13/23] Added a yaml file to handle CI --- .github/workflows/ci.yml | 74 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..87de341 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,74 @@ +# .github/workflows/ci.yml +name: CI + +on: + pull_request: + workflow_dispatch: + +jobs: + tests: + name: ${{ matrix.make.name }} (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + rust: [stable] + make: + - name: Unit tests + task: "build test" + include: + - os: ubuntu-latest + sccache-path: /home/runner/.cache/sccache + env: + RUST_BACKTRACE: full + RUSTC_WRAPPER: sccache + RUSTV: ${{ matrix.rust }} + SCCACHE_CACHE_SIZE: 2G + SCCACHE_DIR: ${{ matrix.sccache-path }} + # SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out + steps: + - uses: actions/checkout@v2 + - name: Install sccache (ubuntu-latest) + if: matrix.os == 'ubuntu-latest' + env: + LINK: https://github.com/mozilla/sccache/releases/download + SCCACHE_VERSION: 0.2.13 + run: | + SCCACHE_FILE=sccache-$SCCACHE_VERSION-x86_64-unknown-linux-musl + mkdir -p $HOME/.local/bin + curl -L "$LINK/$SCCACHE_VERSION/$SCCACHE_FILE.tar.gz" | tar xz + mv -f $SCCACHE_FILE/sccache $HOME/.local/bin/sccache + echo "$HOME/.local/bin" >> $GITHUB_PATH + - name: Install Rust ${{ matrix.rust }} + uses: actions-rs/toolchain@v1 + with: + toolchain: ${{ matrix.rust }} + profile: minimal + override: true + - name: Cache cargo registry + uses: actions/cache@v2 + continue-on-error: false + with: + path: | + ~/.cargo/registry + ~/.cargo/git + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo- + - name: Save sccache + uses: actions/cache@v2 + continue-on-error: false + with: + path: ${{ matrix.sccache-path }} + key: ${{ runner.os }}-sccache-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-sccache- + - name: Start sccache server + run: sccache --start-server + - name: ${{ matrix.make.name }} + run: cargo ${{ matrix.make.task }} + - name: Print sccache stats + run: sccache --show-stats + - name: Stop sccache server + run: sccache --stop-server || true From e58cb8e32236f1adc339dcbd4675b6fb0b8f62d3 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 18 Mar 2024 23:20:04 -0500 Subject: [PATCH 14/23] Had the indent wrong on the yaml --- .github/workflows/ci.yml | 104 +++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 87de341..e28fe33 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,55 +20,55 @@ jobs: include: - os: ubuntu-latest sccache-path: /home/runner/.cache/sccache - env: - RUST_BACKTRACE: full - RUSTC_WRAPPER: sccache - RUSTV: ${{ matrix.rust }} - SCCACHE_CACHE_SIZE: 2G - SCCACHE_DIR: ${{ matrix.sccache-path }} - # SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out - steps: - - uses: actions/checkout@v2 - - name: Install sccache (ubuntu-latest) - if: matrix.os == 'ubuntu-latest' - env: - LINK: https://github.com/mozilla/sccache/releases/download - SCCACHE_VERSION: 0.2.13 - run: | - SCCACHE_FILE=sccache-$SCCACHE_VERSION-x86_64-unknown-linux-musl - mkdir -p $HOME/.local/bin - curl -L "$LINK/$SCCACHE_VERSION/$SCCACHE_FILE.tar.gz" | tar xz - mv -f $SCCACHE_FILE/sccache $HOME/.local/bin/sccache - echo "$HOME/.local/bin" >> $GITHUB_PATH - - name: Install Rust ${{ matrix.rust }} - uses: actions-rs/toolchain@v1 - with: - toolchain: ${{ matrix.rust }} - profile: minimal - override: true - - name: Cache cargo registry - uses: actions/cache@v2 - continue-on-error: false - with: - path: | - ~/.cargo/registry - ~/.cargo/git - key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - restore-keys: | - ${{ runner.os }}-cargo- - - name: Save sccache - uses: actions/cache@v2 - continue-on-error: false - with: - path: ${{ matrix.sccache-path }} - key: ${{ runner.os }}-sccache-${{ hashFiles('**/Cargo.lock') }} - restore-keys: | - ${{ runner.os }}-sccache- - - name: Start sccache server - run: sccache --start-server - - name: ${{ matrix.make.name }} - run: cargo ${{ matrix.make.task }} - - name: Print sccache stats - run: sccache --show-stats - - name: Stop sccache server - run: sccache --stop-server || true + env: + RUST_BACKTRACE: full + RUSTC_WRAPPER: sccache + RUSTV: ${{ matrix.rust }} + SCCACHE_CACHE_SIZE: 2G + SCCACHE_DIR: ${{ matrix.sccache-path }} + # SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out + steps: + - uses: actions/checkout@v2 + - name: Install sccache (ubuntu-latest) + if: matrix.os == 'ubuntu-latest' + env: + LINK: https://github.com/mozilla/sccache/releases/download + SCCACHE_VERSION: 0.2.13 + run: | + SCCACHE_FILE=sccache-$SCCACHE_VERSION-x86_64-unknown-linux-musl + mkdir -p $HOME/.local/bin + curl -L "$LINK/$SCCACHE_VERSION/$SCCACHE_FILE.tar.gz" | tar xz + mv -f $SCCACHE_FILE/sccache $HOME/.local/bin/sccache + echo "$HOME/.local/bin" >> $GITHUB_PATH + - name: Install Rust ${{ matrix.rust }} + uses: actions-rs/toolchain@v1 + with: + toolchain: ${{ matrix.rust }} + profile: minimal + override: true + - name: Cache cargo registry + uses: actions/cache@v2 + continue-on-error: false + with: + path: | + ~/.cargo/registry + ~/.cargo/git + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo- + - name: Save sccache + uses: actions/cache@v2 + continue-on-error: false + with: + path: ${{ matrix.sccache-path }} + key: ${{ runner.os }}-sccache-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-sccache- + - name: Start sccache server + run: sccache --start-server + - name: ${{ matrix.make.name }} + run: cargo ${{ matrix.make.task }} + - name: Print sccache stats + run: sccache --show-stats + - name: Stop sccache server + run: sccache --stop-server || true From 44b5a0eff8d4bc0eed8a1965837d47570162febe Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 18 Mar 2024 23:22:02 -0500 Subject: [PATCH 15/23] Clearing cache --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e28fe33..33d68d0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ jobs: RUSTV: ${{ matrix.rust }} SCCACHE_CACHE_SIZE: 2G SCCACHE_DIR: ${{ matrix.sccache-path }} - # SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out + SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out steps: - uses: actions/checkout@v2 - name: Install sccache (ubuntu-latest) From 16e78a86016bfe65a65ea80de2871e3bdccd687c Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 18 Mar 2024 23:22:27 -0500 Subject: [PATCH 16/23] Okay, cache cleared, pushing again --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 33d68d0..e28fe33 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ jobs: RUSTV: ${{ matrix.rust }} SCCACHE_CACHE_SIZE: 2G SCCACHE_DIR: ${{ matrix.sccache-path }} - SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out + # SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out steps: - uses: actions/checkout@v2 - name: Install sccache (ubuntu-latest) From d685ef4615d2f80b87cfd87fa11786bf18e098e4 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 18 Mar 2024 23:25:04 -0500 Subject: [PATCH 17/23] Trying to allow the workflow to trigger manually --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e28fe33..59463bc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,6 +4,8 @@ name: CI on: pull_request: workflow_dispatch: + workflow_run: + branches: [feature/trying_ci] jobs: tests: From 6f00024e7b3b3a88fe958f88eab3185d17cbf53f Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 18 Mar 2024 23:25:55 -0500 Subject: [PATCH 18/23] No, that wasn't it --- .github/workflows/ci.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 59463bc..b137ef5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,9 +4,7 @@ name: CI on: pull_request: workflow_dispatch: - workflow_run: - branches: [feature/trying_ci] - + jobs: tests: name: ${{ matrix.make.name }} (${{ matrix.os }}) From 5704ceba44ddf32133eceb923909a002fb3c0a48 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 18 Mar 2024 23:29:18 -0500 Subject: [PATCH 19/23] Adding push to the triggers --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b137ef5..dbf9672 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,10 @@ name: CI on: + push: pull_request: workflow_dispatch: - + jobs: tests: name: ${{ matrix.make.name }} (${{ matrix.os }}) From ee1d15676d57da23da23c738aa7db15cbd66199d Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 18 Mar 2024 23:37:20 -0500 Subject: [PATCH 20/23] Create rusty-neat-tests.yml --- .github/workflows/rusty-neat-tests.yml | 33 ++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/rusty-neat-tests.yml diff --git a/.github/workflows/rusty-neat-tests.yml b/.github/workflows/rusty-neat-tests.yml new file mode 100644 index 0000000..5058aaa --- /dev/null +++ b/.github/workflows/rusty-neat-tests.yml @@ -0,0 +1,33 @@ +name: rusty-neat-tests + +on: + workflow_dispatch: + push: + branches: [ "main", "feature/trying_ci" ] + pull_request: + branches: [ "main" ] + +env: + CARGO_TERM_COLOR: always + +jobs: + + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Cache + uses: actions/cache@v4.0.1 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + key: ubuntu-latest-cargo-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ubuntu-latest-cargo- + - name: Build + run: cargo build --verbose + - name: Run tests + run: cargo test --verbose From 78f57cc74fe0875e3ba20dff04356f848fd0f62f Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 18 Mar 2024 23:38:53 -0500 Subject: [PATCH 21/23] Keeping some dev changes --- src/main.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 70907ee..0785e57 100644 --- a/src/main.rs +++ b/src/main.rs @@ -134,7 +134,7 @@ fn main() { info!("Shuffling output fastq data"); let mut outsets: Box>> = Box::new(read_sets.iter().collect()); outsets.shuffle(&mut rng); - + info!("Writing fastq"); write_fastq( &output_file, @@ -144,5 +144,6 @@ fn main() { ).expect("Problem writing fastq file"); info!("Processing complete") } + } From 20660941d7a2fcc6756078ab479150ea17e8f381 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 18 Mar 2024 23:39:54 -0500 Subject: [PATCH 22/23] Deleting the file...again --- .github/workflows/ci.yml | 75 ---------------------------------------- 1 file changed, 75 deletions(-) delete mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index dbf9672..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,75 +0,0 @@ -# .github/workflows/ci.yml -name: CI - -on: - push: - pull_request: - workflow_dispatch: - -jobs: - tests: - name: ${{ matrix.make.name }} (${{ matrix.os }}) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - rust: [stable] - make: - - name: Unit tests - task: "build test" - include: - - os: ubuntu-latest - sccache-path: /home/runner/.cache/sccache - env: - RUST_BACKTRACE: full - RUSTC_WRAPPER: sccache - RUSTV: ${{ matrix.rust }} - SCCACHE_CACHE_SIZE: 2G - SCCACHE_DIR: ${{ matrix.sccache-path }} - # SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out - steps: - - uses: actions/checkout@v2 - - name: Install sccache (ubuntu-latest) - if: matrix.os == 'ubuntu-latest' - env: - LINK: https://github.com/mozilla/sccache/releases/download - SCCACHE_VERSION: 0.2.13 - run: | - SCCACHE_FILE=sccache-$SCCACHE_VERSION-x86_64-unknown-linux-musl - mkdir -p $HOME/.local/bin - curl -L "$LINK/$SCCACHE_VERSION/$SCCACHE_FILE.tar.gz" | tar xz - mv -f $SCCACHE_FILE/sccache $HOME/.local/bin/sccache - echo "$HOME/.local/bin" >> $GITHUB_PATH - - name: Install Rust ${{ matrix.rust }} - uses: actions-rs/toolchain@v1 - with: - toolchain: ${{ matrix.rust }} - profile: minimal - override: true - - name: Cache cargo registry - uses: actions/cache@v2 - continue-on-error: false - with: - path: | - ~/.cargo/registry - ~/.cargo/git - key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - restore-keys: | - ${{ runner.os }}-cargo- - - name: Save sccache - uses: actions/cache@v2 - continue-on-error: false - with: - path: ${{ matrix.sccache-path }} - key: ${{ runner.os }}-sccache-${{ hashFiles('**/Cargo.lock') }} - restore-keys: | - ${{ runner.os }}-sccache- - - name: Start sccache server - run: sccache --start-server - - name: ${{ matrix.make.name }} - run: cargo ${{ matrix.make.task }} - - name: Print sccache stats - run: sccache --show-stats - - name: Stop sccache server - run: sccache --stop-server || true From 3596a97cac49625f3df6dae4fa09770e0758e3fa Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 18 Mar 2024 23:42:39 -0500 Subject: [PATCH 23/23] Trying to clear a warning from github --- .github/workflows/rusty-neat-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rusty-neat-tests.yml b/.github/workflows/rusty-neat-tests.yml index 5058aaa..15f279a 100644 --- a/.github/workflows/rusty-neat-tests.yml +++ b/.github/workflows/rusty-neat-tests.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Cache uses: actions/cache@v4.0.1 with: