From f1caa35f2ce407500cffca2def5e78faacb13e48 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Wed, 13 Mar 2024 00:45:32 -0500
Subject: [PATCH 01/23] Making some changes to make room for paired-ended
 reads.

---
 Cargo.toml               |   2 +-
 src/main.rs              |  35 +++++----
 src/utils.rs             |   3 +-
 src/utils/fasta_tools.rs |  14 ++--
 src/utils/make_reads.rs  |  96 +++++++++++++++++++------
 src/utils/mutate.rs      | 152 +++++++++++++++++++++++----------------
 src/utils/vcf_tools.rs   |   7 ++
 7 files changed, 201 insertions(+), 108 deletions(-)
 create mode 100644 src/utils/vcf_tools.rs
diff --git a/Cargo.toml b/Cargo.toml
index 8430db1..1efe611 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,4 +12,4 @@ simplelog = "0.12.1"
 serde_yaml = "0.9.32"
 clap = { version = "4.5.1", features = ["derive"] }
 itertools = "0.12.1"
-assert_fs = "1.1.1"
+assert_fs = "1.1.1"
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index 43cba6f..bbe0f8d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -32,9 +32,11 @@ fn main() {
     let mut rng = thread_rng();
 
     info!("Begin processing");
-
+    // parse the arguments from the command line
     let args = cli::Cli::parse();
 
+    // set up the config struct based on whether there was an input config. Input config
+    // overrides any other inputs.
     let config = if args.config != "" {
         info!("Using Configuration file input: {}", &args.config);
         read_config_yaml(args.config)
@@ -44,23 +46,28 @@ fn main() {
         Ok(build_config_from_args(args).expect("Problem reading configuration yaml file"))
     }.unwrap();
 
+    // Create the prefix of the files to write
     let output_file = format!("{}/{}", config.output_dir, config.output_prefix);
 
-    info!("Mapping fasta file: {}", &config.reference);
+    // Reading the reference file into memory
+    info!("Mapping reference fasta file: {}", &config.reference);
     let (fasta_map, fasta_order) = read_fasta(&config.reference);
-    // todo:
-    // need to add this twice, produce two mutated fastas, or at least 2 separate mutation
-    // datasets, each with half the mutation rate. Going to mean twice as much memory needed for
-    // fasta creation, which isn't ideal
-    info!("Mutating fasta");
-    let mutated_map: Box<HashMap<String, Vec<Vec<u8>>>> = mutate_fasta(
+
+    // Mutating the reference and recoring the variant locations.
+    info!("Mutating reference.");
+    let (mutated_map, variant_locations) = mutate_fasta(
         &fasta_map,
-        config.ploidy,
         &mut rng
     );
 
-    info!("Outputting fasta files");
-    if config.produce_fasta == true {
+    if config.produce_vcf {
+        info!("Writing vcf file");
+        todo!();
+        // write_vcf(&variant_locations, &fasta_order, &config.ploidy, &output_file, &mut rng)
+    }
+
+    if config.produce_fasta {
+        info!("Outputting fasta file");
         write_fasta(
             &mutated_map,
             &fasta_order,
@@ -70,16 +77,18 @@ fn main() {
     }
 
     let mut read_sets: HashSet<Vec<u8>> = HashSet::new();
-    for (_name, sequences) in mutated_map.iter() {
+    for (_name, sequence) in mutated_map.iter() {
         // defined as a set of read sequences that should cover
         // the mutated sequence `coverage` number of times
         let data_set = generate_reads(
-            &sequences,
+            sequence,
             &config.read_len,
             &config.coverage,
+            config.paired_ended,
             &mut rng
         );
 
+        // todo: we need to keep track of these reads better for paired endedness
         read_sets.extend(*data_set);
     }
 
diff --git a/src/utils.rs b/src/utils.rs
index 9bd2c47..04d2f1f 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -4,4 +4,5 @@ pub mod config;
 pub mod cli;
 pub mod make_reads;
 pub mod mutate;
-pub mod fastq_tools;
\ No newline at end of file
+pub mod fastq_tools;
+pub mod vcf_tools;
\ No newline at end of file
diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs
index 6cfbfb6..58c9ddb 100644
--- a/src/utils/fasta_tools.rs
+++ b/src/utils/fasta_tools.rs
@@ -86,7 +86,7 @@ pub fn read_fasta(
 }
 
 pub fn write_fasta(
-    fasta_output: &Box<HashMap<String, Vec<Vec<u8>>>>,
+    fasta_output: &Box<HashMap<String, Vec<u8>>>,
     fasta_order: &Vec<String>,
     output_file: &str,
     ploidy: usize
@@ -99,12 +99,12 @@ pub fn write_fasta(
         info!("Writing file: {}", this_fasta);
         let mut outfile = File::options().create(true).append(true).open(this_fasta)?;
         for contig in fasta_order {
-            let sequences = &fasta_output[contig];
+            let sequence = &fasta_output[contig];
             // Write contig name
             writeln!(&mut outfile, ">{}", contig)?;
             // write sequences[ploid] to this_fasta
             let mut i = 0;
-            let sequence_to_write: &Vec<u8> = &sequences[ploid];
+            let sequence_to_write: &Vec<u8> = &sequence;
             while i < sequence_to_write.len() {
                 let mut line = String::new();
                 let mut max: usize = 70;
@@ -144,11 +144,9 @@ fn test_read_fasta() {
 fn test_write_fasta() -> Result<(), Box<dyn error::Error>> {
 
     let seq1: Vec<u8> = vec![0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1];
-    let seq2: Vec<u8> = vec![0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,2];
-    let sequences = vec![seq1, seq2];
-    let fasta_output: HashMap<String, Vec<Vec<u8>>> = HashMap::from([
-        (String::from("H1N1_HA"), sequences)
-    ]);
+    let fasta_output: HashMap<String, Vec<u8>> = HashMap::from(
+        (String::from("H1N1_HA"), seq1)
+    );
     let fasta_pointer = Box::new(fasta_output);
     let fasta_order = vec![String::from("H1N1_HA")];
     let output_file = "test";
diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs
index 4f70b23..2a67ab5 100644
--- a/src/utils/make_reads.rs
+++ b/src/utils/make_reads.rs
@@ -8,10 +8,19 @@ fn cover_dataset(
     coverage: usize,
     rng: &mut ThreadRng,
 ) -> Vec<(usize, usize)> {
-    // This function selects the positions of the reads. It starts at the beginning and goes out
-    // one read length, then picks a random jump between 0 and half the read length to move
-    // And picks those coordinates for a second read. Once the tail of the read is past the end,
-    // we start over again at 0.
+    /*
+    Takes:
+        span_length: Total number of bases in the sequence
+        read_length: The length of the reads for this run
+        coverage: The coverage depth for the reads
+    Returns:
+        A vector of tuples (usize, usize), denoting the start and end positions of the reads
+
+    This function selects the positions of the reads. It starts at the beginning and goes out
+    one read length, then picks a random jump between 0 and half the read length to move
+    And picks those coordinates for a second read. Once the tail of the read is past the end,
+    we start over again at 0.
+     */
     // todo currently the reads look a little light so we may need to improve this calculation.
     let mut read_set: Vec<(usize, usize)> = vec![];
     let mut start: usize = 0;
@@ -31,34 +40,75 @@ fn cover_dataset(
     read_set
 }
 
+fn complement(nucleotide: u8) -> u8 {
+    /*
+    0 = A, 1 = C, 2 = G, 3 = T,
+    matches with the complement of each nucleotide.
+
+    Todo: make this part of a struct to standardize across the program.
+     */
+    return match nucleotide {
+        0 => 3,
+        1 => 2,
+        2 => 1,
+        3 => 0,
+        _ => 4,
+    }
+}
+
+fn reverse_complement(sequence: &Vec<u8>) -> Vec<u8> {
+    /*
+    Returns the reverse complement of a vector of u8's representing a DNA sequence.
+     */
+    let length = sequence.len();
+    let mut rev_comp = Vec::new();
+    for i in (0..length).rev() {
+        rev_comp.push(complement(sequence[i]))
+    }
+    rev_comp
+}
+
 pub fn generate_reads(
-    mutated_sequences: &Vec<Vec<u8>>,
+    mutated_sequence: &Vec<u8>,
     read_length: &usize,
     coverage: &usize,
-    rng: &mut ThreadRng,
+    paired_ended: bool,
+    mut rng: &mut ThreadRng,
 ) -> Box<HashSet<Vec<u8>>> {
-    // Infer ploidy
-    let ploidy = mutated_sequences.len();
-    // Need X/ploidy reads per ploid, but probably fewer
-    let coverage_per_ploid = coverage / ploidy + 1;
+    /*
+    Takes:
+        mutated_sequence: a vector of u8's representing the mutated sequence.
+        read_length: the length ef the reads for this run
+        coverage: the average depth of coverage for this run
+        rng: the random number generator for the run
+    Returns:
+        HashSet of vectors representing the read sequences, stored on the heap in box.
+
+    This takes a mutated sequence and produces a set of reads based on the mutated sequence. For
+    paired ended reads, this will generate a set of reads from each end, by taking the reverse
+    complement int the output
+     */
+
+    if paired_ended {
+        let rev_comp = reverse_complement(mutated_sequence);
+        todo!();
+    }
 
     // set up some defaults and storage
     let mut read_set: HashSet<Vec<u8>> = HashSet::new();
 
-    for i in 0..ploidy {
-        let seq_len = *(&mutated_sequences[i].len());
-        // Generate a vector of read positions
-        let read_positions: Vec<(usize, usize)> = cover_dataset(
-            seq_len,
-            *read_length,
-            coverage_per_ploid,
-            rng,
-        );
+    let seq_len = mutated_sequence.len();
+    // Generate a vector of read positions
+    let read_positions: Vec<(usize, usize)> = cover_dataset(
+        seq_len,
+        *read_length,
+        *coverage,
+        &mut rng,
+    );
 
-        // Generate the reads from the read positions.
-        for (start, end) in read_positions {
-            read_set.insert(mutated_sequences[i][start..end].into());
-        }
+    // Generate the reads from the read positions.
+    for (start, end) in read_positions {
+        read_set.insert(mutated_sequence[start..end].into());
     }
     // puts the reads in the heap.
     Box::new(read_set)
diff --git a/src/utils/mutate.rs b/src/utils/mutate.rs
index 6472dce..dd0d3ca 100644
--- a/src/utils/mutate.rs
+++ b/src/utils/mutate.rs
@@ -54,84 +54,98 @@ impl NucModel {
 
 pub fn mutate_fasta(
     file_struct: &HashMap<String, Vec<u8>>,
-    ploidy: usize,
     mut rng: &mut ThreadRng
-) -> Box<HashMap<String, Vec<Vec<u8>>>> {
-    // This takes a hashmap of contig names (keys) and a vector representing the reference sequence.
-    // It performs a basic calculation (length x mutation rate) and chooses that many positions
-    // along the sequence to mutate. It then builds a return string that represents the altered
-    // sequence.
+) -> (Box<HashMap<String, Vec<u8>>>, Box<HashMap<String, Vec<(usize, u8)>>>) {
+    /*
+    Takes:
+        file_struct: a hashmap of contig names (keys) and a vector
+            representing the reference sequence.
+        ploidy: The number of copies of the genome within an organism's cells
+        rng: random number generator for the run
 
-    // Todo: potentially re-write this to simple record the positions. Maybe position and length.
-    // and which ploid. Need to do this once for each ploid.
-    const MUT_RATE: f64 = 0.01; // will update this with something more elaborate later.
-    // Generate random proportions to give the ploids, to keep things interesting.
-    let mut proportions: Vec<f64> = Vec::with_capacity(ploidy);
-    // For ploidy = N, we generate N random values, then divide them by the sum of the values, this
-    // gives N random numbers whose sum = 1, to randomly assign proportion to the ploids. We may
-    // find there is a better model later.
-    for _ in 0..ploidy {
-        proportions.push((&mut rng).gen::<f64>());
-    }
-    // Normalize my randos (divide by their total)
-    let total: f64 = proportions.iter().sum();
-    proportions = proportions.iter().map(|x| x/total).collect();
-    // This leaves a list of numbers that add up to 1, since these are proportions of 1, will
-    // ensure our mutation rate of each ploid adds up to the total MUT_RATE
-    let ploid_mut_rate: Vec<f64> = proportions.iter().map(|x| x*MUT_RATE).collect();
+    Returns:
+        A tuple with pointers to:
+          A hashmap with keys that are contig names and a vector with the mutated sequence
+          A vector of tuples containing the location and alt of each variant
 
-    let mut return_struct: HashMap<String, Vec<Vec<u8>>> = HashMap::new(); // the mutated sequences
-    for (name, sequence) in file_struct {
-        let mut strands: Vec<Vec<u8>> = Vec::new();
-        for ploid in 0..ploidy {
-            let sequence_length = sequence.len();
-            debug!("Sequence {} is {} bp long", name, sequence_length);
+    This function performs a basic calculation (length x mutation rate +/- a random amount)
+    and chooses that many positions along the sequence to mutate. It then builds a return
+    string that represents the altered sequence and stores all the variants.
+     */
+    const MUT_RATE: f64 = 0.01; // will update this with something more elaborate later.
+    let mut return_struct: HashMap<String, Vec<u8>> = HashMap::new(); // the mutated sequences
 
-            // Clone the reference
-            let mut mutated_record: Vec<u8> = sequence.clone();
+    // hashmap with keys of the contig names with a list of positions and alts under the contig.
+    let mut all_variants: HashMap<String, Vec<(usize, u8)>> = HashMap::new();
 
-            // Calculate how many mutations to add
-            let num_positions = (
-                sequence_length as f64 * ploid_mut_rate[ploid]
-            ).round() as usize;
-            if num_positions == 0 {
-                if rng.gen_bool(1.0/(ploidy as f64)) == true {
-                    mutated_record = mutate_sequence(
-                        mutated_record, 1, sequence_length, &mut rng
-                    );
-                }
-            } else {
-                mutated_record = mutate_sequence(
-                    mutated_record, num_positions, sequence_length, &mut rng
-                );
-            }
-            strands.push(mutated_record.clone());
+    for (name, sequence) in file_struct {
+        // Mutations for this contig
+        let contig_mutations: Vec<(usize, u8)> = Vec::new();
+        // The length of this sequence
+        let sequence_length = sequence.len();
+        debug!("Sequence {} is {} bp long", name, sequence_length);
+        // Clone the reference to create mutations
+        let mut mutated_record: Vec<u8> = sequence.clone();
+        // Calculate how many mutations to add
+        let mut num_positions = sequence_length as f64 * MUT_RATE;
+        // Add or subtract a few extra positions.
+        num_positions += {
+            // A random amount up to 10% of the reads
+            let factor = rng.gen() * 0.10;
+            // 25% of the time subtract, otherwise we'll add.
+            let sign = if rng.bool(0.25) { -1 } else { 1 };
+            // add or subtract up to 10% of the reads.
+            num_positions * (sign * factor)
+        };
+        // round the number of positions to the nearest usize.
+        num_positions = num_positions.round() as usize;
+        // If we somehow ended up with negative or no reads, we may still randomly add one mutation.
+        if num_positions <= 0 {
+            // we want to add at least 1 mutation to each contig
+            (mutated_record, contig_mutations) = mutate_sequence(
+                mutated_record, 1, &mut rng
+            );
+        } else {
+            // else add num positions number of variants
+            (mutated_record, contig_mutations) = mutate_sequence(
+                mutated_record, num_positions, &mut rng
+            );
         }
-        // Add to the new hashmap
-        return_struct.entry(name.clone()).or_insert(strands.clone());
+        // Add to the return struct and variants map.
+        return_struct.entry(name.clone()).or_insert(mutated_record.clone());
+        all_variants.entry(name.clone()).or_insert(contig_mutations);
     }
 
-    Box::new(return_struct)
+    (Box::new(return_struct), Box::new(all_variants))
 }
 
 fn mutate_sequence(
     sequence: Vec<u8>,
     num_positions: usize,
-    length: usize,
     mut rng: &mut ThreadRng
-) -> Vec<u8> {
+) -> (Vec<u8>, Vec<(usize, u8)>) {
     /*
-    Takes a vector of u8's and mutate a few positions at random.
+    Takes:
+        sequence: A u8 vector representing a sequence of DNA
+        num_positions: The number of mutations to add to this sequence
+        rng: random number generator for the run
+
+    returns a tuple with:
+        Vec<u8> = the sequence itself
+        Vec(usize, u8) = the position of the snp and the alt allele for that snp.
+
+    Takes a vector of u8's and mutate a few positions at random. Returns the mutated sequence and
+    a list of tuples with the position and the alts of the SNPs.
      */
     debug!("Adding {} mutations", num_positions);
     let mut mutated_record = sequence.clone();
     // Randomly select num_positions from positions, weighted by gc bias and whatever. For now
     // all he weights are just equal.
-    let weights = vec![1; length];
+    let weights = vec![1; mutated_record.len()];
     // zip together weights and positions
     let mut weighted_positions: Vec<(usize, i32)> = Vec::new();
     // izip!() accepts iterators and/or values with IntoIterator.
-    for (x, y) in izip!(&mut (0..length), &weights) {
+    for (x, y) in izip!(&mut (0..mutated_record.len()), &weights) {
         weighted_positions.push((x, *y))
     }
     // now choose a random selection of num_positions without replacement
@@ -140,19 +154,33 @@ fn mutate_sequence(
         .unwrap()
         .collect::<Vec<_>>();
     // Build mutation model
-    let mut_a = NucModel::from(0, vec![0, 1, 1, 1]);
-    let mut_c = NucModel::from(1, vec![1, 0, 1, 1]);
-    let mut_g = NucModel::from(2, vec![1, 1, 0, 1]);
-    let mut_t = NucModel::from(3, vec![1, 1, 1, 0]);
+    /*
+    Using NEAT's original mutation model of
+    [[0.0, 0.15, 0.7, 0.15],
+     [0.15, 0.0, 0.15, 0.7],
+     [0.7, 0.15, 0.0, 0.15],
+     [0.15, 0.7, 0.15, 0.0]]
 
-    for (index, _) in mutated_elements {
+     multiply each value by 20, which produces integer values
+     */
+    let mut_a = NucModel::from(0, vec![0, 3, 14, 3]);
+    let mut_c = NucModel::from(1, vec![3, 0, 3, 14]);
+    let mut_g = NucModel::from(2, vec![14, 3, 0, 3]);
+    let mut_t = NucModel::from(3, vec![3, 14, 3, 0]);
+
+    // Will hold the variants added to this sequence
+    let mut sequence_variants: Vec<(usize, u8)> = Vec::new();
+    // for each index, picks a new base
+    for (index, _weight) in mutated_elements {
         mutated_record[*index] = match sequence[*index] {
             0 => mut_a.choose_new_nuc(&mut rng),
             1 => mut_c.choose_new_nuc(&mut rng),
             2 => mut_g.choose_new_nuc(&mut rng),
             3 => mut_t.choose_new_nuc(&mut rng),
             _ => 4
-        }
+        };
+        // add the location and alt base for the variant
+        sequence_variants.push((*index, mutated_record[index]))
     }
-    mutated_record
+    (mutated_record, sequence_variants)
 }
\ No newline at end of file
diff --git a/src/utils/vcf_tools.rs b/src/utils/vcf_tools.rs
new file mode 100644
index 0000000..7df4bf0
--- /dev/null
+++ b/src/utils/vcf_tools.rs
@@ -0,0 +1,7 @@
+pub fn write_vcf(
+    variant_locations,
+    fasta_order,
+    ploidy,
+    output_file_prefix,
+    mut rng
+) -> Result<(), >
\ No newline at end of file

From 24cd2781ffb0746b18b45dc30f533ce092c0e996 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Wed, 13 Mar 2024 14:07:39 -0500
Subject: [PATCH 02/23] Add vcf writer to rusty-neat

---
 src/main.rs              |  14 +++--
 src/utils/cli.rs         |   2 +-
 src/utils/config.rs      |   5 +-
 src/utils/fasta_tools.rs |  61 ++++++++++++----------
 src/utils/mutate.rs      |  16 +++---
 src/utils/vcf_tools.rs   | 107 ++++++++++++++++++++++++++++++++++++---
 6 files changed, 154 insertions(+), 51 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index bbe0f8d..662fb55 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -19,6 +19,7 @@ use utils::config::{read_config_yaml, build_config_from_args};
 use utils::mutate::mutate_fasta;
 use utils::make_reads::generate_reads;
 use utils::fastq_tools::write_fastq;
+use utils::vcf_tools::write_vcf;
 
 fn main() {
 
@@ -53,7 +54,7 @@ fn main() {
     info!("Mapping reference fasta file: {}", &config.reference);
     let (fasta_map, fasta_order) = read_fasta(&config.reference);
 
-    // Mutating the reference and recoring the variant locations.
+    // Mutating the reference and recording the variant locations.
     info!("Mutating reference.");
     let (mutated_map, variant_locations) = mutate_fasta(
         &fasta_map,
@@ -62,8 +63,14 @@ fn main() {
 
     if config.produce_vcf {
         info!("Writing vcf file");
-        todo!();
-        // write_vcf(&variant_locations, &fasta_order, &config.ploidy, &output_file, &mut rng)
+        write_vcf(
+            &fasta_map,
+            &variant_locations,
+            &fasta_order,
+            config.ploidy,
+            &config.reference,
+            &output_file,
+            &mut rng).expect("Error writing vcf file")
     }
 
     if config.produce_fasta {
@@ -72,7 +79,6 @@ fn main() {
             &mutated_map,
             &fasta_order,
             &output_file,
-            config.ploidy
         ).expect("Problem writing fasta file");
     }
 
diff --git a/src/utils/cli.rs b/src/utils/cli.rs
index dfe4d77..e61d1ff 100644
--- a/src/utils/cli.rs
+++ b/src/utils/cli.rs
@@ -19,7 +19,7 @@ pub struct Cli {
     read_length <usize> = the length of the reads in the output fastq file. Default = 150
     coverage <usize> = The average depth per read of the fastq files. Default = 10
      */
-    #[arg(short='y', long="configuration_yaml", default_value_t=String::new())]
+    #[arg(short='C', long="configuration_yaml", default_value_t=String::new())]
     pub config: String,
     #[arg(short='r', long="reference", default_value_t=String::from("data/H1N1.fa"))]
     pub reference: String,
diff --git a/src/utils/config.rs b/src/utils/config.rs
index d2272f6..12d2ac7 100644
--- a/src/utils/config.rs
+++ b/src/utils/config.rs
@@ -209,10 +209,7 @@ impl ConfigBuilder {
             }
         }
         if self.produce_fasta {
-            debug!("Producing fasta files:");
-            for ploid in 0..self.ploidy {
-                debug!("\t> {}_p{}.fasta", file_prefix, ploid+1);
-            }
+            debug!("Producing fasta file: {}.fasta", file_prefix);
         }
         if self.produce_vcf {
             debug!("Producing vcf file: {}.vcf", file_prefix)
diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs
index 58c9ddb..70833e6 100644
--- a/src/utils/fasta_tools.rs
+++ b/src/utils/fasta_tools.rs
@@ -35,6 +35,7 @@ pub fn num_to_char(nuc_num: u8) -> &'static str {
 #[allow(unused)]
 pub enum NucRepr {
     // I just discovered this repr type thing and need to investigate.
+    // todo figure out if we can use this idea to simplify things
     A = 0,
     C = 1,
     G = 2,
@@ -43,6 +44,7 @@ pub enum NucRepr {
 }
 
 pub fn sequence_array_to_string(input_array: &Vec<u8>) -> String {
+    // Converts a sequence vector into a string representing the DNA sequence
     let mut return_string = String::new();
     for num in input_array {
         return_string += num_to_char(*num);
@@ -53,7 +55,7 @@ pub fn sequence_array_to_string(input_array: &Vec<u8>) -> String {
 pub fn read_fasta(
     fasta_path: &str
 ) -> (Box<HashMap<String, Vec<u8>>>, Vec<String>) {
-    // Reads a fasta and turns it into a HashMap and puts it in the heap
+    // Reads a fasta file and turns it into a HashMap and puts it in the heap
     info!("Reading fasta: {}", fasta_path);
 
     let mut fasta_map: HashMap<String, Vec<u8>> = HashMap::new();
@@ -89,36 +91,39 @@ pub fn write_fasta(
     fasta_output: &Box<HashMap<String, Vec<u8>>>,
     fasta_order: &Vec<String>,
     output_file: &str,
-    ploidy: usize
 ) -> io::Result<()> {
+    /*
+    Takes:
+        fasta_output: the hashmap of mutated sequences with contig names
+        fasta_order: A vector with the proper order for the fasta elements
+        output_file: the prefix for the output file name
+        ploidy: the number of copies of each chromosome the simulation is using
+    Returns:
+        Errors if there is a problem writing the file, otherwise it returns nothing.
+     */
     // writing fasta output to files
-    let mut fasta_files: Vec<String> = Vec::new();
-    for ploid in 0..ploidy {
-        let this_fasta = format!("{}_p{}.fasta", output_file, ploid+1);
-        fasta_files.push(this_fasta.clone());
-        info!("Writing file: {}", this_fasta);
-        let mut outfile = File::options().create(true).append(true).open(this_fasta)?;
-        for contig in fasta_order {
-            let sequence = &fasta_output[contig];
-            // Write contig name
-            writeln!(&mut outfile, ">{}", contig)?;
-            // write sequences[ploid] to this_fasta
-            let mut i = 0;
-            let sequence_to_write: &Vec<u8> = &sequence;
-            while i < sequence_to_write.len() {
-                let mut line = String::new();
-                let mut max: usize = 70;
-                let this_length: usize = sequence_to_write[i..].len();
-                // If we don't have 70 characters, write out whatever is left.
-                if this_length < 70 {
-                    max = this_length
-                }
-                for j in 0..max {
-                    line += num_to_char(sequence_to_write[i + j]);
-                }
-                writeln!(&mut outfile, "{}", line)?;
-                i += 70;
+    let output_fasta = format!("{}.fasta", output_file);
+    let mut outfile = File::options().create_new(true).append(true).open(output_fasta)?;
+    for contig in fasta_order {
+        let sequence = &fasta_output[contig];
+        // Write contig name
+        writeln!(&mut outfile, ">{}", contig)?;
+        // write sequences[ploid] to this_fasta
+        let mut i = 0;
+        let sequence_to_write: &Vec<u8> = &sequence;
+        while i < sequence_to_write.len() {
+            let mut line = String::new();
+            let mut max: usize = 70;
+            let this_length: usize = sequence_to_write[i..].len();
+            // If we don't have 70 characters, write out whatever is left.
+            if this_length < 70 {
+                max = this_length
+            }
+            for j in 0..max {
+                line += num_to_char(sequence_to_write[i + j]);
             }
+            writeln!(&mut outfile, "{}", line)?;
+            i += 70;
         }
     };
     Ok(())
diff --git a/src/utils/mutate.rs b/src/utils/mutate.rs
index dd0d3ca..c3ec9a3 100644
--- a/src/utils/mutate.rs
+++ b/src/utils/mutate.rs
@@ -80,25 +80,25 @@ pub fn mutate_fasta(
 
     for (name, sequence) in file_struct {
         // Mutations for this contig
-        let contig_mutations: Vec<(usize, u8)> = Vec::new();
+        let mut contig_mutations: Vec<(usize, u8)> = Vec::new();
         // The length of this sequence
         let sequence_length = sequence.len();
         debug!("Sequence {} is {} bp long", name, sequence_length);
         // Clone the reference to create mutations
         let mut mutated_record: Vec<u8> = sequence.clone();
         // Calculate how many mutations to add
-        let mut num_positions = sequence_length as f64 * MUT_RATE;
+        let mut rough_num_positions: f64 = sequence_length as f64 * MUT_RATE;
         // Add or subtract a few extra positions.
-        num_positions += {
+        rough_num_positions += {
             // A random amount up to 10% of the reads
-            let factor = rng.gen() * 0.10;
+            let factor: f64 = rng.gen::<f64>() * 0.10;
             // 25% of the time subtract, otherwise we'll add.
-            let sign = if rng.bool(0.25) { -1 } else { 1 };
+            let sign: f64 = if rng.gen_bool(0.25) { -1.0 } else { 1.0 };
             // add or subtract up to 10% of the reads.
-            num_positions * (sign * factor)
+            rough_num_positions * (sign * factor)
         };
         // round the number of positions to the nearest usize.
-        num_positions = num_positions.round() as usize;
+        let num_positions = rough_num_positions.round() as usize;
         // If we somehow ended up with negative or no reads, we may still randomly add one mutation.
         if num_positions <= 0 {
             // we want to add at least 1 mutation to each contig
@@ -180,7 +180,7 @@ fn mutate_sequence(
             _ => 4
         };
         // add the location and alt base for the variant
-        sequence_variants.push((*index, mutated_record[index]))
+        sequence_variants.push((*index, mutated_record[*index]))
     }
     (mutated_record, sequence_variants)
 }
\ No newline at end of file
diff --git a/src/utils/vcf_tools.rs b/src/utils/vcf_tools.rs
index 7df4bf0..6261920 100644
--- a/src/utils/vcf_tools.rs
+++ b/src/utils/vcf_tools.rs
@@ -1,7 +1,102 @@
+extern crate log;
+
+use std::collections::HashMap;
+use std::io;
+use std::io::Write;
+use std::fs::File;
+use rand::Rng;
+use rand::rngs::ThreadRng;
+use rand::seq::IndexedRandom;
+use utils::fasta_tools::num_to_char;
+
+fn genotype_to_string(genotype: Vec<usize>) -> String {
+    /*
+    Converts a vector of 0s and 1s representing genotype to a standard
+    vcf genotype string.
+     */
+    let mut geno_string = String::new();
+    for ploid in genotype {
+        geno_string += &format!("{}/", ploid.to_string())
+    }
+    geno_string.strip_suffix("/").unwrap().to_string()
+}
+
 pub fn write_vcf(
-    variant_locations,
-    fasta_order,
-    ploidy,
-    output_file_prefix,
-    mut rng
-) -> Result<(), >
\ No newline at end of file
+    reference_sequence: &HashMap<String, Vec<u8>>,
+    variant_locations: &HashMap<String, Vec<(usize, u8)>>,
+    fasta_order: &Vec<String>,
+    ploidy: usize,
+    reference: &str,
+    output_file_prefix: &str,
+    mut rng: &mut ThreadRng,
+) -> io::Result<()> {
+    /*
+    Takes:
+        reference_sequence: A map of the contig names keyed to a u8 vector representing the sequence
+        variant_locations: A map of contig names keyed to lists of variants in that contig
+            consisting of a tuple of (position, alt base).
+        fasta_order: A vector of contig names in the order of the reference fasta.
+        ploidy: The number of copies of each chromosome present in the organism
+        output_file_prefix: The path to the directory and the prefix to use for filenames
+        rng: A random number generator for this run
+    Result:
+        Throws and error if there's a problem, or else returns nothing.
+     */
+    // ploid numbers are used to pick a number of ploids to mutate
+    let ploid_numbers: Vec<usize> = (1..ploidy+1).collect();
+    // set the filename of the output vcf
+    let mut filename = format!("{}.vcf", output_file_prefix);
+    let mut outfile = File::options().create_new(true).append(true).open(&mut filename)?;
+    // add the vcf header
+    writeln!(&mut outfile, "##fileformat=VCFv4.1")?;
+    writeln!(&mut outfile, "##reference={}", reference)?;
+    writeln!(&mut outfile, "##Generated by rusty-neat")?;
+    writeln!(&mut outfile, "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">")?;
+    writeln!(&mut outfile, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">")?;
+    writeln!(&mut outfile, "##INFO=<ID=VMX,Number=1,Type=String, Description=\"SNP is Missense in these Read Frames\">")?;
+    writeln!(&mut outfile, "##INFO=<ID=VNX,Number=1,Type=String, Description=\"SNP is Nonsense in these Read Frames\">")?;
+    writeln!(&mut outfile, "##INFO=<ID=VFX,Number=1,Type=String,Description=\"Indel Causes Frameshift\">")?;
+    writeln!(&mut outfile, "##ALT=<ID=DEL,Description=\"Deletion\">")?;
+    writeln!(&mut outfile, "##ALT=<ID=DUP,Description=\"Duplication\">")?;
+    writeln!(&mut outfile, "##ALT=<ID=INS,Description=\"Insertion of novel sequence\">")?;
+    writeln!(&mut outfile, "##ALT=<ID=INV,Description=\"Inversion\">")?;
+    writeln!(&mut outfile, "##ALT=<ID=CNV,Description=\"Copy number variable region\">")?;
+    writeln!(&mut outfile, "##ALT=<ID=TRANS,Description=\"Translocation\">")?;
+    writeln!(&mut outfile, "##ALT=<ID=INV-TRANS,Description=\"Inverted translocation\">")?;
+    writeln!(&mut outfile, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">")?;
+    // Add a neat sample column
+    writeln!(&mut outfile, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNEAT_simulated_sample")?;
+    // insert mutations
+    for contig in fasta_order {
+        for mutation in &variant_locations[contig] {
+            // If we're going to mutate more than one ploid (i.e. homozygous
+            // for diploid organisms), we must add it to the list.
+            let mut genotype: Vec<usize> = vec![0; ploidy];
+            // We need to enumerate the index list for the genotype
+            let ploid_index: Vec<usize> = (0..ploidy).collect();
+            // By default we'll assume heterozygous (only on one ploid).
+            let mut num_ploids: usize = 1;
+            let is_multiploid = (&mut rng).gen_bool(0.001);
+            if is_multiploid {
+                num_ploids = *ploid_numbers.choose(&mut rng).unwrap();
+            }
+            for _ in 0..num_ploids {
+                // for each ploid that has the mutation, change one random
+                // genotype to 1, indicating the mutation is on that copy.
+                genotype[*ploid_index.choose(&mut rng).unwrap()] = 1
+            }
+            // Format the output line. Any fields without data will be a simple period. Quality
+            // is set to 37 for all these variants.
+            let line = format!("{}\t{}\t.\t{}\t{}\t37\tPASS\t.\tGT\t{}",
+                    contig,
+                    mutation.0 + 1,
+                    num_to_char(reference_sequence[contig][mutation.0]),
+                    num_to_char(mutation.1),
+                    genotype_to_string(genotype),
+                );
+
+            writeln!(&mut outfile, "{}", line)?;
+        }
+    };
+    Ok(())
+}
\ No newline at end of file

From e41a0aa5b1a4c4aad135f2feb3bcc1bc7c731d33 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Wed, 13 Mar 2024 14:26:20 -0500
Subject: [PATCH 03/23] adding fastq paired-ended option

---
 src/main.rs              |  1 +
 src/utils/fastq_tools.rs | 10 +++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index bbe0f8d..a703bb9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -99,6 +99,7 @@ fn main() {
     info!("Writing fastq");
     write_fastq(
         &output_file,
+        config.paired_ended,
         *outsets,
     ).expect("Problem writing fastq file");
     info!("Processing complete")
diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs
index ca3fc96..440febc 100644
--- a/src/utils/fastq_tools.rs
+++ b/src/utils/fastq_tools.rs
@@ -6,6 +6,7 @@ use utils::fasta_tools::sequence_array_to_string;
 
 pub fn write_fastq(
     fastq_filename: &str,
+    paired_ended: bool,
     dataset: Vec<&Vec<u8>>,
 ) -> io::Result<()> {
     /*
@@ -17,8 +18,15 @@ pub fn write_fastq(
     // (Although this feature is currently untested and unknown).
     // (May need sorting.)
     let name_prefix = "neat_generated_".to_string();
-    let fastq_filename = String::from(fastq_filename) + "_r1.fastq";
+    // vector to hold the fastq filenames, since we don't know ahead of time if we need one or two
+    let mut fastq_files: Vec<String> = Vec::new();
+    fastq_files.push(String::from(fastq_filename) + "_r1.fastq");
+    if paired_ended {
+        fastq_files.push(String::from(fastq_filename) + "_r1.fastq");
+    }
+    // open the filename
     let mut outfile = File::options().create_new(true).append(true).open(fastq_filename)?;
+    // write out sequence by sequence using index to track the numbering
     let mut index = 1;
     for sequence in dataset {
         let sequence_len = sequence.len();

From b996dc373871e6659d2314e71b5ccb56fa5fafbe Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Wed, 13 Mar 2024 14:26:20 -0500
Subject: [PATCH 04/23] adding fastq paired-ended option

---
 src/main.rs              |  1 +
 src/utils/fastq_tools.rs | 10 +++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index 662fb55..e7ad50e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -105,6 +105,7 @@ fn main() {
     info!("Writing fastq");
     write_fastq(
         &output_file,
+        config.paired_ended,
         *outsets,
     ).expect("Problem writing fastq file");
     info!("Processing complete")
diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs
index ca3fc96..440febc 100644
--- a/src/utils/fastq_tools.rs
+++ b/src/utils/fastq_tools.rs
@@ -6,6 +6,7 @@ use utils::fasta_tools::sequence_array_to_string;
 
 pub fn write_fastq(
     fastq_filename: &str,
+    paired_ended: bool,
     dataset: Vec<&Vec<u8>>,
 ) -> io::Result<()> {
     /*
@@ -17,8 +18,15 @@ pub fn write_fastq(
     // (Although this feature is currently untested and unknown).
     // (May need sorting.)
     let name_prefix = "neat_generated_".to_string();
-    let fastq_filename = String::from(fastq_filename) + "_r1.fastq";
+    // vector to hold the fastq filenames, since we don't know ahead of time if we need one or two
+    let mut fastq_files: Vec<String> = Vec::new();
+    fastq_files.push(String::from(fastq_filename) + "_r1.fastq");
+    if paired_ended {
+        fastq_files.push(String::from(fastq_filename) + "_r1.fastq");
+    }
+    // open the filename
     let mut outfile = File::options().create_new(true).append(true).open(fastq_filename)?;
+    // write out sequence by sequence using index to track the numbering
     let mut index = 1;
     for sequence in dataset {
         let sequence_len = sequence.len();

From fbd9eb71cecef9343256242cbd99197475d7adad Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Wed, 13 Mar 2024 17:28:21 -0500
Subject: [PATCH 05/23] Working on adding paired-ended

---
 src/main.rs              | 43 ++++++++++++++++++++--------------------
 src/utils/fastq_tools.rs |  6 ++++++
 src/utils/mutate.rs      |  9 +++++----
 3 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index e7ad50e..1c3acf3 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -22,7 +22,6 @@ use utils::fastq_tools::write_fastq;
 use utils::vcf_tools::write_vcf;
 
 fn main() {
-
     TermLogger::init(
         LevelFilter::Trace,
         Config::default(),
@@ -61,6 +60,15 @@ fn main() {
         &mut rng
     );
 
+    if config.produce_fasta {
+        info!("Outputting fasta file");
+        write_fasta(
+            &mutated_map,
+            &fasta_order,
+            &output_file,
+        ).expect("Problem writing fasta file");
+    }
+
     if config.produce_vcf {
         info!("Writing vcf file");
         write_vcf(
@@ -73,15 +81,6 @@ fn main() {
             &mut rng).expect("Error writing vcf file")
     }
 
-    if config.produce_fasta {
-        info!("Outputting fasta file");
-        write_fasta(
-            &mutated_map,
-            &fasta_order,
-            &output_file,
-        ).expect("Problem writing fasta file");
-    }
-
     let mut read_sets: HashSet<Vec<u8>> = HashSet::new();
     for (_name, sequence) in mutated_map.iter() {
         // defined as a set of read sequences that should cover
@@ -98,16 +97,18 @@ fn main() {
         read_sets.extend(*data_set);
     }
 
-    info!("Shuffling output fastq data");
-    let mut outsets: Box<Vec<&Vec<u8>>> = Box::new(read_sets.iter().collect());
-    outsets.shuffle(&mut rng);
-
-    info!("Writing fastq");
-    write_fastq(
-        &output_file,
-        config.paired_ended,
-        *outsets,
-    ).expect("Problem writing fastq file");
-    info!("Processing complete")
+    if config.produce_fastq {
+        info!("Shuffling output fastq data");
+        let mut outsets: Box<Vec<&Vec<u8>>> = Box::new(read_sets.iter().collect());
+        outsets.shuffle(&mut rng);
+
+        info!("Writing fastq");
+        write_fastq(
+            &output_file,
+            config.paired_ended,
+            *outsets,
+        ).expect("Problem writing fastq file");
+        info!("Processing complete")
+    }
 }
 
diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs
index 440febc..f545ba2 100644
--- a/src/utils/fastq_tools.rs
+++ b/src/utils/fastq_tools.rs
@@ -10,6 +10,12 @@ pub fn write_fastq(
     dataset: Vec<&Vec<u8>>,
 ) -> io::Result<()> {
     /*
+    Takes:
+        fastq_filename: prefix for the output fastq files
+        dataset: List of u8 vectors representing dna sequences.
+    returns:
+        Error if there is a problem or else nothing.
+
     Writes fastq files. At the moment, it only writes out single r1 file, but will eventually write
     out r1 and r2 files.
      */
diff --git a/src/utils/mutate.rs b/src/utils/mutate.rs
index c3ec9a3..1b8b252 100644
--- a/src/utils/mutate.rs
+++ b/src/utils/mutate.rs
@@ -123,7 +123,7 @@ fn mutate_sequence(
     sequence: Vec<u8>,
     num_positions: usize,
     mut rng: &mut ThreadRng
-) -> (Vec<u8>, Vec<(usize, u8)>) {
+) -> (Vec<u8>, Vec<(usize, u8, u8)>) {
     /*
     Takes:
         sequence: A u8 vector representing a sequence of DNA
@@ -169,10 +169,11 @@ fn mutate_sequence(
     let mut_t = NucModel::from(3, vec![3, 14, 3, 0]);
 
     // Will hold the variants added to this sequence
-    let mut sequence_variants: Vec<(usize, u8)> = Vec::new();
+    let mut sequence_variants: Vec<(usize, u8, u8)> = Vec::new();
     // for each index, picks a new base
     for (index, _weight) in mutated_elements {
-        mutated_record[*index] = match sequence[*index] {
+        let reference_base = sequence[*index];
+        mutated_record[*index] = match reference_base {
             0 => mut_a.choose_new_nuc(&mut rng),
             1 => mut_c.choose_new_nuc(&mut rng),
             2 => mut_g.choose_new_nuc(&mut rng),
@@ -180,7 +181,7 @@ fn mutate_sequence(
             _ => 4
         };
         // add the location and alt base for the variant
-        sequence_variants.push((*index, mutated_record[*index]))
+        sequence_variants.push((*index, mutated_record[*index],  *reference_base))
     }
     (mutated_record, sequence_variants)
 }
\ No newline at end of file

From ebc81aa90e98aebe4d4f2ae271273d38b668a9bc Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Thu, 14 Mar 2024 13:20:05 -0500
Subject: [PATCH 06/23] Added paired ended fragment generater to generate reads

---
 Cargo.lock              |  18 +++++++
 Cargo.toml              |   3 +-
 src/main.rs             |   5 +-
 src/utils/config.rs     |   8 ++--
 src/utils/make_reads.rs | 101 +++++++++++++++++++++++++++++++++-------
 src/utils/mutate.rs     |  15 +++---
 src/utils/vcf_tools.rs  |   8 ++--
 7 files changed, 121 insertions(+), 37 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 15db81d..e40cda8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -407,6 +407,12 @@ version = "0.2.153"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
 
+[[package]]
+name = "libm"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.4.13"
@@ -438,6 +444,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a"
 dependencies = [
  "autocfg",
+ "libm",
 ]
 
 [[package]]
@@ -582,6 +589,16 @@ dependencies = [
  "zerocopy",
 ]
 
+[[package]]
+name = "rand_distr"
+version = "0.5.0-alpha.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc403db446a1a58a404b9944a91c4adf7d99b0397429c998f0394d751ac98eff"
+dependencies = [
+ "num-traits",
+ "rand 0.9.0-alpha.0",
+]
+
 [[package]]
 name = "rdrand"
 version = "0.4.0"
@@ -631,6 +648,7 @@ dependencies = [
  "itertools",
  "log",
  "rand 0.9.0-alpha.0",
+ "rand_distr",
  "serde_yaml",
  "simplelog",
  "xorshift",
diff --git a/Cargo.toml b/Cargo.toml
index 1efe611..6ef81fe 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,4 +12,5 @@ simplelog = "0.12.1"
 serde_yaml = "0.9.32"
 clap = { version = "4.5.1", features = ["derive"] }
 itertools = "0.12.1"
-assert_fs = "1.1.1"
\ No newline at end of file
+assert_fs = "1.1.1"
+rand_distr = "0.5.0-alpha.0"
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index 1c3acf3..b1a380d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3,6 +3,8 @@ extern crate clap;
 extern crate log;
 extern crate simplelog;
 extern crate serde_yaml;
+extern crate rand_distr;
+extern crate itertools;
 
 mod utils;
 
@@ -72,7 +74,6 @@ fn main() {
     if config.produce_vcf {
         info!("Writing vcf file");
         write_vcf(
-            &fasta_map,
             &variant_locations,
             &fasta_order,
             config.ploidy,
@@ -90,6 +91,8 @@ fn main() {
             &config.read_len,
             &config.coverage,
             config.paired_ended,
+            config.fragment_mean,
+            config.fragment_st_dev,
             &mut rng
         );
 
diff --git a/src/utils/config.rs b/src/utils/config.rs
index 12d2ac7..27befc3 100644
--- a/src/utils/config.rs
+++ b/src/utils/config.rs
@@ -48,7 +48,7 @@ pub struct RunConfiguration {
     pub output_prefix: String,
 }
 
-#[warn(dead_code)]
+#[allow(dead_code)]
 impl RunConfiguration {
     // The purpose of this function is to redirect you to the ConfigBuilder
     pub fn build() -> ConfigBuilder {
@@ -99,7 +99,7 @@ impl ConfigBuilder {
             fragment_mean: 300.0,
             fragment_st_dev: 30.0,
             produce_fastq: true,
-            produce_fasta: true,
+            produce_fasta: false,
             produce_vcf: false,
             produce_bam: false,
             output_dir: current_dir,
@@ -241,7 +241,7 @@ impl ConfigBuilder {
     }
 }
 
-pub fn read_config_yaml(yaml: String) -> Result<Box<RunConfiguration>, &'static str> {
+pub fn read_config_yaml(yaml: String) -> Result<Box<RunConfiguration>, ConfigError> {
     /*
     Reads an input configuration file from yaml using the serde package. Then sets the parameters
     based on the inputs. A "." value means to use the default value.
@@ -251,7 +251,7 @@ pub fn read_config_yaml(yaml: String) -> Result<Box<RunConfiguration>, &'static
     let f = std::fs::File::open(yaml);
     let file = match f {
         Ok(l) => l,
-        Err(_) => { return Err("problem reading configuration file"); },
+        Err(_) => return Err(ConfigError::FileReadError),
     };
     // Uses serde_yaml to read the file into a HashMap
     let scrape_config: HashMap<String, String> = serde_yaml::from_reader(file)
diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs
index 2a67ab5..efe351d 100644
--- a/src/utils/make_reads.rs
+++ b/src/utils/make_reads.rs
@@ -1,40 +1,98 @@
-use std::collections::HashSet;
+use std::collections::{HashSet, VecDeque};
+
 use rand::RngCore;
 use rand::rngs::ThreadRng;
+use rand::seq::SliceRandom;
+use rand_distr::{Normal, Distribution};
 
 fn cover_dataset(
     span_length: usize,
     read_length: usize,
+    mut fragment_pool: Vec<usize>,
     coverage: usize,
-    rng: &mut ThreadRng,
+    mut rng: &mut ThreadRng,
 ) -> Vec<(usize, usize)> {
     /*
     Takes:
         span_length: Total number of bases in the sequence
         read_length: The length of the reads for this run
+        fragment_pool: a vector of sizes for the fragments. If empty, it will instead be filled
+            by the read_length (single ended reads)
+        paired_ended: true or false if the run is paired ended mode or not.
         coverage: The coverage depth for the reads
     Returns:
-        A vector of tuples (usize, usize), denoting the start and end positions of the reads
+        A vector of tuples (usize, usize), denoting the start and end positions of the fragment of
+        DNA that was sequenced.
 
     This function selects the positions of the reads. It starts at the beginning and goes out
     one read length, then picks a random jump between 0 and half the read length to move
     And picks those coordinates for a second read. Once the tail of the read is past the end,
     we start over again at 0.
      */
-    // todo currently the reads look a little light so we may need to improve this calculation.
+    // Reads that will be start and end of the fragment.
     let mut read_set: Vec<(usize, usize)> = vec![];
+
+    let mut cover_fragment_pool: VecDeque<usize>;
+    if fragment_pool.is_empty() {
+        // set the shuffled fragment pool just equal to an instance of read_length
+        cover_fragment_pool = VecDeque::from([read_length]);
+    } else {
+        // shuffle the fragment pool
+        fragment_pool.shuffle(&mut rng);
+        cover_fragment_pool = VecDeque::from(fragment_pool)
+    }
+
+    // Gap size to keep track of how many uncovered bases we have per layer, to help decide if we
+    // need more layers
+    let mut gap_size: usize = 0;
+    let mut layer_count: usize = 0;
+
+    // start this party off at zero.
     let mut start: usize = 0;
-    'outer: for _ in 0..coverage {
-        while start < span_length {
-            let temp_end = start+read_length;
-            if temp_end > span_length {
-                start = 0;
-                continue 'outer;
+
+    while layer_count <= coverage {
+        let fragment_length = cover_fragment_pool[0];
+        cover_fragment_pool.push_back(fragment_length);
+        let temp_end = start+fragment_length;
+        if temp_end > span_length {
+            // TODO some variation on this modulo idea will work for bacterial reads
+            start = temp_end % span_length;
+            gap_size += start;
+            //
+            if gap_size >= span_length {
+                // if we have accumulated enough gap, then we need to run the same layer again.
+                // We'll reset gap size but not increment layer_count.
+                gap_size = gap_size % span_length;
+                continue;
+            } else {
+                layer_count += 1;
+                continue
             }
-            read_set.push((start, temp_end));
-            // Picks a number between zero and half of a read length
-            let wildcard: usize = (rng.next_u32() % (read_length/4) as u32) as usize;
-            start += read_length + wildcard; // adds to the start to give it some spice
+        }
+        read_set.push((start, temp_end));
+
+        // insert size is the number of bases between reads in the fragment for paired ended reads
+        let insert_size = fragment_length - (read_length * 2);
+        // if these are singled ended reads, then the insert size will always be negative
+        if insert_size > 0 {
+            // if there's uncovered bases in between the reads on paired ended reads, we'll add
+            // that to the gap to ensure adequate coverage.
+            gap_size += insert_size;
+        }
+
+        // Picks a number between zero and a quarter of a read length
+        let wildcard: usize = (rng.next_u32() % (read_length/4) as u32) as usize;
+        // adds to the start to give it some spice
+        start += temp_end + wildcard;
+        // sanity check. If we are already out of bounds, take the modulo
+        if start >= span_length {
+            // get us back in bounds
+            start = start % span_length;
+            // add the gap
+            gap_size += start;
+        } else {
+            // still in bounds, just add the gap
+            gap_size += wildcard;
         }
     }
     read_set
@@ -44,8 +102,6 @@ fn complement(nucleotide: u8) -> u8 {
     /*
     0 = A, 1 = C, 2 = G, 3 = T,
     matches with the complement of each nucleotide.
-
-    Todo: make this part of a struct to standardize across the program.
      */
     return match nucleotide {
         0 => 3,
@@ -73,6 +129,8 @@ pub fn generate_reads(
     read_length: &usize,
     coverage: &usize,
     paired_ended: bool,
+    mean: f64,
+    st_dev: f64,
     mut rng: &mut ThreadRng,
 ) -> Box<HashSet<Vec<u8>>> {
     /*
@@ -90,8 +148,16 @@ pub fn generate_reads(
      */
 
     if paired_ended {
+        // generate the reverse complement of the fragment
         let rev_comp = reverse_complement(mutated_sequence);
-        todo!();
+    }
+    let mut fragment_pool: Vec<usize> = Vec::new();
+    let num_frags = (mutated_sequence.len() / read_length) * (coverage * 2);
+    let mut fragment_distribution = Normal::new(mean, st_dev).unwrap();
+    // add fragments to the fragment pool
+    for _ in 0..num_frags {
+        let frag = fragment_distribution.sample(&mut rng).round() as usize;
+        fragment_pool.push(frag);
     }
 
     // set up some defaults and storage
@@ -102,6 +168,7 @@ pub fn generate_reads(
     let read_positions: Vec<(usize, usize)> = cover_dataset(
         seq_len,
         *read_length,
+        fragment_pool,
         *coverage,
         &mut rng,
     );
diff --git a/src/utils/mutate.rs b/src/utils/mutate.rs
index 1b8b252..762e64c 100644
--- a/src/utils/mutate.rs
+++ b/src/utils/mutate.rs
@@ -1,13 +1,10 @@
-extern crate log;
-extern crate itertools;
-
 use std::collections::HashMap;
 use rand::distributions::Distribution;
 use rand::prelude::{ThreadRng, IndexedRandom};
 use rand::seq::SliceRandom;
 use rand::Rng;
-use self::log::debug;
-use self::itertools::izip;
+use log::debug;
+use itertools::izip;
 
 #[derive(Debug, Ord, PartialOrd, Eq, PartialEq)]
 struct NucModel {
@@ -55,7 +52,7 @@ impl NucModel {
 pub fn mutate_fasta(
     file_struct: &HashMap<String, Vec<u8>>,
     mut rng: &mut ThreadRng
-) -> (Box<HashMap<String, Vec<u8>>>, Box<HashMap<String, Vec<(usize, u8)>>>) {
+) -> (Box<HashMap<String, Vec<u8>>>, Box<HashMap<String, Vec<(usize, u8, u8)>>>) {
     /*
     Takes:
         file_struct: a hashmap of contig names (keys) and a vector
@@ -76,11 +73,11 @@ pub fn mutate_fasta(
     let mut return_struct: HashMap<String, Vec<u8>> = HashMap::new(); // the mutated sequences
 
     // hashmap with keys of the contig names with a list of positions and alts under the contig.
-    let mut all_variants: HashMap<String, Vec<(usize, u8)>> = HashMap::new();
+    let mut all_variants: HashMap<String, Vec<(usize, u8, u8)>> = HashMap::new();
 
     for (name, sequence) in file_struct {
         // Mutations for this contig
-        let mut contig_mutations: Vec<(usize, u8)> = Vec::new();
+        let mut contig_mutations: Vec<(usize, u8, u8)> = Vec::new();
         // The length of this sequence
         let sequence_length = sequence.len();
         debug!("Sequence {} is {} bp long", name, sequence_length);
@@ -181,7 +178,7 @@ fn mutate_sequence(
             _ => 4
         };
         // add the location and alt base for the variant
-        sequence_variants.push((*index, mutated_record[*index],  *reference_base))
+        sequence_variants.push((*index, mutated_record[*index], reference_base))
     }
     (mutated_record, sequence_variants)
 }
\ No newline at end of file
diff --git a/src/utils/vcf_tools.rs b/src/utils/vcf_tools.rs
index 6261920..e9b85d6 100644
--- a/src/utils/vcf_tools.rs
+++ b/src/utils/vcf_tools.rs
@@ -22,8 +22,7 @@ fn genotype_to_string(genotype: Vec<usize>) -> String {
 }
 
 pub fn write_vcf(
-    reference_sequence: &HashMap<String, Vec<u8>>,
-    variant_locations: &HashMap<String, Vec<(usize, u8)>>,
+    variant_locations: &HashMap<String, Vec<(usize, u8, u8)>>,
     fasta_order: &Vec<String>,
     ploidy: usize,
     reference: &str,
@@ -32,9 +31,8 @@ pub fn write_vcf(
 ) -> io::Result<()> {
     /*
     Takes:
-        reference_sequence: A map of the contig names keyed to a u8 vector representing the sequence
         variant_locations: A map of contig names keyed to lists of variants in that contig
-            consisting of a tuple of (position, alt base).
+            consisting of a tuple of (position, alt base, ref base).
         fasta_order: A vector of contig names in the order of the reference fasta.
         ploidy: The number of copies of each chromosome present in the organism
         output_file_prefix: The path to the directory and the prefix to use for filenames
@@ -90,7 +88,7 @@ pub fn write_vcf(
             let line = format!("{}\t{}\t.\t{}\t{}\t37\tPASS\t.\tGT\t{}",
                     contig,
                     mutation.0 + 1,
-                    num_to_char(reference_sequence[contig][mutation.0]),
+                    num_to_char(mutation.2),
                     num_to_char(mutation.1),
                     genotype_to_string(genotype),
                 );

From 88d2ed41dc141293a8239dc84f665fef9c37e65d Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Thu, 14 Mar 2024 14:05:08 -0500
Subject: [PATCH 07/23] Finished paired ended. Need to run some tests

---
 src/utils/fastq_tools.rs | 60 ++++++++++++++++++++++++++++++++--------
 src/utils/make_reads.rs  | 46 ++++++------------------------
 2 files changed, 58 insertions(+), 48 deletions(-)

diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs
index f545ba2..b42df5b 100644
--- a/src/utils/fastq_tools.rs
+++ b/src/utils/fastq_tools.rs
@@ -4,6 +4,32 @@ use std::io;
 
 use utils::fasta_tools::sequence_array_to_string;
 
+fn complement(nucleotide: u8) -> u8 {
+    /*
+    0 = A, 1 = C, 2 = G, 3 = T,
+    matches with the complement of each nucleotide.
+     */
+    return match nucleotide {
+        0 => 3,
+        1 => 2,
+        2 => 1,
+        3 => 0,
+        _ => 4,
+    }
+}
+
+fn reverse_complement(sequence: &Vec<u8>) -> Vec<u8> {
+    /*
+    Returns the reverse complement of a vector of u8's representing a DNA sequence.
+     */
+    let length = sequence.len();
+    let mut rev_comp = Vec::new();
+    for i in (0..length).rev() {
+        rev_comp.push(complement(sequence[i]))
+    }
+    rev_comp
+}
+
 pub fn write_fastq(
     fastq_filename: &str,
     paired_ended: bool,
@@ -11,7 +37,8 @@ pub fn write_fastq(
 ) -> io::Result<()> {
     /*
     Takes:
-        fastq_filename: prefix for the output fastq files
+        fastq_filename: prefix for the output fastq files.
+        paired_ended: boolean to set paired ended mode on or off.
         dataset: List of u8 vectors representing dna sequences.
     returns:
         Error if there is a problem or else nothing.
@@ -26,25 +53,36 @@ pub fn write_fastq(
     let name_prefix = "neat_generated_".to_string();
     // vector to hold the fastq filenames, since we don't know ahead of time if we need one or two
     let mut fastq_files: Vec<String> = Vec::new();
-    fastq_files.push(String::from(fastq_filename) + "_r1.fastq");
-    if paired_ended {
-        fastq_files.push(String::from(fastq_filename) + "_r1.fastq");
-    }
-    // open the filename
-    let mut outfile = File::options().create_new(true).append(true).open(fastq_filename)?;
+    let filename1 = String::from(fastq_filename) + "_r1.fastq";
+    // open the file and append lines
+    let mut outfile1 = File::options().create_new(true).append(true).open(&filename1)?;
+    // setting up pairend ended reads For single ended reads, this will go unused.
+    let filename2 = String::from(fastq_filename) + "_r2.fastq";
+    // open the second file and append lines
+    let mut outfile2 = File::options().create_new(true).append(true).open(&filename2)?;
     // write out sequence by sequence using index to track the numbering
     let mut index = 1;
     for sequence in dataset {
         let sequence_len = sequence.len();
         // sequence name
-        writeln!(&mut outfile, "@{}", name_prefix.clone() + &index.to_string())?;
+        writeln!(&mut outfile1, "@{}/1", name_prefix.clone() + &index.to_string())?;
         // Array as a string
-        writeln!(&mut outfile, "{}", sequence_array_to_string(&sequence))?;
+        writeln!(&mut outfile1, "{}", sequence_array_to_string(&sequence))?;
         // The stupid plus sign
-        writeln!(&mut outfile, "+")?;
+        writeln!(&mut outfile1, "+")?;
         // Qual score of all F's for the whole thing.
-        writeln!(&mut outfile, "{}", std::iter::repeat("F").take(sequence_len).collect::<String>())?;
+        writeln!(&mut outfile1, "{}", std::iter::repeat("F").take(sequence_len).collect::<String>())?;
         index += 1;
+        if paired_ended {
+            // sequence name
+            writeln!(&mut outfile2, "@{}/2", name_prefix.clone() + &index.to_string())?;
+            // Array as a string
+            writeln!(&mut outfile2, "{}", sequence_array_to_string(&reverse_complement(&sequence)))?;
+            // The stupid plus sign
+            writeln!(&mut outfile2, "+")?;
+            // Qual score of all F's for the whole thing.
+            writeln!(&mut outfile2, "{}", std::iter::repeat("F").take(sequence_len).collect::<String>())?;
+        }
     };
     Ok(())
 }
\ No newline at end of file
diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs
index efe351d..451061e 100644
--- a/src/utils/make_reads.rs
+++ b/src/utils/make_reads.rs
@@ -98,32 +98,6 @@ fn cover_dataset(
     read_set
 }
 
-fn complement(nucleotide: u8) -> u8 {
-    /*
-    0 = A, 1 = C, 2 = G, 3 = T,
-    matches with the complement of each nucleotide.
-     */
-    return match nucleotide {
-        0 => 3,
-        1 => 2,
-        2 => 1,
-        3 => 0,
-        _ => 4,
-    }
-}
-
-fn reverse_complement(sequence: &Vec<u8>) -> Vec<u8> {
-    /*
-    Returns the reverse complement of a vector of u8's representing a DNA sequence.
-     */
-    let length = sequence.len();
-    let mut rev_comp = Vec::new();
-    for i in (0..length).rev() {
-        rev_comp.push(complement(sequence[i]))
-    }
-    rev_comp
-}
-
 pub fn generate_reads(
     mutated_sequence: &Vec<u8>,
     read_length: &usize,
@@ -147,22 +121,20 @@ pub fn generate_reads(
     complement int the output
      */
 
-    if paired_ended {
-        // generate the reverse complement of the fragment
-        let rev_comp = reverse_complement(mutated_sequence);
-    }
     let mut fragment_pool: Vec<usize> = Vec::new();
-    let num_frags = (mutated_sequence.len() / read_length) * (coverage * 2);
-    let mut fragment_distribution = Normal::new(mean, st_dev).unwrap();
-    // add fragments to the fragment pool
-    for _ in 0..num_frags {
-        let frag = fragment_distribution.sample(&mut rng).round() as usize;
-        fragment_pool.push(frag);
+    if paired_ended {
+        let num_frags = (mutated_sequence.len() / read_length) * (coverage * 2);
+        let mut fragment_distribution = Normal::new(mean, st_dev).unwrap();
+        // add fragments to the fragment pool
+        for _ in 0..num_frags {
+            let frag = fragment_distribution.sample(&mut rng).round() as usize;
+            fragment_pool.push(frag);
+        }
     }
 
     // set up some defaults and storage
     let mut read_set: HashSet<Vec<u8>> = HashSet::new();
-
+    // length of the mutated sequence
     let seq_len = mutated_sequence.len();
     // Generate a vector of read positions
     let read_positions: Vec<(usize, usize)> = cover_dataset(

From 39510258f51edd1a1e58d9a604244d3531de8f43 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Fri, 15 Mar 2024 21:30:19 -0500
Subject: [PATCH 08/23] Can now create paired-ended reads!

---
 src/utils/fasta_tools.rs | 20 +++++++-------------
 src/utils/fastq_tools.rs |  7 ++++---
 src/utils/make_reads.rs  | 14 ++++++--------
 3 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs
index 70833e6..2bf7e17 100644
--- a/src/utils/fasta_tools.rs
+++ b/src/utils/fasta_tools.rs
@@ -2,7 +2,6 @@ extern crate rand;
 extern crate log;
 extern crate assert_fs;
 
-use std::collections::HashMap;
 use crate::utils::file_tools::read_lines;
 use self::log::info;
 use std::fs::File;
@@ -10,6 +9,7 @@ use std::{fs, io};
 use std::io::Write;
 use std::*;
 use std::os::unix::fs::MetadataExt;
+use HashMap;
 
 pub fn char_to_num(char_of_interest: char) -> u8 {
     return match char_of_interest {
@@ -149,27 +149,21 @@ fn test_read_fasta() {
 fn test_write_fasta() -> Result<(), Box<dyn error::Error>> {
 
     let seq1: Vec<u8> = vec![0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1];
-    let fasta_output: HashMap<String, Vec<u8>> = HashMap::from(
+    let fasta_output: HashMap<String, Vec<u8>> = HashMap::from([
         (String::from("H1N1_HA"), seq1)
-    );
+    ]);
     let fasta_pointer = Box::new(fasta_output);
     let fasta_order = vec![String::from("H1N1_HA")];
     let output_file = "test";
-    let ploidy: usize = 2;
     let test_write = write_fasta(
         &fasta_pointer,
         &fasta_order,
-        output_file,
-        ploidy
+        output_file
     );
-    let file_name_p1 = "test_p1.fasta";
-    let file_name_p2 = "test_p2.fasta";
+    let file_name = "test.fasta";
     assert_eq!(test_write.unwrap(), ());
-    let attr = fs::metadata(file_name_p1).unwrap();
-    assert!(attr.size() > 0);
-    let attr = fs::metadata(file_name_p2).unwrap();
+    let attr = fs::metadata(file_name).unwrap();
     assert!(attr.size() > 0);
-    fs::remove_file(file_name_p1)?;
-    fs::remove_file(file_name_p2)?;
+    fs::remove_file(file_name)?;
     Ok(())
 }
\ No newline at end of file
diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs
index b42df5b..450a06b 100644
--- a/src/utils/fastq_tools.rs
+++ b/src/utils/fastq_tools.rs
@@ -1,6 +1,6 @@
 use std::io::Write;
 use std::fs::File;
-use std::io;
+use std::{fs, io};
 
 use utils::fasta_tools::sequence_array_to_string;
 
@@ -51,8 +51,6 @@ pub fn write_fastq(
     // (Although this feature is currently untested and unknown).
     // (May need sorting.)
     let name_prefix = "neat_generated_".to_string();
-    // vector to hold the fastq filenames, since we don't know ahead of time if we need one or two
-    let mut fastq_files: Vec<String> = Vec::new();
     let filename1 = String::from(fastq_filename) + "_r1.fastq";
     // open the file and append lines
     let mut outfile1 = File::options().create_new(true).append(true).open(&filename1)?;
@@ -84,5 +82,8 @@ pub fn write_fastq(
             writeln!(&mut outfile2, "{}", std::iter::repeat("F").take(sequence_len).collect::<String>())?;
         }
     };
+    if !paired_ended {
+        fs::remove_file(filename2)?;
+    }
     Ok(())
 }
\ No newline at end of file
diff --git a/src/utils/make_reads.rs b/src/utils/make_reads.rs
index 451061e..a4aba9a 100644
--- a/src/utils/make_reads.rs
+++ b/src/utils/make_reads.rs
@@ -72,14 +72,12 @@ fn cover_dataset(
         read_set.push((start, temp_end));
 
         // insert size is the number of bases between reads in the fragment for paired ended reads
-        let insert_size = fragment_length - (read_length * 2);
-        // if these are singled ended reads, then the insert size will always be negative
-        if insert_size > 0 {
-            // if there's uncovered bases in between the reads on paired ended reads, we'll add
+        // if these are singled ended reads, then the insert size will always be -read_length
+        if fragment_length > (read_length * 2) {
+            // if there's any insert size on paired ended reads, we'll add
             // that to the gap to ensure adequate coverage.
-            gap_size += insert_size;
-        }
-
+            gap_size += fragment_length - (read_length * 2)
+        };
         // Picks a number between zero and a quarter of a read length
         let wildcard: usize = (rng.next_u32() % (read_length/4) as u32) as usize;
         // adds to the start to give it some spice
@@ -124,7 +122,7 @@ pub fn generate_reads(
     let mut fragment_pool: Vec<usize> = Vec::new();
     if paired_ended {
         let num_frags = (mutated_sequence.len() / read_length) * (coverage * 2);
-        let mut fragment_distribution = Normal::new(mean, st_dev).unwrap();
+        let fragment_distribution = Normal::new(mean, st_dev).unwrap();
         // add fragments to the fragment pool
         for _ in 0..num_frags {
             let frag = fragment_distribution.sample(&mut rng).round() as usize;

From 9d0513f7d111647ba97df1390f710c286614bd63 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Sat, 16 Mar 2024 10:43:29 -0500
Subject: [PATCH 09/23] Updated version number, almost ready to merge to main

---
 Cargo.lock | 2 +-
 Cargo.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e40cda8..4233824 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -640,7 +640,7 @@ dependencies = [
 
 [[package]]
 name = "rusty-neat"
-version = "0.1.0"
+version = "0.2.0"
 dependencies = [
  "assert_fs",
  "chrono",
diff --git a/Cargo.toml b/Cargo.toml
index 6ef81fe..10feb57 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rusty-neat"
-version = "0.1.0"
+version = "0.2.0"
 authors = ["Joshua Allen <jallen17@illinois.edu>"]
 
 [dependencies]

From 1c38c3726a87724e03d52afe0d62129c39a572e6 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Sat, 16 Mar 2024 16:01:17 -0500
Subject: [PATCH 10/23] Added ability to overwrite the output with a config
 option

---
 config/simple_template.yml | 14 +++++------
 src/main.rs                | 49 +++++++++++++++++++++++++++++++-------
 src/utils/cli.rs           |  7 ++++++
 src/utils/config.rs        | 22 ++++++++++++++++-
 src/utils/fasta_tools.rs   |  6 +++--
 src/utils/fastq_tools.rs   | 10 ++++----
 src/utils/file_tools.rs    | 25 +++++++++++++++++--
 src/utils/neat_logger.rs   |  0
 src/utils/vcf_tools.rs     |  4 +++-
 9 files changed, 112 insertions(+), 25 deletions(-)
 create mode 100644 src/utils/neat_logger.rs

diff --git a/config/simple_template.yml b/config/simple_template.yml
index 3c7063e..3af3026 100644
--- a/config/simple_template.yml
+++ b/config/simple_template.yml
@@ -1,10 +1,8 @@
 reference: REQUIRED
 read_len: .
 coverage: 20
-output_dir: /home/joshfactorial/code/rust_neat_outputs/
-output_prefix: neat_test_1
+mutation_rate: .
 
-# Below are not yet active
 ploidy: .
 paired_ended: .
 fragment_mean: .
@@ -15,6 +13,11 @@ produce_vcf: .
 produce_fasta: .
 produce_fastq: .
 
+overwrite_output: .
+output_dir: .
+output_prefix: .
+
+# Below are not yet active
 error_model: .
 mutation_model: .
 fragment_model: .
@@ -29,10 +32,7 @@ include_vcf: .
 target_bed: .
 off_target_scalar: .
 discard_bed: .
-mutation_rate: .
 mutation_bed: .
 no_coverage_bias: .
 rng_seed: .
-min_mutations: .
-fasta_per_ploid: .
-overwrite_output: .
+min_mutations: .
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index b1a380d..6620453 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -9,11 +9,13 @@ extern crate itertools;
 mod utils;
 
 use std::collections::{HashMap, HashSet};
+use std::fs::File;
 use clap::{Parser};
 use log::*;
 use simplelog::*;
 use rand::thread_rng;
 use rand::prelude::*;
+use std::path::Path;
 
 use utils::cli;
 use utils::fasta_tools::{read_fasta, write_fasta};
@@ -21,22 +23,50 @@ use utils::config::{read_config_yaml, build_config_from_args};
 use utils::mutate::mutate_fasta;
 use utils::make_reads::generate_reads;
 use utils::fastq_tools::write_fastq;
+use utils::file_tools::check_parent_and_create;
 use utils::vcf_tools::write_vcf;
 
 fn main() {
-    TermLogger::init(
-        LevelFilter::Trace,
-        Config::default(),
-        TerminalMode::Stdout,
-        ColorChoice::Auto,
-    ).unwrap();
-
-    let mut rng = thread_rng();
 
     info!("Begin processing");
     // parse the arguments from the command line
     let args = cli::Cli::parse();
 
+    let level_filter = match args.log_level.to_lowercase().as_str() {
+        "trace" => LevelFilter::Trace,
+        "debug" => LevelFilter::Debug,
+        "info" => LevelFilter::Info,
+        "warn" => LevelFilter::Warn,
+        "error" => LevelFilter::Error,
+        "off" => LevelFilter::Off,
+        _ => panic!(
+            "Unknown log level, please set to one of \
+            Trace, Debug, Info, Warn, Error, or Off (case insensitive)."
+        )
+    };
+
+    // Check that the parent dir exists
+    let log_destination = check_parent_and_create(&args.log_dest).unwrap();
+
+    CombinedLogger::init(vec![
+        #[cfg(feature = "termcolor")]
+        TermLogger::new(
+            level_filter,
+            Config::default(),
+            TerminalMode::Stdout,
+            ColorChoice::Always,
+        ),
+        #[cfg(not(feature = "termcolor"))]
+        SimpleLogger::new(LevelFilter::Trace, Config::default()),
+        WriteLogger::new(
+            level_filter,
+            Config::default(),
+            File::create(log_destination).unwrap(),
+        )
+    ]).unwrap();
+
+    let mut rng = thread_rng();
+
     // set up the config struct based on whether there was an input config. Input config
     // overrides any other inputs.
     let config = if args.config != "" {
@@ -67,6 +97,7 @@ fn main() {
         write_fasta(
             &mutated_map,
             &fasta_order,
+            config.overwrite_output,
             &output_file,
         ).expect("Problem writing fasta file");
     }
@@ -78,6 +109,7 @@ fn main() {
             &fasta_order,
             config.ploidy,
             &config.reference,
+            config.overwrite_output,
             &output_file,
             &mut rng).expect("Error writing vcf file")
     }
@@ -108,6 +140,7 @@ fn main() {
         info!("Writing fastq");
         write_fastq(
             &output_file,
+            config.overwrite_output,
             config.paired_ended,
             *outsets,
         ).expect("Problem writing fastq file");
diff --git a/src/utils/cli.rs b/src/utils/cli.rs
index e61d1ff..5840416 100644
--- a/src/utils/cli.rs
+++ b/src/utils/cli.rs
@@ -1,5 +1,6 @@
 extern crate clap;
 
+use std::fs;
 use clap::Parser;
 
 #[derive(Parser, Debug)]
@@ -31,4 +32,10 @@ pub struct Cli {
     pub read_length: usize,
     #[arg(short='c', long="coverage", default_value_t = 10)]
     pub coverage: usize,
+
+    // These options relate to the logging features
+    #[arg(long="log-level", default_value_t=String::from("Trace"), help="Enter one of Trace, Debug, Info, Warn, Error, Off")]
+    pub log_level: String,
+    #[arg(long="log-dest", default_value_t=String::new(), help="Full path and name to log file")]
+    pub log_dest: String,
 }
\ No newline at end of file
diff --git a/src/utils/config.rs b/src/utils/config.rs
index 27befc3..83584e9 100644
--- a/src/utils/config.rs
+++ b/src/utils/config.rs
@@ -3,6 +3,7 @@ extern crate log;
 use std::collections::{HashMap};
 use std::env;
 use std::string::String;
+use clap::builder::Str;
 use crate::utils::cli::Cli;
 use log::{debug, error};
 use serde_yaml::Error;
@@ -29,6 +30,8 @@ pub struct RunConfiguration {
     produce_vcf: True or false on whether to produce an output VCF file, with genotyped variants.
     produce_bam: True or false on whether to produce an output BAM file, which will be aligned to
         the reference.
+    overwrite_output: if true, will overwrite output. If false will error and exit you attempt to
+        overwrite files with the same name.
     output_dir: The directory, relative or absolute, path to the directory to place output.
     output_prefix: The name to use for the output files.
      */
@@ -44,10 +47,10 @@ pub struct RunConfiguration {
     pub produce_fasta: bool,
     pub produce_vcf:  bool,
     pub produce_bam: bool,
+    pub overwrite_output: bool,
     pub output_dir: String,
     pub output_prefix: String,
 }
-
 #[allow(dead_code)]
 impl RunConfiguration {
     // The purpose of this function is to redirect you to the ConfigBuilder
@@ -72,6 +75,7 @@ pub struct ConfigBuilder {
     produce_fasta: bool,
     produce_vcf:  bool,
     produce_bam: bool,
+    overwrite_output: bool,
     output_dir: String,
     output_prefix: String,
 }
@@ -102,6 +106,7 @@ impl ConfigBuilder {
             produce_fasta: false,
             produce_vcf: false,
             produce_bam: false,
+            overwrite_output: false,
             output_dir: current_dir,
             output_prefix: String::from("neat_out"),
         }
@@ -168,6 +173,11 @@ impl ConfigBuilder {
         self
     }
 
+    pub fn set_overwrite_output(mut self) -> ConfigBuilder {
+        self.overwrite_output = true;
+        self
+    }
+
     pub fn set_output_dir(mut self, output_dir: String) -> ConfigBuilder {
         self.output_dir = String::from(output_dir);
         self
@@ -235,6 +245,7 @@ impl ConfigBuilder {
             produce_fasta: self.produce_fasta,
             produce_vcf: self.produce_vcf,
             produce_bam: self.produce_bam,
+            overwrite_output: self.overwrite_output,
             output_dir: self.output_dir,
             output_prefix: self.output_prefix,
         }
@@ -327,6 +338,13 @@ pub fn read_config_yaml(yaml: String) -> Result<Box<RunConfiguration>, ConfigErr
                     config_builder = config_builder.set_produce_bam(true)
                 }
             },
+            "overwrite_output" => {
+                // overwrite_output is false by default, to prevent data loss, setting this flag
+                // will instead enable rusty-neat to overwrite the output
+                if value.to_lowercase() == "true" {
+                    config_builder = config_builder.set_overwrite_output()
+                }
+            }
             "output_dir" => {
                 if value != ".".to_string() {
                     config_builder = config_builder.set_output_dir(value)
@@ -386,6 +404,8 @@ fn test_command_line_inputs() {
         config: String::new(),
         reference: String::from("data/ecoli.fa"),
         output_dir: String::from("test_dir"),
+        log_level: String::from("Trace"),
+        log_dest: String::new(),
         output_file_prefix: String::from("test"),
         read_length: 150,
         coverage: 10,
diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs
index 2bf7e17..01d5f1d 100644
--- a/src/utils/fasta_tools.rs
+++ b/src/utils/fasta_tools.rs
@@ -10,6 +10,7 @@ use std::io::Write;
 use std::*;
 use std::os::unix::fs::MetadataExt;
 use HashMap;
+use utils::file_tools::open_file;
 
 pub fn char_to_num(char_of_interest: char) -> u8 {
     return match char_of_interest {
@@ -90,6 +91,7 @@ pub fn read_fasta(
 pub fn write_fasta(
     fasta_output: &Box<HashMap<String, Vec<u8>>>,
     fasta_order: &Vec<String>,
+    overwrite_output: bool,
     output_file: &str,
 ) -> io::Result<()> {
     /*
@@ -102,8 +104,8 @@ pub fn write_fasta(
         Errors if there is a problem writing the file, otherwise it returns nothing.
      */
     // writing fasta output to files
-    let output_fasta = format!("{}.fasta", output_file);
-    let mut outfile = File::options().create_new(true).append(true).open(output_fasta)?;
+    let mut output_fasta = format!("{}.fasta", output_file);
+    let mut outfile = open_file(&mut output_fasta, overwrite_output);
     for contig in fasta_order {
         let sequence = &fasta_output[contig];
         // Write contig name
diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs
index 450a06b..32a5e65 100644
--- a/src/utils/fastq_tools.rs
+++ b/src/utils/fastq_tools.rs
@@ -3,6 +3,7 @@ use std::fs::File;
 use std::{fs, io};
 
 use utils::fasta_tools::sequence_array_to_string;
+use utils::file_tools::open_file;
 
 fn complement(nucleotide: u8) -> u8 {
     /*
@@ -32,6 +33,7 @@ fn reverse_complement(sequence: &Vec<u8>) -> Vec<u8> {
 
 pub fn write_fastq(
     fastq_filename: &str,
+    overwrite_output: bool,
     paired_ended: bool,
     dataset: Vec<&Vec<u8>>,
 ) -> io::Result<()> {
@@ -51,13 +53,13 @@ pub fn write_fastq(
     // (Although this feature is currently untested and unknown).
     // (May need sorting.)
     let name_prefix = "neat_generated_".to_string();
-    let filename1 = String::from(fastq_filename) + "_r1.fastq";
+    let mut filename1 = String::from(fastq_filename) + "_r1.fastq";
     // open the file and append lines
-    let mut outfile1 = File::options().create_new(true).append(true).open(&filename1)?;
+    let mut outfile1 = open_file(&mut filename1, overwrite_output);
     // setting up pairend ended reads For single ended reads, this will go unused.
-    let filename2 = String::from(fastq_filename) + "_r2.fastq";
+    let mut filename2 = String::from(fastq_filename) + "_r2.fastq";
     // open the second file and append lines
-    let mut outfile2 = File::options().create_new(true).append(true).open(&filename2)?;
+    let mut outfile2 = open_file(&mut filename2, overwrite_output);
     // write out sequence by sequence using index to track the numbering
     let mut index = 1;
     for sequence in dataset {
diff --git a/src/utils/file_tools.rs b/src/utils/file_tools.rs
index e540756..eba4043 100644
--- a/src/utils/file_tools.rs
+++ b/src/utils/file_tools.rs
@@ -1,9 +1,30 @@
-use std::fs::File;
+use std::fs::{File, OpenOptions};
 use std::io;
-use std::io::BufRead;
+use std::io::{BufRead, Error};
+use std::path::Path;
+use log::error;
 
 pub fn read_lines(filename: &str) -> io::Result<io::Lines<io::BufReader<File>>> {
     // This creates a buffer to read lines
     let file = File::open(filename)?;
     Ok(io::BufReader::new(file).lines())
+}
+
+pub fn open_file(mut filename: &mut str, overwrite_file: bool) -> File {
+    if overwrite_file {
+        File::options().create(true).write(true).open(&mut filename).unwrap()
+    } else {
+        File::options().create_new(true).append(true).open(&mut filename).unwrap()
+    }
+}
+
+pub fn check_parent_and_create(filename: &str) -> io::Result<&Path> {
+    // checks that the parent dir exists and then if so creates the file object open
+    // and ready to write
+    let file_path = Path::new(filename);
+    if !file_path.parent().unwrap().exists() {
+        error!("Path to log file not found!");
+        assert!(file_path.parent().unwrap().exists())
+    }
+    Ok(file_path)
 }
\ No newline at end of file
diff --git a/src/utils/neat_logger.rs b/src/utils/neat_logger.rs
new file mode 100644
index 0000000..e69de29
diff --git a/src/utils/vcf_tools.rs b/src/utils/vcf_tools.rs
index e9b85d6..9fc5e36 100644
--- a/src/utils/vcf_tools.rs
+++ b/src/utils/vcf_tools.rs
@@ -8,6 +8,7 @@ use rand::Rng;
 use rand::rngs::ThreadRng;
 use rand::seq::IndexedRandom;
 use utils::fasta_tools::num_to_char;
+use utils::file_tools::open_file;
 
 fn genotype_to_string(genotype: Vec<usize>) -> String {
     /*
@@ -26,6 +27,7 @@ pub fn write_vcf(
     fasta_order: &Vec<String>,
     ploidy: usize,
     reference: &str,
+    overwrite_output: bool,
     output_file_prefix: &str,
     mut rng: &mut ThreadRng,
 ) -> io::Result<()> {
@@ -44,7 +46,7 @@ pub fn write_vcf(
     let ploid_numbers: Vec<usize> = (1..ploidy+1).collect();
     // set the filename of the output vcf
     let mut filename = format!("{}.vcf", output_file_prefix);
-    let mut outfile = File::options().create_new(true).append(true).open(&mut filename)?;
+    let mut outfile = open_file(&mut filename, overwrite_output);
     // add the vcf header
     writeln!(&mut outfile, "##fileformat=VCFv4.1")?;
     writeln!(&mut outfile, "##reference={}", reference)?;

From a5f92ec1eeb68a847d6490f25399bf31906b297e Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Sat, 16 Mar 2024 22:31:57 -0500
Subject: [PATCH 11/23] Cleaned up the warnings and some of the code. Still
 need to add more unit tests

---
 src/main.rs              |  2 --
 src/utils/cli.rs         | 22 ++++++++++++++++---
 src/utils/config.rs      |  7 +++---
 src/utils/fasta_tools.rs | 46 +++++++++++++++++++---------------------
 src/utils/fastq_tools.rs |  1 -
 src/utils/file_tools.rs  |  4 ++--
 src/utils/mutate.rs      | 25 ++++++++--------------
 src/utils/vcf_tools.rs   |  1 -
 8 files changed, 56 insertions(+), 52 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 6620453..70907ee 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,7 +15,6 @@ use log::*;
 use simplelog::*;
 use rand::thread_rng;
 use rand::prelude::*;
-use std::path::Path;
 
 use utils::cli;
 use utils::fasta_tools::{read_fasta, write_fasta};
@@ -128,7 +127,6 @@ fn main() {
             &mut rng
         );
 
-        // todo: we need to keep track of these reads better for paired endedness
         read_sets.extend(*data_set);
     }
 
diff --git a/src/utils/cli.rs b/src/utils/cli.rs
index 5840416..a6520e2 100644
--- a/src/utils/cli.rs
+++ b/src/utils/cli.rs
@@ -1,6 +1,10 @@
+/// This is a pretty basic implementation of Clap CLI
+/// The idea of this interface and supporting code is that the user can enter an optional
+/// config file that will take the place of the other command line options, except the logging
+/// features, which are handled separately, outside of the run configuration parsing.
+
 extern crate clap;
 
-use std::fs;
 use clap::Parser;
 
 #[derive(Parser, Debug)]
@@ -12,6 +16,7 @@ pub struct Cli {
 
     config <String> = the full path to a configuration yaml file. If entered, it will override
         all other command line inputs. No default.
+
     reference <String> = The relative path or full path to the reference file. Must be in fasta
         file format. Default "data/H1N1.fa"
     output_dir <String> = The directory where output files will be written. If nothing is entered,
@@ -19,9 +24,20 @@ pub struct Cli {
     output_file_prefix <String> = output files will start with this name. Default = neat_out
     read_length <usize> = the length of the reads in the output fastq file. Default = 150
     coverage <usize> = The average depth per read of the fastq files. Default = 10
+
+    The following commands are independent of the config and not affected by it one way or another:
+    log_level <String> = Set a log level for the run. Everything at and above the level chosen will
+        be displayed in both logs. See simplelog docs for more info:
+        https://docs.rs/simplelog/latest/simplelog/enum.Level.html
+    log_dest <String> = Full path filename where to write the log. The default is current working
+        dir, filename "neat_out.log," which is set during config parsing.
      */
-    #[arg(short='C', long="configuration_yaml", default_value_t=String::new())]
+    #[arg(short='C', long="configuration_yaml", default_value_t=String::new(),
+    help="Enter a full path and filename to a configuration file. \
+    This will override most other options")]
     pub config: String,
+
+    // All of these arguments are overridden by the config file
     #[arg(short='r', long="reference", default_value_t=String::from("data/H1N1.fa"))]
     pub reference: String,
     #[arg(short='o', long="output_dir", default_value_t=String::new())]
@@ -33,7 +49,7 @@ pub struct Cli {
     #[arg(short='c', long="coverage", default_value_t = 10)]
     pub coverage: usize,
 
-    // These options relate to the logging features
+    // These options relate to the logging features and are not overridden by a config
     #[arg(long="log-level", default_value_t=String::from("Trace"), help="Enter one of Trace, Debug, Info, Warn, Error, Off")]
     pub log_level: String,
     #[arg(long="log-dest", default_value_t=String::new(), help="Full path and name to log file")]
diff --git a/src/utils/config.rs b/src/utils/config.rs
index 83584e9..2265b16 100644
--- a/src/utils/config.rs
+++ b/src/utils/config.rs
@@ -3,10 +3,8 @@ extern crate log;
 use std::collections::{HashMap};
 use std::env;
 use std::string::String;
-use clap::builder::Str;
 use crate::utils::cli::Cli;
-use log::{debug, error};
-use serde_yaml::Error;
+use log::{debug, warn, error};
 
 #[derive(Debug)]
 pub struct RunConfiguration {
@@ -195,6 +193,9 @@ impl ConfigBuilder {
         debug!("\t> mutation rate: {}", self.mutation_rate);
         debug!("\t> ploidy: {}", self.ploidy);
         debug!("\t> paired_ended: {}", self.paired_ended);
+        if self.overwrite_output {
+            warn!("Overwriting any existing files.")
+        }
         let file_prefix = format!("{}/{}", self.output_dir, self.output_prefix);
         if !(self.produce_fastq | self.produce_fasta | self.produce_vcf | self.produce_bam) {
             error!("All file types set to false, no files would be produced.");
diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs
index 01d5f1d..cc3f5ea 100644
--- a/src/utils/fasta_tools.rs
+++ b/src/utils/fasta_tools.rs
@@ -4,11 +4,9 @@ extern crate assert_fs;
 
 use crate::utils::file_tools::read_lines;
 use self::log::info;
-use std::fs::File;
-use std::{fs, io};
+use std::io;
 use std::io::Write;
 use std::*;
-use std::os::unix::fs::MetadataExt;
 use HashMap;
 use utils::file_tools::open_file;
 
@@ -63,28 +61,27 @@ pub fn read_fasta(
     let mut fasta_order: Vec<String> = Vec::new();
     let mut current_key = String::new();
 
-    if let lines = read_lines(fasta_path).expect("Error reading fasta") {
-        let mut temp_seq: Vec<u8> = vec![];
-        lines.for_each(|line| match line {
-            Ok(l) => {
-                if l.starts_with('>') {
-                    if !current_key.is_empty() {
-                        fasta_map.entry(current_key.clone()).or_insert(temp_seq.clone());
-                    }
-                    current_key = String::from(l.strip_prefix('>').unwrap());
-                    fasta_order.push(current_key.clone());
-                    temp_seq = vec![];
-                } else {
-                    for char in l.chars() {
-                        temp_seq.push(char_to_num(char));
-                    }
+    let lines = read_lines(fasta_path).unwrap();
+    let mut temp_seq: Vec<u8> = vec![];
+    lines.for_each(|line| match line {
+        Ok(l) => {
+            if l.starts_with('>') {
+                if !current_key.is_empty() {
+                    fasta_map.entry(current_key.clone()).or_insert(temp_seq.clone());
                 }
-            },
-            Err(error) => panic!("Error reading fasta file: {:?}", error),
-        });
-        // Need to pick up the last one
-        fasta_map.entry(current_key.clone()).or_insert(temp_seq.clone());
-    }
+                current_key = String::from(l.strip_prefix('>').unwrap());
+                fasta_order.push(current_key.clone());
+                temp_seq = vec![];
+            } else {
+                for char in l.chars() {
+                    temp_seq.push(char_to_num(char));
+                }
+            }
+        },
+        Err(error) => panic!("Error reading fasta file: {:?}", error),
+    });
+    // Need to pick up the last one
+    fasta_map.entry(current_key.clone()).or_insert(temp_seq.clone());
     (Box::new(fasta_map), fasta_order)
 }
 
@@ -160,6 +157,7 @@ fn test_write_fasta() -> Result<(), Box<dyn error::Error>> {
     let test_write = write_fasta(
         &fasta_pointer,
         &fasta_order,
+        true,
         output_file
     );
     let file_name = "test.fasta";
diff --git a/src/utils/fastq_tools.rs b/src/utils/fastq_tools.rs
index 32a5e65..4d82443 100644
--- a/src/utils/fastq_tools.rs
+++ b/src/utils/fastq_tools.rs
@@ -1,5 +1,4 @@
 use std::io::Write;
-use std::fs::File;
 use std::{fs, io};
 
 use utils::fasta_tools::sequence_array_to_string;
diff --git a/src/utils/file_tools.rs b/src/utils/file_tools.rs
index eba4043..cc950fb 100644
--- a/src/utils/file_tools.rs
+++ b/src/utils/file_tools.rs
@@ -1,6 +1,6 @@
-use std::fs::{File, OpenOptions};
+use std::fs::File;
 use std::io;
-use std::io::{BufRead, Error};
+use std::io::BufRead;
 use std::path::Path;
 use log::error;
 
diff --git a/src/utils/mutate.rs b/src/utils/mutate.rs
index 762e64c..20894ca 100644
--- a/src/utils/mutate.rs
+++ b/src/utils/mutate.rs
@@ -1,5 +1,5 @@
+use std::cmp::max;
 use std::collections::HashMap;
-use rand::distributions::Distribution;
 use rand::prelude::{ThreadRng, IndexedRandom};
 use rand::seq::SliceRandom;
 use rand::Rng;
@@ -77,7 +77,7 @@ pub fn mutate_fasta(
 
     for (name, sequence) in file_struct {
         // Mutations for this contig
-        let mut contig_mutations: Vec<(usize, u8, u8)> = Vec::new();
+        let contig_mutations: Vec<(usize, u8, u8)>;
         // The length of this sequence
         let sequence_length = sequence.len();
         debug!("Sequence {} is {} bp long", name, sequence_length);
@@ -94,20 +94,13 @@ pub fn mutate_fasta(
             // add or subtract up to 10% of the reads.
             rough_num_positions * (sign * factor)
         };
-        // round the number of positions to the nearest usize.
-        let num_positions = rough_num_positions.round() as usize;
-        // If we somehow ended up with negative or no reads, we may still randomly add one mutation.
-        if num_positions <= 0 {
-            // we want to add at least 1 mutation to each contig
-            (mutated_record, contig_mutations) = mutate_sequence(
-                mutated_record, 1, &mut rng
-            );
-        } else {
-            // else add num positions number of variants
-            (mutated_record, contig_mutations) = mutate_sequence(
-                mutated_record, num_positions, &mut rng
-            );
-        }
+        // Round the number of positions to the nearest usize.
+        // If negative or no reads, we still want at least 1 mutation per contig.
+        let num_positions = max(1, rough_num_positions.round() as usize);
+
+        (mutated_record, contig_mutations) = mutate_sequence(
+            mutated_record, num_positions, &mut rng
+        );
         // Add to the return struct and variants map.
         return_struct.entry(name.clone()).or_insert(mutated_record.clone());
         all_variants.entry(name.clone()).or_insert(contig_mutations);
diff --git a/src/utils/vcf_tools.rs b/src/utils/vcf_tools.rs
index 9fc5e36..fa0fafc 100644
--- a/src/utils/vcf_tools.rs
+++ b/src/utils/vcf_tools.rs
@@ -3,7 +3,6 @@ extern crate log;
 use std::collections::HashMap;
 use std::io;
 use std::io::Write;
-use std::fs::File;
 use rand::Rng;
 use rand::rngs::ThreadRng;
 use rand::seq::IndexedRandom;

From 2010879fc863dc083d233bdea59ece3cbbdcd529 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Sun, 17 Mar 2024 00:09:51 -0500
Subject: [PATCH 12/23] Developing tests

---
 src/utils/cli.rs         |  11 ++-
 src/utils/config.rs      | 149 ++++++++++++++++++++++++++++++---------
 src/utils/fasta_tools.rs |   2 +-
 3 files changed, 125 insertions(+), 37 deletions(-)

diff --git a/src/utils/cli.rs b/src/utils/cli.rs
index a6520e2..2cb74fc 100644
--- a/src/utils/cli.rs
+++ b/src/utils/cli.rs
@@ -1,11 +1,14 @@
 /// This is a pretty basic implementation of Clap CLI
 /// The idea of this interface and supporting code is that the user can enter an optional
 /// config file that will take the place of the other command line options, except the logging
-/// features, which are handled separately, outside of the run configuration parsing.
+/// features, which are handled separately. Either way, these options are read into a configuration
+/// struct that holds the variables for the run. Logging, meanwhile, is handled separately,
+/// outside run configuration parsing.
 
 extern crate clap;
 
 use clap::Parser;
+use std::env;
 
 #[derive(Parser, Debug)]
 pub struct Cli {
@@ -52,6 +55,8 @@ pub struct Cli {
     // These options relate to the logging features and are not overridden by a config
     #[arg(long="log-level", default_value_t=String::from("Trace"), help="Enter one of Trace, Debug, Info, Warn, Error, Off")]
     pub log_level: String,
-    #[arg(long="log-dest", default_value_t=String::new(), help="Full path and name to log file")]
+    #[arg(long="log-dest", default_value_t=env::current_dir().unwrap().display().to_string() + "neat_out.log", help="Full path and name to log file")]
     pub log_dest: String,
-}
\ No newline at end of file
+}
+
+// Tests are handled in other places.
\ No newline at end of file
diff --git a/src/utils/config.rs b/src/utils/config.rs
index 2265b16..495202a 100644
--- a/src/utils/config.rs
+++ b/src/utils/config.rs
@@ -1,7 +1,6 @@
 extern crate log;
 
 use std::collections::{HashMap};
-use std::env;
 use std::string::String;
 use crate::utils::cli::Cli;
 use log::{debug, warn, error};
@@ -86,10 +85,6 @@ pub enum ConfigError {
 
 impl ConfigBuilder {
     pub fn new() -> ConfigBuilder {
-        let current_dir = env::current_dir()
-            .expect("Could not find current working directory")
-            .display()
-            .to_string();
         ConfigBuilder {
             // Setting default values
             reference: String::from("data/H1N1.fa"),
@@ -105,7 +100,7 @@ impl ConfigBuilder {
             produce_vcf: false,
             produce_bam: false,
             overwrite_output: false,
-            output_dir: current_dir,
+            output_dir: String::from(""),
             output_prefix: String::from("neat_out"),
         }
     }
@@ -188,11 +183,11 @@ impl ConfigBuilder {
 
     pub fn check_and_print_config(&self) -> Result<(), ConfigError> {
         debug!("Running rusty-neat to generate reads on {} with...", self.reference);
-        debug!("\t> read length: {}", self.read_len);
-        debug!("\t> coverage: {}", self.coverage);
-        debug!("\t> mutation rate: {}", self.mutation_rate);
-        debug!("\t> ploidy: {}", self.ploidy);
-        debug!("\t> paired_ended: {}", self.paired_ended);
+        debug!("  >read length: {}", self.read_len);
+        debug!("  >coverage: {}", self.coverage);
+        debug!("  >mutation rate: {}", self.mutation_rate);
+        debug!("  >ploidy: {}", self.ploidy);
+        debug!("  >paired_ended: {}", self.paired_ended);
         if self.overwrite_output {
             warn!("Overwriting any existing files.")
         }
@@ -277,6 +272,9 @@ pub fn read_config_yaml(yaml: String) -> Result<Box<RunConfiguration>, ConfigErr
             "reference" => {
                 if value != ".".to_string() {
                     config_builder = config_builder.set_reference(value)
+                } else {
+                    error!("Reference is required.");
+                    return Err(ConfigError::ConfigurationError)
                 }
             },
             "read_len" => {
@@ -371,9 +369,12 @@ pub fn build_config_from_args(args: Cli) -> Result<Box<RunConfiguration>, &'stat
 
     // Create the ConfigBuilder object with default values
     let mut config_builder = ConfigBuilder::new();
-    // The default assigned by CLI will keep it as data/H1N1.fa
+    // Can't do a run without a reference
     if args.reference != "" {
         config_builder = config_builder.set_reference(args.reference)
+    } else {
+        error!("No reference specified");
+        return Err("Reference File Error.");
     }
     // The default value works directly for the config builder
     config_builder = config_builder.set_read_len(args.read_length);
@@ -391,27 +392,109 @@ pub fn build_config_from_args(args: Cli) -> Result<Box<RunConfiguration>, &'stat
     Ok(Box::new(config_builder.build()))
 }
 
-#[test]
-fn test_read_config_yaml() {
-    let yaml = String::from("config/neat_test.yml");
-    let test_config = read_config_yaml(yaml).unwrap();
-    assert_eq!(test_config.reference, "data/ecoli.fa".to_string());
-    assert_eq!(test_config.coverage, 10);
-}
+#[cfg(test)]
+mod tests {
+    use super::*;
 
-#[test]
-fn test_command_line_inputs() {
-    let args: Cli = Cli{
-        config: String::new(),
-        reference: String::from("data/ecoli.fa"),
-        output_dir: String::from("test_dir"),
-        log_level: String::from("Trace"),
-        log_dest: String::new(),
-        output_file_prefix: String::from("test"),
-        read_length: 150,
-        coverage: 10,
-    };
+    #[test]
+    fn test_run_configuration() {
+        let test_configuration = RunConfiguration {
+            reference: String::from("Hello.world"),
+            read_len: 100,
+            coverage: 22,
+            mutation_rate: 0.09,
+            ploidy: 3,
+            paired_ended: true,
+            fragment_mean: 333.0,
+            fragment_st_dev: 33.0,
+            produce_fastq: false,
+            produce_bam: true,
+            produce_fasta: true,
+            produce_vcf: true,
+            overwrite_output: true,
+            output_dir: String::from("/my/my"),
+            output_prefix: String::from("Hey.hey")
+        };
+
+        assert_eq!(test_configuration.reference, "Hello.world".to_string());
+        assert_eq!(test_configuration.read_len, 100);
+        assert_eq!(test_configuration.coverage, 22);
+        assert_eq!(test_configuration.mutation_rate, 0.09);
+        assert_eq!(test_configuration.ploidy, 3);
+        assert_eq!(test_configuration.paired_ended, true);
+        assert_eq!(test_configuration.fragment_mean, 333.0);
+        assert_eq!(test_configuration.fragment_st_dev, 33.0);
+        assert_eq!(test_configuration.produce_fastq, false);
+        assert_eq!(test_configuration.produce_vcf, true);
+        assert_eq!(test_configuration.produce_bam, true);
+        assert_eq!(test_configuration.produce_fasta, true);
+        assert_eq!(test_configuration.overwrite_output, true);
+        assert_eq!(test_configuration.output_dir, "/my/my".to_string());
+        assert_eq!(test_configuration.output_prefix, "Hey.hey".to_string());
+    }
 
-    let test_config = build_config_from_args(args).unwrap();
-    assert_eq!(test_config.reference, "data/ecoli.fa".to_string())
+    #[test]
+    fn test_build() {
+        use super::*;
+        let x = RunConfiguration::build();
+        assert_eq!(x.reference, "data/H1N1.fa".to_string())
+    }
+
+    #[test]
+    fn test_read_config_yaml() {
+        let yaml = String::from("config/neat_test.yml");
+        let test_config = read_config_yaml(yaml).unwrap();
+        assert_eq!(test_config.reference, "data/ecoli.fa".to_string());
+        assert_eq!(test_config.coverage, 10);
+    }
+
+    #[test]
+    fn test_setters() {
+        let mut config = ConfigBuilder::new();
+        assert_eq!(config.mutation_rate, 0.001);
+        assert_eq!(config.ploidy, 2);
+        assert_eq!(config.paired_ended, false);
+        assert_eq!(config.fragment_mean, 300.0);
+        assert_eq!(config.fragment_st_dev, 30.0);
+        assert_eq!(config.produce_fasta, false);
+        assert_eq!(config.produce_fastq, true);
+        assert_eq!(config.produce_vcf, false);
+        assert_eq!(config.produce_bam, false);
+        config = config.set_mutation_rate(0.111)
+            .set_ploidy(3)
+            .set_paired_ended(true)
+            .set_fragment_mean(111.0)
+            .set_fragment_st_dev(0.011)
+            .set_produce_fastq(false)
+            .set_produce_fasta(true)
+            .set_produce_vcf(true)
+            .set_produce_bam(true)
+            .set_overwrite_output();
+        assert_eq!(config.mutation_rate, 0.111);
+        assert_eq!(config.ploidy, 3);
+        assert_eq!(config.paired_ended, true);
+        assert_eq!(config.fragment_mean, 111.0);
+        assert_eq!(config.fragment_st_dev, 0.011);
+        assert_eq!(config.produce_fasta, true);
+        assert_eq!(config.produce_fastq, false);
+        assert_eq!(config.produce_vcf, true);
+        assert_eq!(config.produce_bam, true);
+    }
+
+    #[test]
+    fn test_command_line_inputs() {
+        let args: Cli = Cli{
+            config: String::new(),
+            reference: String::from("data/ecoli.fa"),
+            output_dir: String::from("test_dir"),
+            log_level: String::from("Trace"),
+            log_dest: String::new(),
+            output_file_prefix: String::from("test"),
+            read_length: 150,
+            coverage: 10,
+        };
+
+        let test_config = build_config_from_args(args).unwrap();
+        assert_eq!(test_config.reference, "data/ecoli.fa".to_string())
+    }
 }
\ No newline at end of file
diff --git a/src/utils/fasta_tools.rs b/src/utils/fasta_tools.rs
index cc3f5ea..5415bb5 100644
--- a/src/utils/fasta_tools.rs
+++ b/src/utils/fasta_tools.rs
@@ -163,7 +163,7 @@ fn test_write_fasta() -> Result<(), Box<dyn error::Error>> {
     let file_name = "test.fasta";
     assert_eq!(test_write.unwrap(), ());
     let attr = fs::metadata(file_name).unwrap();
-    assert!(attr.size() > 0);
+    assert!(attr.len() > 0);
     fs::remove_file(file_name)?;
     Ok(())
 }
\ No newline at end of file

From c305090f4aa0c0438af291c5ff405daf19f323b7 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Mon, 18 Mar 2024 23:07:15 -0500
Subject: [PATCH 13/23] Added a yaml file to handle CI

---
 .github/workflows/ci.yml | 74 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..87de341
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,74 @@
+# .github/workflows/ci.yml
+name: CI
+
+on:
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  tests:
+    name: ${{ matrix.make.name }} (${{ matrix.os }})
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        rust: [stable]
+        make:
+          - name: Unit tests
+            task: "build test"
+        include:
+          - os: ubuntu-latest
+            sccache-path: /home/runner/.cache/sccache
+  env:
+    RUST_BACKTRACE: full
+    RUSTC_WRAPPER: sccache
+    RUSTV: ${{ matrix.rust }}
+    SCCACHE_CACHE_SIZE: 2G
+    SCCACHE_DIR: ${{ matrix.sccache-path }}
+    # SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out
+  steps:
+    - uses: actions/checkout@v2
+    - name: Install sccache (ubuntu-latest)
+      if: matrix.os == 'ubuntu-latest'
+      env:
+        LINK: https://github.com/mozilla/sccache/releases/download
+        SCCACHE_VERSION: 0.2.13
+      run: |
+        SCCACHE_FILE=sccache-$SCCACHE_VERSION-x86_64-unknown-linux-musl
+        mkdir -p $HOME/.local/bin
+        curl -L "$LINK/$SCCACHE_VERSION/$SCCACHE_FILE.tar.gz" | tar xz
+        mv -f $SCCACHE_FILE/sccache $HOME/.local/bin/sccache
+        echo "$HOME/.local/bin" >> $GITHUB_PATH
+    - name: Install Rust ${{ matrix.rust }}
+      uses: actions-rs/toolchain@v1
+      with:
+        toolchain: ${{ matrix.rust }}
+        profile: minimal
+        override: true
+    - name: Cache cargo registry
+      uses: actions/cache@v2
+      continue-on-error: false
+      with:
+        path: |
+          ~/.cargo/registry
+          ~/.cargo/git
+        key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+        restore-keys: |
+          ${{ runner.os }}-cargo-
+    - name: Save sccache
+      uses: actions/cache@v2
+      continue-on-error: false
+      with:
+        path: ${{ matrix.sccache-path }}
+        key: ${{ runner.os }}-sccache-${{ hashFiles('**/Cargo.lock') }}
+        restore-keys: |
+          ${{ runner.os }}-sccache-
+    - name: Start sccache server
+      run: sccache --start-server
+    - name: ${{ matrix.make.name }}
+      run: cargo ${{ matrix.make.task }}
+    - name: Print sccache stats
+      run: sccache --show-stats
+    - name: Stop sccache server
+      run: sccache --stop-server || true

From e58cb8e32236f1adc339dcbd4675b6fb0b8f62d3 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Mon, 18 Mar 2024 23:20:04 -0500
Subject: [PATCH 14/23] Had the indent wrong on the yaml

---
 .github/workflows/ci.yml | 104 +++++++++++++++++++--------------------
 1 file changed, 52 insertions(+), 52 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 87de341..e28fe33 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,55 +20,55 @@ jobs:
         include:
           - os: ubuntu-latest
             sccache-path: /home/runner/.cache/sccache
-  env:
-    RUST_BACKTRACE: full
-    RUSTC_WRAPPER: sccache
-    RUSTV: ${{ matrix.rust }}
-    SCCACHE_CACHE_SIZE: 2G
-    SCCACHE_DIR: ${{ matrix.sccache-path }}
-    # SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out
-  steps:
-    - uses: actions/checkout@v2
-    - name: Install sccache (ubuntu-latest)
-      if: matrix.os == 'ubuntu-latest'
-      env:
-        LINK: https://github.com/mozilla/sccache/releases/download
-        SCCACHE_VERSION: 0.2.13
-      run: |
-        SCCACHE_FILE=sccache-$SCCACHE_VERSION-x86_64-unknown-linux-musl
-        mkdir -p $HOME/.local/bin
-        curl -L "$LINK/$SCCACHE_VERSION/$SCCACHE_FILE.tar.gz" | tar xz
-        mv -f $SCCACHE_FILE/sccache $HOME/.local/bin/sccache
-        echo "$HOME/.local/bin" >> $GITHUB_PATH
-    - name: Install Rust ${{ matrix.rust }}
-      uses: actions-rs/toolchain@v1
-      with:
-        toolchain: ${{ matrix.rust }}
-        profile: minimal
-        override: true
-    - name: Cache cargo registry
-      uses: actions/cache@v2
-      continue-on-error: false
-      with:
-        path: |
-          ~/.cargo/registry
-          ~/.cargo/git
-        key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
-        restore-keys: |
-          ${{ runner.os }}-cargo-
-    - name: Save sccache
-      uses: actions/cache@v2
-      continue-on-error: false
-      with:
-        path: ${{ matrix.sccache-path }}
-        key: ${{ runner.os }}-sccache-${{ hashFiles('**/Cargo.lock') }}
-        restore-keys: |
-          ${{ runner.os }}-sccache-
-    - name: Start sccache server
-      run: sccache --start-server
-    - name: ${{ matrix.make.name }}
-      run: cargo ${{ matrix.make.task }}
-    - name: Print sccache stats
-      run: sccache --show-stats
-    - name: Stop sccache server
-      run: sccache --stop-server || true
+    env:
+      RUST_BACKTRACE: full
+      RUSTC_WRAPPER: sccache
+      RUSTV: ${{ matrix.rust }}
+      SCCACHE_CACHE_SIZE: 2G
+      SCCACHE_DIR: ${{ matrix.sccache-path }}
+      # SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out
+    steps:
+      - uses: actions/checkout@v2
+      - name: Install sccache (ubuntu-latest)
+        if: matrix.os == 'ubuntu-latest'
+        env:
+          LINK: https://github.com/mozilla/sccache/releases/download
+          SCCACHE_VERSION: 0.2.13
+        run: |
+          SCCACHE_FILE=sccache-$SCCACHE_VERSION-x86_64-unknown-linux-musl
+          mkdir -p $HOME/.local/bin
+          curl -L "$LINK/$SCCACHE_VERSION/$SCCACHE_FILE.tar.gz" | tar xz
+          mv -f $SCCACHE_FILE/sccache $HOME/.local/bin/sccache
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+      - name: Install Rust ${{ matrix.rust }}
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: ${{ matrix.rust }}
+          profile: minimal
+          override: true
+      - name: Cache cargo registry
+        uses: actions/cache@v2
+        continue-on-error: false
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-
+      - name: Save sccache
+        uses: actions/cache@v2
+        continue-on-error: false
+        with:
+          path: ${{ matrix.sccache-path }}
+          key: ${{ runner.os }}-sccache-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-sccache-
+      - name: Start sccache server
+        run: sccache --start-server
+      - name: ${{ matrix.make.name }}
+        run: cargo ${{ matrix.make.task }}
+      - name: Print sccache stats
+        run: sccache --show-stats
+      - name: Stop sccache server
+        run: sccache --stop-server || true

From 44b5a0eff8d4bc0eed8a1965837d47570162febe Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Mon, 18 Mar 2024 23:22:02 -0500
Subject: [PATCH 15/23] Clearing cache

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e28fe33..33d68d0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -26,7 +26,7 @@ jobs:
       RUSTV: ${{ matrix.rust }}
       SCCACHE_CACHE_SIZE: 2G
       SCCACHE_DIR: ${{ matrix.sccache-path }}
-      # SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out
+      SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out
     steps:
       - uses: actions/checkout@v2
       - name: Install sccache (ubuntu-latest)

From 16e78a86016bfe65a65ea80de2871e3bdccd687c Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Mon, 18 Mar 2024 23:22:27 -0500
Subject: [PATCH 16/23] Okay, cache cleared, pushing again

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 33d68d0..e28fe33 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -26,7 +26,7 @@ jobs:
       RUSTV: ${{ matrix.rust }}
       SCCACHE_CACHE_SIZE: 2G
       SCCACHE_DIR: ${{ matrix.sccache-path }}
-      SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out
+      # SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out
     steps:
       - uses: actions/checkout@v2
       - name: Install sccache (ubuntu-latest)

From d685ef4615d2f80b87cfd87fa11786bf18e098e4 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Mon, 18 Mar 2024 23:25:04 -0500
Subject: [PATCH 17/23] Trying to allow the workflow to trigger manually

---
 .github/workflows/ci.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e28fe33..59463bc 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4,6 +4,8 @@ name: CI
 on:
   pull_request:
   workflow_dispatch:
+  workflow_run:
+    branches: [feature/trying_ci]
 
 jobs:
   tests:

From 6f00024e7b3b3a88fe958f88eab3185d17cbf53f Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Mon, 18 Mar 2024 23:25:55 -0500
Subject: [PATCH 18/23] No, that wasn't it

---
 .github/workflows/ci.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 59463bc..b137ef5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4,9 +4,7 @@ name: CI
 on:
   pull_request:
   workflow_dispatch:
-  workflow_run:
-    branches: [feature/trying_ci]
-
+    
 jobs:
   tests:
     name: ${{ matrix.make.name }} (${{ matrix.os }})

From 5704ceba44ddf32133eceb923909a002fb3c0a48 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Mon, 18 Mar 2024 23:29:18 -0500
Subject: [PATCH 19/23] Adding push to the triggers

---
 .github/workflows/ci.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b137ef5..dbf9672 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,9 +2,10 @@
 name: CI
 
 on:
+  push:
   pull_request:
   workflow_dispatch:
-    
+
 jobs:
   tests:
     name: ${{ matrix.make.name }} (${{ matrix.os }})

From ee1d15676d57da23da23c738aa7db15cbd66199d Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Mon, 18 Mar 2024 23:37:20 -0500
Subject: [PATCH 20/23] Create rusty-neat-tests.yml

---
 .github/workflows/rusty-neat-tests.yml | 33 ++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 .github/workflows/rusty-neat-tests.yml

diff --git a/.github/workflows/rusty-neat-tests.yml b/.github/workflows/rusty-neat-tests.yml
new file mode 100644
index 0000000..5058aaa
--- /dev/null
+++ b/.github/workflows/rusty-neat-tests.yml
@@ -0,0 +1,33 @@
+name: rusty-neat-tests
+
+on:
+  workflow_dispatch:
+  push:
+    branches: [ "main", "feature/trying_ci" ]
+  pull_request:
+    branches: [ "main" ]
+
+env:
+  CARGO_TERM_COLOR: always
+
+jobs:
+        
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Cache
+      uses: actions/cache@v4.0.1
+      with:
+        path: |
+          ~/.cargo/registry
+          ~/.cargo/git
+        key: ubuntu-latest-cargo-${{ hashFiles('**/Cargo.lock') }}
+        restore-keys: |
+          ubuntu-latest-cargo-
+    - name: Build
+      run: cargo build --verbose
+    - name: Run tests
+      run: cargo test --verbose

From 78f57cc74fe0875e3ba20dff04356f848fd0f62f Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Mon, 18 Mar 2024 23:38:53 -0500
Subject: [PATCH 21/23] Keeping some dev changes

---
 src/main.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index 70907ee..0785e57 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -134,7 +134,7 @@ fn main() {
         info!("Shuffling output fastq data");
         let mut outsets: Box<Vec<&Vec<u8>>> = Box::new(read_sets.iter().collect());
         outsets.shuffle(&mut rng);
-
+        
         info!("Writing fastq");
         write_fastq(
             &output_file,
@@ -144,5 +144,6 @@ fn main() {
         ).expect("Problem writing fastq file");
         info!("Processing complete")
     }
+
 }
 

From 20660941d7a2fcc6756078ab479150ea17e8f381 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Mon, 18 Mar 2024 23:39:54 -0500
Subject: [PATCH 22/23] Deleting the file...again

---
 .github/workflows/ci.yml | 75 ----------------------------------------
 1 file changed, 75 deletions(-)
 delete mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
deleted file mode 100644
index dbf9672..0000000
--- a/.github/workflows/ci.yml
+++ /dev/null
@@ -1,75 +0,0 @@
-# .github/workflows/ci.yml
-name: CI
-
-on:
-  push:
-  pull_request:
-  workflow_dispatch:
-
-jobs:
-  tests:
-    name: ${{ matrix.make.name }} (${{ matrix.os }})
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ubuntu-latest]
-        rust: [stable]
-        make:
-          - name: Unit tests
-            task: "build test"
-        include:
-          - os: ubuntu-latest
-            sccache-path: /home/runner/.cache/sccache
-    env:
-      RUST_BACKTRACE: full
-      RUSTC_WRAPPER: sccache
-      RUSTV: ${{ matrix.rust }}
-      SCCACHE_CACHE_SIZE: 2G
-      SCCACHE_DIR: ${{ matrix.sccache-path }}
-      # SCCACHE_RECACHE: 1 # Uncomment this line to clear cache, then comment it back out
-    steps:
-      - uses: actions/checkout@v2
-      - name: Install sccache (ubuntu-latest)
-        if: matrix.os == 'ubuntu-latest'
-        env:
-          LINK: https://github.com/mozilla/sccache/releases/download
-          SCCACHE_VERSION: 0.2.13
-        run: |
-          SCCACHE_FILE=sccache-$SCCACHE_VERSION-x86_64-unknown-linux-musl
-          mkdir -p $HOME/.local/bin
-          curl -L "$LINK/$SCCACHE_VERSION/$SCCACHE_FILE.tar.gz" | tar xz
-          mv -f $SCCACHE_FILE/sccache $HOME/.local/bin/sccache
-          echo "$HOME/.local/bin" >> $GITHUB_PATH
-      - name: Install Rust ${{ matrix.rust }}
-        uses: actions-rs/toolchain@v1
-        with:
-          toolchain: ${{ matrix.rust }}
-          profile: minimal
-          override: true
-      - name: Cache cargo registry
-        uses: actions/cache@v2
-        continue-on-error: false
-        with:
-          path: |
-            ~/.cargo/registry
-            ~/.cargo/git
-          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-cargo-
-      - name: Save sccache
-        uses: actions/cache@v2
-        continue-on-error: false
-        with:
-          path: ${{ matrix.sccache-path }}
-          key: ${{ runner.os }}-sccache-${{ hashFiles('**/Cargo.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-sccache-
-      - name: Start sccache server
-        run: sccache --start-server
-      - name: ${{ matrix.make.name }}
-        run: cargo ${{ matrix.make.task }}
-      - name: Print sccache stats
-        run: sccache --show-stats
-      - name: Stop sccache server
-        run: sccache --stop-server || true

From 3596a97cac49625f3df6dae4fa09770e0758e3fa Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Mon, 18 Mar 2024 23:42:39 -0500
Subject: [PATCH 23/23] Trying to clear a warning from github

---
 .github/workflows/rusty-neat-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rusty-neat-tests.yml b/.github/workflows/rusty-neat-tests.yml
index 5058aaa..15f279a 100644
--- a/.github/workflows/rusty-neat-tests.yml
+++ b/.github/workflows/rusty-neat-tests.yml
@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - name: Cache
       uses: actions/cache@v4.0.1
       with: