Skip to content

Commit

Permalink
Merge pull request #8 from ncsa/feature/trying_ci
Browse files Browse the repository at this point in the history
Feature/trying ci
  • Loading branch information
joshfactorial authored Mar 19, 2024
2 parents 1aed614 + 0b96443 commit f9f5bed
Show file tree
Hide file tree
Showing 15 changed files with 751 additions and 271 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/rusty-neat-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: rusty-neat-tests
on:
workflow_dispatch:
push:
branches: [ "main", "feature/trying_ci" ]
branches: [ "main", "feature/develop_tests" ]
pull_request:
branches: [ "main" ]

Expand All @@ -17,7 +17,7 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Cache
uses: actions/[email protected]
with:
Expand Down
20 changes: 19 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "rusty-neat"
version = "0.1.0"
version = "0.2.0"
authors = ["Joshua Allen <[email protected]>"]

[dependencies]
Expand All @@ -13,3 +13,4 @@ serde_yaml = "0.9.32"
clap = { version = "4.5.1", features = ["derive"] }
itertools = "0.12.1"
assert_fs = "1.1.1"
rand_distr = "0.5.0-alpha.0"
14 changes: 7 additions & 7 deletions config/simple_template.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
reference: REQUIRED
read_len: .
coverage: 20
output_dir: /home/joshfactorial/code/rust_neat_outputs/
output_prefix: neat_test_1
mutation_rate: .

# Below are not yet active
ploidy: .
paired_ended: .
fragment_mean: .
Expand All @@ -15,6 +13,11 @@ produce_vcf: .
produce_fasta: .
produce_fastq: .

overwrite_output: .
output_dir: .
output_prefix: .

# Below are not yet active
error_model: .
mutation_model: .
fragment_model: .
Expand All @@ -29,10 +32,7 @@ include_vcf: .
target_bed: .
off_target_scalar: .
discard_bed: .
mutation_rate: .
mutation_bed: .
no_coverage_bias: .
rng_seed: .
min_mutations: .
fasta_per_ploid: .
overwrite_output: .
min_mutations: .
116 changes: 84 additions & 32 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@ extern crate clap;
extern crate log;
extern crate simplelog;
extern crate serde_yaml;
extern crate rand_distr;
extern crate itertools;

mod utils;

use std::collections::{HashMap, HashSet};
use std::fs::File;
use clap::{Parser};
use log::*;
use simplelog::*;
Expand All @@ -19,22 +22,52 @@ use utils::config::{read_config_yaml, build_config_from_args};
use utils::mutate::mutate_fasta;
use utils::make_reads::generate_reads;
use utils::fastq_tools::write_fastq;
use utils::file_tools::check_parent_and_create;
use utils::vcf_tools::write_vcf;

fn main() {

TermLogger::init(
LevelFilter::Trace,
Config::default(),
TerminalMode::Stdout,
ColorChoice::Auto,
).unwrap();

let mut rng = thread_rng();

info!("Begin processing");

// parse the arguments from the command line
let args = cli::Cli::parse();

let level_filter = match args.log_level.to_lowercase().as_str() {
"trace" => LevelFilter::Trace,
"debug" => LevelFilter::Debug,
"info" => LevelFilter::Info,
"warn" => LevelFilter::Warn,
"error" => LevelFilter::Error,
"off" => LevelFilter::Off,
_ => panic!(
"Unknown log level, please set to one of \
Trace, Debug, Info, Warn, Error, or Off (case insensitive)."
)
};

// Check that the parent dir exists
let log_destination = check_parent_and_create(&args.log_dest).unwrap();

CombinedLogger::init(vec![
#[cfg(feature = "termcolor")]
TermLogger::new(
level_filter,
Config::default(),
TerminalMode::Stdout,
ColorChoice::Always,
),
#[cfg(not(feature = "termcolor"))]
SimpleLogger::new(LevelFilter::Trace, Config::default()),
WriteLogger::new(
level_filter,
Config::default(),
File::create(log_destination).unwrap(),
)
]).unwrap();

let mut rng = thread_rng();

// set up the config struct based on whether there was an input config. Input config
// overrides any other inputs.
let config = if args.config != "" {
info!("Using Configuration file input: {}", &args.config);
read_config_yaml(args.config)
Expand All @@ -44,54 +77,73 @@ fn main() {
Ok(build_config_from_args(args).expect("Problem reading configuration yaml file"))
}.unwrap();

// Create the prefix of the files to write
let output_file = format!("{}/{}", config.output_dir, config.output_prefix);

info!("Mapping fasta file: {}", &config.reference);
// Reading the reference file into memory
info!("Mapping reference fasta file: {}", &config.reference);
let (fasta_map, fasta_order) = read_fasta(&config.reference);
// todo:
// need to add this twice, produce two mutated fastas, or at least 2 separate mutation
// datasets, each with half the mutation rate. Going to mean twice as much memory needed for
// fasta creation, which isn't ideal
info!("Mutating fasta");
let mutated_map: Box<HashMap<String, Vec<Vec<u8>>>> = mutate_fasta(

// Mutating the reference and recording the variant locations.
info!("Mutating reference.");
let (mutated_map, variant_locations) = mutate_fasta(
&fasta_map,
config.ploidy,
&mut rng
);

info!("Outputting fasta files");
if config.produce_fasta == true {
if config.produce_fasta {
info!("Outputting fasta file");
write_fasta(
&mutated_map,
&fasta_order,
config.overwrite_output,
&output_file,
config.ploidy
).expect("Problem writing fasta file");
}

if config.produce_vcf {
info!("Writing vcf file");
write_vcf(
&variant_locations,
&fasta_order,
config.ploidy,
&config.reference,
config.overwrite_output,
&output_file,
&mut rng).expect("Error writing vcf file")
}

let mut read_sets: HashSet<Vec<u8>> = HashSet::new();
for (_name, sequences) in mutated_map.iter() {
for (_name, sequence) in mutated_map.iter() {
// `data_set` is the set of read sequences that should cover
// the mutated sequence `coverage` number of times
let data_set = generate_reads(
&sequences,
sequence,
&config.read_len,
&config.coverage,
config.paired_ended,
config.fragment_mean,
config.fragment_st_dev,
&mut rng
);

read_sets.extend(*data_set);
}

info!("Shuffling output fastq data");
let mut outsets: Box<Vec<&Vec<u8>>> = Box::new(read_sets.iter().collect());
outsets.shuffle(&mut rng);
if config.produce_fastq {
info!("Shuffling output fastq data");
let mut outsets: Box<Vec<&Vec<u8>>> = Box::new(read_sets.iter().collect());
outsets.shuffle(&mut rng);

info!("Writing fastq");
write_fastq(
&output_file,
config.overwrite_output,
config.paired_ended,
*outsets,
).expect("Problem writing fastq file");
info!("Processing complete")
}

info!("Writing fastq");
write_fastq(
&output_file,
*outsets,
).expect("Problem writing fastq file");
info!("Processing complete")
}

3 changes: 2 additions & 1 deletion src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ pub mod config;
pub mod cli;
pub mod make_reads;
pub mod mutate;
pub mod fastq_tools;
pub mod fastq_tools;
pub mod vcf_tools;
32 changes: 30 additions & 2 deletions src/utils/cli.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
//! This is a pretty basic implementation of a Clap CLI.
//! The idea of this interface and its supporting code is that the user can enter an optional
//! config file that will take the place of the other command line options, except the logging
//! options. Either way, these options are read into a configuration struct that holds the
//! variables for the run. Logging is handled separately, outside of run configuration
//! parsing.

extern crate clap;

use clap::Parser;
use std::env;

#[derive(Parser, Debug)]
pub struct Cli {
Expand All @@ -11,16 +19,28 @@ pub struct Cli {
config <String> = the full path to a configuration yaml file. If entered, it will override
all other command line inputs. No default.
reference <String> = The relative path or full path to the reference file. Must be in fasta
file format. Default "data/H1N1.fa"
output_dir <String> = The directory where output files will be written. If nothing is entered,
it will write output files to the current working directory.
output_file_prefix <String> = output files will start with this name. Default = neat_out
read_length <usize> = the length of the reads in the output fastq file. Default = 150
coverage <usize> = The average depth per read of the fastq files. Default = 10
The following options are independent of the config and not affected by it one way or another:
log_level <String> = Set a log level for the run. Everything at and above the level chosen will
be displayed in both logs. See simplelog docs for more info:
https://docs.rs/simplelog/latest/simplelog/enum.Level.html
log_dest <String> = Full path and filename of the log file. The default is the file
"neat_out.log" in the current working directory, set by the argument's default value.
*/
#[arg(short='y', long="configuration_yaml", default_value_t=String::new())]
#[arg(short='C', long="configuration_yaml", default_value_t=String::new(),
help="Enter a full path and filename to a configuration file. \
This will override most other options")]
pub config: String,

// All of these arguments are overridden by the config file
#[arg(short='r', long="reference", default_value_t=String::from("data/H1N1.fa"))]
pub reference: String,
#[arg(short='o', long="output_dir", default_value_t=String::new())]
Expand All @@ -31,4 +51,12 @@ pub struct Cli {
pub read_length: usize,
#[arg(short='c', long="coverage", default_value_t = 10)]
pub coverage: usize,
}

// These options relate to the logging features and are not overridden by a config
#[arg(long="log-level", default_value_t=String::from("Trace"), help="Enter one of Trace, Debug, Info, Warn, Error, Off")]
pub log_level: String,
// Path of the log file. Defaults to "neat_out.log" inside the current working directory.
// The default is built with `PathBuf::join` so that a path separator is inserted between
// the directory and the file name; plain string concatenation produced paths such as
// "/home/userneat_out.log". The `unwrap` panics if the current directory cannot be read.
#[arg(long="log-dest", default_value_t=env::current_dir().unwrap().join("neat_out.log").display().to_string(), help="Full path and name to log file")]
pub log_dest: String,
}

// Tests are handled in other places.
Loading

0 comments on commit f9f5bed

Please sign in to comment.