From 3eae33030602a6373901b4aa224a1eca79b15e89 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 13 Apr 2023 11:05:58 +0200
Subject: [PATCH 01/72] Johannes over-simplification
---
Cargo.lock | 6 +-
Cargo.toml | 2 +-
src/main.rs | 267 +++++-----------------------------------------------
3 files changed, 27 insertions(+), 248 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 85ef609..65c85e3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -402,9 +402,9 @@ dependencies = [
[[package]]
name = "itertools"
-version = "0.10.3"
+version = "0.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
dependencies = [
"either",
]
@@ -912,8 +912,8 @@ dependencies = [
"clap",
"flate2",
"indicatif",
+ "itertools",
"lazy_static",
- "regex",
]
[[package]]
diff --git a/Cargo.toml b/Cargo.toml
index 029c7aa..712d2cb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,7 +8,7 @@ edition = "2021"
[dependencies]
clap = { version = "3.2.10", features = ["derive"] }
bio = "0.41.0"
-regex = "1.6.0"
lazy_static = "1.4"
indicatif = "0.17.0"
flate2 = "1.0.24"
+itertools = "0.10.5"
diff --git a/src/main.rs b/src/main.rs
index ba77eb6..7ee443c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,24 +1,7 @@
use clap::Parser;
-use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
+use itertools::izip;
use std::iter::Iterator;
-use std::thread;
-lazy_static::lazy_static! {
-static ref UMI_PATTERN: regex::Regex = regex::Regex::new("^(N{2,})([ATCG]*)$").unwrap();
-}
-// Nucleotide pattern for inline transfer
-struct Nucleotide {
- offset: usize,
- spacer: String,
-}
-// Valid extraction of UMI and read for inline transfer
-enum ExtractedRecord {
- Empty,
- Valid {
- read: bio::io::fastq::Record,
- umi: Vec,
- },
-}
// Defining types for simplicity
type File = std::fs::File;
type Fastq = std::io::BufReader;
@@ -40,12 +23,7 @@ impl std::io::Read for ReadFile {
}
// Enum for the two accepted output formats, '.fastq' and '.fastq.gz'
enum OutputFile {
- Fastq {
- read: bio::io::fastq::Writer,
- },
- Gzip {
- read: bio::io::fastq::Writer>,
- },
+ Fastq { read: bio::io::fastq::Writer },
}
impl OutputFile {
// Implement write for OutputFile enum
@@ -55,10 +33,6 @@ impl OutputFile {
read.write(header, desc, s.seq(), s.qual()).unwrap();
OutputFile::Fastq { read }
}
- OutputFile::Gzip { mut read } => {
- read.write(header, desc, s.seq(), s.qual()).unwrap();
- OutputFile::Gzip { read }
- }
}
}
}
@@ -75,28 +49,19 @@ fn read_fastq(path: &str) -> bio::io::fastq::Reader
bio::io::fastq::Reader::new(ReadFile::Fastq(std::fs::File::open(path).unwrap()))
}
}
-// Create output files, gzipped optional
-fn output_file(name: &str, gz: bool) -> OutputFile {
- if gz {
- OutputFile::Gzip {
- read: std::fs::File::create(format!("{}.fastq.gz", name))
- .map(|w| flate2::write::GzEncoder::new(w, flate2::Compression::best()))
- .map(bio::io::fastq::Writer::new)
- .unwrap(),
- }
- } else {
- OutputFile::Fastq {
- read: std::fs::File::create(format!("{}.fastq", name))
- .map(bio::io::fastq::Writer::new)
- .unwrap(),
- }
+// Create output files
+fn output_file(name: &str) -> OutputFile {
+ OutputFile::Fastq {
+ read: std::fs::File::create(format!("{}.fastq", name))
+ .map(bio::io::fastq::Writer::new)
+ .unwrap(),
}
}
#[derive(clap::Parser)]
#[clap(
- version = "0.1.0",
- author = "Judit Hohenthal",
+ version = "0.2.0",
+ author = "Judit Hohenthal, Matthias Zepper, Johannes Alneberg",
about = "A tool for transfering Unique Molecular Identifiers (UMIs)."
)]
struct Opts {
@@ -126,45 +91,8 @@ struct Opts {
\n "
)]
edit_nr: bool,
- #[clap(
- long,
- help = "Disable gzipped output file (its enabled by default).
- \n "
- )]
- no_gzip: bool,
- // Subcommands specifying inline or separate extraction
- #[clap(subcommand)]
- sub: Commands,
-}
-
-#[derive(clap::Subcommand)]
-enum Commands {
- #[clap(
- name = "separate",
- about = "If the UMI reads is in separate fastq file 'separate' must be present in command line.
- \nUMI is entered after --ru-in flag.
- \nExample input: 'umi-transfer --no-gzip --r1-in 'example_file.fastq.gz separate --ru-in 'example_umi.fastq.gz''
- \n "
- )]
- Separate {
- #[clap(long, required = true)]
- ru_in: Vec,
- },
- #[clap(
- name = "inline",
- about = "If the UMI appears inline with the input read files 'inline' must be present in command line.
- \n--pattern1 a nucleotide pattern must be available to locate UMI in read file 1
- \n--pattern2 a nucleotide pattern must be available to locate UMI if read file 2 exists
- \nExample input: 'umi-transfer --no-gzip --r1-in 'example_file.fastq' inline --pattern1 'NNNNNNNNN'
- \n "
- )]
- Inline {
- // Patterns for locating UMI inline, given in Nucleotide pattern
- #[clap(long, required = true)]
- pattern1: String,
- #[clap(long)]
- pattern2: Option,
- },
+ #[clap(long, required = true)]
+ ru_in: Vec,
}
// Writes record with properly inserted UMI to Output file
@@ -186,77 +114,17 @@ fn write_to_file(
output.write(header, s.desc(), s.clone())
}
}
-// Parses Pattern for Inline extraction
-fn parse(pattern: &str) -> Option {
- if let Some(captures) = UMI_PATTERN.captures(pattern) {
- Some(Nucleotide {
- offset: captures.get(1)?.end(),
- spacer: captures.get(2)?.as_str().into(),
- })
- } else {
- panic!("")
- }
-}
-// Extracts UMI from inline record
-fn extract(record: bio::io::fastq::Record, pattern: &str) -> ExtractedRecord {
- let handler = parse(pattern);
- match handler {
- Some(Nucleotide { offset, spacer }) => {
- let end = offset + spacer.len();
- if end <= record.seq().len() && record.seq()[offset..end] == *spacer.as_bytes() {
- let read = bio::io::fastq::Record::with_attrs(
- record.id(),
- record.desc(),
- record.seq()[end..record.seq().len()].into(),
- record.qual()[end..record.qual().len()].into(),
- );
- ExtractedRecord::Valid {
- read: read,
- umi: record.seq()[0..offset].into(),
- }
- } else {
- ExtractedRecord::Empty
- }
- }
- None => panic!(""),
- }
-}
-// Write inline record to Outputfile
-fn write_inline_to_file(
- record: ExtractedRecord,
- write_file: OutputFile,
- second: bool,
-) -> OutputFile {
- match record {
- ExtractedRecord::Empty => panic!("Not Valid UMI/ Record"),
- ExtractedRecord::Valid { read, umi } => write_to_file(read, write_file, &umi, second),
- }
-}
fn main() {
// Parse commandline arguments
let args = Opts::parse();
- // Automatically gzip output file, if --no-gzip flag was included this will be disabled
- let mut gzip = true;
- if args.no_gzip {
- gzip = false;
- }
// Create write files, not gzipped if --no-gzip flag entered.
- let mut write_file_r1 = output_file(&format!("{}1", &args.prefix), gzip);
+ let mut write_file_r1 = output_file(&format!("{}1", &args.prefix));
// Create a record iterator from input file 1
let r1 = read_fastq(&args.r1_in[0]).records();
- // Settings for progress bar
- let len = read_fastq(&args.r1_in[0]).records().count();
- let m = MultiProgress::new();
- let style = ProgressStyle::with_template("[{elapsed_precise}] {bar:60} {pos:>7}/{len:7} {msg}")
- .unwrap();
- let pb = m.add(ProgressBar::new(len.try_into().unwrap()));
- pb.set_style(style.clone());
- let pb2 = m.insert_after(&pb, ProgressBar::new(len.try_into().unwrap()));
- pb2.set_style(style);
println!("[1/1] Transfering UMI to records...");
// Enables editing id in output file 2 if --edit-nr flag was included
@@ -264,108 +132,19 @@ fn main() {
if args.edit_nr {
edit_nr = true;
}
- // Match Subcommand
- match args.sub {
- Commands::Separate { ru_in } => {
- // Clone UMI file for second thread
- let ru1 = ru_in.clone();
- let handle1 = thread::spawn(move || {
- let ru = read_fastq(&ru_in[0]).records();
- // Iterate records in input file and UMI file
- for (r1_rec, ru_rec) in r1.zip(ru) {
- // Update progress bar
- pb.set_message("R1");
- pb.inc(1);
- // Write to Output file
- write_file_r1 =
- write_to_file(r1_rec.unwrap(), write_file_r1, ru_rec.unwrap().seq(), false);
- }
- pb.finish_with_message("R1 done");
- });
-
- // Save thread handler 1 in Vec
- let mut l = Vec::new();
- l.push(handle1);
-
- // If input file 2 exists:
- if !&args.r2_in.is_empty() {
- let r2 = read_fastq(&args.r2_in[0]).records();
- let mut write_file_r2 = output_file(&format!("{}2", &args.prefix), gzip);
- let handle2 = thread::spawn(move || {
- let ru = read_fastq(&ru1[0]).records();
-
- // Set progressbar to position 0
- pb2.set_position(0);
- for (r2_rec, ru_rec) in r2.zip(ru) {
- // Update progressbar
- pb2.set_message("R2");
- pb2.inc(1);
- // Write record to Output file
- write_file_r2 = write_to_file(
- r2_rec.unwrap(),
- write_file_r2,
- ru_rec.unwrap().seq(),
- edit_nr,
- );
- }
- pb2.finish_with_message("R2 done");
- });
- // Save thread handler 2 in Vec
- l.push(handle2);
- } else {
- // If no recond input file exists, remove second progress bar
- MultiProgress::remove(&m, &pb2);
- }
- // Wait for threads to finish
- for i in l {
- if !i.is_finished() {
- i.join().unwrap();
- }
- }
- }
- Commands::Inline { pattern1, pattern2 } => {
- let handle1 = thread::spawn(move || {
- // Iterate each record in input file 1
- for r1_rec in r1 {
- // Update progress bar
- pb.set_message("FASTQ 1");
- pb.inc(1);
- // Extract UMI from record and save both
- let record1 = extract(r1_rec.unwrap(), &pattern1);
+ let ru = read_fastq(&args.ru_in[0]).records();
+ let r2 = read_fastq(&args.r2_in[0]).records();
+ let mut write_file_r2 = output_file(&format!("{}2", &args.prefix));
- // Write record and extracted UMI to output file
- write_file_r1 = write_inline_to_file(record1, write_file_r1, false);
- }
- pb.finish_with_message("FASTQ 1 done");
- });
+ // Iterate records in input file and UMI file
+ for (r1_rec, ru_rec, r2_rec) in izip!(r1, ru, r2) {
+ let ru_rec2 = ru_rec.unwrap(); // Error "handling"
+ let ru2 = ru_rec2.clone();
- // Save thread handler 1 to Vec
- let mut l = Vec::new();
- l.push(handle1);
+ // Write to Output file
+ write_file_r1 = write_to_file(r1_rec.unwrap(), write_file_r1, ru_rec2.seq(), false);
- if !&args.r2_in.is_empty() {
- let mut write_file_r2 = output_file(&format!("{}2", &args.prefix), gzip);
- let r2 = read_fastq(&args.r2_in[0]).records();
- pb2.set_position(0);
- let handle2 = thread::spawn(move || {
- for r2_rec in r2 {
- pb2.set_message("FASTQ 2");
- pb2.inc(1);
- let record2 = extract(r2_rec.unwrap(), &(pattern2.as_ref().unwrap()));
- write_file_r2 = write_inline_to_file(record2, write_file_r2, false);
- }
- pb2.finish_with_message("FASTQ 2 done");
- });
- l.push(handle2);
- } else {
- MultiProgress::remove(&m, &pb2);
- }
- for i in l {
- if !i.is_finished() {
- i.join().unwrap();
- }
- }
- }
+ write_file_r2 = write_to_file(r2_rec.unwrap(), write_file_r2, ru2.seq(), edit_nr);
}
}
From 83c8612c6bbf5c49e885fd93370ad715808da80a Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 13 Apr 2023 16:05:47 +0200
Subject: [PATCH 02/72] Tidying up a bit
---
src/main.rs | 60 +++++++++++++++++++++++++++++++----------------------
1 file changed, 35 insertions(+), 25 deletions(-)
diff --git a/src/main.rs b/src/main.rs
index 7ee443c..2e32227 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -12,6 +12,7 @@ enum ReadFile {
Fastq(File),
Gzip(Gzip),
}
+
impl std::io::Read for ReadFile {
// Implement read for ReadFile enum
fn read(&mut self, into: &mut [u8]) -> std::io::Result {
@@ -21,10 +22,12 @@ impl std::io::Read for ReadFile {
}
}
}
+
// Enum for the two accepted output formats, '.fastq' and '.fastq.gz'
enum OutputFile {
Fastq { read: bio::io::fastq::Writer },
}
+
impl OutputFile {
// Implement write for OutputFile enum
fn write(self, header: &str, desc: Option<&str>, s: bio::io::fastq::Record) -> OutputFile {
@@ -36,6 +39,7 @@ impl OutputFile {
}
}
}
+
// Read input file to Reader. Automatically scans if gzipped from .gz suffix
fn read_fastq(path: &str) -> bio::io::fastq::Reader> {
if path.ends_with(".gz") {
@@ -49,6 +53,7 @@ fn read_fastq(path: &str) -> bio::io::fastq::Reader
bio::io::fastq::Reader::new(ReadFile::Fastq(std::fs::File::open(path).unwrap()))
}
}
+
// Create output files
fn output_file(name: &str) -> OutputFile {
OutputFile::Fastq {
@@ -62,16 +67,22 @@ fn output_file(name: &str) -> OutputFile {
#[clap(
version = "0.2.0",
author = "Judit Hohenthal, Matthias Zepper, Johannes Alneberg",
- about = "A tool for transfering Unique Molecular Identifiers (UMIs)."
+ about = "A tool for transfering Unique Molecular Identifiers (UMIs). \n\nThe UMIs are given as a fastq file and will be transferred, explaining the name umi-transfer, to the header of the first two fastq files. \n\n"
)]
struct Opts {
#[clap(
long,
- default_value = "integrated",
+ default_value = "output",
help = "Prefix for output files, omitted flag will result in default value.
\n "
)]
prefix: String,
+ #[clap(
+ long,
+ help = "Automatically change '3' into '2' in sequence header of output file from R3.
+ \n "
+ )]
+ edit_nr: bool,
#[clap(
long,
required = true,
@@ -81,17 +92,17 @@ struct Opts {
r1_in: Vec,
#[clap(
long,
- help = "Input file 2 with reads.
+ required = true,
+ help = "[REQUIRED] Input file 2 with reads.
\n "
)]
r2_in: Vec,
#[clap(
long,
- help = "Automatically change '3' into '2' in header of output file from R3.
- \n "
+ required = true,
+ help = "[REQUIRED] Input file with UMI.
+ \n"
)]
- edit_nr: bool,
- #[clap(long, required = true)]
ru_in: Vec,
}
@@ -100,10 +111,10 @@ fn write_to_file(
input: bio::io::fastq::Record,
output: OutputFile,
umi: &[u8],
- second: bool,
+ edit_nr: bool,
) -> OutputFile {
let s = input;
- if second {
+ if edit_nr {
let header = &[s.id(), ":", std::str::from_utf8(&umi).unwrap()].concat();
let mut string = String::from(s.desc().unwrap());
string.replace_range(0..1, "2");
@@ -119,32 +130,31 @@ fn main() {
// Parse commandline arguments
let args = Opts::parse();
- // Create write files, not gzipped if --no-gzip flag entered.
- let mut write_file_r1 = output_file(&format!("{}1", &args.prefix));
-
- // Create a record iterator from input file 1
- let r1 = read_fastq(&args.r1_in[0]).records();
-
- println!("[1/1] Transfering UMI to records...");
-
// Enables editing id in output file 2 if --edit-nr flag was included
let mut edit_nr = false;
if args.edit_nr {
edit_nr = true;
}
- let ru = read_fastq(&args.ru_in[0]).records();
+ // Create fastq record iterators from input files
+ let r1 = read_fastq(&args.r1_in[0]).records();
let r2 = read_fastq(&args.r2_in[0]).records();
+ let ru = read_fastq(&args.ru_in[0]).records();
+
+ // Create write files.
+ let mut write_file_r1 = output_file(&format!("{}1", &args.prefix));
let mut write_file_r2 = output_file(&format!("{}2", &args.prefix));
- // Iterate records in input file and UMI file
- for (r1_rec, ru_rec, r2_rec) in izip!(r1, ru, r2) {
- let ru_rec2 = ru_rec.unwrap(); // Error "handling"
- let ru2 = ru_rec2.clone();
+ println!("Transfering UMIs to records...");
- // Write to Output file
- write_file_r1 = write_to_file(r1_rec.unwrap(), write_file_r1, ru_rec2.seq(), false);
+ // Iterate over records in input files
+ for (r1_rec, ru_rec_res, r2_rec) in izip!(r1, ru, r2) {
+ let ru_rec = ru_rec_res.unwrap();
+ // Write to Output file (never edit nr for R1)
+ write_file_r1 = write_to_file(r1_rec.unwrap(), write_file_r1, ru_rec.seq(), false);
- write_file_r2 = write_to_file(r2_rec.unwrap(), write_file_r2, ru2.seq(), edit_nr);
+ let ru_rec2 = ru_rec.clone();
+ // Write to Output file (edit nr for R2 if --edit-nr flag was included)
+ write_file_r2 = write_to_file(r2_rec.unwrap(), write_file_r2, ru_rec2.seq(), edit_nr);
}
}
From 3d6a18fce3a49a5bfdfeb9fb1c3eb38ce2c570ad Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 13 Apr 2023 17:03:20 +0200
Subject: [PATCH 03/72] Added back option to gzip output
---
src/main.rs | 44 ++++++++++++++++++++++++++++++++++----------
1 file changed, 34 insertions(+), 10 deletions(-)
diff --git a/src/main.rs b/src/main.rs
index 2e32227..9fd2e10 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -13,8 +13,8 @@ enum ReadFile {
Gzip(Gzip),
}
+// Implement read for ReadFile enum
impl std::io::Read for ReadFile {
- // Implement read for ReadFile enum
fn read(&mut self, into: &mut [u8]) -> std::io::Result {
match self {
ReadFile::Fastq(file) => file.read(into),
@@ -25,17 +25,26 @@ impl std::io::Read for ReadFile {
// Enum for the two accepted output formats, '.fastq' and '.fastq.gz'
enum OutputFile {
- Fastq { read: bio::io::fastq::Writer },
+ Fastq {
+ read: bio::io::fastq::Writer,
+ },
+ Gzip {
+ read: bio::io::fastq::Writer>,
+ },
}
+// Implement write for OutputFile enum
impl OutputFile {
- // Implement write for OutputFile enum
fn write(self, header: &str, desc: Option<&str>, s: bio::io::fastq::Record) -> OutputFile {
match self {
OutputFile::Fastq { mut read } => {
read.write(header, desc, s.seq(), s.qual()).unwrap();
OutputFile::Fastq { read }
}
+ OutputFile::Gzip { mut read } => {
+ read.write(header, desc, s.seq(), s.qual()).unwrap();
+ OutputFile::Gzip { read }
+ }
}
}
}
@@ -55,11 +64,20 @@ fn read_fastq(path: &str) -> bio::io::fastq::Reader
}
// Create output files
-fn output_file(name: &str) -> OutputFile {
- OutputFile::Fastq {
- read: std::fs::File::create(format!("{}.fastq", name))
- .map(bio::io::fastq::Writer::new)
- .unwrap(),
+fn output_file(name: &str, gz: bool) -> OutputFile {
+ if gz {
+ OutputFile::Gzip {
+ read: std::fs::File::create(format!("{}.fastq.gz", name))
+ .map(|w| flate2::write::GzEncoder::new(w, flate2::Compression::default()))
+ .map(bio::io::fastq::Writer::new)
+ .unwrap(),
+ }
+ } else {
+ OutputFile::Fastq {
+ read: std::fs::File::create(format!("{}.fastq", name))
+ .map(bio::io::fastq::Writer::new)
+ .unwrap(),
+ }
}
}
@@ -104,6 +122,12 @@ struct Opts {
\n"
)]
ru_in: Vec,
+ #[clap(
+ long,
+ help = "Compress output files with gzip. By default turned off to encourage use of external compression (see Readme).
+ \n "
+ )]
+ gzip: bool,
}
// Writes record with properly inserted UMI to Output file
@@ -142,8 +166,8 @@ fn main() {
let ru = read_fastq(&args.ru_in[0]).records();
// Create write files.
- let mut write_file_r1 = output_file(&format!("{}1", &args.prefix));
- let mut write_file_r2 = output_file(&format!("{}2", &args.prefix));
+ let mut write_file_r1 = output_file(&format!("{}1", &args.prefix), args.gzip);
+ let mut write_file_r2 = output_file(&format!("{}2", &args.prefix), args.gzip);
println!("Transfering UMIs to records...");
From b8620263408c77db506cd65244cad3bcf9d49df8 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 13 Apr 2023 17:10:53 +0200
Subject: [PATCH 04/72] Started adjusting the readme
---
README.md | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/README.md b/README.md
index dec76b8..dde59e4 100644
--- a/README.md
+++ b/README.md
@@ -35,15 +35,17 @@ The tool requires an input as follows:
Running the tool can be done by `cargo run --release -- [options] --r1-in 'fastq' `, where the `--release` flag is optional, but will ensure an optimized build.
-### Inline UMI example:
+### Inline UMI example
`cargo run --release -- --prefix 'output' --r1-in 'R1.fastq' --r2-in 'R2.fastq' inline --pattern1 'NNNNNNNNN' --pattern2 'NNNNNNNNN'`
-### UMI in seperate file example:
+### UMI in seperate file example
-`cargo run --release -- --prefix 'output' --r1-in 'R1.fastq' --r2-in 'R3.fastq' separate --ru-in 'R2.fastq'`
+```shell
+cargo run --release -- --prefix 'output' --r1-in 'R1.fastq' --r2-in 'R3.fastq' separate --ru-in 'R2.fastq'
+```
-### Special flags:
+### Special flags
> `--edit-nr` This flag will automatically change the '3' in the R3 files record-headers. Its disabled by default.
> `--no-gzip` This flag diables automatic compression (.gz) of output files.
From bfe147b6c0511ee6e3f265c757b9ada68bcb3257 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 13 Apr 2023 17:14:50 +0200
Subject: [PATCH 05/72] Readme table formatting
---
README.md | 32 +++++++++++++++++---------------
1 file changed, 17 insertions(+), 15 deletions(-)
diff --git a/README.md b/README.md
index dde59e4..5614ea8 100644
--- a/README.md
+++ b/README.md
@@ -20,24 +20,26 @@ The tool requires an input as follows:
`SUBCOMMANDS: `
-> `inline:`
->
-> > | Flag | Required | Description |
-> > | ------------ | :------------------------: | -------------------------: |
-> > | `--pattern1` | Yes | Nucleotide Pattern for UMI |
-> > | `--pattern2` | Needed if `--r2-in` exists | Nucleotide Pattern for UMI |
->
-> `separate:`
->
-> > | Flag | Required | Description |
-> > | --------- | :------: | ---------------------------: |
-> > | `--ru-in` | Yes | FASTQ containing UMI records |
+`inline:`
+|
+| Flag | Required | Description |
+| ------------ | :------------------------: | -------------------------: |
+| `--pattern1` | Yes | Nucleotide Pattern for UMI |
+| `--pattern2` | Needed if `--r2-in` exists | Nucleotide Pattern for UMI |
+|
+| `separate:`
+|
+| Flag | Required | Description |
+| --------- | :------: | ---------------------------: |
+| `--ru-in` | Yes | FASTQ containing UMI records |
Running the tool can be done by `cargo run --release -- [options] --r1-in 'fastq' `, where the `--release` flag is optional, but will ensure an optimized build.
### Inline UMI example
-`cargo run --release -- --prefix 'output' --r1-in 'R1.fastq' --r2-in 'R2.fastq' inline --pattern1 'NNNNNNNNN' --pattern2 'NNNNNNNNN'`
+```shell
+cargo run --release -- --prefix 'output' --r1-in 'R1.fastq' --r2-in 'R2.fastq' inline --pattern1 'NNNNNNNNN' --pattern2 'NNNNNNNNN'
+```
### UMI in seperate file example
@@ -47,5 +49,5 @@ cargo run --release -- --prefix 'output' --r1-in 'R1.fastq' --r2-in 'R3.fastq' s
### Special flags
-> `--edit-nr` This flag will automatically change the '3' in the R3 files record-headers. Its disabled by default.
-> `--no-gzip` This flag diables automatic compression (.gz) of output files.
+`--edit-nr` This flag will automatically change the '3' in the R3 files record-headers. Its disabled by default.
+`--no-gzip` This flag diables automatic compression (.gz) of output files.
From 743ca2d30fddfd3df2645d0dbc4c6a9f97c5d113 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 13 Apr 2023 17:25:48 +0200
Subject: [PATCH 06/72] more readme changes
---
README.md | 80 +++++++++++++++++++++++++++++++------------------------
1 file changed, 45 insertions(+), 35 deletions(-)
diff --git a/README.md b/README.md
index 5614ea8..dc44f3e 100644
--- a/README.md
+++ b/README.md
@@ -1,47 +1,53 @@
-# Building
+# umi-transfer
+A tool for transfering Unique Molecular Identifiers (UMIs).
-Go to the directory with the tool and type in `cargo build` .
-
-# Running
+The UMIs are given as a fastq file and will be transferred, explaining the name umi-transfer, to the
+header of the first two fastq files.
-### Usage
+## Installation
+TODO
+## Usage
The tool requires an input as follows:
-> `umi-transfer [OPTIONS] `
-
-`OPTIONS:`
-| Flag | Required | Description |
-| ------------- | :-----------: | ----------: |
-| `-h`,`--help` | No | Print help information |
-| `--prefix` | No, but default will be '`integrated`' | dictates name of output files|
-| `--r1-in` | Yes | FASTQ file with reads|
-| `--r2-in` | No | FASTQ file with reads |
-
-`SUBCOMMANDS: `
-
-`inline:`
-|
-| Flag | Required | Description |
-| ------------ | :------------------------: | -------------------------: |
-| `--pattern1` | Yes | Nucleotide Pattern for UMI |
-| `--pattern2` | Needed if `--r2-in` exists | Nucleotide Pattern for UMI |
-|
-| `separate:`
-|
-| Flag | Required | Description |
-| --------- | :------: | ---------------------------: |
-| `--ru-in` | Yes | FASTQ containing UMI records |
+```bash
+umi-transfer 0.2.0
+Judit Hohenthal, Matthias Zepper, Johannes Alneberg
+A tool for transfering Unique Molecular Identifiers (UMIs).
+
+The UMIs are given as a fastq file and will be transferred, explaining the name umi-transfer, to the
+header of the first two fastq files.
+
+
+USAGE:
+ umi-transfer [OPTIONS] --r1-in --r2-in --ru-in
+
+OPTIONS:
+ --edit-nr Automatically change '3' into '2' in sequence header of output file
+ from R3.
+
+ --gzip Compress output files with gzip. By default turned off to encourage use
+ of external compression (see Readme).
+
+ -h, --help Print help information
+ --prefix Prefix for output files, omitted flag will result in default value.
+
+ [default: output]
+ --r1-in [REQUIRED] Input file 1 with reads.
+
+
+ --r2-in [REQUIRED] Input file 2 with reads.
+
+
+ --ru-in [REQUIRED] Input file with UMI.
+
+ -V, --version Print version information
+```
Running the tool can be done by `cargo run --release -- [options] --r1-in 'fastq' `, where the `--release` flag is optional, but will ensure an optimized build.
-### Inline UMI example
-```shell
-cargo run --release -- --prefix 'output' --r1-in 'R1.fastq' --r2-in 'R2.fastq' inline --pattern1 'NNNNNNNNN' --pattern2 'NNNNNNNNN'
-```
-
-### UMI in seperate file example
+### Example
```shell
cargo run --release -- --prefix 'output' --r1-in 'R1.fastq' --r2-in 'R3.fastq' separate --ru-in 'R2.fastq'
@@ -51,3 +57,7 @@ cargo run --release -- --prefix 'output' --r1-in 'R1.fastq' --r2-in 'R3.fastq' s
`--edit-nr` This flag will automatically change the '3' in the R3 files record-headers. Its disabled by default.
`--no-gzip` This flag diables automatic compression (.gz) of output files.
+
+## For developers
+
+Go to the directory with the tool and type in `cargo build` .
From 2ff5e620995dede3f7eeff463aa313f1b4ef1c91 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 13 Apr 2023 22:26:08 +0200
Subject: [PATCH 07/72] Wrote the performance guide
---
README.md | 77 +++++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 67 insertions(+), 10 deletions(-)
diff --git a/README.md b/README.md
index dc44f3e..9161ec2 100644
--- a/README.md
+++ b/README.md
@@ -5,12 +5,27 @@ The UMIs are given as a fastq file and will be transferred, explaining the name
header of the first two fastq files.
## Installation
-TODO
+
+### Compile from source
+Given that you have [rust installed](https://www.rust-lang.org/tools/install) on your computer, download this repo and run
+```shell
+cargo build --release
+```
+That should create an executable `target/release/umi-transfer` that can be placed anywhere in your `$PATH` or be executed directly by specifying its path:
+
+```shell
+./target/release/umi-transfer --version
+umi-transfer 0.2.0
+```
## Usage
-The tool requires an input as follows:
+>### Performance Note:
+>The decompression and compression used within umi-transfer are single-threaded, so to get the best reads-per-minute performance, see the [high performance guide](#high-performance-guide).
+
+The tool requires three fastq files and additionally accepts flags to adjust the behaviour as can be seen from the help message:
-```bash
+```raw
+$ umi-transfer --help
umi-transfer 0.2.0
Judit Hohenthal, Matthias Zepper, Johannes Alneberg
A tool for transfering Unique Molecular Identifiers (UMIs).
@@ -44,20 +59,62 @@ OPTIONS:
-V, --version Print version information
```
-Running the tool can be done by `cargo run --release -- [options] --r1-in 'fastq' `, where the `--release` flag is optional, but will ensure an optimized build.
-
-
### Example
```shell
-cargo run --release -- --prefix 'output' --r1-in 'R1.fastq' --r2-in 'R3.fastq' separate --ru-in 'R2.fastq'
+cargo run --release -- --prefix 'output' --edit-nr --r1-in 'R1.fastq' --r2-in 'R3.fastq' --ru-in 'R2.fastq'
```
-### Special flags
+### High Performance Guide
+If you have more than one thread available on your computer and would like to process the read files as quickly as possible, we recommend using Unix FIFOs (First In First Out) to handle decompression and compression of the fastq files.
+This can be done as follows. Given that your input files are compressed as `fastq.gz`, first create FIFOs to represent your uncompressed input files:
-`--edit-nr` This flag will automatically change the '3' in the R3 files record-headers. Its disabled by default.
-`--no-gzip` This flag diables automatic compression (.gz) of output files.
+```shell
+$ mkfifo read1.fastq
+$ mkfifo read2.fastq
+$ mkfifo read3.fastq
+```
+and then we use `zcat` to decompress our input files and send them to the pipes that the FIFOs represent:
+```shell
+$ zcat read1.fastq.gz > read1.fastq &
+[1] 233387
+$ zcat read2.fastq.gz > read2.fastq &
+[2] 233388
+$ zcat read3.fastq.gz > read3.fastq &
+[3] 233389
+```
+Note the trailing `&` to leave these processes running in the background. We can inspect the directory with `ls`:
+```shell
+$ ls -lh
+total 1.5K
+-rw-rw----. 1 alneberg ngi2016004 4.5G Apr 13 12:18 read1.fastq.gz
+-rw-rw----. 1 alneberg ngi2016004 1.1G Apr 13 12:18 read2.fastq.gz
+-rw-rw----. 1 alneberg ngi2016004 4.5G Apr 13 12:18 read3.fastq.gz
+prw-rw-r--. 1 alneberg ngi2016004 0 Apr 13 12:46 read1.fastq
+prw-rw-r--. 1 alneberg ngi2016004 0 Apr 13 12:46 read2.fastq
+prw-rw-r--. 1 alneberg ngi2016004 0 Apr 13 12:46 read3.fastq
+```
+We continue to create corresponding FIFOs for the output files (note that the filenames need to match the value given to `--prefix`)
+```shell
+$ mkfifo output1.fastq
+$ mkfifo output2.fastq
+$ pigz -p 10 --stdout > output1.fastq.gz < output1.fastq &
+[4] 233394
+$ pigz -p 10 --stdout > output2.fastq.gz < output2.fastq &
+[5] 233395
+```
+The value `10` is how many threads each of the `pigz` processes is allowed to use.
+The optimal value for this depends on several factors and for optimal performance you will have to do some testing on your exact hardware.
+We can then run the `umi-transfer` program as follows:
+```shell
+$ umi-transfer --prefix output --edit-nr --r1-in read1.fastq --r2-in read3.fastq --ru-in read2.fastq
+```
+
+It's good practice to remove the FIFOs after the program has finished:
+```shell
+rm read*.fastq output*.fastq
+```
## For developers
Go to the directory with the tool and type in `cargo build` .
From 532faac0e0cb0c735eb427358017f2fdffb0f998 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Fri, 14 Apr 2023 15:32:31 +0200
Subject: [PATCH 08/72] Version 0.2.0
---
Cargo.lock | 2 +-
Cargo.toml | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 65c85e3..a188cc3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -906,7 +906,7 @@ checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987"
[[package]]
name = "umi-transfer"
-version = "0.1.0"
+version = "0.2.0"
dependencies = [
"bio",
"clap",
diff --git a/Cargo.toml b/Cargo.toml
index 712d2cb..a8c82e2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "umi-transfer"
-version = "0.1.0"
+version = "0.2.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
From 94bd11f0ca22d8be01c1aff34a70bad5de5f37c3 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Fri, 14 Apr 2023 15:33:02 +0200
Subject: [PATCH 09/72] Added background and for developers sections to README
---
README.md | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 9161ec2..1a86226 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,10 @@ A tool for transfering Unique Molecular Identifiers (UMIs).
The UMIs are given as a fastq file and will be transferred, explaining the name umi-transfer, to the
header of the first two fastq files.
+## Background
+Common demultiplexing software returns a separate fastq file, usually named `R2`, containing UMIs.
+However, common analysis tools do not allow for this and instead require the UMI to be contained within the header of the two reads in the pair.
+This tool performs this transformation efficiently and can also conveniently rename the oddly named read-`3` to read-`2`, which is probably more widely recognized.
## Installation
@@ -116,5 +120,13 @@ It's good practice to remove the FIFOs after the program has finished:
rm read*.fastq output*.fastq
```
## For developers
+To make modifications to `umi-transfer`, clone this repository, make your changes and then run the code with
+```shell
+cargo run --
+```
+or build the executable with
+```shell
+cargo build --release
+```
-Go to the directory with the tool and type in `cargo build` .
+Please make sure to activate code formatting by `rust-analyzer`.
From 86135d8ca291e6a37d46d22a55388feae65dcac1 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Tue, 18 Apr 2023 17:05:06 +0200
Subject: [PATCH 10/72] Use file-format crate to check for gzipped file
---
Cargo.lock | 7 +++++++
Cargo.toml | 1 +
src/main.rs | 7 +++++--
3 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index a188cc3..7f6ca32 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -301,6 +301,12 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "835a3dc7d1ec9e75e2b5fb4ba75396837112d2060b03f7d43bc1897c7f7211da"
+[[package]]
+name = "file-format"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2fb46518b6034ba6dbc075ca73277d66cbd488c651c2ccc7255c62b00ce48d24"
+
[[package]]
name = "fixedbitset"
version = "0.4.2"
@@ -910,6 +916,7 @@ version = "0.2.0"
dependencies = [
"bio",
"clap",
+ "file-format",
"flate2",
"indicatif",
"itertools",
diff --git a/Cargo.toml b/Cargo.toml
index a8c82e2..2347d24 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,3 +12,4 @@ lazy_static = "1.4"
indicatif = "0.17.0"
flate2 = "1.0.24"
itertools = "0.10.5"
+file-format = "0.7.0"
diff --git a/src/main.rs b/src/main.rs
index 9fd2e10..bcafeb1 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,5 @@
use clap::Parser;
+use file_format::FileFormat;
use itertools::izip;
use std::iter::Iterator;
@@ -49,9 +50,10 @@ impl OutputFile {
}
}
-// Read input file to Reader. Automatically scans if gzipped from .gz suffix
+// Read input file to Reader. Automatically scans if gzipped from file-format crate
fn read_fastq(path: &str) -> bio::io::fastq::Reader> {
- if path.ends_with(".gz") {
+ let format = FileFormat::from_file(path).unwrap();
+ if format == FileFormat::Gzip {
bio::io::fastq::Reader::new(ReadFile::Gzip(
std::fs::File::open(path)
.map(std::io::BufReader::new)
@@ -59,6 +61,7 @@ fn read_fastq(path: &str) -> bio::io::fastq::Reader
.unwrap(),
))
} else {
+ // If not gzipped, read as plain fastq
bio::io::fastq::Reader::new(ReadFile::Fastq(std::fs::File::open(path).unwrap()))
}
}
From b379998f114e61f4b966b67a23c70a65bf623ed0 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Tue, 18 Apr 2023 17:07:26 +0200
Subject: [PATCH 11/72] Added 'written by' to authors list in clap
---
src/main.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main.rs b/src/main.rs
index bcafeb1..bc18642 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -87,7 +87,7 @@ fn output_file(name: &str, gz: bool) -> OutputFile {
#[derive(clap::Parser)]
#[clap(
version = "0.2.0",
- author = "Judit Hohenthal, Matthias Zepper, Johannes Alneberg",
+ author = "Written by Judit Hohenthal, Matthias Zepper, Johannes Alneberg",
about = "A tool for transfering Unique Molecular Identifiers (UMIs). \n\nThe UMIs are given as a fastq file and will be transferred, explaining the name umi-transfer, to the header of the first two fastq files. \n\n"
)]
struct Opts {
From 6d3fe059e1342bebca49ce241588cf80bf2ca36a Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Tue, 18 Apr 2023 17:34:39 +0200
Subject: [PATCH 12/72] Failed splitting, cannot reach file_io from
umi_external
---
src/file_io.rs | 102 +++++++++++++++++++++++++++++++++
src/main.rs | 133 +-------------------------------------------
src/umi_external.rs | 34 +++++++++++
3 files changed, 139 insertions(+), 130 deletions(-)
create mode 100644 src/file_io.rs
create mode 100644 src/umi_external.rs
diff --git a/src/file_io.rs b/src/file_io.rs
new file mode 100644
index 0000000..484835d
--- /dev/null
+++ b/src/file_io.rs
@@ -0,0 +1,102 @@
+use file_format::FileFormat;
+
+// Defining types for simplicity
+type File = std::fs::File;
+type Fastq = std::io::BufReader;
+type Gzip = flate2::bufread::MultiGzDecoder;
+
+// Enum for the two acceptable input file formats: '.fastq' and '.fastq.gz'
+pub enum ReadFile {
+ Fastq(File),
+ Gzip(Gzip),
+}
+
+// Implement read for ReadFile enum
+impl std::io::Read for ReadFile {
+ fn read(&mut self, into: &mut [u8]) -> std::io::Result {
+ match self {
+ ReadFile::Fastq(file) => file.read(into),
+ ReadFile::Gzip(file) => file.read(into),
+ }
+ }
+}
+
+// Enum for the two accepted output formats, '.fastq' and '.fastq.gz'
+pub enum OutputFile {
+ Fastq {
+ read: bio::io::fastq::Writer,
+ },
+ Gzip {
+ read: bio::io::fastq::Writer>,
+ },
+}
+
+// Implement write for OutputFile enum
+impl OutputFile {
+ pub fn write(self, header: &str, desc: Option<&str>, s: bio::io::fastq::Record) -> OutputFile {
+ match self {
+ OutputFile::Fastq { mut read } => {
+ read.write(header, desc, s.seq(), s.qual()).unwrap();
+ OutputFile::Fastq { read }
+ }
+ OutputFile::Gzip { mut read } => {
+ read.write(header, desc, s.seq(), s.qual()).unwrap();
+ OutputFile::Gzip { read }
+ }
+ }
+ }
+}
+
+// Read input file to Reader. Automatically scans if gzipped from file-format crate
+pub fn read_fastq(path: &str) -> bio::io::fastq::Reader> {
+ let format = FileFormat::from_file(path).unwrap();
+ if format == FileFormat::Gzip {
+ bio::io::fastq::Reader::new(ReadFile::Gzip(
+ std::fs::File::open(path)
+ .map(std::io::BufReader::new)
+ .map(flate2::bufread::MultiGzDecoder::new)
+ .unwrap(),
+ ))
+ } else {
+ // If not gzipped, read as plain fastq
+ bio::io::fastq::Reader::new(ReadFile::Fastq(std::fs::File::open(path).unwrap()))
+ }
+}
+
+// Create output files
+pub fn output_file(name: &str, gz: bool) -> OutputFile {
+ if gz {
+ OutputFile::Gzip {
+ read: std::fs::File::create(format!("{}.fastq.gz", name))
+ .map(|w| flate2::write::GzEncoder::new(w, flate2::Compression::default()))
+ .map(bio::io::fastq::Writer::new)
+ .unwrap(),
+ }
+ } else {
+ OutputFile::Fastq {
+ read: std::fs::File::create(format!("{}.fastq", name))
+ .map(bio::io::fastq::Writer::new)
+ .unwrap(),
+ }
+ }
+}
+
+// Writes record with properly inserted UMI to Output file
+pub fn write_to_file(
+ input: bio::io::fastq::Record,
+ output: OutputFile,
+ umi: &[u8],
+ edit_nr: bool,
+) -> OutputFile {
+ let s = input;
+ if edit_nr {
+ let header = &[s.id(), ":", std::str::from_utf8(&umi).unwrap()].concat();
+ let mut string = String::from(s.desc().unwrap());
+ string.replace_range(0..1, "2");
+ let desc: Option<&str> = Some(&string);
+ output.write(header, desc, s)
+ } else {
+ let header = &[s.id(), ":", std::str::from_utf8(&umi).unwrap()].concat();
+ output.write(header, s.desc(), s.clone())
+ }
+}
diff --git a/src/main.rs b/src/main.rs
index bc18642..b5022d3 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,88 +1,7 @@
use clap::Parser;
-use file_format::FileFormat;
-use itertools::izip;
-use std::iter::Iterator;
-// Defining types for simplicity
-type File = std::fs::File;
-type Fastq = std::io::BufReader;
-type Gzip = flate2::bufread::MultiGzDecoder;
-
-// Enum for the two acceptable input file formats: '.fastq' and '.fastq.gz'
-enum ReadFile {
- Fastq(File),
- Gzip(Gzip),
-}
-
-// Implement read for ReadFile enum
-impl std::io::Read for ReadFile {
- fn read(&mut self, into: &mut [u8]) -> std::io::Result {
- match self {
- ReadFile::Fastq(file) => file.read(into),
- ReadFile::Gzip(file) => file.read(into),
- }
- }
-}
-
-// Enum for the two accepted output formats, '.fastq' and '.fastq.gz'
-enum OutputFile {
- Fastq {
- read: bio::io::fastq::Writer,
- },
- Gzip {
- read: bio::io::fastq::Writer>,
- },
-}
-
-// Implement write for OutputFile enum
-impl OutputFile {
- fn write(self, header: &str, desc: Option<&str>, s: bio::io::fastq::Record) -> OutputFile {
- match self {
- OutputFile::Fastq { mut read } => {
- read.write(header, desc, s.seq(), s.qual()).unwrap();
- OutputFile::Fastq { read }
- }
- OutputFile::Gzip { mut read } => {
- read.write(header, desc, s.seq(), s.qual()).unwrap();
- OutputFile::Gzip { read }
- }
- }
- }
-}
-
-// Read input file to Reader. Automatically scans if gzipped from file-format crate
-fn read_fastq(path: &str) -> bio::io::fastq::Reader> {
- let format = FileFormat::from_file(path).unwrap();
- if format == FileFormat::Gzip {
- bio::io::fastq::Reader::new(ReadFile::Gzip(
- std::fs::File::open(path)
- .map(std::io::BufReader::new)
- .map(flate2::bufread::MultiGzDecoder::new)
- .unwrap(),
- ))
- } else {
- // If not gzipped, read as plain fastq
- bio::io::fastq::Reader::new(ReadFile::Fastq(std::fs::File::open(path).unwrap()))
- }
-}
-
-// Create output files
-fn output_file(name: &str, gz: bool) -> OutputFile {
- if gz {
- OutputFile::Gzip {
- read: std::fs::File::create(format!("{}.fastq.gz", name))
- .map(|w| flate2::write::GzEncoder::new(w, flate2::Compression::default()))
- .map(bio::io::fastq::Writer::new)
- .unwrap(),
- }
- } else {
- OutputFile::Fastq {
- read: std::fs::File::create(format!("{}.fastq", name))
- .map(bio::io::fastq::Writer::new)
- .unwrap(),
- }
- }
-}
+pub mod file_io;
+mod umi_external;
#[derive(clap::Parser)]
#[clap(
@@ -133,55 +52,9 @@ struct Opts {
gzip: bool,
}
-// Writes record with properly inserted UMI to Output file
-fn write_to_file(
- input: bio::io::fastq::Record,
- output: OutputFile,
- umi: &[u8],
- edit_nr: bool,
-) -> OutputFile {
- let s = input;
- if edit_nr {
- let header = &[s.id(), ":", std::str::from_utf8(&umi).unwrap()].concat();
- let mut string = String::from(s.desc().unwrap());
- string.replace_range(0..1, "2");
- let desc: Option<&str> = Some(&string);
- output.write(header, desc, s)
- } else {
- let header = &[s.id(), ":", std::str::from_utf8(&umi).unwrap()].concat();
- output.write(header, s.desc(), s.clone())
- }
-}
-
fn main() {
// Parse commandline arguments
let args = Opts::parse();
- // Enables editing id in output file 2 if --edit-nr flag was included
- let mut edit_nr = false;
- if args.edit_nr {
- edit_nr = true;
- }
-
- // Create fastq record iterators from input files
- let r1 = read_fastq(&args.r1_in[0]).records();
- let r2 = read_fastq(&args.r2_in[0]).records();
- let ru = read_fastq(&args.ru_in[0]).records();
-
- // Create write files.
- let mut write_file_r1 = output_file(&format!("{}1", &args.prefix), args.gzip);
- let mut write_file_r2 = output_file(&format!("{}2", &args.prefix), args.gzip);
-
- println!("Transfering UMIs to records...");
-
- // Iterate over records in input files
- for (r1_rec, ru_rec_res, r2_rec) in izip!(r1, ru, r2) {
- let ru_rec = ru_rec_res.unwrap();
- // Write to Output file (never edit nr for R1)
- write_file_r1 = write_to_file(r1_rec.unwrap(), write_file_r1, ru_rec.seq(), false);
-
- let ru_rec2 = ru_rec.clone();
- // Write to Output file (edit nr for R2 if --edit-nr flag was included)
- write_file_r2 = write_to_file(r2_rec.unwrap(), write_file_r2, ru_rec2.seq(), edit_nr);
- }
+ umi_external::run(args);
}
diff --git a/src/umi_external.rs b/src/umi_external.rs
new file mode 100644
index 0000000..b1902b9
--- /dev/null
+++ b/src/umi_external.rs
@@ -0,0 +1,34 @@
+use itertools::izip;
+
+use file_io;
+
+pub fn run(args: clap::Opts) {
+ // Enables editing id in output file 2 if --edit-nr flag was included
+ let mut edit_nr = false;
+ if args.edit_nr {
+ edit_nr = true;
+ }
+
+ // Create fastq record iterators from input files
+ let r1 = file_io::read_fastq(&args.r1_in[0]).records();
+ let r2 = file_io::read_fastq(&args.r2_in[0]).records();
+ let ru = file_io::read_fastq(&args.ru_in[0]).records();
+
+ // Create write files.
+ let mut write_file_r1 = file_io::output_file(&format!("{}1", &args.prefix), args.gzip);
+ let mut write_file_r2 = file_io::output_file(&format!("{}2", &args.prefix), args.gzip);
+
+ println!("Transfering UMIs to records...");
+
+ // Iterate over records in input files
+ for (r1_rec, ru_rec_res, r2_rec) in izip!(r1, ru, r2) {
+ let ru_rec = ru_rec_res.unwrap();
+ // Write to Output file (never edit nr for R1)
+ write_file_r1 = file_io::write_to_file(r1_rec.unwrap(), write_file_r1, ru_rec.seq(), false);
+
+ let ru_rec2 = ru_rec.clone();
+ // Write to Output file (edit nr for R2 if --edit-nr flag was included)
+ write_file_r2 =
+ file_io::write_to_file(r2_rec.unwrap(), write_file_r2, ru_rec2.seq(), edit_nr);
+ }
+}
From c1f4dcb45d1f9cadba4caff499cabe743ea90e34 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 20 Apr 2023 09:25:09 +0200
Subject: [PATCH 13/72] Managed to use modules
---
src/main.rs | 4 ++--
src/umi_external.rs | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/main.rs b/src/main.rs
index b5022d3..f15935a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,6 @@
use clap::Parser;
-pub mod file_io;
+mod file_io;
mod umi_external;
#[derive(clap::Parser)]
@@ -9,7 +9,7 @@ mod umi_external;
author = "Written by Judit Hohenthal, Matthias Zepper, Johannes Alneberg",
about = "A tool for transfering Unique Molecular Identifiers (UMIs). \n\nThe UMIs are given as a fastq file and will be transferred, explaining the name umi-transfer, to the header of the first two fastq files. \n\n"
)]
-struct Opts {
+pub struct Opts {
#[clap(
long,
default_value = "output",
diff --git a/src/umi_external.rs b/src/umi_external.rs
index b1902b9..5b9b80f 100644
--- a/src/umi_external.rs
+++ b/src/umi_external.rs
@@ -1,8 +1,8 @@
use itertools::izip;
-use file_io;
+use super::file_io;
-pub fn run(args: clap::Opts) {
+pub fn run(args: super::Opts) {
// Enables editing id in output file 2 if --edit-nr flag was included
let mut edit_nr = false;
if args.edit_nr {
From d00f40fc4b1a7a626f0a9fd8054ddc48f631eed3 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 20 Apr 2023 09:25:44 +0200
Subject: [PATCH 14/72] Added a gitignore
---
.gitignore | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
create mode 100644 .gitignore
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9b96c89
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,18 @@
+# Generated by Cargo
+# will have compiled files and executables
+debug/
+target/
+
+# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
+# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
+# We have so far committed Cargo.lock, so we need to decide whether we want to remove it
+# Cargo.lock
+
+# These are backup files generated by rustfmt
+**/*.rs.bk
+
+# MSVC Windows builds of rustc generate these, which store debugging information
+*.pdb
+
+# Test data
+test_*.fastq.gz
\ No newline at end of file
From 723b08bef3ad1e2430c939170d08f0678c3c30ff Mon Sep 17 00:00:00 2001
From: Matthias Zepper
Date: Wed, 3 May 2023 15:40:22 +0200
Subject: [PATCH 15/72] Adding some basic error handling.
---
Cargo.lock | 5 +++--
Cargo.toml | 1 +
src/main.rs | 10 ++++++----
src/umi_external.rs | 26 +++++++++++++++++++-------
4 files changed, 29 insertions(+), 13 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 7f6ca32..298d3f7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -19,9 +19,9 @@ dependencies = [
[[package]]
name = "anyhow"
-version = "1.0.58"
+version = "1.0.71"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb07d2053ccdbe10e2af2995a2f116c1330396493dc1269f6a91d0ae82e19704"
+checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
[[package]]
name = "approx"
@@ -914,6 +914,7 @@ checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987"
name = "umi-transfer"
version = "0.2.0"
dependencies = [
+ "anyhow",
"bio",
"clap",
"file-format",
diff --git a/Cargo.toml b/Cargo.toml
index 2347d24..f8db807 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,3 +13,4 @@ indicatif = "0.17.0"
flate2 = "1.0.24"
itertools = "0.10.5"
file-format = "0.7.0"
+anyhow = "1.0.71"
diff --git a/src/main.rs b/src/main.rs
index f15935a..1033e06 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,4 @@
+use anyhow::{Context, Result};
use clap::Parser;
mod file_io;
@@ -7,7 +8,7 @@ mod umi_external;
#[clap(
version = "0.2.0",
author = "Written by Judit Hohenthal, Matthias Zepper, Johannes Alneberg",
- about = "A tool for transfering Unique Molecular Identifiers (UMIs). \n\nThe UMIs are given as a fastq file and will be transferred, explaining the name umi-transfer, to the header of the first two fastq files. \n\n"
+ about = "A tool for transferring Unique Molecular Identifiers (UMIs). \n\nThe UMIs are given as a fastq file and will be transferred, explaining the name umi-transfer, to the header of the first two fastq files. \n\n"
)]
pub struct Opts {
#[clap(
@@ -52,9 +53,10 @@ pub struct Opts {
gzip: bool,
}
-fn main() {
- // Parse commandline arguments
+fn main() -> Result<()> {
+ // Parse command line arguments
let args = Opts::parse();
- umi_external::run(args);
+ umi_external::run(args).context("Failed to include the UMIs")?;
+ Ok(())
}
diff --git a/src/umi_external.rs b/src/umi_external.rs
index 5b9b80f..b0263c1 100644
--- a/src/umi_external.rs
+++ b/src/umi_external.rs
@@ -1,8 +1,9 @@
+use anyhow::{Context, Result};
use itertools::izip;
use super::file_io;
-pub fn run(args: super::Opts) {
+pub fn run(args: super::Opts) -> Result<()> {
// Enables editing id in output file 2 if --edit-nr flag was included
let mut edit_nr = false;
if args.edit_nr {
@@ -18,17 +19,28 @@ pub fn run(args: super::Opts) {
let mut write_file_r1 = file_io::output_file(&format!("{}1", &args.prefix), args.gzip);
let mut write_file_r2 = file_io::output_file(&format!("{}2", &args.prefix), args.gzip);
- println!("Transfering UMIs to records...");
+ println!("Transferring UMIs to records...");
// Iterate over records in input files
for (r1_rec, ru_rec_res, r2_rec) in izip!(r1, ru, r2) {
- let ru_rec = ru_rec_res.unwrap();
+ let ru_rec = ru_rec_res
+ .with_context(|| format!("Failed to read records from {}", &args.ru_in[0]))?;
+
// Write to Output file (never edit nr for R1)
- write_file_r1 = file_io::write_to_file(r1_rec.unwrap(), write_file_r1, ru_rec.seq(), false);
+ write_file_r1 = file_io::write_to_file(
+ r1_rec.with_context(|| format!("Failed to read records from {}", &args.r1_in[0]))?,
+ write_file_r1,
+ &ru_rec.seq(),
+ false,
+ );
- let ru_rec2 = ru_rec.clone();
// Write to Output file (edit nr for R2 if --edit-nr flag was included)
- write_file_r2 =
- file_io::write_to_file(r2_rec.unwrap(), write_file_r2, ru_rec2.seq(), edit_nr);
+ write_file_r2 = file_io::write_to_file(
+ r2_rec.with_context(|| format!("Failed to read records from {}", &args.r2_in[0]))?,
+ write_file_r2,
+ &ru_rec.seq(),
+ edit_nr,
+ );
}
+ Ok(())
}
From b2c502bc62c1f9f113ab44e5d0988f9c05991fcf Mon Sep 17 00:00:00 2001
From: Matthias Zepper
Date: Wed, 3 May 2023 20:32:39 +0200
Subject: [PATCH 16/72] Implemented some basic error handling.
---
src/main.rs | 1 +
src/umi_errors.rs | 19 +++++++++++++++++++
src/umi_external.rs | 37 ++++++++++++++++++++-----------------
3 files changed, 40 insertions(+), 17 deletions(-)
create mode 100644 src/umi_errors.rs
diff --git a/src/main.rs b/src/main.rs
index 1033e06..2a65b3c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,6 +2,7 @@ use anyhow::{Context, Result};
use clap::Parser;
mod file_io;
+mod umi_errors;
mod umi_external;
#[derive(clap::Parser)]
diff --git a/src/umi_errors.rs b/src/umi_errors.rs
new file mode 100644
index 0000000..54075a8
--- /dev/null
+++ b/src/umi_errors.rs
@@ -0,0 +1,19 @@
+#[derive(Debug)]
+pub enum RuntimeErrors {
+ ReadIDMismatchError,
+ FileNotFoundError,
+ GeneralError,
+}
+
+impl std::fmt::Display for RuntimeErrors {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ Self::ReadIDMismatchError => write!(
+ f,
+ "IDs of UMI and read records mismatch. Please provide sorted files!"
+ ),
+ Self::FileNotFoundError => write!(f, "Cannot read from specified path."),
+ Self::GeneralError => write!(f, "Encountered an error."),
+ }
+ }
+}
diff --git a/src/umi_external.rs b/src/umi_external.rs
index b0263c1..063c5c9 100644
--- a/src/umi_external.rs
+++ b/src/umi_external.rs
@@ -1,7 +1,8 @@
-use anyhow::{Context, Result};
+use anyhow::{anyhow, Context, Result};
use itertools::izip;
use super::file_io;
+use crate::umi_errors::RuntimeErrors;
pub fn run(args: super::Opts) -> Result<()> {
// Enables editing id in output file 2 if --edit-nr flag was included
@@ -22,25 +23,27 @@ pub fn run(args: super::Opts) -> Result<()> {
println!("Transferring UMIs to records...");
// Iterate over records in input files
- for (r1_rec, ru_rec_res, r2_rec) in izip!(r1, ru, r2) {
+ for (r1_rec_res, ru_rec_res, r2_rec_res) in izip!(r1, ru, r2) {
+ let r1_rec = r1_rec_res
+ .with_context(|| format!("Failed to read records from {}", &args.r1_in[0]))?;
+ let r2_rec = r2_rec_res
+ .with_context(|| format!("Failed to read records from {}", &args.r2_in[0]))?;
let ru_rec = ru_rec_res
.with_context(|| format!("Failed to read records from {}", &args.ru_in[0]))?;
- // Write to Output file (never edit nr for R1)
- write_file_r1 = file_io::write_to_file(
- r1_rec.with_context(|| format!("Failed to read records from {}", &args.r1_in[0]))?,
- write_file_r1,
- &ru_rec.seq(),
- false,
- );
-
- // Write to Output file (edit nr for R2 if --edit-nr flag was included)
- write_file_r2 = file_io::write_to_file(
- r2_rec.with_context(|| format!("Failed to read records from {}", &args.r2_in[0]))?,
- write_file_r2,
- &ru_rec.seq(),
- edit_nr,
- );
+ if r1_rec.id().eq(ru_rec.id()) {
+ // Write to Output file (never edit nr for R1)
+ write_file_r1 = file_io::write_to_file(r1_rec, write_file_r1, &ru_rec.seq(), false);
+ } else {
+ return Err(anyhow!(RuntimeErrors::ReadIDMismatchError));
+ }
+
+ if r2_rec.id().eq(ru_rec.id()) {
+ // Write to Output file (edit nr for R2 if --edit-nr flag was included)
+ write_file_r2 = file_io::write_to_file(r2_rec, write_file_r2, &ru_rec.seq(), edit_nr);
+ } else {
+ return Err(anyhow!(RuntimeErrors::ReadIDMismatchError));
+ }
}
Ok(())
}
From aef2bacbb29f93340c5d5e9bbd9e791ade5ca4d4 Mon Sep 17 00:00:00 2001
From: Matthias Zepper
Date: Fri, 5 May 2023 20:15:20 +0200
Subject: [PATCH 17/72] Further refactor: Subcommand structure to allow for
easy addition of further subcommands in later versions.
---
src/auxiliary.rs | 11 +++++++
src/main.rs | 76 ++++++++++++++++++---------------------------
src/umi_external.rs | 47 +++++++++++++++++++++++++++-
3 files changed, 87 insertions(+), 47 deletions(-)
create mode 100644 src/auxiliary.rs
diff --git a/src/auxiliary.rs b/src/auxiliary.rs
new file mode 100644
index 0000000..81d9099
--- /dev/null
+++ b/src/auxiliary.rs
@@ -0,0 +1,11 @@
+use std::time::Instant;
+
+pub fn timedrun(msg: &str, func: F) -> R
+where
+ F: FnOnce() -> R,
+{
+ let start = Instant::now();
+ let measure = func();
+ println!("{msg} after {:.1} seconds", start.elapsed().as_secs_f32());
+ measure
+}
diff --git a/src/main.rs b/src/main.rs
index 2a65b3c..f03129e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,12 @@
+extern crate core;
+
use anyhow::{Context, Result};
use clap::Parser;
+use crate::auxiliary::timedrun;
+use crate::umi_external::OptsExternal;
+///use crate::umi_internal::OptsInternal;
+mod auxiliary;
mod file_io;
mod umi_errors;
mod umi_external;
@@ -11,53 +17,31 @@ mod umi_external;
author = "Written by Judit Hohenthal, Matthias Zepper, Johannes Alneberg",
about = "A tool for transferring Unique Molecular Identifiers (UMIs). \n\nThe UMIs are given as a fastq file and will be transferred, explaining the name umi-transfer, to the header of the first two fastq files. \n\n"
)]
-pub struct Opts {
- #[clap(
- long,
- default_value = "output",
- help = "Prefix for output files, omitted flag will result in default value.
- \n "
- )]
- prefix: String,
- #[clap(
- long,
- help = "Automatically change '3' into '2' in sequence header of output file from R3.
- \n "
- )]
- edit_nr: bool,
- #[clap(
- long,
- required = true,
- help = "[REQUIRED] Input file 1 with reads.
- \n "
- )]
- r1_in: Vec,
- #[clap(
- long,
- required = true,
- help = "[REQUIRED] Input file 2 with reads.
- \n "
- )]
- r2_in: Vec,
- #[clap(
- long,
- required = true,
- help = "[REQUIRED] Input file with UMI.
- \n"
- )]
- ru_in: Vec,
- #[clap(
- long,
- help = "Compress output files with gzip. By default turned off to encourage use of external compression (see Readme).
- \n "
- )]
- gzip: bool,
+
+pub struct Opt {
+ #[clap(subcommand)]
+ cmd: Subcommand,
+}
+
+#[derive(Debug, Parser)]
+enum Subcommand {
+ /// Integrate UMIs from a separate FastQ file.
+ External(OptsExternal),
+ // Extract UMIs from the reads themselves.
+ // Internal(OptsInternal),
}
-fn main() -> Result<()> {
- // Parse command line arguments
- let args = Opts::parse();
+fn main() {
+ let opt: Opt = Opt::parse();
+ timedrun("umi-transfer finished ", || {
+ let res = match opt.cmd {
+ Subcommand::External(arg) => {
+ umi_external::run(arg).context("Failed to include the UMIs")
+ } //Subcommand::Internal(arg) => umi_internal::run(arg),
+ };
- umi_external::run(args).context("Failed to include the UMIs")?;
- Ok(())
+ if let Err(v) = res {
+ println!("{:?}", v)
+ }
+ });
}
diff --git a/src/umi_external.rs b/src/umi_external.rs
index 063c5c9..8a12ddd 100644
--- a/src/umi_external.rs
+++ b/src/umi_external.rs
@@ -1,10 +1,55 @@
use anyhow::{anyhow, Context, Result};
+use clap::Parser;
use itertools::izip;
+use std::path::PathBuf;
use super::file_io;
use crate::umi_errors::RuntimeErrors;
+#[derive(Debug, Parser)]
+pub struct OptsExternal {
+ #[clap(
+ long,
+ default_value = "output",
+ help = "Prefix for output files, omitted flag will result in default value.
+ \n "
+ )]
+ prefix: String,
+ #[clap(
+ long,
+ help = "Automatically change '3' into '2' in sequence header of output file from R3.
+ \n "
+ )]
+ edit_nr: bool,
+ #[clap(
+ long,
+ required = true,
+ help = "[REQUIRED] Input file 1 with reads.
+ \n "
+ )]
+ r1_in: Vec,
+ #[clap(
+ long,
+ required = true,
+ help = "[REQUIRED] Input file 2 with reads.
+ \n "
+ )]
+ r2_in: Vec,
+ #[clap(
+ long,
+ required = true,
+ help = "[REQUIRED] Input file with UMI.
+ \n"
+ )]
+ ru_in: Vec,
+ #[clap(
+ long,
+ help = "Compress output files with gzip. By default turned off to encourage use of external compression (see Readme).
+ \n "
+ )]
+ gzip: bool,
+}
-pub fn run(args: super::Opts) -> Result<()> {
+pub fn run(args: OptsExternal) -> Result<()> {
// Enables editing id in output file 2 if --edit-nr flag was included
let mut edit_nr = false;
if args.edit_nr {
From c62c17a9b9ec4e54c1982b336d53c0640332ded8 Mon Sep 17 00:00:00 2001
From: Matthias Zepper
Date: Fri, 5 May 2023 20:54:01 +0200
Subject: [PATCH 18/72] Implemented a simple counter for the records.
---
src/file_io.rs | 3 ++-
src/main.rs | 2 +-
src/umi_external.rs | 59 +++++++++++++++++++++++++++++++--------------
3 files changed, 44 insertions(+), 20 deletions(-)
diff --git a/src/file_io.rs b/src/file_io.rs
index 484835d..94470c2 100644
--- a/src/file_io.rs
+++ b/src/file_io.rs
@@ -1,4 +1,5 @@
use file_format::FileFormat;
+use std::path::PathBuf;
// Defining types for simplicity
type File = std::fs::File;
@@ -48,7 +49,7 @@ impl OutputFile {
}
// Read input file to Reader. Automatically scans if gzipped from file-format crate
-pub fn read_fastq(path: &str) -> bio::io::fastq::Reader> {
+pub fn read_fastq(path: &PathBuf) -> bio::io::fastq::Reader> {
let format = FileFormat::from_file(path).unwrap();
if format == FileFormat::Gzip {
bio::io::fastq::Reader::new(ReadFile::Gzip(
diff --git a/src/main.rs b/src/main.rs
index f03129e..3a2f539 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -33,7 +33,7 @@ enum Subcommand {
fn main() {
let opt: Opt = Opt::parse();
- timedrun("umi-transfer finished ", || {
+ timedrun("umi-transfer finished", || {
let res = match opt.cmd {
Subcommand::External(arg) => {
umi_external::run(arg).context("Failed to include the UMIs")
diff --git a/src/umi_external.rs b/src/umi_external.rs
index 8a12ddd..e6ffc37 100644
--- a/src/umi_external.rs
+++ b/src/umi_external.rs
@@ -21,35 +21,39 @@ pub struct OptsExternal {
)]
edit_nr: bool,
#[clap(
- long,
+ short = '1',
+ long = "in1",
required = true,
help = "[REQUIRED] Input file 1 with reads.
\n "
)]
- r1_in: Vec,
+ r1_in: PathBuf,
#[clap(
- long,
+ short = '2',
+ long = "in2",
required = true,
help = "[REQUIRED] Input file 2 with reads.
\n "
)]
- r2_in: Vec,
+ r2_in: PathBuf,
#[clap(
- long,
+ short = 'u',
+ long = "umi",
required = true,
help = "[REQUIRED] Input file with UMI.
\n"
)]
- ru_in: Vec,
+ ru_in: PathBuf,
#[clap(
- long,
+ short = 'z',
+ long = "gzip",
help = "Compress output files with gzip. By default turned off to encourage use of external compression (see Readme).
\n "
)]
gzip: bool,
}
-pub fn run(args: OptsExternal) -> Result<()> {
+pub fn run(args: OptsExternal) -> Result {
// Enables editing id in output file 2 if --edit-nr flag was included
let mut edit_nr = false;
if args.edit_nr {
@@ -57,24 +61,42 @@ pub fn run(args: OptsExternal) -> Result<()> {
}
// Create fastq record iterators from input files
- let r1 = file_io::read_fastq(&args.r1_in[0]).records();
- let r2 = file_io::read_fastq(&args.r2_in[0]).records();
- let ru = file_io::read_fastq(&args.ru_in[0]).records();
+ let r1 = file_io::read_fastq(&args.r1_in).records();
+ let r2 = file_io::read_fastq(&args.r2_in).records();
+ let ru = file_io::read_fastq(&args.ru_in).records();
// Create write files.
let mut write_file_r1 = file_io::output_file(&format!("{}1", &args.prefix), args.gzip);
let mut write_file_r2 = file_io::output_file(&format!("{}2", &args.prefix), args.gzip);
+ // Record counter
+ let mut counter: i32 = 0;
+
println!("Transferring UMIs to records...");
// Iterate over records in input files
for (r1_rec_res, ru_rec_res, r2_rec_res) in izip!(r1, ru, r2) {
- let r1_rec = r1_rec_res
- .with_context(|| format!("Failed to read records from {}", &args.r1_in[0]))?;
- let r2_rec = r2_rec_res
- .with_context(|| format!("Failed to read records from {}", &args.r2_in[0]))?;
- let ru_rec = ru_rec_res
- .with_context(|| format!("Failed to read records from {}", &args.ru_in[0]))?;
+ let r1_rec = r1_rec_res.with_context(|| {
+ format!(
+ "Failed to read records from {}",
+ &args.r1_in.to_string_lossy()
+ )
+ })?;
+ let r2_rec = r2_rec_res.with_context(|| {
+ format!(
+ "Failed to read records from {}",
+ &args.r2_in.to_string_lossy()
+ )
+ })?;
+ let ru_rec = ru_rec_res.with_context(|| {
+ format!(
+ "Failed to read records from {}",
+ &args.ru_in.to_string_lossy()
+ )
+ })?;
+
+ // Step counter
+ counter += 1;
if r1_rec.id().eq(ru_rec.id()) {
// Write to Output file (never edit nr for R1)
@@ -90,5 +112,6 @@ pub fn run(args: OptsExternal) -> Result<()> {
return Err(anyhow!(RuntimeErrors::ReadIDMismatchError));
}
}
- Ok(())
+ println!("Processed {:?} records", counter);
+ Ok(counter)
}
From 029c087282443985028728dc8c3b151d2452f51d Mon Sep 17 00:00:00 2001
From: Matthias Zepper
Date: Mon, 8 May 2023 15:08:52 +0200
Subject: [PATCH 19/72] Switching the CLI arguments from strings to
Option and implemented an output overwrite check and prompt.
---
Cargo.lock | 267 +++++++++++++++++++++++++++++++++++++++++++-
Cargo.toml | 1 +
src/auxiliary.rs | 23 +++-
src/file_io.rs | 10 +-
src/umi_errors.rs | 2 +
src/umi_external.rs | 58 +++++++---
6 files changed, 335 insertions(+), 26 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 298d3f7..b2a29bf 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -38,7 +38,7 @@ version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
- "hermit-abi",
+ "hermit-abi 0.1.19",
"libc",
"winapi",
]
@@ -156,6 +156,12 @@ version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
+[[package]]
+name = "cc"
+version = "1.0.79"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+
[[package]]
name = "cfg-if"
version = "1.0.0"
@@ -263,6 +269,18 @@ dependencies = [
"syn",
]
+[[package]]
+name = "dialoguer"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59c6f2989294b9a498d3ad5491a79c6deb604617378e1cdc4bfc1c1361fe2f87"
+dependencies = [
+ "console",
+ "shell-words",
+ "tempfile",
+ "zeroize",
+]
+
[[package]]
name = "either"
version = "1.7.0"
@@ -295,6 +313,36 @@ dependencies = [
"syn",
]
+[[package]]
+name = "errno"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
+dependencies = [
+ "errno-dragonfly",
+ "libc",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "errno-dragonfly"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
+dependencies = [
+ "cc",
+ "libc",
+]
+
+[[package]]
+name = "fastrand"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be"
+dependencies = [
+ "instant",
+]
+
[[package]]
name = "feature-probe"
version = "0.1.1"
@@ -385,6 +433,12 @@ dependencies = [
"libc",
]
+[[package]]
+name = "hermit-abi"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
+
[[package]]
name = "indexmap"
version = "1.9.1"
@@ -406,6 +460,26 @@ dependencies = [
"unicode-width",
]
+[[package]]
+name = "instant"
+version = "0.1.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "io-lifetimes"
+version = "1.0.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220"
+dependencies = [
+ "hermit-abi 0.3.1",
+ "libc",
+ "windows-sys 0.48.0",
+]
+
[[package]]
name = "itertools"
version = "0.10.5"
@@ -438,9 +512,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
-version = "0.2.126"
+version = "0.2.143"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836"
+checksum = "edc207893e85c5d6be840e969b496b53d94cec8be2d501b214f50daa97fa8024"
[[package]]
name = "libm"
@@ -448,6 +522,12 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33a33a362ce288760ec6a508b94caaec573ae7d3bbbd91b87aa0bad4456839db"
+[[package]]
+name = "linux-raw-sys"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ece97ea872ece730aed82664c424eb4c8291e1ff2480247ccf7409044bc6479f"
+
[[package]]
name = "matrixmultiply"
version = "0.3.2"
@@ -709,6 +789,15 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
+[[package]]
+name = "redox_syscall"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
+dependencies = [
+ "bitflags",
+]
+
[[package]]
name = "regex"
version = "1.6.0"
@@ -741,6 +830,20 @@ dependencies = [
"semver",
]
+[[package]]
+name = "rustix"
+version = "0.37.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d"
+dependencies = [
+ "bitflags",
+ "errno",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.48.0",
+]
+
[[package]]
name = "rustversion"
version = "1.0.8"
@@ -779,6 +882,12 @@ dependencies = [
"syn",
]
+[[package]]
+name = "shell-words"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde"
+
[[package]]
name = "simba"
version = "0.5.1"
@@ -853,6 +962,19 @@ dependencies = [
"unicode-ident",
]
+[[package]]
+name = "tempfile"
+version = "3.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
+dependencies = [
+ "cfg-if",
+ "fastrand",
+ "redox_syscall",
+ "rustix",
+ "windows-sys 0.45.0",
+]
+
[[package]]
name = "termcolor"
version = "1.1.3"
@@ -917,6 +1039,7 @@ dependencies = [
"anyhow",
"bio",
"clap",
+ "dialoguer",
"file-format",
"flate2",
"indicatif",
@@ -993,3 +1116,141 @@ name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "windows-sys"
+version = "0.45.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+dependencies = [
+ "windows-targets 0.42.2",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
+dependencies = [
+ "windows-targets 0.48.0",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
+dependencies = [
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5"
+dependencies = [
+ "windows_aarch64_gnullvm 0.48.0",
+ "windows_aarch64_msvc 0.48.0",
+ "windows_i686_gnu 0.48.0",
+ "windows_i686_msvc 0.48.0",
+ "windows_x86_64_gnu 0.48.0",
+ "windows_x86_64_gnullvm 0.48.0",
+ "windows_x86_64_msvc 0.48.0",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
+
+[[package]]
+name = "zeroize"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
diff --git a/Cargo.toml b/Cargo.toml
index f8db807..b8f599a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,3 +14,4 @@ flate2 = "1.0.24"
itertools = "0.10.5"
file-format = "0.7.0"
anyhow = "1.0.71"
+dialoguer = "0.10.4"
diff --git a/src/auxiliary.rs b/src/auxiliary.rs
index 81d9099..ef74693 100644
--- a/src/auxiliary.rs
+++ b/src/auxiliary.rs
@@ -1,4 +1,7 @@
-use std::time::Instant;
+use super::umi_errors::RuntimeErrors;
+use anyhow::{anyhow, Result};
+use dialoguer::Confirm;
+use std::{fs, path::PathBuf, time::Instant};
pub fn timedrun(msg: &str, func: F) -> R
where
@@ -9,3 +12,21 @@ where
println!("{msg} after {:.1} seconds", start.elapsed().as_secs_f32());
measure
}
+
+pub fn check_outputpath(path: PathBuf) -> Result {
+ let exists = fs::metadata(&path).is_ok();
+
+ if exists {
+ if Confirm::new()
+ .with_prompt(format!("{} exists. Overwrite?", path.display()))
+ .interact()?
+ {
+ println!("File will be overwritten.");
+ return Ok(path);
+ } else {
+ return Err(anyhow!(RuntimeErrors::FileExistsError));
+ }
+ } else {
+ return Ok(path);
+ }
+}
diff --git a/src/file_io.rs b/src/file_io.rs
index 94470c2..943b9f8 100644
--- a/src/file_io.rs
+++ b/src/file_io.rs
@@ -65,17 +65,19 @@ pub fn read_fastq(path: &PathBuf) -> bio::io::fastq::Reader OutputFile {
- if gz {
+pub fn output_file(mut name: PathBuf, gz: bool) -> OutputFile {
+ if gz | name.ends_with(".gz") {
+ name.set_extension("fastq.gz");
OutputFile::Gzip {
- read: std::fs::File::create(format!("{}.fastq.gz", name))
+ read: std::fs::File::create(name.as_path())
.map(|w| flate2::write::GzEncoder::new(w, flate2::Compression::default()))
.map(bio::io::fastq::Writer::new)
.unwrap(),
}
} else {
+ name.set_extension("fastq");
OutputFile::Fastq {
- read: std::fs::File::create(format!("{}.fastq", name))
+ read: std::fs::File::create(name.as_path())
.map(bio::io::fastq::Writer::new)
.unwrap(),
}
diff --git a/src/umi_errors.rs b/src/umi_errors.rs
index 54075a8..0d24054 100644
--- a/src/umi_errors.rs
+++ b/src/umi_errors.rs
@@ -2,6 +2,7 @@
pub enum RuntimeErrors {
ReadIDMismatchError,
FileNotFoundError,
+ FileExistsError,
GeneralError,
}
@@ -13,6 +14,7 @@ impl std::fmt::Display for RuntimeErrors {
"IDs of UMI and read records mismatch. Please provide sorted files!"
),
Self::FileNotFoundError => write!(f, "Cannot read from specified path."),
+ Self::FileExistsError => write!(f, "Output file exists, but must not be overwritten."),
Self::GeneralError => write!(f, "Encountered an error."),
}
}
diff --git a/src/umi_external.rs b/src/umi_external.rs
index e6ffc37..5374d7b 100644
--- a/src/umi_external.rs
+++ b/src/umi_external.rs
@@ -4,32 +4,31 @@ use itertools::izip;
use std::path::PathBuf;
use super::file_io;
-use crate::umi_errors::RuntimeErrors;
+use crate::{auxiliary::check_outputpath, umi_errors::RuntimeErrors};
#[derive(Debug, Parser)]
pub struct OptsExternal {
#[clap(
- long,
- default_value = "output",
- help = "Prefix for output files, omitted flag will result in default value.
+ short = 'f',
+ long = "fix_numbers",
+ help = "Automatically change '3' into '2' in sequence header of output file from R3.
\n "
)]
- prefix: String,
+ edit_nr: bool,
#[clap(
- long,
- help = "Automatically change '3' into '2' in sequence header of output file from R3.
+ short = 'z',
+ long = "gzip",
+ help = "Compress output files with gzip. By default turned off to encourage use of external compression (see Readme).
\n "
)]
- edit_nr: bool,
+ gzip: bool,
#[clap(
- short = '1',
- long = "in1",
+ long = "in",
required = true,
help = "[REQUIRED] Input file 1 with reads.
\n "
)]
r1_in: PathBuf,
#[clap(
- short = '2',
long = "in2",
required = true,
help = "[REQUIRED] Input file 2 with reads.
@@ -45,12 +44,24 @@ pub struct OptsExternal {
)]
ru_in: PathBuf,
#[clap(
- short = 'z',
- long = "gzip",
- help = "Compress output files with gzip. By default turned off to encourage use of external compression (see Readme).
+ long,
+ default_value = "output",
+ help = "Prefix for output files, omitted flag will result in default value.
\n "
)]
- gzip: bool,
+ prefix: String,
+ #[clap(
+ long = "out",
+ help = "Path to FastQ output file for R1.
+ \n "
+ )]
+ r1_out: Option,
+ #[clap(
+ long = "out2",
+ help = "Path to FastQ output file for R2.
+ \n "
+ )]
+ r2_out: Option,
}
pub fn run(args: OptsExternal) -> Result {
@@ -65,9 +76,20 @@ pub fn run(args: OptsExternal) -> Result {
let r2 = file_io::read_fastq(&args.r2_in).records();
let ru = file_io::read_fastq(&args.ru_in).records();
- // Create write files.
- let mut write_file_r1 = file_io::output_file(&format!("{}1", &args.prefix), args.gzip);
- let mut write_file_r2 = file_io::output_file(&format!("{}2", &args.prefix), args.gzip);
+ // If output paths have been specified, check if the are ok to use or use prefix constructors.
+ let output1: PathBuf;
+ let output2: PathBuf;
+
+ if args.r1_out.is_some() && args.r2_out.is_some() {
+ output1 = check_outputpath(args.r1_out.unwrap())?;
+ output2 = check_outputpath(args.r2_out.unwrap())?;
+ } else {
+ output1 = check_outputpath(PathBuf::from(format!("{}1", &args.prefix)))?;
+ output2 = check_outputpath(PathBuf::from(format!("{}2", &args.prefix)))?;
+ }
+
+ let mut write_file_r1 = file_io::output_file(output1, args.gzip);
+ let mut write_file_r2 = file_io::output_file(output2, args.gzip);
// Record counter
let mut counter: i32 = 0;
From e1c74020eccb859a8fde8194f5d3b65194be994e Mon Sep 17 00:00:00 2001
From: Matthias Zepper
Date: Mon, 8 May 2023 15:38:01 +0200
Subject: [PATCH 20/72] Cleaner code for output checks and suffix updates.
---
src/auxiliary.rs | 23 +----------------------
src/file_io.rs | 40 +++++++++++++++++++++++++++++++++++-----
src/umi_external.rs | 24 ++++++++++++------------
3 files changed, 48 insertions(+), 39 deletions(-)
diff --git a/src/auxiliary.rs b/src/auxiliary.rs
index ef74693..81d9099 100644
--- a/src/auxiliary.rs
+++ b/src/auxiliary.rs
@@ -1,7 +1,4 @@
-use super::umi_errors::RuntimeErrors;
-use anyhow::{anyhow, Result};
-use dialoguer::Confirm;
-use std::{fs, path::PathBuf, time::Instant};
+use std::time::Instant;
pub fn timedrun(msg: &str, func: F) -> R
where
@@ -12,21 +9,3 @@ where
println!("{msg} after {:.1} seconds", start.elapsed().as_secs_f32());
measure
}
-
-pub fn check_outputpath(path: PathBuf) -> Result {
- let exists = fs::metadata(&path).is_ok();
-
- if exists {
- if Confirm::new()
- .with_prompt(format!("{} exists. Overwrite?", path.display()))
- .interact()?
- {
- println!("File will be overwritten.");
- return Ok(path);
- } else {
- return Err(anyhow!(RuntimeErrors::FileExistsError));
- }
- } else {
- return Ok(path);
- }
-}
diff --git a/src/file_io.rs b/src/file_io.rs
index 943b9f8..eb260ca 100644
--- a/src/file_io.rs
+++ b/src/file_io.rs
@@ -1,5 +1,9 @@
+use anyhow::{anyhow, Result};
+use dialoguer::Confirm;
use file_format::FileFormat;
-use std::path::PathBuf;
+use std::{fs, path::PathBuf};
+
+use super::umi_errors::RuntimeErrors;
// Defining types for simplicity
type File = std::fs::File;
@@ -65,9 +69,8 @@ pub fn read_fastq(path: &PathBuf) -> bio::io::fastq::Reader OutputFile {
- if gz | name.ends_with(".gz") {
- name.set_extension("fastq.gz");
+pub fn output_file(name: PathBuf) -> OutputFile {
+ if name.ends_with(".gz") {
OutputFile::Gzip {
read: std::fs::File::create(name.as_path())
.map(|w| flate2::write::GzEncoder::new(w, flate2::Compression::default()))
@@ -75,7 +78,6 @@ pub fn output_file(mut name: PathBuf, gz: bool) -> OutputFile {
.unwrap(),
}
} else {
- name.set_extension("fastq");
OutputFile::Fastq {
read: std::fs::File::create(name.as_path())
.map(bio::io::fastq::Writer::new)
@@ -103,3 +105,31 @@ pub fn write_to_file(
output.write(header, s.desc(), s.clone())
}
}
+
+// Checks whether an output path exists.
+pub fn check_outputpath(mut path: PathBuf, compress: &bool) -> Result {
+ // handle the compression and adapt file extension.
+ if compress | path.ends_with(".gz") {
+ path.set_extension("fastq.gz");
+ } else {
+ path.set_extension("fastq");
+ }
+
+ // check if the path already exists
+ let exists = fs::metadata(&path).is_ok();
+
+ // return the path of it is ok to write, otherwise an error.
+ if exists {
+ if Confirm::new()
+ .with_prompt(format!("{} exists. Overwrite?", path.display()))
+ .interact()?
+ {
+ println!("File will be overwritten.");
+ return Ok(path);
+ } else {
+ return Err(anyhow!(RuntimeErrors::FileExistsError));
+ }
+ } else {
+ return Ok(path);
+ }
+}
diff --git a/src/umi_external.rs b/src/umi_external.rs
index 5374d7b..e5bb0be 100644
--- a/src/umi_external.rs
+++ b/src/umi_external.rs
@@ -4,7 +4,7 @@ use itertools::izip;
use std::path::PathBuf;
use super::file_io;
-use crate::{auxiliary::check_outputpath, umi_errors::RuntimeErrors};
+use crate::{file_io::check_outputpath, umi_errors::RuntimeErrors};
#[derive(Debug, Parser)]
pub struct OptsExternal {
#[clap(
@@ -77,19 +77,19 @@ pub fn run(args: OptsExternal) -> Result {
let ru = file_io::read_fastq(&args.ru_in).records();
// If output paths have been specified, check if the are ok to use or use prefix constructors.
- let output1: PathBuf;
- let output2: PathBuf;
+ let mut output1: PathBuf = args
+ .r1_out
+ .unwrap_or(PathBuf::from(format!("{}1", &args.prefix)));
+ let mut output2: PathBuf = args
+ .r2_out
+ .unwrap_or(PathBuf::from(format!("{}2", &args.prefix)));
- if args.r1_out.is_some() && args.r2_out.is_some() {
- output1 = check_outputpath(args.r1_out.unwrap())?;
- output2 = check_outputpath(args.r2_out.unwrap())?;
- } else {
- output1 = check_outputpath(PathBuf::from(format!("{}1", &args.prefix)))?;
- output2 = check_outputpath(PathBuf::from(format!("{}2", &args.prefix)))?;
- }
+ // modify if output path according to compression settings and check if exists.
+ output1 = check_outputpath(output1, &args.gzip)?;
+ output2 = check_outputpath(output2, &args.gzip)?;
- let mut write_file_r1 = file_io::output_file(output1, args.gzip);
- let mut write_file_r2 = file_io::output_file(output2, args.gzip);
+ let mut write_file_r1 = file_io::output_file(output1);
+ let mut write_file_r2 = file_io::output_file(output2);
// Record counter
let mut counter: i32 = 0;
From 18323d34f11d18ace72f05380a9f790d3f54451e Mon Sep 17 00:00:00 2001
From: Matthias Zepper
Date: Mon, 8 May 2023 17:48:17 +0200
Subject: [PATCH 21/72] file_io::append_to_path() must work without owning the
provided PathBuf. Had to clone twice :-(
---
src/file_io.rs | 9 +++++++++
src/umi_external.rs | 11 ++---------
2 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/src/file_io.rs b/src/file_io.rs
index eb260ca..0bc3b0e 100644
--- a/src/file_io.rs
+++ b/src/file_io.rs
@@ -133,3 +133,12 @@ pub fn check_outputpath(mut path: PathBuf, compress: &bool) -> Result {
return Ok(path);
}
}
+
+// probably, there is a better way to do this than with two copies ?!?
+pub fn append_to_path(path: &PathBuf, string: &str) -> PathBuf {
+ let mut stem = path.to_owned();
+ stem.set_extension("");
+ let mut p_osstr = stem.as_os_str().to_owned();
+ p_osstr.push(string);
+ p_osstr.into()
+}
diff --git a/src/umi_external.rs b/src/umi_external.rs
index e5bb0be..5da48d8 100644
--- a/src/umi_external.rs
+++ b/src/umi_external.rs
@@ -43,13 +43,6 @@ pub struct OptsExternal {
\n"
)]
ru_in: PathBuf,
- #[clap(
- long,
- default_value = "output",
- help = "Prefix for output files, omitted flag will result in default value.
- \n "
- )]
- prefix: String,
#[clap(
long = "out",
help = "Path to FastQ output file for R1.
@@ -79,10 +72,10 @@ pub fn run(args: OptsExternal) -> Result {
// If output paths have been specified, check if the are ok to use or use prefix constructors.
let mut output1: PathBuf = args
.r1_out
- .unwrap_or(PathBuf::from(format!("{}1", &args.prefix)));
+ .unwrap_or(file_io::append_to_path(&args.r1_in, "_with_UMIs"));
let mut output2: PathBuf = args
.r2_out
- .unwrap_or(PathBuf::from(format!("{}2", &args.prefix)));
+ .unwrap_or(file_io::append_to_path(&args.r2_in, "_with_UMIs"));
// modify if output path according to compression settings and check if exists.
output1 = check_outputpath(output1, &args.gzip)?;
From 01a2db568ee72dd537f8606029f0326d3463f9a6 Mon Sep 17 00:00:00 2001
From: Matthias Zepper
Date: Mon, 8 May 2023 22:24:05 +0200
Subject: [PATCH 22/72] Finished autogeneration of the file name extension,
used a Regex to modify the input file names if no output file names were
given.
---
Cargo.lock | 13 ++++++-----
Cargo.toml | 1 +
src/file_io.rs | 51 +++++++++++++++++++++++-----------------
src/main.rs | 2 +-
src/umi_errors.rs | 8 ++++---
src/umi_external.rs | 57 ++++++++++++++++++++++++++-------------------
6 files changed, 76 insertions(+), 56 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index b2a29bf..c00d1f4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10,9 +10,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "aho-corasick"
-version = "0.7.18"
+version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
+checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04"
dependencies = [
"memchr",
]
@@ -800,9 +800,9 @@ dependencies = [
[[package]]
name = "regex"
-version = "1.6.0"
+version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b"
+checksum = "af83e617f331cc6ae2da5443c602dfa5af81e517212d9d611a5b3ba1777b5370"
dependencies = [
"aho-corasick",
"memchr",
@@ -817,9 +817,9 @@ checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
[[package]]
name = "regex-syntax"
-version = "0.6.27"
+version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244"
+checksum = "a5996294f19bd3aae0453a862ad728f60e6600695733dd5df01da90c54363a3c"
[[package]]
name = "rustc_version"
@@ -1045,6 +1045,7 @@ dependencies = [
"indicatif",
"itertools",
"lazy_static",
+ "regex",
]
[[package]]
diff --git a/Cargo.toml b/Cargo.toml
index b8f599a..dc3cc71 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,3 +15,4 @@ itertools = "0.10.5"
file-format = "0.7.0"
anyhow = "1.0.71"
dialoguer = "0.10.4"
+regex = "1.8.1"
diff --git a/src/file_io.rs b/src/file_io.rs
index 0bc3b0e..7e71f07 100644
--- a/src/file_io.rs
+++ b/src/file_io.rs
@@ -1,10 +1,10 @@
-use anyhow::{anyhow, Result};
+use super::umi_errors::RuntimeErrors;
+use anyhow::{anyhow, Context, Result};
use dialoguer::Confirm;
use file_format::FileFormat;
+use regex::Regex;
use std::{fs, path::PathBuf};
-use super::umi_errors::RuntimeErrors;
-
// Defining types for simplicity
type File = std::fs::File;
type Fastq = std::io::BufReader;
@@ -52,19 +52,26 @@ impl OutputFile {
}
}
-// Read input file to Reader. Automatically scans if gzipped from file-format crate
-pub fn read_fastq(path: &PathBuf) -> bio::io::fastq::Reader> {
- let format = FileFormat::from_file(path).unwrap();
+// Read input file to Reader. Automatically scans if input is compressed with file-format crate.
+pub fn read_fastq(path: &PathBuf) -> Result>> {
+ if fs::metadata(&path).is_err() {
+ return Err(anyhow!(RuntimeErrors::FileNotFoundError));
+ }
+
+ let format = FileFormat::from_file(path).context("Failed to determine file format")?;
if format == FileFormat::Gzip {
- bio::io::fastq::Reader::new(ReadFile::Gzip(
+ Ok(bio::io::fastq::Reader::new(ReadFile::Gzip(
std::fs::File::open(path)
.map(std::io::BufReader::new)
.map(flate2::bufread::MultiGzDecoder::new)
- .unwrap(),
- ))
+ .with_context(|| format!("Failed to open file: {:?}", path))?,
+ )))
} else {
// If not gzipped, read as plain fastq
- bio::io::fastq::Reader::new(ReadFile::Fastq(std::fs::File::open(path).unwrap()))
+ Ok(bio::io::fastq::Reader::new(ReadFile::Fastq(
+ std::fs::File::open(path)
+ .with_context(|| format!("Failed to open file: {:?}", path))?,
+ )))
}
}
@@ -108,11 +115,13 @@ pub fn write_to_file(
// Checks whether an output path exists.
pub fn check_outputpath(mut path: PathBuf, compress: &bool) -> Result {
- // handle the compression and adapt file extension.
- if compress | path.ends_with(".gz") {
- path.set_extension("fastq.gz");
- } else {
- path.set_extension("fastq");
+ // handle the compression and adapt file extension if necessary.
+ if compress & !path.ends_with(".gz") {
+ if let Some(extension) = path.extension() {
+ let mut new_extension = extension.to_str().unwrap_or("").to_owned();
+ new_extension.push_str(".gz");
+ path.set_extension(new_extension);
+ }
}
// check if the path already exists
@@ -134,11 +143,9 @@ pub fn check_outputpath(mut path: PathBuf, compress: &bool) -> Result {
}
}
-// probably, there is a better way to do this than with two copies ?!?
-pub fn append_to_path(path: &PathBuf, string: &str) -> PathBuf {
- let mut stem = path.to_owned();
- stem.set_extension("");
- let mut p_osstr = stem.as_os_str().to_owned();
- p_osstr.push(string);
- p_osstr.into()
+pub fn append_umi_to_path(path: &PathBuf) -> PathBuf {
+ let path_str = path.as_os_str().clone().to_string_lossy();
+ let re = Regex::new(r"^(?P\.*[^\.]+)\.(?P.*)$").unwrap();
+ let new_path_str = re.replace(&path_str, "${stem}_with_UMIs.${extension}");
+ PathBuf::from(new_path_str.to_string())
}
diff --git a/src/main.rs b/src/main.rs
index 3a2f539..5a00c9e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,6 @@
extern crate core;
-use anyhow::{Context, Result};
+use anyhow::{Context};
use clap::Parser;
use crate::auxiliary::timedrun;
diff --git a/src/umi_errors.rs b/src/umi_errors.rs
index 0d24054..90bfe15 100644
--- a/src/umi_errors.rs
+++ b/src/umi_errors.rs
@@ -3,7 +3,7 @@ pub enum RuntimeErrors {
ReadIDMismatchError,
FileNotFoundError,
FileExistsError,
- GeneralError,
+ //GeneralError,
}
impl std::fmt::Display for RuntimeErrors {
@@ -13,9 +13,11 @@ impl std::fmt::Display for RuntimeErrors {
f,
"IDs of UMI and read records mismatch. Please provide sorted files!"
),
- Self::FileNotFoundError => write!(f, "Cannot read from specified path."),
+ Self::FileNotFoundError => {
+ write!(f, "Specified file does not exist or is not readable!")
+ }
Self::FileExistsError => write!(f, "Output file exists, but must not be overwritten."),
- Self::GeneralError => write!(f, "Encountered an error."),
+ //Self::GeneralError => write!(f, "Encountered an error."),
}
}
}
diff --git a/src/umi_external.rs b/src/umi_external.rs
index 5da48d8..c144b5c 100644
--- a/src/umi_external.rs
+++ b/src/umi_external.rs
@@ -64,23 +64,47 @@ pub fn run(args: OptsExternal) -> Result {
edit_nr = true;
}
- // Create fastq record iterators from input files
- let r1 = file_io::read_fastq(&args.r1_in).records();
- let r2 = file_io::read_fastq(&args.r2_in).records();
- let ru = file_io::read_fastq(&args.ru_in).records();
+ // Read FastQ records from input files
+ let r1 = file_io::read_fastq(&args.r1_in)
+ .with_context(|| {
+ format!(
+ "Failed to read records from {}",
+ &args.r1_in.to_string_lossy()
+ )
+ })?
+ .records();
+ let r2 = file_io::read_fastq(&args.r2_in)
+ .with_context(|| {
+ format!(
+ "Failed to read records from {}",
+ &args.r2_in.to_string_lossy()
+ )
+ })?
+ .records();
+ let ru = file_io::read_fastq(&args.ru_in)
+ .with_context(|| {
+ format!(
+ "Failed to read records from {}",
+ &args.ru_in.to_string_lossy()
+ )
+ })?
+ .records();
// If output paths have been specified, check if the are ok to use or use prefix constructors.
let mut output1: PathBuf = args
.r1_out
- .unwrap_or(file_io::append_to_path(&args.r1_in, "_with_UMIs"));
+ .unwrap_or(file_io::append_umi_to_path(&args.r1_in));
let mut output2: PathBuf = args
.r2_out
- .unwrap_or(file_io::append_to_path(&args.r2_in, "_with_UMIs"));
+ .unwrap_or(file_io::append_umi_to_path(&args.r2_in));
// modify if output path according to compression settings and check if exists.
output1 = check_outputpath(output1, &args.gzip)?;
output2 = check_outputpath(output2, &args.gzip)?;
+ println!("Output 1 will be saved to: {}", output1.to_string_lossy());
+ println!("Output 2 will be saved to: {}", output2.to_string_lossy());
+
let mut write_file_r1 = file_io::output_file(output1);
let mut write_file_r2 = file_io::output_file(output2);
@@ -91,24 +115,9 @@ pub fn run(args: OptsExternal) -> Result {
// Iterate over records in input files
for (r1_rec_res, ru_rec_res, r2_rec_res) in izip!(r1, ru, r2) {
- let r1_rec = r1_rec_res.with_context(|| {
- format!(
- "Failed to read records from {}",
- &args.r1_in.to_string_lossy()
- )
- })?;
- let r2_rec = r2_rec_res.with_context(|| {
- format!(
- "Failed to read records from {}",
- &args.r2_in.to_string_lossy()
- )
- })?;
- let ru_rec = ru_rec_res.with_context(|| {
- format!(
- "Failed to read records from {}",
- &args.ru_in.to_string_lossy()
- )
- })?;
+ let r1_rec = r1_rec_res?;
+ let r2_rec = r2_rec_res?;
+ let ru_rec = ru_rec_res?;
// Step counter
counter += 1;
From 71ec566a5709d759fad98f6a19766320b4322eaf Mon Sep 17 00:00:00 2001
From: Matthias Zepper
Date: Tue, 9 May 2023 11:51:12 +0200
Subject: [PATCH 23/72] Implemented fixing the read numbers for both reads and
using a custom delimiter.
---
Cargo.lock | 259 +++++++++++++++++++++++---------------------
README.md | 106 +++++++++++-------
src/file_io.rs | 46 +++++---
src/main.rs | 6 +-
src/umi_external.rs | 46 ++++++--
5 files changed, 270 insertions(+), 193 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index c00d1f4..ecc4bbe 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -97,7 +97,7 @@ dependencies = [
"derive-new",
"lazy_static",
"regex",
- "strum_macros 0.24.2",
+ "strum_macros 0.24.3",
"thiserror",
]
@@ -122,18 +122,6 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
-[[package]]
-name = "bstr"
-version = "0.2.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
-dependencies = [
- "lazy_static",
- "memchr",
- "regex-automata",
- "serde",
-]
-
[[package]]
name = "bv"
version = "0.11.1"
@@ -170,9 +158,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
-version = "3.2.17"
+version = "3.2.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29e724a68d9319343bb3328c9cc2dfde263f4b3142ee1059a9980580171c954b"
+checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123"
dependencies = [
"atty",
"bitflags",
@@ -187,15 +175,15 @@ dependencies = [
[[package]]
name = "clap_derive"
-version = "3.2.17"
+version = "3.2.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13547f7012c01ab4a0e8f8967730ada8f9fdf419e8b6c792788f39cf4e46eefa"
+checksum = "ae6371b8bdc8b7d3959e9cf7b22d4435ef3e79e138688421ec654acf8c81b008"
dependencies = [
- "heck 0.4.0",
+ "heck 0.4.1",
"proc-macro-error",
"proc-macro2",
"quote",
- "syn",
+ "syn 1.0.109",
]
[[package]]
@@ -209,16 +197,15 @@ dependencies = [
[[package]]
name = "console"
-version = "0.15.1"
+version = "0.15.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89eab4d20ce20cea182308bca13088fecea9c05f6776cf287205d41a0ed3c847"
+checksum = "c3d79fbe8970a77e3e34151cc13d3b3e248aa0faaecb9f6091fa07ebefe5ad60"
dependencies = [
"encode_unicode",
+ "lazy_static",
"libc",
- "once_cell",
- "terminal_size",
"unicode-width",
- "winapi",
+ "windows-sys 0.42.0",
]
[[package]]
@@ -232,11 +219,10 @@ dependencies = [
[[package]]
name = "csv"
-version = "1.1.6"
+version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
+checksum = "0b015497079b9a9d69c02ad25de6c0a6edef051ea6360a327d0bd05802ef64ad"
dependencies = [
- "bstr",
"csv-core",
"itoa",
"ryu",
@@ -266,7 +252,7 @@ checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535"
dependencies = [
"proc-macro2",
"quote",
- "syn",
+ "syn 1.0.109",
]
[[package]]
@@ -283,9 +269,9 @@ dependencies = [
[[package]]
name = "either"
-version = "1.7.0"
+version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be"
+checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
[[package]]
name = "encode_unicode"
@@ -310,7 +296,7 @@ checksum = "84278eae0af6e34ff6c1db44c11634a694aafac559ff3080e4db4e4ac35907aa"
dependencies = [
"proc-macro2",
"quote",
- "syn",
+ "syn 1.0.109",
]
[[package]]
@@ -363,9 +349,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
[[package]]
name = "flate2"
-version = "1.0.24"
+version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6"
+checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
dependencies = [
"crc32fast",
"miniz_oxide",
@@ -382,9 +368,9 @@ dependencies = [
[[package]]
name = "getrandom"
-version = "0.2.7"
+version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6"
+checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
dependencies = [
"cfg-if",
"libc",
@@ -400,7 +386,7 @@ dependencies = [
"proc-macro-error",
"proc-macro2",
"quote",
- "syn",
+ "syn 1.0.109",
]
[[package]]
@@ -420,9 +406,9 @@ dependencies = [
[[package]]
name = "heck"
-version = "0.4.0"
+version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
@@ -441,9 +427,9 @@ checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
[[package]]
name = "indexmap"
-version = "1.9.1"
+version = "1.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
+checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
dependencies = [
"autocfg",
"hashbrown",
@@ -451,12 +437,13 @@ dependencies = [
[[package]]
name = "indicatif"
-version = "0.17.0"
+version = "0.17.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fcc42b206e70d86ec03285b123e65a5458c92027d1fb2ae3555878b8113b3ddf"
+checksum = "cef509aa9bc73864d6756f0d34d35504af3cf0844373afe9b8669a5b8005a729"
dependencies = [
"console",
"number_prefix",
+ "portable-atomic 0.3.20",
"unicode-width",
]
@@ -500,9 +487,9 @@ dependencies = [
[[package]]
name = "itoa"
-version = "0.4.8"
+version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
+checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
[[package]]
name = "lazy_static"
@@ -512,15 +499,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
-version = "0.2.143"
+version = "0.2.144"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edc207893e85c5d6be840e969b496b53d94cec8be2d501b214f50daa97fa8024"
+checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
[[package]]
name = "libm"
-version = "0.2.2"
+version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33a33a362ce288760ec6a508b94caaec573ae7d3bbbd91b87aa0bad4456839db"
+checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb"
[[package]]
name = "linux-raw-sys"
@@ -530,10 +517,11 @@ checksum = "ece97ea872ece730aed82664c424eb4c8291e1ff2480247ccf7409044bc6479f"
[[package]]
name = "matrixmultiply"
-version = "0.3.2"
+version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "add85d4dd35074e6fedc608f8c8f513a3548619a9024b751949ef0e8e45a4d84"
+checksum = "090126dc04f95dc0d1c1c91f61bdd474b3930ca064c1edc8a849da2c6cbe1e77"
dependencies = [
+ "autocfg",
"rawpointer",
]
@@ -545,9 +533,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "miniz_oxide"
-version = "0.5.3"
+version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6f5c75688da582b8ffc1f1799e9db273f32133c49e048f614d22ec3256773ccc"
+checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
dependencies = [
"adler",
]
@@ -587,14 +575,14 @@ checksum = "01fcc0b8149b4632adc89ac3b7b31a12fb6099a0317a4eb2ebff574ef7de7218"
dependencies = [
"proc-macro2",
"quote",
- "syn",
+ "syn 1.0.109",
]
[[package]]
name = "ndarray"
-version = "0.15.4"
+version = "0.15.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dec23e6762830658d2b3d385a75aa212af2f67a4586d4442907144f3bb6a1ca8"
+checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
dependencies = [
"matrixmultiply",
"num-complex",
@@ -614,9 +602,9 @@ dependencies = [
[[package]]
name = "num-complex"
-version = "0.4.2"
+version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ae39348c8bc5fbd7f40c727a9925f03517afd2ab27d46702108b6a7e5414c19"
+checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d"
dependencies = [
"num-traits",
]
@@ -660,9 +648,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "once_cell"
-version = "1.13.0"
+version = "1.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "18a6dbe30758c9f83eb00cbea4ac95966305f5a7772f3f42ebfc7fc7eddbd8e1"
+checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
[[package]]
name = "ordered-float"
@@ -675,31 +663,46 @@ dependencies = [
[[package]]
name = "os_str_bytes"
-version = "6.2.0"
+version = "6.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4"
+checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267"
[[package]]
name = "paste"
-version = "1.0.7"
+version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c520e05135d6e763148b6426a837e239041653ba7becd2e538c076c738025fc"
+checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79"
[[package]]
name = "petgraph"
-version = "0.6.2"
+version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6d5014253a1331579ce62aa67443b4a658c5e7dd03d4bc6d302b94474888143"
+checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4"
dependencies = [
"fixedbitset",
"indexmap",
]
+[[package]]
+name = "portable-atomic"
+version = "0.3.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e30165d31df606f5726b090ec7592c308a0eaf61721ff64c9a3018e344a8753e"
+dependencies = [
+ "portable-atomic 1.3.1",
+]
+
+[[package]]
+name = "portable-atomic"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bbda379e6e462c97ea6afe9f6233619b202bbc4968d7caa6917788d2070a044"
+
[[package]]
name = "ppv-lite86"
-version = "0.2.16"
+version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"
+checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "proc-macro-error"
@@ -710,7 +713,7 @@ dependencies = [
"proc-macro-error-attr",
"proc-macro2",
"quote",
- "syn",
+ "syn 1.0.109",
"version_check",
]
@@ -727,18 +730,18 @@ dependencies = [
[[package]]
name = "proc-macro2"
-version = "1.0.42"
+version = "1.0.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c278e965f1d8cf32d6e0e96de3d3e79712178ae67986d9cf9151f51e95aac89b"
+checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
-version = "1.0.20"
+version = "1.0.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3bcdf212e9776fbcb2d23ab029360416bb1706b1aea2d1a5ba002727cbcab804"
+checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
dependencies = [
"proc-macro2",
]
@@ -766,9 +769,9 @@ dependencies = [
[[package]]
name = "rand_core"
-version = "0.6.3"
+version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
]
@@ -809,12 +812,6 @@ dependencies = [
"regex-syntax",
]
-[[package]]
-name = "regex-automata"
-version = "0.1.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
-
[[package]]
name = "regex-syntax"
version = "0.7.1"
@@ -846,15 +843,15 @@ dependencies = [
[[package]]
name = "rustversion"
-version = "1.0.8"
+version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24c8ad4f0c00e1eb5bc7614d236a7f1300e3dbd76b68cac8e06fb00b015ad8d8"
+checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06"
[[package]]
name = "ryu"
-version = "1.0.10"
+version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695"
+checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
[[package]]
name = "semver"
@@ -864,22 +861,22 @@ checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac"
[[package]]
name = "serde"
-version = "1.0.140"
+version = "1.0.162"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc855a42c7967b7c369eb5860f7164ef1f6f81c20c7cc1141f2a604e18723b03"
+checksum = "71b2f6e1ab5c2b98c05f0f35b236b22e8df7ead6ffbf51d7808da7f8817e7ab6"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
-version = "1.0.140"
+version = "1.0.162"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6f2122636b9fe3b81f1cb25099fcf2d3f542cdb1d45940d56c713158884a05da"
+checksum = "a2a0814352fd64b58489904a44ea8d90cb1a91dcb6b4f5ebabc32c8318e93cb6"
dependencies = [
"proc-macro2",
"quote",
- "syn",
+ "syn 2.0.15",
]
[[package]]
@@ -935,27 +932,38 @@ dependencies = [
"proc-macro2",
"quote",
"rustversion",
- "syn",
+ "syn 1.0.109",
]
[[package]]
name = "strum_macros"
-version = "0.24.2"
+version = "0.24.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4faebde00e8ff94316c01800f9054fd2ba77d30d9e922541913051d1d978918b"
+checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
dependencies = [
- "heck 0.4.0",
+ "heck 0.4.1",
"proc-macro2",
"quote",
"rustversion",
- "syn",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "syn"
+version = "1.0.109"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
]
[[package]]
name = "syn"
-version = "1.0.98"
+version = "2.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd"
+checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822"
dependencies = [
"proc-macro2",
"quote",
@@ -977,47 +985,37 @@ dependencies = [
[[package]]
name = "termcolor"
-version = "1.1.3"
+version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755"
+checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
dependencies = [
"winapi-util",
]
-[[package]]
-name = "terminal_size"
-version = "0.1.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "633c1a546cee861a1a6d0dc69ebeca693bf4296661ba7852b9d21d159e0506df"
-dependencies = [
- "libc",
- "winapi",
-]
-
[[package]]
name = "textwrap"
-version = "0.15.0"
+version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb"
+checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
[[package]]
name = "thiserror"
-version = "1.0.31"
+version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a"
+checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
-version = "1.0.31"
+version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a"
+checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
dependencies = [
"proc-macro2",
"quote",
- "syn",
+ "syn 2.0.15",
]
[[package]]
@@ -1028,9 +1026,9 @@ checksum = "22048bc95dfb2ffd05b1ff9a756290a009224b60b2f0e7525faeee7603851e63"
[[package]]
name = "typenum"
-version = "1.15.0"
+version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987"
+checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
[[package]]
name = "umi-transfer"
@@ -1050,21 +1048,21 @@ dependencies = [
[[package]]
name = "unicode-ident"
-version = "1.0.2"
+version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7"
+checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
[[package]]
name = "unicode-segmentation"
-version = "1.9.0"
+version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99"
+checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
[[package]]
name = "unicode-width"
-version = "0.1.9"
+version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973"
+checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
[[package]]
name = "vec_map"
@@ -1118,6 +1116,21 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+[[package]]
+name = "windows-sys"
+version = "0.42.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
+dependencies = [
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
+]
+
[[package]]
name = "windows-sys"
version = "0.45.0"
diff --git a/README.md b/README.md
index 1a86226..d8232b0 100644
--- a/README.md
+++ b/README.md
@@ -1,84 +1,95 @@
# umi-transfer
-A tool for transfering Unique Molecular Identifiers (UMIs).
-The UMIs are given as a fastq file and will be transferred, explaining the name umi-transfer, to the
-header of the first two fastq files.
+A tool for transferring Unique Molecular Identifiers (UMIs) provided as a separate FastQ file to the header of records in paired FastQ files.
+
## Background
-Common demultiplexing softwares return a separate fastq file, usually named `R2`, containing UMIs.
-However, common analysis tools does not allow for this and instead requires the UMI to be contained within the header of the two reads in the pair.
-This tools performs this transform in an efficient manner and can also conveniently rename the oddly named read-`3` to read-`2` which is probably more widely recognized.
+
+To increase the accuracy of quantitative DNA sequencing experiments, Unique Molecular Identifiers may be used. UMIs are short sequences used to uniquely tag each molecule in a sample library and facilitate the accurate identification of read duplicates. They must be added during library preparation and prior to sequencing, and therefore require appropriate arrangements with your sequencing provider.
+
+Most tools capable of taking UMIs into consideration during an analysis workflow expect the respective UMI sequence to be embedded into the read's ID. Please consult your tools' manuals regarding the exact specification.
+
+For some library preparation kits and sequencing adapters, the UMI sequence needs to be read together with the index from the antisense strand and thus will be output as a separate FastQ file during demultiplexing.
+
+This tool can integrate those separate UMIs into the headers in an efficient manner and can also correct divergent read numbers back to the canonical `1` and `2`.
## Installation
### Compile from source
+
Given that you have [rust installed](https://www.rust-lang.org/tools/install) on your computer, download this repo and run
+
```shell
cargo build --release
```
-That should create an executable `target/release/umi-transfer` that can be placed anywhere in your `$PATH` or be executed directly by specifying its' path:
+
+That should create an executable `target/release/umi-transfer` that can be placed anywhere in your `$PATH` or be executed directly by specifying its path:
```shell
./target/release/umi-transfer --version
umi-transfer 0.2.0
```
+
## Usage
->### Performance Note:
+>### Performance Note
+>
>The decompression and compression used within umi-transfer is single-threaded, so to get the most reads per minute performance, see the [high performance guide](#high-performance-guide)
-The tool requires three fastq files and additionally accepts flags to adjust the behaviour as can be seen from the help message:
+The tool requires three FastQ files as input. You can manually specify the names and locations of the output files with `--out` and `--out2`; otherwise the tool will append a `with_UMI` suffix to your input file names to form the output names. Additionally, you can choose a custom UMI delimiter with `--delim` and set the flags `-f`, `-c` and `-z`. `-z` compresses the output, `-c` ensures `1` and `2` as read numbers in the output, and `-f` / `--force` overwrites existing output files without prompting the user.
```raw
-$ umi-transfer --help
-umi-transfer 0.2.0
-Judit Hohenthal, Matthias Zepper, Johannes Alneberg
-A tool for transfering Unique Molecular Identifiers (UMIs).
-
-The UMIs are given as a fastq file and will be transferred, explaining the name umi-transfer, to the
-header of the first two fastq files.
-
+$ umi-transfer external --help
+ umi-transfer-external
+Integrate UMIs from a separate FastQ file
USAGE:
- umi-transfer [OPTIONS] --r1-in --r2-in --ru-in
+ umi-transfer external [OPTIONS] --in --in2 --umi
OPTIONS:
- --edit-nr Automatically change '3' into '2' in sequence header of output file
- from R3.
-
- --gzip Compress output files with gzip. By default turned off to encourage use
- of external compression (see Readme).
-
+ -c, --correct_numbers Ensure read numbers 1 and 2 in sequence header of output files.
+
+ -d, --delim Delimiter to use when joining the UMIs to the read name. Defaults to `:`.
+
+ -f, --force Overwrite existing output files without further warnings or prompts.
+
-h, --help Print help information
- --prefix Prefix for output files, omitted flag will result in default value.
-
- [default: output]
- --r1-in [REQUIRED] Input file 1 with reads.
-
-
- --r2-in [REQUIRED] Input file 2 with reads.
-
-
- --ru-in [REQUIRED] Input file with UMI.
-
- -V, --version Print version information
+ --in [REQUIRED] Input file 1 with reads.
+
+
+ --in2 [REQUIRED] Input file 2 with reads.
+
+
+ --out Path to FastQ output file for R1.
+
+
+ --out2 Path to FastQ output file for R2.
+
+
+ -u, --umi [REQUIRED] Input file with UMI.
+
+ -z, --gzip Compress output files with gzip. By default turned off to encourage use
+ of external compression (see Readme).
```
### Example
```shell
-cargo run --release -- --prefix 'output' --edit-nr --r1-in 'R1.fastq' --r2-in 'R3.fastq' --ru-in 'R2.fastq'
+umi-transfer external -f --in 'R1.fastq' --in2 'R3.fastq' --umi 'R2.fastq'
```
### High Performance Guide
-If you have more than one thread available on your computer and would like to process the read files as quickly as possible we recommend to use unix FIFOs (First In First Out) to handle decompression and compression of the fastq files.
+
+If you have more than one thread available on your computer and would like to process the read files as quickly as possible we recommend to use unix FIFOs (First In First Out) to handle decompression and compression of the FastQ files.
This can be done as follows, given that you have your input files compressed as `fastq.gz`, first create FIFOs to represent your uncompressed input files:
```shell
-$ mkfifo read1.fastq
-$ mkfifo read2.fastq
-$ mkfifo read3.fastq
+mkfifo read1.fastq
+mkfifo read2.fastq
+mkfifo read3.fastq
```
+
and then we use `zcat` to decompress our input files and send it to the pipe that the FIFOs represent:
+
```shell
$ zcat read1.fastq.gz > read1.fastq &
[1] 233387
@@ -87,6 +98,7 @@ $ zcat read2.fastq.gz > read2.fastq &
$ zcat read3.fastq.gz > read3.fastq &
[3] 233389
```
+
Note the trailing `&` to leave these processes running in the background. We can inspect the directory with `ls`:
```shell
@@ -99,7 +111,9 @@ prw-rw-r--. 1 alneberg ngi2016004 0 Apr 13 12:46 read1.fastq
prw-rw-r--. 1 alneberg ngi2016004 0 Apr 13 12:46 read2.fastq
prw-rw-r--. 1 alneberg ngi2016004 0 Apr 13 12:46 read3.fastq
```
+
We continue to create corresponding FIFOs for the output files (note that the filenames need to match the value given to `--prefix`)
+
```shell
$ mkfifo output1.fastq
$ mkfifo output2.fastq
@@ -108,23 +122,31 @@ $ pigz -p 10 --stdout > output1.fastq.gz < output1.fastq &
$ pigz -p 10 --stdout > output2.fastq.gz < output2.fastq &
[5] 233395
```
+
The value `10` is how many threads each of the `pigz` processes is allowed to use.
The optimal value for this depends on several factors and for optimal performance you will have to do some testing on your exact hardware.
We can then run the `umi-transfer` program as follows:
+
```shell
-$ umi-transfer --prefix output --edit-nr --r1-in read1.fastq --r2-in read3.fastq --ru-in read2.fastq
+umi-transfer --prefix output --edit-nr --r1-in read1.fastq --r2-in read3.fastq --ru-in read2.fastq
```
It's good practice to remove the FIFOs after the program has finished:
+
```shell
rm read*.fastq output*.fastq
```
+
## For developers
+
To make modifications to `umi-transfer`, clone this repository, make your changes and then run the code with
+
```shell
cargo run --
```
+
or build the executable with
+
```shell
cargo build --release
```
diff --git a/src/file_io.rs b/src/file_io.rs
index 7e71f07..f8557db 100644
--- a/src/file_io.rs
+++ b/src/file_io.rs
@@ -77,14 +77,25 @@ pub fn read_fastq(path: &PathBuf) -> Result OutputFile {
- if name.ends_with(".gz") {
- OutputFile::Gzip {
- read: std::fs::File::create(name.as_path())
- .map(|w| flate2::write::GzEncoder::new(w, flate2::Compression::default()))
- .map(bio::io::fastq::Writer::new)
- .unwrap(),
+ if let Some(extension) = name.extension() {
+ if extension == "gz" {
+ // File has gz extension, which has been enforced by check_outputpath() if -z was provided.
+ OutputFile::Gzip {
+ read: std::fs::File::create(name.as_path())
+ .map(|w| flate2::write::GzEncoder::new(w, flate2::Compression::default()))
+ .map(bio::io::fastq::Writer::new)
+ .unwrap(),
+ }
+ } else {
+ // File has extension but not gz
+ OutputFile::Fastq {
+ read: std::fs::File::create(name.as_path())
+ .map(bio::io::fastq::Writer::new)
+ .unwrap(),
+ }
}
} else {
+ //file has no extension. Assume plain-text.
OutputFile::Fastq {
read: std::fs::File::create(name.as_path())
.map(bio::io::fastq::Writer::new)
@@ -98,27 +109,29 @@ pub fn write_to_file(
input: bio::io::fastq::Record,
output: OutputFile,
umi: &[u8],
- edit_nr: bool,
+ umi_sep: Option<&String>,
+ edit_nr: Option,
) -> OutputFile {
let s = input;
- if edit_nr {
- let header = &[s.id(), ":", std::str::from_utf8(&umi).unwrap()].concat();
+ let delim = umi_sep.as_ref().map(|s| s.as_str()).unwrap_or(":"); // the delimiter for the UMI
+ if let Some(number) = edit_nr {
+ let header = &[s.id(), delim, std::str::from_utf8(&umi).unwrap()].concat();
let mut string = String::from(s.desc().unwrap());
- string.replace_range(0..1, "2");
+ string.replace_range(0..1, &number.to_string());
let desc: Option<&str> = Some(&string);
output.write(header, desc, s)
} else {
- let header = &[s.id(), ":", std::str::from_utf8(&umi).unwrap()].concat();
+ let header = &[s.id(), delim, std::str::from_utf8(&umi).unwrap()].concat();
output.write(header, s.desc(), s.clone())
}
}
// Checks whether an output path exists.
-pub fn check_outputpath(mut path: PathBuf, compress: &bool) -> Result {
+pub fn check_outputpath(mut path: PathBuf, compress: &bool, force: &bool) -> Result {
// handle the compression and adapt file extension if necessary.
- if compress & !path.ends_with(".gz") {
- if let Some(extension) = path.extension() {
- let mut new_extension = extension.to_str().unwrap_or("").to_owned();
+ if let Some(extension) = path.extension().and_then(|e| e.to_str()) {
+ if !extension.ends_with("gz") & compress {
+ let mut new_extension = extension.to_owned();
new_extension.push_str(".gz");
path.set_extension(new_extension);
}
@@ -128,7 +141,8 @@ pub fn check_outputpath(mut path: PathBuf, compress: &bool) -> Result {
let exists = fs::metadata(&path).is_ok();
// return the path of it is ok to write, otherwise an error.
- if exists {
+ if exists & !force {
+ // force will disable prompt, but not the check.
if Confirm::new()
.with_prompt(format!("{} exists. Overwrite?", path.display()))
.interact()?
diff --git a/src/main.rs b/src/main.rs
index 5a00c9e..ff163bf 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,6 @@
extern crate core;
-use anyhow::{Context};
+use anyhow::Context;
use clap::Parser;
use crate::auxiliary::timedrun;
@@ -14,8 +14,8 @@ mod umi_external;
#[derive(clap::Parser)]
#[clap(
version = "0.2.0",
- author = "Written by Judit Hohenthal, Matthias Zepper, Johannes Alneberg",
- about = "A tool for transferring Unique Molecular Identifiers (UMIs). \n\nThe UMIs are given as a fastq file and will be transferred, explaining the name umi-transfer, to the header of the first two fastq files. \n\n"
+ author = "Written by Judit Hohenthal, Matthias Zepper & Johannes Alneberg",
+ about = "A tool for transferring Unique Molecular Identifiers (UMIs).\n\nMost tools capable of using UMIs to increase the accuracy of quantitative DNA sequencing experiments expect the respective UMI sequence to be embedded into the reads' IDs.\n\n You can use `umi-transfer external` to retrieve UMIs from a separate FastQ file and embed them to the IDs of your paired FastQ files.\n\n"
)]
pub struct Opt {
diff --git a/src/umi_external.rs b/src/umi_external.rs
index c144b5c..5f92c1f 100644
--- a/src/umi_external.rs
+++ b/src/umi_external.rs
@@ -8,9 +8,9 @@ use crate::{file_io::check_outputpath, umi_errors::RuntimeErrors};
#[derive(Debug, Parser)]
pub struct OptsExternal {
#[clap(
- short = 'f',
- long = "fix_numbers",
- help = "Automatically change '3' into '2' in sequence header of output file from R3.
+ short = 'c',
+ long = "correct_numbers",
+ help = "Ensure read numbers 1 and 2 in sequence header of output files.
\n "
)]
edit_nr: bool,
@@ -21,6 +21,20 @@ pub struct OptsExternal {
\n "
)]
gzip: bool,
+ #[clap(
+ short = 'f',
+ long = "force",
+ help = "Overwrite existing output files without further warnings or prompts.
+ \n "
+ )]
+ force: bool,
+ #[clap(
+ short = 'd',
+ long = "delim",
+ help = "Delimiter to use when joining the UMIs to the read name. Defaults to `:`.
+ \n "
+ )]
+ delim: Option,
#[clap(
long = "in",
required = true,
@@ -99,8 +113,8 @@ pub fn run(args: OptsExternal) -> Result {
.unwrap_or(file_io::append_umi_to_path(&args.r2_in));
// modify if output path according to compression settings and check if exists.
- output1 = check_outputpath(output1, &args.gzip)?;
- output2 = check_outputpath(output2, &args.gzip)?;
+ output1 = check_outputpath(output1, &args.gzip, &args.force)?;
+ output2 = check_outputpath(output2, &args.gzip, &args.force)?;
println!("Output 1 will be saved to: {}", output1.to_string_lossy());
println!("Output 2 will be saved to: {}", output2.to_string_lossy());
@@ -123,15 +137,29 @@ pub fn run(args: OptsExternal) -> Result {
counter += 1;
if r1_rec.id().eq(ru_rec.id()) {
- // Write to Output file (never edit nr for R1)
- write_file_r1 = file_io::write_to_file(r1_rec, write_file_r1, &ru_rec.seq(), false);
+ // Write to Output file
+ let read_nr = if edit_nr { Some(1) } else { None };
+ write_file_r1 = file_io::write_to_file(
+ r1_rec,
+ write_file_r1,
+ &ru_rec.seq(),
+ args.delim.as_ref(),
+ read_nr,
+ );
} else {
return Err(anyhow!(RuntimeErrors::ReadIDMismatchError));
}
if r2_rec.id().eq(ru_rec.id()) {
- // Write to Output file (edit nr for R2 if --edit-nr flag was included)
- write_file_r2 = file_io::write_to_file(r2_rec, write_file_r2, &ru_rec.seq(), edit_nr);
+ // Write to Output file
+ let read_nr = if edit_nr { Some(2) } else { None };
+ write_file_r2 = file_io::write_to_file(
+ r2_rec,
+ write_file_r2,
+ &ru_rec.seq(),
+ args.delim.as_ref(),
+ read_nr,
+ );
} else {
return Err(anyhow!(RuntimeErrors::ReadIDMismatchError));
}
From aa2ab199a3d10a8847e40d830d27520924eff9cf Mon Sep 17 00:00:00 2001
From: Matthias Zepper
Date: Tue, 9 May 2023 14:02:06 +0200
Subject: [PATCH 24/72] Readme updates.
---
README.md | 28 ++++++++++------------------
1 file changed, 10 insertions(+), 18 deletions(-)
diff --git a/README.md b/README.md
index d8232b0..c38d46c 100644
--- a/README.md
+++ b/README.md
@@ -104,12 +104,12 @@ Note the trailing `&` to leave these processes running in the background. We can
```shell
$ ls -lh
total 1.5K
--rw-rw----. 1 alneberg ngi2016004 4.5G Apr 13 12:18 read1.fastq.gz
--rw-rw----. 1 alneberg ngi2016004 1.1G Apr 13 12:18 read2.fastq.gz
--rw-rw----. 1 alneberg ngi2016004 4.5G Apr 13 12:18 read3.fastq.gz
-prw-rw-r--. 1 alneberg ngi2016004 0 Apr 13 12:46 read1.fastq
-prw-rw-r--. 1 alneberg ngi2016004 0 Apr 13 12:46 read2.fastq
-prw-rw-r--. 1 alneberg ngi2016004 0 Apr 13 12:46 read3.fastq
+-rw-rw----. 1 alneberg ngisweden 4.5G Apr 13 12:18 read1.fastq.gz
+-rw-rw----. 1 alneberg ngisweden 1.1G Apr 13 12:18 read2.fastq.gz
+-rw-rw----. 1 alneberg ngisweden 4.5G Apr 13 12:18 read3.fastq.gz
+prw-rw-r--. 1 alneberg ngisweden 0 Apr 13 12:46 read1.fastq
+prw-rw-r--. 1 alneberg ngisweden 0 Apr 13 12:46 read2.fastq
+prw-rw-r--. 1 alneberg ngisweden 0 Apr 13 12:46 read3.fastq
```
We continue to create corresponding FIFOs for the output files (note that the filenames need to match the value given to `--prefix`)
@@ -128,7 +128,7 @@ The optimal value for this depends on several factors and for optimal performanc
We can then run the `umi-transfer` program as follows:
```shell
-umi-transfer --prefix output --edit-nr --r1-in read1.fastq --r2-in read3.fastq --ru-in read2.fastq
+umi-transfer --in read1.fastq --in2 read3.fastq --umi read2.fastq --out output1.fastq --out2 output2.fastq
```
It's good practice to remove the FIFOs after the program has finished:
@@ -139,16 +139,8 @@ rm read*.fastq output*.fastq
## For developers
-To make modifications to `umi-transfer`, clone this repository, make your changes and then run the code with
+`umi-transfer` is a free and open-source software developed and maintained by scientists of the [Swedish National Genomics Infrastructure](https://ngisweden.scilifelab.se). We gladly welcome suggestions for improvement, bug reports and code contributions.
-```shell
-cargo run --
-```
-
-or build the executable with
-
-```shell
-cargo build --release
-```
+If you'd like to contribute code, the best way to get started is to create a personal fork of the repository. Subsequently, use a new branch to develop your feature or contribute your bug fix. Ideally, use a code linter like `rust-analyzer` in your code editor.
-Please make sure to activate code formatting by `rust-analyzer`.
+Before developing a new feature, we recommend opening an issue on the main repository to discuss your proposal upfront. Once you're ready, simply open a pull request to the `dev` branch and we'll happily review your changes. Thanks for your interest in contributing to `umi-transfer`.
From eba0854051b56ad9499d269c93d7a64df3ebe761 Mon Sep 17 00:00:00 2001
From: Matthias Zepper
Date: Thu, 25 May 2023 16:31:50 +0200
Subject: [PATCH 25/72] Fixing issues highlighted by Clippy.
---
Cargo.lock | 71 ++++++++++++++++++---------------------------
src/file_io.rs | 18 ++++++------
src/umi_external.rs | 4 +--
3 files changed, 39 insertions(+), 54 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index ecc4bbe..02571d4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -197,15 +197,15 @@ dependencies = [
[[package]]
name = "console"
-version = "0.15.5"
+version = "0.15.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3d79fbe8970a77e3e34151cc13d3b3e248aa0faaecb9f6091fa07ebefe5ad60"
+checksum = "c926e00cc70edefdc64d3a5ff31cc65bb97a3460097762bd23afb4d8145fccf8"
dependencies = [
"encode_unicode",
"lazy_static",
"libc",
"unicode-width",
- "windows-sys 0.42.0",
+ "windows-sys 0.45.0",
]
[[package]]
@@ -458,9 +458,9 @@ dependencies = [
[[package]]
name = "io-lifetimes"
-version = "1.0.10"
+version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220"
+checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
dependencies = [
"hermit-abi 0.3.1",
"libc",
@@ -505,15 +505,15 @@ checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
[[package]]
name = "libm"
-version = "0.2.6"
+version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb"
+checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4"
[[package]]
name = "linux-raw-sys"
-version = "0.3.7"
+version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ece97ea872ece730aed82664c424eb4c8291e1ff2480247ccf7409044bc6479f"
+checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
[[package]]
name = "matrixmultiply"
@@ -689,14 +689,14 @@ version = "0.3.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e30165d31df606f5726b090ec7592c308a0eaf61721ff64c9a3018e344a8753e"
dependencies = [
- "portable-atomic 1.3.1",
+ "portable-atomic 1.3.2",
]
[[package]]
name = "portable-atomic"
-version = "1.3.1"
+version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1bbda379e6e462c97ea6afe9f6233619b202bbc4968d7caa6917788d2070a044"
+checksum = "dc59d1bcc64fc5d021d67521f818db868368028108d37f0e98d74e33f68297b5"
[[package]]
name = "ppv-lite86"
@@ -730,9 +730,9 @@ dependencies = [
[[package]]
name = "proc-macro2"
-version = "1.0.56"
+version = "1.0.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435"
+checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
dependencies = [
"unicode-ident",
]
@@ -803,9 +803,9 @@ dependencies = [
[[package]]
name = "regex"
-version = "1.8.1"
+version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af83e617f331cc6ae2da5443c602dfa5af81e517212d9d611a5b3ba1777b5370"
+checksum = "d1a59b5d8e97dee33696bf13c5ba8ab85341c002922fba050069326b9c498974"
dependencies = [
"aho-corasick",
"memchr",
@@ -814,9 +814,9 @@ dependencies = [
[[package]]
name = "regex-syntax"
-version = "0.7.1"
+version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5996294f19bd3aae0453a862ad728f60e6600695733dd5df01da90c54363a3c"
+checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
[[package]]
name = "rustc_version"
@@ -861,22 +861,22 @@ checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac"
[[package]]
name = "serde"
-version = "1.0.162"
+version = "1.0.163"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "71b2f6e1ab5c2b98c05f0f35b236b22e8df7ead6ffbf51d7808da7f8817e7ab6"
+checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
-version = "1.0.162"
+version = "1.0.163"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2a0814352fd64b58489904a44ea8d90cb1a91dcb6b4f5ebabc32c8318e93cb6"
+checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.15",
+ "syn 2.0.16",
]
[[package]]
@@ -961,9 +961,9 @@ dependencies = [
[[package]]
name = "syn"
-version = "2.0.15"
+version = "2.0.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822"
+checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01"
dependencies = [
"proc-macro2",
"quote",
@@ -1015,7 +1015,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.15",
+ "syn 2.0.16",
]
[[package]]
@@ -1048,9 +1048,9 @@ dependencies = [
[[package]]
name = "unicode-ident"
-version = "1.0.8"
+version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
+checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
[[package]]
name = "unicode-segmentation"
@@ -1116,21 +1116,6 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
-[[package]]
-name = "windows-sys"
-version = "0.42.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
-dependencies = [
- "windows_aarch64_gnullvm 0.42.2",
- "windows_aarch64_msvc 0.42.2",
- "windows_i686_gnu 0.42.2",
- "windows_i686_msvc 0.42.2",
- "windows_x86_64_gnu 0.42.2",
- "windows_x86_64_gnullvm 0.42.2",
- "windows_x86_64_msvc 0.42.2",
-]
-
[[package]]
name = "windows-sys"
version = "0.45.0"
diff --git a/src/file_io.rs b/src/file_io.rs
index f8557db..bc6e74e 100644
--- a/src/file_io.rs
+++ b/src/file_io.rs
@@ -3,7 +3,7 @@ use anyhow::{anyhow, Context, Result};
use dialoguer::Confirm;
use file_format::FileFormat;
use regex::Regex;
-use std::{fs, path::PathBuf};
+use std::{fs, path::Path, path::PathBuf};
// Defining types for simplicity
type File = std::fs::File;
@@ -54,7 +54,7 @@ impl OutputFile {
// Read input file to Reader. Automatically scans if input is compressed with file-format crate.
pub fn read_fastq(path: &PathBuf) -> Result>> {
- if fs::metadata(&path).is_err() {
+ if fs::metadata(path).is_err() {
return Err(anyhow!(RuntimeErrors::FileNotFoundError));
}
@@ -115,13 +115,13 @@ pub fn write_to_file(
let s = input;
let delim = umi_sep.as_ref().map(|s| s.as_str()).unwrap_or(":"); // the delimiter for the UMI
if let Some(number) = edit_nr {
- let header = &[s.id(), delim, std::str::from_utf8(&umi).unwrap()].concat();
+ let header = &[s.id(), delim, std::str::from_utf8(umi).unwrap()].concat();
let mut string = String::from(s.desc().unwrap());
string.replace_range(0..1, &number.to_string());
let desc: Option<&str> = Some(&string);
output.write(header, desc, s)
} else {
- let header = &[s.id(), delim, std::str::from_utf8(&umi).unwrap()].concat();
+ let header = &[s.id(), delim, std::str::from_utf8(umi).unwrap()].concat();
output.write(header, s.desc(), s.clone())
}
}
@@ -148,17 +148,17 @@ pub fn check_outputpath(mut path: PathBuf, compress: &bool, force: &bool) -> Res
.interact()?
{
println!("File will be overwritten.");
- return Ok(path);
+ Ok(path)
} else {
- return Err(anyhow!(RuntimeErrors::FileExistsError));
+ Err(anyhow!(RuntimeErrors::FileExistsError))
}
} else {
- return Ok(path);
+ Ok(path)
}
}
-pub fn append_umi_to_path(path: &PathBuf) -> PathBuf {
- let path_str = path.as_os_str().clone().to_string_lossy();
+pub fn append_umi_to_path(path: &Path) -> PathBuf {
+ let path_str = path.as_os_str().to_string_lossy();
let re = Regex::new(r"^(?P\.*[^\.]+)\.(?P.*)$").unwrap();
let new_path_str = re.replace(&path_str, "${stem}_with_UMIs.${extension}");
PathBuf::from(new_path_str.to_string())
diff --git a/src/umi_external.rs b/src/umi_external.rs
index 5f92c1f..9b880ca 100644
--- a/src/umi_external.rs
+++ b/src/umi_external.rs
@@ -142,7 +142,7 @@ pub fn run(args: OptsExternal) -> Result {
write_file_r1 = file_io::write_to_file(
r1_rec,
write_file_r1,
- &ru_rec.seq(),
+ ru_rec.seq(),
args.delim.as_ref(),
read_nr,
);
@@ -156,7 +156,7 @@ pub fn run(args: OptsExternal) -> Result {
write_file_r2 = file_io::write_to_file(
r2_rec,
write_file_r2,
- &ru_rec.seq(),
+ ru_rec.seq(),
args.delim.as_ref(),
read_nr,
);
From 0d5c6d4eda49cd1519b7fbc01b61625fa863fbb8 Mon Sep 17 00:00:00 2001
From: Matthias Zepper
Date: Thu, 25 May 2023 17:52:06 +0200
Subject: [PATCH 26/72] Addressed the clippy warning 'Large size difference
between variants' for the ReadFile enum by introducing a Box around the
compressed input. Also used a BufReader for the plain text file.
---
src/file_io.rs | 39 +++++++++++++++++++--------------------
1 file changed, 19 insertions(+), 20 deletions(-)
diff --git a/src/file_io.rs b/src/file_io.rs
index bc6e74e..8029912 100644
--- a/src/file_io.rs
+++ b/src/file_io.rs
@@ -12,16 +12,16 @@ type Gzip = flate2::bufread::MultiGzDecoder;
// Enum for the two acceptable input file formats: '.fastq' and '.fastq.gz'
pub enum ReadFile {
- Fastq(File),
- Gzip(Gzip),
+ Fastq(std::io::BufReader),
+ Gzip(Box),
}
// Implement read for ReadFile enum
impl std::io::Read for ReadFile {
fn read(&mut self, into: &mut [u8]) -> std::io::Result {
match self {
- ReadFile::Fastq(file) => file.read(into),
- ReadFile::Gzip(file) => file.read(into),
+ ReadFile::Fastq(buf_reader) => buf_reader.read(into),
+ ReadFile::Gzip(buf_reader) => buf_reader.read(into),
}
}
}
@@ -54,25 +54,24 @@ impl OutputFile {
// Read input file to Reader. Automatically scans if input is compressed with file-format crate.
pub fn read_fastq(path: &PathBuf) -> Result>> {
- if fs::metadata(path).is_err() {
- return Err(anyhow!(RuntimeErrors::FileNotFoundError));
- }
+ fs::metadata(path).map_err(|_| anyhow!(RuntimeErrors::FileNotFoundError))?;
let format = FileFormat::from_file(path).context("Failed to determine file format")?;
- if format == FileFormat::Gzip {
- Ok(bio::io::fastq::Reader::new(ReadFile::Gzip(
- std::fs::File::open(path)
+ let reader: ReadFile = match format {
+ FileFormat::Gzip => {
+ let file = File::open(path)
.map(std::io::BufReader::new)
- .map(flate2::bufread::MultiGzDecoder::new)
- .with_context(|| format!("Failed to open file: {:?}", path))?,
- )))
- } else {
- // If not gzipped, read as plain fastq
- Ok(bio::io::fastq::Reader::new(ReadFile::Fastq(
- std::fs::File::open(path)
- .with_context(|| format!("Failed to open file: {:?}", path))?,
- )))
- }
+ .with_context(|| format!("Failed to open file: {:?}", path))?;
+ ReadFile::Gzip(Box::new(flate2::bufread::MultiGzDecoder::new(file)))
+ }
+ _ => {
+ let file =
+ File::open(path).with_context(|| format!("Failed to open file: {:?}", path))?;
+ ReadFile::Fastq(std::io::BufReader::new(file))
+ }
+ };
+
+ Ok(bio::io::fastq::Reader::new(reader))
}
// Create output files
From b3a889082b1bafea9051788f5a86546bc2abb7c7 Mon Sep 17 00:00:00 2001
From: Matthias Zepper
Date: Fri, 2 Jun 2023 17:20:33 +0200
Subject: [PATCH 27/72] Readme updates.
---
README.md | 72 ++++++++++++++++++++++++++--------------
docs/.DS_Store | Bin 0 -> 6148 bytes
docs/img/ngi_dark.png | Bin 0 -> 19261 bytes
docs/img/scilifelab.png | Bin 0 -> 14568 bytes
src/umi_external.rs | 4 +--
5 files changed, 50 insertions(+), 26 deletions(-)
create mode 100644 docs/.DS_Store
create mode 100644 docs/img/ngi_dark.png
create mode 100644 docs/img/scilifelab.png
diff --git a/README.md b/README.md
index c38d46c..8f580dd 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,21 @@
-# umi-transfer
+
+
+
+
-A tool for transferring Unique Molecular Identifiers (UMIs) provided as separate FastQ file to the header of records in paired FastQ files.
+
+
umi-transfer
+ A command line tool for transferring Unique Molecular Identifiers (UMIs) provided as separate FastQ file to the header of records in paired FastQ files.
+
+
+
+- [Background on Unique Molecular Identifiers](#background)
+- [Installing `umi-transfer`](#installation)
+- [Using `umi-transfer` to integrate UMIs](#usage)
+- [Improving performance with external multi-threaded compression](#high-performance-guide)
+- [Contributing bugfixes and new features](#contribution-guide-for-developers)
+
+
## Background
@@ -35,7 +50,9 @@ umi-transfer 0.2.0
>
>The decompression and compression used within umi-transfer is single-threaded, so to get the most reads per minute performance, see the [high performance guide](#high-performance-guide)
-The tool requires three FastQ files as input. You can manually specify the names and location of the output files with `--out` and `--out2` or the tool will append a `with_UMI` suffix to your input file names as output. It additionally accepts to choose a custom UMI delimiter with `--delim` and to set the flags `-f`, `-c` and `-z`. The latter specifies to compress the output and `-c` is used to ensure `1` and `2` as read numbers in the output. `-f` / `--force` will overwrite existing output files without prompting the user.
+The tool requires three FastQ files as input. You can manually specify the names and location of the output files with `--out` and `--out2`; otherwise, the tool will append a `_with_UMIs` suffix to your input file names to derive the output names. You can additionally choose a custom UMI delimiter with `--delim` and set the flags `-f`, `-c` and `-z`.
+
+`-c` is used to ensure the canonical `1` and `2` of paired files as read numbers in the output, regardless of the read numbers of the input reads. `-f` / `--force` will overwrite existing output files without prompting the user and `-z` enables the internal single-threaded compression of the output files. Alternatively, you can also specify an output file name with `.gz` suffix to obtain compressed output.
```raw
$ umi-transfer external --help
@@ -46,7 +63,7 @@ USAGE:
umi-transfer external [OPTIONS] --in --in2 --umi
OPTIONS:
- -c, --correct_numbers Ensure read numbers 1 and 2 in sequence header of output files.
+ -c, --correct_numbers Read numbers will be altered to ensure the canonical read numbers 1 and 2 in output file sequence headers.
-d, --delim Delimiter to use when joining the UMIs to the read name. Defaults to `:`.
@@ -67,20 +84,20 @@ OPTIONS:
-u, --umi [REQUIRED] Input file with UMI.
- -z, --gzip Compress output files with gzip. By default turned off to encourage use
- of external compression (see Readme).
+ -z, --gzip Compress output files. By default, turned off in favour of external compression.
```
### Example
```shell
-umi-transfer external -f --in 'R1.fastq' --in2 'R3.fastq' --umi 'R2.fastq'
+umi-transfer external -fz -d '_' --in 'R1.fastq' --in2 'R3.fastq' --umi 'R2.fastq'
```
### High Performance Guide
-If you have more than one thread available on your computer and would like to process the read files as quickly as possible we recommend to use unix FIFOs (First In First Out) to handle decompression and compression of the FastQ files.
-This can be done as follows, given that you have your input files compressed as `fastq.gz`, first create FIFOs to represent your uncompressed input files:
+The performance bottleneck of UMI integration is output file compression. [Parallel Gzip](https://github.com/madler/pigz) can be used on modern multi-processor, multi-core machines to significantly outclass the single-threaded compression that ships with `umi-transfer`.
+
+We recommend using Unix FIFOs (First In, First Out buffered pipes) to combine `umi-transfer` and `pigz`:
```shell
mkfifo read1.fastq
@@ -88,18 +105,20 @@ mkfifo read2.fastq
mkfifo read3.fastq
```
-and then we use `zcat` to decompress our input files and send it to the pipe that the FIFOs represent:
+Assuming your compressed input files are called `read1.fastq.gz`, `read2.fastq.gz` and `read3.fastq.gz`, each can be linked to its respective FIFO like so:
```shell
-$ zcat read1.fastq.gz > read1.fastq &
+$ pigz -dc read1.fastq.gz > read1.fastq &
[1] 233387
-$ zcat read2.fastq.gz > read2.fastq &
+$ pigz -dc read2.fastq.gz > read2.fastq &
[2] 233388
-$ zcat read3.fastq.gz > read3.fastq &
+$ pigz -dc read3.fastq.gz > read3.fastq &
[3] 233389
```
-Note the trailing `&` to leave these processes running in the background. We can inspect the directory with `ls`:
+Note the trailing `&` to leave these processes running in the background. Since multi-threading is hardly helpful for decompression, you could also use `zcat` or `gzip -dc` instead of `pigz -dc` here.
+
+We can inspect the directory with `ls` to list the compressed files and the created FIFOs:
```shell
$ ls -lh
@@ -112,35 +131,40 @@ prw-rw-r--. 1 alneberg ngisweden 0 Apr 13 12:46 read2.fastq
prw-rw-r--. 1 alneberg ngisweden 0 Apr 13 12:46 read3.fastq
```
-We continue to create corresponding FIFOs for the output files (note that the filenames need to match the value given to `--prefix`)
+We continue to create FIFOs for the output files:
```shell
$ mkfifo output1.fastq
$ mkfifo output2.fastq
-$ pigz -p 10 --stdout > output1.fastq.gz < output1.fastq &
+```
+
+and set up a multi-threaded `pigz` compression process for each:
+
+```shell
+$ pigz -p 10 -c > output1.fastq.gz < output1.fastq &
[4] 233394
-$ pigz -p 10 --stdout > output2.fastq.gz < output2.fastq &
+$ pigz -p 10 -c > output2.fastq.gz < output2.fastq &
[5] 233395
```
-The value `10` is how many threads each of the `pigz` processes is allowed to use.
-The optimal value for this depends on several factors and for optimal performance you will have to do some testing on your exact hardware.
-We can then run the `umi-transfer` program as follows:
+The argument `-p 10` specifies the number of threads that each `pigz` process may use. The optimal setting is hardware-specific and will require some testing.
+
+Finally, we can run `umi-transfer` using the FIFOs like so:
```shell
-umi-transfer --in read1.fastq --in2 read3.fastq --umi read2.fastq --out output1.fastq --out2 output2.fastq
+umi-transfer external --in read1.fastq --in2 read3.fastq --umi read2.fastq --out output1.fastq --out2 output2.fastq
```
It's good practice to remove the FIFOs after the program has finished:
```shell
-rm read*.fastq output*.fastq
+rm read1.fastq read2.fastq read3.fastq output1.fastq output2.fastq
```
-## For developers
+## Contribution guide for developers
`umi-transfer` is a free and open-source software developed and maintained by scientists of the [Swedish National Genomics Infrastructure](https://ngisweden.scilifelab.se). We gladly welcome suggestions for improvement, bug reports and code contributions.
If you'd like to contribute code, the best way to get started is to create a personal fork of the repository. Subsequently, use a new branch to develop your feature or contribute your bug fix. Ideally, use a code linter like `rust-analyzer` in your code editor.
-Before developing a new feature, we recommend opening an issue on the main repository to discuss your proposal upfront. Once you're ready, simply open a pull request to the `dev` branch and we'll happily review your changes. Thanks for your interest in contributing to `umi-transfer`.
+Before developing a new feature, we recommend opening an issue on the main repository to discuss your proposal upfront. Once you're ready, simply open a pull request to the `dev` branch and we'll happily review your changes. Thanks for your interest in contributing to `umi-transfer`!
diff --git a/docs/.DS_Store b/docs/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..4a1f78b6babd0011724029c032fc71193bb32ef2
GIT binary patch
literal 6148
zcmeHKOKQU~5S?jKFm#hmAkdAjkQ=DMJ%KM!l7L&VqY!A<+Vx~TORv#4nhC+g?b5Vm
z2BbHdpX>*gJVeBs*ZrDkK|~dXAd500VjgwfSnw^7rN+A1rT#cl;`f^5*=O`ZJ+)nX
z_59}PQ(bSnv_)){mhYeU%a4akUZ3_qc#qHDReftthb|pjR04Z0&VV!E3^)VMz(fq#
ztyA4iw4ZO|3^)UShXFYs0)}8T%!=jcz*JfQ;2h>6(507NRTvb7kj
z>97Zji-uWI(}}J5VB7iJyl~ka^+O3Kj*31z1J1yZftfBBa{oW#lNoLDhbg{t2AqL^
z#sJUjrrzMC>~6h!J-KTG#uJ8!#AQ(+&<8&OSjaiDDo*tW(GeF7v!YZHdr1fSk3b>B
KCud*+2EG7be<;2H
literal 0
HcmV?d00001
diff --git a/docs/img/ngi_dark.png b/docs/img/ngi_dark.png
new file mode 100644
index 0000000000000000000000000000000000000000..87ada3373a1458eb181754933820104cbc5c9c20
GIT binary patch
literal 19261
zcmaI618`BEvHYc`iJDE&u+qUhA?PM~sZ6|N|#