From 51f755f643f32551eb2921d07b2c17e61f3c1e94 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Wed, 6 Mar 2024 14:20:15 -0800 Subject: [PATCH] clean up json handling (#390) * Replaces all custom JSON parsing code with serde. * Adds a dependency on martian-filetypes for lazy JSON loading. * Inverts control flows to return values instead of mutating collections provided by arguments. --- Cargo.lock | 171 +++++++- deny.toml | 7 + enclone_args/Cargo.toml | 2 + enclone_args/src/read_json.rs | 765 +++++++++++++--------------------- enclone_stuff/src/start.rs | 27 +- io_utils/src/lib.rs | 79 ---- vdj_ann/src/annotate.rs | 13 +- 7 files changed, 464 insertions(+), 600 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cdf177945..94554fe38 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.20.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" dependencies = [ "gimli", ] @@ -64,6 +64,9 @@ name = "anyhow" version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26fa4d7e3f2eebadf743988fc8aec9fa9a9e82611acafd77c1462ed6262440a" +dependencies = [ + "backtrace", +] [[package]] name = "approx" @@ -117,9 +120,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "backtrace" -version = "0.3.68" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" dependencies = [ "addr2line", "cc", @@ -354,6 +357,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + [[package]] name = "debruijn" version = "0.3.4" @@ -380,6 +404,12 @@ dependencies = [ "uuid", ] +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" + [[package]] name = "derive-new" version = "0.5.9" @@ -462,12 +492,14 @@ dependencies = [ "hdf5", "io_utils", "itertools", + "martian-filetypes", "rand", "rayon", "regex", "serde_json", "string_utils", "vdj_ann", + "vdj_types", "vector_utils", ] @@ -672,6 +704,15 @@ dependencies = [ "instant", ] +[[package]] +name = "fern" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9f0c14694cbd524c8720dd69b0e3179344f04ebb5f90f2e4a440c6ea3b2f1ee" +dependencies = [ + "log", +] + [[package]] name = "filetime" version = "0.2.19" @@ -751,9 +792,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.27.3" +version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "graph_simple" @@ -1030,6 +1071,52 @@ dependencies = [ "libc", ] +[[package]] +name = "martian" +version = "0.26.0" +source = "git+https://github.com/martian-lang/martian-rust?branch=master#345490b52d2722fe30b042d78dcd601225aaee21" +dependencies = [ + "anyhow", + "backtrace", + "fern", + "heck", + "log", + "rustc_version", + "serde", + "serde_json", + "tempfile", + "time", +] + +[[package]] +name = "martian-derive" +version = "0.26.0" +source = "git+https://github.com/martian-lang/martian-rust?branch=master#345490b52d2722fe30b042d78dcd601225aaee21" +dependencies = [ + "martian", + "proc-macro2", + "quote", + "serde", + "syn 2.0.52", +] + +[[package]] +name = "martian-filetypes" +version = "0.27.0" +source = "git+https://github.com/martian-lang/martian-rust?branch=master#345490b52d2722fe30b042d78dcd601225aaee21" +dependencies = [ + "anyhow", + "bincode", + "csv", + "flate2", + "lz4", + "martian", + "martian-derive", + "serde", + "serde_json", + "zstd", +] + [[package]] name = "matches" version = "0.1.9" @@ -1201,11 +1288,20 @@ dependencies = [ "libm", ] +[[package]] +name = "num_threads" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + [[package]] name = "object" -version = "0.31.1" +version = "0.32.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bda667d9f2b5051b8833f59f3bf748b28ef54f850f4fcb389a252aa383866d1" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" dependencies = [ "memchr", ] @@ -1384,7 +1480,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", - "syn 2.0.50", + "syn 2.0.52", ] [[package]] @@ -1478,7 +1574,7 @@ dependencies = [ "prost 0.12.3", "prost-types 0.12.3", "regex", - "syn 2.0.50", + "syn 2.0.52", "tempfile", "which", ] @@ -1506,7 +1602,7 @@ dependencies = [ "itertools", "proc-macro2", "quote", - "syn 2.0.50", + "syn 2.0.52", ] [[package]] @@ -1662,9 +1758,18 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.21" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + +[[package]] +name = "rustc_version" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] [[package]] name = "rustix" @@ -1739,6 +1844,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "semver" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" + [[package]] name = "serde" version = "1.0.156" @@ -1903,9 +2014,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.50" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74f1bdc9872430ce9b75da68329d1c1746faf50ffac5f19e02b71e37ff881ffb" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -1980,6 +2091,36 @@ dependencies = [ "libc", ] +[[package]] +name = "time" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a79d09ac6b08c1ab3906a2f7cc2e81a0e27c7ae89c63812df75e52bef0751e07" +dependencies = [ + "deranged", + "itoa", + "libc", + "num_threads", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" + +[[package]] +name = "time-macros" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75c65469ed6b3a4809d987a41eb1dc918e9bc1d92211cbad7ae82931846f7451" +dependencies = [ + "time-core", +] + [[package]] name = "tinyvec" version = "1.6.0" diff --git a/deny.toml b/deny.toml index 98dbe080c..89e15694c 100644 --- a/deny.toml +++ b/deny.toml @@ -130,6 +130,7 @@ allow-registry = ["https://github.com/rust-lang/crates.io-index"] allow-git = [ # TODO: remove this "https://github.com/Barandis/qd", + "https://github.com/martian-lang/martian-rust", ] [sources.allow-org] @@ -139,3 +140,9 @@ github = ["10XGenomics"] gitlab = [] # 1 or more bitbucket.org organizations to allow git sources for bitbucket = [] + + +[[bans.skip]] +# many packages depend on syn 1 +name = "syn" +version = "1.0.105" diff --git a/enclone_args/Cargo.toml b/enclone_args/Cargo.toml index cc16c5d5c..d634a1ffc 100644 --- a/enclone_args/Cargo.toml +++ b/enclone_args/Cargo.toml @@ -30,12 +30,14 @@ evalexpr = ">=7, <12" expr_tools = { path = "../expr_tools" } io_utils = { path = "../io_utils" } itertools.workspace = true +martian-filetypes = { git = "https://github.com/martian-lang/martian-rust", branch = "master" } rand = "0.8" rayon = "1" regex = { version = "1", default-features = false, features = ["std", "perf"] } serde_json = "1" string_utils = { path = "../string_utils" } vdj_ann = { path = "../vdj_ann" } +vdj_types = { path = "../vdj_types" } vector_utils = { path = "../vector_utils" } [target.'cfg(not(windows))'.dependencies.hdf5] diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs index b5a376765..c08ebec52 100644 --- a/enclone_args/src/read_json.rs +++ b/enclone_args/src/read_json.rs @@ -1,163 +1,121 @@ // Copyright (c) 2021 10X Genomics, Inc. All rights reserved. -// Fields that are used in all_contig_annotations.json: -// • barcode -// • is_cell and is_asm_cell -- both are optional, but at least one needs to be present and -// true for a cell called by the VDJ pipeline -// • is_gex_cell -- optional -// • productive -- optional but should be true for contigs to be used -// • high_confidence -- optional but should be true for contigs to be used -// • contig_name -// • sequence -// • version -- optional -// • validated_umis -- optional -// • non_validated_umis -- optional -// • invalidated_umis -- optional -// • fraction_of_reads_for_this_barcode_provided_as_input_to_assembly -- optional -// • quals -// • umi_count -// • read_count -// • cdr3, unless in reannotate mode -// • cdr3_seq, unless in reannotate mode -// • cdr3_start, unless in reannotate mode -// • annotations, unless in reannotate mode. - use self::annotate::{annotate_seq, get_cdr3_using_ann, print_some_annotations}; use self::refx::RefData; use self::transcript::is_valid; use debruijn::dna_string::DnaString; use enclone_core::barcode_fate::BarcodeFate; use enclone_core::defs::{EncloneControl, OriginInfo, TigData}; -use io_utils::{open_maybe_compressed, path_exists, read_vector_entry_from_json}; +use io_utils::{open_maybe_compressed, path_exists}; +use martian_filetypes::json_file::{Json, LazyJsonReader}; +use martian_filetypes::LazyRead; use rand::Rng; use rayon::prelude::*; -use serde_json::Value; +use std::collections::HashMap; use std::fmt::Write; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::{collections::HashMap, io::BufReader}; +use std::io::BufReader; use string_utils::{stringme, strme, TextUtils}; +use vdj_ann::annotate::ContigAnnotation; use vdj_ann::{annotate, refx, transcript}; +use vdj_types::{VdjChain, VdjRegion}; use vector_utils::{bin_position, erase_if, unique_sort}; // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ -fn json_error( - json: Option<&str>, - ctl: &EncloneControl, - exiting: &AtomicBool, - msg: &str, -) -> Result<(), String> { - // The following line prevents error messages from this function from being - // printed multiple times. - let mut msgx = String::new(); - if !exiting.swap(true, Ordering::Relaxed) { - msgx = "\nThere is something wrong with the contig annotations in the cellranger output \ - file" +fn json_error(json: Option<&str>, internal_run: bool, msg: &str) -> String { + let mut msgx = + "There is something wrong with the contig annotations in the cellranger output file" .to_string(); - if json.is_some() { - write!(msgx, "\n{}.", json.unwrap()).unwrap(); - } else { - msgx += "."; - } - if ctl.gen_opt.internal_run { - writeln!(msgx, "\n\npossibly relevant internal data: {msg}").unwrap(); - } - if ctl.gen_opt.internal_run { - msgx += "\n\nATTENTION INTERNAL 10X USERS!\n\ - Quite possibly you are using data from a cellranger run carried out using a \ - version\n\ - between 3.1 and 4.0. For certain of these versions, it is necessary to add the\n\ - argument CURRENT_REF to your command line. If that doesn't work, \ - please see below.\n"; - } - msgx += "\n\nHere is what you should do:\n\n\ - 1. If you used cellranger version ≥ 4.0, the problem is very likely\n\ - that the directory outs/vdj_reference was not retained, so enclone\n\ - didn't see it, and had to guess what the reference sequence was.\n\ - Fix this and everything should be fine.\n\n\ - 2. If you used cellranger version 3.1, then you need to add a command-line\n\ - argument REF=, or if you already did that,\n\ - make sure it is the *same* as that which you gave cellranger.\n\n\ - 3. If you used cellranger version < 3.1 (the only other possibility), then\n\ - you have options:\n\ - • rerun cellranger using the current version\n\ - • or provide an argument REF= as above and RE to force reannotation\n\ - • or provide the argument BUILT_IN to use the current reference and force\n \ - reannotation (and MOUSE if you used mouse); only works with human and mouse.\n\n\ - Note that one way to get the error is to specify TCR when you meant BCR, or the\n\ - other way.\n\n\ - If you're stuck, please write to us at enclone@10xgenomics.com.\n"; - } - Err(msgx) + if let Some(json) = json { + write!(msgx, "\n{json}.").unwrap(); + } else { + msgx += "."; + } + if internal_run { + writeln!(msgx, "\n\npossibly relevant internal data: {msg}").unwrap(); + + msgx += "\n\nATTENTION INTERNAL 10X USERS!\n\ + Quite possibly you are using data from a cellranger run carried out using a \ + version\n\ + between 3.1 and 4.0. For certain of these versions, it is necessary to add the\n\ + argument CURRENT_REF to your command line. If that doesn't work, \ + please see below.\n"; + } + msgx += "\n\nHere is what you should do:\n\n\ + 1. If you used cellranger version ≥ 4.0, the problem is very likely\n\ + that the directory outs/vdj_reference was not retained, so enclone\n\ + didn't see it, and had to guess what the reference sequence was.\n\ + Fix this and everything should be fine.\n\n\ + 2. If you used cellranger version 3.1, then you need to add a command-line\n\ + argument REF=, or if you already did that,\n\ + make sure it is the *same* as that which you gave cellranger.\n\n\ + 3. If you used cellranger version < 3.1 (the only other possibility), then\n\ + you have options:\n\ + • rerun cellranger using the current version\n\ + • or provide an argument REF= as above and RE to force reannotation\n\ + • or provide the argument BUILT_IN to use the current reference and force\n \ + reannotation (and MOUSE if you used mouse); only works with human and mouse.\n\n\ + Note that one way to get the error is to specify TCR when you meant BCR, or the\n\ + other way.\n\n\ + If you're stuck, please write to us at enclone@10xgenomics.com.\n"; + + msgx } // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ -fn parse_vector_entry_from_json( - x: &[u8], +#[derive(Default)] +struct JsonParseResult { + vdj_cell: Option, + gex_cell: Option, + gex_cells_specified: bool, + tig: Option, +} + +fn process_json_annotation( + ann: ContigAnnotation, json: &str, accept_inconsistent: bool, origin_info: &OriginInfo, - li: usize, + dataset_index: usize, refdata: &RefData, to_ref_index: &HashMap, reannotate: bool, ctl: &EncloneControl, - vdj_cells: &mut Vec, - gex_cells: &mut Vec, - gex_cells_specified: &mut bool, - cr_version: &mut String, - tigs: &mut Vec, - exiting: &AtomicBool, -) -> Result<(), String> { - let v: Value = match serde_json::from_slice(x) { - Err(_) => { - return Err(format!( - "\nInternal error, failed to parse a value from a string. The string is:\n{}\n", - strme(x) - )); - } - Ok(v) => v, - }; - let barcode = v["barcode"].to_string().between("\"", "\"").to_string(); +) -> Result { + let mut res: JsonParseResult = Default::default(); // Get cell status. Sometime after CR 4.0 was released, and before 4.1 was released, // we added new fields is_asm_cell and is_gex_cell to the json file. The value of // is_asm_cell is the original determination of "cell" in the VDJ pipeline, whereas the // value of is_gex_cell is that for the GEX pipeline. - - let mut is_cell = v["is_cell"].as_bool().unwrap_or(false); - let is_asm_cell = v["is_asm_cell"].as_bool().unwrap_or(false); - if is_asm_cell { + let mut is_cell = ann.is_cell; + if ann.is_asm_cell.is_some_and(|is_asm_cell| is_asm_cell) { is_cell = true; } - let is_gex_cell = v["is_gex_cell"].as_bool(); - if is_gex_cell.is_some() { - *gex_cells_specified = true; - } - if is_gex_cell == Some(true) { - gex_cells.push(barcode.clone()); + if let Some(is_gex_cell) = ann.is_gex_cell { + res.gex_cells_specified = true; + if is_gex_cell { + res.gex_cell = Some(ann.barcode.clone()); + } } if !ctl.gen_opt.ncell && !is_cell { - return Ok(()); + return Ok(res); } if is_cell { - vdj_cells.push(barcode.clone()); + res.vdj_cell = Some(ann.barcode.clone()); } // Proceed. - if !ctl.gen_opt.reprod && !v["productive"].as_bool().unwrap_or(false) { - return Ok(()); + if !ctl.gen_opt.reprod && !ann.productive.unwrap_or(false) { + return Ok(res); } - if !ctl.gen_opt.reprod && !ctl.gen_opt.ncell && !v["high_confidence"].as_bool().unwrap_or(false) - { - return Ok(()); + if !ctl.gen_opt.reprod && !ctl.gen_opt.ncell && !ann.high_confidence { + return Ok(res); } - let tigname = v["contig_name"].to_string().between("\"", "\"").to_string(); - let full_seq = &v["sequence"].to_string().between("\"", "\"").to_string(); let mut left = false; let (mut v_ref_id, mut j_ref_id) = (1000000, 0); let mut d_ref_id: Option = None; @@ -175,126 +133,89 @@ fn parse_vector_entry_from_json( let mut cdr3_aa: String; let mut cdr3_dna: String; let mut cdr3_start: usize; - if v.get("version").is_some() { - *cr_version = v["version"].to_string().between("\"", "\"").to_string(); - } - // Read validated and non-validated UMIs. - - let mut validated_umis = Vec::::new(); - let mut validated_umis_present = false; - let val = v["validated_umis"].as_array(); - if let Some(val) = val { - validated_umis_present = true; - for vi in val { - validated_umis.push(vi.to_string().between("\"", "\"").to_string()); - } - } - let mut non_validated_umis = Vec::::new(); - let mut non_validated_umis_present = false; - let non_val = v["non_validated_umis"].as_array(); - if let Some(non_val) = non_val { - non_validated_umis_present = true; - for nv in non_val { - non_validated_umis.push(nv.to_string().between("\"", "\"").to_string()); - } - } - let mut invalidated_umis = Vec::::new(); - let mut invalidated_umis_present = false; - let inval = v["invalidated_umis"].as_array(); - if let Some(inval) = inval { - invalidated_umis_present = true; - for inv in inval { - invalidated_umis.push(inv.to_string().between("\"", "\"").to_string()); - } - } - - // Read fraction_of_reads_for_this_barcode_provided_as_input_to_assembly. - - let mut frac_reads_used = None; - let f = v["fraction_of_reads_for_this_barcode_provided_as_input_to_assembly"].as_f64(); - if let Some(f) = f { - frac_reads_used = Some((f * 1_000_000.0).round() as u32); - } + let frac_reads_used = ann + .fraction_of_reads_for_this_barcode_provided_as_input_to_assembly + .map(|f| (f * 1_000_000.0).round() as u32); // Reannotate. - if reannotate || ctl.gen_opt.reprod { - let x = DnaString::from_dna_string(full_seq); - let mut ann = Vec::<(i32, i32, i32, i32, i32)>::new(); - annotate_seq(&x, refdata, &mut ann, true, false, true); + let x = DnaString::from_dna_string(&ann.sequence); + let mut ann1 = Vec::<(i32, i32, i32, i32, i32)>::new(); + annotate_seq(&x, refdata, &mut ann1, true, false, true); // If there are multiple V segment alignments, possibly reduce to just one. let mut ann2 = Vec::<(i32, i32, i32, i32, i32)>::new(); let mut j = 0; - while j < ann.len() { - let t = ann[j].2 as usize; + while j < ann1.len() { + let t = ann1[j].2 as usize; let mut k = j + 1; - while k < ann.len() { - if refdata.segtype[ann[k].2 as usize] != refdata.segtype[t] { + while k < ann1.len() { + if refdata.segtype[ann1[k].2 as usize] != refdata.segtype[t] { break; } k += 1; } if refdata.segtype[t] == "V" && k - j > 1 { let mut entries = 1; - if j < ann.len() - 1 - && ann[j + 1].2 as usize == t - && ((ann[j].0 + ann[j].1 == ann[j + 1].0 && ann[j].3 + ann[j].1 < ann[j + 1].3) - || (ann[j].0 + ann[j].1 < ann[j + 1].0 - && ann[j].3 + ann[j].1 == ann[j + 1].3)) + if j < ann1.len() - 1 + && ann1[j + 1].2 as usize == t + && ((ann1[j].0 + ann1[j].1 == ann1[j + 1].0 + && ann1[j].3 + ann1[j].1 < ann1[j + 1].3) + || (ann1[j].0 + ann1[j].1 < ann1[j + 1].0 + && ann1[j].3 + ann1[j].1 == ann1[j + 1].3)) { entries = 2; } - ann2.extend(&ann[j..j + entries]); + ann2.extend(&ann1[j..j + entries]); } else { - ann2.extend(&ann[j..k]); + ann2.extend(&ann1[j..k]); } j = k; } - ann = ann2; + ann1 = ann2; // Proceed. - if ctl.gen_opt.trace_barcode == *barcode { + if ctl.gen_opt.trace_barcode == ann.barcode { let mut log = Vec::::new(); - print_some_annotations(refdata, &ann, &mut log, false); + print_some_annotations(refdata, &ann1, &mut log, false); print!("\n{}", strme(&log)); } let mut log = Vec::::new(); - if ctl.gen_opt.trace_barcode == *barcode { + if ctl.gen_opt.trace_barcode == ann.barcode { if !is_valid( &x, refdata, - &ann, + &ann1, true, &mut log, Some(ctl.gen_opt.gamma_delta), ) { print!("{}", strme(&log)); println!("invalid"); - return Ok(()); + return Ok(res); } } else if !is_valid( &x, refdata, - &ann, + &ann1, false, &mut log, Some(ctl.gen_opt.gamma_delta), ) { - return Ok(()); + return Ok(res); } let mut cdr3 = Vec::<(usize, Vec, usize, usize)>::new(); - get_cdr3_using_ann(&x, refdata, &ann, &mut cdr3); + get_cdr3_using_ann(&x, refdata, &ann1, &mut cdr3); cdr3_aa = stringme(&cdr3[0].1); cdr3_start = cdr3[0].0; cdr3_dna = x .slice(cdr3_start, cdr3_start + 3 * cdr3_aa.len()) .to_string(); let mut seen_j = false; - for anni in ann { + for anni in ann1 { let t = anni.2 as usize; if refdata.is_u(t) { u_ref_id = Some(t); @@ -313,7 +234,7 @@ fn parse_vector_entry_from_json( if tig_start > cdr3_start as isize { panic!( "Something is wrong with the CDR3 start for this contig:\n\n{}.", - &full_seq + ann.sequence ); } cdr3_start -= tig_start as usize; @@ -340,39 +261,32 @@ fn parse_vector_entry_from_json( } else { // Use annotations from json file. - cdr3_aa = v["cdr3"].to_string().between("\"", "\"").to_string(); - cdr3_dna = v["cdr3_seq"].to_string().between("\"", "\"").to_string(); - cdr3_start = v["cdr3_start"].as_u64().unwrap() as usize; - let ann = v["annotations"].as_array(); - if ann.is_none() { + cdr3_aa = ann.cdr3.unwrap(); + cdr3_dna = ann.cdr3_seq.unwrap(); + cdr3_start = ann.cdr3_start.unwrap(); + let annotations = ann.annotations; + if annotations.is_empty() { return Err(format!( "\nThe file\n{json}\ndoes not contain annotations. To use enclone with it, \ please specify the argument BUILT_IN\nto force use of the internal \ reference and recompute annotations.\n" )); } - let ann = ann.unwrap(); let mut cigarv = String::new(); // cigar for V segment - for a in ann { - let region_type = &a["feature"]["region_type"]; - let feature_id = a["feature"]["feature_id"].as_u64().unwrap() as usize; + for a in annotations { + let region_type = a.feature.region_type; + let feature_id = a.feature.feature_id; if !to_ref_index.contains_key(&feature_id) { continue; } let feature_idx = to_ref_index[&feature_id]; - let ref_start = a["annotation_match_start"].as_u64().unwrap() as usize; - if region_type == "L-REGION+V-REGION" { - v_stop = a["contig_match_end"].as_i64().unwrap() as usize; - v_stop_ref = a["annotation_match_end"].as_i64().unwrap() as usize; + let ref_start = a.annotation_match_start; + if region_type == VdjRegion::V { + v_stop = a.contig_match_end; + v_stop_ref = a.annotation_match_end; } - let gene_name = a["feature"]["gene_name"] - .to_string() - .between("\"", "\"") - .to_string(); - if refdata.name[feature_idx] != gene_name - && !accept_inconsistent - && !exiting.swap(true, Ordering::Relaxed) - { + let gene_name = a.feature.gene_name; + if refdata.name[feature_idx] != gene_name && !accept_inconsistent { return Err(format!( "\nThere is an inconsistency between the reference \ file used to create the Cell Ranger output files in\n{}\nand the \ @@ -394,48 +308,44 @@ fn parse_vector_entry_from_json( refdata.name[feature_idx] )); } - if region_type == "L-REGION+V-REGION" && ref_start == 0 { - let chain = a["feature"]["chain"] - .to_string() - .between("\"", "\"") - .to_string(); - // if !chain.starts_with("IG") { continue; } // ******************* - tig_start = a["contig_match_start"].as_i64().unwrap() as isize; + if region_type == VdjRegion::V && ref_start == 0 { + let chain = a.feature.chain; + chain_type = chain.to_string(); + tig_start = a.contig_match_start as isize; cdr3_start -= tig_start as usize; - chain_type = chain.clone(); - if chain == *"IGH" - || chain == *"TRB" - || (chain == *"TRD" && ctl.gen_opt.gamma_delta) + if chain == VdjChain::IGH + || chain == VdjChain::TRB + || (chain == VdjChain::TRD && ctl.gen_opt.gamma_delta) { left = true; } v_ref_id = feature_idx; - cigarv = a["cigar"].to_string().between("\"", "\"").to_string(); + cigarv = a.cigar; } else { // also check for IG chain????????????????????????????????????????? - let ref_stop = a["annotation_match_end"].as_u64().unwrap() as usize; - let ref_len = a["annotation_length"].as_u64().unwrap() as usize; - if region_type == "J-REGION" && ref_stop == ref_len { - tig_stop = a["contig_match_end"].as_i64().unwrap() as isize; + let ref_stop = a.annotation_match_end; + let ref_len = a.annotation_length; + if region_type == VdjRegion::J && ref_stop == ref_len { + tig_stop = a.contig_match_end as isize; j_ref_id = feature_idx; - j_start = a["contig_match_start"].as_i64().unwrap() as usize; - j_start_ref = a["annotation_match_start"].as_i64().unwrap() as usize; + j_start = a.contig_match_start; + j_start_ref = a.annotation_match_start; } - if region_type == "5'UTR" { + if region_type == VdjRegion::UTR { u_ref_id = Some(feature_idx); } - if region_type == "D-REGION" { - d_start = Some(a["contig_match_start"].as_i64().unwrap() as usize); + if region_type == VdjRegion::D { + d_start = Some(a.contig_match_start); d_ref_id = Some(feature_idx); } - if region_type == "C-REGION" { + if region_type == VdjRegion::C { c_ref_id = Some(feature_idx); - c_start = Some(a["contig_match_start"].as_i64().unwrap() as usize); + c_start = Some(a.contig_match_start); } } } if v_ref_id == 1000000 { - return Ok(()); + return Ok(res); } // Compute annv from cigarv. We don't compute the mismatch entry. @@ -483,7 +393,7 @@ fn parse_vector_entry_from_json( let rt = &refdata.refs[v_ref_id]; if annv.len() == 2 && annv[0].1 as usize > rt.len() { let msg = format!("annv[0].1 = {}, rt.len() = {}", annv[0].1, rt.len()); - json_error(None, ctl, exiting, &msg)?; + return Err(json_error(None, ctl.gen_opt.internal_run, &msg)); } // Check to see if the CDR3 sequence has changed. This could happen if the cellranger @@ -492,17 +402,17 @@ fn parse_vector_entry_from_json( // inconsistencies, leading to an assert somewhere downstream. let mut cdr3 = Vec::<(usize, Vec, usize, usize)>::new(); - let x = DnaString::from_dna_string(full_seq); + let x = DnaString::from_dna_string(&ann.sequence); get_cdr3_using_ann(&x, refdata, &annv, &mut cdr3); if cdr3.is_empty() { - return Ok(()); + return Ok(res); } let cdr3_aa_alt = stringme(&cdr3[0].1); if cdr3_aa != cdr3_aa_alt { // This is particularly pathological and rare: if tig_start as usize > cdr3[0].0 { - return Ok(()); + return Ok(res); } // Define start. @@ -525,63 +435,56 @@ fn parse_vector_entry_from_json( // It is not known if these correspond to bugs in cellranger that were subsequently fixed. if cdr3_aa.contains('*') { - return Ok(()); + return Ok(res); } if cdr3_start + 3 * cdr3_aa.len() > tig_stop as usize - tig_start as usize { - return Ok(()); + return Ok(res); } // Keep going. if tig_start < 0 || tig_stop < 0 { let msg = format!("tig_start = {tig_start}, tig_stop = {tig_stop}"); - json_error(Some(json), ctl, exiting, &msg)?; + return Err(json_error(Some(json), ctl.gen_opt.internal_run, &msg)); } let (tig_start, tig_stop) = (tig_start as usize, tig_stop as usize); - let quals0 = v["quals"].to_string(); - let quals0 = quals0.after("\"").as_bytes(); - let mut quals = Vec::::new(); - let mut slashed = false; - for &qual in quals0.iter().take(quals0.len() - 1) { - if !slashed && qual == b'\\' - /* && ( i == 0 || quals0[i-1] != b'\\' ) */ - { - slashed = true; - continue; - } - slashed = false; - quals.push(qual); - } - assert_eq!(full_seq.len(), quals.len()); - let seq = &full_seq[tig_start..tig_stop].to_string(); - for qual in quals.iter_mut() { + let mut quals = ann.quals.as_bytes().to_vec(); + assert_eq!(ann.sequence.len(), ann.quals.as_bytes().len()); + let seq = &ann.sequence[tig_start..tig_stop].to_string(); + for qual in &mut quals { *qual -= 33_u8; } let full_quals = quals; let quals = full_quals[tig_start..tig_stop].to_vec(); - let umi_count = v["umi_count"].as_i64().unwrap() as usize; - let read_count = v["read_count"].as_i64().unwrap() as usize; - let origin = origin_info.origin_for_bc[li].get(&barcode).or_else(|| { - // the way we use s1 here is flaky - if !origin_info.origin_id[li].is_empty() - && (origin_info.origin_id[li] != *"s1" || origin_info.origin_for_bc[li].is_empty()) - { - Some(&origin_info.origin_id[li]) - } else { - None - } - }); - let donor = origin_info.donor_for_bc[li].get(&barcode).or_else(|| { - // the way we use d1 here is flaky - if !origin_info.origin_id[li].is_empty() - && (origin_info.donor_id[li] != *"d1" || origin_info.donor_for_bc[li].is_empty()) - { - Some(&origin_info.donor_id[li]) - } else { - None - } - }); - let tag = origin_info.tag[li].get(&barcode); + let umi_count = ann.umi_count; + let read_count = ann.read_count; + let origin = origin_info.origin_for_bc[dataset_index] + .get(&ann.barcode) + .or_else(|| { + // the way we use s1 here is flaky + if !origin_info.origin_id[dataset_index].is_empty() + && (origin_info.origin_id[dataset_index] != *"s1" + || origin_info.origin_for_bc[dataset_index].is_empty()) + { + Some(&origin_info.origin_id[dataset_index]) + } else { + None + } + }); + let donor = origin_info.donor_for_bc[dataset_index] + .get(&ann.barcode) + .or_else(|| { + // the way we use d1 here is flaky + if !origin_info.origin_id[dataset_index].is_empty() + && (origin_info.donor_id[dataset_index] != *"d1" + || origin_info.donor_for_bc[dataset_index].is_empty()) + { + Some(&origin_info.donor_id[dataset_index]) + } else { + None + } + }); + let tag = origin_info.tag[dataset_index].get(&ann.barcode); let mut origin_index = None; let mut donor_index = None; let mut tag_index = None; @@ -594,19 +497,8 @@ fn parse_vector_entry_from_json( if let Some(tag) = tag { tag_index = Some(bin_position(&origin_info.tag_list, tag) as usize); } - let mut valu = None; - if validated_umis_present { - valu = Some(validated_umis); - } - let mut non_valu = None; - if non_validated_umis_present { - non_valu = Some(non_validated_umis); - } - let mut invalu = None; - if invalidated_umis_present { - invalu = Some(invalidated_umis); - } - tigs.push(TigData { + + res.tig = Some(TigData { cdr3_dna, len: seq.len(), v_start: tig_start, @@ -617,7 +509,7 @@ fn parse_vector_entry_from_json( j_start_ref, j_stop: tig_stop, c_start, - full_seq: full_seq.as_bytes().to_vec(), + full_seq: ann.sequence.as_bytes().to_vec(), v_ref_id, d_ref_id, j_ref_id, @@ -632,10 +524,10 @@ fn parse_vector_entry_from_json( cdr3_start, quals, full_quals, - barcode, - tigname, + barcode: ann.barcode, + tigname: ann.contig_name, left, - dataset_index: li, + dataset_index, origin_index, donor_index, tag_index, @@ -643,55 +535,41 @@ fn parse_vector_entry_from_json( read_count, chain_type, annv, - validated_umis: valu, - non_validated_umis: non_valu, - invalidated_umis: invalu, + validated_umis: ann.validated_umis, + non_validated_umis: ann.non_validated_umis, + invalidated_umis: ann.invalidated_umis, frac_reads_used, }); - Ok(()) + Ok(res) } // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ // Parse the JSON annotations file. // -// In the future could be converted to LazyWrite: -// https://martian-lang.github.io/martian-rust/doc/martian_filetypes/json_file/ -// index.html#lazy-readwrite-example. -// // Tracking contigs using bc_cdr3_aa; could improve later. // // This section requires 3.1. If you want to avoid that, do something to make tig_start // and tig_stop always nonnegative. Or use the RE option. -// -// Computational performance. It would appear that nearly all the time here is spent in -// two lines: -// -// read_vector_entry_from_json(&mut f) { -// let v: Value = serde_json::from_str(strme(&x)).unwrap(); -// (Should retest.) -// -// and simply reading the file lines is several times faster. So the way we parse the -// files is suboptimal. If we want to make this faster, one option would be to speed up -// this code. Another would be to write out a binary version of the JSON file that contains -// only the information that we need. -pub fn read_json( +#[derive(Default)] +struct ReadJsonResult { + vdj_cells: Vec, + gex_cells: Vec, + gex_cells_specified: bool, + tig_bc: Vec>, +} + +fn read_json( accept_inconsistent: bool, origin_info: &OriginInfo, - li: usize, + dataset_index: usize, json: &String, refdata: &RefData, to_ref_index: &HashMap, reannotate: bool, - cr_version: &mut String, ctl: &EncloneControl, - vdj_cells: &mut Vec, - gex_cells: &mut Vec, - gex_cells_specified: &mut bool, -) -> Result>, String> { - *gex_cells_specified = false; - let mut tigs = Vec::::new(); +) -> Result { let mut jsonx = json.clone(); if !path_exists(json) { jsonx = format!("{json}.lz4"); @@ -715,83 +593,42 @@ pub fn read_json( input files to enclone, including the PRE argument.\n" )); } - let mut f = BufReader::new(open_maybe_compressed(&jsonx)); - // ◼ This loop could be speeded up, see comments above. - let mut xs = Vec::>::new(); - loop { - let x = read_vector_entry_from_json(&mut f); - if x.is_err() { - eprintln!("\nProblem reading {jsonx}.\n"); - return Err(x.err().unwrap()); - } - match x.unwrap() { - None => break, - Some(x) => { - xs.push(x); - } - } - } - let mut results = Vec::<( - usize, - Vec, - Vec, - bool, - String, - Vec, - String, - )>::new(); - for i in 0..xs.len() { - results.push(( - i, - Vec::::new(), - Vec::::new(), - false, - String::new(), - Vec::::new(), - String::new(), - )); - } - let exiting = AtomicBool::new(false); - results.par_iter_mut().for_each(|res| { - let i = res.0; - let resx = parse_vector_entry_from_json( - &xs[i], + + let mut tigs = Vec::new(); + let mut vdj_cells = Vec::new(); + let mut gex_cells = Vec::new(); + let mut gex_cells_specified = false; + + let reader: LazyJsonReader = + LazyJsonReader::with_reader(BufReader::new(open_maybe_compressed(&jsonx))) + .map_err(|err| format!("{err:#?}"))?; + + for entry in reader.into_iter() { + let result = process_json_annotation( + entry.map_err(|err| err.to_string())?, json, accept_inconsistent, origin_info, - li, + dataset_index, refdata, to_ref_index, reannotate, ctl, - &mut res.1, - &mut res.2, - &mut res.3, - &mut res.4, - &mut res.5, - &exiting, - ); - if let Err(resx) = resx { - res.6 = resx; + )?; + if let Some(tig) = result.tig { + tigs.push(tig); } - }); - for result in &results { - if !result.6.is_empty() { - return Err(result.6.clone()); + if let Some(c) = result.vdj_cell { + vdj_cells.push(c); } - } - for result in results.iter_mut().take(xs.len()) { - vdj_cells.append(&mut result.1); - gex_cells.append(&mut result.2); - if result.3 { - *gex_cells_specified = true; + if let Some(c) = result.gex_cell { + gex_cells.push(c); } - if !result.4.is_empty() { - *cr_version = result.4.clone(); + if result.gex_cells_specified { + gex_cells_specified = true; } - tigs.append(&mut result.5); } - unique_sort(gex_cells); + unique_sort(&mut gex_cells); let mut tig_bc = Vec::>::new(); let mut r = 0; while r < tigs.len() { @@ -812,7 +649,7 @@ pub fn read_json( } r = s; } - unique_sort(vdj_cells); + unique_sort(&mut vdj_cells); // Subsample. @@ -826,122 +663,89 @@ pub fn read_json( if y < 1.0 - ctl.gen_opt.subsample { *del = true; let bc = &bc[0].barcode; - let p = bin_position(vdj_cells, bc); + let p = bin_position(&vdj_cells, bc); if p >= 0 { to_delete2[p as usize] = true; } - let p = bin_position(gex_cells, bc); + let p = bin_position(&gex_cells, bc); if p >= 0 { to_delete3[p as usize] = true; } } } erase_if(&mut tig_bc, &to_delete1); - erase_if(vdj_cells, &to_delete2); - erase_if(gex_cells, &to_delete3); + erase_if(&mut vdj_cells, &to_delete2); + erase_if(&mut gex_cells, &to_delete3); } // Done. - Ok(tig_bc) + Ok(ReadJsonResult { + vdj_cells, + gex_cells, + gex_cells_specified, + tig_bc, + }) } // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ -// Parse the JSON annotations file(s). +pub struct Annotations { + pub vdj_cells: Vec>, + pub gex_cells: Vec>, + pub gex_cells_specified: Vec, + pub tig_bc: Vec>, + pub fate: Vec>, +} pub fn parse_json_annotations_files( ctl: &EncloneControl, - tig_bc: &mut Vec>, refdata: &RefData, to_ref_index: &HashMap, - vdj_cells: &mut Vec>, - gex_cells: &mut Vec>, - gex_cells_specified: &mut Vec, - fate: &mut [HashMap], -) -> Result<(), String> { - // (origin index, contig name, V..J length): (?) - let mut results = Vec::<( - usize, - Vec<(String, usize)>, - Vec>, - Vec>, // logs - String, - Vec, - Vec, - bool, - String, - )>::new(); - for i in 0..ctl.origin_info.dataset_path.len() { - results.push(( - i, - Vec::<(String, usize)>::new(), - Vec::>::new(), - Vec::>::new(), - String::new(), - Vec::::new(), - Vec::::new(), - false, - String::new(), - )); - } +) -> Result { // Note: only tracking truncated seq and quals initially let ann = if !ctl.gen_opt.cellranger { "all_contig_annotations.json" } else { "contig_annotations.json" }; - results.par_iter_mut().for_each(|res| { - let li = res.0; - let json = format!("{}/{ann}", ctl.origin_info.dataset_path[li]); - let json_lz4 = format!("{}/{ann}.lz4", ctl.origin_info.dataset_path[li]); - if !path_exists(&json) && !path_exists(&json_lz4) { - res.8 = format!("\ncan't find {json} or {json_lz4}\n"); - return; - } - let resx = read_json( - ctl.gen_opt.accept_inconsistent, - &ctl.origin_info, - li, - &json, - refdata, - to_ref_index, - ctl.gen_opt.reannotate, - &mut res.4, - ctl, - &mut res.5, - &mut res.6, - &mut res.7, - ); - if let Ok(resx) = resx { - let tig_bc: Vec> = resx; - res.5.sort(); - res.2 = tig_bc; - } else { - res.8 = resx.err().unwrap(); - } - }); - for result in &results { - if !result.8.is_empty() { - return Err(result.8.clone()); - } - } - let mut versions = Vec::::new(); - for i in 0..results.len() { - tig_bc.append(&mut results[i].2.clone()); - // ctl.gen_opt.cr_version = results[i].4.clone(); - if results[i].4.is_empty() { - versions.push("≤3.1".to_string()); - } else { - versions.push(results[i].4.clone()); - } - vdj_cells.push(results[i].5.clone()); - gex_cells.push(results[i].6.clone()); - gex_cells_specified.push(results[i].7); + let results = ctl + .origin_info + .dataset_path + .par_iter() + .enumerate() + .map(|(li, dataset_path)| { + let json = format!("{dataset_path}/{ann}"); + let json_lz4 = format!("{dataset_path}/{ann}.lz4"); + if !path_exists(&json) && !path_exists(&json_lz4) { + return Err(format!("\ncan't find {json} or {json_lz4}\n")); + } + read_json( + ctl.gen_opt.accept_inconsistent, + &ctl.origin_info, + li, + &json, + refdata, + to_ref_index, + ctl.gen_opt.reannotate, + ctl, + ) + .map(|r| (li, r)) + }) + .collect::, String>>()?; + + let mut ann = Annotations { + tig_bc: Default::default(), + vdj_cells: Default::default(), + gex_cells: Default::default(), + gex_cells_specified: Default::default(), + fate: vec![HashMap::::new(); ctl.origin_info.n()], + }; - let cells = &results[i].5; + for (i, result) in results { + let cells = &result.vdj_cells; let mut found = vec![false; cells.len()]; - let tigs = &results[i].2; + let tigs = &result.tig_bc; for tig in tigs { let p = bin_position(cells, &tig[0].barcode); if p >= 0 { @@ -950,25 +754,14 @@ pub fn parse_json_annotations_files( } for j in 0..found.len() { if !found[j] { - fate[i].insert(cells[j].clone(), BarcodeFate::NonProductive); + ann.fate[i].insert(cells[j].clone(), BarcodeFate::NonProductive); } } + + ann.tig_bc.extend(result.tig_bc.into_iter()); + ann.vdj_cells.push(result.vdj_cells); + ann.gex_cells.push(result.gex_cells); + ann.gex_cells_specified.push(result.gex_cells_specified); } - /* - if !ctl.gen_opt.internal_run { - unique_sort(&mut versions); - if versions.len() > 1 - && versions != vec!["4.0".to_string(), "4009.52.0-82-g2244c685a".to_string()] - { - let args: Vec = env::args().collect(); - return Err(format!( - "\nYou're using output from multiple Cell Ranger versions = {},\n\ - which is not allowed. Your command was:\n{}\n", - versions.iter().format(", "), - args.iter().format(","), - )); - } - } - */ - Ok(()) + Ok(ann) } diff --git a/enclone_stuff/src/start.rs b/enclone_stuff/src/start.rs index b537b6c41..52ae52f93 100644 --- a/enclone_stuff/src/start.rs +++ b/enclone_stuff/src/start.rs @@ -18,9 +18,9 @@ use enclone::join::join_exacts; use enclone::misc1::{cross_filter, lookup_heavy_chain_reuse}; use enclone::misc2::{check_for_barcode_reuse, find_exact_subclonotypes, search_for_shm_indels}; use enclone::misc3::sort_tig_bc; -use enclone_args::read_json::parse_json_annotations_files; +use enclone_args::read_json::{parse_json_annotations_files, Annotations}; use enclone_core::barcode_fate::BarcodeFate; -use enclone_core::defs::{AlleleData, CloneInfo, TigData}; +use enclone_core::defs::{AlleleData, CloneInfo}; use enclone_core::enclone_structs::{EncloneExacts, EncloneIntermediates, EncloneSetup}; use enclone_core::hcomp::heavy_complexity; use enclone_print::define_mat::{define_mat, setup_define_mat}; @@ -121,21 +121,14 @@ pub fn main_enclone_start(setup: EncloneSetup) -> Result>::new(); - let mut vdj_cells = Vec::>::new(); - let mut gex_cells = Vec::>::new(); - let mut gex_cells_specified = Vec::::new(); - let mut fate = vec![HashMap::::new(); ctl.origin_info.n()]; - parse_json_annotations_files( - ctl, - &mut tig_bc, - refdata, - to_ref_index, - &mut vdj_cells, - &mut gex_cells, - &mut gex_cells_specified, - &mut fate, - )?; + + let Annotations { + mut tig_bc, + gex_cells, + gex_cells_specified, + vdj_cells, + mut fate, + } = parse_json_annotations_files(ctl, refdata, to_ref_index)?; ctl.perf_stats(&tparse, "loading from json"); // Populate features. diff --git a/io_utils/src/lib.rs b/io_utils/src/lib.rs index f15a6b669..3b393a10a 100644 --- a/io_utils/src/lib.rs +++ b/io_utils/src/lib.rs @@ -251,85 +251,6 @@ pub fn get_metric_value(f: impl AsRef, metric: &str) -> String { String::default() } -// ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ -// CODE FOR STREAMING A JSON VECTOR -// ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ - -// Read an entry from a json file that represents a vector. This is not completely -// general as it depends on assumptions about the formatting of the file. -// -// To compare to and probably replace with: -// https://martian-lang.github.io/martian-rust/doc/martian_filetypes/json_file/ -// index.html#lazy-readwrite-example - -pub fn read_vector_entry_from_json(json: &mut R) -> Result>, String> { - let mut line = String::new(); - if json.read_line(&mut line).is_err() || line == *"" || line == *"[]" { - return Ok(None); - } - if line == *"[\n" { - line.clear(); - if json.read_line(&mut line).is_err() { - return Err( - "\nProblem reading json file, probably due to a defect in it.\n".to_string(), - ); - } - } - let mut entry = Vec::::new(); - let (mut curlies, mut bracks, mut quotes) = (0_isize, 0_isize, 0_isize); - let mut s = line.as_bytes(); - loop { - if (s == b"]" || s == b"]\n") && curlies == 0 && bracks == 0 && quotes % 2 == 0 { - if !entry.is_empty() { - return Ok(Some(entry)); - } else { - return Ok(None); - } - } - let mut cpos = -1_isize; - if s.is_empty() { - return Err("\nError reading json file. It is possible that the file \ - was truncated.\n" - .to_string()); - } - for i in (0..s.len() - 1).rev() { - if s[i] == b',' { - cpos = i as isize; - break; - } - if s[i] != b' ' { - break; - } - } - let mut escaped = false; - for i in 0..s.len() { - if !escaped && s[i] == b'"' { - quotes += 1; - } else if !escaped && quotes % 2 == 0 { - match s[i] { - b'{' => curlies += 1, - b'}' => curlies -= 1, - b'[' => bracks += 1, - b']' => bracks -= 1, - b',' => { - if i as isize == cpos && curlies == 0 && bracks == 0 && quotes % 2 == 0 { - return Ok(Some(entry)); - } - } - _ => {} - }; - } - escaped = s[i] == b'\\' && !escaped; - entry.push(s[i]); - } - line.clear(); - if json.read_line(&mut line).is_err() { - return Err("\nSomething appears to be defective in a json file.\n".to_string()); - } - s = line.as_bytes(); - } -} - // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ // READ FILE TO STRING AND PRINT FILE NAME IF IT DOESN'T EXIST // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ diff --git a/vdj_ann/src/annotate.rs b/vdj_ann/src/annotate.rs index 38bd005f5..adc59d263 100644 --- a/vdj_ann/src/annotate.rs +++ b/vdj_ann/src/annotate.rs @@ -3007,9 +3007,11 @@ pub struct ContigAnnotation { pub fwr4: Option, // annotations + #[serde(default)] pub annotations: Vec, // the annotations - pub clonotype: Option, // null, filled in later - pub info: ClonotypeInfo, // Empty initially, may be filled in later + pub clonotype: Option, // null, filled in later + #[serde(default)] + pub info: ClonotypeInfo, // Empty initially, may be filled in later // state of the contig pub high_confidence: bool, // declared high confidence? @@ -3018,7 +3020,8 @@ pub struct ContigAnnotation { pub invalidated_umis: Option>, // invalidated UMIs pub is_cell: bool, // was the barcode declared a cell? pub productive: Option, // productive? (null means not full length) - pub filtered: bool, // true and never changed (unused field) + #[serde(default = "set_true")] + pub filtered: bool, // true and never changed (unused field) pub is_gex_cell: Option, // Was the barcode declared a cell by Gene expression data, if available pub is_asm_cell: Option, // Was the barcode declared a cell by the VDJ assembler @@ -3030,6 +3033,10 @@ pub struct ContigAnnotation { pub sample: Option, } +fn set_true() -> bool { + true +} + impl ContigAnnotation { // Given the alignment entities produced by annotate_seq, produce a // ContigAnnotation. This is done so as to produce at most one V, D, J and C,