diff --git a/fixtures/aa.gfa b/fixtures/aa.gfa new file mode 100644 index 0000000..2e83010 --- /dev/null +++ b/fixtures/aa.gfa @@ -0,0 +1,5 @@ +H VN:Z:1.2 +S 1 A SN:Z:123 SO:i:0 SR:i:0 +S 2 A SN:Z:123 SO:i:0 SR:i:0 +L 1 + 2 + * +P 124 1+,2+ 0M diff --git a/migrations/core/01-initial/up.sql b/migrations/core/01-initial/up.sql index 0d2a0dd..bcd95a0 100644 --- a/migrations/core/01-initial/up.sql +++ b/migrations/core/01-initial/up.sql @@ -19,8 +19,14 @@ CREATE TABLE sequence ( length INTEGER NOT NULL ) STRICT; +CREATE TABLE nodes ( + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, + sequence_hash TEXT NOT NULL, + FOREIGN KEY(sequence_hash) REFERENCES sequence(hash) +) STRICT; + CREATE TABLE block_group ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, collection_name TEXT NOT NULL, sample_name TEXT, name TEXT NOT NULL, @@ -31,7 +37,7 @@ CREATE UNIQUE INDEX block_group_uidx ON block_group(collection_name, sample_name CREATE UNIQUE INDEX block_group_null_sample_uidx ON block_group(collection_name, name) WHERE sample_name is null; CREATE TABLE path ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, block_group_id INTEGER NOT NULL, name TEXT NOT NULL, FOREIGN KEY(block_group_id) REFERENCES block_group(id) @@ -40,7 +46,7 @@ CREATE UNIQUE INDEX path_uidx ON path(block_group_id, name); -- an operation from a vcf can impact multiple paths and samples, so operation is not faceted on that CREATE TABLE operation ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, parent_id INTEGER, collection_name TEXT NOT NULL, change_type TEXT NOT NULL, @@ -49,36 +55,36 @@ CREATE TABLE operation ( ) STRICT; CREATE TABLE file_addition ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, file_path TEXT NOT NULL, file_type TEXT NOT NULL ) STRICT; CREATE TABLE operation_summary ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, 
operation_id INTEGER NOT NULL, summary TEXT NOT NULL, FOREIGN KEY(operation_id) REFERENCES operation(id) ) STRICT; CREATE TABLE edges ( - id INTEGER PRIMARY KEY NOT NULL, - source_hash TEXT NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, + source_node_id INTEGER, source_coordinate INTEGER NOT NULL, source_strand TEXT NOT NULL, - target_hash TEXT NOT NULL, + target_node_id INTEGER, target_coordinate INTEGER NOT NULL, target_strand TEXT NOT NULL, chromosome_index INTEGER NOT NULL, phased INTEGER NOT NULL, - FOREIGN KEY(source_hash) REFERENCES sequence(hash), - FOREIGN KEY(target_hash) REFERENCES sequence(hash), + FOREIGN KEY(source_node_id) REFERENCES nodes(id), + FOREIGN KEY(target_node_id) REFERENCES nodes(id), constraint chk_phased check (phased in (0, 1)) ) STRICT; -CREATE UNIQUE INDEX edge_uidx ON edges(source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased); +CREATE UNIQUE INDEX edge_uidx ON edges(source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased); CREATE TABLE path_edges ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, path_id INTEGER NOT NULL, index_in_path INTEGER NOT NULL, edge_id INTEGER NOT NULL, @@ -88,7 +94,7 @@ CREATE TABLE path_edges ( CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id, index_in_path); CREATE TABLE block_group_edges ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, block_group_id INTEGER NOT NULL, edge_id INTEGER NOT NULL, FOREIGN KEY(block_group_id) REFERENCES block_group(id), @@ -96,7 +102,6 @@ CREATE TABLE block_group_edges ( ) STRICT; CREATE UNIQUE INDEX block_group_edges_uidx ON block_group_edges(block_group_id, edge_id); -INSERT INTO sequence (hash, sequence_type, sequence, name, file_path, "length") values ("start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "OTHER", 
"start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "", "", 64), ("end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "OTHER", "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "", "", 64); INSERT INTO gen_metadata (db_uuid) values (lower( hex(randomblob(4)) || '-' || hex(randomblob(2)) || '-' || '4' || substr(hex( randomblob(2)), 2) || '-' || @@ -104,3 +109,7 @@ INSERT INTO gen_metadata (db_uuid) values (lower( substr(hex(randomblob(2)), 2) || '-' || hex(randomblob(6)) )); +INSERT INTO sequence (hash, sequence_type, sequence, name, file_path, "length") values ("start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "OTHER", "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "", "", 64), ("end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "OTHER", "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "", "", 64); +INSERT INTO nodes (id, sequence_hash) values (1, "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"); +INSERT INTO nodes (id, sequence_hash) values (2, "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"); +UPDATE SQLITE_SEQUENCE SET seq = 2 WHERE name = 'nodes'; diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index 723a9d5..26f01ce 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -10,9 +10,9 @@ use crate::models::{ block_group_edge::BlockGroupEdge, collection::Collection, edge::{Edge, GroupBlock}, + node::{PATH_END_NODE_ID, PATH_START_NODE_ID}, path::Path, path_edge::PathEdge, - sequence::Sequence, strand::Strand, }; @@ -36,9 +36,7 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf) let mut terminal_block_ids = HashSet::new(); for block in &blocks { - if block.sequence_hash == Sequence::PATH_START_HASH - || block.sequence_hash == Sequence::PATH_END_HASH - { + if block.node_id == PATH_START_NODE_ID || block.node_id == PATH_END_NODE_ID { 
terminal_block_ids.insert(block.id); continue; } @@ -109,7 +107,7 @@ fn link_line( target_strand: Strand, ) -> String { format!( - "L\t{}\t{}\t{}\t{}\t*\n", + "L\t{}\t{}\t{}\t{}\t0M\n", source_index + 1, source_strand, target_index + 1, @@ -125,21 +123,21 @@ fn link_line( fn nodes_for_edges( edge1: &Edge, edge2: &Edge, - blocks_by_hash_and_start: &HashMap<(&str, i32), GroupBlock>, - blocks_by_hash_and_end: &HashMap<(&str, i32), GroupBlock>, + blocks_by_node_and_start: &HashMap<(i32, i32), GroupBlock>, + blocks_by_node_and_end: &HashMap<(i32, i32), GroupBlock>, ) -> Vec { - let mut current_block = blocks_by_hash_and_start - .get(&(edge1.target_hash.as_str(), edge1.target_coordinate)) + let mut current_block = blocks_by_node_and_start + .get(&(edge1.target_node_id, edge1.target_coordinate)) .unwrap(); - let end_block = blocks_by_hash_and_end - .get(&(edge2.source_hash.as_str(), edge2.source_coordinate)) + let end_block = blocks_by_node_and_end + .get(&(edge2.source_node_id, edge2.source_coordinate)) .unwrap(); let mut node_ids = vec![]; #[allow(clippy::while_immutable_condition)] while current_block.id != end_block.id { node_ids.push(current_block.id); - current_block = blocks_by_hash_and_start - .get(&(current_block.sequence_hash.as_str(), current_block.end)) + current_block = blocks_by_node_and_start + .get(&(current_block.node_id, current_block.end)) .unwrap(); } node_ids.push(end_block.id); @@ -157,34 +155,34 @@ fn write_paths( let edges_by_path_id = PathEdge::edges_for_paths(conn, paths.iter().map(|path| path.id).collect()); - let blocks_by_hash_and_start = blocks + let blocks_by_node_and_start = blocks .iter() - .map(|block| ((block.sequence_hash.as_str(), block.start), block.clone())) - .collect::>(); - let blocks_by_hash_and_end = blocks + .map(|block| ((block.node_id, block.start), block.clone())) + .collect::>(); + let blocks_by_node_and_end = blocks .iter() - .map(|block| ((block.sequence_hash.as_str(), block.end), block.clone())) - .collect::>(); + 
.map(|block| ((block.node_id, block.end), block.clone())) + .collect::>(); for path in paths { let edges_for_path = edges_by_path_id.get(&path.id).unwrap(); - let mut node_ids = vec![]; + let mut graph_node_ids = vec![]; let mut node_strands = vec![]; for (edge1, edge2) in edges_for_path.iter().tuple_windows() { let current_node_ids = nodes_for_edges( edge1, edge2, - &blocks_by_hash_and_start, - &blocks_by_hash_and_end, + &blocks_by_node_and_start, + &blocks_by_node_and_end, ); for node_id in &current_node_ids { - node_ids.push(*node_id); + graph_node_ids.push(*node_id); node_strands.push(edge1.target_strand); } } writer - .write_all(&path_line(&path.name, &node_ids, &node_strands).into_bytes()) + .write_all(&path_line(&path.name, &graph_node_ids, &node_strands).into_bytes()) .unwrap_or_else(|_| panic!("Error writing path {} to GFA stream", path.name)); } } @@ -199,12 +197,16 @@ fn path_line(path_name: &str, node_ids: &[i32], node_strands: &[Strand]) -> Stri format!("P\t{}\t{}\n", path_name, nodes) } +#[cfg(test)] mod tests { // Note this useful idiom: importing names from outer (for mod tests) scope. 
use super::*; use crate::imports::gfa::import_gfa; - use crate::models::{block_group::BlockGroup, collection::Collection}; + use crate::models::{ + block_group::BlockGroup, collection::Collection, node::Node, sequence::Sequence, + }; + use crate::test_helpers::{get_connection, setup_gen_dir}; use tempfile::tempdir; @@ -232,13 +234,17 @@ mod tests { .sequence_type("DNA") .sequence("CCCC") .save(&conn); + let node1_id = Node::create(&conn, &sequence1.hash); + let node2_id = Node::create(&conn, &sequence2.hash); + let node3_id = Node::create(&conn, &sequence3.hash); + let node4_id = Node::create(&conn, &sequence4.hash); let edge1 = Edge::create( &conn, - Sequence::PATH_START_HASH.to_string(), + PATH_START_NODE_ID, 0, Strand::Forward, - sequence1.hash.clone(), + node1_id, 0, Strand::Forward, 0, @@ -246,10 +252,10 @@ mod tests { ); let edge2 = Edge::create( &conn, - sequence1.hash, + node1_id, 4, Strand::Forward, - sequence2.hash.clone(), + node2_id, 0, Strand::Forward, 0, @@ -257,10 +263,10 @@ mod tests { ); let edge3 = Edge::create( &conn, - sequence2.hash, + node2_id, 4, Strand::Forward, - sequence3.hash.clone(), + node3_id, 0, Strand::Forward, 0, @@ -268,10 +274,10 @@ mod tests { ); let edge4 = Edge::create( &conn, - sequence3.hash, + node3_id, 4, Strand::Forward, - sequence4.hash.clone(), + node4_id, 0, Strand::Forward, 0, @@ -279,10 +285,10 @@ mod tests { ); let edge5 = Edge::create( &conn, - sequence4.hash, + node4_id, 4, Strand::Forward, - Sequence::PATH_END_HASH.to_string(), + PATH_END_NODE_ID, 0, Strand::Forward, 0, diff --git a/src/imports/fasta.rs b/src/imports/fasta.rs index 5507697..6aeed28 100644 --- a/src/imports/fasta.rs +++ b/src/imports/fasta.rs @@ -4,8 +4,15 @@ use std::str; use crate::models::file_types::FileTypes; use crate::models::operations::{FileAddition, Operation, OperationSummary}; use crate::models::{ - block_group::BlockGroup, block_group_edge::BlockGroupEdge, collection::Collection, edge::Edge, - metadata, path::Path, sequence::Sequence, 
strand::Strand, + block_group::BlockGroup, + block_group_edge::BlockGroupEdge, + collection::Collection, + edge::Edge, + metadata, + node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}, + path::Path, + sequence::Sequence, + strand::Strand, }; use crate::operation_management; use noodles::fasta; @@ -57,13 +64,14 @@ pub fn import_fasta( .sequence(&sequence) .save(conn) }; + let node_id = Node::create(conn, &seq.hash); let block_group = BlockGroup::create(conn, &collection.name, None, &name); let edge_into = Edge::create( conn, - Sequence::PATH_START_HASH.to_string(), + PATH_START_NODE_ID, 0, Strand::Forward, - seq.hash.to_string(), + node_id, 0, Strand::Forward, 0, @@ -71,10 +79,10 @@ pub fn import_fasta( ); let edge_out_of = Edge::create( conn, - seq.hash.to_string(), + node_id, sequence_length, Strand::Forward, - Sequence::PATH_END_HASH.to_string(), + PATH_END_NODE_ID, 0, Strand::Forward, 0, diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index 8b3616f..aaba44c 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -8,6 +8,7 @@ use crate::models::{ block_group_edge::BlockGroupEdge, collection::Collection, edge::{Edge, EdgeData}, + node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}, path::Path, sequence::Sequence, strand::Strand, @@ -26,6 +27,7 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) let block_group = BlockGroup::create(conn, collection_name, None, ""); let gfa: Gfa = Gfa::parse_gfa_file(gfa_path.to_str().unwrap()); let mut sequences_by_segment_id: HashMap = HashMap::new(); + let mut node_ids_by_segment_id: HashMap = HashMap::new(); for segment in &gfa.segments { let input_sequence = segment.sequence.get_string(&gfa.sequence); @@ -33,72 +35,77 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) .sequence_type("DNA") .sequence(input_sequence) .save(conn); - sequences_by_segment_id.insert(segment.id, sequence); + sequences_by_segment_id.insert(segment.id, sequence.clone()); + let 
node_id = Node::create(conn, &sequence.hash); + node_ids_by_segment_id.insert(segment.id, node_id); } let mut edges = HashSet::new(); for link in &gfa.links { let source = sequences_by_segment_id.get(&link.from).unwrap(); - let target = sequences_by_segment_id.get(&link.to).unwrap(); + let source_node_id = *node_ids_by_segment_id.get(&link.from).unwrap(); + let target_node_id = *node_ids_by_segment_id.get(&link.to).unwrap(); edges.insert(edge_data_from_fields( - &source.hash, + source_node_id, source.length, bool_to_strand(link.from_dir), - &target.hash, + target_node_id, bool_to_strand(link.to_dir), )); } for input_path in &gfa.paths { - let mut source_hash = Sequence::PATH_START_HASH; + let mut source_node_id = PATH_START_NODE_ID; let mut source_coordinate = 0; let mut source_strand = Strand::Forward; for (index, segment_id) in input_path.nodes.iter().enumerate() { let target = sequences_by_segment_id.get(segment_id).unwrap(); + let target_node_id = *node_ids_by_segment_id.get(segment_id).unwrap(); let target_strand = bool_to_strand(input_path.dir[index]); edges.insert(edge_data_from_fields( - source_hash, + source_node_id, source_coordinate, source_strand, - &target.hash, + target_node_id, target_strand, )); - source_hash = &target.hash; + source_node_id = target_node_id; source_coordinate = target.length; source_strand = target_strand; } edges.insert(edge_data_from_fields( - source_hash, + source_node_id, source_coordinate, source_strand, - Sequence::PATH_END_HASH, + PATH_END_NODE_ID, Strand::Forward, )); } for input_walk in &gfa.walk { - let mut source_hash = Sequence::PATH_START_HASH; + let mut source_node_id = PATH_START_NODE_ID; let mut source_coordinate = 0; let mut source_strand = Strand::Forward; for (index, segment_id) in input_walk.walk_id.iter().enumerate() { let target = sequences_by_segment_id.get(segment_id).unwrap(); + let target_node_id = *node_ids_by_segment_id.get(segment_id).unwrap(); let target_strand = 
bool_to_strand(input_walk.walk_dir[index]); edges.insert(edge_data_from_fields( - source_hash, + source_node_id, source_coordinate, source_strand, - &target.hash, + target_node_id, target_strand, )); - source_hash = &target.hash; + source_node_id = target_node_id; source_coordinate = target.length; source_strand = target_strand; } edges.insert(edge_data_from_fields( - source_hash, + source_node_id, source_coordinate, source_strand, - Sequence::PATH_END_HASH, + PATH_END_NODE_ID, Strand::Forward, )); } @@ -110,10 +117,10 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) let mut edge_ids_by_data = HashMap::new(); for edge in saved_edges { let key = edge_data_from_fields( - &edge.source_hash, + edge.source_node_id, edge.source_coordinate, edge.source_strand, - &edge.target_hash, + edge.target_node_id, edge.target_strand, ); edge_ids_by_data.insert(key, edge.id); @@ -121,31 +128,32 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) for input_path in &gfa.paths { let path_name = &input_path.name; - let mut source_hash = Sequence::PATH_START_HASH; + let mut source_node_id = PATH_START_NODE_ID; let mut source_coordinate = 0; let mut source_strand = Strand::Forward; let mut path_edge_ids = vec![]; for (index, segment_id) in input_path.nodes.iter().enumerate() { let target = sequences_by_segment_id.get(segment_id).unwrap(); + let target_node_id = *node_ids_by_segment_id.get(segment_id).unwrap(); let target_strand = bool_to_strand(input_path.dir[index]); let key = edge_data_from_fields( - source_hash, + source_node_id, source_coordinate, source_strand, - &target.hash, + target_node_id, target_strand, ); let edge_id = *edge_ids_by_data.get(&key).unwrap(); path_edge_ids.push(edge_id); - source_hash = &target.hash; + source_node_id = target_node_id; source_coordinate = target.length; source_strand = target_strand; } let key = edge_data_from_fields( - source_hash, + source_node_id, source_coordinate, 
source_strand, - Sequence::PATH_END_HASH, + PATH_END_NODE_ID, Strand::Forward, ); let edge_id = *edge_ids_by_data.get(&key).unwrap(); @@ -155,31 +163,32 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) for input_walk in &gfa.walk { let path_name = &input_walk.sample_id; - let mut source_hash = Sequence::PATH_START_HASH; + let mut source_node_id = PATH_START_NODE_ID; let mut source_coordinate = 0; let mut source_strand = Strand::Forward; let mut path_edge_ids = vec![]; for (index, segment_id) in input_walk.walk_id.iter().enumerate() { let target = sequences_by_segment_id.get(segment_id).unwrap(); + let target_node_id = *node_ids_by_segment_id.get(segment_id).unwrap(); let target_strand = bool_to_strand(input_walk.walk_dir[index]); let key = edge_data_from_fields( - source_hash, + source_node_id, source_coordinate, source_strand, - &target.hash, + target_node_id, target_strand, ); let edge_id = *edge_ids_by_data.get(&key).unwrap(); path_edge_ids.push(edge_id); - source_hash = &target.hash; + source_node_id = target_node_id; source_coordinate = target.length; source_strand = target_strand; } let key = edge_data_from_fields( - source_hash, + source_node_id, source_coordinate, source_strand, - Sequence::PATH_END_HASH, + PATH_END_NODE_ID, Strand::Forward, ); let edge_id = *edge_ids_by_data.get(&key).unwrap(); @@ -189,17 +198,17 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) } fn edge_data_from_fields( - source_hash: &str, + source_node_id: i32, source_coordinate: i32, source_strand: Strand, - target_hash: &str, + target_node_id: i32, target_strand: Strand, ) -> EdgeData { EdgeData { - source_hash: source_hash.to_string(), + source_node_id, source_coordinate, source_strand, - target_hash: target_hash.to_string(), + target_node_id, target_coordinate: 0, target_strand, chromosome_index: 0, @@ -236,6 +245,9 @@ mod tests { let result = Path::sequence(conn, path); assert_eq!(result, "ATGGCATATTCGCAGCT"); + 
+ let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; + assert_eq!(node_count, 6); } #[test] @@ -252,6 +264,9 @@ mod tests { all_sequences, HashSet::from_iter(vec!["AAAATTTTGGGGCCCC".to_string()]) ); + + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; + assert_eq!(node_count, 6); } #[test] @@ -275,6 +290,9 @@ mod tests { let result = Path::sequence(conn, path); assert_eq!(result, "ACCTACAAATTCAAAC"); + + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; + assert_eq!(node_count, 6); } #[test] @@ -298,6 +316,9 @@ mod tests { let result = Path::sequence(conn, path); assert_eq!(result, "TATGCCAGCTGCGAATA"); + + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; + assert_eq!(node_count, 6); } #[test] @@ -323,9 +344,115 @@ mod tests { .clone(); let result = Path::sequence(conn, path); - let expected_sequence_parts = vec!["T", "T", "G", "A", "C", "G", "GCTAGCTCAG", "T", "CCT", "A", "GG", "T", "A", "C", "A", "G", - 
"TGCTAGCTACTAGTGAAAGAGGAGAAATACTAGATGGCTTCCTCCGAAGACGTTATCAAAGAGTTCATGCGTTTCAAAGTTCGTATGGAAGGTTCCGTTAACGGTCACGAGTTCGAAATCGAAGGTGAAGGTGAAGGTCGTCCGTACGAAGGTACCCAGACCGCTAAACTGAAAGTTACCAAAGGTGGTCCGCTGCCGTTCGCTTGGGACATCCTGTCCCCGCAGTTCCAGTACGGTTCCAAAGCTTACGTTAAACACCCGGCTGACATCCCGGACTACCTGAAACTGTCCTTCCCGGAAGGTTTCAAATGGGAACGTGTTATGAACTTCGAAGACGGTGGTGTTGTTACCGTTACCCAGGACTCCTCCCTGCAAGACGGTGAGTTCATCTACAAAGTTAAACTGCGTGGTACCAACTTCCCGTCCGACGGTCCGGTTATGCAGAAAAAAACCATGGGTTGGGAAGCTTCCACCGAACGTATGTACCCGGAAGACGGTGCTCTGAAAGGTGAAATCAAAATGCGTCTGAAACTGAAAGACGGTGGTCACTACGACGCTGAAGTTAAAACCACCTACATGGCTAAAAAACCGGTTCAGCTGCCGGGTGCTTACAAAACCGACATCAAACTGGACATCACCTCCCACAACGAAGACTACACCATCGTTGAACAGTACGAACGTGCTGAAGGTCGTCACTCCACCGGTGCTTAATAACGCTGATAGTGCTAGTGTAGATCGCTACTAGAGCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATATACTAGAAGCGGCCGCTGCAGGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATT
TATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCAGAATTTCAGATAAAAAAAATCCTTAGCTTTCGCTAAGGATGATTTCTGGAATTCGCGGCCGCATCTAGAG"]; + let big_part = "TGCTAGCTACTAGTGAAAGAGGAGAAATACTAGATGGCTTCCTCCGAAGACGTTATCAAAGAGTTCATGCGTTTCAAAGTTCGTATGGAAGGTTCCGTTAACGGTCACGAGTTCGAAATCGAAGGTGAAGGTGAAGGTCGTCCGTACGAAGGTACCCAGACCGCTAAACTGAAAGTTACCAAAGGTGGTCCGCTGCCGTTCGCTTGGGACATCCTGTCCCCGCAGTTCCAGTACGGTTCCAAAGCTTACGTTAAACACCCGGCTGACATCCCGGACTACCTGAAACTGTCCTTCCCGGAAGGTTTCAAATGGGAACGTGTTATGAACTTCGAAGACGGTGGTGTTGTTACCGTTACCCAGGACTCCTCCCTGCAAGACGGTGAGTTCATCTACAAAGTTAAACTGCGTGGTACCAACTTCCCGTCCGACGGTCCGGTTATGCAGAAAAAAACCATGGGTTGGGAAGCTTCCACCGAACGTATGTACCCGGAAGACGGTGCTCTGAAAGGTGAAATCAAAATGCGTCTGAAACTGAAAGACGGTGGTCACTACGACGCTGAAGTTAAAACCACCTACATGGCTAAAAAACCGGTTCAGCTGCCGGGTGCTTACAAAACCGACATCAAACTGGACATCACCTCCCACAACGAAGACTACACCATCGTTGAACAGTACGAACGTGCTGAAGGTCGTCACTCCACCGGTGCTTAATAACGCTGATAGTGCTAGTGTAGATCGCTACTAGAGCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATATACTAGAAGCGGCCGCTGCAGGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGC
AAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCAGAATTTCAGATAAAAAAAATCCTTAGCTTTCGCTAAGGATGATTTCTGGAATTCGCGGCCGCATCTAGAG"; + let expected_sequence_parts = vec![ + "T", + "T", + "G", + 
"A", + "C", + "G", + "GCTAGCTCAG", + "T", + "CCT", + "A", + "GG", + "T", + "A", + "C", + "A", + "G", + big_part, + ]; + let expected_sequence = expected_sequence_parts.join(""); assert_eq!(result, expected_sequence); + + let part1 = "T"; + let part3 = "T"; + let part4_5 = vec!["G", "T"]; + let part6 = "A"; + let part7_8 = vec!["C", "T"]; + let part9_10 = vec!["A", "G"]; + let part11 = "GCTAGCTCAG"; + let part12_13 = vec!["T", "C"]; + let part14 = "CCT"; + let part15_16 = vec!["A", "T"]; + let part17 = "GG"; + let part18_19 = vec!["T", "G"]; + let part20 = "A"; + let part21_22 = vec!["T", "C"]; + let part23_24 = vec!["A", "T"]; + let part25_26 = vec!["A", "G"]; + + let mut expected_sequences = HashSet::new(); + for part_a in &part4_5 { + for part_b in &part7_8 { + for part_c in &part9_10 { + for part_d in &part12_13 { + for part_e in &part15_16 { + for part_f in &part18_19 { + for part_g in &part21_22 { + for part_h in &part23_24 { + for part_i in &part25_26 { + let expected_sequence_parts1 = vec![ + part1, part3, part_a, part6, part_b, part_c, + part11, part_d, part14, part_e, part17, part_f, + part20, part_g, part_h, part_i, big_part, + ]; + let temp_sequence1 = expected_sequence_parts1.join(""); + let expected_sequence_parts2 = vec![ + part3, part_a, part6, part_b, part_c, part11, + part_d, part14, part_e, part17, part_f, part20, + part_g, part_h, part_i, big_part, + ]; + let temp_sequence2 = expected_sequence_parts2.join(""); + expected_sequences.insert(temp_sequence1); + expected_sequences.insert(temp_sequence2); + } + } + } + } + } + } + } + } + } + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + assert_eq!(all_sequences.len(), 1024); + assert_eq!(all_sequences, expected_sequences); + + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; + assert_eq!(node_count, 28); + } + + #[test] + fn test_import_aa_gfa() { + setup_gen_dir(); + let mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + 
gfa_path.push("fixtures/aa.gfa"); + let collection_name = "test".to_string(); + let conn = &get_connection(None); + import_gfa(&gfa_path, &collection_name, conn); + + let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); + let path = Path::get_paths( + conn, + "select * from path where block_group_id = ?1 AND name = ?2", + vec![ + SQLValue::from(block_group_id), + SQLValue::from("124".to_string()), + ], + )[0] + .clone(); + + let result = Path::sequence(conn, path); + assert_eq!(result, "AA"); + + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + assert_eq!(all_sequences, HashSet::from_iter(vec!["AA".to_string()])); + + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; + assert_eq!(node_count, 4); } } diff --git a/src/models.rs b/src/models.rs index 0be2805..5da2e5c 100644 --- a/src/models.rs +++ b/src/models.rs @@ -4,6 +4,7 @@ pub mod collection; pub mod edge; pub mod file_types; pub mod metadata; +pub mod node; pub mod operations; pub mod path; pub mod path_edge; diff --git a/src/models/block_group.rs b/src/models/block_group.rs index 329f96d..4174fde 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -8,9 +8,9 @@ use serde::{Deserialize, Serialize}; use crate::graph::all_simple_paths; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::edge::{Edge, EdgeData, GroupBlock}; -use crate::models::path::{NewBlock, Path, PathData}; +use crate::models::node::{PATH_END_NODE_ID, PATH_START_NODE_ID}; +use crate::models::path::{Path, PathBlock, PathData}; use crate::models::path_edge::PathEdge; -use crate::models::sequence::Sequence; use crate::models::strand::Strand; #[derive(Debug, Deserialize, Serialize)] @@ -34,14 +34,14 @@ pub struct PathChange { pub path: Path, pub start: i32, pub end: i32, - pub block: NewBlock, + pub block: PathBlock, pub chromosome_index: i32, pub phased: i32, } pub struct PathCache<'a> { pub cache: HashMap, - pub 
intervaltree_cache: HashMap>, + pub intervaltree_cache: HashMap>, pub conn: &'a Connection, } @@ -49,7 +49,7 @@ impl PathCache<'_> { pub fn new(conn: &Connection) -> PathCache { PathCache { cache: HashMap::::new(), - intervaltree_cache: HashMap::>::new(), + intervaltree_cache: HashMap::>::new(), conn, } } @@ -80,7 +80,7 @@ impl PathCache<'_> { pub fn get_intervaltree<'a>( path_cache: &'a PathCache<'a>, path: &'a Path, - ) -> Option<&'a IntervalTree> { + ) -> Option<&'a IntervalTree> { path_cache.intervaltree_cache.get(path) } } @@ -282,9 +282,7 @@ impl BlockGroup { // TODO: maybe make all_simple_paths return a single path id where start == end if start_node == *end_node { let block = blocks_by_id.get(&start_node).unwrap(); - if block.sequence_hash != Sequence::PATH_START_HASH - && block.sequence_hash != Sequence::PATH_END_HASH - { + if block.node_id != PATH_START_NODE_ID && block.node_id != PATH_END_NODE_ID { sequences.insert(block.sequence.clone()); } } else { @@ -326,7 +324,7 @@ impl BlockGroup { pub fn insert_change( conn: &Connection, change: &PathChange, - tree: &IntervalTree, + tree: &IntervalTree, ) { let new_edges = BlockGroup::set_up_new_edges(change, tree); let edge_ids = Edge::bulk_create(conn, new_edges); @@ -335,14 +333,14 @@ impl BlockGroup { pub fn set_up_new_edges( change: &PathChange, - tree: &IntervalTree, + tree: &IntervalTree, ) -> Vec { - let start_blocks: Vec<&NewBlock> = + let start_blocks: Vec<&PathBlock> = tree.query_point(change.start).map(|x| &x.value).collect(); assert_eq!(start_blocks.len(), 1); // NOTE: This may not be used but needs to be initialized here instead of inside the if // statement that uses it, so that the borrow checker is happy - let previous_start_blocks: Vec<&NewBlock> = tree + let previous_start_blocks: Vec<&PathBlock> = tree .query_point(change.start - 1) .map(|x| &x.value) .collect(); @@ -355,7 +353,7 @@ impl BlockGroup { start_blocks[0] }; - let end_blocks: Vec<&NewBlock> = tree.query_point(change.end).map(|x| 
&x.value).collect(); + let end_blocks: Vec<&PathBlock> = tree.query_point(change.end).map(|x| &x.value).collect(); assert_eq!(end_blocks.len(), 1); let end_block = end_blocks[0]; @@ -364,11 +362,11 @@ impl BlockGroup { if change.block.sequence_start == change.block.sequence_end { // Deletion let new_edge = EdgeData { - source_hash: start_block.sequence.hash.clone(), + source_node_id: start_block.node_id, source_coordinate: change.start - start_block.path_start + start_block.sequence_start, source_strand: Strand::Forward, - target_hash: end_block.sequence.hash.clone(), + target_node_id: end_block.node_id, target_coordinate: change.end - end_block.path_start + end_block.sequence_start, target_strand: Strand::Forward, chromosome_index: change.chromosome_index, @@ -381,10 +379,10 @@ impl BlockGroup { // another start point in the block group DAG. if change.start == 0 { let new_beginning_edge = EdgeData { - source_hash: Sequence::PATH_START_HASH.to_string(), + source_node_id: PATH_START_NODE_ID, source_coordinate: 0, source_strand: Strand::Forward, - target_hash: end_block.sequence.hash.clone(), + target_node_id: end_block.node_id, target_coordinate: change.end - end_block.path_start + end_block.sequence_start, target_strand: Strand::Forward, chromosome_index: change.chromosome_index, @@ -398,21 +396,21 @@ impl BlockGroup { } else { // Insertion/replacement let new_start_edge = EdgeData { - source_hash: start_block.sequence.hash.clone(), + source_node_id: start_block.node_id, source_coordinate: change.start - start_block.path_start + start_block.sequence_start, source_strand: Strand::Forward, - target_hash: change.block.sequence.hash.clone(), + target_node_id: change.block.node_id, target_coordinate: change.block.sequence_start, target_strand: Strand::Forward, chromosome_index: change.chromosome_index, phased: change.phased, }; let new_end_edge = EdgeData { - source_hash: change.block.sequence.hash.clone(), + source_node_id: change.block.node_id, source_coordinate: 
change.block.sequence_end, source_strand: Strand::Forward, - target_hash: end_block.sequence.hash.clone(), + target_node_id: end_block.node_id, target_coordinate: change.end - end_block.path_start + end_block.sequence_start, target_strand: Strand::Forward, chromosome_index: change.chromosome_index, @@ -429,7 +427,7 @@ impl BlockGroup { #[cfg(test)] mod tests { use super::*; - use crate::models::{collection::Collection, sample::Sample}; + use crate::models::{collection::Collection, node::Node, sample::Sample, sequence::Sequence}; use crate::test_helpers::{get_connection, setup_block_group}; #[test] @@ -470,9 +468,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -505,9 +504,10 @@ mod tests { .sequence_type("DNA") .sequence("") .save(&conn); - let deletion = NewBlock { + let deletion_node_id = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion = PathBlock { id: 0, - sequence: deletion_sequence.clone(), + node_id: deletion_node_id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -548,9 +548,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -588,9 +589,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - 
sequence: insert_sequence.clone(), + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -628,9 +630,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -668,9 +671,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -708,9 +712,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -748,9 +753,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -788,9 +794,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node_id, 
block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -828,9 +835,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -868,9 +876,10 @@ mod tests { .sequence_type("DNA") .sequence("") .save(&conn); - let deletion = NewBlock { + let deletion_node_id = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion = PathBlock { id: 0, - sequence: deletion_sequence.clone(), + node_id: deletion_node_id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -910,9 +919,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -962,9 +972,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1003,9 +1014,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), 
sequence_start: 0, sequence_end: 4, @@ -1043,9 +1055,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1083,9 +1096,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1123,9 +1137,10 @@ mod tests { .sequence_type("DNA") .sequence("") .save(&conn); - let deletion = NewBlock { + let deletion_node_id = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion = PathBlock { id: 0, - sequence: deletion_sequence.clone(), + node_id: deletion_node_id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -1163,9 +1178,10 @@ mod tests { .sequence_type("DNA") .sequence("") .save(&conn); - let deletion = NewBlock { + let deletion_node_id = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion = PathBlock { id: 0, - sequence: deletion_sequence.clone(), + node_id: deletion_node_id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -1203,9 +1219,10 @@ mod tests { .sequence_type("DNA") .sequence("") .save(&conn); - let deletion = NewBlock { + let deletion_node_id = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion = PathBlock { id: 0, - sequence: deletion_sequence.clone(), + node_id: deletion_node_id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -1243,9 
+1260,10 @@ mod tests { .sequence_type("DNA") .sequence("") .save(&conn); - let deletion = NewBlock { + let deletion_node_id = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion = PathBlock { id: 0, - sequence: deletion_sequence.clone(), + node_id: deletion_node_id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, diff --git a/src/models/edge.rs b/src/models/edge.rs index d53d731..edac91d 100644 --- a/src/models/edge.rs +++ b/src/models/edge.rs @@ -7,15 +7,16 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection, Result as SQLResult, Row}; use serde::{Deserialize, Serialize}; -use crate::models::{sequence::Sequence, strand::Strand}; +use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}; +use crate::models::strand::Strand; #[derive(Clone, Debug, Eq, Hash, PartialEq, Deserialize, Serialize)] pub struct Edge { pub id: i32, - pub source_hash: String, + pub source_node_id: i32, pub source_coordinate: i32, pub source_strand: Strand, - pub target_hash: String, + pub target_node_id: i32, pub target_coordinate: i32, pub target_strand: Strand, pub chromosome_index: i32, @@ -24,10 +25,10 @@ pub struct Edge { #[derive(Clone, Debug, Eq, Hash, PartialEq)] pub struct EdgeData { - pub source_hash: String, + pub source_node_id: i32, pub source_coordinate: i32, pub source_strand: Strand, - pub target_hash: String, + pub target_node_id: i32, pub target_coordinate: i32, pub target_strand: Strand, pub chromosome_index: i32, @@ -37,10 +38,10 @@ pub struct EdgeData { impl From<&Edge> for EdgeData { fn from(item: &Edge) -> Self { EdgeData { - source_hash: item.source_hash.clone(), + source_node_id: item.source_node_id, source_coordinate: item.source_coordinate, source_strand: item.source_strand, - target_hash: item.target_hash.clone(), + target_node_id: item.target_node_id, target_coordinate: item.target_coordinate, target_strand: item.target_strand, chromosome_index: 
item.chromosome_index, @@ -51,14 +52,14 @@ impl From<&Edge> for EdgeData { #[derive(Eq, Hash, PartialEq)] pub struct BlockKey { - pub sequence_hash: String, + pub node_id: i32, pub coordinate: i32, } #[derive(Clone, Debug)] pub struct GroupBlock { pub id: i32, - pub sequence_hash: String, + pub node_id: i32, pub sequence: String, pub start: i32, pub end: i32, @@ -68,22 +69,22 @@ impl Edge { #[allow(clippy::too_many_arguments)] pub fn create( conn: &Connection, - source_hash: String, + source_node_id: i32, source_coordinate: i32, source_strand: Strand, - target_hash: String, + target_node_id: i32, target_coordinate: i32, target_strand: Strand, chromosome_index: i32, phased: i32, ) -> Edge { - let query = "INSERT INTO edges (source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8) RETURNING *"; - let id_query = "select id from edges where source_hash = ?1 and source_coordinate = ?2 and source_strand = ?3 and target_hash = ?4 and target_coordinate = ?5 and target_strand = ?6 and chromosome_index = ?7 and phased = ?8"; + let query = "INSERT INTO edges (source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8) RETURNING *"; + let id_query = "select id from edges where and source_node_id = ?1 and source_coordinate = ?2 and source_strand = ?3 and target_node_id = ?4 and target_coordinate = ?5 and target_strand = ?6 and chromosome_index = ?7 and phased = ?8"; let placeholders: Vec = vec![ - source_hash.clone().into(), + source_node_id.into(), source_coordinate.into(), source_strand.into(), - target_hash.clone().into(), + target_node_id.into(), target_coordinate.into(), target_strand.into(), chromosome_index.into(), @@ -94,10 +95,10 @@ impl Edge { match stmt.query_row(params_from_iter(&placeholders), |row| { Ok(Edge { id: row.get(0)?, - source_hash: row.get(1)?, + 
source_node_id: row.get(1)?, source_coordinate: row.get(2)?, source_strand: row.get(3)?, - target_hash: row.get(4)?, + target_node_id: row.get(4)?, target_coordinate: row.get(5)?, target_strand: row.get(6)?, chromosome_index: row.get(7)?, @@ -112,10 +113,10 @@ impl Edge { id: conn .query_row(id_query, params_from_iter(&placeholders), |row| row.get(0)) .unwrap(), - source_hash, + source_node_id, source_coordinate, source_strand, - target_hash, + target_node_id, target_coordinate, target_strand, chromosome_index, @@ -134,10 +135,10 @@ impl Edge { fn edge_from_row(row: &Row) -> SQLResult { Ok(Edge { id: row.get(0)?, - source_hash: row.get(1)?, + source_node_id: row.get(1)?, source_coordinate: row.get(2)?, source_strand: row.get(3)?, - target_hash: row.get(4)?, + target_node_id: row.get(4)?, target_coordinate: row.get(5)?, target_strand: row.get(6)?, chromosome_index: row.get(7)?, @@ -151,7 +152,7 @@ impl Edge { .map(|edge_id| edge_id.to_string()) .collect::>() .join(","); - let query = format!("select id, source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased from edges where id in ({});", formatted_edge_ids); + let query = format!("select id, source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased from edges where id in ({});", formatted_edge_ids); Edge::query(conn, &query, vec![]) } @@ -171,16 +172,14 @@ impl Edge { let mut edge_rows = vec![]; let mut edge_map: HashMap = HashMap::new(); for edge in &edges { - let source_hash = format!("\"{0}\"", edge.source_hash); let source_strand = format!("\"{0}\"", edge.source_strand); - let target_hash = format!("\"{0}\"", edge.target_hash); let target_strand = format!("\"{0}\"", edge.target_strand); let edge_row = format!( "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7})", - source_hash, + edge.source_node_id, edge.source_coordinate, source_strand, - target_hash, + edge.target_node_id, 
edge.target_coordinate, target_strand, edge.chromosome_index, @@ -190,7 +189,7 @@ impl Edge { } let formatted_edge_rows = edge_rows.join(", "); - let select_statement = format!("SELECT * FROM edges WHERE (source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased) in ({0});", formatted_edge_rows); + let select_statement = format!("SELECT * FROM edges WHERE (source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased) in ({0});", formatted_edge_rows); let existing_edges = Edge::query(conn, &select_statement, vec![]); for edge in existing_edges.iter() { edge_map.insert(EdgeData::from(edge), edge.id); @@ -208,16 +207,14 @@ impl Edge { let mut edge_rows_to_insert = vec![]; for edge in edges_to_insert { - let source_hash = format!("\"{0}\"", edge.source_hash); - let target_hash = format!("\"{0}\"", edge.target_hash); let source_strand = format!("\"{0}\"", edge.source_strand); let target_strand = format!("\"{0}\"", edge.target_strand); let edge_row = format!( "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7})", - source_hash, + edge.source_node_id, edge.source_coordinate, source_strand, - target_hash, + edge.target_node_id, edge.target_coordinate, target_strand, edge.chromosome_index, @@ -230,7 +227,7 @@ impl Edge { for chunk in edge_rows_to_insert.chunks(100000) { let formatted_edge_rows_to_insert = chunk.join(", "); - let insert_statement = format!("INSERT INTO edges (source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased) VALUES {0} RETURNING *;", formatted_edge_rows_to_insert); + let insert_statement = format!("INSERT INTO edges (source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased) VALUES {0} RETURNING *;", formatted_edge_rows_to_insert); let mut stmt = conn.prepare(&insert_statement).unwrap(); let rows = 
stmt.query_map([], Edge::edge_from_row).unwrap(); for row in rows { @@ -247,10 +244,10 @@ impl Edge { pub fn to_data(edge: Edge) -> EdgeData { EdgeData { - source_hash: edge.source_hash, + source_node_id: edge.source_node_id, source_coordinate: edge.source_coordinate, source_strand: edge.source_strand, - target_hash: edge.target_hash, + target_node_id: edge.target_node_id, target_coordinate: edge.target_coordinate, target_strand: edge.target_strand, chromosome_index: edge.chromosome_index, @@ -290,35 +287,36 @@ impl Edge { } pub fn blocks_from_edges(conn: &Connection, edges: &Vec) -> (Vec, Vec) { - let mut sequence_hashes = HashSet::new(); - let mut edges_by_source_hash: HashMap<&str, Vec<&Edge>> = HashMap::new(); - let mut edges_by_target_hash: HashMap<&str, Vec<&Edge>> = HashMap::new(); + let mut node_ids = HashSet::new(); + let mut edges_by_source_node_id: HashMap> = HashMap::new(); + let mut edges_by_target_node_id: HashMap> = HashMap::new(); for edge in edges { - if edge.source_hash != Sequence::PATH_START_HASH { - sequence_hashes.insert(edge.source_hash.as_str()); - edges_by_source_hash - .entry(&edge.source_hash) + if edge.source_node_id != PATH_START_NODE_ID { + node_ids.insert(edge.source_node_id); + edges_by_source_node_id + .entry(edge.source_node_id) .and_modify(|edges| edges.push(edge)) .or_default(); } - if edge.target_hash != Sequence::PATH_END_HASH { - sequence_hashes.insert(edge.target_hash.as_str()); - edges_by_target_hash - .entry(&edge.target_hash) + if edge.target_node_id != PATH_END_NODE_ID { + node_ids.insert(edge.target_node_id); + edges_by_target_node_id + .entry(edge.target_node_id) .and_modify(|edges| edges.push(edge)) .or_default(); } } - let sequences_by_hash = - Sequence::sequences_by_hash(conn, sequence_hashes.into_iter().collect::>()); + let sequences_by_node_id = + Node::get_sequences_by_node_ids(conn, node_ids.into_iter().collect::>()); + let mut blocks = vec![]; let mut block_index = 0; let mut boundary_edges = vec![]; - for 
(hash, sequence) in sequences_by_hash.into_iter() { + for (node_id, sequence) in sequences_by_node_id.into_iter() { let block_boundaries = Edge::get_block_boundaries( - edges_by_source_hash.get(hash.as_str()), - edges_by_target_hash.get(hash.as_str()), + edges_by_source_node_id.get(&node_id), + edges_by_target_node_id.get(&node_id), sequence.length, ); for block_boundary in &block_boundaries { @@ -326,10 +324,10 @@ impl Edge { // for the data we need to set up boundary edges in the block group graph boundary_edges.push(Edge { id: -1, - source_hash: hash.clone(), + source_node_id: node_id, source_coordinate: *block_boundary, source_strand: Strand::Unknown, - target_hash: hash.clone(), + target_node_id: node_id, target_coordinate: *block_boundary, target_strand: Strand::Unknown, chromosome_index: 0, @@ -343,7 +341,7 @@ impl Edge { let block_sequence = sequence.get_sequence(start, end).to_string(); let first_block = GroupBlock { id: block_index, - sequence_hash: hash.clone(), + node_id, sequence: block_sequence, start, end, @@ -354,7 +352,7 @@ impl Edge { let block_sequence = sequence.get_sequence(start, end).to_string(); let block = GroupBlock { id: block_index, - sequence_hash: hash.clone(), + node_id, sequence: block_sequence, start, end, @@ -367,7 +365,7 @@ impl Edge { let block_sequence = sequence.get_sequence(start, end).to_string(); let last_block = GroupBlock { id: block_index, - sequence_hash: hash.clone(), + node_id, sequence: block_sequence, start, end, @@ -377,7 +375,7 @@ impl Edge { } else { blocks.push(GroupBlock { id: block_index, - sequence_hash: hash.clone(), + node_id, sequence: sequence.get_sequence(None, None), start: 0, end: sequence.length, @@ -390,19 +388,17 @@ impl Edge { // block group, since different paths in the block group may start or end at different // places on sequences. These two "start sequence" and "end sequence" blocks will serve // that role. 
- let start_sequence = Sequence::sequence_from_hash(conn, Sequence::PATH_START_HASH).unwrap(); let start_block = GroupBlock { id: block_index + 1, - sequence_hash: start_sequence.hash.clone(), + node_id: PATH_START_NODE_ID, sequence: "".to_string(), start: 0, end: 0, }; blocks.push(start_block); - let end_sequence = Sequence::sequence_from_hash(conn, Sequence::PATH_END_HASH).unwrap(); let end_block = GroupBlock { id: block_index + 2, - sequence_hash: end_sequence.hash.clone(), + node_id: PATH_END_NODE_ID, sequence: "".to_string(), start: 0, end: 0, @@ -421,7 +417,7 @@ impl Edge { .map(|block| { ( BlockKey { - sequence_hash: block.sequence_hash, + node_id: block.node_id, coordinate: block.start, }, block.id, @@ -434,7 +430,7 @@ impl Edge { .map(|block| { ( BlockKey { - sequence_hash: block.sequence_hash, + node_id: block.node_id, coordinate: block.end, }, block.id, @@ -449,12 +445,12 @@ impl Edge { } for edge in edges { let source_key = BlockKey { - sequence_hash: edge.source_hash.clone(), + node_id: edge.source_node_id, coordinate: edge.source_coordinate, }; let source_id = blocks_by_end.get(&source_key); let target_key = BlockKey { - sequence_hash: edge.target_hash.clone(), + node_id: edge.target_node_id, coordinate: edge.target_coordinate, }; let target_id = blocks_by_start.get(&target_key); @@ -475,7 +471,7 @@ impl Edge { mod tests { // Note this useful idiom: importing names from outer (for mod tests) scope. 
use super::*; - use crate::models::collection::Collection; + use crate::models::{collection::Collection, sequence::Sequence}; use crate::test_helpers::get_connection; #[test] @@ -486,11 +482,12 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str()); let edge1 = EdgeData { - source_hash: Sequence::PATH_START_HASH.to_string(), + source_node_id: PATH_START_NODE_ID, source_coordinate: -1, source_strand: Strand::Forward, - target_hash: sequence1.hash.clone(), + target_node_id: node1_id, target_coordinate: 1, target_strand: Strand::Forward, chromosome_index: 0, @@ -500,21 +497,22 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str()); let edge2 = EdgeData { - source_hash: sequence1.hash.clone(), + source_node_id: node1_id, source_coordinate: 2, source_strand: Strand::Forward, - target_hash: sequence2.hash.clone(), + target_node_id: node2_id, target_coordinate: 3, target_strand: Strand::Forward, chromosome_index: 0, phased: 0, }; let edge3 = EdgeData { - source_hash: sequence2.hash.clone(), + source_node_id: node2_id, source_coordinate: 4, source_strand: Strand::Forward, - target_hash: Sequence::PATH_END_HASH.to_string(), + target_node_id: PATH_END_NODE_ID, target_coordinate: -1, target_strand: Strand::Forward, chromosome_index: 0, @@ -526,22 +524,22 @@ mod tests { let edges = Edge::bulk_load(conn, &edge_ids); assert_eq!(edges.len(), 3); - let edges_by_source_hash = edges + let edges_by_source_node_id = edges .into_iter() - .map(|edge| (edge.source_hash.clone(), edge)) - .collect::>(); + .map(|edge| (edge.source_node_id, edge)) + .collect::>(); - let edge_result1 = edges_by_source_hash.get(Sequence::PATH_START_HASH).unwrap(); + let edge_result1 = edges_by_source_node_id.get(&PATH_START_NODE_ID).unwrap(); assert_eq!(edge_result1.source_coordinate, -1); - assert_eq!(edge_result1.target_hash, sequence1.hash); + 
assert_eq!(edge_result1.target_node_id, node1_id); assert_eq!(edge_result1.target_coordinate, 1); - let edge_result2 = edges_by_source_hash.get(&sequence1.hash).unwrap(); + let edge_result2 = edges_by_source_node_id.get(&node1_id).unwrap(); assert_eq!(edge_result2.source_coordinate, 2); - assert_eq!(edge_result2.target_hash, sequence2.hash); + assert_eq!(edge_result2.target_node_id, node2_id); assert_eq!(edge_result2.target_coordinate, 3); - let edge_result3 = edges_by_source_hash.get(&sequence2.hash).unwrap(); + let edge_result3 = edges_by_source_node_id.get(&node2_id).unwrap(); assert_eq!(edge_result3.source_coordinate, 4); - assert_eq!(edge_result3.target_hash, Sequence::PATH_END_HASH); + assert_eq!(edge_result3.target_node_id, PATH_END_NODE_ID); assert_eq!(edge_result3.target_coordinate, -1); } @@ -553,11 +551,12 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str()); let edge1 = EdgeData { - source_hash: Sequence::PATH_START_HASH.to_string(), + source_node_id: PATH_START_NODE_ID, source_coordinate: -1, source_strand: Strand::Forward, - target_hash: sequence1.hash.clone(), + target_node_id: node1_id, target_coordinate: 1, target_strand: Strand::Forward, chromosome_index: 0, @@ -567,21 +566,22 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str()); let edge2 = EdgeData { - source_hash: sequence1.hash.clone(), + source_node_id: node1_id, source_coordinate: 2, source_strand: Strand::Forward, - target_hash: sequence2.hash.clone(), + target_node_id: node2_id, target_coordinate: 3, target_strand: Strand::Forward, chromosome_index: 0, phased: 0, }; let edge3 = EdgeData { - source_hash: sequence2.hash.clone(), + source_node_id: node2_id, source_coordinate: 4, source_strand: Strand::Forward, - target_hash: Sequence::PATH_END_HASH.to_string(), + target_node_id: PATH_END_NODE_ID, target_coordinate: -1, target_strand: 
Strand::Forward, chromosome_index: 0, @@ -626,28 +626,29 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str()); // NOTE: Create one edge ahead of time to confirm an existing row ID gets returned in the bulk create let existing_edge = Edge::create( conn, - Sequence::PATH_START_HASH.to_string(), + PATH_START_NODE_ID, -1, Strand::Forward, - sequence1.hash.clone(), + node1_id, 1, Strand::Forward, 0, 0, ); - assert_eq!(existing_edge.source_hash, Sequence::PATH_START_HASH); + assert_eq!(existing_edge.source_node_id, PATH_START_NODE_ID); assert_eq!(existing_edge.source_coordinate, -1); - assert_eq!(existing_edge.target_hash, sequence1.hash); + assert_eq!(existing_edge.target_node_id, node1_id); assert_eq!(existing_edge.target_coordinate, 1); let edge1 = EdgeData { - source_hash: Sequence::PATH_START_HASH.to_string(), source_coordinate: -1, + source_node_id: PATH_START_NODE_ID, source_strand: Strand::Forward, - target_hash: sequence1.hash.clone(), + target_node_id: node1_id, target_coordinate: 1, target_strand: Strand::Forward, chromosome_index: 0, @@ -657,21 +658,22 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str()); let edge2 = EdgeData { - source_hash: sequence1.hash.clone(), + source_node_id: node1_id, source_coordinate: 2, source_strand: Strand::Forward, - target_hash: sequence2.hash.clone(), + target_node_id: node2_id, target_coordinate: 3, target_strand: Strand::Forward, chromosome_index: 0, phased: 0, }; let edge3 = EdgeData { - source_hash: sequence2.hash.clone(), + source_node_id: node2_id, source_coordinate: 4, source_strand: Strand::Forward, - target_hash: Sequence::PATH_END_HASH.to_string(), + target_node_id: PATH_END_NODE_ID, target_coordinate: -1, target_strand: Strand::Forward, chromosome_index: 0, @@ -683,25 +685,25 @@ mod tests { let edges = Edge::bulk_load(conn, &edge_ids); 
assert_eq!(edges.len(), 3); - let edges_by_source_hash = edges + let edges_by_source_node_id = edges .into_iter() - .map(|edge| (edge.source_hash.clone(), edge)) - .collect::>(); + .map(|edge| (edge.source_node_id, edge)) + .collect::>(); - let edge_result1 = edges_by_source_hash.get(Sequence::PATH_START_HASH).unwrap(); + let edge_result1 = edges_by_source_node_id.get(&PATH_START_NODE_ID).unwrap(); assert_eq!(edge_result1.id, existing_edge.id); assert_eq!(edge_result1.source_coordinate, -1); - assert_eq!(edge_result1.target_hash, sequence1.hash); + assert_eq!(edge_result1.target_node_id, node1_id); assert_eq!(edge_result1.target_coordinate, 1); - let edge_result2 = edges_by_source_hash.get(&sequence1.hash).unwrap(); + let edge_result2 = edges_by_source_node_id.get(&node1_id).unwrap(); assert_eq!(edge_result2.source_coordinate, 2); - assert_eq!(edge_result2.target_hash, sequence2.hash); + assert_eq!(edge_result2.target_node_id, node2_id); assert_eq!(edge_result2.target_coordinate, 3); - let edge_result3 = edges_by_source_hash.get(&sequence2.hash).unwrap(); + let edge_result3 = edges_by_source_node_id.get(&node2_id).unwrap(); assert_eq!(edge_result3.source_coordinate, 4); - assert_eq!(edge_result3.target_hash, Sequence::PATH_END_HASH); + assert_eq!(edge_result3.target_node_id, PATH_END_NODE_ID); assert_eq!(edge_result3.target_coordinate, -1); } } diff --git a/src/models/node.rs b/src/models/node.rs new file mode 100644 index 0000000..b1b9a8f --- /dev/null +++ b/src/models/node.rs @@ -0,0 +1,84 @@ +use rusqlite::{params_from_iter, types::Value as SQLValue, Connection}; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +use crate::models::sequence::Sequence; + +pub const PATH_START_NODE_ID: i32 = 1; +pub const PATH_END_NODE_ID: i32 = 2; + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct Node { + pub id: i32, + pub sequence_hash: String, +} + +impl Node { + pub fn create(conn: &Connection, sequence_hash: &str) -> i32 { + let 
insert_statement = format!( + "INSERT INTO nodes (sequence_hash) VALUES ('{}');", + sequence_hash + ); + let _ = conn.execute(&insert_statement, ()); + conn.last_insert_rowid() as i32 + } + + pub fn query(conn: &Connection, query: &str, placeholders: Vec) -> Vec { + let mut stmt = conn.prepare(query).unwrap(); + let rows = stmt + .query_map(params_from_iter(placeholders), |row| { + Ok(Node { + id: row.get(0)?, + sequence_hash: row.get(1)?, + }) + }) + .unwrap(); + let mut objs = vec![]; + for row in rows { + objs.push(row.unwrap()); + } + objs + } + + pub fn get_nodes(conn: &Connection, node_ids: Vec) -> Vec { + Node::query( + conn, + &format!( + "SELECT * FROM nodes WHERE id IN ({})", + node_ids.iter().map(|_| "?").collect::>().join(", ") + ), + node_ids + .iter() + .map(|id| SQLValue::Integer(*id as i64)) + .collect(), + ) + } + + pub fn get_sequences_by_node_ids( + conn: &Connection, + node_ids: Vec, + ) -> HashMap { + let nodes = Node::get_nodes(conn, node_ids.into_iter().collect::>()); + let sequence_hashes_by_node_id = nodes + .iter() + .map(|node| (node.id, node.sequence_hash.clone())) + .collect::>(); + let sequences_by_hash = Sequence::sequences_by_hash( + conn, + sequence_hashes_by_node_id + .values() + .map(|hash| hash.as_str()) + .collect::>(), + ); + sequence_hashes_by_node_id + .clone() + .into_iter() + .map(|(node_id, sequence_hash)| { + ( + node_id, + sequences_by_hash.get(&sequence_hash).unwrap().clone(), + ) + }) + .collect::>() + } +} diff --git a/src/models/path.rs b/src/models/path.rs index 15746b8..c067974 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -6,7 +6,13 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; use serde::{Deserialize, Serialize}; -use crate::models::{edge::Edge, path_edge::PathEdge, sequence::Sequence, strand::Strand}; +use crate::models::{ + edge::Edge, + node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}, + path_edge::PathEdge, + sequence::Sequence, + strand::Strand, +}; 
#[derive(Clone, Debug, Eq, Hash, PartialEq, Deserialize, Serialize)] pub struct Path { @@ -55,9 +61,9 @@ pub fn revcomp(seq: &str) -> String { } #[derive(Clone, Debug)] -pub struct NewBlock { +pub struct PathBlock { pub id: i32, - pub sequence: Sequence, + pub node_id: i32, pub block_sequence: String, pub sequence_start: i32, pub sequence_end: i32, @@ -171,17 +177,17 @@ impl Path { path: &Path, into: Edge, out_of: Edge, - sequences_by_hash: &HashMap, + sequences_by_node_id: &HashMap, current_path_length: i32, - ) -> NewBlock { - if into.target_hash != out_of.source_hash { + ) -> PathBlock { + if into.target_node_id != out_of.source_node_id { panic!( "Consecutive edges in path {0} don't share the same sequence", path.id ); } - let sequence = sequences_by_hash.get(&into.target_hash).unwrap(); + let sequence = sequences_by_node_id.get(&into.target_node_id).unwrap(); let start = into.target_coordinate; let end = out_of.source_coordinate; @@ -204,9 +210,9 @@ impl Path { sequence.get_sequence(start, end) }; - NewBlock { + PathBlock { id: block_id, - sequence: sequence.clone(), + node_id: into.target_node_id, block_sequence, sequence_start: start, sequence_end: end, @@ -216,19 +222,21 @@ impl Path { } } - pub fn blocks_for(conn: &Connection, path: &Path) -> Vec { + pub fn blocks_for(conn: &Connection, path: &Path) -> Vec { let edges = PathEdge::edges_for_path(conn, path.id); - let mut sequence_hashes = HashSet::new(); + let mut sequence_node_ids = HashSet::new(); for edge in &edges { - if edge.source_hash != Sequence::PATH_START_HASH { - sequence_hashes.insert(edge.source_hash.as_str()); + if edge.source_node_id != PATH_START_NODE_ID { + sequence_node_ids.insert(edge.source_node_id); } - if edge.target_hash != Sequence::PATH_END_HASH { - sequence_hashes.insert(edge.target_hash.as_str()); + if edge.target_node_id != PATH_END_NODE_ID { + sequence_node_ids.insert(edge.target_node_id); } } - let sequences_by_hash = - Sequence::sequences_by_hash(conn, 
sequence_hashes.into_iter().collect::>()); + let sequences_by_node_id = Node::get_sequences_by_node_ids( + conn, + sequence_node_ids.into_iter().collect::>(), + ); let mut blocks = vec![]; let mut path_length = 0; @@ -236,10 +244,9 @@ impl Path { // NOTE: Adding a "start block" for the dedicated start sequence with a range from i32::MIN // to 0 makes interval tree lookups work better. If the point being looked up is -1 (or // below), it will return this block. - let start_sequence = Sequence::sequence_from_hash(conn, Sequence::PATH_START_HASH).unwrap(); - blocks.push(NewBlock { + blocks.push(PathBlock { id: -1, - sequence: start_sequence, + node_id: PATH_START_NODE_ID, block_sequence: "".to_string(), sequence_start: 0, sequence_end: 0, @@ -254,7 +261,7 @@ impl Path { path, into, out_of, - &sequences_by_hash, + &sequences_by_node_id, path_length, ); path_length += block.block_sequence.len() as i32; @@ -264,10 +271,9 @@ impl Path { // NOTE: Adding an "end block" for the dedicated end sequence with a range from the path // length to i32::MAX makes interval tree lookups work better. If the point being looked up // is the path length (or higher), it will return this block. 
- let end_sequence = Sequence::sequence_from_hash(conn, Sequence::PATH_END_HASH).unwrap(); - blocks.push(NewBlock { + blocks.push(PathBlock { id: -2, - sequence: end_sequence, + node_id: PATH_END_NODE_ID, block_sequence: "".to_string(), sequence_start: 0, sequence_end: 0, @@ -279,9 +285,9 @@ impl Path { blocks } - pub fn intervaltree_for(conn: &Connection, path: &Path) -> IntervalTree { + pub fn intervaltree_for(conn: &Connection, path: &Path) -> IntervalTree { let blocks = Path::blocks_for(conn, path); - let tree: IntervalTree = blocks + let tree: IntervalTree = blocks .into_iter() .map(|block| (block.path_start..block.path_end, block)) .collect(); @@ -306,12 +312,13 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str()); let edge1 = Edge::create( conn, - Sequence::PATH_START_HASH.to_string(), + PATH_START_NODE_ID, -123, Strand::Forward, - sequence1.hash.clone(), + node1_id, 0, Strand::Forward, 0, @@ -321,12 +328,13 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str()); let edge2 = Edge::create( conn, - sequence1.hash.clone(), + node1_id, 8, Strand::Forward, - sequence2.hash.clone(), + node2_id, 1, Strand::Forward, 0, @@ -336,12 +344,13 @@ mod tests { .sequence_type("DNA") .sequence("CCCCCCCC") .save(conn); + let node3_id = Node::create(conn, sequence3.hash.as_str()); let edge3 = Edge::create( conn, - sequence2.hash.clone(), + node2_id, 8, Strand::Forward, - sequence3.hash.clone(), + node3_id, 1, Strand::Forward, 0, @@ -351,12 +360,13 @@ mod tests { .sequence_type("DNA") .sequence("GGGGGGGG") .save(conn); + let node4_id = Node::create(conn, sequence4.hash.as_str()); let edge4 = Edge::create( conn, - sequence3.hash.clone(), + node3_id, 8, Strand::Forward, - sequence4.hash.clone(), + node4_id, 1, Strand::Forward, 0, @@ -364,10 +374,10 @@ mod tests { ); let edge5 = Edge::create( conn, - sequence4.hash.clone(), + 
node4_id, 8, Strand::Forward, - Sequence::PATH_END_HASH.to_string(), + PATH_END_NODE_ID, -1, Strand::Forward, 0, @@ -392,12 +402,13 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str()); let edge5 = Edge::create( conn, - sequence1.hash.clone(), + node1_id, 8, Strand::Reverse, - Sequence::PATH_END_HASH.to_string(), + PATH_END_NODE_ID, 0, Strand::Reverse, 0, @@ -407,12 +418,13 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); + let node2_id = Node::create(conn, sequence2.hash.as_str()); let edge4 = Edge::create( conn, - sequence2.hash.clone(), + node2_id, 7, Strand::Reverse, - sequence1.hash.clone(), + node1_id, 0, Strand::Reverse, 0, @@ -422,12 +434,13 @@ mod tests { .sequence_type("DNA") .sequence("CCCCCCCC") .save(conn); + let node3_id = Node::create(conn, sequence3.hash.as_str()); let edge3 = Edge::create( conn, - sequence3.hash.clone(), + node3_id, 7, Strand::Reverse, - sequence2.hash.clone(), + node2_id, 0, Strand::Reverse, 0, @@ -437,12 +450,13 @@ mod tests { .sequence_type("DNA") .sequence("GGGGGGGG") .save(conn); + let node4_id = Node::create(conn, sequence4.hash.as_str()); let edge2 = Edge::create( conn, - sequence4.hash.clone(), + node4_id, 7, Strand::Reverse, - sequence3.hash.clone(), + node3_id, 0, Strand::Reverse, 0, @@ -450,10 +464,10 @@ mod tests { ); let edge1 = Edge::create( conn, - Sequence::PATH_START_HASH.to_string(), + PATH_START_NODE_ID, -1, Strand::Reverse, - sequence4.hash.clone(), + node4_id, 0, Strand::Reverse, 0, @@ -485,12 +499,13 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); + let node1_id = Node::create(conn, sequence1.hash.as_str()); let edge1 = Edge::create( conn, - Sequence::PATH_START_HASH.to_string(), + PATH_START_NODE_ID, -1, Strand::Forward, - sequence1.hash.clone(), + node1_id, 0, Strand::Forward, 0, @@ -500,12 +515,13 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); + let node2_id 
= Node::create(conn, sequence2.hash.as_str()); let edge2 = Edge::create( conn, - sequence1.hash.clone(), + node1_id, 8, Strand::Forward, - sequence2.hash.clone(), + node2_id, 1, Strand::Forward, 0, @@ -515,12 +531,13 @@ mod tests { .sequence_type("DNA") .sequence("CCCCCCCC") .save(conn); + let node3_id = Node::create(conn, sequence3.hash.as_str()); let edge3 = Edge::create( conn, - sequence2.hash.clone(), + node2_id, 8, Strand::Forward, - sequence3.hash.clone(), + node3_id, 1, Strand::Forward, 0, @@ -530,12 +547,13 @@ mod tests { .sequence_type("DNA") .sequence("GGGGGGGG") .save(conn); + let node4_id = Node::create(conn, sequence4.hash.as_str()); let edge4 = Edge::create( conn, - sequence3.hash.clone(), + node3_id, 8, Strand::Forward, - sequence4.hash.clone(), + node4_id, 1, Strand::Forward, 0, @@ -543,10 +561,10 @@ mod tests { ); let edge5 = Edge::create( conn, - sequence4.hash.clone(), + node4_id, 8, Strand::Forward, - Sequence::PATH_END_HASH.to_string(), + PATH_END_NODE_ID, -1, Strand::Forward, 0, @@ -560,30 +578,30 @@ mod tests { &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], ); let tree = Path::intervaltree_for(conn, &path); - let blocks1: Vec<_> = tree.query_point(2).map(|x| x.value.clone()).collect(); + let blocks1: Vec = tree.query_point(2).map(|x| x.value.clone()).collect(); assert_eq!(blocks1.len(), 1); let block1 = &blocks1[0]; - assert_eq!(block1.sequence.hash, sequence1.hash); + assert_eq!(block1.node_id, node1_id); assert_eq!(block1.sequence_start, 0); assert_eq!(block1.sequence_end, 8); assert_eq!(block1.path_start, 0); assert_eq!(block1.path_end, 8); assert_eq!(block1.strand, Strand::Forward); - let blocks2: Vec<_> = tree.query_point(12).map(|x| x.value.clone()).collect(); + let blocks2: Vec = tree.query_point(12).map(|x| x.value.clone()).collect(); assert_eq!(blocks2.len(), 1); let block2 = &blocks2[0]; - assert_eq!(block2.sequence.hash, sequence2.hash); + assert_eq!(block2.node_id, node2_id); assert_eq!(block2.sequence_start, 1); 
assert_eq!(block2.sequence_end, 8); assert_eq!(block2.path_start, 8); assert_eq!(block2.path_end, 15); assert_eq!(block2.strand, Strand::Forward); - let blocks4: Vec<_> = tree.query_point(25).map(|x| x.value.clone()).collect(); + let blocks4: Vec = tree.query_point(25).map(|x| x.value.clone()).collect(); assert_eq!(blocks4.len(), 1); let block4 = &blocks4[0]; - assert_eq!(block4.sequence.hash, sequence4.hash); + assert_eq!(block4.node_id, node4_id); assert_eq!(block4.sequence_start, 1); assert_eq!(block4.sequence_end, 8); assert_eq!(block4.path_start, 22); diff --git a/src/models/sequence.rs b/src/models/sequence.rs index 6b69770..8706a4d 100644 --- a/src/models/sequence.rs +++ b/src/models/sequence.rs @@ -189,10 +189,6 @@ impl<'a> NewSequence<'a> { } impl Sequence { - pub const PATH_START_HASH: &'static str = - "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"; - pub const PATH_END_HASH: &'static str = - "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"; #[allow(clippy::new_ret_no_self)] pub fn new() -> NewSequence<'static> { NewSequence::new() @@ -265,10 +261,6 @@ impl Sequence { self.sequence[start as usize..end as usize].to_string() } - fn is_delimiter_hash(hash: &str) -> bool { - hash == Self::PATH_START_HASH || hash == Self::PATH_END_HASH - } - pub fn sequences(conn: &Connection, query: &str, placeholders: Vec) -> Vec { let mut stmt = conn.prepare_cached(query).unwrap(); let rows = stmt @@ -279,15 +271,7 @@ impl Sequence { external_sequence = true; } let hash: String = row.get(0).unwrap(); - // NOTE: "Delimiter" sequences are present to point to the actual start or end of a - // path or node in a block group. They are stored with a non-empty sequence in the - // database in order to satisfy foreign key constraints, so we must make them empty - // here. 
- let sequence: String = if Sequence::is_delimiter_hash(&hash) { - "".to_string() - } else { - row.get(2).unwrap() - }; + let sequence = row.get(2).unwrap(); Ok(Sequence { hash, sequence_type: row.get(1).unwrap(), diff --git a/src/operation_management.rs b/src/operation_management.rs index b6d717c..a0fba1e 100644 --- a/src/operation_management.rs +++ b/src/operation_management.rs @@ -15,6 +15,7 @@ use crate::models::block_group::BlockGroup; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::edge::{Edge, EdgeData}; use crate::models::file_types::FileTypes; +use crate::models::node::Node; use crate::models::operations::{ Branch, FileAddition, Operation, OperationState, OperationSummary, }; @@ -40,6 +41,7 @@ pub enum FileMode { struct DependencyModels { sequences: Vec, block_group: Vec, + nodes: Vec, edges: Vec, paths: Vec, } @@ -74,6 +76,7 @@ pub fn get_changeset_dependencies(conn: &Connection, changes: &[u8]) -> Vec let mut created_block_groups = HashSet::new(); let mut created_paths = HashSet::new(); let mut created_edges = HashSet::new(); + let mut created_nodes = HashSet::new(); let mut created_sequences: HashSet = HashSet::new(); while let Some(item) = iter.next().unwrap() { let op = item.op().unwrap(); @@ -106,22 +109,28 @@ pub fn get_changeset_dependencies(conn: &Connection, changes: &[u8]) -> Vec previous_block_groups.insert(bg_id); } } - "edges" => { - let edge_pk = item.new_value(pk_column).unwrap().as_i64().unwrap() as i32; - let source_hash = + "nodes" => { + created_nodes + .insert(item.new_value(pk_column).unwrap().as_i64().unwrap() as i32); + let sequence_hash = str::from_utf8(item.new_value(1).unwrap().as_bytes().unwrap()) .unwrap() .to_string(); - let target_hash = - str::from_utf8(item.new_value(4).unwrap().as_bytes().unwrap()) - .unwrap() - .to_string(); + if !created_sequences.contains(&sequence_hash) { + previous_sequences.insert(sequence_hash); + } + } + "edges" => { + let edge_pk = 
item.new_value(pk_column).unwrap().as_i64().unwrap() as i32; + let source_node_id = item.new_value(1).unwrap().as_i64().unwrap() as i32; + let target_node_id = item.new_value(4).unwrap().as_i64().unwrap() as i32; created_edges.insert(edge_pk); - if !created_sequences.contains(&source_hash) { - previous_sequences.insert(source_hash); + let nodes = Node::get_nodes(conn, vec![source_node_id, target_node_id]); + if !created_nodes.contains(&source_node_id) { + previous_sequences.insert(nodes[0].sequence_hash.clone()); } - if !created_sequences.contains(&target_hash) { - previous_sequences.insert(target_hash); + if !created_nodes.contains(&target_node_id) { + previous_sequences.insert(nodes[1].sequence_hash.clone()); } } "path_edges" => { @@ -166,6 +175,7 @@ pub fn get_changeset_dependencies(conn: &Connection, changes: &[u8]) -> Vec ), vec![], ), + nodes: vec![], edges: Edge::query( conn, &format!( @@ -200,6 +210,7 @@ pub fn write_changeset(conn: &Connection, operation: &Operation, changes: &[u8]) let mut file = fs::File::create_new(&change_path) .unwrap_or_else(|_| panic!("Unable to open {change_path:?}")); + file.write_all(changes).unwrap() } @@ -221,6 +232,12 @@ pub fn apply_changeset(conn: &Connection, operation: &Operation) { dep_bg_map.insert(&bg.id, new_bg.id); } + let mut dep_node_map = HashMap::new(); + for node in dependencies.nodes.iter() { + let new_node_id = Node::create(conn, &node.sequence_hash.clone()); + dep_node_map.insert(&node.id, new_node_id); + } + let mut dep_edge_map = HashMap::new(); let new_edges = Edge::bulk_create( conn, @@ -255,6 +272,7 @@ pub fn apply_changeset(conn: &Connection, operation: &Operation) { let mut blockgroup_map: HashMap = HashMap::new(); let mut edge_map: HashMap = HashMap::new(); + let mut node_map: HashMap = HashMap::new(); let mut path_edges: HashMap> = HashMap::new(); let mut insert_paths = vec![]; let mut insert_block_group_edges = vec![]; @@ -325,16 +343,25 @@ pub fn apply_changeset(conn: &Connection, operation: 
&Operation) { .to_string(), }); } + "nodes" => { + let node_pk = item.new_value(pk_column).unwrap().as_i64().unwrap() as i32; + node_map.insert( + node_pk, + str::from_utf8(item.new_value(1).unwrap().as_bytes().unwrap()) + .unwrap() + .to_string(), + ); + } "edges" => { let edge_pk = item.new_value(pk_column).unwrap().as_i64().unwrap() as i32; edge_map.insert( edge_pk, EdgeData { - source_hash: item.new_value(1).unwrap().as_str().unwrap().to_string(), + source_node_id: item.new_value(1).unwrap().as_i64().unwrap() as i32, source_coordinate: item.new_value(2).unwrap().as_i64().unwrap() as i32, source_strand: Strand::column_result(item.new_value(3).unwrap()) .unwrap(), - target_hash: item.new_value(4).unwrap().as_str().unwrap().to_string(), + target_node_id: item.new_value(4).unwrap().as_i64().unwrap() as i32, target_coordinate: item.new_value(5).unwrap().as_i64().unwrap() as i32, target_strand: Strand::column_result(item.new_value(6).unwrap()) .unwrap(), @@ -366,12 +393,49 @@ pub fn apply_changeset(conn: &Connection, operation: &Operation) { } } - let sorted_edge_ids = edge_map.keys().copied().sorted().collect::>(); + let mut node_id_map: HashMap = HashMap::new(); + for (node_id, sequence_hash) in node_map { + let new_node_id = Node::create(conn, &sequence_hash); + node_id_map.insert(node_id, new_node_id); + } + + let mut updated_edge_map = HashMap::new(); + for (edge_id, edge) in edge_map { + let updated_source_node_id = dep_node_map.get(&edge.source_node_id).unwrap_or( + node_id_map + .get(&edge.source_node_id) + .unwrap_or(&edge.source_node_id), + ); + let updated_target_node_id = dep_node_map.get(&edge.target_node_id).unwrap_or( + node_id_map + .get(&edge.target_node_id) + .unwrap_or(&edge.target_node_id), + ); + updated_edge_map.insert( + edge_id, + EdgeData { + source_node_id: *updated_source_node_id, + source_coordinate: edge.source_coordinate, + source_strand: edge.source_strand, + target_node_id: *updated_target_node_id, + target_coordinate: 
edge.target_coordinate, + target_strand: edge.target_strand, + chromosome_index: edge.chromosome_index, + phased: edge.phased, + }, + ); + } + + let sorted_edge_ids = updated_edge_map + .keys() + .copied() + .sorted() + .collect::>(); let created_edges = Edge::bulk_create( conn, sorted_edge_ids .iter() - .map(|id| edge_map[id].clone()) + .map(|id| updated_edge_map[id].clone()) .collect::>(), ); let mut edge_id_map: HashMap = HashMap::new(); @@ -491,6 +555,7 @@ pub fn attach_session(session: &mut session::Session) { "sequence", "block_group", "path", + "nodes", "edges", "path_edges", "block_group_edges", @@ -535,7 +600,7 @@ mod tests { use crate::imports::fasta::import_fasta; use crate::models::file_types::FileTypes; use crate::models::operations::{setup_db, Branch, FileAddition, Operation, OperationState}; - use crate::models::{edge::Edge, metadata, sample::Sample}; + use crate::models::{edge::Edge, metadata, node::Node, sample::Sample}; use crate::test_helpers::{ get_connection, get_operation_connection, setup_block_group, setup_gen_dir, }; @@ -565,13 +630,20 @@ mod tests { setup_db(op_conn, &db_uuid); // create some stuff before we attach to our main session that will be required as extra information - let (bg_id, path_id) = setup_block_group(conn); + let (bg_id, _path_id) = setup_block_group(conn); let binding = BlockGroup::query( conn, "select * from block_group where id = ?1;", vec![Value::from(bg_id)], ); let dep_bg = binding.first().unwrap(); + + let existing_seq = Sequence::new() + .sequence_type("DNA") + .sequence("AAAATTTT") + .save(conn); + let existing_node_id = Node::create(conn, existing_seq.hash.as_str()); + let mut session = Session::new(conn).unwrap(); attach_session(&mut session); @@ -579,14 +651,14 @@ mod tests { .sequence_type("DNA") .sequence("ATCG") .save(conn); - let existing_seq = Sequence::sequence_from_hash(conn, Sequence::PATH_END_HASH).unwrap(); + let random_node_id = Node::create(conn, random_seq.hash.as_str()); let new_edge = 
Edge::create( conn, - random_seq.hash.clone(), + random_node_id, 0, Strand::Forward, - existing_seq.hash.clone(), + existing_node_id, 0, Strand::Forward, 0, @@ -632,10 +704,12 @@ mod tests { operation_conn, ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; assert_eq!(edge_count, 2); + assert_eq!(node_count, 3); assert_eq!(sample_count, 0); assert_eq!(op_count, 1); update_with_vcf( @@ -647,10 +721,25 @@ mod tests { operation_conn, ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; - assert_eq!(edge_count, 10); + // NOTE: The edge count is 14 because of the following: + // * 1 edge from the source node to the node created by the fasta import + // * 1 edge from the node created by the fasta import to the sink node + // * 8 edges to and from nodes representing the first alt sequence. Topologically there are + // just 2 edges, but there is redundancy because of phasing. There is further redundancy + // because there are 2 non-reference samples, causing 2 nodes to be created for each alt + // sequence. + // * 4 edges to and from nodes representing the second alt sequence. (One sample uses the + // reference part instead of the alt sequence in this case.) + assert_eq!(edge_count, 14); + // NOTE: The node count is 9: + // * 2 source and sink nodes + // * 1 node created by the initial fasta import + // * 6 nodes created by the VCF update. See above explanation of edge count for more details. 
+ assert_eq!(node_count, 9); assert_eq!(sample_count, 3); assert_eq!(op_count, 2); @@ -664,10 +753,12 @@ mod tests { ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; assert_eq!(edge_count, 2); + assert_eq!(node_count, 3); assert_eq!(sample_count, 0); assert_eq!(op_count, 2); @@ -679,10 +770,12 @@ mod tests { ), ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; - assert_eq!(edge_count, 10); + assert_eq!(edge_count, 14); + assert_eq!(node_count, 9); assert_eq!(sample_count, 3); assert_eq!(op_count, 2); } @@ -707,8 +800,8 @@ mod tests { operation_conn, ); - let branch_1 = Branch::create(operation_conn, &db_uuid, "branch-1"); - let branch_2 = Branch::create(operation_conn, &db_uuid, "branch-2"); + Branch::create(operation_conn, &db_uuid, "branch-1"); + Branch::create(operation_conn, &db_uuid, "branch-2"); checkout( conn, operation_conn, @@ -834,10 +927,12 @@ mod tests { operation_conn, ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; assert_eq!(edge_count, 2); + assert_eq!(node_count, 3); assert_eq!(sample_count, 0); assert_eq!(op_count, 1); @@ -860,10 +955,12 @@ mod tests { operation_conn, ); 
let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; - assert_eq!(edge_count, 10); + assert_eq!(edge_count, 14); + assert_eq!(node_count, 9); assert_eq!(sample_count, 3); assert_eq!(op_count, 2); @@ -883,10 +980,12 @@ mod tests { // ensure branch 1 operations have been undone let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; assert_eq!(edge_count, 2); + assert_eq!(node_count, 3); assert_eq!(sample_count, 0); assert_eq!(op_count, 2); @@ -900,10 +999,12 @@ mod tests { operation_conn, ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; assert_eq!(edge_count, 6); + assert_eq!(node_count, 5); assert_eq!(sample_count, 1); assert_eq!(op_count, 3); @@ -921,10 +1022,12 @@ mod tests { ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; - assert_eq!(edge_count, 10); + assert_eq!(edge_count, 14); + assert_eq!(node_count, 9); 
assert_eq!(sample_count, 3); assert_eq!(op_count, 3); } diff --git a/src/test_helpers.rs b/src/test_helpers.rs index 2abebbb..79b9684 100644 --- a/src/test_helpers.rs +++ b/src/test_helpers.rs @@ -9,6 +9,7 @@ use crate::models::block_group::BlockGroup; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::collection::Collection; use crate::models::edge::Edge; +use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}; use crate::models::path::Path; use crate::models::sequence::Sequence; use crate::models::strand::Strand; @@ -62,26 +63,30 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { .sequence_type("DNA") .sequence("AAAAAAAAAA") .save(conn); + let a_node_id = Node::create(conn, a_seq.hash.as_str()); let t_seq = Sequence::new() .sequence_type("DNA") .sequence("TTTTTTTTTT") .save(conn); + let t_node_id = Node::create(conn, t_seq.hash.as_str()); let c_seq = Sequence::new() .sequence_type("DNA") .sequence("CCCCCCCCCC") .save(conn); + let c_node_id = Node::create(conn, c_seq.hash.as_str()); let g_seq = Sequence::new() .sequence_type("DNA") .sequence("GGGGGGGGGG") .save(conn); + let g_node_id = Node::create(conn, g_seq.hash.as_str()); let _collection = Collection::create(conn, "test"); let block_group = BlockGroup::create(conn, "test", None, "hg19"); let edge0 = Edge::create( conn, - Sequence::PATH_START_HASH.to_string(), + PATH_START_NODE_ID, 0, Strand::Forward, - a_seq.hash.clone(), + a_node_id, 0, Strand::Forward, 0, @@ -89,10 +94,10 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { ); let edge1 = Edge::create( conn, - a_seq.hash, + a_node_id, 10, Strand::Forward, - t_seq.hash.clone(), + t_node_id, 0, Strand::Forward, 0, @@ -100,10 +105,10 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { ); let edge2 = Edge::create( conn, - t_seq.hash, + t_node_id, 10, Strand::Forward, - c_seq.hash.clone(), + c_node_id, 0, Strand::Forward, 0, @@ -111,10 +116,10 @@ pub fn setup_block_group(conn: &Connection) -> 
(i32, Path) { ); let edge3 = Edge::create( conn, - c_seq.hash, + c_node_id, 10, Strand::Forward, - g_seq.hash.clone(), + g_node_id, 0, Strand::Forward, 0, @@ -122,10 +127,10 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { ); let edge4 = Edge::create( conn, - g_seq.hash, + g_node_id, 10, Strand::Forward, - Sequence::PATH_END_HASH.to_string(), + PATH_END_NODE_ID, 0, Strand::Forward, 0, diff --git a/src/updates/vcf.rs b/src/updates/vcf.rs index 7bca403..ae53fee 100644 --- a/src/updates/vcf.rs +++ b/src/updates/vcf.rs @@ -6,8 +6,9 @@ use crate::models::{ block_group::{BlockGroup, BlockGroupData, PathCache, PathChange}, file_types::FileTypes, metadata, + node::Node, operations::{FileAddition, Operation, OperationSummary}, - path::{NewBlock, Path}, + path::{Path, PathBlock}, sample::Sample, sequence::Sequence, strand::Strand, @@ -118,15 +119,17 @@ fn prepare_change( ref_end: i32, chromosome_index: i32, phased: i32, - sequence: Sequence, + block_sequence: String, + sequence_length: i32, + node_id: i32, ) -> PathChange { // TODO: new sequence may not be real and be or some sort. Handle these. 
- let new_block = NewBlock { + let new_block = PathBlock { id: 0, - sequence: sequence.clone(), - block_sequence: sequence.get_sequence(None, None), + node_id, + block_sequence, sequence_start: 0, - sequence_end: sequence.length, + sequence_end: sequence_length, path_start: ref_start, path_end: ref_end, strand: Strand::Forward, @@ -277,6 +280,8 @@ pub fn update_with_vcf( for vcf_entry in vcf_entries { let sequence = SequenceCache::lookup(&mut sequence_cache, "DNA", vcf_entry.alt_seq.to_string()); + let sequence_string = sequence.get_sequence(None, None); + let node_id = Node::create(conn, sequence.hash.as_str()); let change = prepare_change( vcf_entry.block_group_id, &vcf_entry.path, @@ -284,7 +289,9 @@ pub fn update_with_vcf( ref_end as i32, vcf_entry.chromosome_index, vcf_entry.phased, - sequence, + sequence_string.clone(), + sequence_string.len() as i32, + node_id, ); changes .entry((vcf_entry.path, vcf_entry.sample_name))