Skip to content

Commit

Permalink
Add phasing and chromosome indices
Browse files Browse the repository at this point in the history
  • Loading branch information
Chris7 committed Jul 29, 2024
1 parent e4d2b6f commit 64aa1a3
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 23 deletions.
7 changes: 5 additions & 2 deletions migrations/01-initial/up.sql
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,11 @@ CREATE TABLE edges (
id INTEGER PRIMARY KEY NOT NULL,
source_id INTEGER NOT NULL,
target_id INTEGER,
chromosome_index INTEGER NOT NULL,
phased INTEGER NOT NULL,
FOREIGN KEY(source_id) REFERENCES block(id),
FOREIGN KEY(target_id) REFERENCES block(id)
FOREIGN KEY(target_id) REFERENCES block(id),
constraint chk_phased check (phased in (0, 1))
);

CREATE UNIQUE INDEX edge_uidx ON edges(source_id, target_id);
CREATE UNIQUE INDEX edge_uidx ON edges(source_id, target_id, chromosome_index, phased);
35 changes: 22 additions & 13 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ use std::path::PathBuf;
use bio::io::fasta;
use gen::get_connection;
use gen::migrations::run_migrations;
use gen::models::{self, Block, BlockGroup, Path};
use gen::models::{self, Block, BlockGroup, Sequence};
use noodles::vcf;
use noodles::vcf::variant::record::samples::series::value::genotype::Phasing;
use noodles::vcf::variant::record::samples::series::Value;
use noodles::vcf::variant::record::samples::{Sample, Series};
use noodles::vcf::variant::record::{AlternateBases, ReferenceBases, Samples};
Expand Down Expand Up @@ -79,7 +80,7 @@ fn import_fasta(fasta: &String, name: &String, shallow: bool, conn: &mut Connect
(sequence.len() as i32),
&"1".to_string(),
);
let edge = models::Edge::create(conn, block.id, None);
let edge = models::Edge::create(conn, block.id, None, 0, 0);
}
println!("Created it");
} else {
Expand Down Expand Up @@ -109,45 +110,53 @@ fn update_with_vcf(vcf_path: &String, collection_name: &String, conn: &mut Conne
let alt_alleles: Vec<_> = alt_bases.iter().collect::<io::Result<_>>().unwrap();
for (sample_index, sample) in record.samples().iter().enumerate() {
let genotype = sample.get(&header, "GT");
let mut seen_haplotypes: HashSet<i32> = HashSet::new();
let mut seen_alleles: HashSet<i32> = HashSet::new();
if genotype.is_some() {
if let Value::Genotype(genotypes) = genotype.unwrap().unwrap().unwrap() {
for gt in genotypes.iter() {
for (chromosome_index, gt) in genotypes.iter().enumerate() {
if gt.is_ok() {
let (haplotype, phasing) = gt.unwrap();
let haplotype = haplotype.unwrap();
if haplotype != 0 && !seen_haplotypes.contains(&(haplotype as i32)) {
let alt_seq = alt_alleles[haplotype - 1];
let (allele, phasing) = gt.unwrap();
let phased = match phasing {
Phasing::Phased => 1,
Phasing::Unphased => 0,
};
let allele = allele.unwrap();
if allele != 0 && !seen_alleles.contains(&(allele as i32)) {
let alt_seq = alt_alleles[allele - 1];
// TODO: new sequence may not be real and be <DEL> or some sort. Handle these.
let new_sequence_hash = models::Sequence::create(
let new_sequence_hash = Sequence::create(
conn,
"DNA".to_string(),
&alt_seq.to_string(),
true,
);
let sample_path_id = BlockGroup::get_or_create_sample_block_group(
let sample_bg_id = BlockGroup::get_or_create_sample_block_group(
conn,
collection_name,
&sample_names[sample_index],
&seq_name,
chromosome_index as i32,
phased,
);
let new_block_id = Block::create(
conn,
&new_sequence_hash,
sample_path_id,
sample_bg_id,
0,
alt_seq.len() as i32,
&"1".to_string(),
);
BlockGroup::insert_change(
conn,
sample_path_id,
sample_bg_id,
ref_start as i32,
ref_end as i32,
new_block_id.id,
chromosome_index as i32,
phased,
);
}
seen_haplotypes.insert(haplotype as i32);
seen_alleles.insert(allele as i32);
}
}
}
Expand Down
48 changes: 40 additions & 8 deletions src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,29 +177,43 @@ pub struct Edge {
pub id: i32,
pub source_id: i32,
pub target_id: Option<i32>,
pub chromosome_index: i32,
pub phased: i32,
}

impl Edge {
pub fn create(conn: &Connection, source_id: i32, target_id: Option<i32>) -> Edge {
pub fn create(
conn: &Connection,
source_id: i32,
target_id: Option<i32>,
chromosome_index: i32,
phased: i32,
) -> Edge {
let mut query;
let mut id_query;
let mut placeholders = vec![];
if target_id.is_some() {
query = "INSERT INTO edges (source_id, target_id) VALUES (?1, ?2) RETURNING *";
query = "INSERT INTO edges (source_id, target_id, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4) RETURNING *";
id_query = "select id from edges where source_id = ?1 and target_id = ?2";
placeholders.push(source_id);
placeholders.push(target_id.unwrap());
placeholders.push(chromosome_index);
placeholders.push(phased);
} else {
id_query = "select id from edges where source_id = ?1 and target_id is null";
query = "INSERT INTO edges (source_id) VALUES (?1) RETURNING *";
id_query = "select id from edges where source_id = ?1 and target_id is null and chromosome_index = ?2 and phased = ?3";
query = "INSERT INTO edges (source_id, chromosome_index, phased) VALUES (?1, ?2, ?3) RETURNING *";
placeholders.push(source_id);
placeholders.push(chromosome_index);
placeholders.push(phased);
}
let mut stmt = conn.prepare(query).unwrap();
match stmt.query_row(params_from_iter(&placeholders), |row| {
Ok(Edge {
id: row.get(0)?,
source_id: row.get(1)?,
target_id: row.get(2)?,
chromosome_index: row.get(3)?,
phased: row.get(4)?,
})
}) {
Ok(edge) => edge,
Expand All @@ -212,6 +226,8 @@ impl Edge {
.unwrap(),
source_id,
target_id,
chromosome_index,
phased,
}
} else {
panic!("something bad happened querying the database")
Expand Down Expand Up @@ -739,7 +755,13 @@ impl BlockGroup {
}
}

pub fn clone(conn: &mut Connection, source_id: i32, target_id: i32) {
pub fn clone(
conn: &mut Connection,
source_id: i32,
target_id: i32,
chromosome_index: i32,
phased: i32,
) {
let mut stmt = conn
.prepare_cached(
"SELECT id, sequence_hash, start, end, strand from block where block_group_id = ?1",
Expand Down Expand Up @@ -779,6 +801,8 @@ impl BlockGroup {
conn,
*block_map.get(&source_id).unwrap_or(&source_id),
target_id,
chromosome_index,
phased,
);
row = it.next().unwrap();
}
Expand All @@ -789,6 +813,8 @@ impl BlockGroup {
collection_name: &String,
sample_name: &String,
group_name: &String,
chromosome_index: i32,
phased: i32,
) -> i32 {
let mut bg_id : i32 = match conn.query_row(
"select id from block_group where collection_name = ?1 AND sample_name = ?2 AND name = ?3",
Expand Down Expand Up @@ -820,7 +846,7 @@ impl BlockGroup {
let new_bg_id = BlockGroup::create(conn, collection_name, Some(sample_name), group_name);

// clone parent blocks/edges
BlockGroup::clone(conn, bg_id, new_bg_id.id);
BlockGroup::clone(conn, bg_id, new_bg_id.id, chromosome_index, phased);

new_bg_id.id
}
Expand All @@ -833,6 +859,8 @@ impl BlockGroup {
start: i32,
end: i32,
new_block_id: i32,
chromosome_index: i32,
phased: i32,
) {
println!("change is {block_group_id} {start} {end} {new_block_id}");
// todo:
Expand All @@ -847,7 +875,7 @@ impl BlockGroup {
// change that hits just start/end boundry, e.g. block is 1,5 and change is 3,5 or 1,3.
// change that deletes block boundry
// https://stackoverflow.com/questions/3269434/whats-the-most-efficient-way-to-test-if-two-ranges-overlap
let mut stmt = conn.prepare_cached("select b.id, b.sequence_hash, b.block_group_id, b.start, b.end, b.strand, e.id as edge_id, e.source_id, e.target_id from block b left join edges e on (e.source_id = b.id or e.target_id = b.id) where b.block_group_id = ?1 AND b.start <= ?3 AND ?2 <= b.end AND b.id != ?4;").unwrap();
let mut stmt = conn.prepare_cached("select b.id, b.sequence_hash, b.block_group_id, b.start, b.end, b.strand, e.id as edge_id, e.source_id, e.target_id, e.chromosome_index, e.phased from block b left join edges e on (e.source_id = b.id or e.target_id = b.id) where b.block_group_id = ?1 AND b.start <= ?3 AND ?2 <= b.end AND b.id != ?4;").unwrap();
let mut block_edges: HashMap<i32, Vec<Edge>> = HashMap::new();
let mut blocks: HashMap<i32, Block> = HashMap::new();
let mut it = stmt
Expand Down Expand Up @@ -875,12 +903,16 @@ impl BlockGroup {
id: edge_id.unwrap(),
source_id: entry.get(7).unwrap(),
target_id: entry.get(8).unwrap(),
chromosome_index: entry.get(9).unwrap(),
phased: entry.get(10).unwrap(),
}]);
} else {
block_edges.get_mut(&block_id).unwrap().push(Edge {
id: entry.get(6).unwrap(),
source_id: entry.get(7).unwrap(),
target_id: entry.get(8).unwrap(),
chromosome_index: entry.get(9).unwrap(),
phased: entry.get(10).unwrap(),
});
}
} else {
Expand Down Expand Up @@ -1066,7 +1098,7 @@ impl BlockGroup {
}
}
for new_edge in new_edges {
Edge::create(conn, new_edge.0, Some(new_edge.1));
Edge::create(conn, new_edge.0, Some(new_edge.1), chromosome_index, phased);
}

let block_keys = blocks
Expand Down

0 comments on commit 64aa1a3

Please sign in to comment.