Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement block group #7

Merged
merged 3 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions migrations/01-initial/up.sql
Original file line number Diff line number Diff line change
Expand Up @@ -14,35 +14,43 @@ CREATE TABLE sequence (
);

CREATE TABLE path (
id INTEGER PRIMARY KEY NOT NULL,
name TEXT NOT NULL,
path_index INTEGER NOT NULL DEFAULT 0
);

CREATE TABLE block_group (
id INTEGER PRIMARY KEY NOT NULL,
collection_name TEXT NOT NULL,
sample_name TEXT,
name TEXT NOT NULL,
path_index INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY(collection_name) REFERENCES collection(name),
FOREIGN KEY(sample_name) REFERENCES sample(name)
);
CREATE UNIQUE INDEX path_uidx ON path(collection_name, sample_name, name, path_index);
CREATE UNIQUE INDEX block_group_uidx ON block_group(collection_name, sample_name, name);

CREATE TABLE block (
id INTEGER PRIMARY KEY NOT NULL,
sequence_hash TEXT NOT NULL,
path_id INTEGER NOT NULL,
block_group_id INTEGER NOT NULL,
"start" INTEGER NOT NULL,
"end" INTEGER NOT NULL,
strand TEXT NOT NULL DEFAULT "1",
FOREIGN KEY(sequence_hash) REFERENCES sequence(hash),
FOREIGN KEY(path_id) REFERENCES path(id),
FOREIGN KEY(block_group_id) REFERENCES block_group(id),
constraint chk_strand check (strand in ('-1', '1', '0', '.', '?'))
);
CREATE UNIQUE INDEX block_uidx ON block(sequence_hash, path_id, start, end, strand);
CREATE UNIQUE INDEX block_uidx ON block(sequence_hash, block_group_id, start, end, strand);

CREATE TABLE edges (
id INTEGER PRIMARY KEY NOT NULL,
source_id INTEGER NOT NULL,
target_id INTEGER,
chromosome_index INTEGER NOT NULL,
phased INTEGER NOT NULL,
FOREIGN KEY(source_id) REFERENCES block(id),
FOREIGN KEY(target_id) REFERENCES block(id)
FOREIGN KEY(target_id) REFERENCES block(id),
constraint chk_phased check (phased in (0, 1))
);

CREATE UNIQUE INDEX edge_uidx ON edges(source_id, target_id);
CREATE UNIQUE INDEX edge_uidx ON edges(source_id, target_id, chromosome_index, phased);
47 changes: 27 additions & 20 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
#![allow(warnings)]
use clap::{Parser, Subcommand};
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::fmt::Debug;
use std::path::PathBuf;

use bio::io::fasta;
use gen::get_connection;
use gen::migrations::run_migrations;
use gen::models::{self, Block, Path};
use gen::models::{self, Block, BlockGroup, Sequence};
use noodles::vcf;
use noodles::vcf::variant::record::samples::series::value::genotype::Phasing;
use noodles::vcf::variant::record::samples::series::Value;
use noodles::vcf::variant::record::samples::{Sample, Series};
use noodles::vcf::variant::record::{AlternateBases, ReferenceBases, Samples};
Expand Down Expand Up @@ -69,17 +70,17 @@ fn import_fasta(fasta: &String, name: &String, shallow: bool, conn: &mut Connect
let record = result.expect("Error during fasta record parsing");
let sequence = String::from_utf8(record.seq().to_vec()).unwrap();
let seq_hash = models::Sequence::create(conn, "DNA".to_string(), &sequence, !shallow);
let path =
models::Path::create(conn, &collection.name, None, &record.id().to_string(), None);
let block_group =
BlockGroup::create(conn, &collection.name, None, &record.id().to_string());
let block = Block::create(
conn,
&seq_hash,
path.id,
block_group.id,
0,
(sequence.len() as i32),
&"1".to_string(),
);
let edge = models::Edge::create(conn, block.id, None);
let edge = models::Edge::create(conn, block.id, None, 0, 0);
}
println!("Created it");
} else {
Expand Down Expand Up @@ -107,48 +108,54 @@ fn update_with_vcf(vcf_path: &String, collection_name: &String, conn: &mut Conne
let ref_end = record.variant_end(&header).unwrap().get();
let alt_bases = record.alternate_bases();
let alt_alleles: Vec<_> = alt_bases.iter().collect::<io::Result<_>>().unwrap();
let mut created: HashSet<i32> = HashSet::new();
for (sample_index, sample) in record.samples().iter().enumerate() {
let genotype = sample.get(&header, "GT");
let mut seen_haplotypes: HashSet<i32> = HashSet::new();
let mut allele_blocks: HashMap<i32, i32> = HashMap::new();
if genotype.is_some() {
if let Value::Genotype(genotypes) = genotype.unwrap().unwrap().unwrap() {
for gt in genotypes.iter() {
for (chromosome_index, gt) in genotypes.iter().enumerate() {
if gt.is_ok() {
let (haplotype, phasing) = gt.unwrap();
let haplotype = haplotype.unwrap();
if haplotype != 0 && !seen_haplotypes.contains(&(haplotype as i32)) {
let alt_seq = alt_alleles[haplotype - 1];
let (allele, phasing) = gt.unwrap();
let phased = match phasing {
Phasing::Phased => 1,
Phasing::Unphased => 0,
};
let allele = allele.unwrap();
if allele != 0 {
let alt_seq = alt_alleles[allele - 1];
// TODO: new sequence may not be real and be <DEL> or some sort. Handle these.
let new_sequence_hash = models::Sequence::create(
let new_sequence_hash = Sequence::create(
conn,
"DNA".to_string(),
&alt_seq.to_string(),
true,
);
let sample_path_id = models::Path::get_or_create_sample_path(
let sample_bg_id = BlockGroup::get_or_create_sample_block_group(
conn,
collection_name,
&sample_names[sample_index],
&seq_name,
haplotype as i32,
);
let new_block_id = Block::create(
conn,
&new_sequence_hash,
sample_path_id,
sample_bg_id,
0,
alt_seq.len() as i32,
&"1".to_string(),
);
Path::insert_change(
println!("{sample_bg_id} {new_block_id:?} {chromosome_index} {phased} {allele}");
BlockGroup::insert_change(
conn,
sample_path_id,
sample_bg_id,
ref_start as i32,
ref_end as i32,
new_block_id.id,
chromosome_index as i32,
phased,
);
}
seen_haplotypes.insert(haplotype as i32);
}
}
}
Expand Down Expand Up @@ -220,7 +227,7 @@ mod tests {
);
update_with_vcf(&vcf_path.to_str().unwrap().to_string(), &collection, conn);
assert_eq!(
Path::sequence(conn, &collection, Some(&"foo".to_string()), "m123", 1),
BlockGroup::sequence(conn, &collection, Some(&"foo".to_string()), "m123"),
"ATCATCGATCGATCGATCGGGAACACACAGAGA"
);
}
Expand Down
Loading
Loading