Skip to content

Commit

Permalink
Updated IMGT parsing and database
Browse files Browse the repository at this point in the history
  • Loading branch information
douweschulte committed Sep 13, 2024
1 parent 9b21318 commit 8e3ac3c
Show file tree
Hide file tree
Showing 75 changed files with 3,332 additions and 314 deletions.
2,127 changes: 2,127 additions & 0 deletions rustyms-imgt-generate/data/imgt_small.dat

Large diffs are not rendered by default.

13 changes: 7 additions & 6 deletions rustyms-imgt-generate/src/combine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ pub fn combine(

for element in data.flatten() {
let species = element.species;
// println!("{element}");
// if species != Species::HomoSapiens {
// continue;
// }
Expand Down Expand Up @@ -62,12 +63,12 @@ pub fn combine(

// Save temp seqs in final data structure
for (species, entry) in deduped_temp {
if species == Species::HomoSapiens
&& entry.name.kind == GeneType::C(Some(Constant::M))
&& entry.name.chain == ChainType::Heavy
{
println!("{}", entry);
}
// if species == Species::HomoSapiens
// && entry.name.kind == GeneType::C(Some(Constant::M))
// && entry.name.chain == ChainType::Heavy
// {
// println!("{}", entry);
// }
grouped
.entry(species)
.or_insert(Germlines::new(species))
Expand Down
46 changes: 41 additions & 5 deletions rustyms-imgt-generate/src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ pub fn parse_dat<T: std::io::Read>(
.filter(|pre| {
pre.kw.contains(&"immunoglobulin (IG)".to_string())
&& (pre.kw.contains(&"functional".to_string())
|| pre.kw.contains(&"germline".to_string()))
|| pre.kw.contains(&"germline".to_string())
|| pre.kw.contains(&"productive".to_string()))
&& pre.os.is_some()
})
.map(DataItem::new)
Expand Down Expand Up @@ -67,6 +68,7 @@ fn parse_dat_line(data: &mut PreDataItem, line: &str) -> bool {

impl DataItem {
fn new(data: PreDataItem) -> Result<Self, String> {
// println!("{}", data.id);
let mut result = Self {
id: data.id[5..].split(';').next().unwrap().to_string(),
species: data.os.ok_or("No species found")?,
Expand All @@ -91,7 +93,7 @@ impl DataItem {
reported_seq: String::new(),
found_seq: Err("Not loaded".to_string()),
allele: String::new(),
functional: false,
functional: true,
partial: false,
shift: 0,
splice_aa: None,
Expand All @@ -118,7 +120,7 @@ impl DataItem {
let trimmed = line.trim();
let split = trimmed
.split_once('=')
.map(|(key, tail)| (key.to_lowercase(), tail));
.map(|(key, tail)| (key.to_ascii_lowercase(), tail));

match split.as_ref().map(|(key, tail)| (key.as_str(), tail)) {
Some(("/translation", tail)) => {
Expand All @@ -139,9 +141,15 @@ impl DataItem {
current.splice_aa = AminoAcid::try_from(tail.as_bytes()[i - 1]).ok();
}
}
Some(("/functional" | "/note=\"functional\"" | "/imgt_note=\"functional\"", _)) => {
Some(("/functional", _)) => {
current.functional = true;
}
Some(("/note" | "/imgt_note", s)) if s.to_ascii_lowercase().contains("functional") => {
current.functional = true;
}
Some(("/pseudo", _)) => {
current.functional = false;
}
Some(("/partial", _)) => current.partial = true,
None if *is_sequence => {
current.reported_seq += trimmed.trim_end_matches('\"');
Expand All @@ -153,6 +161,7 @@ impl DataItem {
}

fn add_region(&mut self, mut region: Region) {
// println!("AR: {region}");
// Get the actual sequence
region.found_seq = self.get_sequence(&region.location, region.shift);

Expand All @@ -169,6 +178,22 @@ impl DataItem {
allele: region.allele,
regions: HashMap::new(),
});
} else if ["V-REGION", "C-REGION", "J-REGION"].contains(&region.key.as_str()) // , "D-GENE"
&& region.functional
&& !region.partial
&& region.allele.starts_with("IG")
{
if let Some(existing) = self.genes.iter_mut().find(|g| g.allele == region.allele) {
existing.regions.insert(region.key.clone(), region);
} else {
self.genes.push(IMGTGene {
acc: region.acc,
key: region.key,
location: region.location,
allele: region.allele,
regions: HashMap::new(),
});
}
} else if [
"FR1-IMGT",
"FR2-IMGT",
Expand Down Expand Up @@ -215,7 +240,18 @@ impl DataItem {
]
.contains(&region.key.as_str())
{
if let Some(gene) = self
if region.key == "CDR3-IMGT" {
if let Some(gene) = self
.genes
.iter_mut()
.find(|g| g.location.overlaps(&region.location))
// CDR3 does not have to be fully inside a V-REGION
{
gene.regions.insert(region.key.clone(), region);
} else {
self.regions.push(region)
}
} else if let Some(gene) = self
.genes
.iter_mut()
.find(|g| g.location.contains(&region.location))
Expand Down
18 changes: 17 additions & 1 deletion rustyms-imgt-generate/src/structs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,12 @@ impl Display for Region {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}\t{}\t{}",
"{}\t{}\t{}\t{}\t{}\t{}",
self.key,
self.location,
self.allele,
self.functional,
self.partial,
// self.sequence,
// dna,
// self.found_seq.0,
Expand All @@ -71,6 +74,19 @@ pub enum Location {
}

impl Location {
/// Test whether `other` touches this location: the two ranges either share
/// at least one position or sit directly next to each other.
/// Used to decide if a CDR3 belongs to a certain V-REGION, as a CDR3 does not
/// have to lie fully inside that region.
///
/// Both locations must be on the same strand (both normal or both
/// complement) and `self` must be a range; any other combination gives
/// `false`. When `other` is a single position it only matches if that
/// position lies inside this range, adjacency is not considered in that case.
pub fn overlaps(&self, other: &Location) -> bool {
    match (self, other) {
        // Range against range on the same strand, the `+ 1` on both sides
        // lets directly neighbouring ranges count as overlapping.
        (Self::Normal(own), Self::Normal(theirs))
        | (Self::Complement(own), Self::Complement(theirs)) => {
            let begins_in_time = *own.start() <= theirs.end() + 1;
            begins_in_time && own.end() + 1 >= *theirs.start()
        }
        // Range against a single position on the same strand.
        (Self::Normal(own), Self::SingleNormal(position)) => own.contains(position),
        (Self::Complement(own), Self::SingleComplement(position)) => own.contains(position),
        // Different strands, or `self` is a single position.
        _ => false,
    }
}

pub fn contains(&self, other: &Location) -> bool {
match (self, other) {
(Self::Complement(s), Self::Complement(o)) | (Self::Normal(s), Self::Normal(o)) => {
Expand Down
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Alpaca.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Arabian camel.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Atlantic cod.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Atlantic salmon.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Black rat.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Black rockcod.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Blackfin icefish.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Bornean orangutan.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Channel catfish.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Chimpanzee.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Clearnose skate.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Common carp.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Common gibbon.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Cook's mouse.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Crab-eating macaque.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Domestic bovine.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Domestic cat.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Domestic chicken.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Domestic dog.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Domestic ferret.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Domestic goat.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Domestic horse.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Domestic pig.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Domestic sheep.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Eastern European house mouse.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Emerald rockcod.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/European rabbit.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/European seabass.bin
Binary file not shown.
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Horn shark.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/House mouse.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Human.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Japanese amberjack.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Japanese wild mouse.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Liontail macaque.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Little skate.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Mandarin fish.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Marbled lungfish.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Mice.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Norway rat.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Nurse shark.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Olive baboon anubis.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Pere David's macaque.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Pig-tailed macaque.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Platypus.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Rabbit.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Rainbow trout.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Rhesus monkey.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Ring-tailed lemur.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/River trout.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Sandbar shark.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Shrew mouse.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Sooty mangabey.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Southeastern Asian house mouse.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Southern African pygmy mouse.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Spiny mouse.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Spotted ratfish.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Spotted wolffish.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Stump-tailed macaque.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Sumatran orangutan.bin
Binary file not shown.
Binary file added rustyms/src/imgt/germlines/Taiwan macaque.bin
Binary file not shown.
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Western European house mouse.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Western gorilla.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Western lowland gorilla.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Western wild mouse.bin
Binary file not shown.
Binary file modified rustyms/src/imgt/germlines/Zebrafish.bin
Binary file not shown.
Loading

0 comments on commit 8e3ac3c

Please sign in to comment.