diff --git a/corona.py b/corona.py index 9110aab..849290e 100644 --- a/corona.py +++ b/corona.py @@ -1,4 +1,4 @@ -from lib import cc, translate +from lib import genome, translate # entire diff: https://www.ncbi.nlm.nih.gov/projects/msaviewer/?rid=7FYNU14F01R&coloring= # protein alignments: http://virological.org/t/alignment-of-58-sarbecovirus-genomes-for-conservation-analysis-of-sars-cov-2/430 @@ -44,15 +44,17 @@ # https://en.wikipedia.org/wiki/MRNA_(nucleoside-2%27-O-)-methyltransferase # in front "the untranslated leader sequence that ends with the Transcription Regulation Sequence" -corona['untranslated_region'] = cc[0:265] +corona['untranslated_region'] = genome.get_nucleotideSequence()[0:265] -corona['orf1a'] = translate(cc[266-1:13483], True) +corona['orf1a'] = translate(genome.get_nucleotideSequence()[266-1:13483], True) # cc[266-1+4398*3:13468] = 'TTT_TTA_AAC' aka 'X_XXY_YYZ' # https://en.wikipedia.org/wiki/Ribosomal_frameshift # Programmed −1 Ribosomal Frameshifting # TODO: add this to the translate function with automatic detection -corona['orf1b'] = translate(cc[13468-1:21555], False).strip("*") # chop off the stop, note this doesn't have a start +# chop off the stop, note this doesn't have a start +corona['orf1b'] = translate(genome.get_nucleotideSequence()[ + 13468-1:21555], False).strip("*") # exploit vector, this attaches to ACE2. also called "surface glycoprotein" # https://www.ncbi.nlm.nih.gov/Structure/pdb/6VYB -- open state @@ -62,28 +64,37 @@ # S2 = 686-1273 # S2' = 816-1273 # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2750777/ -corona['spike_glycoprotein'] = translate(cc[21563-1:25384], True) +corona['spike_glycoprotein'] = translate( + genome.get_nucleotideSequence()[21563-1:25384], True) # Forms homotetrameric potassium sensitive ion channels (viroporin) and may modulate virus release. -corona['orf3a'] = translate(cc[25393-1:26220], True) +corona['orf3a'] = translate( + genome.get_nucleotideSequence()[25393-1:26220], True) # these two things stick out, used in assembly aka they package the virus -corona['envelope_protein'] = translate(cc[26245-1:26472], True) # also known as small membrane -corona['membrane_glycoprotein'] = translate(cc[26523-1:27191], True) +corona['envelope_protein'] = translate( + genome.get_nucleotideSequence()[26245-1:26472], True) # also known as small membrane +corona['membrane_glycoprotein'] = translate( + genome.get_nucleotideSequence()[26523-1:27191], True) -corona['orf6'] = translate(cc[27202-1:27387], True) +corona['orf6'] = translate( + genome.get_nucleotideSequence()[27202-1:27387], True) -corona['orf7a'] = translate(cc[27394-1:27759], True) -corona['orf7b'] = translate(cc[27756-1:27887], True) # is this one real? +corona['orf7a'] = translate( + genome.get_nucleotideSequence()[27394-1:27759], True) +corona['orf7b'] = translate(genome.get_nucleotideSequence()[ + 27756-1:27887], True) # is this one real? -corona['orf8'] = translate(cc[27894-1:28259], True) +corona['orf8'] = translate( + genome.get_nucleotideSequence()[27894-1:28259], True) # https://en.wikipedia.org/wiki/Capsid # Packages the positive strand viral genome RNA into a helical ribonucleocapsid # Includes the "internal" protein (from Coronavirus Pathogenesis) # https://www.sciencedirect.com/topics/veterinary-science-and-veterinary-medicine/human-coronavirus-oc43 -corona['nucleocapsid_phosphoprotein'] = translate(cc[28274-1:29533], True) +corona['nucleocapsid_phosphoprotein'] = translate( + genome.get_nucleotideSequence()[28274-1:29533], True) # might be called the internal protein (Coronavirus Pathogenesis) -corona['orf10'] = translate(cc[29558-1:29674], True) - +corona['orf10'] = translate( + genome.get_nucleotideSequence()[29558-1:29674], True) diff --git a/fold.py b/fold.py index 50b2e22..c3ea741 100755 --- a/fold.py +++ b/fold.py @@ -10,7 +10,8 @@ parser = argparse.ArgumentParser(description='Fold some proteins.') parser.add_argument('--scratch', action='store_true') parser.add_argument('--temp', type=int, default=300) -parser.add_argument('--steps', type=int, default=100000, help="2500000000 should fold the protein") +parser.add_argument('--steps', type=int, default=100000, + help="2500000000 should fold the protein") parser.add_argument('--writes', type=int, default=1000, help="default is 1000") parser.add_argument('--out', type=str, default="/tmp/output.pdb") parser.add_argument('--pdb', type=str, default="proteins/villin/1vii.pdb") @@ -18,24 +19,24 @@ args = parser.parse_args(sys.argv[1:]) try: - platform = Platform.getPlatformByName("CUDA") + platform = Platform.getPlatformByName("CUDA") except Exception: - platform = Platform.getPlatformByName("OpenCL") + platform = Platform.getPlatformByName("OpenCL") if args.scratch: - # unfolded protein - if args.fasta is not None: - fasta = args.fasta - else: - protein_fasta = "proteins/villin/1vii.fasta" - fasta = open(protein_fasta).read().split("\n")[1] - print("folding %s" % fasta) - from lib import write_unfolded - write_unfolded(fasta, "/tmp/unfolded.pdb") - pdb = PDBFile("/tmp/unfolded.pdb") + # unfolded protein + if args.fasta is not None: + fasta = args.fasta + else: + protein_fasta = "proteins/villin/1vii.fasta" + fasta = open(protein_fasta).read().split("\n")[1] + print("folding %s" % fasta) + from lib import write_unfolded + write_unfolded(fasta, "/tmp/unfolded.pdb") + pdb = PDBFile("/tmp/unfolded.pdb") else: - # already folded protein - pdb = PDBFile(args.pdb) + # already folded protein + pdb = PDBFile(args.pdb) #forcefield = ForceField('amber99sb.xml', 'tip3p.xml') forcefield = ForceField('amber03.xml', 'amber03_obc.xml') @@ -45,9 +46,10 @@ print(modeller.topology) system = forcefield.createSystem(modeller.topology, - implicitSolvent=OBC2, # matches https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2980750/#bib39 - nonbondedMethod=NoCutoff, nonbondedCutoff=1*nanometer, - constraints=HBonds) + # matches https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2980750/#bib39 + implicitSolvent=OBC2, + nonbondedMethod=NoCutoff, nonbondedCutoff=1*nanometer, + constraints=HBonds) integrator = LangevinIntegrator(args.temp*kelvin, 1/picosecond, 2*femtoseconds) simulation = Simulation(modeller.topology, system, integrator, platform) simulation.context.setPositions(modeller.positions) @@ -57,6 +59,6 @@ steps_write = max(1, steps//args.writes) print("writing every %d steps" % steps_write) simulation.reporters.append(PDBReporter(args.out, steps_write)) -simulation.reporters.append(StateDataReporter(stdout, steps_write, step=True, potentialEnergy=True, temperature=True)) +simulation.reporters.append(StateDataReporter( + stdout, steps_write, step=True, potentialEnergy=True, temperature=True)) simulation.step(steps) - diff --git a/generic-genome.py b/generic-genome.py new file mode 100644 index 0000000..1f8a530 --- /dev/null +++ b/generic-genome.py @@ -0,0 +1,20 @@ +class Genome: + def __init__(self, chromosomes, genes, dnscode): + self.chromosomes = chromosomes + + def get_nucleotideSequence(self): + return self.chromosomes + + +class GenomeBuilder(): + def add_chromosomes(self, chromosomes): + self.chromosomes = chromosomes + + def add_genes(self, genes): + self.genes = genes + + def add_dnscode(self, dnscode): + self.dnscode = dnscode + + def build(self): + return Genome(self.chromosomes, self.genes, self.dnscode) diff --git a/genome.py b/genome.py new file mode 100644 index 0000000..896e087 --- /dev/null +++ b/genome.py @@ -0,0 +1,21 @@ +class Genome: + def __init__(self, nucleotideSequence): + self.nucleotideSequence = nucleotideSequence + + def get_nucleotideSequence(self): + return self.nucleotideSequence + + +class GenomeBuilder(): + def __init__(self, nucleotideSequence): + self.genome = Genome(nucleotideSequence) + + def build(self): + if(self.validate()): + return self.genome + else: + return None + + def validate(self): + my_set = set('ACGTU') + return set(self.genome.get_nucleotideSequence).issubset(my_set) diff --git a/lib.py b/lib.py index f440f4c..eec4267 100644 --- a/lib.py +++ b/lib.py @@ -1,5 +1,9 @@ +import pathlib +import json +import os import random - +from genome import Genome +from genome import GenomeBuilder # Asn or Asp / B AAU, AAC; GAU, GAC # Gln or Glu / Z CAA, CAG; GAA, GAG # START AUG @@ -27,87 +31,95 @@ """.strip() dec = {} for t in tt.split("\n"): - k = t[:len("Val / V")].strip() - v = t[len("Val / V "):] - if '/' in k: - k = k.split("/")[-1].strip() - k = k.replace("STOP", "*") - v = v.replace(",", "").replace(";", "").lower().replace("u", "t").split(" ") - for vv in v: - if vv in dec: - print("dup", vv) - dec[vv.strip()] = k + k = t[:len("Val / V")].strip() + v = t[len("Val / V "):] + if '/' in k: + k = k.split("/")[-1].strip() + k = k.replace("STOP", "*") + v = v.replace(",", "").replace( + ";", "").lower().replace("u", "t").split(" ") + for vv in v: + if vv in dec: + print("dup", vv) + dec[vv.strip()] = k + def translate(x, protein=False): - x = x.lower() - aa = [] - for i in range(0, len(x)-2, 3): - aa.append(dec[x[i:i+3]]) - aa = ''.join(aa) - if protein: - if aa[0] != "M" or aa[-1] != "*": - print("BAD PROTEIN") - print(aa) - return None - aa = aa[:-1] - return aa + x = x.lower() + aa = [] + for i in range(0, len(x)-2, 3): + aa.append(dec[x[i:i+3]]) + aa = ''.join(aa) + if protein: + if aa[0] != "M" or aa[-1] != "*": + print("BAD PROTEIN") + print(aa) + return None + aa = aa[:-1] + return aa + ltl = 'Asp D Glu E Arg R Lys K His H Asn N Gln Q Ser S Thr T Tyr Y Ala A Gly G Val V Leu L Ile I Pro P Phe F Met M Trp W Cys C' ltl = ltl.split(" ") ltl = dict(zip(ltl[1::2], ltl[0::2])) + def get_atoms(): - from data import get_amber99sb - amber99sb = get_amber99sb() - residues = amber99sb.getElementsByTagName("Residue") - atoms = {} - for r in residues: - name = r.attributes['name'].value - atoms[name] = [x.attributes['name'].value for x in r.getElementsByTagName("Atom")] - return atoms + from data import get_amber99sb + amber99sb = get_amber99sb() + residues = amber99sb.getElementsByTagName("Residue") + atoms = {} + for r in residues: + name = r.attributes['name'].value + atoms[name] = [ + x.attributes['name'].value for x in r.getElementsByTagName("Atom")] + return atoms + def write_unfolded(fasta, fn): - atoms = get_atoms() - atom_num = 1 - res_num = 1 - ss = [] - random.seed(1337) - for i, aa in enumerate(fasta): - tl = ltl[aa].upper() - for a in atoms[tl] + ([] if i != len(fasta)-1 else ["OXT"]): - if len(a) < 4: - pa = " " + a - else: - pa = a - gr = lambda: 1.0*(random.random()-0.5) - x,y,z = gr(), gr(), gr() - x += res_num*5 - s = "ATOM %6d %-4s %3s A %3d %8.3f%8.3f%8.3f 1.00 1.00 %s" % \ - (atom_num, pa, tl, res_num, x, y, z, a[0:1]) - ss.append(s) - atom_num += 1 - res_num += 1 + atoms = get_atoms() + atom_num = 1 + res_num = 1 + ss = [] + random.seed(1337) + for i, aa in enumerate(fasta): + tl = ltl[aa].upper() + for a in atoms[tl] + ([] if i != len(fasta)-1 else ["OXT"]): + if len(a) < 4: + pa = " " + a + else: + pa = a + + def gr(): return 1.0*(random.random()-0.5) + x, y, z = gr(), gr(), gr() + x += res_num*5 + s = "ATOM %6d %-4s %3s A %3d %8.3f%8.3f%8.3f 1.00 1.00 %s" % \ + (atom_num, pa, tl, res_num, x, y, z, a[0:1]) + ss.append(s) + atom_num += 1 + res_num += 1 + + with open(fn, "w") as f: + f.write('\n'.join(ss)) + - with open(fn, "w") as f: - f.write('\n'.join(ss)) - def invert(dd): - dd = dd.upper() - def _invert(x): - if x == 'A': - return 'T' - elif x == 'T': - return 'A' - elif x == 'C': - return 'G' - elif x == 'G': - return 'C' - return (''.join([_invert(x) for x in dd]))[::-1] + dd = dd.upper() -import pathlib -import os -import json -with open(os.path.join(pathlib.Path(__file__).parent.absolute(), "data", "allseq.json")) as f: - allseq = json.load(f) -cc = allseq['MN908947'] + def _invert(x): + if x == 'A': + return 'T' + elif x == 'T': + return 'A' + elif x == 'C': + return 'G' + elif x == 'G': + return 'C' + return (''.join([_invert(x) for x in dd]))[::-1] + +with open(os.path.join(pathlib.Path(__file__).parent.absolute(), "data", "allseq.json")) as f: + allseq = json.load(f) +nucleotideSequence = allseq['MN908947'] +builder = GenomeBuilder(nucleotideSequence) +genome = builder.build() diff --git a/opt.py b/opt.py index c909e4d..4f9fc9c 100755 --- a/opt.py +++ b/opt.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 -from lib import cc as virus +from lib import genome as virus from vaccine.load import dat as vaccine from corona import corona -virus = virus.replace("T", "U") +virus = virus.get_nucleotideSequence().replace("T", "U") vaccine = vaccine.replace("Ψ", "U") """ @@ -21,4 +21,3 @@ print(vvirus) print(vaccine) -