added p4

kth-gt · Aug 12, 2023 · f649164 · f649164
1 parent d554267
commit f649164
Show file tree

Hide file tree

Showing 3 changed files with 207 additions and 0 deletions.
diff --git a/prog/p4/labp4.py b/prog/p4/labp4.py
@@ -0,0 +1,137 @@
+
+# Names of all group members; read by importers of this module as `authors`.
+authors = ['A. Student', 'B. Helper']
+
+import numpy as np
+
+# A dictionary converting codons to amino acids.
+# Keys are upper-case DNA triplets over A/C/G/T (all 64 combinations are
+# present); values are one-letter amino acid symbols. 'TAA', 'TAG' and 'TGA'
+# map to '*'.
+codon2aa = {
+    'AAA': 'K', 'AAC': 'N', 'AAG': 'K', 'AAT': 'N',
+    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
+    'AGA': 'R', 'AGC': 'S', 'AGG': 'R', 'AGT': 'S',
+    'ATA': 'I', 'ATC': 'I', 'ATG': 'M', 'ATT': 'I',
+    'CAA': 'Q', 'CAC': 'H', 'CAG': 'Q', 'CAT': 'H',
+    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
+    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
+    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
+    'GAA': 'E', 'GAC': 'D', 'GAG': 'E', 'GAT': 'D',
+    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
+    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
+    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
+    'TAA': '*', 'TAC': 'Y', 'TAG': '*', 'TAT': 'Y',
+    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
+    'TGA': '*', 'TGC': 'C', 'TGG': 'W', 'TGT': 'C',
+    'TTA': 'L', 'TTC': 'F', 'TTG': 'L', 'TTT': 'F'
+}
+
+##
+
+
+
+# Placeholder input for experimenting with upgma().
+# The earlier np.array assignment was overwritten immediately by this list,
+# so only the list form is kept.
+# NOTE(review): these values are not a valid distance matrix (non-zero
+# diagonal, not symmetric) - replace them with real distances before use.
+dist_matr = [[1, 2, 3],
+             [1, 2, 3],
+             [1, 2, 3]]
+
+names_list = ['s1', 's2', 's3']
+
+def upgma(dist_matr, names_list):
+    """Cluster the named items by average linkage (UPGMA).
+
+    The two closest clusters are merged repeatedly until one cluster is left.
+    The distance from a merged cluster to any other cluster is the mean of the
+    two old distances, weighted by the number of leaves in each merged part.
+
+    Args:
+        dist_matr: square matrix (nested lists or a 2-D array) of pairwise
+            distances, ordered like ``names_list``. Only the entries above
+            the diagonal are read.
+        names_list: one label per row/column of ``dist_matr``.
+
+    Returns:
+        The nested grouping as a string, e.g. ``'(s3,(s1,s2))'``; the single
+        label when only one name is given; ``''`` when there are none.
+        Neither argument is modified.
+    """
+    # Work on copies: plain assignment would alias the caller's objects.
+    names = list(names_list)
+    if not names:
+        return ''
+    sizes = [1] * len(names)
+    dist = [[float(value) for value in row] for row in dist_matr]
+
+    while len(names) > 1:
+        # find smallest distance (strict '<' keeps the first pair on ties)
+        smallest = float('inf')
+        smallest_row, smallest_col = 0, 1
+        for row in range(len(names)):
+            for col in range(row + 1, len(names)):
+                if dist[row][col] < smallest:
+                    smallest = dist[row][col]
+                    smallest_row = row
+                    smallest_col = col
+
+        # calculate distance between new cluster and all other clusters
+        size_row, size_col = sizes[smallest_row], sizes[smallest_col]
+        merged_size = size_row + size_col
+        kept = [k for k in range(len(names))
+                if k not in (smallest_row, smallest_col)]
+        to_merged = []
+        for k in kept:
+            d_row = dist[min(k, smallest_row)][max(k, smallest_row)]
+            d_col = dist[min(k, smallest_col)][max(k, smallest_col)]
+            to_merged.append((size_row * d_row + size_col * d_col) / merged_size)
+
+        # update distance matrix and names list; the new cluster goes last
+        new_dist = []
+        for i, a in enumerate(kept):
+            new_row = [0.0 if a == b else dist[min(a, b)][max(a, b)]
+                       for b in kept]
+            new_row.append(to_merged[i])
+            new_dist.append(new_row)
+        new_dist.append(to_merged + [0.0])
+        dist = new_dist
+
+        merged_name = '({},{})'.format(names[smallest_row], names[smallest_col])
+        names = [names[k] for k in kept] + [merged_name]
+        sizes = [sizes[k] for k in kept] + [merged_size]
+
+    return names[0]
+
+
+
+
+
+
+## Example implementation of a function that translates
+# a DNA string into a protein string
+def dna2aa(dna_str):
+    """Translate ``dna_str`` codon by codon, starting at its first base.
+
+    The string is cut into consecutive triplets; every triplet found in
+    ``codon2aa`` contributes its amino acid symbol, and any other chunk
+    (including a short chunk at the end) is skipped.
+    """
+    triplets = (dna_str[start:start + 3] for start in range(0, len(dna_str), 3))
+    return ''.join(codon2aa[triplet] for triplet in triplets if triplet in codon2aa)
+
+## Example implementation of an extended function that
+# handles three frames and returns the translation with the longest ORF
+def dna2aa_3frame(dna_str):
+    """Translate ``dna_str`` in frames 0, 1 and 2 and return the best frame.
+
+    For each frame the whole translation (stop symbols included) is built
+    from the triplets found in ``codon2aa``; other chunks are skipped. A
+    frame is scored by its longest run of amino acids that contains no
+    ``'*'``. A run that reaches the end of the sequence without a stop is
+    counted too.
+
+    Returns:
+        The translation of the frame with the highest score. On a tie the
+        lowest frame wins, so frame 0 is returned when no frame scores
+        above zero.
+    """
+    best_aa_str, best_orf = '', -1
+    for frame in range(3):
+        frame_aas = []
+        frame_longest_orf = 0
+        len_orf = 0  # amino acids seen since the last stop codon
+        for i in range(frame, len(dna_str), 3):
+            aa = codon2aa.get(dna_str[i:i + 3])
+            if aa is None:
+                continue
+            frame_aas.append(aa)
+            if aa == '*':
+                # A stop codon ends the current run.
+                len_orf = 0
+            else:
+                len_orf += 1
+                if len_orf > frame_longest_orf:
+                    frame_longest_orf = len_orf
+        # Strict '>' keeps the earliest frame when scores are equal.
+        if frame_longest_orf > best_orf:
+            best_orf = frame_longest_orf
+            best_aa_str = ''.join(frame_aas)
+    return best_aa_str
+
+
+## Here is a function that reads a FASTA file and returns a list of (sequence name, sequence) tuples
+def read_fasta(filename):
+    """Parse the FASTA file ``filename``.
+
+    A line starting with '>' opens a new record; its text after the '>' is
+    the name, with surrounding whitespace stripped. All following lines up
+    to the next header are stripped and concatenated into the sequence.
+
+    Returns:
+        A list of ``(name, sequence)`` tuples in file order. An empty file
+        gives ``[]``. Sequence lines that come before the first header are
+        ignored, so a ``None`` name is never returned.
+    """
+    seqs = []
+    name = None
+    parts = []
+    with open(filename) as f:
+        for line in f:
+            if line.startswith('>'):
+                if name is not None:
+                    seqs.append((name, ''.join(parts)))
+                name = line[1:].strip()
+                parts = []
+            elif name is not None:
+                parts.append(line.strip())
+    # Flush the last record, but only if a header was actually seen.
+    if name is not None:
+        seqs.append((name, ''.join(parts)))
+    return seqs
+
+## Writes a list of (sequence name, sequence) tuples to a FASTA file
+def write_fasta(filename, seqs):
+    """Write ``seqs`` to ``filename``, replacing any existing content.
+
+    Each ``(name, sequence)`` pair becomes two lines: ``>`` followed by the
+    name, then the sequence on a single line.
+    """
+    with open(filename, 'w') as handle:
+        for header, residues in seqs:
+            handle.writelines(text + '\n' for text in ('>' + header, residues))
+
+## Reads a DNA FASTA file and writes the translated records to a protein FASTA file
+def dna2aa_fasta(dna_filename, protein_filename):
+    """Translate every record of ``dna_filename`` into ``protein_filename``.
+
+    Records are read with ``read_fasta``, each sequence is translated with
+    ``dna2aa`` under its original name, and the result is written with
+    ``write_fasta``.
+    """
+    translated = [(name, dna2aa(seq)) for name, seq in read_fasta(dna_filename)]
+    write_fasta(protein_filename, translated)
+
+
+# Smoke test for the dna2aa function.
+# Will only be executed if this file is run directly
+if __name__ == "__main__":
+    # The translated string is discarded; this only checks that the call runs.
+    dna2aa("ATGATGATG")
+    # Translates the records of 'cdna.faa' into 'output.faa'; both paths are
+    # relative to the current working directory.
+    dna2aa_fasta('cdna.faa', 'output.faa')
+
diff --git a/prog/p4/readme.md b/prog/p4/readme.md
@@ -0,0 +1,54 @@
+# Programming Lab P1
+
+This is an introductory lab for the programming part of the course.
+Your task is to write a function that can convert mRNA sequence to amino acid sequence.
+To help you, there is a scaffold of Python code that you should use to validate your code and to make sure you follow a standard that the TAs can validate automatically.
+
+### Installation
+
+Begin with downloading the project to your local computer by using this [link](https://download-directory.github.io/?url=https%3A%2F%2Fgithub.com%2Fkth-gt%2Fcb2442%2Ftree%2Fmain%2Fprog%2Fp1). 
+
+
+Unzip the files into a directory and open the directory in VScode. 
+```bash
+$ unzip 'kth-gt cb2442 main prog-p1.zip'
+$ code .
+```
+
+### Implementation
+
+Add a python function to the file `labp1.py` named
+
+```python
+def dna2aa(dna_str):
+```
+
+that takes a DNA sequence as input and returns an amino acid sequence. You may use the dictionary `codon2aa`, which translates triplets of bases into amino acid symbols.
+Also, set the list `authors` to contain all the group members names.  
+
+### Test
+
+You can make an initial test run of your `dna2aa` function by running the main section of the Python file itself with the command
+
+```bash
+$ python3 labp1.py
+```
+
+However, the final test of the code is done by running the `runner.py` executable, which can be executed from the command line as
+
+```bash
+$ python3 runner.py
+```
+
+or just
+
+```bash
+$ ./runner.py
+```
+
+This executes the code in `labp1.py`, and validates the results against some known test vectors.
+If you implemented the function correctly, you will see your names appearing.
+
+### Extra exercise
+
+Change the behaviour of `dna2aa` so that it tries all three possible frames of translation and selects the amino acid sequence that has the longest ORF of the three alternatives.
diff --git a/prog/p4/runnerp4.py b/prog/p4/runnerp4.py
@@ -0,0 +1,16 @@
+#! /bin/env python3
+import filecmp
+import labp1 as lab
+
+def runner():
+    """Check the lab module against known test vectors, then print the authors.
+
+    Two DNA strings are translated with ``lab.dna2aa`` and compared with the
+    expected peptides. 'cdna.faa' is then converted to 'output.faa', whose
+    contents must equal those of '0shift.faa'. Any mismatch raises
+    AssertionError before anything is printed.
+
+    NOTE(review): ``lab`` is bound to ``labp1`` at the top of this file, while
+    the module added next to this runner is ``labp4.py`` - confirm which
+    module is intended.
+    """
+    test_vectors = (
+        ('ATTGCGATGGCGCCGGAACCGACCATTGATGAATAA', 'IAMAPEPTIDE*'),
+        ('ATGGCCATGGCGCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA', 'MAMAPRTEINSTRING*'),
+    )
+    for dna, expected in test_vectors:
+        assert lab.dna2aa(dna) == expected
+
+    lab.dna2aa_fasta('cdna.faa', 'output.faa')
+    # shallow=False compares file contents rather than os.stat() signatures.
+    assert filecmp.cmp('output.faa', '0shift.faa', shallow=False)
+
+    for member in lab.authors:
+        print(member)
+    print('made a function that passed all tests!')
+
+
+# Run the checks only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    runner()