-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
207 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
|
||
authors = ['A. Student', 'B. Helper'] | ||
|
||
import numpy as np | ||
|
||
# Lookup table from DNA codons (written with T, not U) to one-letter amino
# acid codes; '*' is the symbol used for stop codons.
# The codons are generated in alphabetical order (AAA, AAC, AAG, AAT, ACA, ...)
# and paired position by position with the amino acid letters below. Each
# 16-letter row holds the translations of all codons that share a first base.
codon2aa = dict(zip(
    (first + second + third
     for first in 'ACGT'
     for second in 'ACGT'
     for third in 'ACGT'),
    'KNKNTTTTRSRSIIMI'   # first base A
    'QHQHPPPPRRRRLLLL'   # first base C
    'EDEDAAAAGGGGVVVV'   # first base G
    '*Y*YSSSS*CWCLFLF',  # first base T
))
|
||
## | ||
|
||
|
||
|
||
# Example input for upgma(): a 3x3 matrix of pairwise distances and the
# sequence names, where row/column i of the matrix goes with names_list[i].
# The matrix is a plain list of lists. It used to be assigned as an np.array
# first, but that value was overwritten by this list on the next statement,
# so the redundant assignment has been removed.
# NOTE(review): the values look like placeholders -- the matrix is neither
# symmetric nor zero on the diagonal; replace with real distances.
dist_matr = [[1, 2, 3],
             [1, 2, 3],
             [1, 2, 3]]

names_list = ['s1', 's2', 's3']
|
||
def upgma(dist_matr, names_list):
    """Cluster sequences with UPGMA and return the resulting tree topology.

    The two closest clusters are merged repeatedly until one cluster is left.
    The distance from a merged cluster to any other cluster is the average of
    the two old distances, weighted by the number of sequences in each of the
    merged clusters.

    Only the upper triangle of the matrix (row index < column index) is read,
    so the diagonal and the lower triangle are ignored. Ties are resolved in
    favour of the first pair found when scanning row by row.

    Args:
        dist_matr: Square matrix (list of lists or array) of pairwise
            distances; row/column i belongs to names_list[i].
        names_list: Names of the sequences.

    Returns:
        The topology as nested parentheses, e.g. '((s1,s2),s3)'. A single
        name is returned unchanged and an empty names_list gives ''.
        Neither argument is modified.
    """
    # Work on copies so that the caller's list and matrix stay untouched.
    clusters = list(names_list)
    sizes = [1] * len(clusters)
    dists = [list(row) for row in dist_matr]

    def dist(a, b):
        # Always look the pair up in the upper triangle.
        return dists[a][b] if a < b else dists[b][a]

    while len(clusters) > 1:
        # Find the smallest distance. Start from infinity so that matrices
        # with arbitrarily large distances work too.
        smallest = float('inf')
        smallest_row, smallest_col = 0, 1
        for row in range(len(clusters)):
            for col in range(row + 1, len(clusters)):
                if dists[row][col] < smallest:
                    smallest = dists[row][col]
                    smallest_row, smallest_col = row, col

        # Calculate the distance between the new cluster and all other
        # clusters. The new cluster takes over the slot of smallest_row.
        merged_size = sizes[smallest_row] + sizes[smallest_col]
        for other in range(len(clusters)):
            if other == smallest_row or other == smallest_col:
                continue
            average = (sizes[smallest_row] * dist(smallest_row, other)
                       + sizes[smallest_col] * dist(smallest_col, other)
                       ) / merged_size
            low, high = sorted((smallest_row, other))
            dists[low][high] = average

        # Update the names list and drop the row and column of the cluster
        # that was merged away.
        clusters[smallest_row] = '({},{})'.format(clusters[smallest_row],
                                                  clusters[smallest_col])
        sizes[smallest_row] = merged_size
        del clusters[smallest_col]
        del sizes[smallest_col]
        del dists[smallest_col]
        for row_values in dists:
            del row_values[smallest_col]

    return clusters[0] if clusters else ''
|
||
|
||
|
||
|
||
|
||
|
||
## Here is an example implementation of a function that translates | ||
# a DNA string (codons written with T, as in codon2aa) into a protein string | ||
def dna2aa(dna_str):
    """Translate a DNA string into a string of one-letter amino acid codes.

    The sequence is read from its first base, three bases at a time. Chunks
    that are not keys of ``codon2aa`` (for example an incomplete codon at the
    end of the sequence) are skipped without any error. Stop codons show up
    as '*' in the result and do not end the translation.
    """
    codons = (dna_str[start:start + 3] for start in range(0, len(dna_str), 3))
    return ''.join(codon2aa[codon] for codon in codons if codon in codon2aa)
|
||
## Here is an example implementation of an extended function that | ||
# handles three frames and returns the longest ORF | ||
def dna2aa_3frame(dna_str):
    """Translate all three forward frames and return the one with the longest ORF.

    An ORF is counted here as a run of translated codons that contains no
    stop codon ('*'); a start codon is not required and a run that reaches
    the end of the sequence counts as well. Chunks that are not keys of
    ``codon2aa`` are skipped and do not interrupt the current run.

    Args:
        dna_str: DNA sequence using the alphabet of ``codon2aa``.

    Returns:
        The complete translation (stop codons included as '*') of the frame
        whose longest ORF is the longest of the three. If several frames tie,
        the frame with the smallest offset wins.
    """
    # Start below zero so that frame 0 is chosen even if no frame holds a
    # single translatable codon.
    best_aa_str, best_orf = '', -1
    for frame in range(3):
        residues = []
        frame_longest_orf = 0
        len_orf = 0
        for i in range(frame, len(dna_str), 3):
            codon = dna_str[i:i + 3]
            if codon not in codon2aa:
                continue
            aa = codon2aa[codon]
            residues.append(aa)
            if aa == '*':
                len_orf = 0
            else:
                len_orf += 1
                # Compare with this frame's own maximum on every step, so a
                # later, shorter ORF cannot overwrite a longer one and an ORF
                # without a closing stop codon is still counted.
                if len_orf > frame_longest_orf:
                    frame_longest_orf = len_orf
        if frame_longest_orf > best_orf:
            best_orf = frame_longest_orf
            best_aa_str = ''.join(residues)
    return best_aa_str
|
||
|
||
## Here is a function that reads a FASTA file and returns a list of (sequence name, sequence) tuples | ||
def read_fasta(filename):
    """Read a FASTA file into a list of (sequence name, sequence) tuples.

    A line starting with '>' opens a new record; the rest of that line, with
    surrounding whitespace removed, is the name. All following lines up to
    the next header are stripped and joined into one sequence string.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        A list of (name, sequence) tuples in file order. A file without any
        content gives an empty list. If the file has sequence lines but no
        header at all, a single record with the name None is returned.
    """
    seqs = []
    name = None
    chunks = []
    with open(filename) as f:
        for line in f:
            if line.startswith('>'):
                if name is not None:
                    seqs.append((name, ''.join(chunks)))
                name = line[1:].strip()
                chunks = []
            else:
                chunks.append(line.strip())
    # Flush the last record, but do not invent an empty, nameless record for
    # a file that holds no header and no sequence data.
    seq = ''.join(chunks)
    if name is not None or seq:
        seqs.append((name, seq))
    return seqs
|
||
## Here is a function that writes a FASTA file from a list of (sequence name, sequence) tuples | ||
def write_fasta(filename, seqs):
    """Write (sequence name, sequence) tuples to ``filename`` in FASTA format.

    Every record becomes a '>' header line followed by the complete sequence
    on one line. The file is opened in 'w' mode, so existing content is
    replaced.
    """
    with open(filename, 'w') as fasta_file:
        for record in seqs:
            header, residues = record
            fasta_file.write('>' + header + '\n')
            fasta_file.write(residues + '\n')
|
||
## Here is a function that reads a DNA FASTA and writes a protein FASTA | ||
def dna2aa_fasta(dna_filename, protein_filename):
    """Translate every record of a DNA FASTA file into a protein FASTA file.

    The records are read with read_fasta(), each sequence is translated with
    dna2aa() under its original name, and the result is written with
    write_fasta() in the same order.
    """
    translated = [(name, dna2aa(seq)) for name, seq in read_fasta(dna_filename)]
    write_fasta(protein_filename, translated)
|
||
|
||
# Smoke test for the translation functions.
# Will only be executed if this file is run directly, not when it is imported.
if __name__ == "__main__":
    # Print the translation so that the run produces a visible result;
    # codon2aa maps ATG to M, so this line should show 'MMM'.
    print(dna2aa("ATGATGATG"))
    # Translate all records of cdna.faa (looked up relative to the current
    # working directory) and write the proteins to output.faa.
    dna2aa_fasta('cdna.faa', 'output.faa')
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# Programming Lab P1 | ||
|
||
This is an introductory lab for the programming part of the course. | ||
Your task is to write a function that converts an mRNA sequence (given as cDNA, i.e. written with T instead of U) into an amino acid sequence. | ||
To help you, there is a scaffold of Python code that you should use to validate your code and to make sure you follow a standard that the TAs can validate automatically. | ||
|
||
### Installation | ||
|
||
Begin with downloading the project to your local computer by using this [link](https://download-directory.github.io/?url=https%3A%2F%2Fgithub.com%2Fkth-gt%2Fcb2442%2Ftree%2Fmain%2Fprog%2Fp1). | ||
|
||
|
||
Unzip the files into a directory and open the directory in VScode. | ||
```bash | ||
$ unzip 'kth-gt cb2442 main prog-p1.zip' | ||
$ code . | ||
``` | ||
|
||
### Implementation | ||
|
||
Add a python function to the file `labp1.py` named | ||
|
||
```python | ||
def dna2aa(dna_str): | ||
``` | ||
|
||
that takes a DNA sequence as input and returns an amino acid sequence. You may use the dictionary `codon2aa`, which translates triplets of bases into amino acid symbols. | ||
Also, set the list `authors` to contain the names of all group members. | ||
|
||
### Test | ||
|
||
You can make an initial test run of your `dna2aa` function by running the main function of the Python file itself, by executing the line | ||
|
||
```bash | ||
$ python3 labp1.py | ||
``` | ||
|
||
However, the final test of the code is done by executing the `runner.py` script, which can be executed from the command line as | ||
|
||
```bash | ||
$ python3 runner.py | ||
``` | ||
|
||
or just | ||
|
||
```bash | ||
$ ./runner.py | ||
``` | ||
|
||
This executes the code in `labp1.py`, and validates the results against some known test vectors. | ||
If you implemented the function correctly, you will see your names appearing. | ||
|
||
### Extra exercise | ||
|
||
Change the behaviour of `dna2aa` so that it tries all three possible frames of translation, and selects the amino acid sequence that has the longest ORF of the three alternatives. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/usr/bin/env python3 | ||
import filecmp | ||
import labp1 as lab | ||
|
||
def runner():
    """Validate labp1 against known test vectors and print the authors.

    Two DNA strings are translated with lab.dna2aa and compared with their
    expected proteins. Then cdna.faa is translated to output.faa, which must
    match the reference file 0shift.faa byte for byte. Any failed check
    raises AssertionError; on success the names in lab.authors are printed.
    """
    expected_proteins = {
        'ATTGCGATGGCGCCGGAACCGACCATTGATGAATAA': 'IAMAPEPTIDE*',
        'ATGGCCATGGCGCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA': 'MAMAPRTEINSTRING*',
    }
    for dna, protein in expected_proteins.items():
        assert lab.dna2aa(dna) == protein

    lab.dna2aa_fasta('cdna.faa', 'output.faa')
    # shallow=False makes filecmp compare the file contents, not just the
    # os.stat() signatures.
    assert filecmp.cmp('output.faa', '0shift.faa', shallow=False)

    for author in lab.authors:
        print(author)
    print('made a function that passed all tests!')


if __name__ == "__main__":
    runner()