Skip to content

Commit

Permalink
added p4
Browse files Browse the repository at this point in the history
  • Loading branch information
andand committed Aug 12, 2023
1 parent d554267 commit f649164
Show file tree
Hide file tree
Showing 3 changed files with 207 additions and 0 deletions.
137 changes: 137 additions & 0 deletions prog/p4/labp4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@

# Names of all group members who wrote this lab solution.
authors = ['A. Student', 'B. Helper']

import numpy as np

# Lookup table for the standard genetic code: maps each of the 64
# three-letter DNA codons (alphabet A/C/G/T) to its one-letter amino acid
# symbol. The stop codons TAA, TAG and TGA map to '*'.
codon2aa = {
'AAA': 'K', 'AAC': 'N', 'AAG': 'K', 'AAT': 'N',
'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
'AGA': 'R', 'AGC': 'S', 'AGG': 'R', 'AGT': 'S',
'ATA': 'I', 'ATC': 'I', 'ATG': 'M', 'ATT': 'I',
'CAA': 'Q', 'CAC': 'H', 'CAG': 'Q', 'CAT': 'H',
'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
'GAA': 'E', 'GAC': 'D', 'GAG': 'E', 'GAT': 'D',
'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
'TAA': '*', 'TAC': 'Y', 'TAG': '*', 'TAT': 'Y',
'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
'TGA': '*', 'TGC': 'C', 'TGG': 'W', 'TGT': 'C',
'TTA': 'L', 'TTC': 'F', 'TTG': 'L', 'TTT': 'F'
}

##



# Presumably sample input for upgma() -- TODO confirm.
# NOTE(review): this NumPy array is immediately overwritten by the plain
# nested list assigned below, so only the list version remains bound to
# dist_matr -- confirm which form is intended and drop the other.
# NOTE(review): all three rows are identical, so the matrix is not symmetric
# and its diagonal is non-zero; these look like placeholder values.
dist_matr = np.array([[1,2,3],
[1,2,3],
[1,2,3]])

dist_matr = [[1,2,3],
[1,2,3],
[1,2,3]]

# One label per row/column of dist_matr (assumed to be in matching order).
names_list = ['s1', 's2', 's3']

def upgma(dist_matr, names_list):
    """Cluster the named items with UPGMA and return the tree as a string.

    Repeatedly merges the two closest clusters until a single cluster is
    left. The distance from a merged cluster to every other cluster is the
    average of its two parts' distances, weighted by how many original
    items each part contains (the UPGMA rule).

    Parameters:
        dist_matr: square matrix (nested lists or a NumPy array) of pairwise
            distances, indexed as ``dist_matr[row][col]``. Only the upper
            triangle (``row < col``) is read.
        names_list: one label per row/column of ``dist_matr``.

    Returns:
        A nested, parenthesised label describing the merge order, e.g.
        ``'((s1,s2),s3)'``. A single name is returned unchanged and an
        empty ``names_list`` gives ``''``.

    Neither argument is modified. When several pairs share the smallest
    distance, the first pair found in row-major order is merged.
    """
    # Work on private copies so the caller's matrix and names stay intact.
    labels = list(names_list)
    sizes = [1] * len(labels)
    count = len(labels)
    dist = [[0.0] * count for _ in range(count)]
    for row in range(count):
        for col in range(row + 1, count):
            dist[row][col] = dist[col][row] = dist_matr[row][col]

    while len(labels) > 1:
        # Find the smallest distance (strictly smaller wins, so the first
        # minimal pair is kept on ties).
        best_row, best_col = 0, 1
        for row in range(len(labels)):
            for col in range(row + 1, len(labels)):
                if dist[row][col] < dist[best_row][best_col]:
                    best_row, best_col = row, col

        # Calculate the distance between the new cluster and all others.
        size_a, size_b = sizes[best_row], sizes[best_col]
        merged = [
            (dist[best_row][k] * size_a + dist[best_col][k] * size_b)
            / (size_a + size_b)
            for k in range(len(labels))
        ]

        # Update the distance matrix and names list: the merged cluster
        # takes over slot best_row and slot best_col is removed
        # (best_row < best_col, so best_row keeps its index).
        for k in range(len(labels)):
            dist[best_row][k] = dist[k][best_row] = merged[k]
        dist[best_row][best_row] = 0.0
        del dist[best_col]
        for line in dist:
            del line[best_col]

        labels[best_row] = '(' + labels[best_row] + ',' + labels[best_col] + ')'
        sizes[best_row] = size_a + size_b
        del labels[best_col]
        del sizes[best_col]

    return labels[0] if labels else ''






## Example implementation of a function that translates
# a DNA string into a protein string
def dna2aa(dna_str):
    """Translate ``dna_str`` codon by codon, starting at position 0.

    The sequence is read in consecutive three-base windows. A window that
    is not a key of ``codon2aa`` (for instance an incomplete codon at the
    end of the sequence) is skipped silently. Stop codons are emitted as
    '*' and translation continues past them.

    Returns the one-letter amino acid symbols joined into a single string.
    """
    triplets = (dna_str[start:start + 3] for start in range(0, len(dna_str), 3))
    return ''.join(codon2aa[triplet] for triplet in triplets if triplet in codon2aa)

## Example implementation of an extended function that
# handles three frames and returns the translation with the longest ORF
def dna2aa_3frame(dna_str):
    """Translate ``dna_str`` in all three forward frames and pick the best.

    Each frame (offset 0, 1 and 2) is translated like ``dna2aa``: windows
    that are not keys of ``codon2aa`` are skipped and stop codons are
    emitted as '*'. An ORF is counted here as a run of consecutive
    non-stop amino acids; a run that reaches the end of the sequence
    without a closing stop codon counts as well, so sequences without any
    stop codon still produce a translation.

    Returns the full translation (stop symbols included) of the frame
    holding the longest such run. On ties the lowest frame offset wins.
    """
    best_aa_str, best_orf_len = '', -1
    for frame in range(3):
        frame_aa = []
        frame_best = 0   # longest run seen so far in this frame
        run = 0          # length of the run currently being extended
        for i in range(frame, len(dna_str), 3):
            aa = codon2aa.get(dna_str[i:i + 3])
            if aa is None:
                continue
            frame_aa.append(aa)
            if aa == '*':
                # Compare against this frame's own best, not the global one,
                # so a later shorter run cannot overwrite a longer one.
                frame_best = max(frame_best, run)
                run = 0
            else:
                run += 1
        # Account for a run that is still open at the end of the sequence.
        frame_best = max(frame_best, run)
        if frame_best > best_orf_len:
            best_orf_len = frame_best
            best_aa_str = ''.join(frame_aa)
    return best_aa_str


## Function that reads a FASTA file and returns a list of
# (sequence name, sequence) tuples
def read_fasta(filename):
    """Parse the FASTA file ``filename`` into ``(name, sequence)`` tuples.

    A line starting with '>' opens a new record; its name is the rest of
    that line with surrounding whitespace removed. All following lines up
    to the next header are stripped and concatenated into the sequence.
    Text that appears before the first header is ignored.

    Returns a list of ``(name, sequence)`` tuples in file order. An empty
    file, or one without any header line, yields an empty list.
    """
    seqs = []
    name = None
    chunks = []
    with open(filename) as f:
        for line in f:
            if line.startswith('>'):
                if name is not None:
                    seqs.append((name, ''.join(chunks)))
                name = line[1:].strip()
                chunks = []
            else:
                chunks.append(line.strip())
    # Only flush the last record if a header was actually seen; otherwise
    # a bogus (None, '') entry would be returned for empty files.
    if name is not None:
        seqs.append((name, ''.join(chunks)))
    return seqs

## Function that writes a FASTA file from a list of
# (sequence name, sequence) tuples
def write_fasta(filename, seqs):
    """Write ``seqs`` to ``filename`` in FASTA format.

    Every ``(name, sequence)`` pair becomes two lines: a header line
    consisting of '>' followed by the name, then the sequence on a single
    line. An existing file is overwritten.
    """
    def fasta_lines():
        # Yield the header and the sequence line of each record in turn.
        for label, residues in seqs:
            yield '>' + label + '\n'
            yield residues + '\n'

    with open(filename, 'w') as out:
        out.writelines(fasta_lines())

## Function that reads a DNA FASTA file and writes a protein FASTA file
def dna2aa_fasta(dna_filename, protein_filename):
    """Translate every record of a FASTA file and save the result.

    Reads ``dna_filename`` with ``read_fasta``, translates each sequence
    with ``dna2aa`` (keeping the record names unchanged) and writes the
    translated records to ``protein_filename`` with ``write_fasta``.
    """
    translated = [(name, dna2aa(seq)) for name, seq in read_fasta(dna_filename)]
    write_fasta(protein_filename, translated)


# Smoke test for the dna2aa function.
# Only executed when this file is run directly, not when it is imported.
if __name__ == "__main__":
    # Print the translation so that running the file shows a visible result.
    print(dna2aa("ATGATGATG"))
    dna2aa_fasta('cdna.faa', 'output.faa')

54 changes: 54 additions & 0 deletions prog/p4/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Programming Lab P1

This is an introductory lab for the programming part of the course.
Your task is to write a function that can convert an mRNA sequence to an amino acid sequence.
To help you, there is a scaffold of Python code that you should use to validate your code and to make sure you follow a standard that the TAs can validate automatically.

### Installation

Begin with downloading the project to your local computer by using this [link](https://download-directory.github.io/?url=https%3A%2F%2Fgithub.com%2Fkth-gt%2Fcb2442%2Ftree%2Fmain%2Fprog%2Fp1).


Unzip the files into a directory and open the directory in VScode.
```bash
$ unzip 'kth-gt cb2442 main prog-p1.zip'
$ code .
```

### Implementation

Add a python function to the file `labp1.py` named

```python
def dna2aa(dna_str):
```

that takes a DNA sequence as input and returns an amino acid sequence. You may use the dictionary `codon2aa`, which translates triplets of bases into amino acid symbols.
Also, set the list `authors` to contain the names of all group members.

### Test

You can make an initial test run of your `dna2aa` function by running the main section of the Python file itself, by executing the line

```bash
$ python3 labp1.py
```

However, the final test of the code is done by executing the `runner.py` executable, which can be run from the command line as

```bash
$ python3 runner.py
```

or just

```bash
$ ./runner.py
```

This executes the code in `labp1.py`, and validates the results against some known test vectors.
If you implemented the function correctly, you will see your names appear.

### Extra exercise

Change the behaviour of `dna2aa` so that it tries all three possible frames of translation and selects the amino acid sequence that has the longest ORF of the three alternatives.
16 changes: 16 additions & 0 deletions prog/p4/runnerp4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#! /bin/env python3
import filecmp
import labp1 as lab

def runner():
    """Validate the lab module against known answers and credit the authors.

    Checks ``lab.dna2aa`` on two DNA strings with known translations, then
    runs ``lab.dna2aa_fasta`` on 'cdna.faa' and requires the produced
    'output.faa' to have the same contents as '0shift.faa'. If every check
    passes, each entry of ``lab.authors`` is printed, followed by a success
    message. A failed check raises ``AssertionError``.
    """
    known_translations = (
        ('ATTGCGATGGCGCCGGAACCGACCATTGATGAATAA', 'IAMAPEPTIDE*'),
        ('ATGGCCATGGCGCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA', 'MAMAPRTEINSTRING*'),
    )
    for dna, expected in known_translations:
        assert lab.dna2aa(dna) == expected

    lab.dna2aa_fasta('cdna.faa', 'output.faa')
    # shallow=False compares the file contents, not just the os.stat data.
    files_match = filecmp.cmp('output.faa', '0shift.faa', shallow=False)
    assert files_match

    for member in lab.authors:
        print(member)
    print('made a function that passed all tests!')


if __name__ == "__main__":
    runner()

0 comments on commit f649164

Please sign in to comment.