-
Notifications
You must be signed in to change notification settings - Fork 2
/
format_data.py
64 lines (36 loc) · 1.71 KB
/
format_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from itertools import combinations
import pandas as pd
import numpy as np
from htmlparser import DistTableHTMLParser
def struct_combos(struct: str, sets: list[str]) -> list[tuple]:
    """Pair up every set that belongs to one structure.

    Args:
        struct (str): structure ID, i.e. the text before the first
            underscore of a set ID.
            ex. in `'TUP1_A'`, `struct='TUP1'`
        sets (list[str]): every set ID, across all structures.
            ex. `['TUP1_A', 'TUP1_B', 'TUP1_C', '2QMT_A', '2QMT_B', '2QMT_C']`

    Returns:
        All unordered pairs of the sets whose prefix equals `struct`, in
        the order the sets appear in `sets`.
        ex. `[('TUP1_A', 'TUP1_B'), ('TUP1_A', 'TUP1_C'), ('TUP1_B', 'TUP1_C')]`
    """
    members = []
    for set_id in sets:
        # Only the part before the first underscore identifies the structure.
        prefix, _, _ = set_id.partition('_')
        if prefix == struct:
            members.append(set_id)
    return [*combinations(members, 2)]
def dist_vector(struct_combos: list[tuple], data: pd.DataFrame) -> list[float]:
    """Look up the distance for each set pair of a single structure.

    Args:
        struct_combos (list[tuple]): set pairs of one structure; the first
            element of a pair is used as the row label and the second as
            the column label.
        data (pd.DataFrame): pandas dataframe of the distance matrix
            generated by iCn3d.

    Returns:
        The looked-up distances, one per pair, in the same order as
        `struct_combos`.
    """
    distances = []
    for pair in struct_combos:
        row_label, col_label = pair
        distances.append(data.loc[row_label, col_label])
    return distances
def dist_matrix(structs: list, sets: list[str], data: pd.DataFrame) -> pd.DataFrame:
    """Return pandas DataFrame of distance vectors with structure names as index.

    Args:
        structs (list): structure IDs; each one becomes a row label.
        sets (list[str]): list of all set IDs, handed to `struct_combos`
            to find the set pairs of each structure.
        data (pd.DataFrame): distance matrix that `dist_vector` reads the
            pairwise distances from.

    Returns:
        DataFrame indexed by `structs` with one object column,
        `'dist_vector'`. Each cell holds a `pd.Series` with that
        structure's pairwise distances, indexed 0..n-1 in pair order.
    """
    # Object dtype so a single cell can hold a whole Series.
    df = pd.DataFrame(columns=['dist_vector'], index=structs, dtype=object)
    for struct in structs:
        distances = dist_vector(struct_combos(struct, sets), data)
        # `.at` writes the Series into the cell as one value. Assigning it
        # through `.loc[row, col]` makes pandas try to align the Series with
        # the frame, which fails with "Incompatible indexer with Series".
        df.at[struct, 'dist_vector'] = pd.Series(dict(enumerate(distances)))
    return df
def retrieve_set_distances(html_file) -> pd.DataFrame:
    """Build the per-structure distance-vector table from a distance-table HTML file.

    Args:
        html_file: passed unchanged to `DistTableHTMLParser`.
            NOTE(review): presumably a file path or HTML text -- confirm
            against the parser.

    Returns:
        The DataFrame produced by `dist_matrix`: one row per structure ID
        (the text before the first underscore of each set ID), ordered by
        first appearance in `parser.set_ids`.
    """
    parser = DistTableHTMLParser(html_file)
    # dict.fromkeys de-duplicates while keeping first-seen order. Iterating a
    # plain set of strings can yield a different row order on every run
    # because string hashing is randomized per process.
    structs = list(dict.fromkeys(s.split('_')[0] for s in parser.set_ids))
    return dist_matrix(structs, parser.set_ids, parser.dist_table())