-
Notifications
You must be signed in to change notification settings - Fork 1
/
pdb_tools.py
159 lines (139 loc) · 6.39 KB
/
pdb_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
"""
ARIADNE - PDB module
Functions for working with PDBs and their Biopython representations.
"""
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union
from Bio.PDB import PDBParser
from Bio.PDB.Chain import Chain
from Bio.PDB.Model import Model
from Bio.PDB.Structure import Structure
import numpy as np
import pandas as pd
import terms
BACKBONE_BONDS = [
((-1, "O3'"), (0, "P")),
((0, "P"), (0, "O5'")),
((0, "O5'"), (0, "C5'")),
((0, "C5'"), (0, "C4'")),
((0, "C4'"), (0, "C3'")),
((0, "C3'"), (0, "O3'")),
]
def get_structure(pdb_file, name="structure"):
structure = PDBParser().get_structure(name, pdb_file)
return structure
def get_model(structure: Union[Model, Structure]) -> Model:
"""
Get the first model from a Biopython PDB structure.
:param structure: structure or model object
:return model: the first model in the structure
"""
if isinstance(structure, Model):
model = structure
else:
models = list(structure.get_models())
model = models[0]
return model
def get_chain_nums_seq(chain: Chain, na: Optional[str] = None) -> Tuple[List[int], str]:
"""
Get the sequence and base numbers of a chain.
:param chain: Biopython chain from which to extract the sequence
na: whether to convert the sequence to DNA or RNA (optional)
:return nums: the number of each residue in the chain
seq: the sequence of the chain using one-letter codes
"""
nums = [residue.get_id()[1] for residue in chain]
seq = "".join([residue.get_resname()[0] for residue in chain])
if na is not None:
if na.upper() == "DNA":
seq = seq.replace("U", "T")
elif na.upper() == "RNA":
seq = seq.replace("T", "U")
else:
raise ValueError(na)
return nums, seq
def get_chains_nums_seqs(chains: List[Chain], na: Optional[str] = None) -> Dict[str, Tuple[List[int], str]]:
"""
Get the sequence and base numbers of every chain in a structure or model.
:param chains:
:return:
"""
chains_nums_seqs = {chain.get_id(): get_chain_nums_seq(chain, na) for chain in chains}
return chains_nums_seqs
def get_bond_types_and_lengths(model, pdb_chain_num_to_cando_num, pair_directions, base_num_to_annotation, g_ax):
bond_types_and_lengths = dict()
bond_types_and_axial_distances = dict()
bond_types_and_planar_distances = dict()
for chain in model:
chain_id = chain.get_id()
prev_base = None
for base in chain:
base_num = base.get_id()[1]
pdb_chain_num = chain_id, base_num
cando_num = pdb_chain_num_to_cando_num[pdb_chain_num]
paired_num = g_ax.get(cando_num)
scaf_num = min(cando_num, paired_num) if paired_num is not None else None
bond_types_and_lengths[pdb_chain_num] = dict()
bond_types_and_axial_distances[pdb_chain_num] = dict()
bond_types_and_planar_distances[pdb_chain_num] = dict()
for bond_type in BACKBONE_BONDS:
(num1_del, atom1_name), (num2_del, atom2_name) = bond_type
if prev_base:
cando_prev = pdb_chain_num_to_cando_num[chain_id, prev_base.get_id()[1]]
paired_prev = g_ax.get(cando_prev)
scaf_prev = min(cando_prev, paired_prev) if paired_prev is not None else None
else:
scaf_prev = None
if num1_del == 0:
atom1 = base[atom1_name]
axis1 = pair_directions.loc[scaf_num] if scaf_num is not None else None
elif num1_del == -1:
atom1 = prev_base[atom1_name] if prev_base is not None else None
axis1 = pair_directions.loc[scaf_prev] if scaf_prev is not None else None
else:
raise ValueError(num1_del)
if num2_del == 0:
atom2 = base[atom2_name]
axis2 = pair_directions.loc[scaf_num] if scaf_num is not None else None
elif num2_del == -1:
atom2 = prev_base[atom2_name] if prev_base is not None else None
axis2 = pair_directions.loc[scaf_prev] if scaf_prev is not None else None
else:
raise ValueError(num1_del)
if atom1 and atom2:
# find the vector connecting the two atoms
atom_displacement = atom2.get_coord() - atom1.get_coord()
distance = np.linalg.norm(atom_displacement)
bond_types_and_lengths[pdb_chain_num][bond_type] = distance
if axis1 is None or axis2 is None:
axial_dist = np.nan
elif base_num_to_annotation[cando_num][1] == terms.EDGE_TM and (num1_del == -1 or num2_del == -1):
# axis switches at edge termini, so can't compute axial distance if using the previous base
axial_dist = np.nan
else:
# project the vector onto the axis of each helix
axial_disp_1 = project_vector(atom_displacement, axis1)
axial_dist_1 = np.linalg.norm(axial_disp_1)
axial_disp_2 = project_vector(atom_displacement, axis2)
axial_dist_2 = np.linalg.norm(axial_disp_2)
assert np.isclose(axial_dist_1, axial_dist_2)
axial_dist = (axial_dist_1 + axial_dist_2) / 2
# compute the planar distance using the Pythagorean theorem
# distance^2 = axial_dist^2 + planar_dist^2
planar_dist = np.sqrt(distance**2 - axial_dist**2)
bond_types_and_axial_distances[pdb_chain_num][bond_type] = axial_dist
bond_types_and_planar_distances[pdb_chain_num][bond_type] = planar_dist
prev_base = base
return bond_types_and_lengths, bond_types_and_axial_distances, bond_types_and_planar_distances
def project_vector(query, target):
"""
Project vector query onto vector target.
:param query:
:param target:
:param query_center:
:param target_center:
:return:
"""
assert query.shape == target.shape
projection = np.dot(query, target) / np.linalg.norm(target)
return projection