-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcheck_atom_names.py
115 lines (102 loc) · 5.25 KB
/
check_atom_names.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import sys
import os
import os.path as path
import Bio.PDB as PDB
def get_unmodified_residue_codes(pdb_file):
modres_names = {}
for line in pdb_file:
if line[:6].strip() == 'MODRES':
modres_name = line[12:15].strip()
std_residue_name = line[24:27].strip()
modres_names[modres_name] = std_residue_name
return modres_names
def process_directory(data_path):
print_step = 10
pdb_parser = PDB.PDBParser(PERMISSIVE=False, QUIET=True)
file_names = os.listdir(data_path)
# filenames = filenames[:1]
# filenames = ['1E93.pdb']
files_proceeded = 0
for filename in file_names:
next_file_path = path.join(data_path, filename)
# if path.isdir(next_file_path):
# files_proceeded += process_directory(next_file_path)
if path.islink(next_file_path):
process_directory(os.readlink(next_file_path))
else:
with open(next_file_path) as next_file:
unmod_res_codes = get_unmodified_residue_codes(next_file)
next_file.seek(0)
structure = pdb_parser.get_structure(filename, next_file)
for residue in structure.get_residues():
# we don't need water
if residue.id[0] == 'W':
continue
# if it's a modified residue
# so we should build spheres only around standard atoms
# because we want to predict modification that didn't happen yet
elif residue.id[0].startswith('H_'):
# skip non amino acid het atoms
if residue.resname not in unmod_res_codes:
continue
for possible_atom_names in standard_residue_atoms[unmod_res_codes[residue.resname]]:
# create 1-element tuple to avoid iterating over string letters
if not isinstance(possible_atom_names, tuple):
possible_atom_names = (possible_atom_names, )
for atom_name in possible_atom_names:
if residue.has_id(atom_name):
break
# residue lacks some necessary atom
else:
print('file: ' + structure.id + ', chain: ' + residue.parent.id +
', residue: ' + str(residue.id) + ' lacks some necessary atom: ' + str(possible_atom_names))
# if residue is not modified then just iterate over all atoms and check that every atom present in standard atoms
else:
for atom in residue:
# skip hydrogen atoms
if atom.element == 'H':
continue
if atom.id == 'OXT':
continue
# check if atom is in standard residue atoms
for possible_atom_names in standard_residue_atoms[residue.resname]:
if not isinstance(possible_atom_names, tuple):
possible_atom_names = (possible_atom_names, )
if atom.id in possible_atom_names:
break
else:
print('file: ' + structure.id + ', chain: ' + residue.parent.id +
', residue: ' + str(residue.id) + ', atom: ' + atom.id + ' is not in standard atoms')
files_proceeded += 1
if files_proceeded % print_step == 0:
print('{0} out of {1} files in {2} proceeded'.format(files_proceeded, len(file_names), data_path))
return files_proceeded
def main():
data_path = sys.argv[1]
process_directory(data_path)
standard_backbone_atoms = ['N', 'CA', 'C', ('O', 'S')]
standard_residue_atoms = {'GLY': [],
'ALA': ['CB'],
'VAL': ['CB', 'CG1', 'CG2'],
'LEU': ['CB', 'CG', 'CD1', 'CD2'],
'ILE': ['CB', 'CG1', 'CG2', 'CD1'],
'MET': ['CB', 'CG', ('SD', 'S', 'SE'), 'CE'],
'PHE': ['CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ'],
'TRP': ['CB', 'CG', 'CD1', 'CD2', 'NE1', 'CE2', 'CE3', 'CZ2', 'CZ3', 'CH2'],
'PRO': ['CB', 'CG', 'CD'],
'SER': ['CB', 'OG'],
'THR': ['CB', 'OG1', 'CG2'],
'CYS': ['CB', 'SG'],
'TYR': ['CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'OH'],
'ASN': ['CB', 'CG', 'OD1', 'ND2'],
'GLN': ['CB', 'CG', 'CD', 'OE1', 'NE2'],
'ASP': ['CB', 'CG', 'OD1', 'OD2'],
'GLU': ['CB', 'CG', 'CD', 'OE1', 'OE2'],
'LYS': ['CB', 'CG', 'CD', 'CE', 'NZ'],
'ARG': ['CB', 'CG', 'CD', 'NE', 'CZ', 'NH1', 'NH2'],
'HIS': ['CB', 'CG', 'ND1', 'CD2', 'CE1', 'NE2']}
for key, value in standard_residue_atoms.items():
value.extend(standard_backbone_atoms)
# check SNP: 3NBJ - 634 A
# check multiple models
main()