-
Notifications
You must be signed in to change notification settings - Fork 2
/
clustering_prot.py
executable file
·4675 lines (4173 loc) · 208 KB
/
clustering_prot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#generic python modules
import argparse
import operator
from operator import itemgetter
import sys, os, shutil, itertools
import os.path
################################################################################################################################################
# RETRIEVE USER INPUTS
################################################################################################################################################
#=========================================================================================
# create parser
#=========================================================================================
# Script version string: reported by --version and written to the run log.
version_nb = "0.1.a"
# Build the CLI parser. Per-option help is SUPPRESSed; the full usage text is
# carried instead in this RawDescription description string (printed verbatim by -h).
parser = argparse.ArgumentParser(prog='clustering_prot', usage='', add_help=False, formatter_class=argparse.RawDescriptionHelpFormatter, description=\
'''
**********************************************
v''' + version_nb + '''
author: Jean Helie ([email protected])
contributors: Anna Duncan, Matthieu Chavent
git: https://github.com/jhelie/clustering_prot
**********************************************
[ DESCRIPTION ]
This script calculates the evolution of proteins clustering status.
It produces 3 outputs ((a) and (b) require --groups to be specified, see note 5):
(a) 2D plots: time evolution of the cluster size each protein is involved in
(b) 1D plots: time evolution of the % of protein represented by each group
(c) stability statistics: max nb of consecutive frames each group existed for [BETA]
The metrics can be calculated for a single frame or for an entire trajectory - and in case
a trajectory is supplied the data for individual frame snapshots can also be produced at a
frequency specified by the option -w (if you don't specify an argument to -w snapshots
will only be written for the first and last frame). Note that writing files to disk will
considerably slow down the execution of the script.
Detection of transmembrane protein clusters
-------------------------------------------
Two clustering algorithms can be used to identify protein clusters.
->Connectivity based (relies on networkX module):
A protein is considered in a cluster if it is within a distance less than --nx_cutoff
from another protein. This means that a single protein can act as a connector between
two otherwise disconnected protein clusters.
This algorithm can be ran using either the minimum distante between proteins (default,
--algorithm 'min') or the distance between their center of geometry (--algorithm 'cog').
The 'min' option scales as the square of the number of proteins and can thus be very
slow for large systems.
->Density based (relies on the sklearn module and its implementation of DBSCAN):
A protein is considered in a cluster if is surrounded by at least --db_neighbours other
proteins within a radius of --db_radius.
This density based approach is usually less suited to the detection of protein
clusters but as a general rule the more compact the clusters, the smaller --db_radius
the higher --db_neighbours can be - for details on this algorithm see its online
documentation.
This algorithm is selected by setting the --algorithm option to 'density'.
The identified protein clusters are considered to be transmembrane only if the closest
lipid headgroup neighbours to the cluster particles are all within the same leaflet.
In addition to the sizes identified, size groups can be defined - see note 7.
VMD visualisation
-----------------
The perturbation calculated can be visualised with VMD either for a single frame or an
entire trajectory. Note that in the case of a trajectory only the processed frame will be
annotated (every 10 by defaults) - you might want to pre-process your trajectory to remove
frames and then set the -t option to 1 when running the script.
->frame:
The thickness or order parameter info is stored in the beta factor column. Just open
the PDB file with VMD and choose Draw Style > Coloring Method > Beta.
->trajectory:
The thickness or order parameter info is stored in a .txt file in folders '/3_VMD/'.
To load it into your trajectory in VMD use the appropriate routine of the script
script 'vmd_parser.tcl' (see https://github.com/jhelie/vmd_parser).
[ REQUIREMENTS ]
The following python modules are needed :
- MDAnalysis
- matplotlib
- networkX (if --algorithm set to 'min' or 'cog')
- sklearn (if --algorithm set to 'density')
[ NOTES ]
1. It's a good idea to trjconv the xtc first and only outputs the proteins, as the
script will run MUCH faster. Also, use the -pbc mol option.
2. The script tracks the accummulation of proteins on each leaflet, but detailed clustering
information is only calculated for transmembrane proteins.
Identification of the bilayer leaflets can be further controlled via 3 ways:
(a) beads
By default, the particles taken into account to define leaflet are:e
-> name PO4 or name PO3 or name B1A
Note that only lipids which contain one of the beads mentioned in the selection string
will be taken into account. If you wish to specify your own selection string (e.g. to
choose different beads or add a bead not in the default list in order to take into
account a particular lipid specie) you can do so by supplying a file via the --beads
option. This file should contain a single line that can be passed as the argument
to MDAnalysis selectAtoms() routine and should not contain any quotation marks, e.g.:
-> name PO4 or name PO3 or name B1A or name AM1
(b) leaflet finding method
By default leaflets are identified using the MDAnalysis LeafletFinder routine and the
the optimum cutoff to identify 2 lipids groups is determined using the optimize_cutoff
routine.
This optimisation process can take time in large systems and you can specify your own
cutoff value to skip this step. For instance to use the default 15 Angstrom cutoff
directly (without optimising):
-> '--leaflets 15'
In very large systems (more then ~50,000 phospholipids) LeafletFinder (or rather the
networkX module that it relies on) can fail. To avoid this you can choose not to use
this routine by specifying:
-> '--leaflets large'
In this case lipids whose headgroups z value is above the average lipids z value will
be considered to make up the upper leaflet and those whose headgroups z value is below
the average will be considered to be in the lower leaflet.
This means that the bilayer should be as flat as possible in the 1st frame of the xtc
file supplied in order to get a meaningful outcome.
NOTE: By default the gro file is only used as a topology file and the 1st frame of the
xtc is used to identify leaflets. If you wish to use the gro file instead, for instance
in the case that the 1st frame of the xtc is not flat, you need to specify the --use_gro
flag: be warned that this might take a few minutes longer on large systems.
(c) flipflopping lipids
In case lipids flipflop during the trajectory, a file listing them can be supplied
with the --flipflops option. Each line of this file should follow the format:
-> 'resname,resid,starting_leaflet,z_bead'
where starting_leaflet is either 'upper' or 'lower' - e.g. 'POPC,145,lower,PO4'. The
z_bead is used to track the position of the lipid.
If flipflopping lipids are not specified they may add significant noise to results.
3. Proteins are detected automatically but you can specify an input file to define your
own selection with the -p option.
In this case the supplied file should contain on each line a protein selection string
that can be passed as the argument of the MDAnalysis selectAtoms() routine - for
instance 'bynum 1:344'.
4. Clusters statistics can be binned into size groups. The size groups are defined by
supplying a file with the --groups option, whose lines all follow the format:
-> 'lower_size,upper_size, colour'
Size groups definition should follow the following rules:
-to specify an open ended group use 'max', e.g. '3,max,colour'
-groups should be ordered by increasing size and their boundaries should not overlap
-boundaries are inclusive so you can specify one size groups with 'size,size,colour'
-colours must be specified for each group (see (c) in note 5)
-any cluster size not falling within the specified size groups will be labeled as
'other' and coloured in grey (#E6E6E6)
5. The colours associated to each TM cluster size identified are based on the matplotlib
'jet' colour map by default. You can specify your own colours as follows:
(a) Colour of individual cluster sizes
Colours of individual cluster sizes use the matplotlib 'jet' colour scheme and cannot
be modified.
(b) Colour of size groups
See note 4 above: exact colour control can be achieved for each size by using size
groups of unitary size, so if you want to control individual cluster sizes colours just
specify the relevant group file.
(c) Colour definition
Colours can be specified using single letter code (rgbcmykw), hex code or the name of
a colour map (see the matplotlib website for a list of the available colour maps).
In case a colour map is used, its name must be specified as the colour for each lipid
specie or size group.
6. The script automatically detects proteins and, by default, groups them into species labelled
'A', 'B', etc... based on sequences. Oligomers, ie proteins made up of the repeat of a
same sequence, are not detected by default and just considered as a protein with the overall
sequence which can effectively decrease your sampling of contacts on heatmaps representing
residues interactions.
The --species flag allows to specify flag to name proteins and address the oligomer issue.
Each line of this file should follow the format (without the quotes):
->'name,multiplicity,sequence'
with the multiplicity referring to the number of times the sequence appears. So, in the
case of, say, a trimer 'multiplicity' can be set to 3 with 'sequence' corresponding to
the sequence of a monomer. For proteins which do not involve the repetition of a base
unit 'multiplicity' should be 1 and 'sequence' the complete protein sequence.
1 letter code should be used for sequence.
7. The --nx_cutoff option is mandatory and allows to control the distance for which proteins are
considered to be in contact.
In case the 'min' option for the --algorithm flag is specified, the --nx_cutoff flag should be
set to a float value (typically 6 or 8 Angstrom in CG).
In case the 'cog' option for --algorithm is used (which should always be the case for rigid TM
proteins as it is MUCH faster) the --nx_cutoff can either be set to a float value (Angstrom) or
used to specify a text file. This latter option is particularly useful if the system contains
several protein species of very different sizes. In this case a cutoff specific to each type of
interactions must be supplied in the txt file, with each line following the format:
-> 'specie1_index,specie2_index,cutoff_distance_between_cog1_and_cog2'
where 'specie1_index' and 'specie2_index' correspond to 0-based indices referring to a specie of
proteins present in the system (numbered according to their appearance in the pdb or gro file).
For instance, for a system containing two types of proteins, the following would be acceptable:
'0,0,60'
'1,1,100'
'0,1,80'
NB: this means that the systems MUST be built so that the different protein species are listed
sequentially in the gro/pdb files (they cannot alternate) ant that you know the order in which
the species are listed. Doing otherwise would generally be poor practice and lead to complex
topology files anyway.
8. The --res_used option is for when using the 'cog' algorithm, in the event that only part of
the protein is required to generate a protein centre of geometry (eg. for Kir channels,
the protein tilts relative to the membrane, but clustering occurs only via a single
extramembrane domain, so distances between cog for whole proteins vary more than the
cog of distances between the domains that actually cluster, giving misleading results).
The --res_used value should be used to specify a filename, with format similar to that for
the --nx_cutoff input file, ie., each line should have the format:
'species_index,residue_list'
The format of residue list input for --res_used is as follows:
resx1:resx2,resx3:resx4,resx5:resx6
where 'resx1:resx2' denotes residues in the range x1 to x2 (*inclusive*)
and ',' separates residues ranges. Note that there are no spaces.
For example, in a system with three protein species, where residues 1 to 15 are required for
species 1; all residues for species 2; and residues 3 to 10 and 50 to 60 are required
for species 3, the file would contain the lines:
0,1:15
2,3:10,50:60
(Note that to specify all residues, the species index is not included in the file)
9. --graph_order controls the order of the proteins in the graph in 4_clusters_sizes/4_1_plots_2D
(ie. the colour plot showing time vs. protein vs. cluster size).
Default is "none", which leaves the proteins in the order present in the gro file
If "all" is selected, all proteins are re-ordered so the evolution of cluster size can be seen.
"byspecies" orders the proteins as above, but only within protein species.
[ USAGE ]
Option Default Description
-----------------------------------------------------
-f : structure file [.gro] (required)
-x : trajectory file [.xtc]
-o : name of output folder
-b : beginning time (ns)
-e : ending time (ns)
-t 10 : process every t-frames
-w : write snapshots every [w] processed frames (see 'DESCRIPTION')
Lipids identification (see note 2)
-----------------------------------------------------
--beads : leaflet identification technique, see note 2(a)
--flipflops : input file with flipflopping lipids, see note 2(c)
--leaflets optimise: leaflet identification 'optimise', 'large', 'no' or float, see note 2(b)
--use_gro : use gro file instead of xtc, see note 2(b)
Proteins properties
-----------------------------------------------------
--species : file defining name,multiplicity and sequence of proteins, see note 6
--groups : cluster groups definition file, see note 4
--res_contact 8 : cutoff to consider contacts between residues c.o.g (Angstrom)
--res_show 1 : show interactions for residues for at least that many % of contacts between proteins
--algorithm cog : 'cog','min' or 'density', see 'DESCRIPTION'
--nx_cutoff : networkX cutoff distance for protein-protein contact (Angstrom or file), see note 7
--res_used : file containing list of residues to use to generate protein cog, if --algorithm cog, see note 8
--db_radius 20 : DBSCAN search radius (Angstrom)
--db_neighbours 3 : DBSCAN minimum number of neighbours within a circle of radius --db_radius
Other options
-----------------------------------------------------
--graph_order none : 'none', 'byspecies', or 'all' - ordering of proteins in 2D prot vs. cluster size vs. time plot - see note 9.
--max_clust none : 'none' or an integer max cluster size, to be used for the colorscale in 2D protein cluster size plot
--version : show version number and exit
-h, --help : show this menu and exit
''')
# NOTE: options are declared with nargs=1, so argparse stores each value in a
# single-element list; they are unwrapped in the "store inputs" section below.
#data options
parser.add_argument('-f', nargs=1, dest='grofilename', default=['no'], help=argparse.SUPPRESS, required=True)
parser.add_argument('-x', nargs=1, dest='xtcfilename', default=['no'], help=argparse.SUPPRESS)
parser.add_argument('-o', nargs=1, dest='output_folder', default=['no'], help=argparse.SUPPRESS)
parser.add_argument('-b', nargs=1, dest='t_start', default=[-1], type=int, help=argparse.SUPPRESS)
parser.add_argument('-e', nargs=1, dest='t_end', default=[-1], type=int, help=argparse.SUPPRESS)
parser.add_argument('-t', nargs=1, dest='frames_dt', default=[10], type=int, help=argparse.SUPPRESS)
# -w uses nargs='?': absent -> "no"; given without a value -> huge const so that in
# practice only the first and last frames are written (see DESCRIPTION in the help text)
parser.add_argument('-w', nargs='?', dest='frames_write_dt', const=1000000000000000, default="no", help=argparse.SUPPRESS)
#lipids identification
parser.add_argument('--beads', nargs=1, dest='beadsfilename', default=['no'], help=argparse.SUPPRESS)
parser.add_argument('--flipflops', nargs=1, dest='selection_file_ff', default=['no'], help=argparse.SUPPRESS)
parser.add_argument('--leaflets', nargs=1, dest='cutoff_leaflet', default=['optimise'], help=argparse.SUPPRESS)
parser.add_argument('--use_gro', dest='use_gro', action='store_true', help=argparse.SUPPRESS)
#protein clusters identification
parser.add_argument('--species', nargs=1, dest='species_file', default=['no'], help=argparse.SUPPRESS)
parser.add_argument('--groups', nargs=1, dest='cluster_groups_file', default=['no'], help=argparse.SUPPRESS)
parser.add_argument('--res_contact', nargs=1, dest='res_contact', default=[8], type=float, help=argparse.SUPPRESS)
parser.add_argument('--res_show', nargs=1, dest='res_show', default=[1], type=float, help=argparse.SUPPRESS)
parser.add_argument('--algorithm', dest='m_algorithm', choices=['cog','min','density'], default='cog', help=argparse.SUPPRESS)
# --nx_cutoff is accepted as a string here: it may be a float OR a filename (note 7);
# it is disambiguated in the sanity-check section further down
parser.add_argument('--nx_cutoff', nargs=1, dest='nx_cutoff', help=argparse.SUPPRESS, required=True)
parser.add_argument('--res_used', nargs=1, dest='res_used', default=['none'], help=argparse.SUPPRESS)
parser.add_argument('--db_radius', nargs=1, dest='dbscan_dist', default=[20], type=float, help=argparse.SUPPRESS)
parser.add_argument('--db_neighbours', nargs=1, dest='dbscan_nb', default=[3], type=int, help=argparse.SUPPRESS)
#other options
parser.add_argument('--graph_order', nargs=1, dest='order', default=['none'], help=argparse.SUPPRESS)
parser.add_argument('--max_clust', nargs=1, dest='max_clust', default=['none'], help=argparse.SUPPRESS)
parser.add_argument('--buffer_size', nargs=1, dest='buffer_size', default=[100], type=int, help=argparse.SUPPRESS)
parser.add_argument('--version', action='version', version='%(prog)s v' + version_nb, help=argparse.SUPPRESS)
parser.add_argument('-h','--help', action='help', help=argparse.SUPPRESS)
#=========================================================================================
# store inputs
#=========================================================================================
#parse user inputs
#-----------------
args = parser.parse_args()

# Every option declared with nargs=1 comes back from argparse as a single-element
# list: unwrap them all in one pass instead of one assignment per option.
for _single_valued in ('grofilename', 'xtcfilename', 'output_folder', 't_start',
                       't_end', 'frames_dt', 'beadsfilename', 'cutoff_leaflet',
                       'selection_file_ff', 'species_file', 'cluster_groups_file',
                       'res_contact', 'res_show', 'nx_cutoff', 'res_used',
                       'dbscan_dist', 'dbscan_nb', 'order', 'max_clust',
                       'buffer_size'):
    setattr(args, _single_valued, getattr(args, _single_valued)[0])

# -w was declared with nargs='?' (scalar): either the sentinel string "no" or a
# number of processed frames between snapshot writes.
if args.frames_write_dt != "no":
    args.frames_write_dt = int(args.frames_write_dt)
#process options
#---------------
# NOTE(review): these "global" statements appear at module level, where they are
# no-ops; they serve only as documentation of names used as globals elsewhere.
global vmd_counter
global vmd_cluster_size
global vmd_cluster_group
global lipids_ff_nb
global colour_group_other
global colour_leaflet_lower
global colour_leaflet_upper
global nx_cutoff_file
global cog_cutoff_values
global res_used
vmd_counter = 0
vmd_cluster_size = ""
vmd_cluster_group = ""
lipids_ff_nb = 0                    # number of flipflopping lipids (filled later if --flipflops given)
nx_cutoff_file = False              # True when --nx_cutoff is a per-species-pair cutoff file (note 7)
cog_cutoff_values = {}              # (specie_i, specie_j) -> cog-cog cutoff distance (Angstrom)
res_used = {}                       # specie index -> extra selection string restricting residues used for cog (note 8)
colour_group_other = "#E6E6E6" #very light grey
colour_leaflet_lower = "#808080" #dark grey
colour_leaflet_upper = "#C0C0C0" #light grey
#leaflet identification
# --leaflets accepts the keywords 'optimise', 'large', 'no' or a numeric cutoff (Angstrom)
if args.cutoff_leaflet != "large" and args.cutoff_leaflet != "no" and args.cutoff_leaflet != "optimise":
    try:
        args.cutoff_leaflet = float(args.cutoff_leaflet)
    except:
        print "Error: the argument of the --leaflets option should set to a number, 'optimise', 'large' or 'no', see note 2"
        sys.exit(1)
#=========================================================================================
# import modules (doing it now otherwise might crash before we can display the help menu!)
#=========================================================================================
#generic science modules
try:
import math
except:
print "Error: you need to install the maths module."
sys.exit(1)
try:
import numpy as np
except:
print "Error: you need to install the np module."
sys.exit(1)
try:
import scipy
except:
print "Error: you need to install the scipy module."
sys.exit(1)
try:
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.colors as mcolors
mcolorconv = mcolors.ColorConverter()
import matplotlib.cm as cm #colours library
import matplotlib.ticker
from matplotlib.ticker import MaxNLocator, NullFormatter, ScalarFormatter
from matplotlib.font_manager import FontProperties
fontP=FontProperties()
except:
print "Error: you need to install the matplotlib module."
sys.exit(1)
try:
import pylab as plt
except:
print "Error: you need to install the pylab module."
sys.exit(1)
#MDAnalysis module
try:
import MDAnalysis
from MDAnalysis import *
import MDAnalysis.analysis
import MDAnalysis.analysis.leaflet
import MDAnalysis.analysis.distances
#set MDAnalysis to use periodic boundary conditions
MDAnalysis.core.flags['use_periodic_selections'] = True
MDAnalysis.core.flags['use_KDTree_routines'] = False
except:
print "Error: you need to install the MDAnalysis module first. See http://mdanalysis.googlecode.com"
sys.exit(1)
#=========================================================================================
# sanity check
#=========================================================================================
# Validate file arguments and mutually exclusive / range-constrained options before
# doing any real work; every failure prints a message and exits with status 1.
if not os.path.isfile(args.grofilename):
    print "Error: file " + str(args.grofilename) + " not found."
    sys.exit(1)
# --groups and --max_clust both drive the 2D colour scale and are mutually exclusive
if args.cluster_groups_file != "no" and args.max_clust != "none":
    print "Error: Only one of --groups and --max_clust should be specified; they do not work together"
    print "Please reconsider inputs such that only one of these options is used"
    sys.exit(1)
if args.cluster_groups_file != "no" and not os.path.isfile(args.cluster_groups_file):
    print "Error: file " + str(args.cluster_groups_file) + " not found."
    sys.exit(1)
if args.selection_file_ff != "no" and not os.path.isfile(args.selection_file_ff):
    print "Error: file " + str(args.selection_file_ff) + " not found."
    sys.exit(1)
if args.species_file != "no" and not os.path.isfile(args.species_file):
    print "Error: file " + str(args.species_file) + " not found."
    sys.exit(1)
if args.beadsfilename != "no" and not os.path.isfile(args.beadsfilename):
    print "Error: file " + str(args.beadsfilename) + " not found."
    sys.exit(1)
# -e defaults to -1 (no end time), so only compare when an end time was supplied
if args.t_end != -1 and args.t_end < args.t_start:
    print "Error: the starting time (" + str(args.t_start) + "ns) for analysis is later than the ending time (" + str(args.t_end) + "ns)."
    sys.exit(1)
if args.buffer_size < -1:
    print "Error: the option --buffer_size should be greater than -1 (set to " + str(args.buffer_size) + ")."
    sys.exit(1)
# Trajectory-only options (-t/-b/-e/-w) make no sense without an xtc file;
# when an xtc file IS given, check that it exists.
if args.xtcfilename == "no":
    if '-t' in sys.argv:
        print "Error: -t option specified but no xtc file specified."
        sys.exit(1)
    elif '-b' in sys.argv:
        print "Error: -b option specified but no xtc file specified."
        sys.exit(1)
    elif '-e' in sys.argv:
        print "Error: -e option specified but no xtc file specified."
        sys.exit(1)
    elif '-w' in sys.argv:
        print "Error: -w option specified but no xtc file specified."
        sys.exit(1)
elif not os.path.isfile(args.xtcfilename):
    print "Error: file " + str(args.xtcfilename) + " not found."
    sys.exit(1)
try:
args.nx_cutoff = float(args.nx_cutoff)
except:
if not os.path.isfile(args.nx_cutoff):
print "Error: the --nx_cutoff should either be a float or a text file, see note 7."
sys.exit(1)
else:
print "\nReading cog cutoffs definition file..."
nx_cutoff_file = True
s_indices = []
with open(args.nx_cutoff) as f:
lines = f.readlines()
for l_index in range(0, len(lines)):
#get line content
line = lines[l_index]
if line[-1] == "\n":
line = line[:-1]
l_content = line.split(',')
#check line format
if len(l_content) != 3:
print "Error: the format of line " + str(l_index+1) + " is incorrect, see clustering_prot --help, note 7."
print "->", line
sys.exit(1)
else:
#store cutoffs
s1_index = int(l_content[0])
s2_index = int(l_content[1])
s1s2_cutoff = float(l_content[2])
cog_cutoff_values[s1_index, s2_index] = s1s2_cutoff
cog_cutoff_values[s2_index, s1_index] = s1s2_cutoff
#store list of indices
s_indices.append(s1_index)
s_indices.append(s2_index)
#check there's no gap
s_indices_unique = np.unique(s_indices)
if min(s_indices_unique) != 0:
print "Error: species indexing should start at 0."
sys.exit(1)
if max(s_indices_unique) >= len(s_indices_unique):
print "Error: cutoff not specified for all species."
sys.exit(1)
for s_index in range(0, len(s_indices_unique)):
for ss_index in range(0, len(s_indices_unique)):
if (s_index, ss_index) not in cog_cutoff_values.keys():
print "Error: cutoff not specified for all species."
sys.exit(1)
# --res_used optionally restricts, per species, which residues contribute to the
# protein centre of geometry (note 8). Default 'none' is not a file, so the
# informational message below is printed when the option is absent.
if not os.path.isfile(args.res_used):
    print "No res_used file specified, so default (all residues) will be used to calculate each protein centre of geometry - see note 8. for more details on this option"
else:
    print "\nReading res_used definition file..."
    s_indices = []
    with open(args.res_used) as f:
        lines = f.readlines()
    for l_index in range(0, len(lines)):
        #get line content
        line = lines[l_index]
        if line == "\n":
            continue
        if line[-1] == "\n":
            line = line[:-1]
        l_content = line.split(',')
        print "residues to be used: ",l_content
        #check line format: 'species_index,residue_list' with at least one range
        if len(l_content) < 2:
            print "Error: the format of line " + str(l_index+1) + " is incorrect, see clustering_prot --help, note 8."
            print "->", line
            sys.exit(1)
        else:
            #store res_used string: ranges like '1:15' become an MDAnalysis
            #selection fragment, e.g. " and (resid 1:15 or resid 50:60)"
            s_index = int(l_content[0])
            res_used[s_index] = " and (resid "+" or resid ".join(l_content[1:])+")"
            #store list of indices
            s_indices.append(s_index)
    # report proteins with res_used lists and check there are no repeats
    print "res_used lists have been stored for species: ",s_indices
    s_indices_unique = np.unique(s_indices)
    if len(s_indices) > len(s_indices_unique):
        print "Error: one or more proteins have >1 res_used list defined."
        sys.exit(1)
# Reject option combinations that mix the density-based (DBSCAN) and
# connectivity-based (networkX) parameter sets.
if args.m_algorithm != "density":
    if '--db_radius' in sys.argv:
        print "Error: --db_radius option specified but --algorithm option set to '" + str(args.m_algorithm) + "'."
        sys.exit(1)
    elif '--db_neighbours' in sys.argv:
        print "Error: --db_neighbours option specified but --algorithm option set to '" + str(args.m_algorithm) + "'."
        sys.exit(1)
else:
    # NOTE(review): --nx_cutoff is declared with required=True in the parser, so
    # with --algorithm density the user is forced to supply it and then always
    # hits this error — confirm whether 'density' is actually usable.
    if '--nx_cutoff' in sys.argv:
        print "Error: --nx_cutoff option specified but --algorithm option set to 'density'."
        sys.exit(1)
# --graph_order must be one of the known orderings (note 9)
order_options = ["none","byspecies","all"]
if args.order not in order_options:
    print "Error: --graph_order set to {} but should be one of {}".format( args.order, ", ".join(order_options) )
    sys.exit(1)
# --max_clust is either the string 'none' or an integer colour-scale maximum
if args.max_clust != 'none':
    try:
        args.max_clust = int(args.max_clust)
    except ValueError:
        print " Error: --max_clust should either be 'none' or an integer. Currently it is set to: {}".format(args.max_clust)
        sys.exit(1)
#=========================================================================================
# create folders and log file
#=========================================================================================
#create folder name
if args.output_folder == "no":
    #default name derives from the input file: strip directory and 4-char extension
    if args.xtcfilename == "no":
        args.output_folder = "clustering_prot_" + args.grofilename[:-4].split('/')[-1]
    else:
        args.output_folder = "clustering_prot_" + args.xtcfilename[:-4].split('/')[-1]
#create folder and sub-folders
if os.path.isdir(args.output_folder):
    #refuse to overwrite an existing results folder
    print "Error: folder " + str(args.output_folder) + " already exists, choose a different output name via -o."
    sys.exit(1)
else:
    #create folders
    #--------------
    os.mkdir(args.output_folder)
    #snapshots
    os.mkdir(args.output_folder + "/1_snapshots")
    os.mkdir(args.output_folder + "/1_snapshots/sizes")
    if args.cluster_groups_file!="no":
        os.mkdir(args.output_folder + "/1_snapshots/groups")
    #interactions and compositions
    os.mkdir(args.output_folder + "/2_proteins_interactions")
    os.mkdir(args.output_folder + "/2_proteins_interactions/xvg")
    os.mkdir(args.output_folder + "/3_clusters_compositions")
    #cluster sizes
    os.mkdir(args.output_folder + "/4_clusters_sizes/")
    #time-evolution sub-folders only make sense when a trajectory was supplied
    if args.xtcfilename != "no":
        os.mkdir(args.output_folder + "/4_clusters_sizes/4_1_plots_2D")
        os.mkdir(args.output_folder + "/4_clusters_sizes/4_1_plots_2D/xvg")
        os.mkdir(args.output_folder + "/4_clusters_sizes/4_1_plots_2D/png")
        os.mkdir(args.output_folder + "/4_clusters_sizes/4_2_plots_1D")
        os.mkdir(args.output_folder + "/4_clusters_sizes/4_2_plots_1D/png")
        os.mkdir(args.output_folder + "/4_clusters_sizes/4_2_plots_1D/xvg")
        os.mkdir(args.output_folder + "/4_clusters_sizes/4_3_biggest")
        os.mkdir(args.output_folder + "/4_clusters_sizes/4_3_biggest/png")
        os.mkdir(args.output_folder + "/4_clusters_sizes/4_3_biggest/xvg")
        os.mkdir(args.output_folder + "/4_clusters_sizes/4_4_mostrep")
        os.mkdir(args.output_folder + "/4_clusters_sizes/4_4_mostrep/png")
        os.mkdir(args.output_folder + "/4_clusters_sizes/4_4_mostrep/xvg")
    #cluster groups
    if args.cluster_groups_file!="no":
        os.mkdir(args.output_folder + "/5_clusters_groups")
        if args.xtcfilename != "no":
            os.mkdir(args.output_folder + "/5_clusters_groups/5_1_plots_2D")
            os.mkdir(args.output_folder + "/5_clusters_groups/5_1_plots_2D/xvg")
            os.mkdir(args.output_folder + "/5_clusters_groups/5_1_plots_2D/png")
            os.mkdir(args.output_folder + "/5_clusters_groups/5_2_plots_1D")
            os.mkdir(args.output_folder + "/5_clusters_groups/5_2_plots_1D/png")
            os.mkdir(args.output_folder + "/5_clusters_groups/5_2_plots_1D/xvg")
    #VMD
    if args.xtcfilename != "no":
        os.mkdir(args.output_folder + "/6_VMD")
    #create log
    #----------
    # The log records the script version and the exact command line used.
    filename_log=os.getcwd() + '/' + str(args.output_folder) + '/clustering_prot.log'
    output_log=open(filename_log, 'w')
    output_log.write("[clustering_prot v" + str(version_nb) + "]\n")
    output_log.write("\nThis folder and its content were created using the following command:\n\n")
    tmp_log = "python clustering_prot.py"
    for c in sys.argv[1:]:
        tmp_log+=" " + c
    output_log.write(tmp_log + "\n")
    output_log.close()
#copy input files
#----------------
if args.selection_file_ff != "no":
shutil.copy2(args.selection_file_ff,args.output_folder + "/")
if args.cluster_groups_file != "no":
shutil.copy2(args.cluster_groups_file,args.output_folder + "/")
if args.species_file != "no":
shutil.copy2(args.species_file,args.output_folder + "/")
if args.beadsfilename != "no":
shutil.copy2(args.beadsfilename,args.output_folder + "/")
##########################################################################################
# FUNCTIONS DEFINITIONS
##########################################################################################
#=========================================================================================
# data loading
#=========================================================================================
def set_proteins_database():
	"""Initialise the protein database and load any user-defined species.

	Populates four module-level dictionaries:
	 - res_code_3to1: 3-letter -> 1-letter amino-acid codes
	 - proteins_db_colours: species name -> matplotlib colour spec
	 - proteins_db_sequences: species name -> full sequence (1-letter codes,
	   already repeated to account for multiplicity)
	 - proteins_db_multiplicity: species name -> number of identical chains

	If --species (args.species_file) was supplied, every line of that file must
	follow the 'name,multiplicity,sequence' format (see --help, note 6) and adds
	one species entry. Exits with an error on malformed or duplicate entries.
	"""
	global proteins_db_colours
	global proteins_db_sequences
	global proteins_db_multiplicity
	global res_code_3to1
	#standard 3-letter -> 1-letter residue codes
	res_code_3to1 = {}
	res_code_3to1['ALA'] = 'A'
	res_code_3to1['ARG'] = 'R'
	res_code_3to1['ASN'] = 'N'
	res_code_3to1['ASP'] = 'D'
	res_code_3to1['CYS'] = 'C'
	res_code_3to1['GLU'] = 'E'
	res_code_3to1['GLN'] = 'Q'
	res_code_3to1['GLY'] = 'G'
	res_code_3to1['HIS'] = 'H'
	res_code_3to1['ILE'] = 'I'
	res_code_3to1['LEU'] = 'L'
	res_code_3to1['LYS'] = 'K'
	res_code_3to1['MET'] = 'M'
	res_code_3to1['PHE'] = 'F'
	res_code_3to1['PRO'] = 'P'
	res_code_3to1['SER'] = 'S'
	res_code_3to1['THR'] = 'T'
	res_code_3to1['TRP'] = 'W'
	res_code_3to1['TYR'] = 'Y'
	res_code_3to1['VAL'] = 'V'
	#built-in species colours
	#TODO: make the colour parameter readable from the supplied text file
	proteins_db_colours = {}
	proteins_db_colours["OmpF"] = 'c'
	proteins_db_colours["BtuB"] = 'y'
	proteins_db_colours["transportan"] = 'y'
	#built-in species sequences (OmpF sequence is repeated 3x for its trimer)
	proteins_db_sequences = {}
	proteins_db_sequences["OmpF"] = 'AEIYNKDGNKVDLYGKAVGLHYFSKGNGENSYGGNGDMTYARLGFKGETQINSDLTGYGQWEYNFQGNNSEGADAQTGNKTRLAFAGLKYADVGSFDYGRNYGVVYDALGYTDMLPEFGGDTAYSDDFFVGRVGGVATYRNSNFFGLVDGLNFAVQYLGKNERDTARRSNGDGVGGSISYEYEGFGIVGAYGAADRTNLQEAQPLGNGKKAEQWATGLKYDANNIYLAANYGETRNATPITNKFTNTSGFANKTQDVLLVAQYQFDFGLRPSIAYTKSKAKDVEGIGDVDLVNYFEVGATYYFNKNMSTYVDYIINQIDSDNKLGVGSDDTVAVGIVYQFAEIYNKDGNKVDLYGKAVGLHYFSKGNGENSYGGNGDMTYARLGFKGETQINSDLTGYGQWEYNFQGNNSEGADAQTGNKTRLAFAGLKYADVGSFDYGRNYGVVYDALGYTDMLPEFGGDTAYSDDFFVGRVGGVATYRNSNFFGLVDGLNFAVQYLGKNERDTARRSNGDGVGGSISYEYEGFGIVGAYGAADRTNLQEAQPLGNGKKAEQWATGLKYDANNIYLAANYGETRNATPITNKFTNTSGFANKTQDVLLVAQYQFDFGLRPSIAYTKSKAKDVEGIGDVDLVNYFEVGATYYFNKNMSTYVDYIINQIDSDNKLGVGSDDTVAVGIVYQFAEIYNKDGNKVDLYGKAVGLHYFSKGNGENSYGGNGDMTYARLGFKGETQINSDLTGYGQWEYNFQGNNSEGADAQTGNKTRLAFAGLKYADVGSFDYGRNYGVVYDALGYTDMLPEFGGDTAYSDDFFVGRVGGVATYRNSNFFGLVDGLNFAVQYLGKNERDTARRSNGDGVGGSISYEYEGFGIVGAYGAADRTNLQEAQPLGNGKKAEQWATGLKYDANNIYLAANYGETRNATPITNKFTNTSGFANKTQDVLLVAQYQFDFGLRPSIAYTKSKAKDVEGIGDVDLVNYFEVGATYYFNKNMSTYVDYIINQIDSDNKLGVGSDDTVAVGIVYQF'
	proteins_db_sequences["BtuB"] = 'QDTSPDTLVVTANRFEQPRSTVLAPTTVVTRQDIDRWQSTSVNDVLRRLPGVDITQNGGSGQLSSIFIRGTNASHVLVLIDGVRLNLAGVSGSADLSQFPIALVQRVEYIRGPRSAVYGSDAIGGVVNIITTRDEPGTEISAGWGSNSYQNYDVSTQQQLGDKTRVTLLGDYAHTHGYDVVAYGNTGTQAQTDNDGFLSKTLYGALEHNFTDAWSGFVRGYGYDNRTNYDAYYSPGSPLLDTRKLYSQSWDAGLRYNGELIKSQLITSYSHSKDYNYDPHYGRYDSSATLDEMKQYTVQWANNVIVGHGSIGAGVDWQKQTTTPGTGYVEDGYDQRNTGIYLTGLQQVGDFTFEGAARSDDNSQFGRHGTWQTSAGWEFIEGYRFIASYGTSYKAPNLGQLYGFYGNPNLDPEKSKQWEGAFEGLTAGVNWRISGYRNDVSDLIDYDDHTLKYYNEGKARIKGVEATANFDTGPLTHTVSYDYVDARNAITDTPLLRRAKQQVKYQLDWQLYDFDWGITYQYLGTRYDKDYSSYPYQTVKMGGVSLWDLAVAYPVTSHLTVRGKIANLFDKDYETVYGYQTAGREYTLSGSYTF'
	proteins_db_sequences["transportan"] = 'GWTLNSAGYLLGKINLKALAALAKKIL'
	proteins_db_multiplicity = {}
	proteins_db_multiplicity["OmpF"] = 3
	proteins_db_multiplicity["BtuB"] = 1
	proteins_db_multiplicity["transportan"] = 1
	#load user-defined species, if any
	if args.species_file != "no":
		print("\nReading species definition file...")
		with open(args.species_file) as f:
			lines = f.readlines()
		print(' -found ' + str(len(lines)) + ' species definition')
		nb_lines = len(lines)
		for l_index in range(0,nb_lines):
			#get current line (drop the trailing newline, if any)
			line = lines[l_index].rstrip("\n")
			#check format: exactly 'name,multiplicity,sequence'
			l_content = line.split(',')
			if len(l_content) != 3:
				print("Error: the format of line " + str(l_index + 1) + " should be 'name,multiplicity,sequence' (see clustering_prot --help, note 6).")
				print("-> " + str(line))
				sys.exit(1)
			else:
				tmp_name = l_content[0]
				#multiplicity must parse as an integer (was previously an uncaught ValueError)
				try:
					tmp_mult = int(l_content[1])
				except ValueError:
					print("Error: multiplicity should be an integer (see clustering_prot --help, note 6).")
					print("-> " + str(line))
					sys.exit(1)
				tmp_seq = l_content[2]
				#display update
				progress = '\r -creating species entries: ' + str(tmp_name) + '        '
				sys.stdout.flush()
				sys.stdout.write(progress)
				if tmp_name in proteins_db_sequences.keys():
					print("Error: a specie named " + str(tmp_name) + " is already defined.")
					print("-> " + str(line))
					sys.exit(1)
				elif tmp_mult < 1:
					#was '== 0': also reject negative multiplicities
					print("Error: multiplicity should be greater or equal to 1.")
					print("-> " + str(line))
					sys.exit(1)
				else:
					#at this stage assign random colours to proteins; TODO: allow user-specified colours
					proteins_db_colours[tmp_name] = np.random.rand(3,)
					proteins_db_sequences[tmp_name] = tmp_seq*tmp_mult
					proteins_db_multiplicity[tmp_name] = tmp_mult
		print("")
	return
def get_sequence(seq3):
	"""Convert an iterable of 3-letter residue codes into a 1-letter string.

	Unknown residue codes are skipped with a warning. Requires the module-level
	res_code_3to1 table initialised by set_proteins_database().
	"""
	seq1 = ""
	for r in seq3:
		try:
			seq1 += res_code_3to1[r]
		except KeyError:
			#was a bare 'except:' (which also swallowed SystemExit/KeyboardInterrupt);
			#the warning message was also missing its closing quote
			print("Warning: unknown residue code '" + str(r) + "'")
	return seq1
def set_lipids_beads():
	"""Set the atom selection string used to identify bilayer leaflets.

	Defaults to "name PO4 or name PO3 or name B1A". If --beads
	(args.beadsfilename) was supplied, the single line of that file is used
	instead; exits with an error if the file contains more than one line
	(see --help, note 2(a)).
	"""
	global leaflet_sele_string
	#set default beads
	leaflet_sele_string = "name PO4 or name PO3 or name B1A"
	#use users input
	if args.beadsfilename != "no":
		with open(args.beadsfilename) as f:
			lines = f.readlines()
		if len(lines) > 1:
			#message typo fixed: "should conly ontain" -> "should only contain"
			print("Error: the file " + str(args.beadsfilename) + " should only contain 1 line (" + str(len(lines)) + " found), see note 2(a).")
			sys.exit(1)
		else:
			#drop the trailing newline, if any
			leaflet_sele_string = lines[0].rstrip("\n")
	return
def load_MDA_universe():
	"""Load the system into an MDAnalysis Universe and build the frame index lists.

	Sets module-level state: U (the Universe), all_atoms/nb_atoms, nb_frames_xtc,
	f_start/f_end (first/last frame indices to consider), frames_to_process
	(absolute frame indices to analyse) and frames_to_write (per processed frame,
	whether a snapshot should be written). With no xtc supplied, the gro file is
	treated as a single frame.
	"""
	global U
	global all_atoms
	global nb_atoms
	global nb_frames_xtc
	global frames_to_process
	global frames_to_write
	global nb_frames_to_process
	global f_start
	global f_end
	f_start = 0
	if args.xtcfilename == "no":
		#gro file only: a single pseudo-frame, always written
		print "\nLoading file..."
		U = Universe(args.grofilename)
		all_atoms = U.selectAtoms("all")
		nb_atoms = all_atoms.numberOfAtoms()
		nb_frames_xtc = 1
		frames_to_process = [0]
		frames_to_write = [True]
		nb_frames_to_process = 1
		f_end = f_start ## added 28 Sept - ALD - o/w progress update crashes if not running with xtc
	else:
		print "\nLoading trajectory..."
		if args.use_gro:
			#keep a gro-only Universe around as well (used elsewhere when --use_gro is set)
			global U_gro
			U_gro = Universe(args.grofilename)
		U = Universe(args.grofilename, args.xtcfilename)
		U_timestep = U.trajectory.dt
		all_atoms = U.selectAtoms("all")
		nb_atoms = all_atoms.numberOfAtoms()
		nb_frames_xtc = U.trajectory.numframes
		#sanity check: trajectory times are in ps, args.t_start/t_end in ns
		if U.trajectory[nb_frames_xtc-1].time/float(1000) < args.t_start:
			print "Error: the trajectory duration (" + str(U.trajectory.time/float(1000)) + "ns) is shorted than the starting stime specified (" + str(args.t_start) + "ns)."
			sys.exit(1)
		if U.trajectory.numframes < args.frames_dt:
			print "Warning: the trajectory contains fewer frames (" + str(nb_frames_xtc) + ") than the frame step specified (" + str(args.frames_dt) + ")."
		#rewind traj (very important to make sure that later the 1st frame of the xtc will be used for leaflet identification)
		U.trajectory.rewind()
		#create list of index of frames to process: convert -b/-e times (ns) into frame indices
		if args.t_end != -1:
			f_end = int((args.t_end*1000 - U.trajectory[0].time) / float(U_timestep))
			if f_end < 0:
				print "Error: the starting time specified is before the beginning of the xtc."
				sys.exit(1)
		else:
			f_end = nb_frames_xtc - 1
		if args.t_start != -1:
			f_start = int((args.t_start*1000 - U.trajectory[0].time) / float(U_timestep))
			if f_start > f_end:
				print "Error: the starting time specified is after the end of the xtc."
				sys.exit(1)
		#tmp_offset adds one extra frame when the range does not divide evenly by the step
		if (f_end - f_start)%args.frames_dt == 0:
			tmp_offset = 0
		else:
			tmp_offset = 1
		frames_to_process = map(lambda f:f_start + args.frames_dt*f, range(0,(f_end - f_start)//args.frames_dt+tmp_offset))
		nb_frames_to_process = len(frames_to_process)
		#create list of frames to write: every args.frames_write_dt-th processed frame plus the last one
		if args.frames_write_dt == "no":
			frames_to_write = [False for f_index in range(0, nb_frames_to_process)]
		else:
			frames_to_write = [True if (f_index % args.frames_write_dt == 0 or f_index == (f_end - f_start)//args.frames_dt) else False for f_index in range(0, nb_frames_to_process)]
	return
def identify_ff():
	"""Parse the flipflopping-lipids selection file and build the related selections.

	Each line of args.selection_file_ff must be 'resname,resnum,leaflet,bead'
	(see --help, note 4). Populates the lipids_ff_* module-level structures and
	extends leaflet_sele_string so that flipflopping lipids are excluded from
	leaflet identification.
	"""
	print "\nReading selection file for flipflopping lipids..."
	#declare variables
	global lipids_ff_nb
	global lipids_ff_info
	global lipids_ff_resnames
	global lipids_ff_leaflet
	global lipids_ff_u2l_index
	global lipids_ff_l2u_index
	global lipids_sele_ff
	global lipids_sele_ff_bead
	global lipids_sele_ff_bonds
	global lipids_sele_ff_VMD_string
	global leaflet_sele_string
	lipids_ff_nb = 0
	lipids_ff_info = {}			# l_index -> [resname, resnum, starting leaflet, bead name]
	lipids_ff_resnames = []		# distinct resnames encountered
	lipids_ff_leaflet = []		# distinct starting leaflets encountered
	lipids_ff_u2l_index = []	# indices of lipids flipping upper -> lower
	lipids_ff_l2u_index = []	# indices of lipids flipping lower -> upper
	lipids_sele_ff = {}			# l_index -> AtomGroup of the whole lipid
	lipids_sele_ff_bead = {}	# l_index -> AtomGroup of the reference bead
	lipids_sele_ff_bonds = {}	# NOTE(review): declared but not filled here - presumably populated elsewhere
	lipids_sele_ff_VMD_string={}
	with open(args.selection_file_ff) as f:
		lines = f.readlines()
	lipids_ff_nb = len(lines)
	print " -found " + str(lipids_ff_nb) + " flipflopping lipids"
	#exclude all flipflopping lipids from the leaflet selection string
	leaflet_sele_string = leaflet_sele_string + " and not ("
	for l_index in range(0,lipids_ff_nb):
		line = lines[l_index]
		if line[-1] == "\n":
			line = line[:-1]
		#NOTE(review): this bare except also intercepts the sys.exit(1) calls below
		#(SystemExit), so their specific messages are followed by the generic one
		try:
			line_content = line.split(',')
			if len(line_content) != 4:
				print "Error: wrong format for line " + str(l_index+1) + " in " + str(args.selection_file_ff) + ", see note 4 in clustering_prot --help."
				print " ->", line
				sys.exit(1)
			#read current lipid details
			lip_resname = line_content[0]
			lip_resnum = int(line_content[1])
			lip_leaflet = line_content[2]
			lip_bead = line_content[3]
			lipids_ff_info[l_index] = [lip_resname,lip_resnum,lip_leaflet,lip_bead]
			#update: starting leaflets
			if lip_leaflet not in lipids_ff_leaflet:
				lipids_ff_leaflet.append(lip_leaflet)
			#update: index in directional lists
			if lip_leaflet == "upper":
				lipids_ff_u2l_index.append(l_index)
			elif lip_leaflet == "lower":
				lipids_ff_l2u_index.append(l_index)
			else:
				print "->unknown starting leaflet '" + str(lip_leaflet) + "'."
				sys.exit(1)
			#update: resnames
			if lip_resname not in lipids_ff_resnames:
				lipids_ff_resnames.append(lip_resname)
			#update: leaflet selection string
			if l_index==0:
				leaflet_sele_string+="(resname " + str(lip_resname) + " and resnum " + str(lip_resnum) + ")"
			else:
				leaflet_sele_string+=" or (resname " + str(lip_resname) + " and resnum " + str(lip_resnum) + ")"
			#create selections
			lipids_sele_ff[l_index] = U.selectAtoms("resname " + str(lip_resname) + " and resnum " + str(lip_resnum))
			lipids_sele_ff_bead[l_index] = lipids_sele_ff[l_index].selectAtoms("name " + str(lip_bead))
			lipids_sele_ff_VMD_string[l_index]="resname " + str(lipids_ff_info[l_index][0]) + " and resid " + str(lipids_ff_info[l_index][1])
			if lipids_sele_ff[l_index].numberOfAtoms() == 0:
				print "Error:"
				print line
				print "-> no such lipid found."
				sys.exit(1)
		except:
			print "Error: invalid flipflopping lipid selection string on line " + str(l_index+1) + ": '" + line + "'"
			sys.exit(1)
	leaflet_sele_string+=")"
	return
def identify_proteins():
print "\nIdentifying proteins..."
#import modules
if args.m_algorithm == "density":
global DBSCAN
from sklearn.cluster import DBSCAN
else:
global nx
import networkx as nx
#declare variables
global proteins_nb
global proteins_sele
global proteins_sele_forCoG
global proteins_names
global proteins_length
global proteins_colours
global proteins_species
global proteins_residues
global proteins_boundaries
global proteins_multiplicity
global proteins_sele_string_VMD
global pres_oligomers
global nb_atom_per_protein
proteins_nb = {}
proteins_sele = {}
proteins_sele_forCoG = {}
proteins_names = {}
proteins_length = {}
proteins_colours = {}
proteins_species = []
proteins_residues = {}