#!/usr/bin/env python3
"""
BiG-SCAPE
PI: Marnix Medema [email protected]
Maintainers:
Jorge Navarro [email protected]
Developers:
Jorge Navarro [email protected]
Satria Kautsar [email protected]
Emmanuel (Emzo) de los Santos [email protected]
Usage: Please see `python bigscape.py -h`
Example: python bigscape.py -c 8 --pfam_dir ./ -i ./inputfiles -o ./results
Official repository:
https://github.com/medema-group/BiG-SCAPE
# License: GNU Affero General Public License v3 or later
# A copy of GNU AGPL v3 should have been included in this software package in
# LICENSE.txt.
"""
import pickle # for storing and retrieving dictionaries
import os
import subprocess
import sys
import time
from glob import glob
from itertools import combinations
from itertools import product as combinations_product
from collections import defaultdict
from multiprocessing import Pool, cpu_count, get_context
from argparse import ArgumentParser
from difflib import SequenceMatcher
from operator import itemgetter
import zipfile
from Bio import SeqIO
# from Bio.SeqFeature import BeforePosition, AfterPosition
# from Bio import AlignIO
# from Bio import pairwise2
# from Bio.SubsMat.MatrixInfo import pam250 as scoring_matrix
from Bio import Phylo
from functions import *
from ArrowerSVG import *
import numpy as np
from array import array
from scipy.optimize import linear_sum_assignment
import json
import shutil
from distutils import dir_util
from sklearn.cluster import AffinityPropagation
import networkx as nx
global use_relevant_mibig
global mibig_set
global genbankDict
global valid_classes
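
# NOTE (added for readability of this excerpt): the globals declared above, as well as
# other module-level names used further down (e.g. bgc_fasta_folder, verbose,
# force_hmmscan, mode, output_folder, log_folder, bgc_class_weight, anchor_domains,
# BGCs, DomainList, DomainCountGene, corebiosynthetic_position, BGCGeneOrientation,
# clusterNames, bgcClassNames, bgc_info), appear to be assigned in the __main__
# section of the full script, which is not part of this excerpt.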
def process_gbk_files(gbk, min_bgc_size, bgc_info, files_no_proteins, files_no_biosynthetic_genes):
""" Given a file path to a GenBank file, reads information about the BGC"""
biosynthetic_genes = set()
product_list_per_record = []
save_fasta = True
adding_sequence = False
contig_edge = False
total_seq_length = 0
record_end = 0
offset_record_position = 0
bgc_locus_tags = []
locus_sequences = {}
locus_coordinates = {}
file_folder, fname = os.path.split(gbk)
clusterName = fname[:-4]
# See if we need to keep the sequence
# (Currently) we have to open the file anyway to read all its
# properties for bgc_info anyway...
outputfile = os.path.join(bgc_fasta_folder, clusterName + '.fasta')
if os.path.isfile(outputfile) and os.path.getsize(outputfile) > 0 and not force_hmmscan:
if verbose:
print(" File {} already processed".format(outputfile))
save_fasta = False
else:
save_fasta = True
try:
# basic file verification. Substitutes check_data_integrity
records = list(SeqIO.parse(gbk, "genbank"))
except ValueError as e:
print(" Error with file {}: \n '{}'".format(gbk, str(e)))
print(" (This file will be excluded from the analysis)")
return
else:
total_seq_length = 0
bgc_size = 0
cds_ctr = 0
product = "no type"
offset_record_position = 0
max_width = 0 # This will be used for the SVG figure
record_count = 0
for record in records:
record_count += 1
bgc_size += len(record.seq)
if len(record.seq) > max_width:
max_width = len(record.seq)
for feature in record.features:
# antiSMASH <= 4
if feature.type == "cluster":
if "product" in feature.qualifiers:
# in antiSMASH 4 there should only be 1 product qualifiers
for product in feature.qualifiers["product"]:
for p in product.replace(" ","").split("-"):
product_list_per_record.append(p)
if "contig_edge" in feature.qualifiers:
# there might be mixed contig_edge annotations
# in multi-record files. Turn on contig_edge when
# there's at least one annotation
if feature.qualifiers["contig_edge"][0] == "True":
if verbose:
print(" Contig edge detected in {}".format(fname))
contig_edge = True
# antiSMASH = 5
if "region" in feature.type:
if "product" in feature.qualifiers:
for product in feature.qualifiers["product"]:
product_list_per_record.append(product)
if "contig_edge" in feature.qualifiers:
# there might be mixed contig_edge annotations
# in multi-record files. Turn on contig_edge when
# there's at least one annotation
if feature.qualifiers["contig_edge"][0] == "True":
if verbose:
print(" Contig edge detected in {}".format(fname))
contig_edge = True
# Get biosynthetic genes + sequences
if feature.type == "CDS":
cds_ctr += 1
CDS = feature
gene_id = ""
if "gene" in CDS.qualifiers:
gene_id = CDS.qualifiers.get('gene',"")[0]
protein_id = ""
if "protein_id" in CDS.qualifiers:
protein_id = CDS.qualifiers.get('protein_id',"")[0]
# nofuzzy_start/nofuzzy_end are obsolete
# http://biopython.org/DIST/docs/api/Bio.SeqFeature.FeatureLocation-class.html#nofuzzy_start
gene_start = offset_record_position + max(0, int(CDS.location.start))
gene_end = offset_record_position + max(0, int(CDS.location.end))
record_end = gene_end
direction = CDS.location.strand
if direction == 1:
strand = '+'
else:
strand = '-'
fasta_header = "{}_ORF{}:gid:{}:pid:{}:loc:{}:{}:strand:{}".format(clusterName, str(cds_ctr), str(gene_id).replace(":","_"), str(protein_id).replace(":","_"), str(gene_start), str(gene_end), strand)
fasta_header = fasta_header.replace(">","") #the coordinates might contain larger than signs, tools upstream don't like this
fasta_header = fasta_header.replace(" ", "") #the domtable output format (hmmscan) uses spaces as a delimiter, so these cannot be present in the fasta header
# antiSMASH <=4
if "sec_met" in feature.qualifiers:
if "Kind: biosynthetic" in feature.qualifiers["sec_met"]:
biosynthetic_genes.add(fasta_header)
# antiSMASH == 5
if "gene_kind" in feature.qualifiers:
if "biosynthetic" in feature.qualifiers["gene_kind"]:
biosynthetic_genes.add(fasta_header)
fasta_header = ">"+fasta_header
if 'translation' in CDS.qualifiers.keys():
prot_seq = CDS.qualifiers['translation'][0]
# If translation isn't available translate manually, this will take longer
else:
nt_seq = CDS.location.extract(record.seq)
# If we know sequence is an ORF (like all CDSs), codon table can be
# used to correctly translate alternative start codons.
# see http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc25
# If the sequence has a fuzzy start/end, it might not be complete,
# (therefore it might not be the true start codon)
# However, in this case, if 'translation' not available, assume
# this is just a random sequence
complete_cds = False
# More about fuzzy positions
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc39
fuzzy_start = False
if str(CDS.location.start)[0] in "<>":
complete_cds = False
fuzzy_start = True
fuzzy_end = False
if str(CDS.location.end)[0] in "<>":
fuzzy_end = True
#for protein sequence if it is at the start of the entry assume
# that end of sequence is in frame and trim from the beginning
#if it is at the end of the genbank entry assume that the start
# of the sequence is in frame
reminder = len(nt_seq)%3
if reminder > 0:
if fuzzy_start and fuzzy_end:
print("Warning, CDS ({}, {}) has fuzzy\
start and end positions, and a \
sequence length not multiple of \
three. Skipping".format(clusterName,
CDS.qualifiers.get('locus_tag',"")[0]))
break
if fuzzy_start:
if reminder == 1:
nt_seq = nt_seq[1:]
else:
nt_seq = nt_seq[2:]
# fuzzy end
else:
#same logic reverse direction
if reminder == 1:
nt_seq = nt_seq[:-1]
else:
nt_seq = nt_seq[:-2]
# The Genetic Codes: www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
if "transl_table" in CDS.qualifiers.keys():
CDStable = CDS.qualifiers.get("transl_table", "")[0]
prot_seq = str(nt_seq.translate(table=CDStable, to_stop=True, cds=complete_cds))
else:
prot_seq = str(nt_seq.translate(to_stop=True, cds=complete_cds))
total_seq_length += len(prot_seq)
bgc_locus_tags.append(fasta_header)
locus_sequences[fasta_header] = prot_seq
locus_coordinates[fasta_header] = (gene_start, gene_end, len(prot_seq))
# TODO: if len(biosynthetic_genes) == 0, traverse record again
# and add CDS with genes that contain domains labeled sec_met
# we'll probably have to have a list of domains if we allow
# fasta files as input
# make absolute positions for ORFs in next records
offset_record_position += record_end + 100
if bgc_size > min_bgc_size: # exclude the bgc if it's too small
# check what we have product-wise
# In particular, handle different products for multi-record files
product_set = set(product_list_per_record)
if len(product_set) == 1: # only one type of product
product = product_list_per_record[0]
elif "other" in product_set: # more than one, and it contains "other"
if len(product_set) == 2:
product = list(product_set - {'other'})[0] # product = not "other"
else:
product = ".".join(product_set - {'other'}) # likely a hybrid
else:
product = ".".join(product_set) # likely a hybrid
# Don't keep this bgc if its type not in valid classes specified by user
# This will avoid redundant tasks like domain detection
subproduct = set()
for p in product.split("."):
subproduct.add(sort_bgc(p).lower())
if "nrps" in subproduct and ("pksi" in subproduct or "pksother" in subproduct):
subproduct.add("pks-nrp_hybrids")
if len(valid_classes & subproduct) == 0:
if verbose:
print(" Skipping {} (type: {})".format(clusterName, product))
return False
# assuming that the definition field is the same in all records
# product: antiSMASH predicted class of metabolite
# gbk definition
# number of records (for Arrower figures)
# max_width: width of the largest record (for Arrower figures)
# id: the GenBank's accession
#bgc_info[clusterName] = (product, records[0].description, len(records), max_width, records[0].id, biosynthetic_genes.copy())
# TODO contig_edge annotation is not present for antiSMASH v < 4
# Perhaps we can try to infer if it's in a contig edge: if
# - first biosynthetic gene start < 10kb or
# - max_width - last biosynthetic gene end < 10kb (but this will work only for the largest record)
bgc_info[clusterName] = bgc_data(records[0].id, records[0].description, product, len(records), max_width, bgc_size + (record_count-1)*1000, records[0].annotations["organism"], ",".join(records[0].annotations["taxonomy"]), biosynthetic_genes.copy(), contig_edge)
if len(bgc_info[clusterName].biosynthetic_genes) == 0:
files_no_biosynthetic_genes.append(clusterName+".gbk")
# TODO why re-process everything if it was already in the list?
# if name already in genbankDict.keys -> add file_folder
# else: extract all info
if clusterName in genbankDict.keys():
# Name was already in use. Use file_folder as the new sample's name
genbankDict[clusterName][1].add(file_folder)
else:
# See if we need to write down the sequence
if total_seq_length > 0:
# location of first instance of the file is genbankDict[clustername][0]
genbankDict.setdefault(clusterName, [gbk, set([file_folder])])
if save_fasta:
# Find overlaps in CDS regions and delete the shortest ones.
# This is thought as a solution for selecting genes with
# alternate splicing events
# Food for thought: imagine CDS A overlapping CDS B overlapping
# CDS C. If len(A) > len(B) > len(C) and we first compare A vs B
# and delete A, then B vs C and delete B: would that be a better
# solution than removing B? Could this actually happen?
# TODO What if the overlapping CDS is in the reverse strand?
# maybe it should be kept as it is
# TODO what are the characterized differences in prokarytote
# vs eukaryote CDS overlap?
del_list = set()
for a, b in combinations(bgc_locus_tags, 2):
a_start, a_end, a_len = locus_coordinates[a]
b_start, b_end, b_len = locus_coordinates[b]
if b_end <= a_start or b_start >= a_end:
pass
else:
# calculate overlap
if a_start > b_start:
ov_start = a_start
else:
ov_start = b_start
if a_end < b_end:
ov_end = a_end
else:
ov_end = b_end
overlap_length = ov_end - ov_start
# allow the overlap to be as large as 10% of the
# shortest CDS. Overlap length is in nucleotides
# here, whereas a_len, b_len are protein
# sequence lengths
if overlap_length/3 > 0.1*min(a_len,b_len):
if a_len > b_len:
del_list.add(b)
else:
del_list.add(a)
for locus in del_list:
if verbose:
print(" Removing {} because it overlaps with other ORF".format(locus))
bgc_locus_tags.remove(locus)
with open(outputfile,'w') as fastaHandle:
for locus in bgc_locus_tags:
fastaHandle.write("{}\n".format(locus))
fastaHandle.write("{}\n".format(locus_sequences[locus]))
adding_sequence = True
else:
files_no_proteins.append(fname)
if verbose:
print(" Adding {} ({} bps)".format(fname, str(bgc_size)))
else:
print(" Discarding {} (size less than {} bp, was {})".format(clusterName, str(min_bgc_size), str(bgc_size)))
return adding_sequence
def get_gbk_files(inputpath, outputdir, bgc_fasta_folder, min_bgc_size, include_gbk_str, exclude_gbk_str, bgc_info):
"""Searches given directory for genbank files recursively, will assume that
the genbank files that have the same name are the same genbank file.
Returns a dictionary that contains the names of the clusters found as keys
and a list that contains [0] a path to the genbank file and [1] the
samples that the genbank file is a part of.
Extract and write the sequences as fasta files if not already in the Fasta
folder.
return: {cluster_name:[genbank_path,[s_a,s_b...]]}
"""
file_counter = 0
processed_sequences = 0
files_no_proteins = []
files_no_biosynthetic_genes = []
if os.path.isfile(inputpath):
files = [inputpath]
else:
# Unfortunately, this does not work in Python 2:
#files = glob(os.path.join(inputpath,"**/*.gbk"), recursive=True)
files = [os.path.join(dirpath, f) for dirpath, dirnames, files in os.walk(inputpath)
for f in files if f.endswith(".gbk")]
for filepath in files:
file_folder, fname = os.path.split(filepath)
if len(include_gbk_str) == 1 and include_gbk_str[0] == "*":
pass
else:
if not any([word in fname for word in include_gbk_str]):
continue
if exclude_gbk_str != [] and any([word in fname for word in exclude_gbk_str]):
print(" Skipping file " + fname)
continue
if "_ORF" in fname:
print(" Skipping file {} (string '_ORF' is used internally)".format(fname))
continue
if " " in filepath:
sys.exit("\nError: Input GenBank files should not have spaces in their path as hmmscan cannot process them properly ('too many arguments').")
file_counter += 1
if process_gbk_files(filepath, min_bgc_size, bgc_info, files_no_proteins, files_no_biosynthetic_genes):
processed_sequences += 1
if len(files_no_proteins) > 0:
print(" Warning: Input set has files without protein sequences. They will be discarded")
print(" (See no_sequences_list.txt)")
with open(os.path.join(outputdir, "no_sequences_list.txt"), "w") as noseqs:
for f in sorted(files_no_proteins):
noseqs.write("{}\n".format(f))
if len(files_no_biosynthetic_genes) > 0 and (mode == "glocal" or mode == "auto"):
print(" Warning: Input set has files with no Biosynthetic Genes (affects alignment mode)")
print(" See no_biosynthetic_genes_list.txt")
with open(os.path.join(outputdir, "logs", "no_biosynthetic_genes_list.txt"), "w") as nobiogenes:
for f in sorted(files_no_biosynthetic_genes):
nobiogenes.write("{}\n".format(f))
print("\n Starting with {:d} files".format(file_counter))
print(" Files that had its sequence extracted: {:d}".format(processed_sequences))
return
def timeit(funct):
"""Writes the runtimes of functions to a file and prints them on screen.
Input:
- funct: a function
Output:
- That function its output.
- Runtime of that function in a file called commands.txt and on screen.
"""
def _wrap(*args):
start_time = time.time()
ret = funct(*args)
runtime = time.time()-start_time
runtime_string = '{} took {:.3f} seconds'.format(funct.__name__, runtime)
with open(os.path.join(log_folder, "runtimes.txt"), 'a') as timings_file:
timings_file.write(runtime_string + "\n")
print(runtime_string)
return ret
return _wrap
@timeit
def generate_network(cluster_pairs, cores):
"""Distributes the distance calculation part
cluster_pairs is a list of triads (cluster1_index, cluster2_index, BGC class)
"""
pool = get_context("fork").Pool(cores, maxtasksperchild=100)
#Assigns the data to the different workers and pools the results back into
# the network_matrix variable
network_matrix = pool.map(generate_dist_matrix, cluster_pairs)
# --- Serialized version of distance calculation ---
# For the time being, use this if you have memory issues
#network_matrix = []
#for pair in cluster_pairs:
#network_matrix.append(generate_dist_matrix(pair))
return network_matrix
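
# Illustrative sketch (added for clarity, not part of the original flow): elsewhere in
# the full script the caller is expected to build cluster_pairs as index triads and
# hand them to generate_network, roughly like:
#   pairs = [(i, j, class_idx) for i, j in combinations(range(len(clusterNames)), 2)]
#   network_matrix = generate_network(pairs, cores)
# Each returned row is an array('f') with the fields assembled in generate_dist_matrix
# below. Names such as class_idx and cores here are placeholders, not identifiers
# defined in this excerpt.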
def generate_dist_matrix(parms):
"""Unpack data to actually launch cluster_distance for one pair of BGCs"""
cluster1Idx,cluster2Idx,bgcClassIdx = [int(parm) for parm in parms]
cluster1 = clusterNames[cluster1Idx]
cluster2 = clusterNames[cluster2Idx]
bgc_class = bgcClassNames[bgcClassIdx]
try:
domain_list_A = DomainList[cluster1]
domain_list_B = DomainList[cluster2]
except KeyError:
print(" Warning: domain list for {} or {} was not found. Extracting from pfs files".format(cluster1, cluster2))
cluster_file1 = os.path.join(output_folder, cluster1 + ".pfs")
cluster_file2 = os.path.join(output_folder, cluster2 + ".pfs")
domain_list_A = get_domain_list(cluster_file1)
domain_list_B = get_domain_list(cluster_file2)
# this really shouldn't happen if we've filtered domain-less gene clusters already
if len(domain_list_A) == 0 or len(domain_list_B) == 0:
print(" Warning: Regarding distance between clusters {} and {}:".format(cluster1, cluster2))
if len(domain_list_A) == 0 and len(domain_list_B) == 0:
print(" None have identified domains. Distance cannot be calculated")
elif (domain_list_A) == 0:
print(" Cluster {} has no identified domains. Distance set to 1".format(cluster1))
else:
print(" Cluster {} has no identified domains. Distance set to 1".format(cluster2))
# last two values (S, Sa) should really be zero but this could give rise to errors when parsing
# the network file (unless we catched the case S = Sa = 0
# cluster1Idx, cluster2Idx, distance, jaccard, DSS, AI, rDSSNa, rDSSa,
# S, Sa, lcsStartA, lcsStartB
return array('f',[cluster1Idx,cluster2Idx,1,0,0,0,0,0,1,1,0,0])
# "Domain Count per Gene". List of simple labels (integers) indicating number
# of domains belonging to each gene
dcg_a = DomainCountGene[cluster1]
dcg_b = DomainCountGene[cluster2]
# Position of the anchor genes (i.e. genes with domains in the anchor
# domain list). Should probably be the Core Biosynthetic genes marked by
# antiSMASH
core_pos_a = corebiosynthetic_position[cluster1]
core_pos_b = corebiosynthetic_position[cluster2]
# go = "gene orientation"
go_a = BGCGeneOrientation[cluster1]
go_b = BGCGeneOrientation[cluster2]
dist, jaccard, dss, ai, rDSSna, rDSS, S, Sa, lcsStartA, lcsStartB, seedLength, reverse = cluster_distance_lcs(cluster1, cluster2, domain_list_A,
domain_list_B, dcg_a, dcg_b, core_pos_a, core_pos_b, go_a, go_b, bgc_class)
network_row = array('f',[cluster1Idx, cluster2Idx, dist, (1-dist)**2, jaccard,
dss, ai, rDSSna, rDSS, S, Sa, lcsStartA, lcsStartB, seedLength, reverse])
return network_row
def score_expansion(x_string_, y_string_, downstream):
"""
Input:
x_string: list of strings. This one tries to expand based on max score
y_string: reference list. This was already expanded.
downstream: If true, expansion goes from left to right.
Output:
max_score, a
Where a is length of expansion.
"""
match = 5
mismatch = -3
gap = -2
if downstream:
x_string = x_string_
y_string = y_string_
else:
# Expansion goes upstream. It's easier to flip both slices and proceed
# as if going downstream.
x_string = list(reversed(x_string_))
y_string = list(reversed(y_string_))
# how many gaps to open before calling a mismatch is more convenient?
max_gaps_before_mismatch = abs(int(match/gap))
max_score = 0
score = 0
pos_y = 0
a = 0
b = 0
for pos_x in range(len(x_string)):
g = x_string[pos_x]
# try to find g within the rest of the slice
# This has the obvious problem of what to do if a gene is found _before_
# the current y_slice (translocation). Duplications could also complicate
# things
try:
match_pos = y_string[pos_y:].index(g)
except ValueError:
score += mismatch
else:
score += match + match_pos*gap
pos_y += match_pos + 1 # move pointer one position after match
# Greater or equals. 'equals' because even if max_score isn't
# larger, it's an expansion
if score >= max_score:
max_score = score
# keep track of expansion. Account for zero-based numbering
a = pos_x + 1
#print(pos_x, score, status, g)
#print("")
return max_score, a
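
# Worked example (added for clarity, not in the original source): with match=5,
# mismatch=-3 and gap=-2,
#   score_expansion(["a", "b", "c"], ["a", "x", "b"], downstream=True)
# scores "a" as a direct match (+5), "b" as a match found one position further on
# (+5 - 2), and "c" as a mismatch (-3). The best running score is 8 after two genes,
# so the function returns (8, 2).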
def cluster_distance_lcs(A, B, A_domlist, B_domlist, dcg_A, dcg_b, core_pos_A, core_pos_b, go_A, go_b, bgc_class):
"""Compare two clusters using information on their domains, and the
sequences of the domains.
This version first tries to search for the largest common slices of both BGCs by
finding the Longest Common Substring of genes (based on domain content), also
trying the reverse on one of the BGCs (capital "B" will be used when the 'true'
orientation of the BGC is found)
Then the algorithm will try to expand the slices to either side. Once each
slice are found, they have to pass one more validation: a core biosynthetic
gene (marked in antiSMASH) must be present.
Capital letters indicate the "true" orientation of the BGC (first BGC, 'A'
is kept fixed)
Output:
raw distance, jaccard index, domain sequence similarity, adjacency index,
raw DSS non-anchor, raw DSS anchor, number of non-anchor domains in the pair
(S), number of anchor domains in the pair
"""
Jaccardw, DSSw, AIw, anchorboost = bgc_class_weight[bgc_class]
temp_domain_fastas = {}
# Number of genes in each BGC
lenG_A = len(dcg_A)
lenG_B = len(dcg_b)
setA = set(A_domlist)
setB = set(B_domlist)
intersect = setA & setB
S = 0
S_anchor = 0
# define the subset of domain sequence tags to include in
# the DSS calculation. This is done for every domain.
A_domain_sequence_slice_bottom = defaultdict(int)
A_domain_sequence_slice_top = defaultdict(int)
B_domain_sequence_slice_bottom = defaultdict(int)
B_domain_sequence_slice_top = defaultdict(int)
# Detect totally unrelated pairs from the beginning
if len(intersect) == 0:
# Count total number of anchor and non-anchor domain to report in the
# network file. Apart from that, these BGCs are totally unrelated.
for domain in setA:
# This is a bit of a hack. If pfam domain ids ever change in size
# we'd be in trouble. The previous approach was to .split(".")[0]
# but it's more costly
if domain[:7] in anchor_domains:
S_anchor += len(BGCs[A][domain])
else:
S += len(BGCs[A][domain])
for domain in setB:
if domain[:7] in anchor_domains:
S_anchor += len(BGCs[B][domain])
else:
S += len(BGCs[B][domain])
return 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, S, S_anchor, 0, 0, 0, 0
# initialize domain sequence slices
# They might change if we manage to find a valid overlap
for domain in setA:
A_domain_sequence_slice_bottom[domain] = 0
A_domain_sequence_slice_top[domain] = len(BGCs[A][domain])
for domain in setB:
B_domain_sequence_slice_bottom[domain] = 0
B_domain_sequence_slice_top[domain] = len(BGCs[B][domain])
#initialize domlist borders for AI
domA_start = 0
domA_end = len(A_domlist)
domB_start = 0
domB_end = len(B_domlist)
# always find lcs seed to use for offset alignment in visualization
# Compress the list of domains according to gene information. For example:
# A_domlist = a b c d e f g
# dcg_a = 1 3 1 2 Number of domains per each gene in the BGC
# go_a = 1 -1 -1 1 Orientation of each gene
# A_string = a dcb e fg List of concatenated domains
# Takes into account gene orientation. This works effectively as putting all
# genes in the same direction in order to be able to compare their domain content
A_string = []
start = 0
for g in range(lenG_A):
domain_count = dcg_A[g]
if go_A[g] == 1:
# x[2:] <- small optimization, drop the "PF" from the pfam ids
A_string.append("".join(x[2:] for x in A_domlist[start:start+domain_count]))
else:
A_string.append("".join(A_domlist[x][2:] for x in range(start+domain_count-1, start-1 ,-1)))
start += domain_count
b_string = []
start = 0
for g in range(lenG_B):
domain_count = dcg_b[g]
if go_b[g] == 1:
b_string.append("".join(x[2:] for x in B_domlist[start:start+domain_count]))
else:
b_string.append("".join(B_domlist[x][2:] for x in range(start+domain_count-1, start-1 ,-1)))
start += domain_count
b_string_reverse = list(reversed(b_string))
seqmatch = SequenceMatcher(None, A_string, b_string)
# a: start position in A
# b: start position in B
# s: length of the match
a, b, s = seqmatch.find_longest_match(0, lenG_A, 0, lenG_B)
#print(A, B)
#print(a, b, s)
seqmatch = SequenceMatcher(None, A_string, b_string_reverse)
ar, br, sr = seqmatch.find_longest_match(0, lenG_A, 0, lenG_B)
#print(ar, br, sr)
# We need to keep working with the correct orientation
if s > sr or (s == sr and go_A[a] == go_b[b]):
dcg_B = dcg_b
B_string = b_string
# note: these slices are in terms of genes, not domains (which are
# ultimately what is used for distance)
# Currently, the following values represent the Core Overlap
sliceStartA = a
sliceStartB = b
sliceLengthA = s
sliceLengthB = s
reverse = False
b_name = B
elif s < sr:
dcg_B = list(reversed(dcg_b))
B_string = b_string_reverse
sliceStartA = ar
sliceStartB = br
sliceLengthA = sr
sliceLengthB = sr
# We'll need to know if we're working in reverse so that the start
# postion of the final slice can be transformed to the original orientation
reverse = True
b_name = B + "*"
# special cases: s == sr
# if only one gene matches, choose the one with the most domains
elif s == 1:
seqmatch = SequenceMatcher(None, A_string, b_string)
max_domains = 0
x = 0 # index in A with the gene with most domains
y = 0 # index in B with the gene with most domains
for a, b, z in seqmatch.get_matching_blocks():
if z != 0:
if DomainCountGene[A][a] > max_domains:
x = a
y = b
max_domains = DomainCountGene[A][a]
# note: these slices are in terms of genes, not domains (which are
# ultimately what is used for distance)
# Currently, the following values represent the Core Overlap
sliceStartA = x
sliceLengthA = 1
sliceLengthB = 1
if BGCGeneOrientation[A][x] == BGCGeneOrientation[B][y]:
sliceStartB = y
dcg_B = dcg_b
B_string = b_string
reverse = False
b_name = B
else:
sliceStartB = len(BGCGeneOrientation[B]) - y - 1
dcg_B = list(reversed(dcg_b))
B_string = b_string_reverse
reverse = True
b_name = B + "*"
# s == sr and (s == 0 or s > 1)
else:
dcg_B = dcg_b
B_string = b_string
# note: these slices are in terms of genes, not domains (which are
# ultimately what is used for distance)
# Currently, the following values represent the Core Overlap
sliceStartA = a
sliceStartB = b
sliceLengthA = s
sliceLengthB = s
reverse = False
b_name = B
lcsStartA = sliceStartA
lcsStartB = sliceStartB
seedLength = sliceLengthA
# paint stuff on screen
#if sliceStartB > sliceStartA:
#offset_A = sliceStartB - sliceStartA
#offset_B = 0
#else:
#offset_A = 0
#offset_B = sliceStartA - sliceStartB
#print(A, B)
#print(" "*offset_A + " ".join(map(str,dcg_A[:sliceStartA])) + "[" + " ".join(map(str,dcg_A[sliceStartA:sliceStartA+sliceLengthA])) + "]" + " ".join(map(str,dcg_A[sliceStartA+sliceLengthA:])) + "\t" + A )
#print(" "*offset_B + " ".join(map(str,dcg_B[:sliceStartB])) + "[" + " ".join(map(str,dcg_B[sliceStartB:sliceStartB+sliceLengthB])) + "]" + " ".join(map(str,dcg_B[sliceStartB+sliceLengthB:])) + "\t" + b_name)
##print(sliceStartA, sliceStartB, sliceLengthA)
#print("")
if mode=="glocal" or (mode=="auto" and (bgc_info[A].contig_edge or bgc_info[B].contig_edge)):
#X: bgc that drive expansion
#Y: the other bgc
# forward: True if expansion is to the right
# returns max_score, final positions for X and Y
#score_expansion(X_string, dcg_X, Y_string, dcg_Y, downstream=True/False)
# Expansion is relatively costly. We ask for a minimum of 3 genes
# for the core overlap before proceeding with expansion.
biosynthetic_hit_A = False
for biosynthetic_position in core_pos_A:
if biosynthetic_position >= sliceStartA and biosynthetic_position <= (sliceStartA+sliceLengthA):
biosynthetic_hit_A = True
break
if sliceLengthA >= 3 or biosynthetic_hit_A:
# LEFT SIDE
# Find which bgc has the least genes to the left. If both have the same
# number, find the one that drives the expansion with highest possible score
if sliceStartA == sliceStartB:
# assume complete expansion of A, try to expand B
score_B, sbB = score_expansion(B_string[:sliceStartB], A_string[:sliceStartA], False)
# assume complete expansion of B, try to expand A
score_A, saA = score_expansion(A_string[:sliceStartA], B_string[:sliceStartB], False)
if score_A > score_B or (score_A == score_B and saA > sbB):
sliceLengthA += saA
sliceLengthB += len(B_string[:sliceStartB])
sliceStartA -= saA
sliceStartB = 0
else:
sliceLengthA += len(A_string[:sliceStartA])
sliceLengthB += sbB
sliceStartA = 0
sliceStartB -= sbB
else:
# A is shorter upstream. Assume complete extension. Find B's extension
if sliceStartA < sliceStartB:
score_B, sb = score_expansion(B_string[:sliceStartB], A_string[:sliceStartA], False)
sliceLengthA += len(A_string[:sliceStartA])
sliceLengthB += sb
sliceStartA = 0
sliceStartB -= sb
else:
score_A, sa = score_expansion(A_string[:sliceStartA], B_string[:sliceStartB], False)
sliceLengthA += sa
sliceLengthB += len(B_string[:sliceStartB])
sliceStartA -= sa
sliceStartB = 0
# RIGHT SIDE
# check which side is the shortest downstream. If both BGCs have the same
# length left, choose the one with the best expansion
downstream_A = lenG_A - sliceStartA - sliceLengthA
downstream_B = lenG_B - sliceStartB - sliceLengthB
if downstream_A == downstream_B:
# assume complete extension of A, try to expand B
score_B, xb = score_expansion(B_string[sliceStartB+sliceLengthB:], A_string[sliceStartA+sliceLengthA:], True)
# assume complete extension of B, try to expand A
score_A, xa = score_expansion(A_string[sliceStartA+sliceLengthA:], B_string[sliceStartB+sliceLengthB:], True)
if (score_A == score_B and xa > xb) or score_A > score_B:
sliceLengthA += xa
sliceLengthB += len(B_string[sliceStartB+sliceLengthB:])
else:
sliceLengthA += len(A_string[sliceStartA+sliceLengthA:])
sliceLengthB += xb
else:
if downstream_A < downstream_B:
# extend all of remaining A
score_B, xb = score_expansion(B_string[sliceStartB+sliceLengthB:], A_string[sliceStartA+sliceLengthA:], True)
sliceLengthA += len(A_string[sliceStartA+sliceLengthA:])
sliceLengthB += xb
else:
score_A, xa = score_expansion(A_string[sliceStartA+sliceLengthA:], B_string[sliceStartB+sliceLengthB:], True)
sliceLengthA += xa
sliceLengthB += len(B_string[sliceStartB+sliceLengthB:])
#print(" "*offset_A + " ".join(map(str,dcg_A[:sliceStartA])) + "[" + " ".join(map(str,dcg_A[sliceStartA:sliceStartA+sliceLengthA])) + "]" + " ".join(map(str,dcg_A[sliceStartA+sliceLengthA:])) + "\t" + A )
#print(" "*offset_B + " ".join(map(str,dcg_B[:sliceStartB])) + "[" + " ".join(map(str,dcg_B[sliceStartB:sliceStartB+sliceLengthB])) + "]" + " ".join(map(str,dcg_B[sliceStartB+sliceLengthB:])) + "\t" + b_name)
#print()
#print(A_string)
#print(B_string)
#print()
if min(sliceLengthA, sliceLengthB) >= 5:
# First test passed. Find if there is a biosynthetic gene in both slices
# (note that even if they are, currently we don't check whether it's
# actually the _same_ gene)
biosynthetic_hit_A = False
biosynthetic_hit_B = False
for biosynthetic_position in core_pos_A:
if biosynthetic_position >= sliceStartA and biosynthetic_position <= (sliceStartA+sliceLengthA):
biosynthetic_hit_A = True
break
# return to original orientation if needed
if reverse:
sliceStartB = lenG_B - sliceStartB - sliceLengthB
# using original core_pos_b
for biosynthetic_position in core_pos_b:
if biosynthetic_position >= sliceStartB and biosynthetic_position <= (sliceStartB + sliceLengthB):
biosynthetic_hit_B = True
break
# finally...
if biosynthetic_hit_A and biosynthetic_hit_B:
domA_start = sum(dcg_A[:sliceStartA])
domA_end = domA_start + sum(dcg_A[sliceStartA:sliceStartA+sliceLengthA])
setA = set(A_domlist[domA_start:domA_end])
domB_start = sum(dcg_b[:sliceStartB])
domB_end = domB_start + sum(dcg_b[sliceStartB:sliceStartB+sliceLengthB])
setB = set(B_domlist[domB_start:domB_end])
intersect = setA & setB
# re-adjust the indices for each domain so we get only the sequence
# tags in the selected slice. First step: find out which is the
# first copy of each domain we're using
for domain in A_domlist[:domA_start]:
A_domain_sequence_slice_bottom[domain] += 1
for domain in B_domlist[:domB_start]:
B_domain_sequence_slice_bottom[domain] += 1
# Step 2: work with the last copy of each domain.
# Step 2a: make top = bottom
for domain in setA:
A_domain_sequence_slice_top[domain] = A_domain_sequence_slice_bottom[domain]
for domain in setB:
B_domain_sequence_slice_top[domain] = B_domain_sequence_slice_bottom[domain]
# Step 2b: increase top with the domains in the slice
for domain in A_domlist[domA_start:domA_end]:
A_domain_sequence_slice_top[domain] += 1
for domain in B_domlist[domB_start:domB_end]:
B_domain_sequence_slice_top[domain] += 1
#else:
#print(" - - Not a valid overlap found - - (no biosynthetic genes)\n")