-
Notifications
You must be signed in to change notification settings - Fork 5
/
StrainScan_build.py
162 lines (140 loc) · 6.87 KB
/
StrainScan_build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import re
import os
import sys
import shutil
import logging
import argparse
sys.path.append('library')
from library import Cluster,Unique_kmer_detect_direct,Build_kmer_sets_unique_region_lasso_test_allinone_sp,select_rep,Build_overlap_matrix_sp,Build_overlap_matrix_sp_jellyfish,Build_tree,Build_tree_mem
import time
import psutil
from collections import defaultdict
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG)
class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
argparse.RawDescriptionHelpFormatter):
pass
__author__ = "Liao Herui, Ji Yongxin - PhD of City University of HongKong"
usage = "StrainScan - A kmer-based strain-level identification tool."
def merge_cls(dc_in):
dc_out=defaultdict(lambda:{})
for e in dc_in:
for e2 in dc_in[e]:
dc_out['C'][e2]=''
return dict(dc_out)
def manual(icf, fa_dir):
dn = {} # pre -> full file dir
for filename in os.listdir(fa_dir):
pre = re.split('\.',filename)[0]
dn[pre] = os.path.join(fa_dir, filename)
f = open(icf,'r')
dc95_l2 = defaultdict(lambda:{})
while True:
line = f.readline().strip()
if not line:break
ele = line.split('\t')
st = re.split(',',ele[-1])
for s in st:
dc95_l2[int(ele[0])][dn[s]] = ''
return dc95_l2
def main():
pwd = os.getcwd()
print(pwd)
# Get para
parser=argparse.ArgumentParser(prog='StrainScan_build.py',
description=usage,
formatter_class=CustomFormatter)
parser.add_argument('-i', '--input_fasta', dest='input_fa', type=str, required=True,
help="The dir of input fasta genome --- Required")
parser.add_argument('-o', '--output_dir', dest='out_dir', type=str,
default=os.path.join(os.getcwd(), 'StrainScan_DB'),
help='Output directory.')
parser.add_argument('-c', '--cls_file', dest='cls_custom_file', type=str,
default='',
help='The custom clustering file provided by user.')
parser.add_argument('-k', '--kmer_size', dest='ksize', type=int, default=31,
help='The size of kmer, should be odd number.')
parser.add_argument('-t', '--threads', dest='threads', type=int, default=1,
help='The threads used to build the database.')
parser.add_argument('-u', '--uk_num', dest='uknum', type=int, default=100000,
help='The maximum number of unique k-mers in each genome to extract.')
parser.add_argument('-g', '--gk_ratio', dest='gkratio', type=float, default=1.0,
help='The ratio of group-specific k-mers to extract.')
parser.add_argument('-m', '--strainest_sample', dest='mas', type=int, default=0,
help='If this parameter is 1, then the program will search'
' joint kmer sets from msa generated by Strainest.')
parser.add_argument('-e', '--memory_efficient', dest='mem', type=int, default=0,
help='If this parameter is 1, then the program will build'
' the database with the memory efficient mode.')
parser.add_argument('-n', '--mink_cutoff', dest='mink', type=int, default=1000,
help='Minimum k-mer number cutoff in a node.')
parser.add_argument('-x', '--maxk_cutoff', dest='maxk', type=int, default=30000,
help='Maximum k-mer number cutoff in a node')
parser.add_argument('-r', '--maxn_cutoff', dest='maxn', type=int, default=3000,
help='Maximum cluster number for node reconstruction.')
args = parser.parse_args()
out_dir = args.out_dir
params = [0.8, args.mink, args.maxk, args.maxn]
#icf=args.icf
'''
tid=args.tid
if not tid:
tid='1747'
'''
cls_res = os.path.join(out_dir, 'Cluster_Result')
kmer_sets_l2 = os.path.join(out_dir, 'Kmer_Sets_L2')
os.makedirs(out_dir, exist_ok=True)
os.makedirs(cls_res, exist_ok=True)
os.makedirs(kmer_sets_l2, exist_ok=True)
# Construct matrix with dashing (jaccard index)
logging.info('Constructing matrix with dashing (jaccard index)')
matrix = Cluster.construct_matrix(args.input_fa)
# -------- Hirarchical clustering Part --------
#### Default: Single: 0.95, Complete: 0.95
# ---------------------------------------------
logging.info('Hierarchical clustering')
dc95 = Cluster.hcls(matrix, 'single', '0.05')
os.system('mv hclsMap_* distance_matrix_rebuild.txt distance_matrix.txt '+cls_res)
if not args.cls_custom_file=='':
shutil.copy(args.cls_custom_file,os.path.join(cls_res, 'hclsMap_95.txt'))
#exit()
cls_file = os.path.join(cls_res, 'hclsMap_95.txt')
dc95_rep,dc95_l2 = select_rep.pick_rep(os.path.join(cls_res, 'distance_matrix_rebuild.txt'),
cls_file, cls_res)
# Construct the tree
logging.info('Constructing the tree')
os.makedirs(os.path.join(out_dir, 'Tree_database', 'test'), exist_ok=True)
os.makedirs(os.path.join(out_dir, 'Tree_database', 'nodes_kmer'), exist_ok=True)
os.makedirs(os.path.join(out_dir, 'Tree_database', 'overlap'), exist_ok=True)
if args.mem == 0:
Build_tree.build_tree([os.path.join(cls_res, 'distance_matrix.txt'),
os.path.join(cls_res, 'hclsMap_95_recls.txt'),
os.path.join(out_dir, 'Tree_database'),
31, params])
else:
om=open(out_dir+'/Memory_DB','w+')
om.close()
Build_tree_mem.build_tree([os.path.join(cls_res, 'distance_matrix.txt'),
os.path.join(cls_res, 'hclsMap_95_recls.txt'),
os.path.join(out_dir, 'Tree_database'),
31, params])
icf = os.path.join(out_dir, 'Tree_database', 'hclsMap_95_recls.txt')
shutil.copy(os.path.join(out_dir, 'Tree_database', 'hclsMap_95_recls.txt'), cls_res)
dc95_l2 = manual(icf, args.input_fa)
# Delete temp dir
shutil.rmtree(os.path.join(out_dir, 'Tree_database', 'test'))
# --------- Inside cluster strains kmer sets construction -------
logging.info('Inside cluster strains kmer sets construction')
Build_kmer_sets_unique_region_lasso_test_allinone_sp.build_kmer_sets(dc95_l2,kmer_sets_l2, args.ksize,
args.uknum, args.gkratio, args.mas, args.threads)
print(str(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) + ' - StrainScan::build_DB:: Build sparse matices done '+u'- Current Memory Usage: %.4f GB' % (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024) )
# --------- Build Overlap matrix -----------
logging.info('Building Overlap matrix')
new_cls_file = os.path.join(out_dir, 'Tree_database', 'hclsMap_95_recls.txt')
if args.mem == 0:
Build_overlap_matrix_sp.build_omatrix(args.input_fa, new_cls_file, os.path.join(kmer_sets_l2, 'Kmer_Sets'), args.threads)
else:
Build_overlap_matrix_sp_jellyfish.build_omatrix(args.input_fa, new_cls_file, os.path.join(kmer_sets_l2, 'Kmer_Sets'),args.threads)
# --------- Delete temp dir ---------
shutil.rmtree(os.path.join(kmer_sets_l2, 'Colinear_Block'))
if __name__=='__main__':
main()