-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path2.1_batch_run_CyDotian_exact_match.py
712 lines (640 loc) · 47.4 KB
/
2.1_batch_run_CyDotian_exact_match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
import os, re, time, argparse, codecs, sys, subprocess, signal
import pandas as pd
# Script entry: configure stdout, parse the command line and resolve the
# input/output folders.
# NOTE(review): leading indentation appears to have been stripped from this
# copy of the file; the statements after `try:` belong to its body.

# Re-wrap the detached stdout buffer in a UTF-8 writer so everything printed
# is encoded as UTF-8.
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
# All three command-line options are required.
parser = argparse.ArgumentParser(description="Batch run CyDotian's modules")
parser.add_argument('-c', '--inputConfigFilePath', metavar='config', required=True, help='Input config file path')
parser.add_argument('-s', '--inputSequenceFolderPath', metavar='sequence', required=True, help='Input sequence folder path')
parser.add_argument('-o', '--outputFolderPath', metavar='output', required=True, help='Output folder path')
args = parser.parse_args()
# Wall-clock start, used for the running-time report printed at the end.
startTime = time.time()
# Overall run status: 0 means success, -1 means a failure was detected.
state = 0
# The rest of the run is guarded by this try; the matching `except Exception`
# handler sets state to -1 and prints the error.
try:
# Every entry of the input folder is later opened as a FASTA file.
importFolder = args.inputSequenceFolderPath
importFileList = os.listdir(importFolder)
importFolderPath = os.path.abspath(importFolder)
exportFolder = args.outputFolderPath
def mkdir(path):
    """Create directory *path* (and any missing parents) if it is not there yet.

    Calling it on an existing directory is a no-op.  ``exist_ok=True`` is used
    instead of a separate ``os.path.exists`` check so there is no window in
    which another process can create the directory between the check and the
    creation.
    """
    os.makedirs(path, exist_ok=True)
def _readConfig(configPath, fileType, modeList, repeatLen):
    """Read the run parameters from the config file.

    The three trailing arguments are the defaults; each is replaced when the
    config file has a matching ``name=value`` line.  Lines starting with ``#``
    and lines without ``=`` are ignored.  The value ends at the first ``#``
    after the ``=`` (inline comment); a line without an inline comment is
    accepted as well and its value runs to the end of the line.

    Returns the tuple ``(fileType, modeList, repeatLen)``.
    Raises ``ValueError`` when ``repeatLen`` is not an integer.
    """
    with open(configPath, 'r', encoding='utf-8') as configFile:
        print('=====================')
        for row in configFile:
            if row.startswith('#') or '=' not in row:
                continue
            parameterName, _, remainder = row.strip('\n').partition('=')
            matchStr = remainder.split('#', 1)[0]
            # Echo the raw setting exactly as written in the file.
            print(parameterName, matchStr, sep='=')
            # Drop every whitespace character (spaces, tabs, a stray '\r').
            parameter = ''.join(matchStr.split())
            if row.startswith('fileType'):
                # Any other value keeps the current file type.
                if '0' == parameter:
                    fileType = 'DNA'
                elif '1' == parameter:
                    fileType = 'Amino acid'
            elif row.startswith('mode'):
                modeList = parameter.split(',')
            elif row.startswith('repeatLen'):
                repeatLen = int(parameter)
    print('=====================\n')
    return fileType, modeList, repeatLen


mkdir(exportFolder)
exportFolderPath = os.path.abspath(exportFolder)
# Defaults: DNA input, direct + inverted modes, repeats of length >= 4.
fileType, modeList, repeatLen = _readConfig(args.inputConfigFilePath, 'DNA', ['0', '1'], 4)
# Scanner executable and substitution-matrix argument for each sequence type.
_SCANNERS = {
    'DNA': ('./bpRepeatScan', 0),
    'Amino acid': ('./aaRepeatScan', 1),
}

# One entry per scan mode:
# (mode flag, label used in output file names, sequence types the mode is run
#  for, whether End2 is computed as Start2 - Length + 1 instead of
#  Start2 + Length - 1).
_SCAN_MODES = (
    ('0', 'direct', ('DNA', 'Amino acid'), False),
    ('1', 'inverted', ('DNA', 'Amino acid'), True),
    ('2', 'reverse_complement', ('DNA',), True),
)

# Files exchanged with the scanner through the current working directory:
# the two inputs this script writes, then the result file it reads back.
_SCAN_WORK_FILES = ('temp.single.input.fasta1.txt', 'temp.single.input.fasta2.txt', 'position.txt')


def _removeScanFiles():
    """Delete the scanner's working files, reporting any that are missing."""
    for workFile in _SCAN_WORK_FILES:
        if os.path.exists(workFile):
            os.remove(workFile)
        else:
            print("'{}' does not exist!".format(workFile))


def _recordScanFailure(process, program, name, length, stdoutput, erroutput, failFile, failLogFile):
    """Report a non-zero scanner exit on stdout and in both failure files."""
    print(name, str(length), 'errOutput: ', erroutput.strip('\n'), sep=', ', end='***\n')
    print(name, str(length), 'stdOutput: ', stdoutput.strip('\n'), sep=', ', end='***\n')
    if process.returncode < 0:
        # Popen reports a negative return code when the child died from a signal.
        print('The process calling the {} program was killed by the system!'.format(
            os.path.basename(program)))
    print('chenhuilong1\n')
    failFile.write(name + '\t' + str(length) + '\n')
    print('***', file=failLogFile)
    print(process.returncode, name, str(length), sep='\t', file=failLogFile)
    print('errOutput: ', erroutput.strip('\n'), file=failLogFile)
    print('stdOutput: ', stdoutput.strip('\n'), end='\n***\n', file=failLogFile)
    # Kill whatever may still be alive in the child's process group.
    try:
        os.killpg(process.pid, signal.SIGKILL)
    except ProcessLookupError:
        pass  # Nothing left to kill: the expected case.
    except OSError as e:
        print(e)


def _formatPosition(row, secondRunsBackwards):
    """Format one scanner row as 'Start1 End1 Start2 End2 Length' (tab separated)."""
    start1, start2, span = row[0], row[1], row[2]
    if secondRunsBackwards:
        end2 = start2 - span + 1
    else:
        end2 = start2 + span - 1
    fields = (start1, start1 + span - 1, start2, end2, span)
    return '\t'.join(str(int(field)) for field in fields) + '\n'


def _scanOneMode(name, seq, length, program, matrix, modeFlag, label, secondRunsBackwards,
                 repeatLen, positionsDir, originalDir, namesLog, failFile, failLogFile):
    """Run the scanner on one sequence in one mode and export its positions.

    Returns False when the scanner exited with a non-zero status; the caller
    must then skip the remaining modes of this sequence.  Returns True
    otherwise, including when reading or exporting ``position.txt`` failed
    (that failure is logged and the partial position files are removed).
    """
    # The scanner takes its two input sequences from fixed file names.
    for inputName in _SCAN_WORK_FILES[:2]:
        with open('./' + inputName, 'w', encoding='utf-8') as inputFile:
            inputFile.write(seq)

    # Arguments: identity/similarity threshold, mode, matrix, exact-match flag.
    command = '{} {} {} {} {}'.format(program, 100.0, modeFlag, matrix, 0)
    process = subprocess.Popen(command, universal_newlines=True, stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True,
                               start_new_session=True)
    # communicate() returns only after the child process has finished.
    stdoutput, erroutput = process.communicate()
    print("mode {}, {}, process.returncode: {}".format(modeFlag, name, process.returncode))
    if process.returncode:
        _recordScanFailure(process, program, name, length, stdoutput, erroutput, failFile, failLogFile)
        return False

    filteredPath = positionsDir + '/' + name + '_positions_' + label + '.txt'
    unfilteredPath = originalDir + '/' + name + '_positions_' + label + '.txt'
    # NOTE(review): the original author observed that for very large sequences
    # position.txt can be incomplete even after a zero exit status, which is
    # why the whole read/export step is treated as fallible.
    try:
        posTable = pd.read_csv('./position.txt', encoding='utf-8', sep='\t', header=None)
        posTable = posTable.sort_values(by=2, ascending=False).reset_index(drop=True)
        rows = posTable.values
        # Columns: 0 = Start1, 1 = Start2, 2 = Length.
        if any(row[1] > row[0] and row[2] >= repeatLen for row in rows):
            namesLog.write(name + '\n')
        with open(filteredPath, 'w', encoding='utf-8') as filteredFile, \
                open(unfilteredPath, 'w', encoding='utf-8') as unfilteredFile:
            for row in rows:
                if row[2] >= repeatLen:
                    line = _formatPosition(row, secondRunsBackwards)
                    # 'positions_original' keeps every long-enough repeat;
                    # 'positions' only those with Start2 > Start1.
                    unfilteredFile.write(line)
                    if row[1] > row[0]:
                        filteredFile.write(line)
    except Exception as e:
        print(name, str(length), e, e.__traceback__.tb_lineno, sep='***')
        print('chenhuilong2\n')
        failFile.write(name + '\t' + str(length) + '\n')
        print(name, str(length), e, e.__traceback__.tb_lineno, sep='***', file=failLogFile)
        for stalePath in (filteredPath, unfilteredPath):
            if os.path.exists(stalePath):
                os.remove(stalePath)
    return True


def batchExportPosition(chlFasta, exportFolderPositionPath, fileType, modeList, repeatLen):
    """Scan every sequence for exact repeats and export the repeat positions.

    Parameters:
        chlFasta: mapping of sequence name -> sequence string.
        exportFolderPositionPath: output folder; the sub-folders 'positions'
            and 'positions_original' are created inside it.
        fileType: 'DNA' (runs ./bpRepeatScan) or 'Amino acid' (runs
            ./aaRepeatScan); any other value runs no scanner.
        modeList: collection of mode flags: '0' direct, '1' inverted,
            '2' reverse complement (DNA only).
        repeatLen: minimum repeat length that is exported.

    Files written:
        positions/log.txt, positions/failure_sequences.txt,
        positions/failure_sequences_error_log.txt,
        positions/sequence_names_<mode>.txt for each requested mode,
        positions_original/all_sequences_length.txt and, per sequence and
        mode, <name>_positions_<mode>.txt in both sub-folders.

    The scanner is started from, and exchanges its working files through, the
    current working directory.  A sequence whose scanner run exits non-zero is
    recorded as failed and its remaining modes are skipped.  The working files
    are removed once after every sequence.  Ordinary exceptions are logged per
    sequence; KeyboardInterrupt/SystemExit propagate after cleanup.
    """
    from contextlib import ExitStack

    customFolderPath = exportFolderPositionPath + '/positions'
    customFolderPathOriginal = exportFolderPositionPath + '/positions_original'
    for folderPath in (customFolderPath, customFolderPathOriginal):
        os.makedirs(folderPath, mode=0o777, exist_ok=True)

    scanner = _SCANNERS.get(fileType)

    # ExitStack closes every output file, also when an exception escapes.
    with ExitStack() as stack:
        def openOutput(path):
            return stack.enter_context(open(path, 'w', encoding='utf-8'))

        logFile = openOutput(customFolderPath + '/log.txt')
        if scanner is not None:
            logFile.write('# ' + fileType + '\n')
            logFile.write('# parameter: RepeatLen >= ' + str(repeatLen) + '\n')
            logFile.write('# header of positions.txt: Start1 End1 Start2 End2 Length' + '\n')
        logFile.write('# Sequence names containing repeats larger than the RepeatLen threshold are recorded in file '
                      'sequence_names_direct.txt, sequence_names_inverted.txt or sequence_names_reverse_complement.txt.'
                      ' The names of sequences that failed to be analysed are recorded in file failure_sequences.txt.\n')
        logFile.write('Time: ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

        failFile = openOutput(customFolderPath + '/failure_sequences.txt')
        failLogFile = openOutput(customFolderPath + '/failure_sequences_error_log.txt')
        namesLogs = {
            modeFlag: openOutput(customFolderPath + '/sequence_names_' + label + '.txt')
            for modeFlag, label, _seqTypes, _backwards in _SCAN_MODES
            if modeFlag in modeList
        }
        seqLengthFileOriginal = openOutput(customFolderPathOriginal + '/all_sequences_length.txt')

        for name, seq in chlFasta.items():
            try:
                length = len(seq)
                seqLengthFileOriginal.write(name + '\t' + str(length) + '\n')
                for modeFlag, label, seqTypes, secondRunsBackwards in _SCAN_MODES:
                    if modeFlag not in modeList or fileType not in seqTypes:
                        continue
                    program, matrix = scanner
                    finished = _scanOneMode(
                        name, seq, length, program, matrix, modeFlag, label, secondRunsBackwards,
                        repeatLen, customFolderPath, customFolderPathOriginal,
                        namesLogs[modeFlag], failFile, failLogFile)
                    if not finished:
                        break  # Scanner failed: skip this sequence's other modes.
            except Exception as e:
                print(name, str(length), e, e.__traceback__.tb_lineno, sep='***')
                print('chenhuilong3\n')
                failFile.write(name + '\t' + str(length) + '\n')
                print(name, str(length), e, e.__traceback__.tb_lineno, sep='***', file=failLogFile)
            finally:
                # Exactly one cleanup per sequence, whatever path was taken.
                _removeScanFiles()
def _readFasta(path):
    """Parse a FASTA file into a dict of header (without '>') -> upper-case sequence.

    A record is kept only when both its header and its sequence are non-empty;
    the same rule is applied to the last record of the file, so an empty file
    yields an empty dict rather than a record with an empty name.
    """
    records = {}
    gene, pieces = '', []

    def flush():
        seq = ''.join(pieces)
        if gene != '' and seq != '':
            records[gene] = seq.upper()

    with open(path, 'r', encoding='utf-8') as fastaFile:
        for row in fastaFile:
            row = row.strip('\n')
            if row.startswith('>'):
                flush()
                gene = row.replace('>', '')
                pieces = []
            else:
                pieces.append(row)
    flush()
    return records


def _positionNames(folderPath):
    """Return the set of sequence names that have a position file in *folderPath*."""
    return {fileName.partition('_positions_')[0]
            for fileName in os.listdir(folderPath) if '_positions_' in fileName}


def _auditPositionFolder(folderPath, folderLabel, failNames):
    """Delete position files of failed sequences from *folderPath*.

    Returns ``(namesBefore, namesAfter)``: the sequence names that had a
    position file before and after the deletion.  Files without
    '_positions_' in their name (logs, length table) are ignored.
    """
    namesBefore = set()
    for fileName in os.listdir(folderPath):
        if '_positions_' not in fileName:
            continue
        name = fileName.partition('_positions_')[0]
        namesBefore.add(name)
        if name in failNames:
            print("For '{}' folder, This '{}' is in 'failure_sequences.txt'. it's illogical!".format(
                folderLabel, name))
            os.remove(folderPath + '/' + fileName)
    return namesBefore, _positionNames(folderPath)


def _printAuditSummary(folderLabel, totalNumber, failNumber, successNumber, successNumberAfterDelete):
    """Print the success/failure counts of one output folder."""
    print("For '{}',".format(folderLabel))
    print("This fasta file with {} sequences has a total of {} failures and {} successes this time!".format(
        totalNumber, failNumber, successNumber))
    print(
        "If a successful ID appears in 'failure_sequences.txt', the number result after deleting the position file corresponding to this ID is: \n "
        "This fasta file with {} sequences has a total of {} failures and {} successes this time!".format(
            totalNumber, failNumber, successNumberAfterDelete))


for fileName1 in importFileList:
    chlFasta = _readFasta(os.path.join(importFolderPath, fileName1))
    exportFolderPositionPath = exportFolderPath + '/' + fileName1
    batchExportPosition(chlFasta, exportFolderPositionPath, fileType, modeList, repeatLen)

    # Consistency check: a sequence listed as failed must not keep a position file.
    totalNumber = len(chlFasta)
    customFolderPath = exportFolderPositionPath + '/positions'
    customFolderPathOriginal = exportFolderPositionPath + '/positions_original'

    # A name may be listed once per failed mode; the set counts it only once.
    failNames = set()
    with open(customFolderPath + '/failure_sequences.txt', 'r', encoding='utf-8') as failFile:
        for line in failFile:
            print(line.strip('\n'))
            failNames.add(line.strip('\n').split('\t')[0])
    failNumber = len(failNames)

    successNames, successNamesAfterDelete = _auditPositionFolder(customFolderPath, 'positions', failNames)
    successNumber = len(successNames)
    _printAuditSummary('positions', totalNumber, failNumber, successNumber, len(successNamesAfterDelete))

    successNamesOriginal, successNamesAfterDeleteOriginal = _auditPositionFolder(
        customFolderPathOriginal, 'positions_original', failNames)
    successNumberOriginal = len(successNamesOriginal)
    _printAuditSummary('positions_original', totalNumber, failNumber, successNumberOriginal,
                       len(successNamesAfterDeleteOriginal))

    countsAgree = (totalNumber == failNumber + successNumber
                   and totalNumber == failNumber + successNumberOriginal)
    if countsAgree:
        print(
            "For 'positions' and 'positions_original', the sum of this number of successes and failures is correct!")
    else:
        state = -1

    # Failed and successful names must not overlap and must together be
    # exactly the set of input sequence names, in both output folders.
    allNames = set(chlFasta)
    idsAgree = all(
        not (failNames & names) and (failNames | names) == allNames
        for names in (successNames, successNamesOriginal))
    if idsAgree and countsAgree:
        print(
            "For 'positions' and 'positions_original', the sum of this number of successes and failures and and the ID is correct!\n")
    else:
        state = -1
# Handler for the try that wraps the whole run: any Exception marks the run
# as failed and is printed together with a traceback line number.
except Exception as e:
state = -1
print(e, e.__traceback__.tb_lineno)
# Final status message: state is 0 on success, -1 when a failure was detected.
if 0 == state:
print('Congratulations, the script worked and finished successfully!')
elif -1 == state:
print('Sadly, the script did not complete properly, please check the output log, resolve the problem, and try again!')
# Report the elapsed wall-clock time, rounded to whole seconds and split into
# hours, minutes and seconds.
endTime = time.time()
runTime = round(endTime - startTime)
hour = runTime//3600
minute = (runTime-3600*hour)//60
second = runTime-3600*hour-60*minute
print(f'The program running time: {hour}hour(s) {minute}minute(s) {second}second(s).')
# created by Huilong Chen, May 21 2022!
# revised by Huilong Chen, May 27 2022!
# revised by Huilong Chen, July 27, 2022! A new CyDotian -> seqAlignToolkit.
# revised by Huilong Chen, August 1, 2022! Optimize.
# revised by Huilong Chen, January 5, 2025! Optimize.