-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFind_redundant_seq.py
65 lines (53 loc) · 2.05 KB
/
Find_redundant_seq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
########################################################
# Please cite the script by referencing to github #
# repository #
########################################################
import argparse
import os
import subprocess
#import matplotlib.pyplot as pl
parser = argparse.ArgumentParser(description='This script reports identical sequences present in a fasta file. You can set the similarity level by identity_perc. It is based on blat output (psl format)')
parser.add_argument('-fa', '-fasta', help = 'A fasta file')
parser.add_argument('-idp', '-identity_perc', help = 'Identity percent (default=70)', type = int)
parser.add_argument('-dist', '-distribution', help = 'Report distribution of identity levels between sequences (default = True)', default = True, action='store_true')
args = parser.parse_args()
#Empty list to save idetity percents
idp_list = []
# Set identity percentage cut-off if it was not provided by the user
if args.idp is None:
args.idp = 70
print(args.idp)
print(args.dist)
def run_blat(fa_file):
print('Running blat.....')
cmd = 'blat -t=dna -q=dna -noHead ' + args.fa + ' ' + args.fa + ' ' + args.fa + '.psl'
print(cmd)
process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
process.wait()
print('Blat is done')
#Running blat
run_blat(args.fa)
blat_output = args.fa + '.psl'
#Parsing blat output
psl_file = open(blat_output, 'r')
f = open(args.fa + 'gene_trans.txt', 'w')
for line in psl_file:
line = line.strip()
line_list = line.split()
match_length = line_list[0]
query_name = line_list[9]
target_name = line_list[13]
query_length = line_list[10]
if query_name != target_name:
idp_match = int(match_length) / int(query_length)
if args.dist == True:
idp_list.append(idp_match)
if int(idp_match) >= int(args.idp):
f.write(query_name + '\t' + target_name + '\n')
#Print histogram of idp distribution
#if args.dist == True:
# fig = pl.hist(idp_list,normed=0)
# pl.xlabel("Identity percent")
# pl.ylabel("Frequency")
# pl.savefig("abc.png")