forked from palc/teproj
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_fasta_seqs.py
112 lines (75 loc) · 2.82 KB
/
get_fasta_seqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/local/python/2.7.3/bin/python
from Bio import SeqIO
import argparse
from numpy import random as rand
import sys
# -----------------------------------------------------------------------------
# Command-line interface: one positional FASTA path plus options that select
# which operation main() performs (divide, random sample, append, or
# keep/invert filtering).
parser = argparse.ArgumentParser(description='''
This script parses a multi-line FASTA file and extracts sequences based
on instructions.
AUTHOR: Michelle Hwang''')
parser.add_argument('fasta', help='Name of FASTA db.')
# Path to a text file of sequence names; read by get().
parser.add_argument('-k', '--keep', help='''List of names, 1 per line,
of sequences, to keep.''')
# Boolean flag; get() uses it to drop the listed sequences instead.
parser.add_argument('-i', '--invert', action="store_true", help='''If
specified, remove list of sequences instead of grabbing them.''')
# No type= is given, so the value arrives as a string; divide() converts it
# with int().
parser.add_argument('-d', '--divide', help='''Instead of grabbing/filtering
sequences, divide FASTA file into separate files of x sequences each.''')
# No default is set, so this is None unless -p is passed; divide()
# concatenates it into the output file names.
parser.add_argument('-p', '--prefix', help='''Out prefix for divide flag.''')
parser.add_argument('-r', '--random', type=int,
help='''Randomly sample this number of sequences from the FASTA file.''')
parser.add_argument('-a', '--append', type=str,
help='''Append string to all headers.''')
# Arguments are parsed as soon as the module is loaded; the functions below
# read this module-level ``args`` directly.
args = parser.parse_args()
# NOTE(review): this opens the FASTA file without ever closing the handle,
# and main() builds its own iterator from a second open() call -- this
# module-level iterator looks redundant; confirm before removing.
seqiter = SeqIO.parse(open(args.fasta), 'fasta')
# -----------------------------------------------------------------------------
def append(seqiter):
    """Write every record to stdout with ``args.append`` added to its ID.

    seqiter -- iterable of sequence records, as yielded by SeqIO.parse.

    Reads the module-level ``args``. Each record's ``id`` is modified in
    place and the record is then written in FASTA format.
    """
    suffix = str(args.append)
    for seq in seqiter:
        seq.id = seq.id + suffix
        # Previously this wrote to an undefined name ``out`` (NameError).
        # stdout is where random() and get() send their output as well.
        SeqIO.write(seq, sys.stdout, 'fasta')
def divide(seqiter=None):
    """Split the records into FASTA files of ``args.divide`` records each.

    seqiter -- iterable of sequence records, as yielded by SeqIO.parse.
               When omitted, the records are read from ``args.fasta``.

    Output files are named ``<prefix>-<label>.fa`` where ``<prefix>`` is
    ``args.prefix`` and ``<label>`` starts at 10 and grows by 10 per file.
    The last file holds the remainder when the record count is not a
    multiple of the chunk size. No file is written for empty input.

    Raises ValueError if ``args.prefix`` is missing or the chunk size is
    smaller than 1.
    """
    # The original signature took no arguments although main() passes the
    # iterator, and assigning to ``seqiter`` locally made the name unbound.
    if seqiter is None:
        seqiter = SeqIO.parse(args.fasta, 'fasta')

    if args.prefix is None:
        raise ValueError('--prefix is required when using --divide')
    chunk_size = int(args.divide)
    if chunk_size < 1:
        raise ValueError('--divide must be a positive integer')

    records = list(seqiter)
    label = 10
    for start in range(0, len(records), chunk_size):
        chunk = records[start:start + chunk_size]
        # ``with`` guarantees the file is closed even if writing fails.
        with open(args.prefix + '-' + str(label) + '.fa', 'w') as out:
            SeqIO.write(chunk, out, 'fasta')
        label += 10
def random(seqiter):
    """Write ``args.random`` randomly chosen records to stdout as FASTA.

    seqiter -- iterable of sequence records, as yielded by SeqIO.parse.

    All records are loaded into memory before sampling. Sampling is without
    replacement, so ``random.sample`` raises ValueError when more records
    are requested than the input contains.
    """
    # At module scope the name ``random`` is this very function, so the
    # original ``random.sample`` lookup failed with AttributeError. Import
    # the standard-library module under a private alias instead.
    import random as _stdlib_random

    n = int(args.random)
    seqs = list(seqiter)
    SeqIO.write(_stdlib_random.sample(seqs, n), sys.stdout, 'fasta')
def get(seqiter):
    """Keep (or, with ``args.invert``, drop) the records named in ``args.keep``.

    seqiter -- iterable of sequence records, as yielded by SeqIO.parse.

    ``args.keep`` is a text file with one sequence name per line; each line
    is stripped of surrounding whitespace. Matching records are written to
    stdout in FASTA format.

    Raises ValueError when no keep-list file was given.
    """
    if args.keep is None:
        raise ValueError('--keep is required when no other mode is selected')

    # A set gives constant-time membership tests; ``with`` closes the file.
    with open(args.keep) as handle:
        wanted = set(line.strip() for line in handle)

    if args.invert:
        # NOTE(review): the invert branch only compares the part of the ID
        # before the first space, while the keep branch below also accepts
        # the part before the first '#'. Behaviour is preserved as found;
        # confirm whether the two branches should be exact complements.
        SeqIO.write((seq for seq in seqiter
                     if seq.id.split(' ')[0] not in wanted),
                    sys.stdout, 'fasta')
    else:
        for seq in seqiter:
            if (seq.id.split(' ')[0] in wanted
                    or seq.id.split('#')[0] in wanted):
                SeqIO.write(seq, sys.stdout, 'fasta')
# -----------------------------------------------------------------------------
def main():
    """Run the operation selected by the command-line arguments.

    The first option that was supplied wins, checked in this order:
    ``--divide``, ``--random``, ``--append``; otherwise the keep/invert
    filter runs. The FASTA file named by ``args.fasta`` is opened here and
    closed once the selected operation finishes.
    """
    # ``with`` closes the input handle, which the original left open.
    with open(args.fasta) as handle:
        seqiter = SeqIO.parse(handle, 'fasta')

        ## Divide sequences
        if args.divide is not None:
            divide(seqiter)

        ## Randomly sample sequences
        elif args.random is not None:
            random(seqiter)

        ## Append a string to every header
        elif args.append is not None:
            append(seqiter)

        ## Grab or filter sequences
        else:
            get(seqiter)


if __name__ == '__main__':
    main()