-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_reads_w_blat_hit_pe.py
executable file
·116 lines (90 loc) · 3.12 KB
/
get_reads_w_blat_hit_pe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python
import argparse
import glob
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
import re
import os, errno
# =====================================================
#
# This script goes through BLAT psl files (without headers, though probably will work if they're there too)
# It takes the query names that have blat hits, and puts those into a list.
# It then goes to the original read files and pulls the reads with those names.
# As it's written now, it assumes there are R1 and R2 files for forward and reverse reads, and a singletons
# file for unpaired reads.
#
# Matt Gitzendanner
# University of Florida
# 2/19/14
# 4/19/21 - Updated for Python3
#
# =====================================================
#####################
# Parse commandline options.
# -b blat filename name-- without the R1.psl, R2.psl, singletons.psl
# -r Directory with reads files-- R1, R2, singletons
# -o Output path
#####################
parser = argparse.ArgumentParser()
parser.add_argument("-b", help="blat name-- without the R1.psl, R2.psl, singletons.psl")
parser.add_argument("-r", help="Directory with reads files-- R1, R2, singletons")
parser.add_argument("-o", help="Output path")
args = parser.parse_args()
blat_name= args.b
reads_dir=args.r
out_dir=args.o
#Function to go through a psl file and add the hits to an array.
# Input: psl file path
def add_to_list(psl_file):
try:
BLAT=open(psl_file, 'r')
except IOError:
print (f"Can't open file: {psl_file}")
for Line in BLAT :
Line = Line.strip('\n')
Line_bits=re.split('\t',Line)
read_name=Line_bits[9]
if not (read_name in read_list): #add the scaffold to the list for the taxon.
read_list.append(read_name)
#Function to go through a reads file and pull reads that are in a list.
#Input: Reads file path and output file path
def get_reads(reads_file,out_file):
try:
READS=open(reads_file, 'r')
except IOError:
print (f"Can't open file: {reads_file}")
try:
OUT=open(out_file, 'w')
except IOError:
print ("Can't open file: {out_file}")
for record in SeqIO.parse(READS, "fastq") :
if record.id in read_list:
SeqIO.write(record, OUT, "fastq")
#Make the output directory if it's not already made.
try:
os.makedirs(out_dir)
except OSError as err:
# Reraise the error unless it's about an already existing directory
if err.errno != errno.EEXIST or not os.path.isdir(out_dir):
raise
read_list=[] #List for the reads that need to be pulled
#Run add_to_list on the 3 psl files.
blat_file=blat_name + "R1.psl"
add_to_list(blat_file)
blat_file=blat_name + "R2.psl"
add_to_list(blat_file)
blat_file=blat_name + "singletons.psl"
add_to_list(blat_file)
#Run get_reads on the 3 original reads files.
file=os.path.basename(blat_name)
reads_file= reads_dir + "/" + file + "R1.fq"
out_file= out_dir + "/" + file + "R1.fq"
get_reads(reads_file, out_file)
reads_file= reads_dir + "/" + file + "R2.fq"
out_file= out_dir + "/" + file + "R2.fq"
get_reads(reads_file, out_file)
reads_file= reads_dir + "/" + file + "singletons.fq"
out_file= out_dir + "/" + file + "singletons.fq"
get_reads(reads_file, out_file)