-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDelete_Sequences_in_Multiple_Files_Based_on_Names.py
67 lines (52 loc) · 2.32 KB
/
Delete_Sequences_in_Multiple_Files_Based_on_Names.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""
This script will take multiple fasta files and it will delete samples within them
based on a list of names from a second file. It will return a "cleaned" fasta file
for each of the original files in a new directory.
Simon Uribe-Convers - December 1st, 2017 - http://simonuribe.com
"""
import glob
import os
import shutil
import sys

from Bio import SeqIO
# Directory that receives one "<name>_cleaned.<ending>" file per input file.
OUTPUT_DIR = "Cleaned_Files"

# Sequence ids are truncated at this marker before being compared with the
# list of names. This is only necessary if the names in different fasta files
# carry a locus-specific identifier, e.g., sp1_gene1, sp1_gene2. Change it to
# match your naming convention, or set it to None if the names are the same
# in every fasta file.
ID_MARKER = "_Combined"


def read_names(path):
    """Return the set of names listed in the file at *path*.

    The name is the first tab-separated field of each stripped line. Blank
    lines are skipped so that an empty string never ends up in the filter.
    """
    names = set()
    with open(path, "r") as handle:
        for line in handle:
            name = line.strip().split("\t")[0]
            if name:
                names.add(name)
    return names


def cleaned_filename(filename, file_ending):
    """Return the output file name for *filename*.

    Only the trailing "." + file_ending is replaced, so a name that happens
    to contain the ending somewhere in the middle is not truncated there.
    """
    suffix = "." + file_ending
    if filename.endswith(suffix):
        stem = filename[:-len(suffix)]
    else:
        stem = filename
    return stem + "_cleaned" + suffix


def write_kept_records(records, names_to_drop, output, id_marker=ID_MARKER):
    """Write every record whose name is not in *names_to_drop* to *output*.

    records: iterable of objects exposing ``id`` and ``seq`` attributes.
    names_to_drop: container of names to delete.
    output: writable text handle.
    id_marker: the id is cut at the first occurrence of this string before
        it is compared and written; a falsy value keeps the whole id.

    Returns a ``(total, kept)`` tuple with the number of records read and
    the number of records written.
    """
    total = 0
    kept = 0
    for record in records:
        total += 1
        if id_marker:
            seq_name = str(record.id.split(id_marker)[0])
        else:
            seq_name = str(record.id)
        if seq_name not in names_to_drop:
            output.write(">" + seq_name + "\n" + str(record.seq) + "\n")
            kept += 1
    return total, kept


def main(argv=None):
    """Clean every "*.<ending>" fasta file in the current directory.

    argv: argument list laid out like ``sys.argv`` (program name, file
        ending, path of the file with names); defaults to ``sys.argv``.

    Returns the process exit status: 0 on success, 1 on a usage error.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) != 3:
        sys.stderr.write("Usage Error, type: python "+argv[0]+" fasta_file_ending (e.g., fasta, fa, aln) list_with_names" + "\n")
        return 1

    file_ending = argv[1]

    # Read the names first: if the file is missing we fail before the
    # previous results have been deleted.
    names_subset = read_names(argv[2])

    # Start from an empty output directory.
    if os.path.isdir(OUTPUT_DIR) and not os.path.islink(OUTPUT_DIR):
        shutil.rmtree(OUTPUT_DIR)
    elif os.path.lexists(OUTPUT_DIR):
        os.remove(OUTPUT_DIR)
    os.mkdir(OUTPUT_DIR)

    for filename in sorted(glob.glob("*." + file_ending)):
        output_path = os.path.join(OUTPUT_DIR, cleaned_filename(filename, file_ending))

        # Both handles are closed (and the output flushed) before the counts
        # are reported, so the numbers reflect what is actually on disk.
        with open(filename) as source, open(output_path, "w") as output:
            total, kept = write_kept_records(
                SeqIO.parse(source, "fasta"), names_subset, output
            )

        # Output some information
        print("\nWorking on file: %s" % filename)
        print("Sequences in original file: %d" % total)
        print("Sequences in cleaned file: %d" % kept)

    print("\nFinished, the cleaned files are in the 'Cleaned_Files' directory.\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())