forked from LeeBergstrand/CDHITtoFASTA
-
Notifications
You must be signed in to change notification settings - Fork 0
/
lib.py
102 lines (79 loc) · 3.34 KB
/
lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
"""
Created by: Lee Bergstrand (Copyright 2015)
Description: Provides helper functions for __main__.py
Requirements: - This software requires the Biopython module: http://biopython.org/wiki/Download
- This software requires the cd_hit_parser.py module (included)
"""
# Imports:
import sys
from os import path
from Bio import SeqIO
from cd_hit_parser import *
# ==========
# Functions:
# ==========
# -----------------------------------------------------------------------------------------------------------
def check_file_extensions(reference_sequences_path, input_fasta_path, input_cluster_path):
    """
    Prints a warning for each input path whose file extension is not the expected one.

    The checks are advisory only: nothing is raised and nothing is returned.

    :param reference_sequences_path: The path for the reference sequences file (expected extension ".txt").
    :param input_fasta_path: The path for the FASTA sequences file (expected extension ".faa").
    :param input_cluster_path: The path for the CD-Hit cluster file (expected extension ".clstr").
    """
    # Resolve every extension up front, then report mismatches in argument order.
    checks = [
        (path.splitext(reference_sequences_path)[-1], ".txt", " may not be a txt file!"),
        (path.splitext(input_fasta_path)[-1], ".faa", " may not be a FASTA file!"),
        (path.splitext(input_cluster_path)[-1], ".clstr", " may not be a CD-Hit cluster file!"),
    ]

    for found_extension, expected_extension, complaint in checks:
        if found_extension != expected_extension:
            print("[Warning] " + found_extension + complaint)
# -----------------------------------------------------------------------------------------------------------
def get_reference_list(reference_sequences_path):
    """
    Parses reference sequence file and returns a list of reference accessions.

    The file is read whole and split on line boundaries, so each line of the file
    becomes one list entry (blank lines are kept as empty strings).

    If the file cannot be opened or read, the error is printed and the program
    exits with status 1.

    :param reference_sequences_path: The path for the reference sequences file.
    :return: List of reference sequence accession strings.
    """
    try:
        # Plain "r" mode: the legacy "U" flag raises ValueError on Python 3.11+.
        # The context manager closes the file even if read() fails.
        with open(reference_sequences_path, "r") as infile:
            return infile.read().splitlines()
    except IOError as e:
        print(str(e))
        sys.exit(1)  # Aborts program. (exit(1) indicates that an error occurred)
# -----------------------------------------------------------------------------------------------------------
def get_fasta_dict(input_fasta_path):
    """
    Reads a FASTA file and stores its sequences as a dictionary of Biopython sequence record objects.

    If the file cannot be opened or read, the error is printed and the program
    exits with status 1.

    NOTE(review): per the Biopython documentation, SeqIO.to_dict raises ValueError
    when two records share an identifier; that error is not handled here.

    :param input_fasta_path: The path for the FASTA sequences file.
    :return: Dictionary of Biopython sequence record objects.
    """
    try:
        # Plain "r" mode: the legacy "U" flag raises ValueError on Python 3.11+.
        # SeqIO.parse is lazy, so the dictionary must be built while the file is open.
        with open(input_fasta_path, "r") as fasta_file:
            return SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))
    except IOError as e:
        print(str(e))
        sys.exit(1)  # Aborts program. (exit(1) indicates that an error occurred)
# -----------------------------------------------------------------------------------------------------------
def get_cluster_list(input_cluster_path):
    """
    Reads a CD-HIT cluster file and passes its contents to parse_cd_hit_file for parsing.

    If the file cannot be opened or read, the error is printed and the program
    exits with status 1.

    :param input_cluster_path: The path for the CD-Hit cluster file.
    :return: List of Cluster objects.
    """
    try:
        # Plain "r" mode: the legacy "U" flag raises ValueError on Python 3.11+.
        # The context manager closes the file even if read() fails.
        with open(input_cluster_path, "r") as cluster_file:
            clustering = cluster_file.read()
        # Parsing works on the in-memory text, so the file is already closed here.
        return parse_cd_hit_file(clustering)
    except IOError as e:
        print(str(e))
        sys.exit(1)  # Aborts program. (exit(1) indicates that an error occurred)