-
Notifications
You must be signed in to change notification settings - Fork 10
/
debruijn.py
82 lines (71 loc) · 2.25 KB
/
debruijn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
class Node:
""" Class Node to represent a vertex in the de bruijn graph """
def __init__(self, lab):
self.label = lab
self.indegree = 0
self.outdegree = 0
class Edge:
def __init__(self, lab):
self.label = lab
def read_reads(fname):
""" Read short reads in FASTA format. It is assumed that one line in the input file correspond to one read. """
f = open(fname, 'r')
lines = f.readlines()
f.close()
reads = []
for line in lines:
if line[0] != '>':
reads = reads + [line.rstrip()]
return reads
def construct_graph(reads, k):
""" Construct de bruijn graph from sets of short reads with k length word"""
edges = dict()
vertices = dict()
for read in reads:
i = 0
while i+k < len(read):
v1 = read[i:i+k]
v2 = read[i+1:i+k+1]
if v1 in edges.keys():
vertices[v1].outdegree += 1
edges[v1] += [Edge(v2)]
else:
vertices[v1] = Node(v1)
vertices[v1].outdegree += 1
edges[v1] = [Edge(v2)]
if v2 in edges.keys():
vertices[v2].indegree += 1
else:
vertices[v2] = Node(v2)
vertices[v2].indegree += 1
edges[v2] = []
i += 1
return (vertices, edges)
def output_contigs(g):
""" Perform searching for Eulerian path in the graph to output genome assembly"""
V = g[0]
E = g[1]
# Pick starting node (the vertex with zero in degree)
start = V.keys()[0]
for k in V.keys():
if V[k].indegree < V[start].indegree:
start = k
contig = start
current = start
while len(E[current]) > 0:
# Pick the next node to be traversed (for now, at random)
next = E[current][0]
del E[current][0]
contig += next.label[-1]
current = next.label
return contig
def print_graph(g):
""" Print the information in the graph to be (somewhat) presentable """
V = g[0]
E = g[1]
for k in V.keys():
print "name: ", V[k].label, ". indegree: ", V[k].indegree, ". outdegree: ", V[k].outdegree
print "Edges: "
for e in E[k]:
print e.label
print