-
Notifications
You must be signed in to change notification settings - Fork 18
/
csvToEuler.py
121 lines (95 loc) · 2.65 KB
/
csvToEuler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# converts a csv file with set info to an Euler permutation list
# also checks for duplicate row names

# necessary imports
import csv
import time
from collections import Counter

# defines the CSV data filenames
inFile = "input.csv"
outFile = "output.csv"

# column holding the row (item/gene) name
# NOTE(review): the original script read an undefined variable `item`;
# the first column is assumed to hold the row name -- confirm against the data.
nameColumn = 0
# column holding the set label; the original hard-coded index 61 and
# described it as the last column of the input file
labelColumn = 61


def scan_rows(rows, name_col=nameColumn, label_col=labelColumn):
    """Build the Euler list rows and the duplicate/label statistics.

    Args:
        rows: iterable of parsed CSV rows (lists of strings). The first
            row is treated as the column-name header and is skipped.
        name_col: index of the column holding the row name.
        label_col: index of the column holding the set label.

    Returns:
        A tuple ``(out_rows, redundancies, label_count)`` where
        ``out_rows`` is a list of ``[stripped name, label]`` pairs (one per
        data row), ``redundancies`` is the number of data rows whose name
        had already been seen, and ``label_count`` is a ``Counter`` mapping
        each name to the number of data rows carrying it.

    Raises:
        IndexError: if a non-empty data row is too short to contain
            ``name_col`` or ``label_col``.
    """
    seen = set()  # set membership keeps the duplicate check O(1) per row
    redundancies = 0
    label_count = Counter()
    out_rows = []

    rows = iter(rows)
    next(rows, None)  # skip the column-name row (no-op on empty input)

    for row in rows:
        # csv.reader yields [] for blank lines; they carry no item
        if not row:
            continue

        item = row[name_col]
        label = row[label_col]

        # are there duplicate names?
        if item in seen:
            redundancies += 1
        else:
            seen.add(item)

        # one more label for this item; the same key is used for the
        # lookup and the update
        label_count[item] += 1

        # store as part of the Euler diagram output
        out_rows.append([item.strip(), label])

    return out_rows, redundancies, label_count


def label_histogram(label_count):
    """Bucket items by how many labels they carry.

    Args:
        label_count: mapping of item name to its number of labels.

    Returns:
        A list of six ints: the number of items with exactly one, two,
        three, four and five labels, followed by the number of items with
        any other count ("additional").
    """
    buckets = [0] * 6
    for val in label_count.values():
        index = val - 1 if 1 <= val <= 5 else 5
        buckets[index] += 1
    return buckets


def report_lines(redundancies, histogram):
    """Return the summary messages for the redundancy and label counts.

    Args:
        redundancies: number of duplicate row names found.
        histogram: the six bucket counts produced by ``label_histogram``.

    Returns:
        A list of seven strings: the redundancy message followed by one
        message per histogram bucket.
    """
    if redundancies == 1:
        lines = [f"There was {redundancies} redundancy."]
    elif redundancies == 0:
        lines = ["There were no redundancies!"]
    else:
        lines = [f"There were {redundancies} redundancies."]

    phrases = (
        "one label",
        "two labels",
        "three labels",
        "four labels",
        "five labels",
        "additional labels",
    )
    for count, phrase in zip(histogram, phrases):
        lines.append(f"There are {count} genes with {phrase}.")
    return lines


def main():
    """Convert ``inFile`` to the tab-separated ``outFile`` and print a summary."""
    # start timer
    start = time.time()

    # newline="" is what the csv module expects for both reading and
    # writing; the context managers close the files even on error
    with open(inFile, newline="") as source, \
            open(outFile, "w", newline="") as target:
        out_rows, redundancies, label_count = scan_rows(
            csv.reader(source), nameColumn, labelColumn
        )
        csv.writer(target, delimiter="\t").writerows(out_rows)

    # count the gene labels for histogram
    histogram = label_histogram(label_count)

    # stop timer and process the time elapsed
    elapsed = time.time() - start
    minutes = round(elapsed / 60, 3)

    # display redundancies (if any) and gene label counts
    for line in report_lines(redundancies, histogram):
        print(line)

    # display time taken
    print("CSV scanning operation complete after", minutes, "minutes.")


if __name__ == "__main__":
    main()