-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest_analyze_trait.py
127 lines (109 loc) · 5.84 KB
/
test_analyze_trait.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import pandas as pd
from init_tests import *
from datetime import datetime
from scoary.ScoaryTree import ScoaryTree
from scoary.load_genes import load_genes
from scoary.load_traits import load_traits
from scoary.analyze_trait import init_result_df, create_test_df, add_odds_ratio, pair_picking
def generate_fake_traits(genes_df: pd.DataFrame) -> pd.Series:
    """Build a synthetic boolean trait series from the columns of *genes_df*.

    The first 11 column labels are marked ``True`` and every column label from
    position 89 onwards is marked ``False``; labels in between are omitted.

    Args:
        genes_df: DataFrame whose column labels become the index of the result.

    Returns:
        A ``pd.Series`` with the nullable ``'boolean'`` dtype, indexed by the
        selected column labels (``True`` labels first, then ``False`` labels).
    """
    columns = genes_df.columns
    label_to_trait = dict.fromkeys(columns[:11], True)
    # update() keeps the original precedence: a label present in both slices ends up False.
    label_to_trait.update(dict.fromkeys(columns[89:], False))
    return pd.Series(label_to_trait, dtype='boolean')
class TestScoary(TestCase):
    """Tests for the helpers imported from ``scoary.analyze_trait``.

    Fixture files are addressed with paths relative to the working directory
    (``../data/...``), so the tests have to be started from a directory where
    those paths resolve.
    """

    def test_create_result_df(self):
        """``init_result_df`` produces the expected columns in the expected order."""
        _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore)
        result_df = init_result_df(genes_df, trait_series=generate_fake_traits(genes_df))
        self.assertEqual(
            result_df.columns.tolist(),
            ['Gene', 'g+t+', 'g+t-', 'g-t+', 'g-t-', '__contingency_table__', 'sensitivity', 'specificity']
        )

    def test_contingency_test(self):
        """``create_test_df`` returns exactly the contingency-table and ``fisher_p`` columns."""
        _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore)
        result_df = init_result_df(genes_df, trait_series=generate_fake_traits(genes_df))
        test_df = create_test_df(result_df=result_df)
        self.assertEqual(['__contingency_table__', 'fisher_p'], test_df.columns.tolist())
        print(f"Done: minpval={test_df.fisher_p.min()}")

    def test_odds_ratio(self):
        """``add_odds_ratio`` adds an ``odds_ratio`` column matching ``fisher_exact``."""
        _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore)
        genes_df = genes_df[:100]  # only first 100 rows
        test_df = init_result_df(genes_df, generate_fake_traits(genes_df))

        # apply function
        test_df = add_odds_ratio(test_df)
        self.assertEqual(
            test_df.columns.tolist(),
            ['Gene', 'g+t+', 'g+t-', 'g-t+', 'g-t-', '__contingency_table__', 'sensitivity', 'specificity',
             'odds_ratio']
        )

        # calculate odds_ratio with fisher_exact
        fisher_ors = test_df.apply(
            lambda row: fisher_exact([[row['g+t+'], row['g+t-']], [row['g-t+'], row['g-t-']]])[0], axis=1)

        # check if result is identical
        for manual_or, fisher_or in zip(test_df['odds_ratio'], fisher_ors):
            self.assertTrue(
                is_equivalent(manual_or, fisher_or),
                msg=f'odds_ratio differs from fisher_exact: {manual_or} != {fisher_or}'
            )

    def test_init_result_df_performance(self):
        """Time ``init_result_df`` on the ``N0.tsv`` dataset and print the result.

        This test makes no assertions; it only reports the elapsed time.
        """
        _, genes_df = load_genes('../data/new_ds/N0.tsv', gene_data_type='gene-list:\t')
        ltt = generate_fake_traits(genes_df)
        start = datetime.now()
        result_df = init_result_df(genes_df, trait_series=ltt)
        end = datetime.now()
        print(result_df)
        print('took:', end - start)

    def test_tetracycline(self):
        """Compare results on the tetracycline dataset with the stored expected results."""
        _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore)
        _, traits_df = load_traits('../data/tetracycline/Tetracycline_resistance.csv', trait_data_type='binary:,')
        trait_series = traits_df['Tetracycline_resistance']

        # calculate sensitivity and specificity (only labels whose value is 0 or 1 are kept)
        test_df = init_result_df(
            genes_df,
            trait_series=pd.Series(
                {label: bool(value) for label, value in trait_series.items() if value in (0, 1)},
                dtype='boolean'
            )
        )

        # calculate odds_ratio
        test_df = add_odds_ratio(test_df)

        # calculate pairwise comparisons
        tree = ScoaryTree.from_list(get_json('../data/tetracycline/expected_result.json')['as_list'])
        # assertEqual instead of a bare assert: still checked under ``python -O``
        self.assertEqual(set(tree.labels()), set(genes_df.columns))
        # NOTE(review): the unfiltered trait_series is passed here, whereas init_result_df
        # above received the filtered boolean series -- confirm this is intended.
        test_df = pair_picking(test_df, genes_df, tree=tree, label_to_trait=trait_series)

        # load expected result from scoary 1
        expected_result = pd.read_csv('../data/tetracycline/fisher_permute100.results.csv')

        test_df.set_index('Gene', inplace=True)

        # check if result is identical
        for i, row in expected_result.iterrows():
            table = (row.Number_pos_present_in,
                     row.Number_neg_present_in,
                     row.Number_pos_not_present_in,
                     row.Number_neg_not_present_in)
            new_row = test_df.loc[row.Gene]
            new_table = tuple(int(new_row[c]) for c in ('g+t+', 'g+t-', 'g-t+', 'g-t-'))

            self.assertEqual(table, new_table)
            self.assertAlmostEqual(
                row.Odds_ratio, new_row.odds_ratio,
                msg=f'Failed to calculate odds_ratio for {row.Gene}: {row.Odds_ratio} != {new_row.odds_ratio}'
            )
            # Each message reports the values that were actually compared.
            self.assertAlmostEqual(
                row.Sensitivity, new_row.sensitivity,
                msg=f'Failed to calculate sensitivity for {row.Gene}: {row.Sensitivity} != {new_row.sensitivity}'
            )
            self.assertAlmostEqual(
                row.Specificity, new_row.specificity,
                msg=f'Failed to calculate specificity for {row.Gene}: {row.Specificity} != {new_row.specificity}'
            )

            # (expected, actual) pairs, reported together if any pairwise check fails
            pairwise_checks = [
                (row.Max_Pairwise_comparisons, new_row.contrasting),
                (row.Max_supporting_pairs, new_row.supporting),
                (row.Max_opposing_pairs, new_row.opposing),
                (row.Best_pairwise_comp_p, new_row.best),
                (row.Worst_pairwise_comp_p, new_row.worst)
            ]

            try:
                self.assertEqual(row.Max_Pairwise_comparisons, new_row.contrasting)
                self.assertEqual(row.Max_supporting_pairs, new_row.supporting)
                self.assertEqual(row.Max_opposing_pairs, new_row.opposing)
                self.assertAlmostEqual(row.Best_pairwise_comp_p, new_row.best)
                self.assertAlmostEqual(row.Worst_pairwise_comp_p, new_row.worst)
            except AssertionError as e:
                # Re-raise as a failure that names the row, the gene and every compared pair.
                self.fail(msg=f'{i} {row.Gene} {pairwise_checks}: {e}')