forked from uic-cs418/group-project-kungfu-pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
t_test.py
155 lines (120 loc) · 5.23 KB
/
t_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np
# Neighborhoods (values of the 'RegionName' column) included in both tests.
_SELECTED_NEIGHBORHOODS = [
    "Near West Side",
    "West Town",
    "Loop",
    "Near North Side",
    "Near South Side",
    "Lower West Side",
    "East Garfield Park",
    "North Lawndale",
    "South Lawndale",
    "Humboldt Park",
]


def _crime_counts_by_region(frame: pd.DataFrame, crime_type: str) -> pd.Series:
    """Return the number of `crime_type` rows per selected neighborhood.

    The result is a Series indexed by 'RegionName'. Rows are counted with
    ``size()`` so the count does not depend on null values in any other
    column. Regions with a zero count are dropped because their logarithm
    is undefined.
    """
    wanted = (
        frame['RegionName'].isin(_SELECTED_NEIGHBORHOODS)
        & (frame['Primary Type'] == crime_type)
    )
    counts = frame.loc[wanted].groupby('RegionName').size()
    # A categorical 'RegionName' can yield zero-sized groups; drop them.
    return counts[counts > 0].rename('NUM CRIME')


def _paired_log_ttest(data: pd.DataFrame, data2: pd.DataFrame,
                      crime_type: str, label: str):
    """Run a paired t-test on log crime counts and print the verdict.

    `data` holds the earlier ("pre") period and `data2` the later ("post")
    period. Pairs are matched by neighborhood name, not by row position, so
    a neighborhood missing from one period cannot shift the pairing. `label`
    is the crime name used in the printed messages.
    """
    pre_counts = _crime_counts_by_region(data, crime_type)
    post_counts = _crime_counts_by_region(data2, crime_type)

    # Only neighborhoods observed in both periods form a valid pair.
    shared = pre_counts.index.intersection(post_counts.index)
    if len(shared) < 2:
        raise ValueError(
            f"Need at least 2 neighborhoods with {crime_type} records in both "
            f"periods for a paired t-test, found {len(shared)}"
        )

    t_stat, pval = stats.ttest_rel(
        np.log(pre_counts.loc[shared].to_numpy()),
        np.log(post_counts.loc[shared].to_numpy()),
    )
    print(t_stat, pval)

    if pval < 0.05:
        # The statistic is computed on (pre - post): positive means counts fell.
        if t_stat > 0:
            print(f"Reject the NULL Hypothesis: There is a significant decrease in {label} post COVID compared to pre COVID")
        else:
            print(f"Reject the NULL Hypothesis: There is a significant increase in {label} post COVID compared to pre COVID")
    else:
        print(f"Fail to reject the NULL hypothesis: There is no significant difference in {label} between pre and post covid")
    return t_stat, pval


def ttest_uic_theft(data: pd.DataFrame, data2: pd.DataFrame):
    """Paired t-test of THEFT counts per neighborhood, pre vs. post COVID.

    A paired-samples t-test compares the same units measured under two
    conditions; here the units are the selected neighborhoods and the
    measurement is the natural log of the number of THEFT rows.

    Args:
        data: Pre-COVID records with 'RegionName' and 'Primary Type' columns.
        data2: Post-COVID records with the same columns.

    Returns:
        The ``(t_stat, pval)`` pair from ``scipy.stats.ttest_rel``. The
        statistic, p-value and a one-line conclusion (5% level) are printed.

    Raises:
        ValueError: If fewer than two selected neighborhoods have THEFT
            records in both frames.
    """
    return _paired_log_ttest(data, data2, 'THEFT', 'theft')


def ttest_uic_battery(data: pd.DataFrame, data2: pd.DataFrame):
    """Paired t-test of BATTERY counts per neighborhood, pre vs. post COVID.

    A paired-samples t-test compares the same units measured under two
    conditions; here the units are the selected neighborhoods and the
    measurement is the natural log of the number of BATTERY rows.

    Args:
        data: Pre-COVID records with 'RegionName' and 'Primary Type' columns.
        data2: Post-COVID records with the same columns.

    Returns:
        The ``(t_stat, pval)`` pair from ``scipy.stats.ttest_rel``. The
        statistic, p-value and a one-line conclusion (5% level) are printed.

    Raises:
        ValueError: If fewer than two selected neighborhoods have BATTERY
            records in both frames.
    """
    return _paired_log_ttest(data, data2, 'BATTERY', 'battery')
def main():
    """Load the 2017-2019 and 2021-present crime CSVs and run the battery t-test."""
    csv_paths = (
        'csv_files/Crimes_2017_to_2019.csv',
        'csv_files/Crimes_2021_to_Present.csv',
    )
    # Files are read in order: the earlier period first, then the later one.
    pre_covid, post_covid = [pd.read_csv(path) for path in csv_paths]
    ttest_uic_battery(pre_covid, post_covid)
def main2():
    """Load the 2017-2019 and 2021-present crime CSVs and run the theft t-test."""
    csv_paths = (
        'csv_files/Crimes_2017_to_2019.csv',
        'csv_files/Crimes_2021_to_Present.csv',
    )
    # Files are read in order: the earlier period first, then the later one.
    pre_covid, post_covid = [pd.read_csv(path) for path in csv_paths]
    ttest_uic_theft(pre_covid, post_covid)
# if __name__ == "__main__":
# main()
# if __name__ == "__main__":
# main2()