-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_clean.py
211 lines (168 loc) · 8.11 KB
/
data_clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
import tqdm
def file_name(file_dir):
    """Return the names of the regular files located directly in *file_dir*.

    Only the top level of the directory is inspected; files inside
    sub-directories are not included.

    Args:
        file_dir: Path of the directory to list.

    Returns:
        A list of file names (without the directory part). An empty list is
        returned when the directory does not exist or cannot be read, so
        callers can safely take ``len()`` of the result.
    """
    # os.walk yields nothing for a missing/unreadable directory; fall back to
    # an empty listing instead of implicitly returning None.
    _root, _subdirs, files = next(os.walk(file_dir), (file_dir, [], []))
    return files
def get_test_clinical(fileload="new_train_clean"):
    """Write the clinical covariates of every case file in a cleaned split.

    The case ids are taken from the ``<caseid>.csv`` file names found in
    ``/home/user02/HYK/bis/database/ce_clean/<fileload>``. For each id the
    matching row of ``/home/user02/HYK/bis/information.csv`` is looked up and
    its age, sex, weight, height and bmi are written to
    ``/home/user02/HYK/bis/database/<fileload>.csv``.

    Sex is encoded as 0.0 for ``'F'`` and 1.0 for any other value.

    Args:
        fileload: Name of the split directory; also used as the stem of the
            output CSV.

    Raises:
        ValueError: If a case id does not match exactly one row of the
            information table.
    """
    file_list = file_name(f'/home/user02/HYK/bis/database/ce_clean/{fileload}')
    clinical = pd.read_csv('/home/user02/HYK/bis/information.csv',
                           usecols=['caseid', 'age', 'sex', 'height', 'weight', "bmi"])

    rows = []
    for csv_name in file_list:
        fileid = int(csv_name.split('.csv')[0])
        match = clinical.loc[clinical.caseid == fileid]
        if len(match) != 1:
            raise ValueError(
                f"expected exactly one information row for caseid {fileid}, "
                f"found {len(match)}"
            )
        record = match.iloc[0]
        rows.append({"caseid": fileid,
                     'age': float(record['age']),
                     # Encode straight from the raw label. Recoding part of
                     # the table in place first and comparing with 'F' again
                     # afterwards would turn already-recoded females into 1.
                     "sex": 0.0 if record['sex'] == 'F' else 1.0,
                     "weight": float(record['weight']),
                     "height": float(record['height']),
                     "bmi": float(record['bmi'])})

    # Every row keeps index label 0, which is what the previous
    # one-row-at-a-time DataFrame.append implementation wrote to the CSV.
    data = pd.DataFrame(rows, index=[0] * len(rows))
    data.to_csv(f'/home/user02/HYK/bis/database/{fileload}.csv',
                encoding='utf-8')
def data_clean(file, train=True, *,
               source_dir='/home/user02/HYK/bis/vital_1s',
               train_dir='/home/user02/HYK/bis/new_train',
               test_dir='/home/user02/HYK/bis/new_test'):
    """Clean one recording and save it if it passes the quality checks.

    Steps, in order:
      1. Strip the ``Solar8000/``, ``Orchestra/`` and ``BIS/`` prefixes from
         the column names.
      2. Forward-interpolate HR and BIS, then drop rows where either is
         still missing or equal to 0.
      3. When ``train`` is true, smooth BIS with LOWESS (``frac=0.03``).
      4. Forward-interpolate the two drug volume columns, fill every
         remaining NaN with 0, and replace any volume sample that jumps by
         10 or more from the previous (already corrected) sample with that
         previous sample.
      5. Reject the recording if its last BIS value is <= 60, if two
         consecutive ``time`` values among the first 101 rows differ by 30
         or more, or if it does not start with both volumes at 0 and
         BIS >= 80.

    Args:
        file: File name of the recording inside ``source_dir``.
        train: Truthy to smooth BIS and save into ``train_dir``; falsy to
            skip smoothing and save into ``test_dir``.
        source_dir: Directory the raw recording is read from.
        train_dir: Output directory for accepted training recordings.
        test_dir: Output directory for accepted test recordings.

    Returns:
        1 if the recording was accepted and written, 0 otherwise. A
        recording with no usable rows after step 2 is rejected; a recording
        shorter than 101 rows is checked over the rows it has.
    """
    # Load the data
    print(file)
    people = pd.read_csv(f'{source_dir}/{file}')
    people = people.rename(columns=lambda x: x.replace("Solar8000/", "").replace("Orchestra/", "").replace("BIS/", ""))

    # Remove invalid heart-rate rows, then invalid BIS rows
    for column in ('HR', 'BIS'):
        people[column] = people[column].interpolate(method='linear', limit_direction='forward', axis=0)
        people = people.dropna(subset=[column])
        people = people[people[column] != 0].reset_index(drop=True)
        if people.empty:
            # Nothing usable is left; the positional lookups below would fail.
            return 0

    # Smooth BIS
    if train:
        lowess = sm.nonparametric.lowess
        people['BIS'] = lowess(people['BIS'], people.index, frac=0.03)[:, 1]

    # Fill in missing drug-volume samples
    volume_columns = ('RFTN20_VOL', 'PPF20_VOL')
    for column in volume_columns:
        people[column] = people[column].interpolate(method='linear', limit_direction='forward', axis=0)
    people = people.fillna(0)

    # Hold the previous value across implausible jumps. The correction is
    # sequential (each sample is compared with the corrected predecessor), so
    # it is done on a plain array and written back in one assignment rather
    # than through chained per-element assignment on the frame.
    for column in volume_columns:
        held = people[column].to_numpy(copy=True)
        for i in range(1, len(held)):
            if np.abs(held[i] - held[i - 1]) >= 10:
                held[i] = held[i - 1]
        people[column] = held

    # Discard recordings that only cover the first half of the surgery
    if people['BIS'].iloc[-1] <= 60:
        return 0

    # Discard recordings with a gap of 30 s or more within the first 100 steps
    early_gaps = people['time'].iloc[:101].diff().iloc[1:]
    if (early_gaps >= 30).any():
        return 0

    # Save only recordings that start before any drug was given
    starts_clean = (people['RFTN20_VOL'].iloc[0] == 0
                    and people['PPF20_VOL'].iloc[0] == 0
                    and people['BIS'].iloc[0] >= 80)
    if not starts_clean:
        return 0

    out_dir = train_dir if train else test_dir
    people.to_csv(f'{out_dir}/{file}', encoding='utf-8')
    print(file, "loading finish")
    return 1
def ce_data_clean(file, train=True, *,
                  source_dir='/home/user02/HYK/bis/database/ce',
                  train_dir='/home/user02/HYK/bis/database/ce_clean/train',
                  test_dir='/home/user02/HYK/bis/database/ce_clean/test'):
    """Clean one recording that includes CP/CE columns and save it if valid.

    Steps, in order:
      1. Strip the ``Solar8000/``, ``Orchestra/`` and ``BIS/`` prefixes from
         the column names.
      2. Forward-interpolate HR and BIS, then drop rows where either is
         still missing or equal to 0.
      3. When ``train`` is true, smooth BIS with LOWESS (``frac=0.03``).
      4. Forward-interpolate the VOL, CP and CE columns of both drugs and
         fill every remaining NaN with 0. In the two VOL columns, replace
         any sample that jumps by 10 or more from the previous (already
         corrected) sample with that previous sample.
      5. Reject the recording if its last BIS value is <= 60, if two
         consecutive ``time`` values among the first 101 rows differ by 30
         or more, or if it does not start with both volumes at 0 and
         BIS >= 80.

    Args:
        file: File name of the recording inside ``source_dir``.
        train: Truthy to smooth BIS and save into ``train_dir``; falsy to
            skip smoothing and save into ``test_dir``.
        source_dir: Directory the raw recording is read from.
        train_dir: Output directory for accepted training recordings.
        test_dir: Output directory for accepted test recordings.

    Returns:
        1 if the recording was accepted and written, 0 otherwise. A
        recording with no usable rows after step 2 is rejected; a recording
        shorter than 101 rows is checked over the rows it has.
    """
    # Load the data
    print(file)
    people = pd.read_csv(f'{source_dir}/{file}')
    people = people.rename(columns=lambda x: x.replace("Solar8000/", "").replace("Orchestra/", "").replace("BIS/", ""))

    # Remove invalid heart-rate rows, then invalid BIS rows
    for column in ('HR', 'BIS'):
        people[column] = people[column].interpolate(method='linear', limit_direction='forward', axis=0)
        people = people.dropna(subset=[column])
        people = people[people[column] != 0].reset_index(drop=True)
        if people.empty:
            # Nothing usable is left; the positional lookups below would fail.
            return 0

    # Smooth BIS
    if train:
        lowess = sm.nonparametric.lowess
        people['BIS'] = lowess(people['BIS'], people.index, frac=0.03)[:, 1]

    # Fill in missing drug samples
    drug_columns = ('PPF20_VOL', 'PPF20_CP', 'PPF20_CE',
                    'RFTN20_VOL', 'RFTN20_CP', 'RFTN20_CE')
    for column in drug_columns:
        people[column] = people[column].interpolate(method='linear', limit_direction='forward', axis=0)
    people = people.fillna(0)

    # Hold the previous value across implausible volume jumps. The correction
    # is sequential (each sample is compared with the corrected predecessor),
    # so it is done on a plain array and written back in one assignment
    # rather than through chained per-element assignment on the frame.
    for column in ('RFTN20_VOL', 'PPF20_VOL'):
        held = people[column].to_numpy(copy=True)
        for i in range(1, len(held)):
            if np.abs(held[i] - held[i - 1]) >= 10:
                held[i] = held[i - 1]
        people[column] = held

    # Discard recordings that only cover the first half of the surgery
    if people['BIS'].iloc[-1] <= 60:
        return 0

    # Discard recordings with a gap of 30 s or more within the first 100 steps
    early_gaps = people['time'].iloc[:101].diff().iloc[1:]
    if (early_gaps >= 30).any():
        return 0

    # Save only recordings that start before any drug was given
    starts_clean = (people['RFTN20_VOL'].iloc[0] == 0
                    and people['PPF20_VOL'].iloc[0] == 0
                    and people['BIS'].iloc[0] >= 80)
    if not starts_clean:
        return 0

    if train:
        people.to_csv(f'{train_dir}/{file}', encoding='utf-8')
        return 1
    # The completion message is only emitted for the test split.
    people.to_csv(f'{test_dir}/{file}', encoding='utf-8')
    print(file, "loading finish")
    return 1
def casefile_clean():
    """Build the training and test splits from the available case files.

    The files listed in ``/HDD_data/HYK/bis/ce_clean/train`` are passed to
    ``ce_data_clean`` one by one. The first 300 accepted files form the
    training split; processing then continues with the following files until
    300 more are accepted for the test split.

    If the files run out before a split is complete, a message with the
    number of accepted files is printed and processing stops cleanly.
    """
    file_list = file_name('/HDD_data/HYK/bis/ce_clean/train') or []
    print(len(file_list), "files was found")

    wanted = 300
    # One shared iterator: the test split starts right after the last file
    # consumed by the training split, so no file is processed twice.
    remaining = iter(file_list)
    for train in (True, False):
        accepted = 0  # number of cases that passed the checks for this split
        for case_file in remaining:
            if ce_data_clean(case_file, train) == 1:
                accepted += 1
                if accepted >= wanted:
                    break
        if accepted < wanted:
            split = "train" if train else "test"
            print(f"only {accepted} of {wanted} {split} cases could be collected")
def normalization(data):
    """Min-max scale *data* to the interval [0, 1].

    Args:
        data: A NumPy array or pandas Series of numbers.

    Returns:
        ``(data - min) / (max - min)`` with the same container type as
        *data*. When every value is identical the range is zero; in that
        case all values map to 0.0 instead of dividing by zero.
    """
    lowest = np.min(data)
    _range = np.max(data) - lowest
    if _range == 0:
        # Constant input: avoid 0/0 -> NaN and return zeros.
        _range = 1
    return (data - lowest) / _range
def informationfile_clean():
    """Normalize the age, height and weight columns of the information table.

    Reads ``/home/user02/HYK/bis/before_information.csv``, replaces the three
    columns with the output of ``normalization`` and writes the result to
    ``/home/user02/HYK/bis/clean_data1/information.csv``.
    """
    source_path = '/home/user02/HYK/bis/before_information.csv'
    target_path = '/home/user02/HYK/bis/clean_data1/information.csv'

    info = pd.read_csv(source_path)
    # Each column is scaled independently from its own raw values.
    scaled = info.assign(
        age=normalization(info.age),
        height=normalization(info.height),
        weight=normalization(info.weight),
    )
    scaled.to_csv(target_path, encoding='utf-8')
# casefile_clean()
# informationfile_clean()
# get_test_clinical("vaild")