-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_preprocess.py
138 lines (106 loc) · 6.49 KB
/
data_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from data_analysis import analyse_data
import numpy as np
import pandas as pd
#Average time delta between transactions used in calculations
def list_diff(x_input):
if len(x_input)<2:
xdiff = 0
delta_mean=0
else:
xdiff = [x_input[n]-x_input[n-1] for n in range(1,len(x_input))]
delta_mean = np.mean(xdiff)
return delta_mean
#Delta before last transaction used in calculations
def delta_last(x_input):
if len(x_input)<2:
delta_l=0
else:
delta_l = x_input[-1]-x_input[-2]
return delta_l
def data_preprocessing(data):
#number of transactions Orig !not used in calculations
dict_count_sent = data['nameOrig'].value_counts()
data['New_TotalOrig']= data['nameOrig'].map(dict_count_sent)
#number of transactions Dest !not used in calculations
dict_count_get = data['nameDest'].value_counts()
data['New_TotalDest']= data['nameDest'].map(dict_count_get)
#average transaction Orig !not used in calculations
trans_mean_orig = data.groupby("nameOrig")["amount"].aggregate(['mean'])
dict_trans_mean_orig=trans_mean_orig.to_dict()
value_dict_trans_mean_orig=dict_trans_mean_orig['mean']
data['New_TotalMeanOrig']=data['nameOrig'].map(value_dict_trans_mean_orig)
#average transaction Dest !not used in calculations
trans_mean_dest = data.groupby("nameDest")["amount"].aggregate(['mean'])
dict_trans_mean_dest=trans_mean_dest.to_dict()
value_dict_trans_mean_dest=dict_trans_mean_dest['mean']
data['New_TotalMeanDest']=data['nameDest'].map(value_dict_trans_mean_dest)
#Amount of transactions with participation Orig !not used in calculations
trans_sum_orig = data.groupby("nameOrig")["amount"].aggregate([sum])
dict_trans_sum_orig=trans_sum_orig.to_dict()
value_dict_trans_sum_orig=dict_trans_sum_orig['sum']
data['New_TotalSumOrig']=data['nameOrig'].map(value_dict_trans_sum_orig)
#Amount of transactions with participation Dest !not used in calculations
trans_sum_dest = data.groupby("nameDest")["amount"].aggregate([sum])
dict_trans_sum_dest=trans_sum_dest.to_dict()
value_dict_trans_sum_dest=dict_trans_sum_dest['sum']
data['New_TotalSumDest']=data['nameDest'].map(value_dict_trans_sum_dest)
#type Orig ==first letter from nameOrig
data['New_TypeOrig']= data['nameOrig'].apply(lambda x: x[0])
#type Dest ==first letter from nameDest
data['New_TypeDest']= data['nameDest'].apply(lambda x: x[0])
#Average time delta between transactions Orig !not used in calculations
x_input = data.groupby('nameOrig')['step'].apply(list).reset_index(name='info')
data = pd.merge(data, x_input, how='left', on='nameOrig')
data['New_Delta_Time_Tr_Orig'] = data['info'].apply(lambda x: list_diff(x))
#time to previous transaction Orig !not used in calculations
data['New_Delta_Last_Tr_Orig']= data['info'].apply(lambda x: delta_last(x))
#Average time delta between transactions Dest !not used in calculations
x_input_dest = data.groupby('nameDest')['step'].apply(list).reset_index(name='info_2')
data = pd.merge(data, x_input_dest, how='left', on='nameDest')
data['New_Delta_Time_Tr_Dest'] = data['info_2'].apply(lambda x: list_diff(x) )
#time to previous transaction Dest !not used in calculations
data['New_Delta_Last_Tr_Dest']= data['info_2'].apply(lambda x: delta_last(x))
#Removing extra columns
data = data.drop(columns=['info','info_2'])
#delete first letter Orig , Dest
data['nameOrig']=data['nameOrig'].apply(lambda x: x[1:])
data['nameDest']=data['nameDest'].apply(lambda x: x[1:])
#!not used in calculations
data['res_data']=pd.to_datetime(data['step'], unit='h', origin=pd.Timestamp('2000-01-01'))
### Select the date, days of the week, hours, month !not used in calculations
data['date'] = data.res_data.dt.date
data['day_of_week'] = data.res_data.dt.dayofweek
data['hour'] = data.res_data.dt.hour
data['month'] = data.res_data.dt.month
data.drop(columns = ['step', 'nameOrig','nameDest', 'isFlaggedFraud',
'New_TotalOrig', 'New_TotalDest', 'New_TotalMeanOrig',
'New_TotalMeanDest', 'New_TotalSumOrig', 'New_TotalSumDest',
'New_Delta_Time_Tr_Orig','New_Delta_Last_Tr_Orig',
'New_Delta_Time_Tr_Dest','New_Delta_Last_Tr_Dest',
'res_data', 'date', 'day_of_week', 'hour','month'],
inplace=True)
#due to the fact that we have few unique values, we will go by the simple way of converting categorical features
data_test = pd.get_dummies(data, prefix = ['type', 'New_TypeOrig', 'New_TypeDest'], drop_first = True)
return data_test
def data_preprocess():
data, num_features, cat_features = analyse_data()
# Transactions which are detected as fraud are cancelled,
# so for fraud detection these columns
# (oldbalanceOrg, newbalanceOrig, oldbalanceDest, newbalanceDest ) must not be used.
# nameOrig - customer who started the transaction
# oldbalanceOrg - initial balance before the transaction
# newbalanceOrig - new balance after the transaction.
# nameDest - customer who is the recipient of the transaction
# oldbalanceDest - initial balance recipient before the transaction.
# Note that there is not information for customers that start with M (Merchants).
# newbalanceDest - new balance recipient after the transaction.
# Note that there is not information for customers that start with M (Merchants).
# isFraud - This is the transactions made by the fraudulent agents inside the simulation.
# In this specific dataset the fraudulent behavior of the agents aims to profit by taking control or customers accounts and try to empty the funds by transferring to another account and then cashing out of the system.
# isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts.
# An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.
data = data_preprocessing(data)
print(data.head())
categorical_features = data.select_dtypes("object").columns
numerical_features = data.select_dtypes("number").columns
return data, numerical_features, categorical_features