-
Notifications
You must be signed in to change notification settings - Fork 0
/
data-cleaning-description.R
169 lines (135 loc) · 4.93 KB
/
data-cleaning-description.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
rm(list = ls())
if (Sys.info()[8] == "leoczajka") {
setwd("/Users/leoczajka/Dropbox (Leo_and_Co)/Research/learning/Doctoral Training/machine-learning/ULB-CHM-Machine-Learners")
}
if (Sys.info()[8] == "??") {
setwd("~/BioInformatique/MA1/Q2/machine learning/projet")
}
if (Sys.info()[8] == "??") {
setwd("~/RProjects/Pumpit")
}
# load packages
library(Hmisc)
library(tidyverse)
library(dplyr)
library(caret)
library(xgboost)
library(mlbench)# For data
library(FSelector)
# Importing data sets
train_values <- read.csv("trainingsetvalues.csv")
train_labels <- read.csv("trainingsetlabels.csv")
df <- merge(train_values, train_labels)
test_values <- read.csv("testsetvalues.csv")
# Elements of strategy
# start with the simplest model - without all hard to deal with data
# make a data set with missing values
# make a data set with replaced missing values
# for cat-var = set missing to some cat
# for num var - either remove the var or set to missing (remove data), or set to the mode/mean or P50
# remove var with too many weird values
# always remove unprecise small cat
# add very precise cat-var progressively
# general cleaning : set categorical var into factor
# classify the variables based on type and number of different values
# get a glimpse at the data
describe(df)
# irrelevant or not varying variables
# num_private, recorded_by
describe(df$num_private)
describe(df$recorded_by)
# numerical variables
# amount_tsh - but few values - 98, many 0
# gps_height - half are 0
# longitude
# latitude
# population - many 0
hist(df$latitude)
sum(df$latitude==0)
hist(df$longitude) # set 0 to Mode ? set NA
sum(df$longitude==0)
hist(df$gps_height)
sum(df$gps_height==0)
df<- df[, !(colnames(df) %in% c("num_private", "recorded_by"))]
table(df$population)
hist(df$population[df$population>1 & df$population <500]) #
# put at test
# unlikely counts for 0 and 1 - set to NA - set to mode or P50
hist(df$amount_tsh[df$amount_tsh>0 & df$amount_tsh<1000])
table(df$amount_tsh) # pretty weird values
# categorical var
# FEW
# basin - 9
table(df$basin)
# region - 21
table(df$region)
# region_code 27
table(df$region_code)
table(df$region_code, df$region) # to be tested
# district_code - 20
table(df$district_code)
# lga - 125
table(df$lga)
ggplot(df) + geom_bar(aes(x = lga))
# public_meeting - 2
table(df$public_meeting)
# scheme_management 12
table(df$scheme_management)
# permit 2 - 3056
table(df$permit)
# construction_year - 55
hist(df$construction_year[df$construction_year!=0])
# to be tested
# extraction_type - 18 extraction_type_group extraction_type_class
table(df$extraction_type, df$extraction_type_group)
table(df$extraction_type_group, df$extraction_type_class)
# management management_group
table(df$management, df$management_group )
# payment payment_type
table(df$payment,df$payment_type )
# water_quality quality_group
table(df$water_quality,df$quality_group )
# quantity quantity_group
table(df$quantity, df$quantity_group)
# source source_type source_class
table(df$source, df$source_type )
table(df$source_type,df$source_class )
# waterpoint_type waterpoint_type_group
table(df$waterpoint_type,df$waterpoint_type_group )
# NUMEROUS
# date_recorded - 356 different values - can be transformed into
ggplot(df) + geom_bar(aes(x = date_recorded))
df$m_date <- as.Date(df$date_recorded)
df$daily_time_trend <- as.numeric(df$m_date) # remove the too early ones
hist(df$daily_time_trend[df$daily_time_trend>14942], breaks = seq(from=14942, to=16042, by=10))
df$m_year <- as.factor(as.numeric(format(df$m_date,'%Y')))
df$m_months <- as.factor(as.numeric(format(df$m_date,'%m')))
df$m_day <- as.factor(weekdays(df$m_date))
df$age <- df$m_year - df$construction_year
df$age[df$construction_year==0] <- NA # to b tested
# View(df %>% filter(df$time==53))
ggplot(df) + geom_bar(aes(x = m_day))
ggplot(df) + geom_bar(aes(x = m_months))
# ward - 2092
# ggplot(df) + geom_bar(aes(x = ward))
# funder - 1897 distinct - 3635 missing
# installer - 2145
# wpt_name 37400
# subvillage 19287
# scheme_name - 2696
df$installer <- tolower(df$installer)
df$funder <- tolower(df$funder)
df$wpt_name <- tolower(df$wpt_name)
df$subvillage <- tolower(df$subvillage)
df$scheme_name <- tolower(df$scheme_name)
table(df$funder)
df$funder_is_installer <- df$funder == df$installer & df$installer!=""
df$funder_is_installer[df$installer=="" | df$funder==""] <- "MISSING"
table(df$funder_is_installer)
# create variables :
# distance to measuring dates
# year and month of the measured
# df<- df[, !(colnames(df) %in% c("region", "extraction_type_group", "extraction_type_class", "management_group", "payment_type", "quality_group", "quantity_group", "source_class","source_type", "waterpoint_type_group", "num_private", "recorded_by"))]
# remove also the date - better use it as numerical
str(df)
# so what do we get for the most basic model?