-
Notifications
You must be signed in to change notification settings - Fork 90
/
nzv.R
113 lines (85 loc) · 2.43 KB
/
nzv.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
library(tidymodels)
library(AmesHousing)
# Ames housing data, minus every column whose name matches "Qu".
ames <- dplyr::select(make_ames(), -matches("Qu"))

# Seed the RNG so the train/test partition below is reproducible.
set.seed(333)
data_split <- ames %>%
  initial_split(strata = "Sale_Price")
ames_train <- data_split %>% training()
ames_test <- data_split %>% testing()

# Linear regression specification fitted through the "lm" engine.
lm_mod <- set_engine(linear_reg(), "lm")

# Metric bundle: RMSE, R-squared and concordance correlation coefficient.
perf_metrics <- metric_set(rmse, rsq, ccc)
# -------
# Baseline recipe: log10 outcome, rare neighborhoods pooled, then dummies.
mod_rec <-
  recipe(
    Sale_Price ~ Longitude + Latitude + Neighborhood,
    data = ames_train
  ) %>%
  step_log(Sale_Price, base = 10) %>%
  # Pool Neighborhood levels seen in fewer than 5% of the training rows
  # into a single "other" level.
  step_other(Neighborhood, threshold = 0.05) %>%
  # Turn every nominal (factor) variable into indicator columns.
  step_dummy(all_nominal())

# Estimate the recipe on the training set, then print the processed
# training data.
mod_rec_prepped <- mod_rec %>% prep(training = ames_train)
mod_rec_prepped %>% juice()
# -------
# Same model, but without step_other(): every Neighborhood level gets its
# own indicator column.
mod_rec_dummy <-
  recipe(
    Sale_Price ~ Longitude + Latitude + Neighborhood,
    data = ames_train
  ) %>%
  step_log(Sale_Price, base = 10) %>%
  step_dummy(all_nominal())

# Estimate on the training set and keep the processed training data for the
# sparsity check that follows.
mod_rec_dummy_prepped <- mod_rec_dummy %>% prep(training = ames_train)
train_dummy_data <- mod_rec_dummy_prepped %>% juice()
train_dummy_data
# Tabulate how often each Neighborhood indicator is 0 vs 1 in the training
# data and list the sparse ones (fewer than 20 ones, or no ones at all),
# together with their zero-to-one frequency ratio.
train_dummy_data %>%
  select(starts_with("Neighborhood_")) %>%
  tidyr::pivot_longer(everything()) %>%
  # count() with explicit columns returns an ungrouped tibble, so no stale
  # grouping carries over into the verbs below.
  count(name, value) %>%
  tidyr::pivot_wider(names_from = value, values_from = n) %>%
  rename(one = `1`, zero = `0`) %>%
  # `one` is NA for an indicator that never equals 1 in the training set.
  filter(one < 20 | is.na(one)) %>%
  # Give the ratio a real name instead of the auto-generated `zero/one`.
  mutate(freq_ratio = zero / one)
# -------
# Dummy-encode, then let step_zv() drop Neighborhood indicators that hold a
# single value in the training set (zero variance).
mod_rec_zv <-
  recipe(
    Sale_Price ~ Longitude + Latitude + Neighborhood,
    data = ames_train
  ) %>%
  step_log(Sale_Price, base = 10) %>%
  step_dummy(all_nominal()) %>%
  step_zv(starts_with("Neighborhood_"))

# Estimate the recipe, print its summary, then print the processed
# training data.
mod_rec_zv_prepped <- mod_rec_zv %>% prep(training = ames_train)
mod_rec_zv_prepped
mod_rec_zv_prepped %>% juice()
# -------
# Dummy-encode, then apply the near-zero-variance filter to the Neighborhood
# indicators. freq_cut is the cutoff on the ratio of the most common value
# to the second most common value; unique_cut is left at its default.
mod_rec_nzv <-
  recipe(
    Sale_Price ~ Longitude + Latitude + Neighborhood,
    data = ames_train
  ) %>%
  step_log(Sale_Price, base = 10) %>%
  step_dummy(all_nominal()) %>%
  step_nzv(starts_with("Neighborhood_"), freq_cut = 200 / 1)

# Estimate the recipe, print its summary, then print the processed
# training data.
mod_rec_nzv_prepped <- mod_rec_nzv %>% prep(training = ames_train)
mod_rec_nzv_prepped
mod_rec_nzv_prepped %>% juice()
# -------
# Near-zero-variance filter with an extremely large frequency-ratio cutoff
# and a 2% unique-value cutoff.
# NOTE(review): freq_cut = 1e10 presumably disables the frequency-ratio
# criterion in practice, leaving only zero-variance columns to be removed;
# confirm against the printed recipe summary.
mod_rec_nzv2 <-
  recipe(
    Sale_Price ~ Longitude + Latitude + Neighborhood,
    data = ames_train
  ) %>%
  step_log(Sale_Price, base = 10) %>%
  step_dummy(all_nominal()) %>%
  step_nzv(
    starts_with("Neighborhood_"),
    freq_cut = 1e10 / 1,
    unique_cut = 2
  )

# Estimate the recipe, print its summary, then print the processed
# training data.
mod_rec_nzv2_prepped <- mod_rec_nzv2 %>% prep(training = ames_train)
mod_rec_nzv2_prepped
mod_rec_nzv2_prepped %>% juice()