# load libraries
+library(tidyverse)
+library(tidymodels)
+library(ggcorrplot)
+library(reshape2)
+library(vip)
+
+# import raw data
+<- read_csv("data/data-diabetes.csv")
+ input_diabetes
+# create BMI variable
+<- 703 # conversion factor to calculate BMI from inches and pounds BMI = weight (lb) / [height (in)]2 x 703
+ conv_factor <- input_diabetes %>%
+ data_diabetes mutate(BMI = weight / height^2 * 703, BMI = round(BMI, 2)) %>%
+ relocate(BMI, .after = id)
+
+# preview data
+glimpse(data_diabetes)
+## Rows: 403
+## Columns: 20
+## $ id <dbl> 1000, 1001, 1002, 1003, 1005, 1008, 1011, 1015, 1016, 1022, 1…
+## $ BMI <dbl> 22.13, 37.42, 48.37, 18.64, 27.82, 26.50, 28.20, 34.33, 24.51…
+## $ chol <dbl> 203, 165, 228, 78, 249, 248, 195, 227, 177, 263, 242, 215, 23…
+## $ stab.glu <dbl> 82, 97, 92, 93, 90, 94, 92, 75, 87, 89, 82, 128, 75, 79, 76, …
+## $ hdl <dbl> 56, 24, 37, 12, 28, 69, 41, 44, 49, 40, 54, 34, 36, 46, 30, 4…
+## $ ratio <dbl> 3.6, 6.9, 6.2, 6.5, 8.9, 3.6, 4.8, 5.2, 3.6, 6.6, 4.5, 6.3, 6…
+## $ glyhb <dbl> 4.31, 4.44, 4.64, 4.63, 7.72, 4.81, 4.84, 3.94, 4.84, 5.78, 4…
+## $ location <chr> "Buckingham", "Buckingham", "Buckingham", "Buckingham", "Buck…
+## $ age <dbl> 46, 29, 58, 67, 64, 34, 30, 37, 45, 55, 60, 38, 27, 40, 36, 3…
+## $ gender <chr> "female", "female", "female", "male", "male", "male", "male",…
+## $ height <dbl> 62, 64, 61, 67, 68, 71, 69, 59, 69, 63, 65, 58, 60, 59, 69, 6…
+## $ weight <dbl> 121, 218, 256, 119, 183, 190, 191, 170, 166, 202, 156, 195, 1…
+## $ frame <chr> "medium", "large", "large", "large", "medium", "large", "medi…
+## $ bp.1s <dbl> 118, 112, 190, 110, 138, 132, 161, NA, 160, 108, 130, 102, 13…
+## $ bp.1d <dbl> 59, 68, 92, 50, 80, 86, 112, NA, 80, 72, 90, 68, 80, NA, 66, …
+## $ bp.2s <dbl> NA, NA, 185, NA, NA, NA, 161, NA, 128, NA, 130, NA, NA, NA, N…
+## $ bp.2d <dbl> NA, NA, 92, NA, NA, NA, 112, NA, 86, NA, 90, NA, NA, NA, NA, …
+## $ waist <dbl> 29, 46, 49, 33, 44, 36, 46, 34, 34, 45, 39, 42, 35, 37, 36, 3…
+## $ hip <dbl> 38, 48, 57, 38, 41, 42, 49, 39, 40, 50, 45, 50, 41, 43, 40, 4…
+## $ time.ppn <dbl> 720, 360, 180, 480, 300, 195, 720, 1020, 300, 240, 300, 90, 7…
+
+# run basic EDA
+# note: we have seen descriptive statistics and plots during EDA session
+# note: so here we only look at missing data and correlation
+
+# calculate number of missing data per variable
+<- data_diabetes %>%
+ data_na summarise(across(everything(), ~ sum(is.na(.))))
+
+# make a table with counts sorted from highest to lowest
+<- data_na %>%
+ data_na_long pivot_longer(-id, names_to = "variable", values_to = "count") %>%
+ arrange(desc(count))
+
+# make a column plot to visualize the counts
+%>%
+ data_na_long ggplot(aes(x = variable, y = count)) +
+ geom_col(fill = "blue4") +
+ xlab("") +
+ theme_bw() +
+ theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
+
+# calculate correlation between numeric variables
+<- data_diabetes %>%
+ data_cor ::select(-id) %>%
+ dplyr::select(where(is.numeric)) %>%
+ dplyrcor(use = "pairwise.complete.obs")
+
+# visualize correlation via heatmap
+ggcorrplot(data_cor, hc.order = TRUE, lab = FALSE)
+
+# based on the number of missing data, let's delete bp.2s, bp.2d
+# and use complete-cases analysis
+<- data_diabetes %>%
+ data_diabetes_narm ::select(-bp.2s, -bp.2d) %>%
+ dplyrna.omit()
2 Demo: a predictive modelling case study
+Let’s use tidymodels
framework to run small predictive case study trying to build a predictive model for BMI using our diabetes
data set. We will use:
-
+
rsamples
for splitting data into test and non-test, as well as creating cross-validation folds
+recipes
for feature engineering, e.g. changing from imperial to metric measurements, removing irrelevant and highly correlated features
+parsnip
to specify Lasso regression model
+tune
to optimize search space for lambda values
+yardstick
to assess predictions
+workflows
to put all the step together
+
2.1 Data import & EDA
+2.2 Data splitting
+# use tidymodels framework to fit Lasso regression model for predicting BMI
+# using repeated cross-validation to tune lambda value in L1 penalty term
+
+# select random seed value
+<- 123
+ myseed
+# split data into non-test (other) and test (80% s)
+set.seed(myseed)
+<- initial_split(data_diabetes_narm, strata = BMI, prop = 0.8) # holds splitting info
+ data_split <- data_split %>% training() # creates non-test set (function is called training but it refers to non-test part)
+ data_other <- data_split %>% testing() # creates test set
+ data_test
+# prepare repeated cross-validation splits with 5 folds repeated 3 times
+set.seed(myseed)
+<- vfold_cv(data_other,
+ data_folds v = 5,
+ repeats = 3,
+ strata = BMI)
+
+# check the split
+dim(data_diabetes)
+## [1] 403 20
+dim(data_other)
+## [1] 291 18
+dim(data_test)
+## [1] 75 18
+
+# check BMI distributions in data splits
+par(mfrow=c(3,1))
+hist(data_diabetes$BMI, xlab = "", main = "BMI: all", 50)
+hist(data_other$BMI, xlab = "", main = "BMI: non-test", 50)
+hist(data_test$BMI, xlab = "", main = "BMI: test", 50)
2.3 Feature engineering
+# create data recipe (feature engineering)
+
+<- 2.54/100
+ inch2m <- 0.45
+ pound2kg
+<- recipe(BMI ~ ., data = data_other) %>%
+ data_recipe update_role(id, new_role = "sampleID") %>%
+ step_mutate(height = height * inch2m, height = round(height, 2)) %>% # convert height to meters
+ step_mutate(weight = weight * pound2kg, weight = round(weight, 2)) %>% # convert weight to kg
+ step_rename(glu = stab.glu) %>% # rename stab.glu to glu
+ step_log(glu) %>% #ln transform glucose
+ step_zv(all_numeric()) %>% # removes variables that are highly sparse and unbalanced (if found)
+ step_corr(all_numeric(), -all_outcomes(), -has_role("sampleID"), threshold = 0.8) %>% # removes variables with large absolute correlations with other variables (if found)
+ step_dummy(location, gender, frame) %>% # convert categorical variables to dummy variables
+ step_normalize(all_numeric(), -all_outcomes(), -has_role("sampleID"), skip = FALSE)
+
+ # you can implement more steps: see https://recipes.tidymodels.org/reference/index.html
+
+# print recipe
+
+ data_recipe
+# check if recipe is doing what it is supposed to do
+# i.e. bake the data
+<- data_recipe %>%
+ data_other_prep prep() %>%
+ bake(new_data = NULL)
+
+## bake test data
+<- data_recipe %>%
+ data_test_prep prep() %>%
+ bake(new_data = data_test)
+
+# preview baked data
+print(head(data_other_prep))
+## # A tibble: 6 × 17
+## id chol glu hdl ratio glyhb age height bp.1s bp.1d hip
+## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
+## 1 1045 -0.319 -0.539 -0.830 0.486 -0.172 -0.645 -0.474 -1.16 -0.552 -1.66
+## 2 1271 0.449 -1.09 -0.324 0.320 -0.458 -1.39 -1.29 -1.59 -1.00 -0.914
+## 3 1277 -0.657 -0.572 2.32 -1.45 -0.642 -0.337 1.56 0.286 2.15 -1.29
+## 4 1303 -0.590 -0.410 -0.436 -0.178 -0.518 0.341 0.545 -0.309 0.500 -1.47
+## 5 1309 -0.206 -0.347 0.687 -0.730 -0.860 -1.32 0.0357 -0.735 -0.402 -1.66
+## 6 1315 -0.793 -0.572 0.350 -0.841 0.225 0.649 1.26 -0.565 -1.45 -1.29
+## # ℹ 6 more variables: time.ppn <dbl>, BMI <dbl>, location_Louisa <dbl>,
+## # gender_male <dbl>, frame_medium <dbl>, frame_small <dbl>
2.4 Lasso regression
+# define model
+<- linear_reg(penalty = tune(), mixture = 1) %>%
+ model set_engine("glmnet") %>%
+ set_mode("regression")
+
+# create workflow with data recipe and model
+<- workflow() %>%
+ wf add_model(model) %>%
+ add_recipe(data_recipe)
+
+# define parameters range for tuning
+<- grid_regular(penalty(), levels = 25)
+ grid_lambda
+# tune lambda
+<- wf %>%
+ model_tune tune_grid(resamples = data_folds,
+ grid = grid_lambda)
+
+# show metrics average across folds
+%>%
+ model_tune collect_metrics(summarize = TRUE)
+ ## # A tibble: 50 × 7
+## penalty .metric .estimator mean n std_err .config
+## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
+## 1 1 e-10 rmse standard 2.48 15 0.142 Preprocessor1_Model01
+## 2 1 e-10 rsq standard 0.851 15 0.0143 Preprocessor1_Model01
+## 3 2.61e-10 rmse standard 2.48 15 0.142 Preprocessor1_Model02
+## 4 2.61e-10 rsq standard 0.851 15 0.0143 Preprocessor1_Model02
+## 5 6.81e-10 rmse standard 2.48 15 0.142 Preprocessor1_Model03
+## 6 6.81e-10 rsq standard 0.851 15 0.0143 Preprocessor1_Model03
+## 7 1.78e- 9 rmse standard 2.48 15 0.142 Preprocessor1_Model04
+## 8 1.78e- 9 rsq standard 0.851 15 0.0143 Preprocessor1_Model04
+## 9 4.64e- 9 rmse standard 2.48 15 0.142 Preprocessor1_Model05
+## 10 4.64e- 9 rsq standard 0.851 15 0.0143 Preprocessor1_Model05
+## # ℹ 40 more rows
+
+# plot k-folds results across lambda range
+%>%
+ model_tune collect_metrics() %>%
+ ::filter(.metric == "rmse") %>%
+ dplyrggplot(aes(penalty, mean, color = .metric)) +
+ geom_errorbar(aes( ymin = mean - std_err, ymax = mean + std_err), alpha = 0.5) +
+ scale_x_log10() +
+ geom_line(linewidth = 1.5) +
+ theme_bw() +
+ theme(legend.position = "none") +
+ scale_color_brewer(palette = "Set1")
+
+ # best lambda value (min. RMSE)
+<- model_tune %>%
+ model_best select_best(metric = "rmse")
+
+print(model_best)
+## # A tibble: 1 × 2
+## penalty .config
+## <dbl> <chr>
+## 1 0.0562 Preprocessor1_Model22
+
+# finalize workflow with tuned model
+<- wf %>%
+ wf_final finalize_workflow(model_best)
+
+# last fit
+<- wf_final %>%
+ fit_final last_fit(split = data_split)
+
+# final predictions
+<- fit_final %>% collect_predictions() # predicted BMI
+ y_test_pred
+# final predictions: performance on test (unseen data)
+%>% collect_metrics()
+ fit_final ## # A tibble: 2 × 4
+## .metric .estimator .estimate .config
+## <chr> <chr> <dbl> <chr>
+## 1 rmse standard 2.79 Preprocessor1_Model1
+## 2 rsq standard 0.857 Preprocessor1_Model1
+
+# plot predictions vs. actual for test data
+plot(data_test$BMI, y_test_pred$.pred, xlab="BMI (actual)", ylab = "BMI (predicted)", las = 1, pch = 19)
+
+# correlation between predicted and actual BMI values for test data
+cor(data_test$BMI, y_test_pred$.pred)
+## [1] 0.9256857
+
+# re-fit model on all non-test data
+<- wf_final %>%
+ model_final fit(data_other)
+
+# show final model
+tidy(model_final)
+## # A tibble: 16 × 3
+## term estimate penalty
+## <chr> <dbl> <dbl>
+## 1 (Intercept) 28.7 0.0562
+## 2 chol 0 0.0562
+## 3 glu 0.0229 0.0562
+## 4 hdl 0 0.0562
+## 5 ratio 0.335 0.0562
+## 6 glyhb -0.0512 0.0562
+## 7 age -0.257 0.0562
+## 8 height -1.86 0.0562
+## 9 bp.1s -0.294 0.0562
+## 10 bp.1d 0.203 0.0562
+## 11 hip 5.60 0.0562
+## 12 time.ppn 0 0.0562
+## 13 location_Louisa -0.160 0.0562
+## 14 gender_male 1.07 0.0562
+## 15 frame_medium -0.320 0.0562
+## 16 frame_small -0.530 0.0562
+
+# plot variables ordered by importance (highest abs(coeff))
+%>%
+ model_final extract_fit_parsnip() %>%
+ vip(geom = "point") +
+ theme_bw()