diff --git a/session-tidymodels/_quarto.yml b/session-tidymodels/_quarto.yml
index 8243c67..23333ce 100644
--- a/session-tidymodels/_quarto.yml
+++ b/session-tidymodels/_quarto.yml
@@ -10,11 +10,10 @@ book:
     - index.qmd
     - intro.qmd
    - case-study.qmd
-    - exercises.qmd
+    #- exercises.qmd
 
 bibliography: references.bib
 
-
 format:
   html:
     theme: spacebar
diff --git a/session-tidymodels/case-study.qmd b/session-tidymodels/case-study.qmd
index bd09984..e0d69b9 100644
--- a/session-tidymodels/case-study.qmd
+++ b/session-tidymodels/case-study.qmd
@@ -134,7 +134,6 @@
 hist(data_diabetes$BMI, xlab = "", main = "BMI: all", 50)
 hist(data_other$BMI, xlab = "", main = "BMI: non-test", 50)
 hist(data_test$BMI, xlab = "", main = "BMI: test", 50)
-
 ```
 
 ## Feature engineering
@@ -236,7 +235,8 @@ model_tune %>%
 
 # best lambda value (min. RMSE)
 model_best <- model_tune %>%
-  select_best("rmse")
+  select_best(metric = "rmse")
+
 print(model_best)
 
 # finalize workflow with tuned model
diff --git a/session-tidymodels/docs/case-study.html b/session-tidymodels/docs/case-study.html
new file mode 100644
index 0000000..700e941
--- /dev/null
+++ b/session-tidymodels/docs/case-study.html
@@ -0,0 +1,785 @@
+Tidymodels - 2  Demo: a predictive modelling case study
2  Demo: a predictive modelling case study

Let’s use the tidymodels framework to run a small predictive case study: building a model to predict BMI from our diabetes data set, using Lasso regression with the penalty (lambda) tuned by repeated cross-validation.

2.1 Data import & EDA

# load libraries
+library(tidyverse)
+library(tidymodels)
+library(ggcorrplot)
+library(reshape2)
+library(vip)
+
+# import raw data
+input_diabetes <- read_csv("data/data-diabetes.csv")
+
+# create BMI variable
+conv_factor <- 703 # conversion factor to calculate BMI from pounds and inches: BMI = weight (lb) / [height (in)]^2 x 703
+data_diabetes <- input_diabetes %>%
+  mutate(BMI = weight / height^2 * 703, BMI = round(BMI, 2)) %>%
+  relocate(BMI, .after = id)
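+# illustrative check (added note, not in the original script): for the first sample,
+# 121 lb at 62 in, 121 / 62^2 * 703 = 22.13, matching the first BMI value shown below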
+
+# preview data
+glimpse(data_diabetes)
+## Rows: 403
+## Columns: 20
+## $ id       <dbl> 1000, 1001, 1002, 1003, 1005, 1008, 1011, 1015, 1016, 1022, 1…
+## $ BMI      <dbl> 22.13, 37.42, 48.37, 18.64, 27.82, 26.50, 28.20, 34.33, 24.51…
+## $ chol     <dbl> 203, 165, 228, 78, 249, 248, 195, 227, 177, 263, 242, 215, 23…
+## $ stab.glu <dbl> 82, 97, 92, 93, 90, 94, 92, 75, 87, 89, 82, 128, 75, 79, 76, …
+## $ hdl      <dbl> 56, 24, 37, 12, 28, 69, 41, 44, 49, 40, 54, 34, 36, 46, 30, 4…
+## $ ratio    <dbl> 3.6, 6.9, 6.2, 6.5, 8.9, 3.6, 4.8, 5.2, 3.6, 6.6, 4.5, 6.3, 6…
+## $ glyhb    <dbl> 4.31, 4.44, 4.64, 4.63, 7.72, 4.81, 4.84, 3.94, 4.84, 5.78, 4…
+## $ location <chr> "Buckingham", "Buckingham", "Buckingham", "Buckingham", "Buck…
+## $ age      <dbl> 46, 29, 58, 67, 64, 34, 30, 37, 45, 55, 60, 38, 27, 40, 36, 3…
+## $ gender   <chr> "female", "female", "female", "male", "male", "male", "male",…
+## $ height   <dbl> 62, 64, 61, 67, 68, 71, 69, 59, 69, 63, 65, 58, 60, 59, 69, 6…
+## $ weight   <dbl> 121, 218, 256, 119, 183, 190, 191, 170, 166, 202, 156, 195, 1…
+## $ frame    <chr> "medium", "large", "large", "large", "medium", "large", "medi…
+## $ bp.1s    <dbl> 118, 112, 190, 110, 138, 132, 161, NA, 160, 108, 130, 102, 13…
+## $ bp.1d    <dbl> 59, 68, 92, 50, 80, 86, 112, NA, 80, 72, 90, 68, 80, NA, 66, …
+## $ bp.2s    <dbl> NA, NA, 185, NA, NA, NA, 161, NA, 128, NA, 130, NA, NA, NA, N…
+## $ bp.2d    <dbl> NA, NA, 92, NA, NA, NA, 112, NA, 86, NA, 90, NA, NA, NA, NA, …
+## $ waist    <dbl> 29, 46, 49, 33, 44, 36, 46, 34, 34, 45, 39, 42, 35, 37, 36, 3…
+## $ hip      <dbl> 38, 48, 57, 38, 41, 42, 49, 39, 40, 50, 45, 50, 41, 43, 40, 4…
+## $ time.ppn <dbl> 720, 360, 180, 480, 300, 195, 720, 1020, 300, 240, 300, 90, 7…
+
+# run basic EDA
+# note: we have seen descriptive statistics and plots during EDA session 
+# note: so here we only look at missing data and correlation
+
+# calculate number of missing data per variable
+data_na <- data_diabetes %>% 
+  summarise(across(everything(), ~ sum(is.na(.)))) 
+
+# make a table with counts sorted from highest to lowest
+data_na_long <- data_na %>%
+  pivot_longer(-id, names_to = "variable", values_to = "count") %>%
+  arrange(desc(count)) 
+
+# make a column plot to visualize the counts
+data_na_long %>%
+  ggplot(aes(x = variable, y = count)) + 
+  geom_col(fill = "blue4") + 
+  xlab("") + 
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
+
+# calculate correlation between numeric variables
+data_cor <- data_diabetes %>% 
+  dplyr::select(-id) %>% 
+  dplyr::select(where(is.numeric)) %>%
+  cor(use = "pairwise.complete.obs")
+
+# visualize correlation via heatmap
+ggcorrplot(data_cor, hc.order = TRUE, lab = FALSE)
+
+# based on the number of missing data, let's delete bp.2s, bp.2d
+# and use complete-cases analysis 
+data_diabetes_narm <- data_diabetes %>%
+  dplyr::select(-bp.2s, -bp.2d) %>%
+  na.omit()
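+# added note: dropping bp.2s/bp.2d and keeping complete cases leaves 366 samples
+# (291 non-test + 75 test, as the split dimensions below show)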
Figure: Number of missing values per variable, showing that bp.2d and bp.2s have more than 50% missing entries.
Figure: Heatmap visualizing Pearson correlation coefficients between numerical variables.

2.2 Data splitting

# use tidymodels framework to fit Lasso regression model for predicting BMI
+# using repeated cross-validation to tune lambda value in L1 penalty term
+
+# select random seed value
+myseed <- 123
+
+# split data into non-test (other) and test sets (80% non-test / 20% test)
+set.seed(myseed)
+data_split <- initial_split(data_diabetes_narm, strata = BMI, prop = 0.8) # holds splitting info
+data_other <- data_split %>% training() # creates non-test set (function is called training but it refers to non-test part)
+data_test <- data_split %>% testing() # creates test set
+
+# prepare repeated cross-validation splits with 5 folds repeated 3 times
+set.seed(myseed)
+data_folds <- vfold_cv(data_other,
+                       v = 5, 
+                       repeats = 3,
+                       strata = BMI)
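+# added note: 5 folds x 3 repeats = 15 resamples, which is the n = 15 reported
+# later when metrics are averaged across folds with collect_metrics()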
+
+# check the split
+dim(data_diabetes)
+## [1] 403  20
+dim(data_other)
+## [1] 291  18
+dim(data_test)
+## [1] 75 18
+
+# check BMI distributions in data splits
+par(mfrow=c(3,1))
+hist(data_diabetes$BMI, xlab = "", main = "BMI: all", 50)
+hist(data_other$BMI, xlab = "", main = "BMI: non-test", 50)
+hist(data_test$BMI, xlab = "", main = "BMI: test", 50)
Figure: Distribution of BMI values for all data and for the non-test and test splits.

2.3 Feature engineering

# create data recipe (feature engineering)
+
+inch2m <- 2.54/100
+pound2kg <- 0.45
+
+data_recipe <- recipe(BMI ~ ., data = data_other) %>%
+  update_role(id, new_role = "sampleID") %>%
+  step_mutate(height = height * inch2m, height = round(height, 2)) %>% # convert height to meters
+  step_mutate(weight = weight * pound2kg, weight = round(weight, 2)) %>% # convert weight to kg
+  step_rename(glu = stab.glu) %>% # rename stab.glu to glu
+  step_log(glu) %>%  #ln transform glucose
+  step_zv(all_numeric()) %>% # removes zero-variance variables, i.e. those containing only a single value (if found)
+  step_corr(all_numeric(), -all_outcomes(), -has_role("sampleID"), threshold = 0.8) %>% # removes variables with large absolute correlations with other variables (if found)
+  step_dummy(location, gender, frame) %>% # convert categorical variables to dummy variables
+  step_normalize(all_numeric(), -all_outcomes(), -has_role("sampleID"), skip = FALSE) 
+  
+  # you can implement more steps: see https://recipes.tidymodels.org/reference/index.html
+
+# print recipe
+data_recipe
+
+# check if recipe is doing what it is supposed to do
+# i.e. bake the data
+data_other_prep <- data_recipe %>%
+  prep() %>%
+  bake(new_data = NULL)
+
+## bake test data
+data_test_prep <- data_recipe %>%
+  prep() %>%
+  bake(new_data = data_test)
+
+# preview baked data
+print(head(data_other_prep))
+## # A tibble: 6 × 17
+##      id   chol    glu    hdl  ratio  glyhb    age  height  bp.1s  bp.1d    hip
+##   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
+## 1  1045 -0.319 -0.539 -0.830  0.486 -0.172 -0.645 -0.474  -1.16  -0.552 -1.66 
+## 2  1271  0.449 -1.09  -0.324  0.320 -0.458 -1.39  -1.29   -1.59  -1.00  -0.914
+## 3  1277 -0.657 -0.572  2.32  -1.45  -0.642 -0.337  1.56    0.286  2.15  -1.29 
+## 4  1303 -0.590 -0.410 -0.436 -0.178 -0.518  0.341  0.545  -0.309  0.500 -1.47 
+## 5  1309 -0.206 -0.347  0.687 -0.730 -0.860 -1.32   0.0357 -0.735 -0.402 -1.66 
+## 6  1315 -0.793 -0.572  0.350 -0.841  0.225  0.649  1.26   -0.565 -1.45  -1.29 
+## # ℹ 6 more variables: time.ppn <dbl>, BMI <dbl>, location_Louisa <dbl>,
+## #   gender_male <dbl>, frame_medium <dbl>, frame_small <dbl>
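
More preprocessing steps are available than the ones used here (see the recipes reference linked in the code above). As a sketch only, assuming the standard recipes step functions named below (not part of this case study), optional additions could look like this:

+# illustrative only: optional extra steps appended to the existing recipe
+data_recipe_extended <- data_recipe %>%
+  step_impute_median(all_numeric_predictors()) %>% # impute missing numeric values (a no-op here, the data is complete-case)
+  step_YeoJohnson(all_numeric_predictors()) # reduce skewness in numeric predictors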

2.4 Lasso regression

# define model
+model <- linear_reg(penalty = tune(), mixture = 1) %>%
+  set_engine("glmnet") %>%
+  set_mode("regression")
+
+# create workflow with data recipe and model 
+wf <- workflow() %>%
+  add_model(model) %>%
+  add_recipe(data_recipe)
+
+# define parameters range for tuning
+grid_lambda <- grid_regular(penalty(), levels = 25)
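+# added note: by default penalty() covers 10^-10 to 10^0 on a log10 scale,
+# so 25 levels give lambda values from 1e-10 up to 1 (see the tuning results below)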
+
+# tune lambda
+model_tune <- wf %>%
+  tune_grid(resamples = data_folds, 
+            grid = grid_lambda)
+
+# show metrics average across folds
+model_tune  %>%
+  collect_metrics(summarize = TRUE)
+## # A tibble: 50 × 7
+##     penalty .metric .estimator  mean     n std_err .config              
+##       <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
+##  1 1   e-10 rmse    standard   2.48     15  0.142  Preprocessor1_Model01
+##  2 1   e-10 rsq     standard   0.851    15  0.0143 Preprocessor1_Model01
+##  3 2.61e-10 rmse    standard   2.48     15  0.142  Preprocessor1_Model02
+##  4 2.61e-10 rsq     standard   0.851    15  0.0143 Preprocessor1_Model02
+##  5 6.81e-10 rmse    standard   2.48     15  0.142  Preprocessor1_Model03
+##  6 6.81e-10 rsq     standard   0.851    15  0.0143 Preprocessor1_Model03
+##  7 1.78e- 9 rmse    standard   2.48     15  0.142  Preprocessor1_Model04
+##  8 1.78e- 9 rsq     standard   0.851    15  0.0143 Preprocessor1_Model04
+##  9 4.64e- 9 rmse    standard   2.48     15  0.142  Preprocessor1_Model05
+## 10 4.64e- 9 rsq     standard   0.851    15  0.0143 Preprocessor1_Model05
+## # ℹ 40 more rows
+
+# plot k-folds results across lambda range
+model_tune %>%
+  collect_metrics() %>% 
+  dplyr::filter(.metric == "rmse") %>% 
+  ggplot(aes(penalty, mean, color = .metric)) +
+  geom_errorbar(aes( ymin = mean - std_err, ymax = mean + std_err), alpha = 0.5) +
+  scale_x_log10() + 
+  geom_line(linewidth = 1.5) +
+  theme_bw() +
+  theme(legend.position = "none") +
+  scale_color_brewer(palette = "Set1")
+  
+# best lambda value (min. RMSE)
+model_best <- model_tune %>%
+  select_best(metric = "rmse")
+
+print(model_best)
+## # A tibble: 1 × 2
+##   penalty .config              
+##     <dbl> <chr>                
+## 1  0.0562 Preprocessor1_Model22
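+# added note: an alternative to select_best() is the one-standard-error rule, which
+# picks the most regularized model within one SE of the best RMSE, e.g. (not run):
+# model_tune %>% select_by_one_std_err(desc(penalty), metric = "rmse")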
+
+# finalize workflow with tuned model
+wf_final <- wf %>%
+  finalize_workflow(model_best)
+
+# last fit 
+fit_final <- wf_final %>%
+  last_fit(split = data_split)
+
+# final predictions
+y_test_pred <- fit_final %>% collect_predictions() # predicted BMI
+
+# final predictions: performance on test (unseen data)
+fit_final %>% collect_metrics() 
+## # A tibble: 2 × 4
+##   .metric .estimator .estimate .config             
+##   <chr>   <chr>          <dbl> <chr>               
+## 1 rmse    standard       2.79  Preprocessor1_Model1
+## 2 rsq     standard       0.857 Preprocessor1_Model1
+
+# plot predictions vs. actual for test data
+plot(data_test$BMI, y_test_pred$.pred, xlab="BMI (actual)", ylab = "BMI (predicted)", las = 1, pch = 19)
+
+# correlation between predicted and actual BMI values for test data
+cor(data_test$BMI, y_test_pred$.pred)
+## [1] 0.9256857
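+# added note: squaring this correlation, 0.9257^2 = 0.857, recovers the rsq
+# reported for the test set above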
+
+# re-fit model on all non-test data
+model_final <- wf_final %>%
+  fit(data_other) 
+
+# show final model
+tidy(model_final)
+## # A tibble: 16 × 3
+##    term            estimate penalty
+##    <chr>              <dbl>   <dbl>
+##  1 (Intercept)      28.7     0.0562
+##  2 chol              0       0.0562
+##  3 glu               0.0229  0.0562
+##  4 hdl               0       0.0562
+##  5 ratio             0.335   0.0562
+##  6 glyhb            -0.0512  0.0562
+##  7 age              -0.257   0.0562
+##  8 height           -1.86    0.0562
+##  9 bp.1s            -0.294   0.0562
+## 10 bp.1d             0.203   0.0562
+## 11 hip               5.60    0.0562
+## 12 time.ppn          0       0.0562
+## 13 location_Louisa  -0.160   0.0562
+## 14 gender_male       1.07    0.0562
+## 15 frame_medium     -0.320   0.0562
+## 16 frame_small      -0.530   0.0562
+
+# plot variables ordered by importance (highest abs(coeff))
+model_final %>%
+  extract_fit_parsnip() %>%
+  vip(geom = "point") + 
+  theme_bw()
Figure: Mean RMSE ± standard error across repeated cross-validation folds, as a function of the lambda penalty.
Figure: Predicted vs. actual BMI values for the test (unseen) data, using the final model.
Figure: Top features by importance, measured as the absolute values of the Lasso regression coefficients from the final tuned model.
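
The re-fitted workflow could also be used to predict BMI for new samples. A minimal sketch, assuming new data with the same columns as the training data (here illustrated by simply reusing the test set):

+# illustrative only: predict BMI for new observations with the re-fitted workflow
+new_predictions <- predict(model_final, new_data = data_test)
+head(new_predictions$.pred)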
\ No newline at end of file
diff --git a/session-tidymodels/case-study_files/figure-html/data-split-1.png b/session-tidymodels/docs/case-study_files/figure-html/data-split-1.png
similarity index 100%
rename from session-tidymodels/case-study_files/figure-html/data-split-1.png
rename to session-tidymodels/docs/case-study_files/figure-html/data-split-1.png
diff --git a/session-tidymodels/case-study_files/figure-html/load-data-1.png b/session-tidymodels/docs/case-study_files/figure-html/load-data-1.png
similarity index 100%
rename from session-tidymodels/case-study_files/figure-html/load-data-1.png
rename to session-tidymodels/docs/case-study_files/figure-html/load-data-1.png
diff --git a/session-tidymodels/case-study_files/figure-html/load-data-2.png b/session-tidymodels/docs/case-study_files/figure-html/load-data-2.png
similarity index 100%
rename from session-tidymodels/case-study_files/figure-html/load-data-2.png
rename to session-tidymodels/docs/case-study_files/figure-html/load-data-2.png
diff --git a/session-tidymodels/docs/case-study_files/figure-html/unnamed-chunk-4-1.png b/session-tidymodels/docs/case-study_files/figure-html/unnamed-chunk-4-1.png
new file mode 100644
index 0000000..3188bd1
Binary files /dev/null and b/session-tidymodels/docs/case-study_files/figure-html/unnamed-chunk-4-1.png differ
diff --git a/session-tidymodels/docs/case-study_files/figure-html/unnamed-chunk-4-2.png b/session-tidymodels/docs/case-study_files/figure-html/unnamed-chunk-4-2.png
new file mode 100644
index 0000000..715bffe
Binary files /dev/null and b/session-tidymodels/docs/case-study_files/figure-html/unnamed-chunk-4-2.png differ
diff --git a/session-tidymodels/docs/case-study_files/figure-html/unnamed-chunk-4-3.png b/session-tidymodels/docs/case-study_files/figure-html/unnamed-chunk-4-3.png
new file mode 100644
index 0000000..4371bca
Binary files /dev/null and b/session-tidymodels/docs/case-study_files/figure-html/unnamed-chunk-4-3.png differ
diff --git a/session-tidymodels/docs/index.html b/session-tidymodels/docs/index.html
index 6ff62b1..49330c9 100644
--- a/session-tidymodels/docs/index.html
+++ b/session-tidymodels/docs/index.html
@@ -108,7 +108,7 @@
@@ -168,6 +162,7 @@
