# 01 categorical predictors.R
# How R handles categorical (factor) predictors in linear models:
# dummy coding, contrast schemes, and follow-up contrasts with emmeans.
library(dplyr)
library(parameters)

# Read the dataset and convert all categorical columns to factors in one go.
anxiety_adhd <- read.csv("anxiety_adhd.csv") |>
  mutate(across(c(ID, treat_group, sex), factor))
# Inspect the data: column types and a preview of the values.
glimpse(anxiety_adhd)

# See the levels of a factor (the FIRST level will matter for dummy coding).
levels(anxiety_adhd$treat_group)

# Unfortunately, regressions (and other models) cannot "deal" with categorical
# predictors; Instead, they require all the predictors to be numerical. The way
# to deal with this is to code categorical variables as a series of numerical
# variables that together encode all the information in the categorical
# variable. These are usually called "dummy variables", as each one alone only
# tells part of the story.
#
# Let's see how we do this in R.
# (1) Model Fitting -------------------------------------------------------

## Adding dummy variables (manually) ---------------------------------------
# Option 1: The SPSS way - make the dummy vars by hand...
##>>>>>>>>>>>>>>>>>>>>>>>>>>##
##      ______________      ##
##    /.--------------.\    ##
##   //                \\   ##
##  //                  \\  ##
##  || .-..----. .-. .--. || ##
##  ||( ( '-..-'|.-.||.-.||| ##
##  || \ \  ||  || ||||_|||| ##
##  ||._) ) ||  \'-'/||-' || ##
##   \\'-'  `'   `-' `'  //  ##
##    \\                //   ##
##     \\______________//    ##
##      '--------------'     ##
##                           ##
##       DO NOT DO THIS      ##
##                           ##
##<<<<<<<<<<<<<<<<<<<<<<<<<<##
# Each dummy is a 0/1 indicator for one level; a row with both dummies equal
# to 0 belongs to the remaining (base) level.
anxiety_adhd <- anxiety_adhd |>
  mutate(
    d_placebo = ifelse(treat_group == "placebo", 1, 0),
    d_treat = ifelse(treat_group == "treat", 1, 0)
  )

# Regress anxiety on the hand-made dummies; the intercept is the base group's
# mean, each slope is that group's difference from the base group.
fit_dummy <- lm(anxiety ~ d_placebo + d_treat,
                data = anxiety_adhd)
model_parameters(fit_dummy)
# what do these parameters mean?
## Adding dummy variables (automagically) ----------------------------------
# Option 2: let R do all the hard work... Just put the factor in the formula!
fit_factor <- lm(anxiety ~ treat_group, data = anxiety_adhd)
model_parameters(fit_factor)
model_parameters(fit_dummy) # same estimates as the hand-made dummies

# How does R determine what and how to dummy-code?
# If `X` is a factor, treatment coding is used, with the FIRST level as the base
# group.
#
# If `X` is a character vector, it is first converted into a factor (level order
# is alphabetical).

# See the coding scheme (rows = factor levels, columns = dummy variables):
contrasts(anxiety_adhd$treat_group)
## Change contrast scheme --------------------------------------------------

## 1. Change the base group in dummy coding:
anxiety_adhd$treat_group <- relevel(anxiety_adhd$treat_group, ref = "placebo")
contrasts(anxiety_adhd$treat_group)

fit_factor2 <- lm(anxiety ~ treat_group, data = anxiety_adhd)
model_parameters(fit_factor2)
# Or change the first level of the factor by re-leveling the factor...
# (`forcats` is a good package for working with factors)

## 2. Change to effects coding:
# Another popular coding scheme is the effects coding, where the "base group" is
# the AVERAGE of all the groups (so the first contrast is the difference between
# the mean of group 1 and the total mean, etc). Unfortunately, it makes
# parameter interpretation quite hard...
contrasts(anxiety_adhd$treat_group) <- contr.sum
contrasts(anxiety_adhd$treat_group)

fit_factor3 <- lm(anxiety ~ treat_group, data = anxiety_adhd)
model_parameters(fit_factor3) # what do these mean???

?contr.treatment # even more types...
# We can also use weighted effect coding with the {wec} package.
# (2) Model exploration (inference) ---------------------------------------
# Looking at the parameters from the last two models, it is hard to see what the
# difference between "no treat" and "treat" groups is... And even if we could
# fish it out, is it significant?
model_parameters(fit_factor2)
model_parameters(fit_factor3)

# If we were working in SPSS, we would re-fit the model with different dummy
# coding to get all the pair-wise differences.
#
# But we are in R! And we have smart packages that can do wonders!
library(emmeans)
# `emmeans` is one of the best packages in R - for ANY follow-up analysis!!

# Estimated marginal means for each treatment group:
emmeans(fit_factor2, ~ treat_group)
emmeans(fit_factor3, ~ treat_group)
# Note that both returned the exact same results - the coding scheme has no
# influence here!

# All pair-wise comparisons (`infer = TRUE` adds CIs and p-values):
emmeans(fit_factor2, ~ treat_group) |>
  contrast(method = "pairwise",
           infer = TRUE)
# Note the automatic p-value adjustment.
# (We will see more complex contrasts when we learn about ANOVAs.)

# This is the time to note that in R, model ~fitting~ and ~hypothesis testing~
# are not as closely knit as they are in SPSS. Whereas in SPSS we would fit
# several models with different dummy coding or variable centering, in R we work
# in 2 stages:
#   1) Fit a model.
#   2) Ask the model questions - test contrasts / effects of interest, get
#      predictions...
#
# This 2-part method makes life A LOT EASIER once models become more and more
# complex...
## Plot ---------------------------------------------------------------------
library(ggeffects)

# Plot the estimated marginal means per treatment group, with the raw data
# points overlaid (jittered to reduce overplotting).
# NOTE: `add.data` was deprecated in ggeffects; `show_data` is the current
# argument name for overlaying raw data on the prediction plot.
ggemmeans(fit_factor2, "treat_group") |>
  plot(show_data = TRUE, jitter = 0.1)