From 79abf62994e59e425b47b83de9fd8ff0b7f0580d Mon Sep 17 00:00:00 2001 From: storresrod <69353664+storresrod@users.noreply.github.com> Date: Fri, 8 Dec 2023 11:04:07 -0500 Subject: [PATCH] Updates to imputation script, including new encoding for categorical variables, adding interaction terms, and increasing checks for missingness Updates to imputation script, including new encoding for categorical variables, adding interaction terms, and increasing checks for missingness --- NLSY/nlsy_impute.R | 75 +++++++++++++++++++++++++++++----------------- NLSY/nlsy_lib.R | 25 ++++++++++++++++ 2 files changed, 72 insertions(+), 28 deletions(-) diff --git a/NLSY/nlsy_impute.R b/NLSY/nlsy_impute.R index 57235ac..d076790 100644 --- a/NLSY/nlsy_impute.R +++ b/NLSY/nlsy_impute.R @@ -1,5 +1,6 @@ #Packages -librarian::shelf(dplyr, mice, car, corrplot, caret) +librarian::shelf(dplyr, mice, car, corrplot, caret, tidyr) +source(here::here("NLSY/nlsy_lib.R")) # Read data nlsydf = readRDS(paste0(here::here(), "/NLSY/NLSY-college-finance.rds")) @@ -24,28 +25,40 @@ test <- nlsydf %>% #Add additional variables that should be considered during mu ) %>% mutate(mom_age = mom_age_birth + age_resp) -# Making dummy variables -test$hisp <- ifelse(test$hispanic == "Yes", 1, 0) +# Making dummy or factor variables +test <- test |> + mutate( + mom_educ_hs = factor(nlsy_encode_educ4(mom_education)), + dad_educ_hs = factor(nlsy_encode_educ4(dad_education)), + race = factor(race), + hisp = factor(hispanic), + race_eth = ifelse(hisp == "Yes" & race == "White", "White Hispanic", NA), # Race and ethnicity interaction + race_eth = ifelse(hisp == "No" & race == "White", "White NonHispanic", race_eth), + race_eth = ifelse(hisp == "Yes" & race == "Black or African American", "Black Hispanic", race_eth), + race_eth = ifelse(hisp == "No" & race == "Black or African American", "Black NonHispanic", race_eth), + race_eth = ifelse(hisp == "Yes" & race == "American Indian, Eskimo, or Aleut", "NonBW Hispanic", race_eth), + race_eth = ifelse(hisp == "Yes" & race == "Asian or Pacific Islander", "NonBW Hispanic", race_eth), + race_eth = ifelse(hisp == "Yes" & race == "Something else? (SPECIFY)", "NonBW Hispanic", race_eth), + race_eth = ifelse(hisp == "No" & race == "American Indian, Eskimo, or Aleut", "NonBW NonHispanic", race_eth), + race_eth = ifelse(hisp == "No" & race == "Asian or Pacific Islander", "NonBW NonHispanic", race_eth), + race_eth = ifelse(hisp == "No" & race == "Something else? (SPECIFY)", "NonBW NonHispanic", race_eth)) + test$ret_sav <- ifelse(test$savings == "YES", 1, 0) test$hown <- ifelse(test$home == "OWNS OR IS BUYING; LAND CONTRACT", 1, 0) test$both_pars <- ifelse(test$both_parents == "Yes", 1, 0) test$par_dec <- ifelse(test$par1_deceased == "Yes" | test$par2_deceased == "Yes", 1, 0) # Either parent deceased -test$mom_educ_hs <- ifelse(test$mom_education == "12TH GRADE" | test$mom_education == "1ST YEAR COLLEGE" | test$mom_education == "2ND YEAR COLLEGE" | test$mom_education == "3RD YEAR COLLEGE" | test$mom_education == "4TH YEAR COLLEGE" | test$mom_education == "5TH YEAR COLLEGE" | test$mom_education == "6TH YEAR COLLEGE" | test$mom_education == "7TH YEAR COLLEGE" | test$mom_education == "8TH YEAR COLLEGE", 1, 0) -test$dad_educ_hs <- ifelse(test$dad_education == "12TH GRADE" | test$dad_education == "1ST YEAR COLLEGE" | test$dad_education == "2ND YEAR COLLEGE" | test$dad_education == "3RD YEAR COLLEGE" | test$mom_education == "4TH YEAR COLLEGE" | test$mom_education == "5TH YEAR COLLEGE" | test$mom_education == "6TH YEAR COLLEGE" | test$mom_education == "7TH YEAR COLLEGE" | test$mom_education == "8TH YEAR COLLEGE", 1, 0) -test$race_w <- ifelse(test$race == "White", 1, 0) -test$race_b <- ifelse(test$race == "Black or African American", 1, 0) # Looking for significance with non-missing income and wealth non_missing <- test %>% filter(!is.na(pincome), !is.na(pnetworth)) -summary(model <- lm(pincome ~ mom_age + hisp + race_w + race_b + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars + par_dec, data = non_missing)) +summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars + par_dec, data = non_missing)) #Non-significant: race_w and par_dec - dropping for now # Checking for multi-collinearity testing_cor <- test %>% - select(-id, -hispanic, -race, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -pincome, -pnetworth, -race_w, -par_dec) %>% + select(-id, -hispanic, -race, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -pincome, -pnetworth, -par_dec, -mom_educ_hs,-dad_educ_hs) %>% drop_na() M <- cor(testing_cor) @@ -54,31 +67,39 @@ corrplot(M, type = "upper") summary(model <- lm(pincome ~ mom_age, data = test)) #Adj R2 = 0.057 summary(model <- lm(pincome ~ mom_age + hisp, data = test)) #Adj R2 = 0.09 car::vif(model) -summary(model <- lm(pincome ~ mom_age + hisp + race_b, data = test)) # #Adj R2 = 0.154 +summary(model <- lm(pincome ~ mom_age + hisp + race, data = test)) # #Adj R2 = 0.157 car::vif(model) -summary(model <- lm(pincome ~ mom_age + hisp + race_b + mom_educ_hs, data = test)) #Adj R2 = 0.1767 +summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs, data = test)) #Adj R2 = 0.1899 car::vif(model) -summary(model <- lm(pincome ~ mom_age + hisp + race_b + mom_educ_hs + dad_educ_hs, data = test)) #Adj R2 = 0.1336 +summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs, data = test)) #Adj R2 = 0.1616 car::vif(model) -summary(model <- lm(pincome ~ mom_age + hisp + race_b + mom_educ_hs + dad_educ_hs + ret_sav, data = test)) #Adj R2 = 0.2047 - biggest R2 jump +summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav, data = test)) #Adj R2 = 0.199 car::vif(model) -summary(model <- lm(pincome ~ mom_age + hisp + race_b + mom_educ_hs + dad_educ_hs + ret_sav + hown, data = test)) #Adj R2 = 0.2165 +summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown, data = test)) #Adj R2 = 0.2107 car::vif(model) -summary(model <- lm(pincome ~ mom_age + hisp + race_b + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars, data = test)) #Adj R2 = 0.2178 +summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars, data = test)) #Adj R2 = 0.2108 car::vif(model) # Making df for imputation impute <- test %>% - select(-hispanic, -race, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -race_w, -par_dec, -age_resp) + select(-hispanic, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -par_dec, -age_resp) t <- nearZeroVar(impute) # no variables are found to have near zero variance # Checking for missings in the predictor variables -sapply(impute, function(x) sum(is.na(x))) +sapply(impute, function(x) sum(is.na(x))) # dad_educ_hs has 4598 missings + +# Calculating percent of usable cases +m <- md.pairs(impute) +round(m$mr/(m$mr + m$mm), 3) #looking for low proportions: where both target and predictor are missing on the same cases +#Note: For row target pincome and pnetworth respectively, predictors dad_educ_hs, ret_sav, and hown have lowest proportions + +impute <- impute %>% + select(-dad_educ_hs) # Paper that uses multiple imputation: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6049093/ # Multiple imputation in R reference: https://library.virginia.edu/data/articles/getting-started-with-multiple-imputation-in-r @@ -88,17 +109,15 @@ predM = init$predictorMatrix # Set distributions num <- c("mom_age") # numerical variables -log <- c("hisp", "race_b", "mom_educ_hs", "dad_educ_hs", "ret_sav", "hown", "both_pars") # binary variables - -meth[num] = "norm" # numerical variables -meth[log] = "logreg" # binary variables +log <- c("hisp", "ret_sav", "hown", "both_pars") # binary variables +poly <- c("mom_educ_hs") # ordered categorical variables +poly2 <- c("race", "race_eth")# Unordered categorical variable # Assign Variables to be imputed predM[,c("id")] <- 0 # not using as a predictor variable predM[c("id"),] <- 0 # not using as a predictor variable -predM[,c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "dad_educ_hs", "race_b", "both_pars")] <- 0 # not imputing - -meth[c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "dad_educ_hs", "race_b", "both_pars")]= "" #not imputing missing values, just using as a predictor +predM[,c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "race", "both_pars", "race_eth")] <- 0 # not imputing +meth[c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "race", "both_pars", "race_eth")]= "" #not imputing missing values, just using as a predictor # Income and Wealth imputation set.seed(25) @@ -112,11 +131,11 @@ final <- imputed %>% ret_sav = ret_sav.1, hown = hown.1, mom_educ_hs = mom_educ_hs.1, - dad_educ_hs = dad_educ_hs.1, - race_b = race_b.1, + race = race.1, hisp = hisp.1, - both_pars = both_pars.1) %>% - select(-starts_with("id."), -starts_with("mom_age."), -starts_with("ret_sav."), -starts_with("hown."), -starts_with("mom_educ_hs."), -starts_with("dad_educ_hs."), -starts_with("race_b."), -starts_with("both_pars."), -starts_with("hisp.")) + both_pars = both_pars.1, + race_eth = race_eth.1) %>% + select(-starts_with("id."), -starts_with("mom_age."), -starts_with("ret_sav."), -starts_with("hown."), -starts_with("mom_educ_hs."), -starts_with("dad_educ_hs."), -starts_with("race."), -starts_with("both_pars."), -starts_with("hisp."), -starts_with("race_eth.")) # Join back with original income and wealth og_values <- impute %>% diff --git a/NLSY/nlsy_lib.R b/NLSY/nlsy_lib.R index 9e61321..a98bfa0 100644 --- a/NLSY/nlsy_lib.R +++ b/NLSY/nlsy_lib.R @@ -275,3 +275,28 @@ nlsy_get_student_loans_df = function() mutate(hasdebt=ifelse(debt>0, 1, 0)) return(sloandf) } + +# Encode education variable +nlsy_encode_educ4 = function(var) +{ + return( case_when( + {{var}} %in% c( + "12TH GRADE" + ) ~ "High-school graduate", + {{var}} %in% c( + "1ST YEAR COLLEGE", + "2ND YEAR COLLEGE", + "3RD YEAR COLLEGE" + ) ~ "Some college", + {{var}} %in% c( + "4TH YEAR COLLEGE", + "5TH YEAR COLLEGE" + ) ~ "College degree", + {{var}} %in% c( + "6TH YEAR COLLEGE", + "7TH YEAR COLLEGE", + "8TH YEAR COLLEGE" + ) ~ "Graduate degree" + ) + ) +}