#Packages
+::shelf(dplyr, mice, car, corrplot, caret, tidyr)
+ librariansource(here::here("NLSY/nlsy_lib.R"))
+
+# Read data
+= readRDS(paste0(here::here(), "/NLSY/NLSY-college-finance.rds")) nlsydf
+
+
+
+
+
+
+
+NLSY Parent Income and Wealth Imputation
+Data Cleaning
+
+
+
+
+
+
+<- nlsydf %>% #Add additional variables that should be considered during multiple imputation
+ test select(id = "PUBID_1997",
+ mom_age_birth = "CV_BIO_MOM_AGE_YOUTH_1997", # numeric
+ age_resp = "CV_AGE_12/31/96_1997",
+ hispanic = "KEY_ETHNICITY_1997", # binary
+ race = "KEY_RACE_1997", # categorical
+ mom_education = "CV_HGC_RES_MOM_1997", # categorical - highest grade completed in 1997 by mom
+ dad_education = "CV_HGC_RES_DAD_1997", # categorical - highest grade completed in 1997 by dad
+ savings = "P5-130_1997", #binary - yes/no retirement savings
+ home = "P5-101_1997", # binary - homeownership or renting
+ both_parents = "YOUTH_BOTHBIO.01_1997",
+ par1_deceased = "YOUTH_NONR1DEAD.01_1997",
+ par2_deceased = "YOUTH_NONR2DEAD.01_1997",
+ pincome = "CV_INCOME_GROSS_YR_1997",
+ pnetworth = "CV_HH_NET_WORTH_P_1997",
+ # Also, it seems that this represents the wealth of the household in which a respondent lives. That means that for a respondent who lives with only one parent, the net-worth variable represents the wealth of that parent.
+ # In the case of an independent respondent, contains respondent's, rather than parents', wealth.
+ %>%
+ ) mutate(mom_age = mom_age_birth + age_resp)
+
+# Making dummy or factor variables
+<- test |>
+ test mutate(
+ mom_educ_hs = factor(nlsy_encode_educ4(mom_education)),
+ dad_educ_hs = factor(nlsy_encode_educ4(dad_education)),
+ race = factor(race),
+ hisp = factor(hispanic),
+ race_eth = ifelse(hisp == "Yes" & race == "White", "White Hispanic", NA), # Race and ethnicity interaction
+ race_eth = ifelse(hisp == "No" & race == "White", "White NonHispanic", race_eth),
+ race_eth = ifelse(hisp == "Yes" & race == "Black or African American", "Black Hispanic", race_eth),
+ race_eth = ifelse(hisp == "No" & race == "Black or African American", "Black NonHispanic", race_eth),
+ race_eth = ifelse(hisp == "Yes" & race == "American Indian, Eskimo, or Aleut", "NonBW Hispanic", race_eth),
+ race_eth = ifelse(hisp == "Yes" & race == "Asian or Pacific Islander", "NonBW Hispanic", race_eth),
+ race_eth = ifelse(hisp == "Yes" & race == "Something else? (SPECIFY)", "NonBW Hispanic", race_eth),
+ race_eth = ifelse(hisp == "No" & race == "American Indian, Eskimo, or Aleut", "NonBW NonHispanic", race_eth),
+ race_eth = ifelse(hisp == "No" & race == "Asian or Pacific Islander", "NonBW NonHispanic", race_eth),
+ race_eth = ifelse(hisp == "No" & race == "Something else? (SPECIFY)", "NonBW NonHispanic", race_eth),
+ race_eth = factor(race_eth))
+
+$ret_sav <- ifelse(test$savings == "YES", 1, 0)
+ test$hown <- ifelse(test$home == "OWNS OR IS BUYING; LAND CONTRACT", 1, 0)
+ test$both_pars <- ifelse(test$both_parents == "Yes", 1, 0)
+ test$par_dec <- ifelse(test$par1_deceased == "Yes" | test$par2_deceased == "Yes", 1, 0) # Either parent deceased test
+
+
+
+# Looking for significance with non-missing income and wealth
+<- test %>%
+ non_missing filter(!is.na(pincome),
+ !is.na(pnetworth))
+
+summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars + par_dec, data = non_missing))
+
+
+Call:
+lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs +
+ dad_educ_hs + ret_sav + hown + both_pars + par_dec, data = non_missing)
+
+Residuals:
+ Min 1Q Median 3Q Max
+-106496 -21175 -5574 11535 209949
+
+Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) 21684.1 7866.7 2.756 0.005891 **
+mom_age 1013.2 179.8 5.636 1.96e-08 ***
+hispYes -6942.6 3097.5 -2.241 0.025101 *
+raceBlack or African American -8065.7 2388.8 -3.376 0.000747 ***
+raceAmerican Indian, Eskimo, or Aleut -1839.5 10957.3 -0.168 0.866694
+raceAsian or Pacific Islander 7165.9 5949.4 1.204 0.228538
+raceSomething else? (SPECIFY) -351.9 3970.4 -0.089 0.929391
+mom_educ_hsGraduate degree 12856.9 3990.5 3.222 0.001292 **
+mom_educ_hsHigh-school graduate -11316.3 2439.1 -4.640 3.69e-06 ***
+mom_educ_hsSome college -10224.4 2379.7 -4.297 1.81e-05 ***
+dad_educ_hsGraduate degree 10419.2 3301.8 3.156 0.001623 **
+dad_educ_hsHigh-school graduate -14369.3 2361.0 -6.086 1.36e-09 ***
+dad_educ_hsSome college -8036.7 2355.0 -3.413 0.000655 ***
+ret_sav 16350.9 1867.5 8.756 < 2e-16 ***
+hown 14618.6 2248.5 6.501 9.80e-11 ***
+both_pars -2946.4 2203.9 -1.337 0.181396
+par_dec 1758.2 8311.9 0.212 0.832491
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 37700 on 2215 degrees of freedom
+ (3601 observations deleted due to missingness)
+Multiple R-squared: 0.2221, Adjusted R-squared: 0.2165
+F-statistic: 39.52 on 16 and 2215 DF, p-value: < 2.2e-16
+#Non-significant: par_dec - dropping for now
Checking for multi-collinearity - correlation plot for numeric variables
+
+
+
+<- test %>%
+ testing_cor select(-id, -hispanic, -race, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -pincome, -pnetworth, -par_dec, -mom_educ_hs,-dad_educ_hs, -hisp,-race_eth) %>%
+ drop_na()
+
+<- cor(testing_cor)
+ M corrplot(M, type = "upper")
+
+
+Checking for multi-collinearity - VIF and Successive addition of regressors
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+summary(model <- lm(pincome ~ mom_age, data = test)) #Adj R2 = 0.057
+
+
+Call:
+lm(formula = pincome ~ mom_age, data = test)
+
+Residuals:
+ Min 1Q Median 3Q Max
+-110821 -25331 -7287 15521 215185
+
+Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) -26029.36 3756.14 -6.93 4.64e-12 ***
+mom_age 1848.97 94.67 19.53 < 2e-16 ***
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 40710 on 6210 degrees of freedom
+ (2772 observations deleted due to missingness)
+Multiple R-squared: 0.05787, Adjusted R-squared: 0.05771
+F-statistic: 381.4 on 1 and 6210 DF, p-value: < 2.2e-16
+summary(model <- lm(pincome ~ mom_age + hisp, data = test)) #Adj R2 = 0.09
+
+
+Call:
+lm(formula = pincome ~ mom_age + hisp, data = test)
+
+Residuals:
+ Min 1Q Median 3Q Max
+-114431 -25073 -7057 15049 224819
+
+Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) -21449.50 3706.92 -5.786 7.55e-09 ***
+mom_age 1828.75 93.11 19.640 < 2e-16 ***
+hispYes -19072.61 1275.23 -14.956 < 2e-16 ***
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 40000 on 6202 degrees of freedom
+ (2779 observations deleted due to missingness)
+Multiple R-squared: 0.09091, Adjusted R-squared: 0.09062
+F-statistic: 310.1 on 2 and 6202 DF, p-value: < 2.2e-16
+::vif(model) car
+
+ mom_age hisp
+1.000325 1.000325
+summary(model <- lm(pincome ~ mom_age + hisp + race, data = test)) # #Adj R2 = 0.157
+
+
+Call:
+lm(formula = pincome ~ mom_age + hisp + race, data = test)
+
+Residuals:
+ Min 1Q Median 3Q Max
+-119072 -22397 -7121 13487 224217
+
+Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) -1001.21 3706.86 -0.270 0.7871
+mom_age 1499.44 91.34 16.415 < 2e-16 ***
+hispYes -23096.47 1578.75 -14.630 < 2e-16 ***
+raceBlack or African American -25965.05 1202.19 -21.598 < 2e-16 ***
+raceAmerican Indian, Eskimo, or Aleut -12807.11 5671.04 -2.258 0.0240 *
+raceAsian or Pacific Islander 18250.00 4471.05 4.082 4.53e-05 ***
+raceSomething else? (SPECIFY) -4626.23 1981.05 -2.335 0.0196 *
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 38540 on 6164 degrees of freedom
+ (2813 observations deleted due to missingness)
+Multiple R-squared: 0.1581, Adjusted R-squared: 0.1573
+F-statistic: 193 on 6 and 6164 DF, p-value: < 2.2e-16
+::vif(model) car
+
+ GVIF Df GVIF^(1/(2*Df))
+mom_age 1.030125 1 1.014951
+hisp 1.616650 1 1.271476
+race 1.662056 4 1.065567
+summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs, data = test)) #Adj R2 = 0.1899
+
+
+Call:
+lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs, data = test)
+
+Residuals:
+ Min 1Q Median 3Q Max
+-120890 -22543 -6045 14583 199382
+
+Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) 28466 5153 5.524 3.50e-08 ***
+mom_age 1199 118 10.164 < 2e-16 ***
+hispYes -10947 2159 -5.071 4.13e-07 ***
+raceBlack or African American -20625 1442 -14.301 < 2e-16 ***
+raceAmerican Indian, Eskimo, or Aleut -10444 7343 -1.422 0.15500
+raceAsian or Pacific Islander 16432 5189 3.167 0.00155 **
+raceSomething else? (SPECIFY) -7672 2721 -2.819 0.00484 **
+mom_educ_hsGraduate degree 18416 3046 6.046 1.61e-09 ***
+mom_educ_hsHigh-school graduate -24546 1697 -14.462 < 2e-16 ***
+mom_educ_hsSome college -17773 1767 -10.056 < 2e-16 ***
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 38600 on 4425 degrees of freedom
+ (4549 observations deleted due to missingness)
+Multiple R-squared: 0.1915, Adjusted R-squared: 0.1899
+F-statistic: 116.5 on 9 and 4425 DF, p-value: < 2.2e-16
+::vif(model) car
+
+ GVIF Df GVIF^(1/(2*Df))
+mom_age 1.083580 1 1.040952
+hisp 1.516648 1 1.231523
+race 1.588827 4 1.059582
+mom_educ_hs 1.092325 3 1.014827
+summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs, data = test)) #Adj R2 = 0.1616
+
+
+Call:
+lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs +
+ dad_educ_hs, data = test)
+
+Residuals:
+ Min 1Q Median 3Q Max
+-115866 -21996 -5334 12213 189896
+
+Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) 42843.8 7124.4 6.014 2.06e-09 ***
+mom_age 1052.3 160.5 6.555 6.65e-11 ***
+hispYes -11445.9 2894.0 -3.955 7.85e-05 ***
+raceBlack or African American -12319.3 2144.6 -5.744 1.03e-08 ***
+raceAmerican Indian, Eskimo, or Aleut -4744.5 10135.3 -0.468 0.639739
+raceAsian or Pacific Islander 7868.9 5996.6 1.312 0.189554
+raceSomething else? (SPECIFY) -1544.6 3757.3 -0.411 0.681036
+mom_educ_hsGraduate degree 13236.8 3901.0 3.393 0.000701 ***
+mom_educ_hsHigh-school graduate -11808.7 2271.5 -5.199 2.16e-07 ***
+mom_educ_hsSome college -9038.6 2228.3 -4.056 5.13e-05 ***
+dad_educ_hsGraduate degree 13535.1 3121.2 4.337 1.50e-05 ***
+dad_educ_hsHigh-school graduate -16548.1 2213.6 -7.476 1.03e-13 ***
+dad_educ_hsSome college -10859.3 2211.2 -4.911 9.60e-07 ***
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 39040 on 2718 degrees of freedom
+ (6253 observations deleted due to missingness)
+Multiple R-squared: 0.1653, Adjusted R-squared: 0.1616
+F-statistic: 44.86 on 12 and 2718 DF, p-value: < 2.2e-16
+::vif(model) car
+
+ GVIF Df GVIF^(1/(2*Df))
+mom_age 1.116575 1 1.056681
+hisp 1.458563 1 1.207710
+race 1.519815 4 1.053717
+mom_educ_hs 1.373641 3 1.054336
+dad_educ_hs 1.400450 3 1.057738
+summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav, data = test)) #Adj R2 = 0.199
+
+
+Call:
+lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs +
+ dad_educ_hs + ret_sav, data = test)
+
+Residuals:
+ Min 1Q Median 3Q Max
+-120849 -20854 -5499 12002 202648
+
+Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) 27572 7085 3.892 0.000102 ***
+mom_age 1008 157 6.422 1.59e-10 ***
+hispYes -8740 2840 -3.077 0.002110 **
+raceBlack or African American -8155 2130 -3.828 0.000132 ***
+raceAmerican Indian, Eskimo, or Aleut -5069 9888 -0.513 0.608241
+raceAsian or Pacific Islander 7350 5850 1.256 0.209131
+raceSomething else? (SPECIFY) -1288 3669 -0.351 0.725604
+mom_educ_hsGraduate degree 13822 3807 3.630 0.000288 ***
+mom_educ_hsHigh-school graduate -9214 2234 -4.124 3.84e-05 ***
+mom_educ_hsSome college -7318 2183 -3.353 0.000812 ***
+dad_educ_hsGraduate degree 13967 3046 4.585 4.76e-06 ***
+dad_educ_hsHigh-school graduate -14865 2174 -6.839 9.83e-12 ***
+dad_educ_hsSome college -9361 2166 -4.321 1.61e-05 ***
+ret_sav 18961 1658 11.437 < 2e-16 ***
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 38090 on 2701 degrees of freedom
+ (6269 observations deleted due to missingness)
+Multiple R-squared: 0.2029, Adjusted R-squared: 0.199
+F-statistic: 52.88 on 13 and 2701 DF, p-value: < 2.2e-16
+::vif(model) car
+
+ GVIF Df GVIF^(1/(2*Df))
+mom_age 1.117451 1 1.057096
+hisp 1.470673 1 1.212713
+race 1.567470 4 1.057791
+mom_educ_hs 1.387635 3 1.056118
+dad_educ_hs 1.411949 3 1.059180
+ret_sav 1.074406 1 1.036535
+summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown, data = test)) #Adj R2 = 0.2107
+
+
+Call:
+lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs +
+ dad_educ_hs + ret_sav + hown, data = test)
+
+Residuals:
+ Min 1Q Median 3Q Max
+-121555 -20857 -5701 11831 213185
+
+Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) 24176.2 7304.7 3.310 0.000947 ***
+mom_age 832.7 162.7 5.117 3.34e-07 ***
+hispYes -9571.9 2880.4 -3.323 0.000903 ***
+raceBlack or African American -6799.7 2193.2 -3.100 0.001953 **
+raceAmerican Indian, Eskimo, or Aleut -1367.3 10620.6 -0.129 0.897570
+raceAsian or Pacific Islander 8835.7 5919.5 1.493 0.135648
+raceSomething else? (SPECIFY) 985.4 3762.9 0.262 0.793436
+mom_educ_hsGraduate degree 12969.7 3814.5 3.400 0.000684 ***
+mom_educ_hsHigh-school graduate -9432.6 2274.8 -4.147 3.48e-05 ***
+mom_educ_hsSome college -7490.6 2218.1 -3.377 0.000744 ***
+dad_educ_hsGraduate degree 13989.5 3065.1 4.564 5.25e-06 ***
+dad_educ_hsHigh-school graduate -14762.7 2209.6 -6.681 2.89e-11 ***
+dad_educ_hsSome college -9420.1 2196.1 -4.290 1.86e-05 ***
+ret_sav 17029.3 1735.6 9.812 < 2e-16 ***
+hown 14533.5 2117.7 6.863 8.43e-12 ***
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 38070 on 2581 degrees of freedom
+ (6388 observations deleted due to missingness)
+Multiple R-squared: 0.215, Adjusted R-squared: 0.2107
+F-statistic: 50.49 on 14 and 2581 DF, p-value: < 2.2e-16
+::vif(model) car
+
+ GVIF Df GVIF^(1/(2*Df))
+mom_age 1.139122 1 1.067297
+hisp 1.443144 1 1.201309
+race 1.552772 4 1.056546
+mom_educ_hs 1.391807 3 1.056647
+dad_educ_hs 1.411255 3 1.059093
+ret_sav 1.109899 1 1.053517
+hown 1.099569 1 1.048603
+summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars, data = test)) #Adj R2 = 0.2108
+
+
+Call:
+lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs +
+ dad_educ_hs + ret_sav + hown + both_pars, data = test)
+
+Residuals:
+ Min 1Q Median 3Q Max
+-121433 -21118 -5599 11890 211685
+
+Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) 24194.5 7304.5 3.312 0.000938 ***
+mom_age 873.4 166.9 5.234 1.80e-07 ***
+hispYes -9452.3 2882.4 -3.279 0.001054 **
+raceBlack or African American -6940.0 2196.8 -3.159 0.001600 **
+raceAmerican Indian, Eskimo, or Aleut -1700.3 10624.5 -0.160 0.872862
+raceAsian or Pacific Islander 8874.4 5919.3 1.499 0.133937
+raceSomething else? (SPECIFY) 983.1 3762.7 0.261 0.793894
+mom_educ_hsGraduate degree 12937.6 3814.5 3.392 0.000705 ***
+mom_educ_hsHigh-school graduate -9383.8 2275.1 -4.125 3.83e-05 ***
+mom_educ_hsSome college -7484.8 2218.0 -3.375 0.000750 ***
+dad_educ_hsGraduate degree 13870.1 3066.9 4.522 6.39e-06 ***
+dad_educ_hsHigh-school graduate -14959.4 2216.7 -6.748 1.84e-11 ***
+dad_educ_hsSome college -9586.8 2201.2 -4.355 1.38e-05 ***
+ret_sav 17047.7 1735.6 9.823 < 2e-16 ***
+hown 14812.2 2132.8 6.945 4.77e-12 ***
+both_pars -2202.2 2001.6 -1.100 0.271349
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 38070 on 2580 degrees of freedom
+ (6388 observations deleted due to missingness)
+Multiple R-squared: 0.2153, Adjusted R-squared: 0.2108
+F-statistic: 47.2 on 15 and 2580 DF, p-value: < 2.2e-16
+::vif(model) car
+
+ GVIF Df GVIF^(1/(2*Df))
+mom_age 1.198064 1 1.094561
+hisp 1.445199 1 1.202164
+race 1.559232 4 1.057095
+mom_educ_hs 1.392832 3 1.056776
+dad_educ_hs 1.421260 3 1.060341
+ret_sav 1.110001 1 1.053566
+hown 1.115309 1 1.056082
+both_pars 1.115809 1 1.056319
+Checking for missings in the predictor variables
+
+
+
+
+
+<- test %>%
+ impute select(-hispanic, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -par_dec, -age_resp)
+ <- nearZeroVar(impute) # no variables are found to have near zero variance
+ t
+sapply(impute, function(x) sum(is.na(x))) # dad_educ_hs has 4598 missings
+
+ id race pincome pnetworth mom_age mom_educ_hs
+ 0 80 2396 2365 608 2879
+dad_educ_hs hisp race_eth ret_sav hown both_pars
+ 4598 24 88 1194 1616 0
+# Calculating percent of usable cases
+<- md.pairs(impute)
+ m round(m$mr/(m$mr + m$mm), 3) #looking for low proportions: where both target and predictor are missing on the same cases
+
+ id race pincome pnetworth mom_age mom_educ_hs dad_educ_hs hisp
+id NaN NaN NaN NaN NaN NaN NaN NaN
+race 1 0.000 0.600 0.562 0.838 0.312 0.200 0.800
+pincome 1 0.987 0.000 0.328 0.903 0.608 0.442 0.996
+pnetworth 1 0.985 0.319 0.000 0.901 0.658 0.526 0.994
+mom_age 1 0.979 0.618 0.613 0.000 0.497 0.428 0.985
+mom_educ_hs 1 0.981 0.673 0.719 0.894 0.000 0.225 0.993
+dad_educ_hs 1 0.986 0.709 0.756 0.924 0.515 0.000 0.996
+hisp 1 0.333 0.625 0.375 0.625 0.167 0.167 0.000
+race_eth 1 0.091 0.602 0.568 0.830 0.307 0.193 0.727
+ret_sav 1 0.972 0.073 0.000 0.869 0.555 0.440 0.989
+hown 1 0.981 0.338 0.296 0.892 0.546 0.399 0.993
+both_pars NaN NaN NaN NaN NaN NaN NaN NaN
+ race_eth ret_sav hown both_pars
+id NaN NaN NaN NaN
+race 0.000 0.588 0.613 1
+pincome 0.985 0.538 0.553 1
+pnetworth 0.984 0.495 0.519 1
+mom_age 0.975 0.742 0.714 1
+mom_educ_hs 0.979 0.816 0.745 1
+dad_educ_hs 0.985 0.855 0.789 1
+hisp 0.000 0.458 0.542 1
+race_eth 0.000 0.591 0.625 1
+ret_sav 0.970 0.000 0.105 1
+hown 0.980 0.338 0.000 1
+both_pars NaN NaN NaN NaN
+#Note: For row target pincome and pnetworth respectively, predictors dad_educ_hs, ret_sav, and hown have lowest proportions
+
+<- impute %>%
+ impute select(-dad_educ_hs, -race) # I suspect multi-collinearity is causing singularity in the multiple imputation below, so removed race and only kept interaction race_eth for now
Multiple Imputation
+
+
+
+
+
+# Paper that uses multiple imputation: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6049093/
+# Multiple imputation in R reference: https://library.virginia.edu/data/articles/getting-started-with-multiple-imputation-in-r
+= mice(impute, maxit=10) init
+
+
+ iter imp variable
+ 1 1 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 1 2 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 1 3 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 1 4 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 1 5 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 2 1 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 2 2 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 2 3 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 2 4 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 2 5 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 3 1 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 3 2 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 3 3 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 3 4 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 3 5 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 4 1 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 4 2 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 4 3 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 4 4 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 4 5 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 5 1 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 5 2 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 5 3 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 5 4 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 5 5 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 6 1 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 6 2 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 6 3 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 6 4 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 6 5 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 7 1 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 7 2 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 7 3 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 7 4 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 7 5 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 8 1 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 8 2 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 8 3 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 8 4 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 8 5 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 9 1 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 9 2 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 9 3 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 9 4 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 9 5 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 10 1 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 10 2 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 10 3 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 10 4 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+ 10 5 pincome pnetworth mom_age mom_educ_hs hisp race_eth ret_sav hown
+= init$method
+ meth = init$predictorMatrix
+ predM
+# Set distributions
+<- c("mom_age") # numerical variables
+ num <- c("hisp", "ret_sav", "hown", "both_pars") # binary variables
+ log <- c("mom_educ_hs") # ordered categorical variables
+ poly <- c("race", "race_eth")# Unordered categorical variable
+ poly2
+# Assign Variables to be imputed
+c("id")] <- 0 # not using as a predictor variable
+ predM[,c("id"),] <- 0 # not using as a predictor variable
+ predM[c("race_eth"),] <- 1 # not using as a predictor variable
+ predM[c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")] <- 0 # not imputing
+ predM[,c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")]= "" #not imputing missing values, just using as a predictor
+ meth[
+# Income and Wealth imputation
+set.seed(25)
+= mice(impute, method=meth, predictorMatrix=predM, m=10) imputed
+
+
+ iter imp variable
+ 1 1 pincome pnetworth
+ 1 2 pincome pnetworth
+ 1 3 pincome pnetworth
+ 1 4 pincome pnetworth
+ 1 5 pincome pnetworth
+ 1 6 pincome pnetworth
+ 1 7 pincome pnetworth
+ 1 8 pincome pnetworth
+ 1 9 pincome pnetworth
+ 1 10 pincome pnetworth
+ 2 1 pincome pnetworth
+ 2 2 pincome pnetworth
+ 2 3 pincome pnetworth
+ 2 4 pincome pnetworth
+ 2 5 pincome pnetworth
+ 2 6 pincome pnetworth
+ 2 7 pincome pnetworth
+ 2 8 pincome pnetworth
+ 2 9 pincome pnetworth
+ 2 10 pincome pnetworth
+ 3 1 pincome pnetworth
+ 3 2 pincome pnetworth
+ 3 3 pincome pnetworth
+ 3 4 pincome pnetworth
+ 3 5 pincome pnetworth
+ 3 6 pincome pnetworth
+ 3 7 pincome pnetworth
+ 3 8 pincome pnetworth
+ 3 9 pincome pnetworth
+ 3 10 pincome pnetworth
+ 4 1 pincome pnetworth
+ 4 2 pincome pnetworth
+ 4 3 pincome pnetworth
+ 4 4 pincome pnetworth
+ 4 5 pincome pnetworth
+ 4 6 pincome pnetworth
+ 4 7 pincome pnetworth
+ 4 8 pincome pnetworth
+ 4 9 pincome pnetworth
+ 4 10 pincome pnetworth
+ 5 1 pincome pnetworth
+ 5 2 pincome pnetworth
+ 5 3 pincome pnetworth
+ 5 4 pincome pnetworth
+ 5 5 pincome pnetworth
+ 5 6 pincome pnetworth
+ 5 7 pincome pnetworth
+ 5 8 pincome pnetworth
+ 5 9 pincome pnetworth
+ 5 10 pincome pnetworth
+<- complete(imputed, action = "repeated", include = FALSE)
+ imputed
+<- imputed %>%
+ final rename(id = id.1,
+ mom_age = mom_age.1,
+ ret_sav = ret_sav.1,
+ hown = hown.1,
+ mom_educ_hs = mom_educ_hs.1,
+ hisp = hisp.1,
+ both_pars = both_pars.1,
+ race_eth = race_eth.1) %>%
+ select(-starts_with("id."), -starts_with("mom_age."), -starts_with("ret_sav."), -starts_with("hown."), -starts_with("mom_educ_hs."), -starts_with("dad_educ_hs."), -starts_with("both_pars."), -starts_with("hisp."), -starts_with("race_eth."))
+
+
+# Join back with original income and wealth
+<- impute %>%
+ og_values select(id, pincome, pnetworth)
+
+<- merge(x=final,y=og_values,
+ final by="id", all.x=TRUE)
+
+<- final %>%
+ income select(id, starts_with("pincome"))
+
+<- final %>%
+ wealth select(id, starts_with("pnetworth"))
+
+<- tidyr::gather(income, key = "imputation", value ="pincome", matches("pincome"), -id)
+ pincome <- tidyr::gather(wealth, key = "imputation", value ="pnetworth", matches("pnetworth"), -id) pnetworth
Distribution Plots for Parent Income (non-imputed and imputed)
+
+
+
+ggplot(pincome, aes(x=pincome, fill= imputation)) +
+geom_density(alpha = 0.5) +
+ facet_wrap(~imputation, scales="free")
+
+
+Distribution Plots for Parent Wealth (non-imputed and imputed)
+
+
+
+ggplot(pnetworth, aes(x=pnetworth, fill= imputation)) +
+geom_density(alpha = 0.5) +
+ facet_wrap(~imputation, scales="free")
+
+
+Scatter Plots for both parent income and wealth (non-imputed is zero)
+
+
+
+$imputation <- gsub("pincome.", "", pincome$imputation)
+ pincome$imputation <- ifelse(pincome$imputation == "pincome", 0, pincome$imputation)
+ pincome
+$imputation <- gsub("pnetworth.", "", pnetworth$imputation)
+ pnetworth$imputation <- ifelse(pnetworth$imputation == "pnetworth", 0, pnetworth$imputation)
+ pnetworth
+<- merge(pincome, pnetworth, by = c("id", "imputation"))
+ merge
+ggplot(merge, aes(x=pincome, y=pnetworth, color = imputation)) +
+geom_point(alpha = 0.5) +
+ facet_wrap(~imputation, scales="free")
+
+
+