diff --git a/NLSY/nlsy_imputation.html b/NLSY/nlsy_imputation.html new file mode 100644 index 0000000..0a7ff60 --- /dev/null +++ b/NLSY/nlsy_imputation.html @@ -0,0 +1,3465 @@ + + + + + + + + + +NLSY Parent Income and Wealth Imputation + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+

NLSY Parent Income and Wealth Imputation

+
+ + + +
+ + + + +
+ + +
+ +
+

Data Cleaning

+
+
#Packages
+librarian::shelf(dplyr, mice, car, corrplot, caret, tidyr)
+source(here::here("NLSY/nlsy_lib.R"))
+
+# Read data
+nlsydf = readRDS(paste0(here::here(), "/NLSY/NLSY-college-finance.rds"))
+
+
+
test <- nlsydf %>% #Add additional variables that should be considered during multiple imputation
+    select(id = "PUBID_1997",
+           mom_age_birth = "CV_BIO_MOM_AGE_YOUTH_1997", # numeric
+           age_resp =  "CV_AGE_12/31/96_1997",
+           hispanic = "KEY_ETHNICITY_1997", # binary
+           race = "KEY_RACE_1997", # categorical
+           mom_education = "CV_HGC_RES_MOM_1997", # categorical - highest grade completed in 1997 by mom
+           dad_education = "CV_HGC_RES_DAD_1997", # categorical - highest grade completed in 1997 by dad
+           savings = "P5-130_1997", #binary - yes/no retirement savings
+           home = "P5-101_1997", # binary - homeownership or renting
+           both_parents = "YOUTH_BOTHBIO.01_1997",
+           par1_deceased = "YOUTH_NONR1DEAD.01_1997",
+           par2_deceased = "YOUTH_NONR2DEAD.01_1997",
+           pincome = "CV_INCOME_GROSS_YR_1997",
+           pnetworth = "CV_HH_NET_WORTH_P_1997",
+           # Also, it seems that this represents the wealth of the household in which a respondent lives. That means that for a respondent who lives with only one parent, the net-worth variable represents the wealth of that parent.
+           # In the case of an independent respondent, contains respondent's, rather than parents', wealth.
+    ) %>%
+    mutate(mom_age = mom_age_birth + age_resp)
+
+# Making dummy or factor variables
+test <- test |>
+    mutate(
+        mom_educ_hs = factor(nlsy_encode_educ4(mom_education)),
+        dad_educ_hs = factor(nlsy_encode_educ4(dad_education)),
+        race = factor(race),
+        hisp = factor(hispanic),
+        race_eth = ifelse(hisp == "Yes" & race == "White", "White Hispanic",  NA), # Race and ethnicity interaction
+        race_eth = ifelse(hisp == "No" & race == "White", "White NonHispanic", race_eth),
+        race_eth = ifelse(hisp == "Yes" & race == "Black or African American", "Black Hispanic", race_eth),
+        race_eth = ifelse(hisp == "No" & race == "Black or African American", "Black NonHispanic", race_eth),
+        race_eth = ifelse(hisp == "Yes" & race == "American Indian, Eskimo, or Aleut", "NonBW Hispanic", race_eth),
+        race_eth = ifelse(hisp == "Yes" & race == "Asian or Pacific Islander", "NonBW Hispanic", race_eth),
+        race_eth = ifelse(hisp == "Yes" & race == "Something else? (SPECIFY)", "NonBW Hispanic", race_eth),
+        race_eth = ifelse(hisp == "No" & race == "American Indian, Eskimo, or Aleut", "NonBW NonHispanic", race_eth),
+        race_eth = ifelse(hisp == "No" & race == "Asian or Pacific Islander", "NonBW NonHispanic", race_eth),
+        race_eth = ifelse(hisp == "No" & race == "Something else? (SPECIFY)", "NonBW NonHispanic", race_eth),
+        race_eth = factor(race_eth))
+
+test$ret_sav <- ifelse(test$savings == "YES", 1, 0)
+test$hown <- ifelse(test$home == "OWNS OR IS BUYING; LAND CONTRACT", 1, 0)
+test$both_pars <- ifelse(test$both_parents == "Yes", 1, 0)
+test$par_dec <- ifelse(test$par1_deceased == "Yes" | test$par2_deceased == "Yes", 1, 0) # Either parent deceased
+
+
+
# Looking for significance with non-missing income and wealth
+non_missing <- test %>%
+    filter(!is.na(pincome),
+           !is.na(pnetworth))
+
+summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars + par_dec, data = non_missing))
+
+

+Call:
+lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs + 
+    dad_educ_hs + ret_sav + hown + both_pars + par_dec, data = non_missing)
+
+Residuals:
+    Min      1Q  Median      3Q     Max 
+-106496  -21175   -5574   11535  209949 
+
+Coefficients:
+                                      Estimate Std. Error t value Pr(>|t|)    
+(Intercept)                            21684.1     7866.7   2.756 0.005891 ** 
+mom_age                                 1013.2      179.8   5.636 1.96e-08 ***
+hispYes                                -6942.6     3097.5  -2.241 0.025101 *  
+raceBlack or African American          -8065.7     2388.8  -3.376 0.000747 ***
+raceAmerican Indian, Eskimo, or Aleut  -1839.5    10957.3  -0.168 0.866694    
+raceAsian or Pacific Islander           7165.9     5949.4   1.204 0.228538    
+raceSomething else? (SPECIFY)           -351.9     3970.4  -0.089 0.929391    
+mom_educ_hsGraduate degree             12856.9     3990.5   3.222 0.001292 ** 
+mom_educ_hsHigh-school graduate       -11316.3     2439.1  -4.640 3.69e-06 ***
+mom_educ_hsSome college               -10224.4     2379.7  -4.297 1.81e-05 ***
+dad_educ_hsGraduate degree             10419.2     3301.8   3.156 0.001623 ** 
+dad_educ_hsHigh-school graduate       -14369.3     2361.0  -6.086 1.36e-09 ***
+dad_educ_hsSome college                -8036.7     2355.0  -3.413 0.000655 ***
+ret_sav                                16350.9     1867.5   8.756  < 2e-16 ***
+hown                                   14618.6     2248.5   6.501 9.80e-11 ***
+both_pars                              -2946.4     2203.9  -1.337 0.181396    
+par_dec                                 1758.2     8311.9   0.212 0.832491    
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 37700 on 2215 degrees of freedom
+  (3601 observations deleted due to missingness)
+Multiple R-squared:  0.2221,    Adjusted R-squared:  0.2165 
+F-statistic: 39.52 on 16 and 2215 DF,  p-value: < 2.2e-16
+
+
#Non-significant: par_dec - dropping for now
+
+
+

Checking for multi-collinearity - correlation plot for numeric variables

+
+
testing_cor <- test %>%
+    select(-id, -hispanic, -race, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -pincome, -pnetworth, -par_dec, -mom_educ_hs,-dad_educ_hs, -hisp,-race_eth) %>%
+    drop_na()
+
+M <- cor(testing_cor)
+corrplot(M, type = "upper")
+
+

+
+
+
+
+

Checking for multi-collinearity - VIF and Successive addition of regressors

+
+
summary(model <- lm(pincome ~ mom_age, data = test)) #Adj R2 = 0.057
+
+

+Call:
+lm(formula = pincome ~ mom_age, data = test)
+
+Residuals:
+    Min      1Q  Median      3Q     Max 
+-110821  -25331   -7287   15521  215185 
+
+Coefficients:
+             Estimate Std. Error t value Pr(>|t|)    
+(Intercept) -26029.36    3756.14   -6.93 4.64e-12 ***
+mom_age       1848.97      94.67   19.53  < 2e-16 ***
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 40710 on 6210 degrees of freedom
+  (2772 observations deleted due to missingness)
+Multiple R-squared:  0.05787,   Adjusted R-squared:  0.05771 
+F-statistic: 381.4 on 1 and 6210 DF,  p-value: < 2.2e-16
+
+
summary(model <- lm(pincome ~ mom_age + hisp, data = test)) #Adj R2 = 0.09
+
+

+Call:
+lm(formula = pincome ~ mom_age + hisp, data = test)
+
+Residuals:
+    Min      1Q  Median      3Q     Max 
+-114431  -25073   -7057   15049  224819 
+
+Coefficients:
+             Estimate Std. Error t value Pr(>|t|)    
+(Intercept) -21449.50    3706.92  -5.786 7.55e-09 ***
+mom_age       1828.75      93.11  19.640  < 2e-16 ***
+hispYes     -19072.61    1275.23 -14.956  < 2e-16 ***
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 40000 on 6202 degrees of freedom
+  (2779 observations deleted due to missingness)
+Multiple R-squared:  0.09091,   Adjusted R-squared:  0.09062 
+F-statistic: 310.1 on 2 and 6202 DF,  p-value: < 2.2e-16
+
+
car::vif(model)
+
+
 mom_age     hisp 
+1.000325 1.000325 
+
+
summary(model <- lm(pincome ~ mom_age + hisp + race, data = test)) # #Adj R2 = 0.157
+
+

+Call:
+lm(formula = pincome ~ mom_age + hisp + race, data = test)
+
+Residuals:
+    Min      1Q  Median      3Q     Max 
+-119072  -22397   -7121   13487  224217 
+
+Coefficients:
+                                       Estimate Std. Error t value Pr(>|t|)    
+(Intercept)                            -1001.21    3706.86  -0.270   0.7871    
+mom_age                                 1499.44      91.34  16.415  < 2e-16 ***
+hispYes                               -23096.47    1578.75 -14.630  < 2e-16 ***
+raceBlack or African American         -25965.05    1202.19 -21.598  < 2e-16 ***
+raceAmerican Indian, Eskimo, or Aleut -12807.11    5671.04  -2.258   0.0240 *  
+raceAsian or Pacific Islander          18250.00    4471.05   4.082 4.53e-05 ***
+raceSomething else? (SPECIFY)          -4626.23    1981.05  -2.335   0.0196 *  
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 38540 on 6164 degrees of freedom
+  (2813 observations deleted due to missingness)
+Multiple R-squared:  0.1581,    Adjusted R-squared:  0.1573 
+F-statistic:   193 on 6 and 6164 DF,  p-value: < 2.2e-16
+
+
car::vif(model)
+
+
            GVIF Df GVIF^(1/(2*Df))
+mom_age 1.030125  1        1.014951
+hisp    1.616650  1        1.271476
+race    1.662056  4        1.065567
+
+
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs, data = test)) #Adj R2 = 0.1899
+
+

+Call:
+lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs, data = test)
+
+Residuals:
+    Min      1Q  Median      3Q     Max 
+-120890  -22543   -6045   14583  199382 
+
+Coefficients:
+                                      Estimate Std. Error t value Pr(>|t|)    
+(Intercept)                              28466       5153   5.524 3.50e-08 ***
+mom_age                                   1199        118  10.164  < 2e-16 ***
+hispYes                                 -10947       2159  -5.071 4.13e-07 ***
+raceBlack or African American           -20625       1442 -14.301  < 2e-16 ***
+raceAmerican Indian, Eskimo, or Aleut   -10444       7343  -1.422  0.15500    
+raceAsian or Pacific Islander            16432       5189   3.167  0.00155 ** 
+raceSomething else? (SPECIFY)            -7672       2721  -2.819  0.00484 ** 
+mom_educ_hsGraduate degree               18416       3046   6.046 1.61e-09 ***
+mom_educ_hsHigh-school graduate         -24546       1697 -14.462  < 2e-16 ***
+mom_educ_hsSome college                 -17773       1767 -10.056  < 2e-16 ***
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 38600 on 4425 degrees of freedom
+  (4549 observations deleted due to missingness)
+Multiple R-squared:  0.1915,    Adjusted R-squared:  0.1899 
+F-statistic: 116.5 on 9 and 4425 DF,  p-value: < 2.2e-16
+
+
car::vif(model)
+
+
                GVIF Df GVIF^(1/(2*Df))
+mom_age     1.083580  1        1.040952
+hisp        1.516648  1        1.231523
+race        1.588827  4        1.059582
+mom_educ_hs 1.092325  3        1.014827
+
+
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs, data = test)) #Adj R2 = 0.1616
+
+

+Call:
+lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs + 
+    dad_educ_hs, data = test)
+
+Residuals:
+    Min      1Q  Median      3Q     Max 
+-115866  -21996   -5334   12213  189896 
+
+Coefficients:
+                                      Estimate Std. Error t value Pr(>|t|)    
+(Intercept)                            42843.8     7124.4   6.014 2.06e-09 ***
+mom_age                                 1052.3      160.5   6.555 6.65e-11 ***
+hispYes                               -11445.9     2894.0  -3.955 7.85e-05 ***
+raceBlack or African American         -12319.3     2144.6  -5.744 1.03e-08 ***
+raceAmerican Indian, Eskimo, or Aleut  -4744.5    10135.3  -0.468 0.639739    
+raceAsian or Pacific Islander           7868.9     5996.6   1.312 0.189554    
+raceSomething else? (SPECIFY)          -1544.6     3757.3  -0.411 0.681036    
+mom_educ_hsGraduate degree             13236.8     3901.0   3.393 0.000701 ***
+mom_educ_hsHigh-school graduate       -11808.7     2271.5  -5.199 2.16e-07 ***
+mom_educ_hsSome college                -9038.6     2228.3  -4.056 5.13e-05 ***
+dad_educ_hsGraduate degree             13535.1     3121.2   4.337 1.50e-05 ***
+dad_educ_hsHigh-school graduate       -16548.1     2213.6  -7.476 1.03e-13 ***
+dad_educ_hsSome college               -10859.3     2211.2  -4.911 9.60e-07 ***
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 39040 on 2718 degrees of freedom
+  (6253 observations deleted due to missingness)
+Multiple R-squared:  0.1653,    Adjusted R-squared:  0.1616 
+F-statistic: 44.86 on 12 and 2718 DF,  p-value: < 2.2e-16
+
+
car::vif(model)
+
+
                GVIF Df GVIF^(1/(2*Df))
+mom_age     1.116575  1        1.056681
+hisp        1.458563  1        1.207710
+race        1.519815  4        1.053717
+mom_educ_hs 1.373641  3        1.054336
+dad_educ_hs 1.400450  3        1.057738
+
+
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav, data = test)) #Adj R2 = 0.199
+
+

+Call:
+lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs + 
+    dad_educ_hs + ret_sav, data = test)
+
+Residuals:
+    Min      1Q  Median      3Q     Max 
+-120849  -20854   -5499   12002  202648 
+
+Coefficients:
+                                      Estimate Std. Error t value Pr(>|t|)    
+(Intercept)                              27572       7085   3.892 0.000102 ***
+mom_age                                   1008        157   6.422 1.59e-10 ***
+hispYes                                  -8740       2840  -3.077 0.002110 ** 
+raceBlack or African American            -8155       2130  -3.828 0.000132 ***
+raceAmerican Indian, Eskimo, or Aleut    -5069       9888  -0.513 0.608241    
+raceAsian or Pacific Islander             7350       5850   1.256 0.209131    
+raceSomething else? (SPECIFY)            -1288       3669  -0.351 0.725604    
+mom_educ_hsGraduate degree               13822       3807   3.630 0.000288 ***
+mom_educ_hsHigh-school graduate          -9214       2234  -4.124 3.84e-05 ***
+mom_educ_hsSome college                  -7318       2183  -3.353 0.000812 ***
+dad_educ_hsGraduate degree               13967       3046   4.585 4.76e-06 ***
+dad_educ_hsHigh-school graduate         -14865       2174  -6.839 9.83e-12 ***
+dad_educ_hsSome college                  -9361       2166  -4.321 1.61e-05 ***
+ret_sav                                  18961       1658  11.437  < 2e-16 ***
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 38090 on 2701 degrees of freedom
+  (6269 observations deleted due to missingness)
+Multiple R-squared:  0.2029,    Adjusted R-squared:  0.199 
+F-statistic: 52.88 on 13 and 2701 DF,  p-value: < 2.2e-16
+
+
car::vif(model)
+
+
                GVIF Df GVIF^(1/(2*Df))
+mom_age     1.117451  1        1.057096
+hisp        1.470673  1        1.212713
+race        1.567470  4        1.057791
+mom_educ_hs 1.387635  3        1.056118
+dad_educ_hs 1.411949  3        1.059180
+ret_sav     1.074406  1        1.036535
+
+
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown, data = test)) #Adj R2 = 0.2107
+
+

+Call:
+lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs + 
+    dad_educ_hs + ret_sav + hown, data = test)
+
+Residuals:
+    Min      1Q  Median      3Q     Max 
+-121555  -20857   -5701   11831  213185 
+
+Coefficients:
+                                      Estimate Std. Error t value Pr(>|t|)    
+(Intercept)                            24176.2     7304.7   3.310 0.000947 ***
+mom_age                                  832.7      162.7   5.117 3.34e-07 ***
+hispYes                                -9571.9     2880.4  -3.323 0.000903 ***
+raceBlack or African American          -6799.7     2193.2  -3.100 0.001953 ** 
+raceAmerican Indian, Eskimo, or Aleut  -1367.3    10620.6  -0.129 0.897570    
+raceAsian or Pacific Islander           8835.7     5919.5   1.493 0.135648    
+raceSomething else? (SPECIFY)            985.4     3762.9   0.262 0.793436    
+mom_educ_hsGraduate degree             12969.7     3814.5   3.400 0.000684 ***
+mom_educ_hsHigh-school graduate        -9432.6     2274.8  -4.147 3.48e-05 ***
+mom_educ_hsSome college                -7490.6     2218.1  -3.377 0.000744 ***
+dad_educ_hsGraduate degree             13989.5     3065.1   4.564 5.25e-06 ***
+dad_educ_hsHigh-school graduate       -14762.7     2209.6  -6.681 2.89e-11 ***
+dad_educ_hsSome college                -9420.1     2196.1  -4.290 1.86e-05 ***
+ret_sav                                17029.3     1735.6   9.812  < 2e-16 ***
+hown                                   14533.5     2117.7   6.863 8.43e-12 ***
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 38070 on 2581 degrees of freedom
+  (6388 observations deleted due to missingness)
+Multiple R-squared:  0.215, Adjusted R-squared:  0.2107 
+F-statistic: 50.49 on 14 and 2581 DF,  p-value: < 2.2e-16
+
+
car::vif(model)
+
+
                GVIF Df GVIF^(1/(2*Df))
+mom_age     1.139122  1        1.067297
+hisp        1.443144  1        1.201309
+race        1.552772  4        1.056546
+mom_educ_hs 1.391807  3        1.056647
+dad_educ_hs 1.411255  3        1.059093
+ret_sav     1.109899  1        1.053517
+hown        1.099569  1        1.048603
+
+
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars, data = test)) #Adj R2 = 0.2108
+
+

+Call:
+lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs + 
+    dad_educ_hs + ret_sav + hown + both_pars, data = test)
+
+Residuals:
+    Min      1Q  Median      3Q     Max 
+-121433  -21118   -5599   11890  211685 
+
+Coefficients:
+                                      Estimate Std. Error t value Pr(>|t|)    
+(Intercept)                            24194.5     7304.5   3.312 0.000938 ***
+mom_age                                  873.4      166.9   5.234 1.80e-07 ***
+hispYes                                -9452.3     2882.4  -3.279 0.001054 ** 
+raceBlack or African American          -6940.0     2196.8  -3.159 0.001600 ** 
+raceAmerican Indian, Eskimo, or Aleut  -1700.3    10624.5  -0.160 0.872862    
+raceAsian or Pacific Islander           8874.4     5919.3   1.499 0.133937    
+raceSomething else? (SPECIFY)            983.1     3762.7   0.261 0.793894    
+mom_educ_hsGraduate degree             12937.6     3814.5   3.392 0.000705 ***
+mom_educ_hsHigh-school graduate        -9383.8     2275.1  -4.125 3.83e-05 ***
+mom_educ_hsSome college                -7484.8     2218.0  -3.375 0.000750 ***
+dad_educ_hsGraduate degree             13870.1     3066.9   4.522 6.39e-06 ***
+dad_educ_hsHigh-school graduate       -14959.4     2216.7  -6.748 1.84e-11 ***
+dad_educ_hsSome college                -9586.8     2201.2  -4.355 1.38e-05 ***
+ret_sav                                17047.7     1735.6   9.823  < 2e-16 ***
+hown                                   14812.2     2132.8   6.945 4.77e-12 ***
+both_pars                              -2202.2     2001.6  -1.100 0.271349    
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 38070 on 2580 degrees of freedom
+  (6388 observations deleted due to missingness)
+Multiple R-squared:  0.2153,    Adjusted R-squared:  0.2108 
+F-statistic:  47.2 on 15 and 2580 DF,  p-value: < 2.2e-16
+
+
car::vif(model)
+
+
                GVIF Df GVIF^(1/(2*Df))
+mom_age     1.198064  1        1.094561
+hisp        1.445199  1        1.202164
+race        1.559232  4        1.057095
+mom_educ_hs 1.392832  3        1.056776
+dad_educ_hs 1.421260  3        1.060341
+ret_sav     1.110001  1        1.053566
+hown        1.115309  1        1.056082
+both_pars   1.115809  1        1.056319
+
+
+
+
+
+

Checking for missings in the predictor variables

+
+
impute <- test %>%
+    select(-hispanic, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -par_dec, -age_resp)
+t <- nearZeroVar(impute) # no variables are found to have near zero variance
+
+sapply(impute, function(x) sum(is.na(x))) # dad_educ_hs has 4598 missings
+
+
         id        race     pincome   pnetworth     mom_age mom_educ_hs 
+          0          80        2396        2365         608        2879 
+dad_educ_hs        hisp    race_eth     ret_sav        hown   both_pars 
+       4598          24          88        1194        1616           0 
+
+
# Calculating percent of usable cases
+m <- md.pairs(impute)
+round(m$mr/(m$mr + m$mm), 3) #looking for low proportions: where both target and predictor are missing on the same cases
+
+
             id  race pincome pnetworth mom_age mom_educ_hs dad_educ_hs  hisp
+id          NaN   NaN     NaN       NaN     NaN         NaN         NaN   NaN
+race          1 0.000   0.600     0.562   0.838       0.312       0.200 0.800
+pincome       1 0.987   0.000     0.328   0.903       0.608       0.442 0.996
+pnetworth     1 0.985   0.319     0.000   0.901       0.658       0.526 0.994
+mom_age       1 0.979   0.618     0.613   0.000       0.497       0.428 0.985
+mom_educ_hs   1 0.981   0.673     0.719   0.894       0.000       0.225 0.993
+dad_educ_hs   1 0.986   0.709     0.756   0.924       0.515       0.000 0.996
+hisp          1 0.333   0.625     0.375   0.625       0.167       0.167 0.000
+race_eth      1 0.091   0.602     0.568   0.830       0.307       0.193 0.727
+ret_sav       1 0.972   0.073     0.000   0.869       0.555       0.440 0.989
+hown          1 0.981   0.338     0.296   0.892       0.546       0.399 0.993
+both_pars   NaN   NaN     NaN       NaN     NaN         NaN         NaN   NaN
+            race_eth ret_sav  hown both_pars
+id               NaN     NaN   NaN       NaN
+race           0.000   0.588 0.613         1
+pincome        0.985   0.538 0.553         1
+pnetworth      0.984   0.495 0.519         1
+mom_age        0.975   0.742 0.714         1
+mom_educ_hs    0.979   0.816 0.745         1
+dad_educ_hs    0.985   0.855 0.789         1
+hisp           0.000   0.458 0.542         1
+race_eth       0.000   0.591 0.625         1
+ret_sav        0.970   0.000 0.105         1
+hown           0.980   0.338 0.000         1
+both_pars        NaN     NaN   NaN       NaN
+
+
#Note: For row target pincome and pnetworth respectively, predictors dad_educ_hs, ret_sav, and hown have lowest proportions
+
+impute <- impute %>%
+    select(-dad_educ_hs, -race) # I suspect multi-collinearity is causing singularity in the multiple imputation below, so removed race and only kept interaction race_eth for now
+
+
+
+

Multiple Imputation

+
+
# Paper that uses multiple imputation: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6049093/
+# Multiple imputation in R reference: https://library.virginia.edu/data/articles/getting-started-with-multiple-imputation-in-r
+init = mice(impute, maxit=10)
+
+

+ iter imp variable
+  1   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  1   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  1   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  1   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  1   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  2   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  2   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  2   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  2   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  2   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  3   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  3   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  3   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  3   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  3   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  4   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  4   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  4   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  4   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  4   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  5   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  5   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  5   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  5   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  5   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  6   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  6   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  6   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  6   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  6   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  7   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  7   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  7   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  7   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  7   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  8   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  8   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  8   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  8   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  8   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  9   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  9   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  9   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  9   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  9   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  10   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  10   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  10   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  10   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+  10   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+
+
meth = init$method
+predM = init$predictorMatrix
+
+# Set distributions
+num <- c("mom_age") # numerical variables
+log <- c("hisp", "ret_sav", "hown", "both_pars") # binary variables
+poly <- c("mom_educ_hs") # ordered categorical variables
+poly2 <- c("race", "race_eth")# Unordered categorical variable
+
+# Assign Variables to be imputed
+predM[,c("id")] <- 0 # not using as a predictor variable
+predM[c("id"),] <- 0 # not using as a predictor variable
+predM[c("race_eth"),] <- 1 # not using as a predictor variable
+predM[,c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")] <- 0 # not imputing
+meth[c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")]= "" #not imputing missing values, just using as a predictor
+
+# Income and Wealth imputation
+set.seed(25)
+imputed = mice(impute, method=meth, predictorMatrix=predM, m=10)
+
+

+ iter imp variable
+  1   1  pincome  pnetworth
+  1   2  pincome  pnetworth
+  1   3  pincome  pnetworth
+  1   4  pincome  pnetworth
+  1   5  pincome  pnetworth
+  1   6  pincome  pnetworth
+  1   7  pincome  pnetworth
+  1   8  pincome  pnetworth
+  1   9  pincome  pnetworth
+  1   10  pincome  pnetworth
+  2   1  pincome  pnetworth
+  2   2  pincome  pnetworth
+  2   3  pincome  pnetworth
+  2   4  pincome  pnetworth
+  2   5  pincome  pnetworth
+  2   6  pincome  pnetworth
+  2   7  pincome  pnetworth
+  2   8  pincome  pnetworth
+  2   9  pincome  pnetworth
+  2   10  pincome  pnetworth
+  3   1  pincome  pnetworth
+  3   2  pincome  pnetworth
+  3   3  pincome  pnetworth
+  3   4  pincome  pnetworth
+  3   5  pincome  pnetworth
+  3   6  pincome  pnetworth
+  3   7  pincome  pnetworth
+  3   8  pincome  pnetworth
+  3   9  pincome  pnetworth
+  3   10  pincome  pnetworth
+  4   1  pincome  pnetworth
+  4   2  pincome  pnetworth
+  4   3  pincome  pnetworth
+  4   4  pincome  pnetworth
+  4   5  pincome  pnetworth
+  4   6  pincome  pnetworth
+  4   7  pincome  pnetworth
+  4   8  pincome  pnetworth
+  4   9  pincome  pnetworth
+  4   10  pincome  pnetworth
+  5   1  pincome  pnetworth
+  5   2  pincome  pnetworth
+  5   3  pincome  pnetworth
+  5   4  pincome  pnetworth
+  5   5  pincome  pnetworth
+  5   6  pincome  pnetworth
+  5   7  pincome  pnetworth
+  5   8  pincome  pnetworth
+  5   9  pincome  pnetworth
+  5   10  pincome  pnetworth
+
+
imputed <- complete(imputed, action = "repeated", include = FALSE)
+
+final <- imputed %>%
+    rename(id = id.1,
+           mom_age = mom_age.1,
+           ret_sav = ret_sav.1,
+           hown = hown.1,
+           mom_educ_hs = mom_educ_hs.1,
+           hisp = hisp.1,
+           both_pars = both_pars.1,
+           race_eth = race_eth.1) %>%
+    select(-starts_with("id."), -starts_with("mom_age."), -starts_with("ret_sav."), -starts_with("hown."), -starts_with("mom_educ_hs."), -starts_with("dad_educ_hs."), -starts_with("both_pars."), -starts_with("hisp."),  -starts_with("race_eth."))
+
+
+
# Join back with original income and wealth
+og_values <- impute %>%
+    select(id, pincome, pnetworth)
+
+final <- merge(x=final,y=og_values,
+                    by="id", all.x=TRUE)
+
+income <- final %>%
+    select(id, starts_with("pincome"))
+
+wealth <- final %>%
+    select(id, starts_with("pnetworth"))
+
+pincome <- tidyr::gather(income, key = "imputation", value ="pincome", matches("pincome"), -id)
+pnetworth <- tidyr::gather(wealth, key = "imputation", value ="pnetworth", matches("pnetworth"), -id)
+
+
+
+

Distribution Plots for Parent Income (non-imputed and imputed)

+
+
ggplot(pincome, aes(x=pincome, fill= imputation)) +
+    geom_density(alpha = 0.5) +
+    facet_wrap(~imputation, scales="free")
+
+

+
+
+
+
+

Distribution Plots for Parent Wealth (non-imputed and imputed)

+
+
ggplot(pnetworth, aes(x=pnetworth, fill= imputation)) +
+    geom_density(alpha = 0.5) +
+    facet_wrap(~imputation, scales="free")
+
+

+
+
+
+
+

Scatter Plots for both parent income and wealth (non-imputed is zero)

+
+
pincome$imputation <- gsub("pincome.", "", pincome$imputation)
+pincome$imputation <- ifelse(pincome$imputation == "pincome", 0, pincome$imputation)
+
+pnetworth$imputation <- gsub("pnetworth.", "", pnetworth$imputation)
+pnetworth$imputation <- ifelse(pnetworth$imputation == "pnetworth", 0, pnetworth$imputation)
+
+merge <- merge(pincome, pnetworth, by = c("id", "imputation"))
+
+ggplot(merge, aes(x=pincome, y=pnetworth, color = imputation)) +
+    geom_point(alpha = 0.5) +
+    facet_wrap(~imputation, scales="free")
+
+

+
+
+
+ +
+ + +
+ + + + \ No newline at end of file diff --git a/NLSY/nlsy_impute.R b/NLSY/nlsy_imputation.qmd similarity index 83% rename from NLSY/nlsy_impute.R rename to NLSY/nlsy_imputation.qmd index d076790..8134de9 100644 --- a/NLSY/nlsy_impute.R +++ b/NLSY/nlsy_imputation.qmd @@ -1,10 +1,28 @@ +--- +title: "NLSY Parent Income and Wealth Imputation" +format: + html: + embed-resources: true +editor: visual +execute: + warning: false +fig-cap-location: top +toc: true +toc-expand: true +--- + +## Data Cleaning + +```{r} #Packages librarian::shelf(dplyr, mice, car, corrplot, caret, tidyr) source(here::here("NLSY/nlsy_lib.R")) # Read data nlsydf = readRDS(paste0(here::here(), "/NLSY/NLSY-college-finance.rds")) +``` +```{r} test <- nlsydf %>% #Add additional variables that should be considered during multiple imputation select(id = "PUBID_1997", mom_age_birth = "CV_BIO_MOM_AGE_YOUTH_1997", # numeric @@ -41,28 +59,41 @@ test <- test |> race_eth = ifelse(hisp == "Yes" & race == "Something else? (SPECIFY)", "NonBW Hispanic", race_eth), race_eth = ifelse(hisp == "No" & race == "American Indian, Eskimo, or Aleut", "NonBW NonHispanic", race_eth), race_eth = ifelse(hisp == "No" & race == "Asian or Pacific Islander", "NonBW NonHispanic", race_eth), - race_eth = ifelse(hisp == "No" & race == "Something else? (SPECIFY)", "NonBW NonHispanic", race_eth)) + race_eth = ifelse(hisp == "No" & race == "Something else? (SPECIFY)", "NonBW NonHispanic", race_eth), + race_eth = factor(race_eth)) test$ret_sav <- ifelse(test$savings == "YES", 1, 0) test$hown <- ifelse(test$home == "OWNS OR IS BUYING; LAND CONTRACT", 1, 0) test$both_pars <- ifelse(test$both_parents == "Yes", 1, 0) test$par_dec <- ifelse(test$par1_deceased == "Yes" | test$par2_deceased == "Yes", 1, 0) # Either parent deceased +``` + +```{r} # Looking for significance with non-missing income and wealth non_missing <- test %>% filter(!is.na(pincome), !is.na(pnetworth)) summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars + par_dec, data = non_missing)) -#Non-significant: race_w and par_dec - dropping for now +#Non-significant: par_dec - dropping for now +``` + +### Checking for multi-collinearity - correlation plot for numeric variables + +```{r} -# Checking for multi-collinearity testing_cor <- test %>% - select(-id, -hispanic, -race, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -pincome, -pnetworth, -par_dec, -mom_educ_hs,-dad_educ_hs) %>% + select(-id, -hispanic, -race, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -pincome, -pnetworth, -par_dec, -mom_educ_hs,-dad_educ_hs, -hisp,-race_eth) %>% drop_na() M <- cor(testing_cor) corrplot(M, type = "upper") +``` + +### Checking for multi-collinearity - VIF and Successive addition of regressors + +```{r} summary(model <- lm(pincome ~ mom_age, data = test)) #Adj R2 = 0.057 summary(model <- lm(pincome ~ mom_age + hisp, data = test)) #Adj R2 = 0.09 @@ -84,13 +115,15 @@ car::vif(model) summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars, data = test)) #Adj R2 = 0.2108 car::vif(model) +``` + +## Checking for missings in the predictor variables -# Making df for imputation +```{r} impute <- test %>% select(-hispanic, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -par_dec, -age_resp) t <- nearZeroVar(impute) # no variables are found to have near zero variance -# Checking for missings in the predictor variables sapply(impute, function(x) sum(is.na(x))) # dad_educ_hs has 4598 missings # Calculating percent of usable cases @@ -99,8 +132,12 @@ round(m$mr/(m$mr + m$mm), 3) #looking for low proportions: where both target and #Note: For row target pincome and pnetworth respectively, predictors dad_educ_hs, ret_sav, and hown have lowest proportions impute <- impute %>% - select(-dad_educ_hs) + select(-dad_educ_hs, -race) # I suspect multi-collinearity is causing singularity in the multiple imputation below, so removed race and only kept interaction race_eth for now +``` +## Multiple Imputation + +```{r} # Paper that uses multiple imputation: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6049093/ # Multiple imputation in R reference: https://library.virginia.edu/data/articles/getting-started-with-multiple-imputation-in-r init = mice(impute, maxit=10) @@ -116,8 +153,9 @@ poly2 <- c("race", "race_eth")# Unordered categorical variable # Assign Variables to be imputed predM[,c("id")] <- 0 # not using as a predictor variable predM[c("id"),] <- 0 # not using as a predictor variable -predM[,c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "race", "both_pars", "race_eth")] <- 0 # not imputing -meth[c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "race", "both_pars", "race_eth")]= "" #not imputing missing values, just using as a predictor +predM[c("race_eth"),] <- 1 # not using as a predictor variable +predM[,c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")] <- 0 # not imputing +meth[c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")]= "" #not imputing missing values, just using as a predictor # Income and Wealth imputation set.seed(25) @@ -131,12 +169,14 @@ final <- imputed %>% ret_sav = ret_sav.1, hown = hown.1, mom_educ_hs = mom_educ_hs.1, - race = race.1, hisp = hisp.1, both_pars = both_pars.1, race_eth = race_eth.1) %>% - select(-starts_with("id."), -starts_with("mom_age."), -starts_with("ret_sav."), -starts_with("hown."), -starts_with("mom_educ_hs."), -starts_with("dad_educ_hs."), -starts_with("race."), -starts_with("both_pars."), -starts_with("hisp."), -starts_with("race_eth.")) + select(-starts_with("id."), -starts_with("mom_age."), -starts_with("ret_sav."), -starts_with("hown."), -starts_with("mom_educ_hs."), -starts_with("dad_educ_hs."), -starts_with("both_pars."), -starts_with("hisp."), -starts_with("race_eth.")) + +``` +```{r} # Join back with original income and wealth og_values <- impute %>% select(id, pincome, pnetworth) @@ -152,17 +192,27 @@ wealth <- final %>% pincome <- tidyr::gather(income, key = "imputation", value ="pincome", matches("pincome"), -id) pnetworth <- tidyr::gather(wealth, key = "imputation", value ="pnetworth", matches("pnetworth"), -id) +``` + +## Distribution Plots for Parent Income (non-imputed and imputed) -# Plots for comparing distribution for imputed and non-imputed +```{r} ggplot(pincome, aes(x=pincome, fill= imputation)) + geom_density(alpha = 0.5) + facet_wrap(~imputation, scales="free") +``` +## Distribution Plots for Parent Wealth (non-imputed and imputed) + +```{r} ggplot(pnetworth, aes(x=pnetworth, fill= imputation)) + geom_density(alpha = 0.5) + facet_wrap(~imputation, scales="free") +``` + +## Scatter Plots for both parent income and wealth (non-imputed is zero) -# A scatter plot of these two variables jointly would be useful. +```{r} pincome$imputation <- gsub("pincome.", "", pincome$imputation) pincome$imputation <- ifelse(pincome$imputation == "pincome", 0, pincome$imputation) @@ -174,6 +224,4 @@ merge <- merge(pincome, pnetworth, by = c("id", "imputation")) ggplot(merge, aes(x=pincome, y=pnetworth, color = imputation)) + geom_point(alpha = 0.5) + facet_wrap(~imputation, scales="free") - -# Save imputed data -#write.csv(nlsydf_imp, 'NLSY/NLSY-college-finance_imp.csv') +```