Skip to content

Commit

Permalink
Merge in data from partner's survey #22
Browse files Browse the repository at this point in the history
  • Loading branch information
emilycantrell committed Jun 2, 2024
1 parent db6f522 commit 807890e
Show file tree
Hide file tree
Showing 6 changed files with 278 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Exploration of data about partner from LISS
library(tidyverse)

# The purpose of this code is to determine which features about the partner requires us to
# The purpose of this code is to determine which features about the partner require us to
# coalesce data across waves. For example:
# (1) Partner's birthyear is only reported once. If the person remains with the same partner, the birthyear
# is not reported again in subsequent waves. Therefore, we need to coalesce data across waves.
Expand Down
8 changes: 7 additions & 1 deletion explorations/exploration_of_linking_partner_data.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Emily Cantrell
# Exploration of fertility intentions questions from LISS
# Exploration of merging in data from the partner's survey
library(tidyverse)

# This is a draft of the code for linking partner's data with the primary respondent's data,
Expand Down Expand Up @@ -226,3 +226,9 @@ train_linked_with_partner %>%
# Almost all data on fertility intentions is missing among the few same-sex couples,
# so I didn't make a plot for them.

# HOW MANY PARTNERS COME FROM THE SUPPLEMENTARY DATA?
nrow(train_linked_with_partner)
min(train_full$birthyear_bg)
train_linked_with_partner %>%
group_by(new_child, birthyear_bg_PartnerSurvey < 1975) %>%
count()
139 changes: 139 additions & 0 deletions explorations/function_for_linking_partner_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# Draft of function of for merging in data from the partner's survey
library(tidyverse)

# This is a draft of the code for linking partner's data with the primary respondent's data,
# in order to merge in the partner's fertility intention data.
# I started with the code in exploration_of_linking_partner_data.R, and turned it into a function,
# with some edits for clarity.

# Read in the data
train_full <- read.csv("/Users/ecantrell/Documents/PreFer\ 2024/prefer_data/training_data/PreFer_train_data.csv")
background_full <- read.csv("/Users/ecantrell/Documents/PreFer\ 2024/prefer_data/other_data/PreFer_train_background_data.csv")

# Create function to merge in partner data
merge_in_partner_data <- function(train_data, background_data) {

# Make a vector of features to merge in from the partner's survey, for use in modeling
features_to_use_as_partner_data_in_model <- c(
# Fertility expectations in 2020
"cf20m128", "cf20m129", "cf20m130",
# Fertility expectations in 2019
"cf19l128", "cf19l129", "cf19l130",
# Number of kids reported in 2019 and 2020
"cf19l455", "cf20m455")

# Make vectors of features that will be coalesced across waves, for use in the merging process
# Note: Must list the more recent features first in order for the coalesce function to work
raw_features_about_living_with_partner <- c("cf20m025", "cf19l025", "cf18k025", "cf17j025", "cf16i025", "cf15h025",
"cf14g025", "cf13f025", "cf12e025", "cf11d025", "cf10c025", "cf09b025", "cf08a025")
raw_features_about_partner_birth_year <- c("cf20m026", "cf19l026", "cf18k026", "cf17j026", "cf16i026", "cf15h026",
"cf14g026", "cf13f026", "cf12e026", "cf11d026", "cf10c026", "cf09b026", "cf08a026")
raw_features_about_partner_gender <- c("cf20m032", "cf19l032", "cf18k032", "cf17j032", "cf16i032", "cf15h032",
"cf14g032", "cf13f032", "cf12e032", "cf11d032", "cf10c032", "cf09b032", "cf08a032")

# Select a few features of interest, plus features that will help us double-check that the merged-in person is really the partner
train_subsetted_columns <- train_data %>%
select("nomem_encr",
"gender_bg",
"birthyear_bg",
all_of(features_to_use_as_partner_data_in_model),
all_of(raw_features_about_living_with_partner),
all_of(raw_features_about_partner_birth_year),
all_of(raw_features_about_partner_gender)
) %>%
# Collect the most recent response to whether they live with a partner in a single variable
mutate(live_with_partner = coalesce(!!!syms(raw_features_about_living_with_partner))) %>%
# Collect the most recently reported partner birth year in a single variable
mutate(partner_birth_year = coalesce(!!!syms(raw_features_about_partner_birth_year))) %>%
# Collect the most recent indicator of partner's gender in a single variable
mutate(partner_gender = coalesce(!!!syms(raw_features_about_partner_gender))) %>%
# Remove raw data that was used in the coalesced variables
select(-all_of(raw_features_about_living_with_partner),
-all_of(raw_features_about_partner_birth_year),
-all_of(raw_features_about_partner_gender))

# If this is time-shifted data, filter the background data to 2017 and earlier
if("time_shifted_data" %in% colnames(train_data)) {
background_data <- background_data %>%
filter(wave <= 201712)
}

# For each person, filter to only the most recent wave in which they appeared
background_most_recent_wave <- background_data %>%
group_by(nomem_encr) %>%
arrange(desc(wave)) %>%
slice_head() %>%
ungroup() %>%
select(nomem_encr, nohouse_encr, positie)

# Merge household ID and household position data with training data
train_subsetted_columns <- left_join(train_subsetted_columns, background_most_recent_wave, by = "nomem_encr")

# Create a copy of "train_subsetted_columns" to represent possible partners
train_partner <- train_subsetted_columns %>%
rename_with(~ paste0(., "_PartnerSurvey"), -nohouse_encr)

# Merge train_subsetted_columns with train_partner
# This produces a dataframe that only contains people whose partner also responded to the survey
subsetted_train_linked_with_partner <- train_subsetted_columns %>%
left_join(train_partner, by = "nohouse_encr", relationship = "many-to-many") %>%
filter(# Remove rows where person was linked to self
nomem_encr != nomem_encr_PartnerSurvey,
# Filter to only people who are head of household, wedded partner, or unwedded partner in most recent wave where they appeared
positie %in% c(1,2,3),
positie_PartnerSurvey %in% c(1,2,3),
# Filter to people from households where at least one supposed partner reported living together with a partner
((live_with_partner == 1) | (live_with_partner_PartnerSurvey ==1)),
# Remove rows where reported birthyears are mismatched
(partner_birth_year == birthyear_bg_PartnerSurvey | is.na(partner_birth_year) | is.na(birthyear_bg_PartnerSurvey)),
(partner_birth_year_PartnerSurvey == birthyear_bg | is.na(partner_birth_year_PartnerSurvey) | is.na(birthyear_bg)),
# Remove rows where reported genders are mismatched
(partner_gender == gender_bg_PartnerSurvey | is.na(partner_gender) | is.na(gender_bg_PartnerSurvey)),
(partner_gender_PartnerSurvey == gender_bg | is.na(partner_gender_PartnerSurvey) | is.na(gender_bg))
)

# Select only the data about the partner (we'll merge this into the full training data, which already has data on the ego)
partner_variables_to_keep <- paste0(features_to_use_as_partner_data_in_model, "_PartnerSurvey")
subsetted_train_linked_with_partner <- subsetted_train_linked_with_partner %>%
select(nomem_encr, all_of(partner_variables_to_keep))

# Merge the data about the partner with the full train data
# This produces a dataframe with everyone from the training data, even if they don't have a partner
full_train_linked_with_partner <- left_join(train_data, subsetted_train_linked_with_partner, by = "nomem_encr")

# Create an indicator for whether there is partner survey data
ids_that_have_partner_survey <- subsetted_train_linked_with_partner$nomem_encr
full_train_linked_with_partner <- full_train_linked_with_partner %>%
mutate(partner_survey_available = ifelse(nomem_encr %in% ids_that_have_partner_survey, 1, 0))

return(full_train_linked_with_partner)
}

#### QUALITY ASSURANCE CHECKS ####

# Check that responses are usually reasonably aligned between ego and partner
toy <- merge_in_partner_data(train_full, background_full)

# How many kids do you have?
table(toy$cf20m455, toy$cf20m455_PartnerSurvey, useNA = "ifany")
table(toy$cf19l455, toy$cf19l455_PartnerSurvey, useNA = "ifany")

# Do you expect to have more kids? (3 = "I don't know")
table(toy$cf20m128, toy$cf20m128_PartnerSurvey, useNA = "ifany")
table(toy$cf19l128, toy$cf19l128_PartnerSurvey, useNA = "ifany")

# How many more kids do you expect to have?
table(toy$cf20m129, toy$cf20m129_PartnerSurvey, useNA = "ifany")
table(toy$cf19l129, toy$cf19l129_PartnerSurvey, useNA = "ifany")

# Within how many years do you expect to have more kids?
table(toy$cf20m130, toy$cf20m130_PartnerSurvey, useNA = "ifany")
table(toy$cf19l130, toy$cf19l130_PartnerSurvey, useNA = "ifany")

# Test the function on time-shifted data
train_2018to2020 <- read.csv("/Users/ecantrell/Documents/PreFer\ 2024/time_shifted_data/train_data_for_2018to2020.csv")
toy_time_shifted <- merge_in_partner_data(train_2018to2020, background_full)

# How many kids do you have?
table(toy_time_shifted$cf20m455, toy_time_shifted$cf20m455_PartnerSurvey, useNA = "ifany")
table(toy_time_shifted$cf19l455, toy_time_shifted$cf19l455_PartnerSurvey, useNA = "ifany")
8 changes: 7 additions & 1 deletion feature_time_shift.R
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,13 @@ features_for_2018to2020 <- features_for_2018to2020 %>%
cf10c026 = NA,
cf08a028 = NA,
cf09b028 = NA,
cf10c028 = NA
cf10c028 = NA,
cf08a025 = NA,
cf09b025 = NA,
cf10c025 = NA,
cf08a032 = NA,
cf09b032 = NA,
cf10c032 = NA
)

######## STEP 6: SAVE THE FILES! ########
Expand Down
Loading

0 comments on commit 807890e

Please sign in to comment.