SPHC-2021.Rmd

---
title: "Self-reported sexual identity in Stockholm County in 2021"
author: Guoqiang Zhang, Maya Mathur, Matteo Quartagno
email: guoqiang.zhang@ki.se
output: html_notebook
editor_options: 
  chunk_output_type: console
---

### 1. Load Packages
```{r}
library(naniar)
library(tidyverse)
library(survey)
library(vcd)
library(rcompanion)
library(mitml)
library(mitools)
source("/Users/guoqiang.zhang/Library/CloudStorage/OneDrive-KarolinskaInstitutet/Karolinska Institutet/Research Projects/Population Trends and Individual Fluidity of Sexual Identity in Stockholm County/Research Letter/Population_Trends_and_Individual_Fluidity_of_Sexual_Identity_in_Stockholm_County_2010_to_2021/Helper_functions.R") # helper function
```

### 2. Import and Prepare Data
```{r}
# the RData file is expected to provide the respondent-level data frame `d_2021`
load("/Volumes/LGBT Project data/d_2021.RData")

# sampling strata (numeric codes stored in `stratum`)
n_miss( d_2021$stratum ) # number of rows without a stratum code
d_2021$sampling_strata <- as.factor( d_2021$stratum )
length( unique( d_2021$sampling_strata ) ) # 38 strata

# lookup: stratum code -> region label
stratum_region_lookup <- c(
  "114" = "Upplands-Väsby",
  "115" = "Vallentuna",
  "117" = "Österåker",
  "120" = "Värmdö",
  "123" = "Järfälla",
  "125" = "Ekerö",
  "126" = "Huddinge",
  "127" = "Botkyrka",
  "128" = "Salem",
  "136" = "Haninge",
  "138" = "Tyresö",
  "139" = "Upplands-Bro",
  "140" = "Nykvarn",
  "160" = "Täby",
  "162" = "Danderyd",
  "163" = "Sollentuna",
  "181" = "Södertälje",
  "182" = "Nacka",
  "183" = "Sundbyberg",
  "184" = "Solna",
  "186" = "Lidingö",
  "187" = "Vaxholm",
  "188" = "Norrtälje",
  "191" = "Sigtuna",
  "192" = "Nynäshamn",
  "180103" = "Spånga-Tensta",
  "180104" = "Hässelby-Vällingby",
  "180106" = "Bromma",
  "180108" = "Kungsholmen",
  "180109" = "Norrmalm",
  "180110" = "Östermalm",
  "180115" = "Skarpnäck",
  "180118" = "Farsta",
  "180124" = "Skärholmen",
  "180125" = "Rinkeby-Kista",
  "180127" = "Södermalm",
  "180128" = "Enskede-Årsta-Vantör",
  "180129" = "Hägersten-Älvsjö"
)

# relabel the stratum factor by splicing the lookup into recode() as old = new pairs
d_2021 <- d_2021 %>%
  mutate( sampling_strata_region = recode( sampling_strata, !!!stratum_region_lookup ) )

# sexual identity
# item F21F45_Q57: 1 -> Heterosexual, 2 -> Homosexual, 3 -> Bisexual,
# any other non-missing code -> "None of the above"; missing stays NA
table( d_2021$F21F45_Q57, useNA = "always" )
d_2021$sexual_identity_2021 <- with( d_2021, factor(
  case_when(
    F21F45_Q57 == 1 ~ "Heterosexual",
    F21F45_Q57 == 2 ~ "Homosexual",
    F21F45_Q57 == 3 ~ "Bisexual",
    !is.na( F21F45_Q57 ) ~ "None of the above" ),
  levels = c( "Heterosexual", "Homosexual", "Bisexual", "None of the above" ) ) )
table( d_2021$sexual_identity_2021, useNA = "always" )

# sex
# kon: 1 -> Male, any other non-missing code -> Female
table( d_2021$kon, useNA = "always" )
d_2021$sex <- with( d_2021, factor(
  case_when(
    kon == 1 ~ "Male",
    !is.na( kon ) ~ "Female" ),
  levels = c( "Male", "Female" ) ) )
table( d_2021$sex, useNA = "always" )

# age
summary( d_2021$F21alder )
d_2021$age <- d_2021$F21alder

# generation
# birth year is approximated as survey year minus age
d_2021$birth_year <- 2021 - d_2021$age
generation_levels <- c( "Generation Z (1997–2012)", "Millennials (1981–1996)", "Generation X (1965–1980)", "Baby Boomers (1946–1964)", "Silent Generation (1928–1945)", "Greatest Generation (1901–1927)" )
d_2021 <- d_2021 %>%
  mutate( generation = factor(
    case_when(
      birth_year >= 1997 & birth_year <= 2012 ~ generation_levels[ 1 ],
      birth_year >= 1981 & birth_year <= 1996 ~ generation_levels[ 2 ],
      birth_year >= 1965 & birth_year <= 1980 ~ generation_levels[ 3 ],
      birth_year >= 1946 & birth_year <= 1964 ~ generation_levels[ 4 ],
      birth_year >= 1928 & birth_year <= 1945 ~ generation_levels[ 5 ],
      birth_year >= 1901 & birth_year <= 1927 ~ generation_levels[ 6 ] ),
    levels = generation_levels ) )
summary( d_2021$generation )

# country of birth
# Swedish region labels are translated; any other non-missing label becomes "Americas"
table( d_2021$fodelseland, useNA = "always" )
d_2021$country_of_birth <- with( d_2021, factor(
  case_when(
    fodelseland == "Afrika" ~ "Africa",
    fodelseland == "Asien" ~ "Asia",
    fodelseland == "Europa" ~ "Europe",
    fodelseland == "Sverige" ~ "Sweden",
    !is.na( fodelseland ) ~ "Americas" ),
  levels = c( "Africa", "Americas", "Asia", "Europe", "Sweden" ) ) )
table( d_2021$country_of_birth, useNA = "always" )

# education
# utbniva2021 codes: <= 2 -> "<=9 years", 3-4 -> "10-12 years", above 4 -> ">=13 years"
table( d_2021$utbniva2021, useNA = "always" )
d_2021$education <- with( d_2021, factor(
  case_when(
    utbniva2021 <= 2 ~ "<=9 years",
    utbniva2021 <= 4 ~ "10-12 years",
    !is.na( utbniva2021 ) ~ ">=13 years" ),
  levels = c( "<=9 years", "10-12 years", ">=13 years" ) ) )
table( d_2021$education, useNA = "always" )

# disposable income
# summary() has no `useNA` argument (it was silently ignored); for a numeric
# vector the NA count is already part of the summary output
summary( d_2021$dispink2021 )

# negative disposable income is treated as missing; which() skips entries that are already NA
d_2021$dispink2021[ which( d_2021$dispink2021 < 0 ) ] <- NA

# income bands: right-closed intervals <=2,500 | (2,500, 3,500] | (3,500, 4,500] | >4,500;
# missing income stays NA
d_2021$income <- cut( d_2021$dispink2021,
                      breaks = c( -Inf, 2500, 3500, 4500, Inf ),
                      labels = c( "<=2,500", "(2,500, 3,500]", "(3,500, 4,500]", ">4,500" ),
                      right = TRUE )
table( d_2021$income, useNA = "always" )

# marital status
# civil2021 codes "G" and "RP" -> "Currently married", "OG" -> "Never married",
# any other non-missing code -> "Other"; missing stays NA
table( d_2021$civil2021, useNA = "always" )
d_2021$marital_status <- with( d_2021, factor(
  case_when(
    civil2021 %in% c( "G", "RP" ) ~ "Currently married",
    civil2021 == "OG" ~ "Never married",
    !is.na( civil2021 ) ~ "Other" ),
  levels = c( "Never married", "Currently married", "Other" ) ) )
table( d_2021$marital_status, useNA = "always" )


##### design weights #####

# F21dvikt is carried forward under a descriptive name
summary( d_2021$F21dvikt )
d_2021$design_weight <- d_2021$F21dvikt


##### calibrated weights #####

d_2021$calibrated_weight <- d_2021$F21kalvikt


##### non-response #####

# unit non-response
# F21dbvikt: weights calculated assuming Missing Completely At Random (MCAR) within each stratum
d_2021$design_weight_unit_nonresponse <- d_2021$F21dbvikt
summary( d_2021$design_weight_unit_nonresponse )
sum( d_2021$design_weight_unit_nonresponse ) # No. of source population = 1,827,424

# per stratum:
#   unitresponse_prob = design weight / non-response-adjusted weight
#   no.of.population  = sum of the non-response-adjusted weights
#   sample_size       = no.of.population / design weight
# unique() is used because both weights are expected to be constant within a stratum
unitresponse_prob <- summarise(
  group_by( d_2021, sampling_strata_region ),
  unitresponse_prob = unique( design_weight ) / unique( design_weight_unit_nonresponse ),
  no.of.population = sum( design_weight_unit_nonresponse ),
  sample_size = unique( no.of.population )/unique( design_weight ) )

d_2021 <- left_join( d_2021, unitresponse_prob, by = "sampling_strata_region" )

# item non-response
# share of respondents in each stratum who answered the sexual identity item
itemresponse_prob <- summarise(
  group_by( d_2021, sampling_strata_region ),
  itemresponse_prob = sum( !is.na( sexual_identity_2021 ) ) / n() )

# attach the stratum rate, then set it to 0 for respondents who skipped the item
d_2021 <- left_join( d_2021, itemresponse_prob, by = "sampling_strata_region" )
d_2021 <- mutate( d_2021,
                  itemresponse_prob = replace( itemresponse_prob, is.na( sexual_identity_2021 ), 0 ) )

##### replicating non-respondent rows for sampling design #####

# because d_2021 includes data only for respondents (unit response), we need to duplicate rows for non-respondents
# so that the final dataset represents the sampling process
# per stratum: number of non-respondents = drawn sample size - number of respondents
rows_to_add <- d_2021 %>%
  group_by( sampling_strata_region ) %>%
  summarise( to_add = unique( sample_size ) - n(),
             no.of.population = unique( no.of.population ),
             sample_size = unique( sample_size ),
             design_weight = unique( design_weight ) )

sum( rows_to_add$to_add ) # 24,819 non-respondents

rows_to_add$to_add <- round( rows_to_add$to_add ) # to resolve floating-point precision issues

# guard: every stratum must yield a non-missing, non-negative number of rows to replicate
stopifnot( !anyNA( rows_to_add$to_add ), all( rows_to_add$to_add >= 0 ) )

# repeat each stratum row `to_add` times; integer row positions and column names are used
# instead of character row names and column positions, so the result does not depend on
# how tibbles coerce row names or on the column order of the summarise() call above
replicated_rows <- rows_to_add[ rep( seq_len( nrow( rows_to_add ) ), times = rows_to_add$to_add ),
                                c( "sampling_strata_region", "no.of.population", "sample_size", "design_weight" ) ]

# non-respondent rows carry only the design variables; all survey variables are NA for them
d_2021_complete <- bind_rows( d_2021, replicated_rows ) # 47,885 individuals in the original sample
```

### 3. Check Calibrated Weights and Proxy Auxiliary Variables
```{r}
# stratified design on the respondent data, using the delivered calibrated weights
survey_design_calibrated <- svydesign(
  ids = ~ 1,
  strata = ~ sampling_strata_region,
  weights = ~ calibrated_weight,
  fpc = ~ no.of.population,
  data = d_2021 )

# we were unable to re-calibrate the weights as recommended (https://cran.r-project.org/web/packages/survey/vignettes/precalibrated.pdf), because the original auxiliary variables used for calibration were not available anymore

svytable( ~ sampling_strata_region, survey_design_calibrated ) # calculate population totals for each sampling stratum

d_2021$calibrated_weight %>% summary() %>% round( 2 ) # check the distribution of calibrated weights
with( d_2021, calibrated_weight / design_weight ) %>% summary() %>% round( 2 ) # check the ratios of weights

# histogram of the calibrated weights; one text style is shared by all axis elements
axis_text_arial_11 <- element_text( family = "Arial", size = 11 )

ggplot( d_2021, aes( x = calibrated_weight ) ) +
  geom_histogram( binwidth = 1, fill = "steelblue", color = "steelblue" ) +
  theme_classic() +
  scale_x_continuous( breaks = c( 0, 100, 200, 300 ),
                      expand = c( 0.005, 0 ) ) +
  scale_y_continuous( limits = c( 0, 360 ),
                      breaks = c( 0, 100, 200, 300 ),
                      expand = c( 0, 0 ) ) +
  labs( x = "Calibrated Weights", y = "Frequency" ) +
  theme( axis.text.y = axis_text_arial_11,
         axis.title.y = axis_text_arial_11,
         axis.text.x = axis_text_arial_11,
         axis.title.x = axis_text_arial_11 )

# calculate strength of correlation between sexual identity in 2021 and proxy auxiliary variables
# the *_auxiliary columns are the versions of the covariates used for these association checks
d_2021_correlation <- d_2021 %>%
  rename( country_of_birth_auxiliary = fodelseland2 ) %>%
  mutate(
    age_auxiliary = factor(
      case_when(
        age <= 29 ~ "<=29",
        age >=30 & age <= 44 ~ "30-44",
        age >=45 & age <= 64 ~ "45-64",
        age >=65 & age <= 79 ~ "65-79",
        age >= 80 ~ ">=80" ),
      levels = c( "<=29", "30-44", "45-64", "65-79", ">=80" ) ),
    sex_auxiliary = sex,
    country_of_birth_auxiliary = as.factor( country_of_birth_auxiliary ),
    # marital status collapsed to two categories
    marital_status_auxiliary = factor(
      if_else( marital_status == "Currently married", "Currently married", "Other" ) ),
    education_auxiliary = education,
    sampling_strata_region_auxiliary = sampling_strata_region )

summary( d_2021_correlation )

# two-way contingency table of two columns of `data` (rows with NA in either column are dropped by table())
cross_tab <- function( row_var, col_var, data = d_2021_correlation ) {
  table( data[[ row_var ]], data[[ col_var ]] )
}

# nominal auxiliary variables: association statistics from vcd::assocstats()
assocstats( cross_tab( "sexual_identity_2021", "sex_auxiliary" ) )
assocstats( cross_tab( "sexual_identity_2021", "country_of_birth_auxiliary" ) )
assocstats( cross_tab( "sexual_identity_2021", "marital_status_auxiliary" ) )
assocstats( cross_tab( "sexual_identity_2021", "sampling_strata_region_auxiliary" ) )

# ordered auxiliary variables: Freeman's theta, with sexual identity as the grouping (row) variable
freemanTheta( cross_tab( "sexual_identity_2021", "age_auxiliary" ), group = "row", digits = 2 )

# same statistic for age, separately for men and women
freemanTheta( cross_tab( "sexual_identity_2021", "age_auxiliary",
                         data = filter( d_2021_correlation, sex == "Male" ) ),
              group = "row", digits = 2 )

freemanTheta( cross_tab( "sexual_identity_2021", "age_auxiliary",
                         data = filter( d_2021_correlation, sex == "Female" ) ),
              group = "row", digits = 2 )

freemanTheta( cross_tab( "sexual_identity_2021", "education_auxiliary" ), group = "row", digits = 2 )

# references
freemanTheta( cross_tab( "age_auxiliary", "education_auxiliary" ), group = "row", digits = 2 )
freemanTheta( cross_tab( "age_auxiliary", "income" ), group = "row", digits = 2 )
freemanTheta( cross_tab( "education_auxiliary", "income" ), group = "row", digits = 2 )
```

### 4. Multiple Imputation
#### 4.1. Obtain incomplete dataset
```{r}
summary( d_2021 )

# design variables, the outcome and the covariates that enter the imputation model
selected_variables <- c( "sampling_strata_region", "calibrated_weight", "no.of.population", "sexual_identity_2021", "age", "sex", "country_of_birth", "education", "dispink2021", "marital_status" )

# continuous income (dispink2021) is kept under the name `income`;
# country of birth is collapsed to three categories (Africa, Americas and Asia -> "Outside Europe")
d_2021_incomplete <- d_2021 %>%
  select( all_of( selected_variables ) ) %>%
  rename( income = dispink2021 ) %>%
  mutate(
    country_of_birth = factor(
      case_when(
        country_of_birth %in% c( "Africa", "Americas", "Asia" ) ~ "Outside Europe",
        country_of_birth == "Sweden" ~ "Sweden",
        country_of_birth == "Europe" ~ "Europe" ),
      levels = c( "Sweden", "Europe", "Outside Europe" ) ) )

# create weight strata (n = 25) for two-level multivariate normal imputation
# cut points are the 0%, 4%, ..., 100% quantiles of the calibrated weights
weight_strata_breaks <- quantile( d_2021_incomplete$calibrated_weight,
                                  probs = seq( 0, 1, length.out = 26 ) )
d_2021_incomplete$weight_strata <- cut( d_2021_incomplete$calibrated_weight,
                                        breaks = weight_strata_breaks,
                                        labels = 1:25,
                                        include.lowest = TRUE,
                                        right = TRUE )

# mean and standard deviation of the calibrated weights within each weight stratum
weight_strata_summary <- d_2021_incomplete %>%
  group_by( weight_strata ) %>%
  summarise( mean_weight = mean( calibrated_weight ), sd_weight = sd( calibrated_weight ) )

strata_axis_arial_11 <- element_text( family = "Arial", size = 11 )

ggplot( weight_strata_summary,
        aes( y = reorder( weight_strata, -mean_weight ), x = mean_weight ) ) +
  geom_line( aes( group = 1 ) ) +
  geom_point() +
  geom_errorbarh( aes( xmin = mean_weight - sd_weight, xmax = mean_weight + sd_weight ), height = 0.4, alpha = 0.5 ) +
  scale_x_continuous( limits = c( 0, 260 ),
                      breaks = c( 0, 50, 100, 150, 200, 250 ),
                      expand = c( 0, 0 ) ) +
  labs( y = "Calibrated Weight Strata", x = "Mean and Standard Deviation of Calibrated Weights" ) +
  theme_classic() +
  theme( axis.text.y = strata_axis_arial_11,
         axis.title.y = strata_axis_arial_11,
         axis.text.x = strata_axis_arial_11,
         axis.title.x = strata_axis_arial_11 )

summary( d_2021_incomplete )
sapply( d_2021_incomplete, class ) # all continuous variables are numeric, and all categorical variables are factor
# save( d_2021_incomplete, file = "/Volumes/LGBT Project data/Multiple Imputation/d_2021_incomplete.RData" )
```

#### 4.2. Visualize missing pattern
```{r}
# identify missing variables
# plot of the missingness pattern, total number of missing cells, and a per-variable summary
d_2021_incomplete %>% vis_miss()
d_2021_incomplete %>% n_miss()
d_2021_incomplete %>% miss_var_summary() # 2.2% missing in sexual_identity_2021, 2.1% in education, and 0.2% in income
```

#### 4.3. Two-level multivariate normal imputation
```{r}
# use multiple imputation to account for item non-response
# assume Missing At Random (MAR)
# workflow: (1) short dry run to check that the model is specified correctly,
#           (2) long preliminary run to inspect convergence,
#           (3) final run that produces the m = 20 imputed datasets used downstream

# specify imputation model
# left-hand side: the three incomplete variables (sexual identity, education, continuous income)
# right-hand side: fully observed predictors, plus a random intercept for the weight strata
fml_imp_2021 <- sexual_identity_2021 + education + income ~ 1 + age*sex + country_of_birth + marital_status + ( 1 | weight_strata )

# perform a "dry run"
# deliberately tiny numbers of iterations: this only checks that the call runs
imp_ini_2021 <- jomoImpute( data = d_2021_incomplete, # incomplete dataset
                            formula = fml_imp_2021, # imputation model
                            random.L1 = "full", # random covariance matrix of residuals
                            n.burn = 2, # number of burn-in iterations
                            n.iter = 10, # number of between-imputation iterations
                            m = 2, # number of imputation
                            seed = 12345 # set the seed so that results can be reproduced
                            )

summary( imp_ini_2021 ) # check output


# increase the number of burn-in and between-imputation iterations
# to determine the number of iterations for the final imputation
imp_preliminary_2021 <- jomoImpute( data = d_2021_incomplete,
                                    formula = fml_imp_2021,
                                    random.L1 = "full",
                                    n.burn = 5000,
                                    n.iter = 1000,
                                    m = 2,
                                    seed = 12345
                                    ) # took around 30 minutes

# save( imp_preliminary_2021, file = "/Volumes/LGBT Project data/Multiple Imputation/imp_preliminary_2021.RData" )
summary( imp_preliminary_2021 ) # summarize model and display convergence statistics
plot( imp_preliminary_2021, trace = "all", print = "beta" ) # check trace and autocorrelation plots


# final imputation with the chosen number of iterations
# NOTE(review): the burn-in here (2000) is shorter than in the preliminary run (5000);
# presumably chosen from the diagnostics above - confirm
imp_final_2021 <- jomoImpute( data = d_2021_incomplete,
                              formula = fml_imp_2021,
                              random.L1 = "full",
                              n.burn = 2000,
                              n.iter = 1000,
                              m = 20,
                              seed = 12345
                              ) # took around 1 hour and a half

# save( imp_final_2021, file = "/Volumes/LGBT Project data/Multiple Imputation/imp_final_2021.RData" )
# repeat the convergence checks on the final run
summary( imp_final_2021 )
plot( imp_final_2021, trace = "all", print = "beta" )
```

#### 4.4. Validate imputed data
```{r}
# extract imputed datasets
implist_2021 <- mitmlComplete( imp_final_2021, print = "all" ) # extract all imputed datasets
original_data_2021 <- mitmlComplete( imp_final_2021, print = 0 ) %>% # extract original incomplete dataset
  mutate( imputation = "0" )

# stack the original data (imputation 0) on top of the imputed datasets (imputation 1, 2, ...)
all_data_2021 <- bind_rows( original_data_2021,
                            bind_rows( implist_2021, .id = "imputation" ) ) # merge datasets
all_data_2021$imputation <- as.numeric( all_data_2021$imputation )
summary( all_data_2021 )

# stacked bar chart of the category shares of `variable` per imputation number;
# rows where `variable` is missing (only possible for imputation 0) are left out
plot_imputed_shares <- function( data, variable, legend_title ) {
  arial_11 <- element_text( family = "Arial", size = 11 )
  arial_10 <- element_text( family = "Arial", size = 10 )

  ggplot( data[ !is.na( data[[ variable ]] ), ],
          aes( fill = .data[[ variable ]], x = imputation ) ) +
    geom_bar( position = "fill" ) +
    scale_y_continuous( labels = scales::percent ) +
    scale_fill_discrete( name = legend_title ) +
    labs(
      x = "Imputation number",
      y = "Proportion",
      caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
    theme_classic() +
    theme( axis.title.x = arial_11,
           axis.text.x = arial_11,
           axis.text.y = arial_11,
           axis.title.y = arial_11,
           legend.text = arial_10,
           legend.title = arial_10,
           legend.position = "bottom",
           plot.caption = element_text( family = "Arial", size = 10, hjust = 0 ) )
}

# sexual identity in 2021
plot_imputed_shares( all_data_2021, "sexual_identity_2021", "Sexual identity in 2021" )

# education
plot_imputed_shares( all_data_2021, "education", "Level of education" )

# income
summary( all_data_2021$income )
sum( all_data_2021$income < 0, na.rm = TRUE ) #  152 imputed values are negative
```

#### 4.5. Prepare for survey analyses
```{r}
# transform certain variables in imputed datasets
implist_2021 <- mitmlComplete( imp_final_2021, print = "all" ) # extract all imputed datasets
summary( implist_2021[[1]] )

# add the derived analysis variables to every imputed dataset
implist_2021_transformed <- lapply( implist_2021, function( df ) {

  # generation labels, youngest first; visible inside within() through the enclosing function
  generation_levels <- c( "Generation Z (1997–2012)", "Millennials (1981–1996)", "Generation X (1965–1980)", "Baby Boomers (1946–1964)", "Silent Generation (1928–1945)", "Greatest Generation (1901–1927)" )

  within( df, {

    # age group: two-year bands from 15 to 79, plus one band (79, 100]
    # NOTE(review): the complete-case analysis (section 5.1.3) cuts age with breaks
    # seq( 15, 79, by = 2 ) only, so ages above 79 are NA there - confirm the difference is intended
    age_group <- cut( age,
                      breaks = c( seq( 15, 79, by = 2 ), 100 ),
                      include.lowest = TRUE,
                      right = TRUE )

    # generation, from birth year approximated as survey year minus age
    birth_year <- 2021 - age
    generation <- factor(
      case_when(
        birth_year >= 1997 & birth_year <= 2012 ~ generation_levels[ 1 ],
        birth_year >= 1981 & birth_year <= 1996 ~ generation_levels[ 2 ],
        birth_year >= 1965 & birth_year <= 1980 ~ generation_levels[ 3 ],
        birth_year >= 1946 & birth_year <= 1964 ~ generation_levels[ 4 ],
        birth_year >= 1928 & birth_year <= 1945 ~ generation_levels[ 5 ],
        birth_year >= 1901 & birth_year <= 1927 ~ generation_levels[ 6 ] ),
      levels = generation_levels )

    # sexual identity in 2021: rename the residual level, then derive a three-category version
    levels( sexual_identity_2021 )[ levels( sexual_identity_2021 ) == "None of the above" ] <- "Other"
    sexual_identity_lgb <- factor(
      case_when( sexual_identity_2021 %in% c( "Homosexual", "Bisexual" ) ~ "LGB",
                 sexual_identity_2021 == "Heterosexual" ~ "Heterosexual",
                 sexual_identity_2021 == "Other" ~ "Other" ),
      levels = c( "Heterosexual", "LGB", "Other" ) )
  } )
} )

# stacked copy of all imputed datasets, used only for the checks below
imputed_data_2021_transformed <- implist_2021_transformed %>%
  bind_rows( .id = "imputation" ) %>% # merge datasets
  mutate( imputation = as.numeric( imputation ) )
summary( imputed_data_2021_transformed )
n_miss( imputed_data_2021_transformed ) # 0 missing


# use calibrated weights to account for unit non-response
# create survey design
# one design object holding all imputed datasets (mitools::imputationList)
survey_design_imp <- svydesign( ids = ~ 1,
                                strata = ~ sampling_strata_region,
                                weights = ~ calibrated_weight,
                                fpc = ~ no.of.population,
                                data = imputationList( implist_2021_transformed )
                                ) # without pre-calibration
```

### 5. Estimation of Proportions of Sexual Identities
#### 5.1. Complete-case analysis
##### 5.1.1. Define survey design
```{r}
# sexual identity categories for which proportions are estimated
categories <- c( "Homosexual", "Bisexual" )

# complete-case analysis uses the full sample (respondents plus replicated non-respondent rows)
d_2021_complete_cc <- d_2021_complete

# create survey design
# design weights (not calibrated weights) are used here
survey_design_cc <- svydesign(
  ids = ~ 1,
  strata = ~ sampling_strata_region,
  weights = ~ design_weight,
  fpc = ~ no.of.population,
  data = d_2021_complete_cc )

# the following analyses assume MCAR
```

##### 5.1.2. Calculate proportion by generation
```{r}
# domain of analysis: units with observed sexual identity and generation
design_cc_generation <- subset( survey_design_cc, !is.na( sexual_identity_2021 ) & !is.na( generation ) )

# suffixes of the three result columns (point estimate and confidence limits)
result_suffixes <- c( "_point_estimate_2021", "_lower_ci_2021", "_upper_ci_2021" )

# one table per sexual identity category: proportion by generation with beta-method confidence intervals
list_of_df <- list()

for ( identity_label in categories ) {
  identity_formula <- as.formula( paste0( "~ I( sexual_identity_2021 == '", identity_label, "' )" ) )

  estimate_by_generation <- svyby( formula = identity_formula,
                                   by = ~ generation,
                                   design = design_cc_generation,
                                   FUN = svyciprop,
                                   vartype = "ci",
                                   method = "beta" )

  colnames( estimate_by_generation ) <- c( "subgroup", paste0( identity_label, result_suffixes ) )

  list_of_df[[ identity_label ]] <- estimate_by_generation
}

# put the categories side by side, one row per generation
prop_cc_generation <- Reduce( function( left, right ) merge( left, right, by = "subgroup" ),
                              list_of_df )

# unweighted number of complete cases per generation
sample_size_by_generation <- d_2021_complete_cc %>%
  filter( !is.na( sexual_identity_2021 ), !is.na( generation ) ) %>%
  count( subgroup = generation, name = "sample_size_2021" )

prop_cc_generation <- prop_cc_generation %>%
  left_join( sample_size_by_generation, by = "subgroup" ) %>%
  mutate( sample_size_2021 = prettyNum( sample_size_2021, big.mark = ",", preserve.width = "none" ) )
```

##### 5.1.3. Calculate proportion by age groups
```{r}
# full sample with an added two-year age band
# NOTE(review): breaks stop at 79, so ages above 79 get age_group = NA and drop out of the
# estimates below, whereas the imputed datasets (section 4.5) add a (79, 100] band -
# confirm that this difference is intended
d_2021_iteration_age_group <- d_2021_complete %>%
  mutate( age_group = cut( age,
                           breaks = seq( 15, 79, by = 2 ),
                           include.lowest = TRUE,
                           right = TRUE ) )
table( d_2021_iteration_age_group$age_group )

survey_design_iteration_age_group <- svydesign(
  ids = ~ 1,
  strata = ~ sampling_strata_region,
  weights = ~ design_weight,
  fpc = ~ no.of.population,
  data = d_2021_iteration_age_group )

# domain of analysis: units with observed sexual identity and age group
design_cc_age_group <- subset( survey_design_iteration_age_group,
                               !is.na( sexual_identity_2021 ) & !is.na( age_group ) )

# one table per sexual identity category: proportion by age group with beta-method confidence intervals
list_of_df <- list()

for ( identity_label in categories ) {
  identity_formula <- as.formula( paste0( "~ I( sexual_identity_2021 == '", identity_label, "' )" ) )

  estimate_by_age_group <- svyby( formula = identity_formula,
                                  by = ~ age_group,
                                  design = design_cc_age_group,
                                  FUN = svyciprop,
                                  vartype = "ci",
                                  method = "beta" )

  colnames( estimate_by_age_group ) <- c( "subgroup",
                                          paste0( identity_label, c( "_point_estimate_2021", "_lower_ci_2021", "_upper_ci_2021" ) ) )

  list_of_df[[ identity_label ]] <- estimate_by_age_group
}

# put the categories side by side, one row per age group, and export
prop_cc_age_group <- Reduce( function( left, right ) merge( left, right, by = "subgroup" ),
                             list_of_df )

prop_cc_age_group %>% writexl::write_xlsx( "prop_cc_age_group_2021.xlsx" )
```

##### 5.1.4. Overall proportions of sexual identities in Stockholm County
```{r}
# domain of analysis: units with observed sexual identity
design_cc_observed <- subset( survey_design_cc, !is.na( sexual_identity_2021 ) )

# one single-row table per sexual identity category: overall proportion with beta-method confidence interval
list_of_df <- list()

for ( identity_label in categories ) {
  overall_estimate <- svyciprop( formula = as.formula( paste0( "~ I( sexual_identity_2021 == '", identity_label, "' )" ) ),
                                 design = design_cc_observed,
                                 vartype = "ci",
                                 method = "beta" )
  overall_ci <- confint( overall_estimate )

  # point estimate, lower limit, upper limit
  overall_row <- data.frame( overall_estimate[ 1 ], overall_ci[ 1 ], overall_ci[ 2 ] )
  colnames( overall_row ) <- paste0( identity_label, c( "_point_estimate_2021", "_lower_ci_2021", "_upper_ci_2021" ) )

  list_of_df[[ identity_label ]] <- overall_row
}

# put the categories side by side and turn the row name into the `subgroup` column
prop_cc_overall <- Reduce( cbind, list_of_df )
prop_cc_overall <- rownames_to_column( as.data.frame( prop_cc_overall ), var = "subgroup" )

# the row name inherited from the estimate is replaced by a readable label
prop_cc_overall$subgroup[ 1 ] <- "Stockholm County"

# unweighted number of complete cases
n_observed <- data.frame( sample_size_2021 = sum( !is.na( d_2021_complete_cc$sexual_identity_2021 ) ) )

prop_cc_overall <- prop_cc_overall %>%
  bind_cols( n_observed ) %>%
  mutate( sample_size_2021 = prettyNum( sample_size_2021, big.mark = ",", preserve.width = "none" ) )
```

##### 5.1.5. Merge datasets
```{r}
# generation-specific rows first, county-wide row last
prop_cc_summary_2021 <- rbind( prop_cc_generation, prop_cc_overall )

# export the complete-case summary table
prop_cc_summary_2021 %>% writexl::write_xlsx( "prop_cc_summary_2021.xlsx" )
```

#### 5.2. Survey analyses of imputed datasets
```{r}
# among demographic subgroups
# calc_prop_imp_subgroup() and calc_prop_imp_overall() come from the sourced helper file
sexual_identities <- c( "Homosexual", "Bisexual" )
demographic_vars <- c( "generation" )

# one result table per demographic variable, keyed by the variable name
prop_imp_list_subgroup_2021 <- list()

for( demog_var in demographic_vars ) {
  prop_imp_list_subgroup_2021[[ demog_var ]] <- calc_prop_imp_subgroup(
    implist = implist_2021_transformed,
    design = survey_design_imp,
    sexual_identities = sexual_identities,
    demog_var = demog_var,
    year = 2021 )
}

# stack the per-variable tables and drop the row names created by rbind
prop_imp_summary_subgroup_2021 <- do.call( rbind, prop_imp_list_subgroup_2021 )
rownames( prop_imp_summary_subgroup_2021 ) <- NULL

# in Stockholm County
prop_imp_overall_2021 <- calc_prop_imp_overall(
  design = survey_design_imp,
  sexual_identities = sexual_identities,
  year = 2021 )

# merge results
prop_imp_summary_2021 <- rbind( prop_imp_summary_subgroup_2021, prop_imp_overall_2021 )
prop_imp_summary_2021 %>% writexl::write_xlsx( "prop_imp_summary_2021.xlsx" )

# among age group
prop_imp_age_group <- calc_prop_imp_subgroup(
  implist = implist_2021_transformed,
  design = survey_design_imp,
  sexual_identities = sexual_identities,
  demog_var = "age_group",
  year = 2021 )
prop_imp_age_group %>% writexl::write_xlsx( "prop_imp_age_group_2021.xlsx" )

# calculate overall proportion of LGB identity by generation
# estimates from each imputed dataset are pooled with MIcombine()
prop_lgb_generation <- summary( MIcombine( with( survey_design_imp,
                                                 svyby( formula = ~ I( sexual_identity_lgb == "LGB" ),
                                                        by = ~ generation,
                                                        FUN = svyciprop,
                                                        method = "beta" ) ) ) )

# point estimate and interval limits, shown as percentages with one decimal
prop_lgb_generation[ , c( "results", "(lower", "upper)" ) ] %>%
  rownames_to_column( var = "subgroup" ) %>%
  mutate( across( -1, ~ round(. * 100, 1 ) ) )
```

```{r}
# reproducibility housekeeping with renv; both calls act on the project, not on the analysis objects

# check status
renv::status()

# record R package versions
# NOTE(review): presumably this updates the project lockfile - confirm it is meant to run on every knit
renv::snapshot()
```