01_dataCleaning.Rmd

---
title: "01_dataCleaning"
output: html_document
---

```{r setup, include=FALSE, message=F}

#conditional install and load libraries
if (!require("pacman")) install.packages("pacman")
pacman::p_load(foreign, tidyverse, purrr, dplyr)

#read the raw SPSS export (the file used for the SEM analysis) as a data frame
df <- foreign::read.spss(
  file = "../data/raw/July24_NRC_analysis14.sav",
  to.data.frame = TRUE
)

```

__clean the reading scales__

```{r readingScales}

#one data frame per reading scale, picked by column-name pattern
#(expected number of items: sentences 28, synonyms 60, analogies 20)
df_sentences <- df[, grep("^SC.*_scored$", names(df))]
df_synonyms  <- df[, grep("^Syn.*_scored$", names(df))]
df_analogies <- df[, grep("^A.*_scored$", names(df))]

#replace every missing value with 0
df_sentences <- replace(df_sentences, is.na(df_sentences), 0)
df_synonyms  <- replace(df_synonyms, is.na(df_synonyms), 0)
df_analogies <- replace(df_analogies, is.na(df_analogies), 0)

```

__clean the ART (print exposure)__

```{r ART scale}

#keep every column whose name contains "ART_"
df_ART <- df[, grep("ART_", names(df), value = TRUE)]

#remove subscale scores -- they are recalculated later to check the db
#calculations; only item columns have digits in their names
df_ART <- df_ART[, grep("[0-9]+", names(df_ART))]

#zero-pad the item number so names match the fict/nonf/foil names from
#Dr. Mar/Eric: 5-character names (e.g. ART_7) get "00" and 6-character names
#(e.g. ART_12) get "0" inserted after the 4th character; other names are left
#alone. Done on the whole name vector at once, so it is also safe when no
#columns were selected (a 1:length() loop would fail there).
art_names <- names(df_ART)
name_len  <- nchar(art_names)
needs_pad <- name_len %in% c(5L, 6L)
art_names[needs_pad] <- paste0(
  substr(art_names[needs_pad], 1, 4),
  strrep("0", 7L - name_len[needs_pad]),
  substring(art_names[needs_pad], 5)
)

#remove underscore (ART_007 -> ART007, i.e. 6 characters)
names(df_ART) <- gsub("_", "", art_names, fixed = TRUE)

#change all NAs to 0
df_ART[is.na(df_ART)] <- 0

#split the ART items into fiction, non-fiction and foil sets
#(column order follows the lists below; fiction is missing ART060 and ART031)
df_fict <- df_ART[c(
  "ART019", "ART023", "ART028", "ART035", "ART036", "ART064", "ART069", "ART073",
  "ART077", "ART083", "ART085", "ART087", "ART091", "ART102", "ART117", "ART121",
  "ART122", "ART132", "ART140", "ART141", "ART157", "ART163", "ART179", "ART192",
  "ART002", "ART009", "ART011", "ART037", "ART051", "ART056", "ART058", "ART071",
  "ART079", "ART104", "ART105", "ART112", "ART131", "ART138", "ART146", "ART149",
  "ART150", "ART151", "ART156", "ART162", "ART181", "ART182", "ART183", "ART191",
  "ART198", "ART012", "ART018", "ART020", "ART026", "ART032", "ART042", "ART048",
  "ART065", "ART070", "ART074", "ART081", "ART096", "ART097", "ART098", "ART099",
  "ART101", "ART107", "ART119", "ART133", "ART135", "ART168", "ART170", "ART171",
  "ART173", "ART003", "ART054", "ART066", "ART093", "ART125", "ART142", "ART186",
  "ART190", "ART193", "ART200", "ART004", "ART005", "ART008", "ART017", "ART021",
  "ART022", "ART038", "ART059", "ART067", "ART082", "ART084", "ART089", "ART090",
  "ART094", "ART111", "ART147", "ART166", "ART172", "ART187", "ART188", "ART189",
  "ART194", "ART196", "ART197", "ART199"
)]

df_nonf <- df_ART[c(
  "ART015", "ART044", "ART123", "ART128", "ART129", "ART134", "ART136", "ART148",
  "ART158", "ART180", "ART029", "ART045", "ART068", "ART076", "ART110", "ART114",
  "ART120", "ART145", "ART160", "ART177", "ART007", "ART033", "ART039", "ART046",
  "ART086", "ART118", "ART154", "ART175", "ART176", "ART185", "ART010", "ART014",
  "ART025", "ART075", "ART078", "ART088", "ART095", "ART124", "ART137", "ART167",
  "ART013", "ART047", "ART080", "ART103", "ART109", "ART130", "ART143", "ART144",
  "ART165", "ART174"
)]

df_foil <- df_ART[c(
  "ART001", "ART006", "ART016", "ART024", "ART027", "ART030", "ART034", "ART040",
  "ART041", "ART043", "ART049", "ART050", "ART052", "ART053", "ART055", "ART057",
  "ART061", "ART062", "ART063", "ART072", "ART092", "ART100", "ART106", "ART108",
  "ART113", "ART115", "ART116", "ART126", "ART127", "ART139", "ART152", "ART153",
  "ART155", "ART159", "ART161", "ART164", "ART169", "ART178", "ART184", "ART195"
)]

```

__clean the PLoR__

```{r PLoR}

#column names of the full data set, reused by every selection below
df_names <- names(df)

#motivation items: names containing "Mtv" minus those containing "DMtv"
df_motivations <- df[, setdiff(grep("Mtv", df_names), grep("DMtv", df_names))]

#obstacle (demotivation) items: names containing "DMtv"
df_obstacles <- df[, grep("DMtv", df_names)]

#attitude items: names containing "Att" (raw and reverse-scored versions)
df_attitudes <- df[, grep("Att", df_names)]

#behaviour items: names containing "Bhv" (raw and reverse-scored versions)
df_behaviours <- df[, grep("Bhv", df_names)]

#Recode 7-point agreement labels (e.g. "Agree Strongly") to the codes "1"-"7".
#
#df: data frame of survey responses (factor, character or numeric columns).
#
#Returns a data frame of character columns with one row per input row and one
#column per input column (column names pass through data.frame()'s default
#name checking; row names are reset to the default). All whitespace is removed
#from every cell first. Cells that are not one of the seven labels, including
#NA, are left as they are after whitespace removal. The codes are returned as
#strings, not numbers.
recode_fn <- function(df){
  #labels as they look once whitespace has been removed
  likert_codes <- c(
    DisagreeStrongly        = "1",
    Disagree                = "2",
    DisagreeSomewhat        = "3",
    NeitherAgreenorDisagree = "4",
    AgreeSomewhat           = "5",
    Agree                   = "6",
    AgreeStrongly           = "7"
  )

  #as.matrix() is the same coercion apply() performs on a data frame, but
  #working on the matrix directly keeps the rows x columns shape even for a
  #single-row input (apply() hands back a plain vector in that case, which
  #then turns into one column with a row per original column)
  cells <- as.matrix(df)
  cells[] <- gsub("\\s+", "", cells) #remove spaces

  #recode by exact label match; everything else is left untouched
  code_idx <- match(cells, names(likert_codes))
  is_label <- !is.na(code_idx)
  cells[is_label] <- unname(likert_codes[code_idx[is_label]])

  #rebuild one character column per original column
  columns <- lapply(seq_len(ncol(cells)), function(j) unname(cells[, j]))
  names(columns) <- colnames(cells)
  data.frame(columns, stringsAsFactors = FALSE)
}

#recode the agreement labels in each of the four PLoR data frames
df_motivations <- df_motivations %>% recode_fn()
df_obstacles   <- df_obstacles %>% recode_fn()
df_attitudes   <- df_attitudes %>% recode_fn()
df_behaviours  <- df_behaviours %>% recode_fn()

################################
#Factor 1: Reading Motivations (n=15)
################################
# 
# #facet 1: reading to grow (n=7)
# df_motivations_1 <- df_motivations[, c(
#   'Mtv3',  #03. I read to learn new things (e.g., to learn a new skill).	
#   'Mtv8',  #08. I read to help me better understand myself (e.g., personal development).	
#   'Mtv14', #14. I read to improve my language ability (e.g., vocabulary, grammar).	
#   'Mtv16', #16. I read to better understand others.
#   'Mtv17', #17. I read to be a better role model for others.
#   'Mtv4',  #04. I read to think in new ways (e.g., be more creative, expand my worldview).	
#   'Mtv6'   #06. I read to feel connected to other people.	
# )]
# 
# #facet 2: read for enjoyment (n=5)
# df_motivations_2 <- df_motivations[, c(
#   'Mtv10', #10. I read to relax.	
#   'Mtv2',  #02. I read for entertainment (e.g., to pass the time).	
#   'Mtv1',  #01. I read for pleasure.	
#   'Mtv7',  #07. I read to become immersed in the world of the text.
#   'Mtv15'  #15. I read to experience certain emotions.	
# )]
# 
# #facet 3: reading to avoid guilt (n=2)
# df_motivations_3 <- df_motivations[, c(
#   'Mtv11', #11. I read because I would feel guilty/ embarrassed if I did not.
#   'Mtv9'   #9. I read to avoid interacting with others.	
# )]
# 
# #exclude
# df_motivations <- subset(df_motivations, select=-c(
#   Mtv5,  #05. I read to impress others.	
#   Mtv12, #12. I read because of school/ work requirements.	
#   Mtv13  #13. I read to keep up with current culture (e.g., new music, celebrities, current events and news).	
# ))

#note: these factors were updated on April 6, 2020, based on new EFA analysis by Kyle

#facet 1: reading for enjoyment (5 items)
df_motivations_1 <- df_motivations[c(
  "Mtv2",  #02. I read for entertainment (e.g., to pass the time).
  "Mtv1",  #01. I read for pleasure.
  "Mtv10", #10. I read to relax.
  "Mtv7",  #07. I read to become immersed in the world of the text.
  "Mtv15"  #15. I read to experience certain emotions.
)]

#facet 2: reading to grow (7 items)
df_motivations_2 <- df_motivations[c(
  "Mtv3",  #03. I read to learn new things (e.g., to learn a new skill).
  "Mtv8",  #08. I read to help me better understand myself (e.g., personal development).
  "Mtv14", #14. I read to improve my language ability (e.g., vocabulary, grammar).
  "Mtv16", #16. I read to better understand others.
  "Mtv17", #17. I read to be a better role model for others.
  "Mtv4",  #04. I read to think in new ways (e.g., be more creative, expand my worldview).
  "Mtv6"   #06. I read to feel connected to other people.
)]

#facet 3: reading due to the judgment or attention of others (3 items)
df_motivations_3 <- df_motivations[c(
  "Mtv11", #11. I read because I would feel guilty/ embarrassed if I did not.
  "Mtv5",  #05. I read to impress others.
  "Mtv9"   #09. I read to avoid interacting with others.
)]

#drop the items that belong to no facet from the full motivations frame
df_motivations <- dplyr::select(df_motivations, -c(
  Mtv12, #12. I read because of school/ work requirements.
  Mtv13  #13. I read to keep up with current culture (e.g., new music, celebrities, current events and news).
))

################################
#Factor 2: Reading Obstacles (n=14)
################################

# #facet 1: do not enjoy  (n=7)
# df_obstacles_1 <- df_obstacles[, c(
#   'DMtv4',  #04. I often don't read because I find it boring.	
#   'DMtv11', #11. I often don't read because I never got in the habit.	
#   'DMtv5',  #05. often don't read because I don't have to.	
#   'DMtv6',  #06. I often don't read because I don't see the point.
#   'DMtv7',  #07. I often don't read because I'd rather be doing something else.	
#   'DMtv13', #13. I often don't read because being asked to analyze books in highschool made reading less pleasurable.
#   'DMtv12'  #12. I often don't read because being assigned things to read in highschool ruined it for me.
# )]
# 
# #facet 2: circumstances (n=4)
# df_obstacles_2 <- df_obstacles[, c(
#   'DMtv1',  #01. I often don't read because I don't have enough time.	
#   'DMtv2',  #02. I often don't read because I am feeling too tired.	
#   'DMtv9',  #09. I often don't read because of the cost.	
#   'DMtv14'  #14. I read all day at work/school and this prevents me from enjoying reading.
# )]
# 
# #facet 3: reading is not easy (n=4)
# df_obstacles_3 <- df_obstacles[, c(
#   'DMtv15', #15. I often don't read because my friends don't like to read.
#   'DMtv8',  #08. I often don't read because reading is not seen as cool.	
#   'DMtv10', #10. I often don't read because materials are not accessible to me where I live.	
#   'DMtv3'   #03. I often don't read because I find it difficult.	
# )]

#facet 1: circumstances (4 items)
df_obstacles_1 <- df_obstacles[c(
  "DMtv1",  #01. I often don't read because I don't have enough time.
  "DMtv2",  #02. I often don't read because I am feeling too tired.
  "DMtv9",  #09. I often don't read because of the cost.
  "DMtv14"  #14. I read all day at work/school and this prevents me from enjoying reading.
)]

#facet 2: do not enjoy (7 items)
df_obstacles_2 <- df_obstacles[c(
  "DMtv4",  #04. I often don't read because I find it boring.
  "DMtv11", #11. I often don't read because I never got in the habit.
  "DMtv5",  #05. often don't read because I don't have to.
  "DMtv6",  #06. I often don't read because I don't see the point.
  "DMtv7",  #07. I often don't read because I'd rather be doing something else.
  "DMtv13", #13. I often don't read because being asked to analyze books in highschool made reading less pleasurable.
  "DMtv12"  #12. I often don't read because being assigned things to read in highschool ruined it for me.
)]

#facet 3: reading is not socially valued (3 items)
df_obstacles_3 <- df_obstacles[c(
  "DMtv15", #15. I often don't read because my friends don't like to read.
  "DMtv8",  #08. I often don't read because reading is not seen as cool.
  "DMtv10"  #10. I often don't read because materials are not accessible to me where I live.
)]

#drop the item that belongs to no facet from the full obstacles frame
df_obstacles <- dplyr::select(df_obstacles, -c(
  DMtv3 #03. I often don't read because I find it difficult.
))

################################
#Factor 3: Reading Attitudes (n=6) ... 
################################

# #facet 1: desirable (n=3)
# df_attitudes_1 <- df_attitudes[, c(
#   'Att3_R',  #03. I would like to spend less time reading.	
#   'Att2',  #02. I would like to spend more time reading.
#   'Att8'   #08. It is important to be well-read
# )]
# 
# #facet 2: identity (n=3)
# df_attitudes_2 <- df_attitudes[, c(
#   'Att7',  #07. I feel that what I read is a good reflection of who I am as a person.	
#   'Att5',  #05. You can tell a lot about a person by what s/he reads.	
#   'Att1'   #01. I think of myself as a 'reader'.	
# )]
# 
# #REMOVE
# df_attitudes <- subset(df_attitudes, select=-c(
#   Att4,  #04. I sometimes feel embarrassed about what I choose to read.	
#   Att6,  #06. I don't understand why some people read certain genres (e.g., sci-fi, romance, history).	
#   Att6_R, #excluded
#   Att3 #reverse scored
# ))

#facet 1: important identity (6 items)
df_attitudes_1 <- df_attitudes[c(
  "Att2",   #02. I would like to spend more time reading.
  "Att3_R", #03. I would like to spend less time reading.
  "Att8",   #08. It is important to be well-read
  "Att5",   #05. You can tell a lot about a person by what s/he reads.
  "Att1",   #01. I think of myself as a 'reader'.
  "Att7"    #07. I feel that what I read is a good reflection of who I am as a person.
)]

#drop the columns that are not part of the facet from the full attitudes frame
df_attitudes <- dplyr::select(df_attitudes, -c(
  Att4,   #04. I sometimes feel embarrassed about what I choose to read.
  Att6,   #06. I don't understand why some people read certain genres (e.g., sci-fi, romance, history).
  Att6_R, #excluded
  Att3    #raw version; the reverse-scored Att3_R is the one used in facet 1
))

################################
#Factor 4: Reading Styles (n=7)
################################

# #facet 1: surface (n=3)
# df_behaviours_1 <- df_behaviours[, c(
#   'Bhv3_R', #03. I often skim things that I read online to get the gist of them.	
#   'Bhv4_R', #04. I often skim things that I read in hard copy (i.e., offline) to get the gist of them.	
#   'Bhv5_R'  #05. I can easily stop reading something if I don't find it worth my while.
# )]
# 
# #facet 2: compulsive (n=3)
# df_behaviours_2 <- df_behaviours[, c(
#   'Bhv7', #07. I feel compelled to read things that are in the same series as something that I've enjoyed.	
#   'Bhv8', #08. I often think about what I have been reading even when I'm not actively reading it.
#   'Bhv6' #06. I feel compelled to finish everything that I begin reading.	
# )]
# 
# #facet 3: prolific (n=2)
# df_behaviours_3 <- df_behaviours[, c(
#   'Bhv2', #02. I like to have more than one thing to read "on the go" at a time.	
#   'Bhv1'  #01. I enjoy re-reading things that I've read before.	
# )]
# 
# df_behaviours <- subset(df_behaviours, select=-c(
#   Bhv3, #excluded 
#   Bhv4, #excluded 
#   Bhv5  #excluded 
# ))

#facet 1: surface (3 items)
df_behaviours_1 <- df_behaviours[c(
  "Bhv3_R", #03. I often skim things that I read online to get the gist of them.
  "Bhv4_R", #04. I often skim things that I read in hard copy (i.e., offline) to get the gist of them.
  "Bhv5_R"  #05. I can easily stop reading something if I don't find it worth my while.
)]

#facet 2: compulsive (4 items)
df_behaviours_2 <- df_behaviours[c(
  "Bhv7", #07. I feel compelled to read things that are in the same series as something that I've enjoyed.
  "Bhv6", #06. I feel compelled to finish everything that I begin reading.
  "Bhv8", #08. I often think about what I have been reading even when I'm not actively reading it.
  "Bhv1"  #01. I enjoy re-reading things that I've read before.
)]

#drop the columns that are not part of any facet from the full behaviours frame
df_behaviours <- dplyr::select(df_behaviours, -c(
  Bhv2, #02. I like to have more than one thing to read "on the go" at a time.
  Bhv3, #excluded
  Bhv4, #excluded
  Bhv5  #excluded
))

```


```{r write out, results='hide'}

#write each reading-scale and ART data frame to a CSV stamped with today's
#date, without row names.
#paste0() has no `sep` argument (anything named sep is simply pasted on as one
#more string), so it is not passed here; the resulting paths are unchanged.

#write out reading dfs
write.csv(df_analogies, paste0("../data/out/reading/df_analogies_", Sys.Date(), ".csv"), row.names = FALSE)
write.csv(df_sentences, paste0("../data/out/reading/df_sentences_", Sys.Date(), ".csv"), row.names = FALSE)
write.csv(df_synonyms, paste0("../data/out/reading/df_synonyms_", Sys.Date(), ".csv"), row.names = FALSE)

#write out ART dfs
write.csv(df_fict, paste0("../data/out/ART/df_fict_", Sys.Date(), ".csv"), row.names = FALSE)
write.csv(df_nonf, paste0("../data/out/ART/df_nonf_", Sys.Date(), ".csv"), row.names = FALSE)
write.csv(df_foil, paste0("../data/out/ART/df_foil_", Sys.Date(), ".csv"), row.names = FALSE)

#write out PLoR scale dfs

#collect every data frame currently in the environment and show their names
dfs <- Filter(function(obj) is(obj, "data.frame"), mget(ls()))
names(dfs)

#named list of the PLoR data frames to export (names become part of the file names)
list_of_dfs <- mget(c(
  "df_motivations",
  "df_motivations_1",
  "df_motivations_2",
  "df_motivations_3",
  "df_obstacles",
  "df_obstacles_1",
  "df_obstacles_2",
  "df_obstacles_3",
  "df_attitudes",
  "df_attitudes_1",
  #"df_attitudes_2",   #not part of new EFA results
  "df_behaviours",
  "df_behaviours_1",
  "df_behaviours_2"
  #"df_behaviours_3"   #not part of new EFA results
))

#write one data frame to ../data/out/PLoR/PLoR_<names>.csv
#data:  data frame to write
#names: label inserted into the file name
output_csv <- function(data, names){
  out_file <- file.path("../data/out/PLoR", paste0("PLoR_", names, ".csv"))
  write_csv(data, out_file)
}

#write every PLoR data frame to its own CSV using purrr
#iwalk() calls output_csv(<data frame>, <its name in the list>) once per
#element and is meant for side effects: unlike pmap() it does not collect the
#written data frames into a result list
purrr::iwalk(list_of_dfs, output_csv)

```