.Rhistory

"total_enrollment_hisp_tot",
"total_enrollment_white_tot",
"total_enrollment_asian_tot",
"total_enrollment_amin_tot",
"total_enrollment_multi_tot",
"total_enrollment_unkn_tot",
"total_enrollment_nonres_tot",
"total_enrollment",
"grant01",
"state03",
"total_undergraduates",
"nettuition01",
"tot_rev_w_auxother_sum",
"depreciation01"
)
# Columns used as denominators when rescaling other variables:
# fte_count plus the cpi / hepi / heca "<index>_scalar_2015" columns.
var_normalizers <- c("fte_count", paste0(c("cpi", "hepi", "heca"), "_scalar_2015"))
# Old variable lists
#var_expend <- c(
#         "instruction01",
#         "instruction02",
#         "studserv01",
#         "tot_rev_w_auxother_sum",
#         "grant01",
#         "any_aid_pct",
#         "total01",
#         "instruction_share",
#         "education_share",
#         "noneducation_share",
#         "studserv_share",
#         "admin_share",
#         "research_share",
#         "pubserv_share",
#         "depreciation01")
#var_revenue <-c("tuition03",
#                "tuition_reliance_a1",
#                "govt_reliance_a",
#                "nettuition_share",
#                "tot_rev_w_auxother_sum")
#var_outcome <- c("fte_count",
#                 "fte12mn",
#                 "grad_rate_150_n",
#                 "grad_rate_150_p",
#                 "grad_rate_150_n4yr",
#                 "grad_rate_150_p4yr",
#                 "ftretention_rate",
#                 "ptretention_rate",
#                 "grad_rate_adj_cohort_n",
#                 "grad_rate_adj_cohort_n4yr",
#                 "bachelordegrees",
#                 "masterdegrees",
#                 "doctordegrees",
#                 "firstprofdegrees")
#var_misc <- c("year_total_undergrad",
#              "total_undergraduates",
#              "total_enrollment_multi_tot")
# Check missing rates
Yearly_Missingness(data, var_independent) # looks like number of faculty is going to be unusable - too many missing values
# Remove the three high-missingness variables by name. Filtering with %in%
# (instead of negative indexing on match()) stays correct when a name is
# absent: match() returns NA for it, and an NA inside a negative subscript
# either errors or returns NA rather than dropping the intended elements.
var_independent <- var_independent[!(var_independent %in% c("ftall1", "ptall1", "ptall2"))]
Yearly_Missingness(data, var_outcomes) # Look pretty good!
Yearly_Missingness(data, var_covariates) # Look pretty good!
Yearly_Missingness(data, var_normalizers) # Look pretty good!
# Check summaries
apply(data[var_independent],MARGIN =2,FUN = summary)
apply(data[var_outcomes],MARGIN =2,FUN = summary) # Some very low and very high values - may want to think about dropping those schools
apply(data[var_covariates],MARGIN =2,FUN = summary) # some schools with 0 undergraduates - will want to drop those
apply(data[var_normalizers],MARGIN =2,FUN = summary) # school with 1 FTE - will want to drop those
### Test student service expenditures
# (instruction01 + studserv01) as a fraction of tot_rev_w_auxother_sum
pct_education_rev <- (data$instruction01 + data$studserv01)/data$tot_rev_w_auxother_sum
# Summarize BEFORE sorting: sort() drops NAs by default, so summarizing the
# sorted vector would hide how many values are missing. This matches the
# order used for pct_education_exp in the "Test total expenditures" check.
summary(pct_education_rev)
pct_education_rev <- sort(pct_education_rev)
# Largest 100 ratios, to eyeball implausibly high values
tail(pct_education_rev, 100)
rm(pct_education_rev)
### Total enrollment
summary(data$total_enrollment)
summary(data$total_enrollment_amin_tot)
summary(data$total_enrollment_asian_tot)
summary(data$total_enrollment_black_tot)
summary(data$total_enrollment_hisp_tot)
summary(data$total_enrollment_white_tot)
summary(data$total_enrollment_multi_tot) # A ton of missing
summary(data$total_enrollment_unkn_tot)
summary(data$total_enrollment_nonres_tot)
### Test total expenditures
pct_education_exp <- (data$instruction01 + data$studserv01)/data$total01
summary(pct_education_exp)
pct_education_exp <- sort(pct_education_exp)
tail(pct_education_exp) # a few very high numbers, but looks good otherwise
rm(pct_education_exp)
### ACT and SAT scores
names(data)
test_scores <- names(data)[c(318:331)]
Yearly_Missingness(data, test_scores) # too high - probably unusuable
rm(test_scores)
### CONCLUSIONS
# fte_count and fte12mn are similar - use fte_count = fall enrollment.
# Full-time / Part-time employees too high a missing rate - can't use
# sufficient race data to include as covariate - i.e. under represented minorities black & hispanic
# post 2003 have low levels of missingness in all covariates and possible outcomes - restrict to post 2003
# grad rate variables typically have high missingness ~10% so may not be the best variable.
# ACT/SAT have high missingness rates so can't use
##### MAKE FINAL SAMPLE
### Define ID variables we're going to use
var_ids <- c(
"groupid",
"unitid",
"instname",
"academicyear",
"state",
"census_division"
)
### Define other variables we may  be interested in
var_other <- c(
"associatedegrees",
"masterdegrees",
"doctordegrees",
"firstprofdegrees",
"totaldegrees",
"any_aid_pct",
"instruction_share",
"education_share",
"noneducation_share",
"studserv_share",
"admin_share",
"research_share",
"pubserv_share"
)
### Create dataset with just the variables we need
sample <- data[ ,c(
var_ids,
var_independent,
var_covariates,
var_normalizers,
var_outcomes,
var_other
)]
### Keep only observations starting in 2003
sample <- sample[sample$academicyear>=2003, ]
# Check how many schools we start with in this time range
num_schools <- length(unique(sample$groupid))
num_schools # 553 universities
### Only keep schools from the 50 states plus DC
# Define states in sample
states <- c( "AK", "AL", "AR", "AZ", "CA",
"CO", "CT", "DC", "DE", "FL",
"GA", "HI", "IA", "ID", "IL",
"IN", "KS", "KY", "LA", "MA",
"MD", "ME", "MI", "MN", "MO",
"MS", "MT", "NC", "ND", "NE",
"NH", "NJ", "NM", "NV", "NY",
"OH", "OK", "OR", "PA", "RI",
"SC", "SD", "TN", "TX", "UT",
"VA", "VT", "WA", "WI", "WV", "WY")
Data_Table(sample, "state")
length(unique(sample$state))
# Only keep the states listed above
sample <- sample[sample$state %in% states, ]
length(unique(sample$state))==length(states) # the same - good
# Keep track of how many schools lost / still in sample
num_schools - length(unique(sample$groupid)) # 9 schools dropped
num_schools <- length(unique(sample$groupid)) # replace number of schools we have
num_schools # 544
### Drop schools that didn't exist for the entirety of the time range
# Get total number of years for each school
total_yrs <- length(unique(sample$academicyear))
yrs_existed <- data.frame(tapply(sample$academicyear, sample$groupid, function(x){length(x)}))
names(yrs_existed) <- "yrs_existed"
yrs_existed$groupid <- as.numeric(rownames(yrs_existed))
Data_Table(yrs_existed, "yrs_existed") # Will keep large majority of the dataset - 505 schools kept
# Drop schools that have less than the total years
sample <- merge(sample, yrs_existed, all=T)
sample <- sample[sample$yrs_existed==total_yrs, ]
rm(yrs_existed)
# Keep track of how many schools lost / still in sample
num_schools - length(unique(sample$groupid)) # 39 schools dropped
num_schools <- length(unique(sample$groupid)) # replace number of schools we have
num_schools # 505
# Remove yrs_existed from the dataset
sample <- Drop_Var(sample, "yrs_existed")
rm(total_yrs)
### Drop schools that are missing bachelor's degrees or graduation rates, or gave out 0 bachelor's degrees
# Check to see if graduation rate variables are ever part-missing
sum(1 - (is.na(sample$grad_rate_150_n4yr)==is.na(sample$grad_rate_150_p4yr) & is.na(sample$grad_rate_150_n4yr)==is.na(sample$grad_rate_adj_cohort_n4yr)))# 6
sample$tag[(is.na(sample$grad_rate_150_n4yr)==is.na(sample$grad_rate_150_p4yr) & is.na(sample$grad_rate_150_n4yr)==is.na(sample$grad_rate_adj_cohort_n4yr))==FALSE] <- 1
View(sample[sample$tag==1 & !is.na(sample$tag), var_outcomes]) # will just have to drop this university - no way to recover the other two missing variables
sample <- Drop_Var(sample, "tag")
# Tag schools that are missing bachelor's degrees or graduation rates in a year
sample$temp <- 0
sample$temp[is.na(sample$grad_rate_150_p4yr)] <- 1
sample$temp[is.na(sample$bachelordegrees)] <- 1
Data_Table(sample, "temp")
to_drop <- data.frame(tapply(sample$temp, sample$groupid, function(x){max(x)}))
names(to_drop) <- "to_drop"
to_drop$groupid <- as.numeric(rownames(to_drop))
Data_Table(to_drop, "to_drop") # will lose 48 schools
# Drop schools
sample <- merge(sample, to_drop, all=T)
sample <- sample[sample$to_drop!=1, ]
# Keep track of how many schools lost / still in sample
num_schools - length(unique(sample$groupid)) # 48 schools dropped
num_schools <- length(unique(sample$groupid)) # replace number of schools we have
num_schools # 457
# Remove tags from the dataset
sample <- Drop_Var(sample, c("temp", "to_drop"))
rm(to_drop)
### Drop schools that gave out 0 bachelor's degrees in a year or had fewer bachelor's degrees awarded than associate's degrees
# Tag entries where this is the case
sample$temp <- 0
sample$temp[!is.na(sample$bachelordegrees) & sample$bachelordegrees==0] <- 1
sample$temp[!is.na(sample$bachelordegrees) & !is.na(sample$associatedegrees) & sample$bachelordegrees<sample$associatedegrees] <- 1
Data_Table(sample, "temp")
View(sample[sample$temp==1, c("bachelordegrees", "associatedegrees")])
to_drop <- data.frame(tapply(sample$temp, sample$groupid, function(x){max(x)}))
names(to_drop) <- "to_drop"
to_drop$groupid <- as.numeric(rownames(to_drop))
Data_Table(to_drop, "to_drop") # will lose 12 schools
# Drop schools
sample <- merge(sample, to_drop, all=T)
sample <- sample[sample$to_drop!=1, ]
# Keep track of how many schools lost / still in sample
num_schools - length(unique(sample$groupid)) # 12 schools dropped
num_schools <- length(unique(sample$groupid)) # replace number of schools we have
num_schools # 445
# Remove tags from the dataset
sample <- Drop_Var(sample, c("temp", "to_drop"))
rm(to_drop)
### Remove schools that had too high or too low bachelor's degrees awarded
summary(sample$bachelordegrees)
hist(sample$bachelordegrees, breaks=100)
# Some huge values
quantile(sample$bachelordegrees, seq(0, 1, length=201)) # Check top 2.5% - 97.5th percentile is 8285
# Tag schools that have bachelor's degrees awarded in top 2.5% of values
sample$temp<- 0
sample$temp[sample$bachelordegrees>=8285.00] <- 1
Data_Table(sample, "temp")
to_check <- data.frame(tapply(sample$temp, sample$groupid, function(x){max(x)}))
names(to_check) <- "to_check"
to_check$groupid <- as.numeric(rownames(to_check))
Data_Table(to_check, "to_check") # 17 schools
# Investigate
sample <- merge(sample, to_check, all=T)
View(sample[sample$to_check==1, c("groupid", "instname", "academicyear", "bachelordegrees", "total_undergraduates")])
# Nothing implausible actually (in relation to total undergraduates on campus) - just keep them
# Remove tags from the dataset
sample <- Drop_Var(sample, c("temp", "to_check"))
rm(to_check)
### Check missing rates of all the variables now
Yearly_Missingness(sample, var_independent)
Yearly_Missingness(sample, var_outcomes)
Yearly_Missingness(sample, var_covariates)
Yearly_Missingness(sample, var_normalizers)
### Drop schools missing covariates / independent variables for the whole time
# Find groups whose values for a variable are missing in at least `n_years` rows.
#
# Args:
#   varlist: character vector of column names in `data` to check.
#   data:    data frame containing a `groupid` column and every column named
#            in `varlist`.
#   n_years: minimum number of NA rows within a group for the group to be
#            reported. Defaults to 13, the number of years in the panel this
#            was written for, so existing two-argument calls behave as before.
#
# Returns:
#   Character vector of groupid values (as strings). There is one entry per
#   (variable, group) hit, in `varlist` order, so a group missing several
#   variables appears several times; wrap in unique() for distinct groups.
#   Zero-length when nothing is flagged.
missing_var <- function(varlist, data, n_years = 13) {
  flagged <- lapply(varlist, function(var) {
    # Number of NA entries of `var` within each group
    na_counts <- tapply(data[[var]], data$groupid, function(x) sum(is.na(x)))
    # which() skips NA counts (e.g. unused factor levels of groupid) so they
    # cannot leak into the result as NA names
    names(na_counts)[which(na_counts >= n_years)]
  })
  unlist(flagged, use.names = FALSE)
}
missing_var(c(var_independent,var_outcomes,var_covariates,var_normalizers),sample)
missing_var(var_independent,sample)
missing_var(var_outcomes,sample)
missing_var(var_covariates,sample) #all missingness for every year contained in these variables.
missing_var(var_normalizers,sample)
length(unique(missing_var(c(var_independent,var_outcomes,var_covariates,var_normalizers),sample)))
#4 schools missing some covariates in every year - drop these schools.
# Schools flagged by missing_var() for any of the analysis variables
ids_to_remove <- missing_var(c(var_independent, var_outcomes, var_covariates, var_normalizers), sample)
# Keep only rows whose school is not in the flagged set
sample <- sample[!(sample$groupid %in% ids_to_remove), ]
rm(ids_to_remove)
# Keep track of how many schools were lost / are still in the sample
num_schools - length(unique(sample$groupid)) # 4 schools dropped
num_schools <- length(unique(sample$groupid)) # replace number of schools we have
num_schools # recorded as 445 - NOTE(review): 445 before this step minus 4 dropped would be 441; confirm
### Impute Values where there are very few missing values
# NEED TO ADD CODE HERE
##### MAKE/GET VARIABLES WE NEED FOR ANALYSIS
### Merge with unemployment and GDP
# Load gdp and unemployment datasets
load("data/Working Data/GDP Per Capita 2003-2015.Rda")
load("data/Working Data/Unemployment 2003-2015.Rda")
# Merge
# Use left joins (all.x = TRUE): every row of `sample` is kept and nothing is
# added. A full outer join (all = TRUE) also appends rows for key combinations
# that exist only in the lookup table, which shows up as rows with NA groupid.
# The join keys are still the columns the two tables have in common.
sample <- merge(sample, gdp_data, all.x = TRUE)
sum(is.na(sample$gdp_per_capita)) # 0
sample <- merge(sample, unemployment_data, all.x = TRUE)
sum(is.na(sample$unemployment)) # 0
### Generate race % variables
# pct_<group> = 100 * total_enrollment_<group>_tot / total_enrollment,
# added in the order black, hisp, white, asian
sample[paste0("pct_", c("black", "hisp", "white", "asian"))] <- lapply(
  c("black", "hisp", "white", "asian"),
  function(grp) {
    100 * sample[[paste0("total_enrollment_", grp, "_tot")]] / sample$total_enrollment
  }
)
# Whatever is left after the four groups above
sample[["pct_other"]] <- 100 - with(sample, pct_black + pct_hisp + pct_white + pct_asian)
# Combined shares: black + hisp, and white + asian
sample[["pct_urm"]] <- with(sample, pct_black + pct_hisp)
sample[["pct_nonurm"]] <- with(sample, pct_white + pct_asian)
### Create a variable that's the total spent on education
sample$education_ttl <- sample$instruction01 + sample$studserv01
### Transform variables into 2015 dollars and/or per FTE
# Rescale columns to 2015 dollars using one price index.
#
# Args:
#   data:    data frame holding the columns in `varlist` and a column named
#            "<index>_scalar_2015".
#   varlist: character vector of column names to rescale.
#   index:   index name, e.g. "cpi"; selects the scalar column and the suffix
#            of the new columns.
#
# Returns:
#   `data` with one added (or overwritten) column "<var>_<index>" per
#   variable, equal to the variable divided by the scalar column:
#   (current yr $) / (current yr $ / 2015 $) = (2015 $).
inflation_transform <- function(data, varlist, index) {
  scalar_col <- paste0(index, "_scalar_2015")
  for (src_col in varlist) {
    out_col <- paste0(src_col, "_", index)
    data[, out_col] <- data[, src_col] / data[, scalar_col]
  }
  data
}
# Rescale columns to per-FTE values.
#
# Args:
#   data:    data frame holding the columns in `varlist` and an `fte_count`
#            column.
#   varlist: character vector of column names to rescale.
#
# Returns:
#   `data` with one added (or overwritten) column "<var>_fte" per variable,
#   equal to the variable divided by `fte_count`.
fte_transform <- function(data, varlist) {
  for (src_col in varlist) {
    data[, paste0(src_col, "_fte")] <- data[, src_col] / data$fte_count
  }
  data
}
# rescale to 2015 dollars (cpi)
var_cpi_only <- c(
"gdp_per_capita"
)
sample <- inflation_transform(sample, var_cpi_only, "cpi")
# rescale to FTE
var_fte_only <- c(
"bachelordegrees"
)
sample <- fte_transform(sample, var_fte_only)
# Rescale to 2015 dollars key variables (using cpi) AND fte
var_cpi_fte <- c(
"instruction01",
"studserv01",
"education_ttl",
"instruction02",
"grant01",
"state03",
"nettuition01"
)
sample <- inflation_transform(sample, var_cpi_fte, "cpi")
sample <- fte_transform(sample, paste(var_cpi_fte, "_cpi", sep=""))
# Check
names(sample)
sample_allvar <- data[(data$groupid %in% sample$groupid)*(data$academic)]
sample_allvar <- data[(data$groupid %in% sample$groupid)*(data$academic),]
names(sample_allvar)
tmp <- seq(1,10)
tmp
tmp[crep(0,5),rep(1,5))]
tmp[c(rep(0,5),rep(1,5))]
tmp <- c(1,2,3)
tmp[c(TRUE,TRUE,FALSE)]
tmp[c(1,1,0)]
as.logical(c(1,1))
tmp[as.logical(c(1,1))]
tmp[as.logical(c(1,1,0))]
sample_allvar <- data[as.logical((data$groupid %in% sample$groupid)*(data$academic %in% )),]
sample_allvar <- data[as.logical((data$groupid %in% sample$groupid)*
(data$academicyear %in% sample$academicyear )),]
length(unique(sample_allvar$groupid))
length(unique(sample$groupid))
ids <- c("groupid","academicyear")
sample_allvar <- merge(data,sample[,ids],by =ids)
View(sample_allvar)
View(sample_allvar)
help("match")
help(settdiff)
help(setdiff)
setdiff(unique(sample$groupid),unique(sample_allvar$groupid))
setdiff(unique(sample$groupid),unique(sample_allvar$groupid))
unique(sample$groupid)
length(unique(sample_allvar$groupid))
length(unique(sample$groupid))
length(unique(sample$groupid))
debugSource('~/Academics/UCLA_Academics/Classes/Stats_201B/DeltaCostProject_DataAnalysis/munge/03_General_Cleaning.R', echo=TRUE)
length(unique(sample$groupid))
num_schools
length(unique(sample$groupid))
setdiff(unique(sample$groupid),unique(sample_allvar$groupid))
sample_allvar <- merge(data,sample[,ids],by =ids)
ids <- c("groupid","academicyear")
sample_allvar <- merge(data,sample[,ids],by =ids)
unique(sample_allvar$groupid)
setdiff(unique(sample$groupid),unique(sample_allvar$groupid))
unique(sample$groupid)
unique(sample_allvar$groupid)
unique(sample_allvar$groupid)
unique(sample$groupid)
sample[is.na(sample$groupid),]
debugSource('~/Academics/UCLA_Academics/Classes/Stats_201B/DeltaCostProject_DataAnalysis/munge/03_General_Cleaning.R', echo=TRUE)
print(length(unique(sample$groupid)) )
debugSource('~/Academics/UCLA_Academics/Classes/Stats_201B/DeltaCostProject_DataAnalysis/munge/03_General_Cleaning.R', echo=TRUE)
load("data/Working Data/GDP Per Capita 2003-2015.Rda")
load("data/Working Data/Unemployment 2003-2015.Rda")
View(gdp_data)
help9merge
help(merge)
levels(as.factor(gdp_data$state))
levels(as.factor(sample$state))
length(unique(sample$groupid))
sample <- merge(sample, gdp_data, by = c("academicyear","state"))
length(unique(sample$groupid))
sum(is.na(sample$gdp_per_capita)) # 0
rm(list=ls())
debugSource('~/Academics/UCLA_Academics/Classes/Stats_201B/DeltaCostProject_DataAnalysis/munge/03_General_Cleaning.R', echo=TRUE)
ids <- c("groupid","academicyear")
sample_allvar <- merge(data,sample[,ids],by =ids)
length(unique(sample$groupid))
length(unique(sample_allvar$groupid))
source('~/Academics/UCLA_Academics/Classes/Stats_201B/DeltaCostProject_DataAnalysis/munge/03_General_Cleaning.R', echo=TRUE)
require(ProjectTemplate)
cache(sample)
cache(sample_allvar)
help("cache")
cache("sample")
cache("sample_allvar")
obvs <- length(unique(sample$groupid))
obvs
levels(sample$census_division)
levels(as.factor(sample$census_division))
blocks <- as.character(levels(as.factor(sample$census_division)))
is.factor(blocks)
blocks[1]
blocks <- as.numeric(levels(as.factor(sample$census_division)))
blocks[1]
tmp <- c(1,2,3)
tmp1 <- c(3,4,5)
tmp %in% tmp1
i=1
block <- sample$groupid[sample$census_division == i]
1=4
i=4
block <- sample$groupid[sample$census_division == i]
block <- unique(sample$groupid[sample$census_division == i])
n_test <- round(length(unique(block))*0.2)
n_train <- length(unique(block)) - n_test
test <- data.frame(groupid = sample(n_test,block), test = 1)
test <- data.frame(groupid = sample(n_test,block))
help(sample)
test <- data.frame(groupid = sample(block,n_test), test = 1)
tets
test
train <- data.frame(groupid = block[as.logical(1-(test %in% block)*1)] ,test =0)
train
split <- unique(sample$groupid)
split <- data.frame(groupid =unique(sample$groupid))
split <- merge(split,test,by = groupid)
names(split)
names(test)
split <- merge(split,test,by = "groupid")
View(split)
split <- merge(split,train,by = "groupid")
View(split)
test_train <- rbind(test,train)
test_train
split <- data.frame(groupid =unique(sample$groupid),test = 0)
View(split)
merge(split,test_train,by = groupid)
merge(split,test_train,by = "groupid")
help(merge)
test
split
split$test[(splt$groupid %in% test)] <- 1
split$test[(split$groupid %in% test)] <- 1
split$groupid[split$test==1]
split$groupid[(split$test==1)]
summary(split$test)
(split$groupid %in% test)
sum((split$groupid %in% test))
test <- sample(block,n_test)
test
split$groupid
split$groupid %in% test
sum(split$groupid %in% test)
which((split$groupid %in% test)==TRUE)
split$test[which((split$groupid %in% test)==TRUE)] <- 1
sum(split$test)
rm(block)
rm(blocks)
rm(list=ls())
source("munge/01_GDP_Cleaning.R")
source("munge/02_Unemployment_Cleaning.R")
source('~/Academics/UCLA_Academics/Classes/Stats_201B/DeltaCostProject_DataAnalysis/munge/03_General_Cleaning.R', echo=TRUE)
source('~/Academics/UCLA_Academics/Classes/Stats_201B/DeltaCostProject_DataAnalysis/munge/04_TrainingTest_Split.R', echo=TRUE)
split <- data.frame(groupid =unique(sample$groupid),test=1)
blocks <- as.numeric(levels(as.factor(sample$census_division)))
for(i in blocks){
block <- unique(sample$groupid[sample$census_division == i])
n_test <- round(length(unique(block))*0.2)
n_train <- length(unique(block)) - n_test
test <- sample(block,n_test)
split$test[which((split$groupid %in% test)==TRUE)] <- 1
}
split
# Build a school-level test/train flag, drawn separately within each
# census_division value: one row per groupid, test = 0 to start with.
split <- data.frame(groupid =unique(sample$groupid),test=0)
# Distinct census_division values, converted to numeric
blocks <- as.numeric(levels(as.factor(sample$census_division)))
for(i in blocks){
# Schools belonging to this division
block <- unique(sample$groupid[sample$census_division == i])
# 20% of the division's schools (rounded) go to the test set
n_test <- round(length(unique(block))*0.2)
# n_train is computed but not used in the lines visible here
n_train <- length(unique(block)) - n_test
# `sample` is also the name of the data frame; in call position R resolves it
# to the sampling function. No set.seed() call is visible here, so the draw
# presumably differs between runs - confirm before relying on reproducibility.
test <- sample(block,n_test)
# Flag the drawn schools as test = 1
split$test[which((split$groupid %in% test)==TRUE)] <- 1
}
split
# Total number of schools assigned to the test set
sum(split$test)
cache("sample_allvar_train")
(sample$groupid %in% split$groupid[test==0])
source('~/Academics/UCLA_Academics/Classes/Stats_201B/DeltaCostProject_DataAnalysis/munge/04_TrainingTest_Split.R', echo=TRUE)
which((sample$groupid %in% split$groupid[test==0]) == TRUE)
source('~/Academics/UCLA_Academics/Classes/Stats_201B/DeltaCostProject_DataAnalysis/munge/04_TrainingTest_Split.R', echo=TRUE)
levels(as.factor(sample_allvar_test$census_division))
levels(as.factor(sample_allvar_train$census_division))