-
Notifications
You must be signed in to change notification settings - Fork 0
/
incumbent_effect.r
181 lines (147 loc) · 7.23 KB
/
incumbent_effect.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
# National presidential margins. They are subtracted from the district margins
# below, so they are on the same Democrat-minus-Republican scale.
OBAMA_MARGIN_2008 <- 7.2
KERRY_MARGIN_2004 <- -2.4

# Partisan Voter Index (PVI) of each congressional district.
# Source: Daily Kos.
# The CSV contains the 2000, 2004 and 2008 presidential results by
# congressional district. Only 2004 and 2008 are used here; 2000-2004 could be
# computed the same way if need be.
PVI_04_08 <- read.csv("president_by_CD_2000_2008.csv") %>%
  mutate(
    # District margin in each election.
    diff_08 = Obama - McCain,
    diff_04 = Kerry - Bush..04,
    # District margin relative to the national margin.
    PVI_08 = diff_08 - OBAMA_MARGIN_2008,
    PVI_04 = diff_04 - KERRY_MARGIN_2004,
    # PVI is the average of the two relative margins.
    PVI = (PVI_08 + PVI_04) / 2
  ) %>%
  rename(D = CD) %>%
  mutate(
    # Two-character district code; at-large ("AL") districts become "00".
    D = str_pad(as.character(D), 2, side = "left", "0"),
    D = ifelse(D == "AL", "00", D),
    # Join key of the form "<State>-<district>".
    CD = paste(State, D, sep = "-")
  )
# National presidential margins used as the baseline for the 2008/2012 PVI.
# OBAMA_MARGIN_2008 is re-assigned here with the same value as before.
OBAMA_MARGIN_2008 <- 7.2
OBAMA_MARGIN_2012 <- 3.9

# Same PVI calculation as for 2004/2008, but from the 2008 and 2012
# presidential results tabulated by 2012 district lines.
PVI_08_12 <- read.csv("2008_2012_pres_by_2012_districts.csv") %>%
  mutate(
    # District margin in each election.
    diff_2012 = Obama.2012 - Romney.2012,
    diff_2008 = Obama.2008 - McCain.2008,
    # District margin relative to the national margin.
    PVI_2008 = diff_2008 - OBAMA_MARGIN_2008,
    PVI_2012 = diff_2012 - OBAMA_MARGIN_2012,
    # PVI is the average of the two relative margins.
    PVI = (PVI_2008 + PVI_2012) / 2
  ) %>%
  # Split "<ST>-<district>", recode at-large ("AL") districts as "00", and
  # glue the key back together.
  separate(CD, into = c("ST", "D"), sep = "-") %>%
  mutate(D = ifelse(D == "AL", "00", D)) %>%
  unite(CD, ST, D, sep = "-")
# Lookup from election year to the PVI table used for that cycle. The year is
# used as a numeric list index, so every other position up to 2014 is NULL.
PVI_list <- vector("list", 2014)
PVI_list[c(2004, 2006, 2008, 2010)] <- list(PVI_04_08)
PVI_list[c(2012, 2014)] <- list(PVI_08_12)
# Generic ballot figure for each election year, keyed by the year as a string.
# It is later added to the district PVI, so presumably it is a
# Democrat-minus-Republican margin -- confirm against the source.
# Source: RCP
generic_ballots <- list(
  "2014" = -5.7,
  "2012" = 1.2,
  "2010" = -6.8,
  "2008" = 10.7,
  "2006" = 7.9,
  "2004" = -2.6
)
# Lookup table pairing each state's postal abbreviation with its full name,
# built from R's built-in `state.abb` and `state.name` vectors.
# Both columns are plain character vectors (stringsAsFactors = FALSE keeps
# that true on R < 4.0 as well), so they can be used directly as join keys.
state_abbreviations <- data.frame(
  state.abb = state.abb,
  state.name = state.name,
  stringsAsFactors = FALSE
)
# Some state Democratic parties are not labelled "DEM", so candidates are
# classified as Democrats by whether their party label contains a "D".
# These labels contain a "D" too and are dropped before that test; it is the
# easiest way to get down to just REP/DEM.
non_dem_D <- c(
  "DCG", "IDA", "IDE",
  "IDP", "IND", "PPD"
)
# Names of the per-year congressional election result files (CSV, one per
# election year back to 2004) found in the past_elections/ directory.
# Source: FEC
races_04_08 <- list.files("past_elections/")
# Open each year of races, clean it up, and store one tidy data frame per file.
# Each result has one row per district with the actual Democrat-minus-
# Republican margin (`diff`), the incumbent's party (`I.P.`), the district PVI
# and the margin expected from PVI plus that year's generic ballot.
race_df_list <- list()
for (csv in races_04_08) {
  # The first four characters of the file name are the election year, which
  # selects the PVI table and the generic ballot figure for that cycle.
  year <- substr(csv, 1, 4)
  PVI_df <- PVI_list[[as.numeric(year)]]
  GENERIC_BALLOT <- generic_ballots[[year]]

  races_df <- read.csv(paste0("past_elections/", csv)) %>%
    # Keep only rows that have a general-election entry.
    filter(GENERAL != "") %>%
    # Strip a literal asterisk from the party label. fixed = TRUE is required:
    # as a regular expression a bare "*" is a quantifier, not an asterisk, so
    # without it marked labels are not cleaned and slip past the exclusion
    # list below.
    mutate(PARTY = sub("*", "", PARTY, fixed = TRUE)) %>%
    filter(!(PARTY %in% non_dem_D)) %>%
    # Any remaining label containing "D" counts as a Democrat; every other
    # label is treated as a Republican.
    mutate(PARTY = ifelse(grepl("D", PARTY), "DEM", "REP")) %>%
    filter(PARTY %in% c("DEM", "REP")) %>%
    # Turn the "NN%" strings into numbers; anything unparseable becomes 0.
    mutate(GENERAL.. = as.numeric(sub("%", "", GENERAL..))) %>%
    mutate(GENERAL.. = ifelse(is.na(GENERAL..), 0, GENERAL..)) %>%
    filter(GENERAL != "Unopposed") %>%
    # Keep rows whose TOTAL.VOTES field is empty and that carry an FEC id.
    filter(TOTAL.VOTES == "") %>%
    filter(FEC.ID. != "n/a") %>%
    rename(X.I. = INCUMBENT.INDICATOR..I.) %>%
    rename(D = DISTRICT) %>%
    # District "S" rows are dropped (presumably Senate races -- confirm).
    filter(D != "S") %>%
    mutate(D = str_pad(as.character(D), 2, side = "left", "0")) %>%
    select(D, STATE, STATE.ABBREVIATION, PARTY, GENERAL.., X.I.) %>%
    mutate(key = paste(D, STATE.ABBREVIATION, PARTY, sep = "-"))

  # Party of the incumbent in each district, taken from candidates flagged
  # "(I)" that received a non-zero general-election share.
  incumbent_party <- races_df %>%
    filter(GENERAL.. > 0) %>%
    group_by(D, STATE) %>%
    mutate(I.P. = ifelse(X.I. == "(I)", PARTY, "")) %>%
    filter(I.P. != "") %>%
    select(I.P., D, STATE)

  # Some races have multiple incumbents; their incumbent party is blanked out.
  # It happens when districts merge, and is pretty rare.
  # NOTE(review): the NA set here is relabelled "OPN" further down, so these
  # races are pooled with open seats rather than dropped -- confirm intended.
  multiple_i <- incumbent_party %>%
    group_by(D, STATE) %>%
    summarize(n = n()) %>%
    filter(n > 1) %>%
    mutate(key = paste(D, STATE, sep = "-"))
  incumbent_party <- mutate(
    incumbent_party,
    I.P. = ifelse(paste(D, STATE, sep = "-") %in% multiple_i$key, NA, I.P.)
  )

  # Margin (DEM - REP) of each district, using each party's best share.
  races_diff <- races_df %>%
    group_by(D, STATE, PARTY) %>%
    summarise(general = max(GENERAL..)) %>%
    spread(PARTY, general) %>%
    # Both parties must be on the ballot.
    filter(!is.na(DEM)) %>%
    filter(!is.na(REP)) %>%
    mutate(diff = DEM - REP) %>%
    # Margins of 95 points or more are treated as effectively unopposed;
    # this filters out most of them.
    filter(abs(diff) < 95) %>%
    ungroup() %>%
    left_join(incumbent_party, by = c("D" = "D", "STATE" = "STATE"))

  # Attach the postal abbreviation, build the "<ST>-<district>" key and look
  # up the district PVI. unique() removes rows duplicated by the joins.
  races_diff <- races_diff %>%
    inner_join(state_abbreviations, by = c("STATE" = "state.name")) %>%
    mutate(CD = paste(state.abb, D, sep = "-")) %>%
    mutate(CD = sub(" ", "", CD)) %>%
    left_join(select(PVI_df, PVI, CD), by = "CD") %>%
    unique()

  race_PVI <- races_diff %>%
    select(CD, I.P., diff, PVI) %>%
    mutate(year = substr(csv, 1, 4)) %>%
    # Districts without an identified incumbent are labelled open ("OPN").
    mutate(I.P. = ifelse(is.na(I.P.), "OPN", I.P.)) %>%
    mutate(PVI_expect = PVI + GENERIC_BALLOT)

  # For each year this holds the actual margin and the margin expected from
  # PVI + generic ballot.
  race_df_list[[csv]] <- race_PVI
}
# Stack every year's races into a single data frame, add the miss between the
# actual margin and the PVI-based expectation, and keep only races where both
# the actual and the expected margin are under 20 points in absolute value.
combined_2004_2014 <- do.call(rbind, race_df_list) %>%
  mutate(PVI_miss = diff - PVI_expect) %>%
  filter(
    abs(diff) < 20,
    abs(PVI_expect) < 20
  )
# How many incumbents would win on PVI + generic ballot alone.
# The expected margin is Democrat minus Republican, so a Republican incumbent
# "wins" when it is negative and a Democratic incumbent when it is positive.
# Open seats are excluded. The result is a count of races per `inc_win` value.
winners_by_incumbent <- combined_2004_2014 %>%
  group_by(CD, I.P., year) %>%
  summarize(margin = max(PVI_expect)) %>%
  filter(I.P. != "OPN") %>%
  mutate(inc_win = ifelse(I.P. == "REP", margin < 0, margin > 0)) %>%
  group_by(inc_win) %>%
  summarize(n = n())
# Scatter plot of the expected margin (PVI + generic ballot) against the
# actual margin, coloured by race type (open seat, Democratic incumbent,
# Republican incumbent), with one linear fit per type.
# ggplot(), aes(), geom_point() and stat_smooth() come from ggplot2, which
# must be attached with the other libraries at the top of the script.
p <- ggplot(
  combined_2004_2014,
  aes(x = PVI_expect, y = diff, color = I.P.)
) +
  geom_point() +
  stat_smooth(method = "lm")
p
## Below are the linear models for PVI_expect against actual margin
## Ideally, the models would have good R^2 and I could just plug in the 2016 PVI_expect & incumbent
## The R^2 are low, however, below 0.33 for all of them.
## I think a sample size going back further would help this
## Generic ballot polling error has varied a lot, and years in which all Dems or all Reps outperform have happened and will reduce significance
## In the end, I just used these data for the intercepts, to figure out the bump given to an incumbent
## Hopefully I'll go back and find a more empirically derived formula at some point.
## Had I started this more than 2 weeks before the election, it wouldn't be so sloppy...
# Actual margin regressed on the expected margin, all races pooled.
pvi_expect_fit <- lm(
  formula = diff ~ PVI_expect,
  data = combined_2004_2014
)
summary(pvi_expect_fit)

# The same regression restricted to open seats.
pvi_effect_open <- lm(
  formula = diff ~ PVI_expect,
  data = filter(combined_2004_2014, I.P. == "OPN")
)
summary(pvi_effect_open)

# The same regression restricted to Democratic incumbents.
pvi_effect_dem <- lm(
  formula = diff ~ PVI_expect,
  data = filter(combined_2004_2014, I.P. == "DEM")
)
summary(pvi_effect_dem)

# The same regression restricted to Republican incumbents.
pvi_effect_rep <- lm(
  formula = diff ~ PVI_expect,
  data = filter(combined_2004_2014, I.P. == "REP")
)
summary(pvi_effect_rep)
# Median miss (actual margin minus expected margin) per year and race type.
combined_2004_2014 %>%
  group_by(year, I.P.) %>%
  summarize(med = median(PVI_miss, na.rm = TRUE))

# Median, standard deviation and number of races per race type, all years
# pooled.
combined_2004_2014 %>%
  group_by(I.P.) %>%
  summarize(
    med = median(PVI_miss, na.rm = TRUE),
    sd = sd(PVI_miss, na.rm = TRUE),
    n = n()
  )
# Weighted mean of two standard deviations, weighted by group size.
# NOTE(review): the values are hard-coded; presumably they are the `sd` and
# `n` of the two incumbent groups from the per-I.P. summary -- confirm, and
# refresh them if the input data changes.
wt_inc_sd_mean <- weighted.mean(
  x = c(15.80872, 11.86640),
  w = c(952, 993)
)