euro_model.Rmd

---
title: "Euro 2021 Model"
---


```{r data-prep}
library(tidyverse)
library(readxl)
library(cmdstanr)
library(tidybayes)
library(posterior)
library(viridis)
library(pROC)
cmdstanr::register_knitr_engine(override = FALSE)

# Load in data

# Read one match-day workbook, drop rows with any missing value, and add a
# `match` key: the two team names sorted alphabetically and joined with 'v.',
# so the key is the same whichever side a team is listed on.
read_match_day = function(path){
  read_xlsx(path) %>% 
    drop_na() %>% 
    mutate(match = map2_chr(team1, team2, ~paste(sort(c(.x, .y)), collapse = 'v.')))
}

uefa_nations = read_csv('data/uefa_nations_league_results.csv')

match_day_1 = read_match_day('predictions/predictions_day_1.xlsx')
match_day_2 = read_match_day('predictions/predictions_day_2.xlsx')
match_day_3 = read_match_day('predictions/predictions_day_3.xlsx')

# Condition on the qualifying round, the Nations League results and the
# results of match days 1, 2 and 3.
euro_data = bind_rows(
  read_csv('data/qualifying_round_games.csv'),
  uefa_nations,
  match_day_1,
  match_day_2,
  match_day_3
)

# prior_score is the z-score of elo_march_2019. Rows are sorted by team so a
# team's row number can serve as its index in the model.
ranking_data = read_csv('data/rankings.csv') %>% 
               mutate(prior_score = (elo_march_2019 - mean(elo_march_2019))/sd(elo_march_2019)) %>% 
               arrange(team)


# extract data for model
teams = ranking_data$team
nteams = length(teams)
ngames = nrow(euro_data)

# Fail early, and by name, if a game mentions a team that has no ranking row;
# otherwise match() below would quietly produce NA team indices.
unranked_teams = setdiff(c(euro_data$team1, euro_data$team2), teams)
if(length(unranked_teams) > 0){
  stop('Teams in the game data with no row in data/rankings.csv: ',
       paste(unranked_teams, collapse = ', '),
       call. = FALSE)
}

# Teams are passed to the model as positions in `teams`.
team1 = match(euro_data$team1, teams)
team2 = match(euro_data$team2, teams)
score1 = euro_data$score1
score2 = euro_data$score2
# Used for some models, not all
df = 7
b_mean = 0
b_sd = 0.05
prior_score = ranking_data$prior_score

# Store data in a list to pass to Stan
model_data = list(
  nteams = nteams,
  ngames = ngames,
  team1 = team1,
  team2 = team2,
  score1 = score1,
  score2 = score2,
  df = df,
  prior_score = prior_score,
  b_mean = b_mean,
  b_sd = b_sd
)

```


```{r model-fit}
# Build the model object from the Stan file, then sample from it using
# model_data, with a fixed seed and up to four chains running in parallel.
model = cmdstan_model(stan_file = 'models/euro_raw_dif.stan')
fit = model$sample(
  data = model_data,
  seed = 19920908,
  parallel_chains = 4
)
```

```{r}
# Summarise the draws of `a` per index i (mean plus interval via mean_qi) and
# plot the summarised `a` for every team, with teams ordered by that value.
# NOTE(review): bind_cols() pairs rows by position, so this relies on the
# summary rows (one per i) being in the same order as ranking_data's rows.
fit$draws(variables = 'a') %>% 
  as_draws_df() %>% 
  spread_draws(a[i]) %>% 
  mean_qi() %>% 
  bind_cols(ranking_data) %>% 
  arrange(desc(elo_march_2019)) %>% 
  ggplot(mapping = aes(x = a, y = fct_reorder(team, a))) +
  geom_point()
```


```{r predict-funcs}

# Posterior draws used by the prediction helpers below.
# `a` is converted to a draws data frame; `sigma_y` and `df` are kept in the
# format fit$draws() returns by default.
a = as_draws_df(fit$draws(variables = 'a'))
sigma_y = fit$draws(variables = 'sigma_y')
est_df = fit$draws(variables = 'df')

# Simulate the goal difference (teamA minus teamB), one value per posterior draw.
#
# teamA, teamB: team names, looked up in str_to_title(teams).
# do_round:     if TRUE (the default) the simulated differences are rounded.
#
# Each simulated value is the difference between the two teams' columns of `a`
# plus t-distributed noise (degrees of freedom est_df) multiplied by sigma_y.
# Reads the globals `a`, `sigma_y`, `est_df` and `teams`.
# The RNG seed is reset to 0 on every call, so repeated calls with the same
# teams return the same values (and the caller's RNG state is overwritten).
# NOTE(review): assumes column k of `a` holds the draws for teams[k] - confirm
# against the Stan model's declaration of `a`.
goal_diff = function(teamA, teamB, do_round=TRUE){
  set.seed(0)
  known_teams = str_to_title(teams)
  ixa = match(teamA, known_teams)
  ixb = match(teamB, known_teams)
  # An unmatched name would otherwise surface as an NA column index below.
  if(anyNA(c(ixa, ixb))){
    stop('Unknown team name: ',
         paste(setdiff(c(teamA, teamB), known_teams), collapse = ', '),
         call. = FALSE)
  }
  ai = a[, ixa]
  aj = a[, ixb]
  random_outcome = (ai - aj) + rt(nrow(ai), est_df)*sigma_y
  if(do_round){
    round(pull(random_outcome))
  }
  else{
    pull(random_outcome)
  }
}

# Fraction of the simulated (rounded) goal differences from goal_diff() that
# are strictly positive, i.e. in which teamA outscores teamB.
prob_win = function(teamA, teamB){
  simulated_gd = goal_diff(teamA, teamB)
  team_a_ahead = simulated_gd > 0
  mean(team_a_ahead)
}

# Outcome probabilities for teamA (team 1) against teamB (team 2).
#
# Returns a one-row tibble with columns draw, team1wins and team2wins: the
# share of simulated (rounded) goal differences from goal_diff() that are
# zero, positive and negative respectively.
#
# The shares are computed directly from the simulated results. Counting rows
# after joining against a table of all outcomes would give an outcome that
# never occurred a count of one instead of zero.
predict = function(teamA, teamB){
  gd = goal_diff(teamA, teamB)

  # 1 = team 1 wins, -1 = team 2 wins, anything else is treated as a draw.
  result = case_when(gd < 0 ~ -1, gd > 0 ~ 1, TRUE ~ 0)

  tibble(draw = mean(result == 0),
         team1wins = mean(result == 1),
         team2wins = mean(result == -1))
}


# Win probabilities for a tie that cannot end level (round of 16).
#
# Returns a one-row tibble with team1wins and team2wins: the share of
# simulated goal differences from goal_diff() that are positive and negative.
predict_no_draw = function(teamA, teamB){
  simulated = goal_diff(teamA, teamB)

  # Hack: a knockout game has no draws, so simulations that end level are
  # simply discarded before the shares are taken.
  decisive = simulated[simulated != 0]

  team1_share = mean(decisive > 0)
  team2_share = mean(decisive < 0)
  tibble(team1wins = team1_share, team2wins = team2_share)
}
```


```{r}
# Group-stage line-ups: one tibble per group, holding the team names and the
# group letter.
group_a_teams = tibble(
  team = c('Italy', 'Switzerland', 'Turkey', 'Wales'),
  group = 'A'
)
group_b_teams = tibble(
  team = c('Belgium', 'Denmark', 'Finland', 'Russia'),
  group = 'B'
)
group_c_teams = tibble(
  team = c('Austria', 'Netherlands', 'North Macedonia', 'Ukraine'),
  group = 'C'
)
group_d_teams = tibble(
  team = c('Croatia', 'Czech', 'England', 'Scotland'),
  group = 'D'
)
group_e_teams = tibble(
  team = c('Poland', 'Slovakia', 'Spain', 'Sweden'),
  group = 'E'
)
group_f_teams = tibble(
  team = c('France', 'Germany', 'Hungary', 'Portugal'),
  group = 'F'
)

# Stack the six groups and number the rows; `i` gives every team a unique id.
groups = list(group_a_teams, group_b_teams, group_c_teams,
              group_d_teams, group_e_teams, group_f_teams) %>% 
         bind_rows() %>% 
         mutate(i = row_number())


# Every ordered pair of distinct teams within a group, with the probability
# that team.x beats team.y and the order-independent `match` key.
# Pairs whose key appears in match_day_1 or match_day_2 get p_x_win = NaN.
plot_data = groups %>% 
  full_join(groups, by = 'group') %>% 
  filter(i.x != i.y) %>% 
  rename(Group = group) %>% 
  mutate(
    p_x_win = map2_dbl(team.x, team.y, prob_win),
    match = map2_chr(team.x, team.y, function(x, y) paste(sort(c(x, y)), collapse = 'v.')),
    p_x_win = if_else(match %in% c(match_day_1$match, match_day_2$match), NaN, p_x_win)
  )
  

# Heat map of within-group win probabilities, one facet per group.
# Tile fill is p_x_win; missing values are drawn black via na.value.
# Labels are white where p_x_win < 0.5 and black otherwise.
group_plot = plot_data %>% 
  ggplot(aes(team.y, team.x, fill = p_x_win))+
  geom_tile(size = 1, color = 'black')+
  geom_text(aes(label = scales::percent(p_x_win, accuracy = 0.1)), color = if_else(plot_data$p_x_win<0.5, 'white','black' ), size = 6)+
  # Free scales so each facet only shows its own group's teams.
  facet_wrap(~Group, scales = 'free', labeller = label_both)+
  scale_fill_continuous(type='viridis',labels = scales::percent, na.value = 'black')+
  theme(aspect.ratio = 1,
        panel.background = element_blank(),
        strip.background = element_rect(fill = 'black'),
        strip.text = element_text(color = 'white', size = 12),
        plot.title = element_text(size = 22),
        plot.subtitle = element_text(size = 12),
        panel.spacing = unit(2, "lines")
        )+
  labs(y='', 
       x = '',
       title = 'Euro 2020',
       fill = 'Win Probability',
       subtitle = 'Probability Team on y Axis Beats Team on x Axis Conditioned on First Two Match Days')+
  # Hide the fill legend; 'none' is the supported spelling (FALSE is deprecated).
  guides(fill = 'none')

ggsave('group_predictions.png', group_plot, dpi = 400, height = 10, width = 15)
  

```


```{r round-of-16}

# Round-of-16 fixtures: the k-th entry of team1 plays the k-th entry of team2.
team1 = c('Wales','Italy','Netherlands','Belgium','Croatia','France', 'England', 'Sweden')
team2 = c('Denmark','Austria','Czech','Portugal', 'Spain', 'Switzerland', 'Germany','Ukraine')

# Predict each tie with predict_no_draw, expand the per-tie result into
# columns, and write the table to disk. The pipe into write_csv() is required:
# without it write_csv() receives only the path and the table is never saved.
tibble(team1, team2) %>% 
  mutate(preds = map2(team1, team2, predict_no_draw)) %>% 
  unnest(preds) %>% 
  write_csv('predictions/ro16.csv')
```