-
Notifications
You must be signed in to change notification settings - Fork 27
/
eurovision_scraping.R
67 lines (51 loc) · 2.04 KB
/
eurovision_scraping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# required libraries
library(rvest)
library(xml2)
library(dplyr)
#' Get Eurovision final results from history
#'
#' Downloads the English Wikipedia article for the requested contest year,
#' parses every sortable table on the page and keeps the ones that look like
#' results tables: exactly one column whose name contains "Points" and no
#' column whose name contains "Category", "Venue" or "Broadcaster". One of
#' those tables is then selected by position (the position assumed to hold the
#' final for that year) and ordered by its "Place" column.
#'
#' The year is validated before any network request is made, so unsupported
#' years fail fast with a descriptive message.
#'
#' @param year A numeric in the form YYYY
#' @return A dataframe of Eurovision results, ordered by its "Place" column
#'   when such a column exists. Years that have no table mapping in this
#'   function (2008-2019 and 2021 onwards) raise a warning and return `NULL`
#'   invisibly.
#' @examples get_eurovision(1974)
get_eurovision <- function(year) {
  if (length(year) != 1L || is.na(year)) {
    stop(
      "`year` must be a single non-missing value in the form YYYY.",
      call. = FALSE
    )
  }

  # validate the year first: these cases can never yield a results table, so
  # there is no point in downloading and parsing the page
  if (year < 1956) {
    stop("Contest was not held before 1956!", call. = FALSE)
  }
  if (year == 1956) {
    stop("Contest was held in 1956 but no points were awarded!", call. = FALSE)
  }
  if (year == 2020) {
    stop("The Eurovision Song Contest was cancelled in 2020 due to the COVID-19 pandemic.", call. = FALSE)
  }

  # account for move to semifinals and qualifying rounds: position of the
  # table to return among the results tables found on the page
  if (year %in% c(1957:1995, 1997:2003)) {
    final_index <- 1L
  } else if (year %in% c(1996, 2004:2007)) {
    final_index <- 2L
  } else {
    # previously these years fell through silently; keep returning NULL but
    # tell the caller why
    warning(
      "No results table mapping is defined for year ", year,
      "; returning NULL.",
      call. = FALSE
    )
    return(invisible(NULL))
  }

  # get url from input and read html
  page_url <- paste0(
    "https://en.wikipedia.org/wiki/Eurovision_Song_Contest_",
    year
  )
  page <- xml2::read_html(page_url)

  # scrape data from any sortable table
  # NOTE: the XPath starts with `//`, so it matches sortable tables anywhere
  # in the document, not only below #mw-content-text; kept as-is because the
  # table positions above were chosen against this behaviour
  content <- rvest::html_nodes(page, "#mw-content-text")
  table_nodes <- xml2::xml_find_all(
    content,
    "//table[contains(@class, 'sortable')]"
  )

  # allow for unexpected errors but warn user; a table that cannot be parsed
  # becomes NULL and is dropped by the filter below
  tables <- lapply(seq_along(table_nodes), function(i) {
    tryCatch(
      rvest::html_table(table_nodes[[i]], fill = TRUE),
      error = function(e) {
        warning(
          "Potential issue discovered in this year! Table ", i,
          " could not be parsed: ", conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
  })

  # only include tables that have Points
  is_results <- vapply(
    tables,
    function(tbl) {
      cols <- colnames(tbl)
      sum(grepl("Points", cols)) == 1 &&
        !any(grepl("Category|Venue|Broadcaster", cols))
    },
    logical(1)
  )
  results_charts <- tables[is_results]

  if (length(results_charts) < final_index) {
    stop(
      "Expected at least ", final_index, " results table(s) for ", year,
      " but found ", length(results_charts), ".",
      call. = FALSE
    )
  }
  final_chart <- results_charts[[final_index]]

  # look the Place column up on the table actually being returned; without
  # one there is nothing to sort by
  place_col <- grep("Place", colnames(final_chart), value = TRUE)
  if (length(place_col) == 0L) {
    return(final_chart)
  }
  sort_col <- place_col[[1L]]

  # `.data[[sort_col]]` sorts by the named column; unquoting the string with
  # `!!` would sort by a constant and leave the rows in page order
  dplyr::arrange(final_chart, .data[[sort_col]])
}