-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrapping.Rmd
121 lines (91 loc) · 2.6 KB
/
scrapping.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
---
title: "scraping bilheteria"
subtitle: "Junto com alguns tratamentos"
author: "Guilherme dos Santos"
date: "1 de junho de 2019"
output: html_document
---
```{r setup, include=FALSE}
library(rvest)
library(dplyr)
library(readr)
library(lubridate)
library(purrr)
library(forecast)
```
```{r funcao}
# Scrape the Box Office Mojo "by month" chart for one calendar month.
#
# Downloads the page for month `i`, takes the first table matched by the
# `center table` CSS selector, renames its `Total Gross`, `Year`, `Movies` and
# `Avg.` columns and converts the two money columns to numbers.
#
# Args:
#   i: month number; a single value between 1 and 12.
#
# Returns:
#   A data frame with the columns Ano, Mes, bilheteria, filmes and media, where
#   Mes is the English abbreviation `month.abb[i]`.
#
# Raises:
#   An error if `i` is not a valid month number or if the page contains no
#   table matching the selector.
scrapping_bilheteria <- function(i) {
  # Fail before touching the network: an out-of-range month would otherwise
  # produce `Mes = NA` silently.
  if (!is.numeric(i) || length(i) != 1L || !(i %in% 1:12)) {
    stop("`i` must be a single month number between 1 and 12", call. = FALSE)
  }

  # NOTE(review): `view=releasedate` appears twice in the query string; the URL
  # is kept exactly as it was.
  mojo <- paste0(
    "https://www.boxofficemojo.com/monthly/?view=releasedate&chart=bymonth&month=",
    as.character(i),
    "&view=releasedate"
  )

  box_office <- read_html(mojo)
  tabela <- html_nodes(box_office, css = "center table")
  tabelas <- html_table(tabela, header = TRUE)

  if (length(tabelas) == 0) {
    stop("no table matching 'center table' found at ", mojo, call. = FALSE)
  }

  tabelas[[1]] %>%
    rename(
      bilheteria = `Total Gross`,
      Ano = Year,
      filmes = Movies,
      media = Avg.
    ) %>%
    mutate(
      Mes = month.abb[i],
      bilheteria = parse_number(bilheteria),
      media = parse_number(media)
    ) %>%
    select(Ano, Mes, bilheteria, filmes, media)
}
```
```{r}
# Scrape each of the twelve calendar months and stack the results into a
# single table (one rbind call instead of eleven pairwise ones).
meses <- lapply(1:12, scrapping_bilheteria)
meses <- do.call(rbind.data.frame, meses)

# Turn (Ano, Mes) into the first day of that month. The month abbreviation is
# mapped to its number through `month.abb` rather than parsed from text, so
# the result does not depend on the session locale.
meses <- meses %>%
  mutate(Mes = make_date(as.integer(Ano), match(Mes, month.abb), 1L)) %>%
  select(-Ano) %>%
  arrange(Mes)

# ts() assigns values by position, so this assumes the sorted data starts in
# January 1982 with no missing months -- TODO confirm against the scraped table.
series <- ts(data = meses$bilheteria, start = c(1982,1), end = c(2019, 5), frequency = 12)
monthplot(series)
```
```{r}
# Box plot of bilheteria grouped by the month of the Mes column, as returned
# by month().
boxplot(meses$bilheteria ~ month(meses$Mes))
```
```{r include = FALSE, eval = FALSE}
# Exploratory differencing steps, kept commented out for reference:
# aa <- diff(series)
# plot(aa)
# acf(aa)
# pacf(aa)
# acf(aa, lag = 60)
# pacf(aa, lag = 60)
# aa <- diff(aa, lag = 12)
# plot(aa)
# acf(aa, lag = 60)
# pacf(aa, lag = 60)
# This looks like the right model: order (0,1,1) with a seasonal (0,1,1) part
# of period 12.
modelo <- arima(
  series,
  order = c(0, 1, 1),
  seasonal = list(order = c(0, 1, 1), period = 12)
)
```
```{r}
library(dygraphs)
# dygraph of the series with a range selector added.
dyRangeSelector(dygraph(series))
```
```{r}
# First (lag-1) difference of the series, with its ACF and PACF up to lag 60.
# `lag.max` is spelled out in full instead of relying on partial matching of
# `lag`.
bdif <- diff(series)
plot(bdif)
acf(bdif, lag.max = 60)
pacf(bdif, lag.max = 60)

# Additional lag-12 difference applied on top of the first difference; same
# diagnostics again.
bdif <- diff(bdif, lag = 12)
plot(bdif)
acf(bdif, lag.max = 60)
pacf(bdif, lag.max = 60)
```
Aparentemente, a parte não sazonal é um MA(1) (depois de uma diferença) e a parte sazonal também é um MA(1), com uma diferença sazonal; então o modelo seria um $SARIMA(0,1,1)\times(0,1,1)_{12}$.
```{r}
# Data from 2017 onwards is set aside for forecasting: the model is fitted on
# the window from 1982 through December 2016.
parcela_estimacao <- window(series, start = 1982, end = c(2016, 12))
modelo <- arima(
  parcela_estimacao,
  order = c(0, 1, 1),
  seasonal = list(order = c(0, 1, 1), period = 12)
)
# Forecast with forecast()'s default horizon and plot the result.
prev <- forecast(modelo)
plot(prev)
```
## Resíduos
```{r}
# Extract the residuals of the fitted model.
residuos <- residuals(modelo)
# Plot the residuals.
plot(residuos)
# Autocorrelation function of the residuals.
acf(residuos)
```