-
Notifications
You must be signed in to change notification settings - Fork 2
/
github export functions.Rmd
181 lines (152 loc) · 6.36 KB
/
github export functions.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
---
title: Functions for the Shiny Application Using Twitter Data
author: Nipunjeet Gujral
date: May 27th, 2018
output:
  html_document:
    theme: spacelab
    highlight: neon
---
##### Setup
```{r libraries, message=FALSE, warning=FALSE}
# markdown related
library(rmarkdown)
# library to attach to twitter API
library(twitteR)
# library to manipulate data
library(dplyr)
library(stringr)
library(lubridate)
library(tidytext)
library(tm)
# library to visualize data
library(wordcloud)
library(RColorBrewer)
library(plotly)
library(ggplot2)
# shiny related
library(shiny)
library(shinydashboard)
```
```{r authentication, echo=FALSE, message=FALSE, warning=FALSE}
# Register this R session with the Twitter API via twitteR's OAuth setup.
#
# Each credential is read from an environment variable so that real keys do
# not have to be written into this file:
#   TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
#   TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_SECRET
# A variable that is not set falls back to the placeholder string that used
# to be hard-coded here, so filling in the placeholders by hand still works.
#
# x: unused; kept only so the signature stays the same. It is never
#    evaluated, so calling authentication() without it is fine.
# Returns whatever setup_twitter_oauth() returns.
authentication <- function(x) {
  consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY", unset = "______")
  consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET", unset = "___")
  access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN", unset = "______")
  access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET", unset = "_____")

  setup_twitter_oauth(
    consumer_key,
    consumer_secret,
    access_token,
    access_secret
  )
}
```
```{r forming tweet df, message=FALSE, warning=FALSE, echo=FALSE}
# Download a user's timeline and return it as a data frame with extra
# engagement and local-time columns.
#
# twitter_handle: screen name passed to userTimeline() as `user`.
# n:              number of tweets to request; replies are excluded.
#
# Returns a data frame with the columns text, created, retweetCount,
# favoriteCount, statusSource and id, plus:
#   ratio       - retweetCount / favoriteCount (Inf or NaN when a tweet has
#                 zero favorites)
#   new_tz      - `created` formatted as text in America/Los_Angeles time
#   day_of_week - labelled weekday of the Los Angeles local time
#   hour, month - hour and month of the Los Angeles local time
#
# Stops with an explicit error when no tweets come back.
pull_tweets <- function(twitter_handle, n) {
  timeline <- userTimeline(
    user = twitter_handle,
    n = n,
    excludeReplies = TRUE
  )

  # twListToDF() rejects an empty list with a message about element classes;
  # fail here with a message that names the actual problem instead.
  if (length(timeline) == 0) {
    stop(
      "No tweets were returned for '", twitter_handle, "'.",
      call. = FALSE
    )
  }

  local_tz <- "America/Los_Angeles"

  timeline %>%
    twListToDF() %>% # reformat into a data frame
    dplyr::select(
      text, created, retweetCount, favoriteCount, statusSource, id
    ) %>% # keep only the useful columns
    dplyr::mutate(
      ratio = retweetCount / favoriteCount,
      new_tz = format(created, tz = local_tz),
      # Derive the calendar fields from the time-zone-shifted date-time
      # itself rather than by re-parsing the formatted text in `new_tz`.
      day_of_week = wday(with_tz(created, local_tz), label = TRUE),
      hour = hour(with_tz(created, local_tz)),
      month = month(with_tz(created, local_tz))
    )
}
```
##### Analysing Tweeting Habits
```{r tweets per day, message=FALSE, warning=FALSE}
# Bar chart of how many tweets were created on each day of the week.
#
# temp_tweets: data frame with a `day_of_week` column.
# Returns the plotly bar chart.
tweets_per_day <- function(temp_tweets) {
  # table() counts tweets per day; data.frame() exposes them as Var1 / Freq
  day_counts <- data.frame(table(temp_tweets$day_of_week))

  bars <- plot_ly(
    x = day_counts[["Var1"]],
    y = day_counts[["Freq"]],
    type = "bar"
  )
  layout(bars, title = "Tweets per Day of the Week")
}
```
```{r tweets per hour, message=FALSE, warning=FALSE}
# Line-and-marker plot of how many tweets were created in each hour of the
# day.
#
# temp_tweets: data frame with an `hour` column.
# Returns the plotly object.
tweets_per_hour <- function(temp_tweets) {
  # frequency of tweets per hour, as a data frame with columns Var1 / Freq
  freq_table <- data.frame(table(temp_tweets$hour))

  plot_ly(
    x = freq_table$Var1,
    y = freq_table$Freq,
    type = "scatter",
    # plotly's mode flags are "lines", "markers" and "text" joined by "+";
    # the previous value "line+markers" is not one of the accepted
    # combinations.
    mode = "lines+markers"
  ) %>%
    layout(
      title = "Distribution of Tweets in a Day(PDT Time)",
      xaxis = list(title = "Hour (PDT)"),
      yaxis = list(title = "Frequency")
    )
}
```
```{r source piechart, message=FALSE, warning=FALSE}
# Pie chart of the sources the tweets were sent from.
#
# temp_tweets: data frame with a `statusSource` column holding markup whose
#              visible label sits between ">" and "<".
# Returns the plotly pie chart.
tweet_source <- function(temp_tweets) {
  # Take the ">...<" span, then strip every angle bracket from it.
  labelled <- str_extract(temp_tweets$statusSource, ">(.*)<")
  labelled <- gsub("[<>]", "", labelled)

  # Count tweets per source and give the columns descriptive names.
  source_counts <- data.frame(table(labelled))
  names(source_counts) <- c("source", "freq")

  plot_ly(
    data = source_counts,
    labels = ~source,
    values = ~freq,
    type = "pie",
    textinfo = 'label+percent',
    textposition = 'inside'
  )
}
```
##### Analysing User Engagement
```{r retweet vs favorited, message=FALSE, warning=FALSE}
# Scatter plot of retweet count against favorite count with a fitted
# least-squares line, illustrating how the two engagement measures move
# together.
#
# temp_tweets: data frame with numeric `retweetCount` and `favoriteCount`
#              columns.
# Returns the plotly object.
fav_vs_retweet <- function(temp_tweets) {
  # na.exclude pads fitted() with NA for dropped rows, so the fitted vector
  # always has one value per row of temp_tweets and stays aligned with the
  # x values handed to add_lines().
  fit <- lm(
    retweetCount ~ favoriteCount,
    data = temp_tweets,
    na.action = na.exclude
  )

  plot_ly(
    data = temp_tweets,
    x = ~favoriteCount,
    y = ~retweetCount,
    type = "scatter",
    mode = "markers" # stated explicitly instead of left for plotly to guess
  ) %>%
    add_lines(
      x = ~favoriteCount,
      y = fitted(fit)
    ) %>%
    layout(
      title = "Retweet vs. Favorited Count",
      xaxis = list(title = "Favorited Count"),
      yaxis = list(title = "Retweet Count")
    )
}
```
```{r ratio vs created, message=FALSE, warning=FALSE}
# Scatter plot of the retweet/favorite ratio against creation time, showing
# how the tweets returned by the Twitter API are spread over time.
#
# temp_tweets: data frame with `created` and `ratio` columns.
# Returns the plotly object.
ratio_vs_time <- function(temp_tweets) {
  plot_ly(
    data = temp_tweets,
    x = ~created,
    y = ~ratio,
    type = "scatter"
  )
}
```
```{r retweet and favorited histogram, message=FALSE, warning=FALSE}
# Overlaid histograms of the retweet and favorite counts.
#
# temp_tweets: data frame with `retweetCount` and `favoriteCount` columns.
# Returns the plotly object.
distributions <- function(temp_tweets) {
  # Semi-transparent bars so both distributions stay visible when overlaid.
  overlay <- plot_ly(data = temp_tweets, alpha = 0.6)
  overlay <- add_histogram(overlay, x = ~retweetCount, name = "Retweet")
  overlay <- add_histogram(overlay, x = ~favoriteCount, name = "Favorited")

  layout(
    overlay,
    barmode = "overlay",
    xaxis = list(title = " "),
    title = "Distributions of Retweet and Favorited Counts"
  )
}
```
##### Outputs
```{r Function Calls, message=FALSE, warning=FALSE}
# setup: authenticate, then pull the timeline every plot below is built from
authentication()
temp_tweets <- pull_tweets("elonmusk", 1000)

# View() opens a data viewer, which only makes sense in an interactive
# session; skip it when the document is being knit.
if (interactive()) {
  View(temp_tweets)
}

# time analysis
tweets_per_day(temp_tweets)
tweets_per_hour(temp_tweets)
ratio_vs_time(temp_tweets)

# variable distribution
fav_vs_retweet(temp_tweets)
distributions(temp_tweets)

# source
tweet_source(temp_tweets)
```