The first step is to create a Twitter developer account; once this is approved, the platform provides the security credentials (API keys and tokens) needed for the actual scraping process.
#check the current working directory
getwd()
##Twitter Web scraping package
#library(rtweet)
#library(openxlsx)
#library(xlsx)
#library(twitteR)
#library(ROAuth)
twitter_token <- create_token(
  app = "Trump get_tweets script",
  consumer_key = "Key",
  consumer_secret = "Key",
  access_token = "Key",
  access_secret = "Key",
  set_renv = FALSE)
Trump <- search_tweets("Trump", n = 1000, include_rts = FALSE,
                       retryonratelimit = TRUE, lang = "en")
#search_tweets() already returns a data frame, so twListToDF() (a twitteR
#function for converting status lists) is not needed here
View(Trump)
Next, export the Trump data frame into a workable Excel file (export() comes from the rio package).
library(rio)
export(Trump, "DonaldTrump_Tweets.xlsx")
Then import the Trump data back in from the working directory.
library(readxl)
Trump <- read_excel("DonaldTrump_Tweets.xlsx", sheet = "Sheet 1")
View(Trump)
install.packages("rtweet") install.packages("SnowballC") install.packages('devtools') install.packages("slam") install.packages("wordcloud") install.packages('tm') install.packages("dplyr") install.packages("tidytext") install.packages("ggplot2") install.packages("forestmangr") install.packages("multcomp") install.packages("purrr") install.packages("twitteR", dependencies = TRUE) install.packages("party", dependencies = TRUE) library (rtweet) library(SnowballC) library(devtools) library(slam) library(wordcloud) library(tm) library(dplyr) library(tidytext) library(ggplot2) library(forestmangr) library(multcomp) library(twitteR) library(party) library(tidyverse)
#----------------------------------
Initially, the “created_at” column holds both the date and the time as class “POSIXct”. Converting it with as.Date() immediately strips away the timing information, leaving only the date.
Trump$created_at<-as.Date(Trump$created_at)
Next, remove the words “for” and “Twitter” from the source column by replacing them with an empty string, using the str_replace() function (from stringr, loaded with the tidyverse).
Trump$source <- str_replace(Trump$source, "for", "")
Trump$source <- str_replace(Trump$source, "Twitter", "")
Here, two different methods are prepared for removing the punctuation and symbols from the text (tweet message) columns.
The first method uses gsub() to strip unnecessary symbols and punctuation from the “text” and related columns of the Trump dataset. As shown in Figure X, all the unnecessary characters have been removed.
Trump$text <- gsub("https\\S*", "", Trump$text)
Trump$text <- gsub("@\\S*", "", Trump$text)
Trump$text <- gsub("amp", "", Trump$text)
Trump$text <- gsub("[\r\n]", "", Trump$text)
Trump$text <- gsub("[[:punct:]]", "", Trump$text)

Trump$quoted_location <- gsub("https\\S*", "", Trump$quoted_location)
Trump$quoted_location <- gsub("@\\S*", "", Trump$quoted_location)
Trump$quoted_location <- gsub("amp", "", Trump$quoted_location)
Trump$quoted_location <- gsub("[\r\n]", "", Trump$quoted_location)
Trump$quoted_location <- gsub("[[:punct:]]", "", Trump$quoted_location)

Trump$source <- gsub("https\\S*", "", Trump$source)
Trump$source <- gsub("@\\S*", "", Trump$source)
Trump$source <- gsub("amp", "", Trump$source)
Trump$source <- gsub("[\r\n]", "", Trump$source)
Trump$source <- gsub("[[:punct:]]", "", Trump$source)
The second method uses the tm package: first, build a corpus by feeding the text column in as a character vector.
#Option2
library(tm)
#build a corpus, and specify the source to be character vectors
myCorpus <- Corpus(VectorSource(Trump$text))
#convert to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
#remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
#remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))
#remove stopwords
myStopwords <- c(setdiff(stopwords('english'), c("r", "big")), "use", "see", "used", "via", "amp")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#remove extra whitespace
myCorpus <- tm_map(myCorpus, stripWhitespace)
myCorpusCopy <- myCorpus
myCorpus <- tm_map(myCorpus, stemDocument) # stem words
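The copy saved in myCorpusCopy above is typically used as a dictionary to complete the stemmed words back into full terms, which would explain the “corrected and completed” words noted below. The original code for this step is not shown; a minimal sketch, following the common tm stem-completion idiom (stemCompletion2 is our own helper name, not from the original):
#hypothetical stem-completion step, using myCorpusCopy as the dictionary
stemCompletion2 <- function(x, dictionary) {
  x <- unlist(strsplit(as.character(x), " "))
  x <- x[x != ""]  #drop empty tokens before completion
  x <- stemCompletion(x, dictionary = dictionary)
  x <- paste(x, sep = "", collapse = " ")
  PlainTextDocument(stripWhitespace(x))
}
myCorpus <- lapply(myCorpus, stemCompletion2, dictionary = myCorpusCopy)
myCorpus <- Corpus(VectorSource(myCorpus))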
The dataset has now been cleaned with both methods, and their outputs differ slightly. Here we look at tweet number 8534; for both methods, writeLines() is used to review the tweet content.
#with the tm corpus
writeLines(strwrap(myCorpus[[8534]]$content, 60))
#with the gsub-cleaned data frame (no corpus)
writeLines(strwrap(Trump$text[8534], 60))
Both outputs have the punctuation, empty spaces and special characters removed. However, in the corpus output all letters are in lower case and the words are automatically corrected and completed by the software, whereas in the gsub output the characters are also in lower case but the word ‘family’ is left uncorrected.
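If lower case needs to be guaranteed for the gsub-cleaned text as well (the tweet above merely happened to contain no capitals), a one-line addition, not part of the original workflow, would be:
Trump$text <- tolower(Trump$text)  #force the gsub-cleaned text to lower case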
#keep organic tweets only: drop retweets, then drop replies
Trump_C <- Trump[Trump$is_retweet == FALSE, ]
Trump_C <- subset(Trump_C, is.na(Trump_C$reply_to_status_id))
Trump_C <- Trump_C %>% arrange(-favorite_count) Trump_C[1,5] Trump_C <- Trump_C %>% arrange(-retweet_count) Trump_C[1,5]
#subset the retweets
Trump_retweets <- Trump[Trump$is_retweet == TRUE, ]
#subset the replies
Trump_replies <- subset(Trump, !is.na(Trump$reply_to_status_id))
data <- data.frame(category = c("Retweets", "Replies"),
                   count = c(192, 120))
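The hard-coded counts above presumably come from the sizes of the two subsets. A minimal sketch (assuming ggplot2 is already loaded) that derives them directly and plots the comparison:
#derive the counts from the subsets instead of hard-coding them
data <- data.frame(category = c("Retweets", "Replies"),
                   count = c(nrow(Trump_retweets), nrow(Trump_replies)))
#simple bar chart comparing retweets and replies
ggplot(data, aes(x = category, y = count, fill = category)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Count", title = "Retweets vs. replies")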
#-------------------------------------------------------------------------
unique(Trump$quoted_location)
#406 unique shared locations
length(unique(Trump$quoted_location))
Trump %>% count(quoted_location, sort = TRUE) %>% mutate(quoted_location = reorder(quoted_location, n)) %>% top_n(20) %>% ggplot(aes(x = quoted_location, y = n)) + geom_col() + coord_flip() + labs(x = "Location", y = "Count", title = "Top 20 locations of Twitter users")
Trump$quoted_location <- as.character(Trump$quoted_location)
set.seed(1234)
wordcloud(Trump$quoted_location, min.freq = 4, scale = c(5, .5),
          random.order = FALSE, rot.per = 0.3,
          colors = brewer.pal(8, "Dark2"))
#==========================================================
unique(Trump$source)
#130 unique device types
length(unique(Trump$source))
Trump %>% count(source, sort = TRUE) %>% mutate(source = reorder(source, n)) %>% top_n(20) %>% ggplot(aes(x = source, y = n)) + geom_col() + coord_flip() + labs(x = "Source", y = "Count", title = "Top 20 type of sources")
Trump$source <- as.character(Trump$source)
set.seed(1234)
wordcloud(Trump$source, min.freq = 4, scale = c(5, .5),
          random.order = FALSE, rot.per = 0.3,
          colors = brewer.pal(8, "Dark2"))
#Remove any stop words
tweets <- Trump %>% select(text) %>% unnest_tokens(word, text) tweets <- tweets %>% anti_join(stop_words) %>% filter(!word =="trump" ) %>% filter(!word =="win" ) %>% filter(!word =="fake") %>% filter(!word =="war" ) %>% filter(!word =="bomb" ) %>% filter(!word =="im" ) %>% filter(!word =="it's" ) %>% filter(!word =="i'm " )
tweets %>% count(word, sort = TRUE) %>% top_n(20) %>% mutate(word = reorder(word, n)) %>% ggplot(aes(x = word, y = n)) + geom_col() + xlab(NULL) + coord_flip() + labs(y = "Count", x = "Unique words", title = "Most frequent words found in the tweets", subtitle = "Stop words removed from the list")
tweets <- as.character(tweets)
tweets <- gsub("c\\(", " ", tweets)  #remove the "c(" left over from collapsing the data frame
set.seed(1234)
wordcloud(tweets, min.freq = 4, scale = c(5, .1),
          random.order = FALSE, rot.per = .1,
          colors = brewer.pal(8, "Set2"))  #Set2 offers at most 8 colours
#library(tidytext)
#library(stringr)
#library(dplyr)
#library(janeaustenr)
#nrc emotion lexicon
get_sentiments("nrc")
#negative or positive
get_sentiments("bing")
#with a numeric score
get_sentiments("afinn")
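The plot below relies on a data frame sentimentscores with a sentiment column and a Score column, but the code that builds it is missing above. A minimal sketch of how it could be constructed, assuming the syuzhet package's get_nrc_sentiment() is applied to the cleaned tweet text (an assumption, since the original step is not shown):
library(syuzhet)
#score every tweet against the NRC lexicon, then total each emotion category
nrc_scores <- get_nrc_sentiment(Trump$text)
sentimentscores <- data.frame(Score = colSums(nrc_scores))
sentimentscores <- cbind(sentiment = rownames(sentimentscores), sentimentscores)
rownames(sentimentscores) <- NULL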
library(ggplot2)
ggplot(data = sentimentscores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme_minimal() +
  theme(legend.position = "none") +  #after theme_minimal(), so the legend stays hidden
  xlab("Sentiments") + ylab("Scores") +
  ggtitle("Total sentiment based on scores")
The scores fall mostly on the negative side: words of negativity, fear, anger, and sadness are all present, while joy scores below 200. Given that the majority of these tweets come from US citizens, clearly disappointed voices lie beneath these numbers. However, the indicators in blue (positive) and pink (trust) sit at the opposite end of the scale, seemingly representing Donald Trump's supporters.