-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLanguage - Influenced.R
179 lines (141 loc) · 5.56 KB
/
Language - Influenced.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
library(readxl)
library(dplyr)
library(writexl)
library(tm)
library(tidyverse)
library(tidytext)
library(gt)
##############################################################
# Load Data
##############################################################
impData <-read_excel("./MasterRecord.xlsx",1)
##############################################################
# Create Filter List
##############################################################
filtFolders <-impData%>%distinct(REF)
#remove NA
filtFolders <-filtFolders[!(is.na(filtFolders$REF)),]
filtFolders$REF <-str_c("./",filtFolders$REF,"/") #format as filepaths
##############################################################
# Build a list of Files
##############################################################
for (r in filtFolders)
{
filenames <- list.files(r, pattern="*.txt", full.names=TRUE)
}
##############################################################
# strip the extension
##############################################################
#create empty dataframe
dfText <- data.frame(matrix(ncol=3,nrow=0))
colnames(dfText) <- c('text','REF','sub_Ref') #future tidy up column names
#use a loop to populate data frame
for (i in 1:length(filenames))
{
#get text and remove carriage returns
strWord <-read_file(filenames[i])
strWord <-gsub("[\r\n]", " ", strWord)
strPath <-str_extract_all(filenames[i],"(?<=./).+(?=//)")
strFile <-str_sub(sub(".txt","",filenames[i]),-4)
#add_Row
dfText[nrow(dfText) + 1,] <- c(strWord,strPath,strFile)
}
############### BRING IN DATA ABOUT WHETEHR INFLUENCED
#//RSJS note bringing in as a join so code is here for future usage.
dfTextMerge <-inner_join(dfText,impData, by=c("REF","sub_Ref"))
############### Get the columns we want
#RSJS need to tidy up copy column naming, hence here using original_author
# for futur dev make this clearer
dfTextMerge <-dfTextMerge%>%select(text,REF,Original_Author)
dfTextMerge <-dfTextMerge%>%mutate(status = if_else(is.na(Original_Author), "ORIGINAL", "COPY"))
############### SPLIT DATA INTO INDIVIDUAL WORDS ######################################
dfText <-select(dfTextMerge,-REF,-Original_Author) #Drop columns we don't need
##############################################################
# Create individual words and remove stop words
##############################################################
tidy_word <- dfTextMerge %>%
unnest_tokens(word, text)
# Get the stop words dataset
data("stop_words")
tidy_word_filtered <- tidy_word %>%
anti_join(stop_words, by = "word")
#remove numbers
tidy_word <-tidy_word_filtered%>%mutate(word = gsub(x = word, pattern = "[0-9]+|[[:punct:]]|\\(.*\\)", replacement = ""))
#remove th etc
tidy_word <- tidy_word %>%
filter(!(word %in% c("nd", "th", "st", "rd")))
#remove blanks
tidy_word <-subset(tidy_word, word!="")
#build matrix
dfletterCount <- tidy_word%>%group_by(status)%>%count(word)
############### SPLIT DATA INTO TABLES ######################################
top_words <- dfletterCount %>%
group_by(word) %>%
summarize(total_n = sum(n)) %>%
top_n(30, wt = total_n) %>%
arrange(desc(total_n))
copy_words <- dfletterCount %>%
filter(status == "COPY")%>%
select(word, n) %>%
distinct()
original_words <- dfletterCount %>%
filter(status == "ORIGINAL") %>%
select(word, n) %>%
distinct()
appear_in_both <-copy_words %>%
inner_join(original_words, by = "word", suffix = c(".copy", ".original")) %>%
mutate(
n = n.copy + n.original,
status = "BOTH"
) %>%
select(status, word, n)
unique_to_copy <- copy_words %>%
anti_join(original_words, by = "word") %>%
arrange(desc(n)) %>%
head(30)
unique_to_original <- original_words %>%
anti_join(copy_words, by = "word") %>%
arrange(desc(n)) %>%
head(30)
############## Rank Data
rank_Top<-top_words%>%mutate(rank = order(total_n, decreasing=TRUE))
rank_Top<-rank_Top%>%filter(rank <= 20)
rank_Top <- rank_Top %>% mutate(status = "ALL")
rank_Top <- rank_Top %>% rename(n = total_n)
# Reorder rank_Top to match the order of rank_Original and rank_Copy
rank_Top <- rank_Top %>% select(status, word, n, rank)
rank_Copy<-unique_to_copy%>%mutate(rank = order(n, decreasing=TRUE))
rank_Copy<-rank_Copy%>%filter(rank <= 20)
rank_Original<-unique_to_original%>%mutate(rank = order(n, decreasing=TRUE))
rank_Original<-rank_Original%>%filter(rank <= 20)
rank_Both<-appear_in_both%>%mutate(rank = order(n, decreasing=TRUE))
rank_Both<-rank_Both%>%filter(rank <= 20)
###############COMBINE
combined_df <- union(rank_Original, rank_Copy)
combined_df <- union(combined_df, rank_Top)
combined_df <- union(combined_df, rank_Both)
###############PIVOT
xCross <-pivot_wider(
combined_df,
names_from = status,
values_from = c(word,n))
xWords = xCross%>%select(rank,starts_with("word_"))
xWords = xWords%>%rename_at(vars(everything()), ~ sub("word_", "", .x))
##############################################################
# BUILD TABLE
##############################################################
tab1<-gt(xWords) %>%
tab_header(
title = md("**Frequent Words: Original entries vs Copied Entries vs All Entries**"),
subtitle = md("*Stop Words Removed*"))%>%
tab_options(table.font.names = 'Times New Roman',
column_labels.font.weight = 'bold',
heading.title.font.size = 22,
heading.subtitle.font.size = 16,
table.font.color = 'Black',
source_notes.font.size = 14,
#source_notes.
table.font.size = 14)%>%
tab_source_note(source_note = "Source: RobertStJohnSmith.com - Work in Progress")
tab1
tab1%>%gtsave("Survey Results:Frequent Words by Type.png")