-
Notifications
You must be signed in to change notification settings - Fork 0
/
descriptive_statistic.Rmd
74 lines (65 loc) · 3.21 KB
/
descriptive_statistic.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
---
title: "descriptive_statistic"
author: "周震宇"
date: "2020/3/30"
output: html_document
---
```{r}
library(readr)
library(dplyr)
library(ggplot2)
# Per-(chapter, concept) frequency records. Column names are supplied
# explicitly, so the first line of the file is read as data.
dt <- read_csv(
  "/Users/zhouzhenyu/personal_repos/textbook_extraction/data/concept_page_nums/all_words_info.csv",
  col_names = c("chpt_id", "cpt_id", "fre")
)

# Concept lookup table. Three column names are supplied and only the first
# and third columns are kept, then one row per cpt_id is retained.
# NOTE(review): the middle "na" column presumably absorbs the empty field
# produced when "::" is split on a single ":" - confirm against the readr
# version in use, since the column count depends on how `delim` is handled.
cpt_names_dt <- read_delim(
  "/Users/zhouzhenyu/personal_repos/textbook_extraction/data/concepts/all_concepts.csv",
  delim = "::",
  col_names = c("cpt_name", "na", "cpt_id")
)
cpt_names_dt <- cpt_names_dt[, c(1, 3)] %>%
  distinct(cpt_id, .keep_all = TRUE)

# Chapter lookup table (chapter name, chapter id), also read without a header.
# NOTE(review): unlike the concept table this one is not de-duplicated, so a
# repeated chpt_id would multiply rows in the join below - verify the file.
chpt_names_dt <- read_csv(
  "/Users/zhouzhenyu/personal_repos/textbook_extraction/data/concepts/book_chapter_ids.csv",
  col_names = c("chpt_name", "chpt_id")
)

# Attach concept names first, then chapter names.
dt <- dt %>%
  merge(cpt_names_dt, by = "cpt_id") %>%
  merge(chpt_names_dt, by = "chpt_id")

# Book name = chapter name with every digit removed.
dt[["book_name"]] <- gsub("\\d", "", dt[["chpt_name"]])
# High-frequency terms

# Top 20 concepts by total frequency, with the number of distinct chapters
# each concept occurs in.
high_fre_cpt_table <- dt %>%
  group_by(cpt_name) %>%
  summarise(n = sum(fre), n_chpt = n_distinct(chpt_name)) %>%
  arrange(desc(n)) %>%
  head(20)

# Top 15 chapters by total frequency, with the number of distinct concepts
# each chapter contains.
high_fre_chpt_table <- dt %>%
  group_by(chpt_name) %>%
  summarise(n = sum(fre), n_cpt = n_distinct(cpt_name)) %>%
  arrange(desc(n)) %>%
  head(15)

# Up to 15 books by total frequency, with distinct concept and chapter counts.
high_fre_book_table <- dt %>%
  group_by(book_name) %>%
  summarise(
    n_cpt = n_distinct(cpt_name),
    n_chpt = n_distinct(chpt_name),
    n = sum(fre)
  ) %>%
  arrange(desc(n)) %>%
  head(15)
# Histogram of the raw frequency values in dt$fre, using the default binning.
ggplot(data = dt, mapping = aes(x = fre)) +
  geom_histogram()

# Number of rows in dt divided by (distinct concepts x distinct chapters),
# i.e. the share of the concept-by-chapter grid that has a record.
# NOTE(review): the original comment described this as the share of
# zero-frequency cells, which would be 1 minus this value - confirm intent.
nrow(dt) / n_distinct(dt$cpt_id) / n_distinct(dt$chpt_id)
# Record-level counts: how many (chapter, concept) records have a frequency
# of exactly `num` (the script calls this for num = 1..5).
#
# @param num  Frequency value to look for.
# @param data Data frame with a `fre` column. Defaults to the global `dt`,
#             so existing one-argument calls `calc_fre(num)` keep working.
# @return Numeric vector of length 3:
#         c(num, number of rows with fre == num, that number / total rows).
calc_fre <- function(num, data = dt) {
  # Count once and reuse it for both the absolute and the relative figure.
  hits <- sum(data$fre == num)
  c(num, hits, hits / nrow(data))
}
# One column per frequency value 1..5; rows are the three values returned
# by calc_fre().
vapply(1:5, calc_fre, numeric(3))
# Concept-level counts: how many rows of a per-concept summary have a total
# of exactly `num`.
#
# @param dt  Data frame with columns `n` and `cpt_name`.
# @param num Total to look for.
# @return Numeric vector of length 3:
#         c(num, number of rows with n == num, that number / number of rows).
calc_concept_fre <- function(dt, num) {
  matching <- sum(dt$n == num)
  c(num, matching, matching / length(dt$cpt_name))
}
# Per-concept totals over all records, most frequent first. `tmp` is also
# the input of the word-cloud chunk further down.
tmp <- dt %>%
  group_by(cpt_name) %>%
  summarise(n = sum(fre), n_chpt = n_distinct(chpt_name)) %>%
  arrange(desc(n))

# One column per total 1..10; rows are the three values returned by
# calc_concept_fre() (total, number of concepts, share of concepts).
cpt_summary <- vapply(
  1:10,
  function(k) calc_concept_fre(tmp, k),
  numeric(3)
)

write.csv(
  cpt_summary,
  file = "/Users/zhouzhenyu/Documents/postgraduate_essay/essay_table/cpt_summary.csv"
)
# Most frequent concepts within a single book.
#
# @param data        Data frame with columns `book_name`, `cpt_name`, `fre`
#                    and `chpt_name`.
# @param target_book Book name to keep (exact match on `book_name`).
# @param n_top       Maximum number of concepts to return.
# @return Per-concept summary (`cpt_name`, total frequency `n`, number of
#         distinct chapters `n_chpt`), sorted by `n` descending and cut to
#         the first `n_top` rows.
top_concepts_in_book <- function(data, target_book, n_top = 20) {
  data %>%
    # `!!` forces the argument's value so it can never be confused with a
    # column of `data` that happens to share its name.
    filter(book_name == !!target_book) %>%
    group_by(cpt_name) %>%
    summarise(n = sum(fre), n_chpt = n_distinct(chpt_name)) %>%
    arrange(desc(n)) %>%
    head(n_top)
}

# Top-20 concept tables for the three books reported in the essay.
book_res1 <- top_concepts_in_book(dt, "StatisticalModels")
book_res2 <- top_concepts_in_book(dt, "ReinforcementLearning")
book_res3 <- top_concepts_in_book(dt, "StochasticProcesses")
```
```{r}
# Word cloud built from the concept name and its total frequency
# (the first two columns of `tmp`).
wordcloud2::wordcloud2(data = tmp[, c("cpt_name", "n")])
```
```{r}
# Export the summary tables as CSV files for the essay.

# Overall rankings by concept, chapter and book.
write.csv(
  x = high_fre_cpt_table,
  file = "/Users/zhouzhenyu/Documents/postgraduate_essay/essay_table/high_fre_cpt_table.csv"
)
write.csv(
  x = high_fre_chpt_table,
  file = "/Users/zhouzhenyu/Documents/postgraduate_essay/essay_table/high_fre_chpt_table.csv"
)
write.csv(
  x = high_fre_book_table,
  file = "/Users/zhouzhenyu/Documents/postgraduate_essay/essay_table/high_fre_book_table.csv"
)

# Per-book top-concept tables.
write.csv(
  x = book_res1,
  file = "/Users/zhouzhenyu/Documents/postgraduate_essay/essay_table/book_res1.csv"
)
write.csv(
  x = book_res2,
  file = "/Users/zhouzhenyu/Documents/postgraduate_essay/essay_table/book_res2.csv"
)
write.csv(
  x = book_res3,
  file = "/Users/zhouzhenyu/Documents/postgraduate_essay/essay_table/book_res3.csv"
)
```