-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathCount_Number_of_Word_with_Clean.py
37 lines (30 loc) · 1.2 KB
/
Count_Number_of_Word_with_Clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
'''
Created on Feb 22, 2017
@author: Xuebin Wei
www.lbsocial.net
Exclude stopwords,
find the 10 most common words,
and write the result to an Excel file.
'''
import xlwt
from collections import Counter
from nltk.corpus import stopwords
# English stop words, held in a set so each membership test below is O(1).
stop = set(stopwords.words('english'))

book = xlwt.Workbook()  # create a new excel file
sheet_test = book.add_sheet('word_count')  # add a new sheet

# Header row (row 0): one label per output column.
for column, header in enumerate(('word', 'count', 'ratio')):
    sheet_test.write(0, column, header)

# define the location of your txt file
with open('', 'r', encoding='utf-8', errors='ignore') as text_word:
    # Convert all the text to lower case, split it on whitespace,
    # and filter out the stop words.
    word_list = [word for word in text_word.read().lower().split()
                 if word not in stop]

# The file is no longer needed once its words are in memory.
word_total = len(word_list)
count_result = Counter(word_list)

# Rows 1..10: the most common words, their counts, and each count's share of
# all non-stop-word tokens. If word_list is empty, most_common() yields
# nothing, so the division below never runs with word_total == 0.
for row, (word, count) in enumerate(count_result.most_common(10), start=1):
    sheet_test.write(row, 0, word)
    sheet_test.write(row, 1, count)
    sheet_test.write(row, 2, count / word_total)

book.save('')  # define the location of your excel file