-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathstops.py
61 lines (53 loc) · 1.83 KB
/
stops.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import nltk
import string
import re
from nltk.probability import FreqDist
from cltk.tokenize.indian_tokenizer import indian_punctuation_tokenize_regex as i_word
import os
# Directory of the first corpus. The trailing slash matters because file names
# are appended to this value with plain string concatenation.
path = "./datasets/dnyaneshwari/"
# Characters that are deleted from the corpus text before tokenising.
# The literal is raw so that the backslash is taken literally: in a non-raw
# string "\," is an invalid escape sequence (SyntaxWarning on Python 3.12+).
# Of the five leading quotes, the first three open the literal and the next
# two are part of the value, so the value is unchanged from before.
punctuation = r'''''!()-[]{};:'"\,<>./?@#$%^&*_~'''
# NOTE(review): these are two ASCII pipe characters; confirm whether the
# Devanagari danda marks were meant to be stripped as well.
extra_punctuation = '||'
def _remove_stale_file(file_path):
    """Delete *file_path* when it exists as a regular file; otherwise do nothing."""
    if os.path.isfile(file_path):
        os.remove(file_path)


def _strip_punctuation(text):
    """Return *text* without any character listed in the punctuation constants."""
    # A single translate() pass replaces the per-character string rebuilding,
    # which was quadratic in the size of the corpus.
    return text.translate(str.maketrans('', '', punctuation + extra_punctuation))


def _corpus_words(directory):
    """Return the list of words found in the files directly inside *directory*.

    Every regular file is read as UTF-8, stripped of punctuation, and the
    cleaned texts are concatenated and handed to the tokenizer. Each token is
    then split on line boundaries, and empty entries are dropped.

    Raises:
        OSError: if *directory* cannot be listed or a file cannot be read.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    cleaned_parts = []
    # Sorted so the result does not depend on the order in which the file
    # system happens to list the directory (this also fixes tie-breaking in
    # the frequency ranking). Sub-directories are skipped instead of crashing.
    for file_name in sorted(os.listdir(directory)):
        full_path = os.path.join(directory, file_name)
        if not os.path.isfile(full_path):
            continue
        # Explicit encoding: do not rely on the platform default for
        # Devanagari text. The context manager closes the handle promptly.
        with open(full_path, encoding="utf-8") as source:
            cleaned_parts.append(_strip_punctuation(source.read()))
    tokens = i_word(''.join(cleaned_parts))
    # The script used to write the tokens, one per line, to a scratch file in
    # the dataset directory and read that file back with splitlines(). The
    # writing handle was still open and unflushed at read time, so buffered
    # words could be missing. The same split is done in memory here.
    # Blank lines in the corpus yield empty strings, which are not words and
    # must not be counted.
    return [word for word in '\n'.join(tokens).splitlines() if word]


# Left-overs from earlier runs. The scratch file used to live inside the
# dataset directory and would otherwise be read as part of the corpus.
_remove_stale_file("./stop_words.txt")
_remove_stale_file(path + "dnyaneshwari.txt")
dnyaneshwari_words = _corpus_words(path)

path = "./datasets/haripath/"
_remove_stale_file(path + "haripath.txt")
haripath_words = _corpus_words(path)

words = dnyaneshwari_words + haripath_words
fdist = FreqDist(words)
# The 100 most frequent words, one per line, most frequent first.
common_words = '\n'.join(word for word, _count in fdist.most_common(100))
print(common_words)
# Opened with 'w' so every run replaces the list. In append mode repeated runs
# stacked duplicate lists, because the clean-up above targets a differently
# named file.
# NOTE(review): confirm which of "./stop_words.txt" and this absolute path is
# the intended output location.
with open('/home/mahesh/Mahesh/marathi_text_wikisource/stops_words.txt', 'w',
          encoding="utf-8") as output:
    output.write(common_words)