-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreProcessamento.py
164 lines (113 loc) · 3.2 KB
/
preProcessamento.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# coding: utf-8
#!/usr/bin/python
import nltk
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
import string
import gensim
from gensim import corpora
import re
import unicodedata
##### esse código será responsável por fazer a modelagem de tópicos no nosso trabalho
def leitura(arquivo, posicao):
    """Read a tab-separated file and return the column at index ``posicao``.

    Parameters:
        arquivo: path of the TSV file to read.
        posicao: zero-based column index to extract from each line
                 (column 6 for the Reclame Aqui base used in __main__;
                 a different index is needed for the Twitter base).

    Returns:
        list of str: one entry per line of the file, in file order.
    """
    # Context manager guarantees the file is closed even on error.
    with open(arquivo, "r") as arq:
        linhas = arq.read().split('\n')
    data = []
    for linha in linhas:
        campos = linha.split('\t')
        # Skip lines without enough columns (e.g. the empty line produced
        # by a trailing newline), which previously raised IndexError.
        if posicao < len(campos):
            data.append(campos[posicao])
    return data
def preProcessamento(info):
    """Lowercase ``info``, drop stop words, then strip punctuation chars.

    Relies on the module-level globals ``stop`` (stop-word collection)
    and ``exclude`` (punctuation characters) defined in __main__.
    """
    # Keep only the lowercased tokens that are not stop words.
    tokens = [palavra for palavra in info.lower().split() if palavra not in stop]
    sem_stop = " ".join(tokens)
    # Filter out every punctuation character from the rejoined text.
    return ''.join(c for c in sem_stop if c not in exclude)
def removeWords(adjectives, text):
    """Remove, in place, every token that occurs inside the ``adjectives`` string.

    Parameters:
        adjectives: a single string of words (as produced by readWords);
                    a token is removed when it appears as a substring of it.
        text: list of tokenized documents (list of lists of str); mutated.

    Returns:
        the same ``text`` list, with matching tokens removed.
    """
    for row in text:
        # Plain substring test replaces the original
        # re.finditer(word, adjectives): identical matches for ordinary
        # words, but no re.error crash when a token contains regex
        # metacharacters such as '(' or '*'.
        row[:] = [w for w in row if w not in adjectives]
    return text
def readWords(file_name):
    """Return the entire contents of ``file_name`` as a single string.

    Uses a context manager so the file handle is released even if
    reading raises.
    """
    with open(file_name, 'r') as text_file:
        return text_file.read()
def removeNumbers(info):
    """Remove, in place, every token made up exclusively of ASCII digits.

    Parameters:
        info: list of tokenized documents (list of lists of str); mutated.

    Returns:
        the same ``info`` list, with purely numeric tokens removed.
    """
    # Compile once instead of re-matching the pattern inside the loop;
    # [0-9] (not \d) keeps the original ASCII-only semantics.
    so_digitos = re.compile(r'^[0-9]+$')
    for row in info:
        # Rebuilding the row avoids the fragile pop-while-indexing loop.
        row[:] = [w for w in row if not so_digitos.match(w)]
    return info
def stemming(sentence):
    # Stem each token of a tokenized sentence with NLTK's RSLP stemmer
    # (a stemmer specific to Portuguese); returns a new list of stems.
    # NOTE(review): Python 2 only — uses the `unicode` builtin, which
    # raises NameError under Python 3. Currently disabled in __main__.
    stemmer = RSLPStemmer()
    phrase = []
    for word in sentence:
        word = unicode(word, 'utf-8')
        # NFKD-decompose so accented characters normalize consistently
        # before stemming.
        word = unicodedata.normalize("NFKD", word)
        phrase.append(stemmer.stem(word.lower()))
    return phrase
def saida(info):
    # Write the tokenized documents to the file "baseReclamacoes":
    # one document per line, tokens separated by single spaces, with no
    # trailing space at end of line and no trailing newline at end of file.
    # NOTE(review): Python 2 only — uses the `unicode` builtin and writes
    # UTF-8-encoded byte strings to a text-mode file.
    arq_saida = open("baseReclamacoes", "w")
    aux = 0   # index of the current document within `info`
    aux2 = 0  # index of the current token within the current document
    for i in info:
        aux2 = 0
        for j in i:
            j = unicode(j, 'utf-8')
            # NFKD-normalize accents before encoding back to UTF-8.
            j = unicodedata.normalize("NFKD", j)
            # Separator space only between tokens, not after the last one.
            if aux2<len(info[aux])-1:arq_saida.write(u''.join(j).encode('utf-8')+' ')
            else:arq_saida.write(u''.join(j).encode('utf-8'))
            aux2 = aux2+1
        # Newline only between documents, not after the last one.
        if aux<len(info)-1:arq_saida.write('\n')
        aux=aux+1
    arq_saida.close()
def removeLenWord(info, limiar):
    """Remove, in place, every token whose length is <= ``limiar``.

    Parameters:
        info: list of tokenized documents (list of lists of str); mutated.
        limiar: inclusive length threshold — tokens of exactly this
                length are also removed.

    Returns:
        the same ``info`` list, with short tokens removed.
    """
    for row in info:
        # Rebuilding the row replaces the fragile pop-while-indexing loop.
        row[:] = [w for w in row if len(w) > limiar]
    return info
if __name__ == '__main__':
    # Pipeline: read the complaints base, normalize and strip punctuation,
    # stop words, adjectives, numbers and short tokens, then dump the
    # cleaned corpus to disk for later topic modelling.
    info = leitura("reclameAqui.txt", 6)
    stop = set(stopwords.words('portuguese'))
    exclude = set(string.punctuation)
    # Extra markup/tokens not covered by string.punctuation (HTML break
    # tag fragment, ellipsis, retweet marker, ...).
    pontuacao = ['<br', '.', ',', '?', '!', '(', ')', ':', '-', '...', '<', '>', 'RT']
    for i in range(0, len(info)):
        for j in pontuacao:
            info[i] = info[i].replace(j, ' ')
    info = [preProcessamento(data).split() for data in info]  # fully pre-processed tokens
    adjectives = readWords('lista_adjetivos')
    # NOTE(review): `stop` is rebound here from the NLTK stop-word set to
    # the raw contents of the 'stop_words' file; the NLTK set is used only
    # by preProcessamento above.
    stop = readWords('stop_words')
    info = removeWords(adjectives, info)  # drop adjectives
    info = removeWords(stop, info)        # drop stop words from the file list
    info = removeNumbers(info)
    info = removeLenWord(info, 3)
    # Stemming step kept disabled (Python 2 only; see stemming()).
    #for i in range(0, len(info)):
    #    info[i] = stemming(info[i])
    saida(info)