-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleaner_dataset.py
38 lines (29 loc) · 1.04 KB
/
cleaner_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from clean import cleaner, remove_accent, remove_genre
from unidecode import unidecode
POSITIVE_PATH = "dataset/positivas.txt"
NEGATIVE_PATH = "dataset/negativas.txt"
DATASET_PATH = "dataset/dataset.csv"
DELIMITER = ","
POSITIVE_NUMBER = 1
NEGATIVE_NUMBER = 0
positives = set()
negatives = set()
def read_file(path, words):
with open(path, "r") as file:
for word in file:
word = cleaner(word)
accentless_word = unidecode(word)
words.add(word)
words.add(accentless_word)
words.add(remove_genre(word))
words.add(remove_genre(accentless_word))
def write_dataset(file, words, number):
words.remove("")
for word in words:
file.write(f"{number}{DELIMITER}{word}\n")
read_file(POSITIVE_PATH, positives)
read_file(NEGATIVE_PATH, negatives)
with open(DATASET_PATH, "w") as dataset_file:
dataset_file.write(f"sentiment{DELIMITER}sentence\n")
write_dataset(dataset_file, positives, POSITIVE_NUMBER)
write_dataset(dataset_file, negatives, NEGATIVE_NUMBER)