-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmost_simple_way.py
47 lines (34 loc) · 1.34 KB
/
most_simple_way.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# NLP baseline: TF-IDF text features fed to a random-forest classifier.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from icecream import ic
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
# --- Configuration -----------------------------------------------------------
# CSV that supplies both the text column (WORDS_COLUMN) and the 'label' column.
TRAIN_CORPUS = './train_after_analysis.csv'
# Whitespace-separated word list; only the first STOP_WORDS_SIZE entries are used.
STOP_WORDS = './stopwords.txt'
WORDS_COLUMN = 'words_keep'

# --- Load the corpus ---------------------------------------------------------
content = pd.read_csv(TRAIN_CORPUS)
corpus = content[WORDS_COLUMN].values

STOP_WORDS_SIZE = 100
WORDS_LONG_TAIL_BEGIN = 10000
# Vocabulary cap passed to the vectorizer as max_features.
WORDS_SIZE = WORDS_LONG_TAIL_BEGIN - STOP_WORDS_SIZE

# Read the stop words inside a context manager so the file handle is closed
# as soon as the list has been built.
# NOTE(review): the file is read with the platform default encoding; pass an
# explicit encoding= once the file's actual encoding is confirmed.
with open(STOP_WORDS) as stop_words_file:
    stop_words = stop_words_file.read().split()[:STOP_WORDS_SIZE]

# --- Vectorize ---------------------------------------------------------------
tfidf = TfidfVectorizer(max_features=WORDS_SIZE, stop_words=stop_words)
text_vectors = tfidf.fit_transform(corpus)
print(text_vectors.shape)

# --- Train / evaluate --------------------------------------------------------
targets = content['label']
# Hold out 20% of the rows for evaluation; the split itself is seeded.
x_train, x_test, y_train, y_test = train_test_split(
    text_vectors, targets, test_size=0.2, random_state=0
)

# NOTE(review): the classifier is built without a random_state, so the metrics
# below may differ from run to run even though the split is fixed.
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

# Predict once and reuse the result for every metric instead of re-running
# inference on the test set for each score.
y_pred = rf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')

ic(f1)
ic(precision)
ic(recall)
ic(accuracy)