forked from rasto2211/Twitter-User-Gender-Classification
-
Notifications
You must be signed in to change notification settings - Fork 0
/
logistic_reg.py
56 lines (39 loc) · 1.91 KB
/
logistic_reg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import numpy as np
from common import load_data
from common import load_data_split
from common import encode_class_labels
from common import report_results
from common import extract_feats_from_text
from common import extract_feats_from_text_and_desc
from common import extract_tweet_count_feats
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
JOBS = 4
PARAMS = [{'penalty':["l1", "l2"],
'C': [4, 2, 1.5, 1, 0.5, 0.1, 0.05, 0.01, 0.001, 0.0001]}]
df = load_data()
train_rows, test_rows = load_data_split()
y_train, y_test, class_names = \
encode_class_labels(train_rows, test_rows, df)
print("Features only from Text")
X_train, X_test = extract_feats_from_text(df, train_rows, test_rows)
grid_search = GridSearchCV(LogisticRegression(), PARAMS, n_jobs=JOBS, verbose=5, cv=4,
scoring="f1")
grid_search.fit(X_train, y_train)
report_results(grid_search, y_train, X_train, y_test, X_test, class_names)
print("Features from tweet text and description")
X_train, X_test = extract_feats_from_text_and_desc(df, train_rows, test_rows)
grid_search = GridSearchCV(LogisticRegression(), PARAMS, n_jobs=JOBS, verbose=5, cv=4,
scoring="f1")
grid_search.fit(X_train, y_train)
report_results(grid_search, y_train, X_train, y_test, X_test, class_names)
print("Features from tweet text, description, retweet count, tweet count"
" and number of favorite tweets of the user")
tweet_feats_train, tweet_feats_test = extract_tweet_count_feats(df, train_rows, test_rows)
X_train = hstack((X_train, tweet_feats_train))
X_test = hstack((X_test, tweet_feats_test))
grid_search = GridSearchCV(LogisticRegression(), PARAMS, n_jobs=JOBS, verbose=5, cv=4,
scoring="f1")
grid_search.fit(X_train, y_train)
report_results(grid_search, y_train, X_train, y_test, X_test, class_names)