-
Notifications
You must be signed in to change notification settings - Fork 1
/
util.py
108 lines (78 loc) · 3.94 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import pandas as pd
#imports to remove stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
#import to do word stemming
from nltk.stem.snowball import EnglishStemmer
#imports to do wordcloud visualization of features
import matplotlib.pyplot as plt
from wordcloud import WordCloud
#for tf/idf
from sklearn.feature_extraction.text import TfidfVectorizer
def cleaning_data(true_df_path, fake_df_path, attribute_to_use, bert = True):
"""
Helper function to pre-process data (lower case, remove article source, removal of punctation & stop words).
Inputs:
true_df_path: (str) path to csv file containing articles labeled as 'True'
fake_df_path: (str) path to csv file containing articles labeled as 'False'
Returns a pd dataframe with clean strings
"""
#load in data
fake_df = pd.read_csv(fake_df_path, usecols= [attribute_to_use])
true_df = pd.read_csv(true_df_path, usecols= [attribute_to_use])
#add labels
fake_df['target'] = 0
true_df['target'] = 1
#remove newspaper source (Reuters) from the "true" articles before merging
if attribute_to_use == 'text':
true_df[attribute_to_use] = true_df[attribute_to_use].replace(r'\A.*\(Reuters\)', '', regex=True)
# i.e. if we are using titles
else:
true_df[attribute_to_use] = true_df[attribute_to_use].replace(r'Reuters', ' ', regex=True)
# Merge fake and true articles into one dataset
data = true_df.append(fake_df).sample(frac=1).reset_index().drop(columns=['index'])
if not bert:
#remove all punctuation, single letters, and digits
data[attribute_to_use] = data[attribute_to_use].replace(r'[^\w\s]', ' ', regex=True).str.lower()
data[attribute_to_use] = data[attribute_to_use].replace(r'\s\w\s', ' ', regex=True)
data[attribute_to_use] = data[attribute_to_use].replace(r'\d+\w*', ' ', regex=True)
#remove stop words
data[attribute_to_use] = data[attribute_to_use].apply(lambda words: ' '.join\
(word for word in words.split() if word not in stop))
return data
def stemming(pd_series):
    """
    Replaces every word of every observation with its stem.

    Input:
        pd_series: (pd series) a pandas series where each observation is one
            article held as a single string
    Returns: (pd series) same observations, each rebuilt as its stemmed words
        joined by single spaces
    """
    english_stemmer = EnglishStemmer()

    def _stem_article(article):
        # split on whitespace, stem token by token, glue back together
        return ' '.join(map(english_stemmer.stem, article.split()))

    return pd_series.apply(_stem_article)
def wordcloud(series_of_strings, max_words):
    """
    Visualizes a pandas series of strings as a cloud of words.

    Inputs:
        series_of_strings: (pd series) observations to visualize, one string
            each; a single plain string is also accepted and used as-is
        max_words: maximum number of words to draw, forwarded to WordCloud
    Returns: None. The figure is displayed with plt.show().
    """
    if isinstance(series_of_strings, str):
        text = series_of_strings
    else:
        # Join the actual observations. Calling str() on a pandas series
        # returns its display repr (rows and long cells truncated, plus index
        # labels and a dtype footer), which would feed the wrong words into
        # the cloud. Non-string entries such as NaN are skipped.
        text = ' '.join(
            entry for entry in series_of_strings if isinstance(entry, str))

    word_cloud = WordCloud(collocations=False, background_color='white',
                           max_words=max_words, max_font_size=100, width=800,
                           height=400).generate(text)
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
def tf_idf_vectorizer(train_attribute, val_attribute, test_attribute, max_features):
    """
    Produce TF-IDF vectors for the training, validation, and testing data.

    The vectorizer is fitted on the training data only; the validation and
    testing data are transformed with that already-fitted vectorizer.

    Returns: (fitted vectorizer, train matrix, validation matrix, test matrix)
    """
    # ngram_range is left at TfidfVectorizer's default, i.e. single words only.
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')

    # Learn the vocabulary and IDF weights from the training data and build
    # its document-term matrix in the same call.
    train_matrix = vectorizer.fit_transform(train_attribute)

    # Reuse the learned vocabulary and document frequencies for the other
    # two subsets (validation first, then test).
    val_matrix, test_matrix = [
        vectorizer.transform(subset)
        for subset in (val_attribute, test_attribute)
    ]

    return vectorizer, train_matrix, val_matrix, test_matrix