Commit 88899b8, committed by GauravBh1010tt on Mar 27, 2018
1 parent: 61dfaf2
Showing 10 changed files with 912 additions and 9 deletions.
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,209 @@
import os
import re
import nltk
import numpy as np
from sklearn import feature_extraction
from tqdm import tqdm

# Note: nltk.word_tokenize and WordNetLemmatizer require the NLTK 'punkt'
# and 'wordnet' data packages (nltk.download('punkt'), nltk.download('wordnet')).

_wnl = nltk.WordNetLemmatizer()
def normalize_word(w):
    # Lemmatize a token and lowercase it.
    return _wnl.lemmatize(w).lower()


def get_tokenized_lemmas(s):
    # Tokenize a string and normalize every token.
    return [normalize_word(t) for t in nltk.word_tokenize(s)]
def clean(s):
    # Cleans a string: lowercasing, trimming, removing non-alphanumeric characters.
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()


def remove_stopwords(l):
    # Removes stopwords from a list of tokens.
    return [w for w in l if w not in feature_extraction.text.ENGLISH_STOP_WORDS]
def gen_or_load_feats(feat_fn, headlines, bodies, feature_file):
    # Compute features with feat_fn and cache them on disk; later calls
    # load the cached array instead of recomputing. Note that np.save
    # appends ".npy" when the filename lacks that extension, so
    # feature_file should already end in ".npy".
    if not os.path.isfile(feature_file):
        feats = feat_fn(headlines, bodies)
        np.save(feature_file, feats)

    return np.load(feature_file)
def word_overlap_features(headlines, bodies):
    # One feature per pair: Jaccard overlap between the lemmatized
    # headline token set and the lemmatized body token set.
    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        clean_headline = clean(headline)
        clean_body = clean(body)
        clean_headline = get_tokenized_lemmas(clean_headline)
        clean_body = get_tokenized_lemmas(clean_body)
        features = [
            len(set(clean_headline).intersection(clean_body)) / float(len(set(clean_headline).union(clean_body)))]
        X.append(features)
    return X
def refuting_features(headlines, bodies):
    # Binary indicator per refuting word: 1 if the word appears among
    # the lemmatized headline tokens, 0 otherwise.
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        # 'refute',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]
    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        clean_headline = clean(headline)
        clean_headline = get_tokenized_lemmas(clean_headline)
        features = [1 if word in clean_headline else 0 for word in _refuting_words]
        X.append(features)
    return X
def polarity_features(headlines, bodies):
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]

    def calculate_polarity(text):
        # Parity of the refuting-word count: an odd number of refuting
        # words yields 1 (negative polarity); an even count, where
        # negations cancel out, yields 0.
        tokens = get_tokenized_lemmas(text)
        return sum([t in _refuting_words for t in tokens]) % 2

    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        clean_headline = clean(headline)
        clean_body = clean(body)
        features = []
        features.append(calculate_polarity(clean_headline))
        features.append(calculate_polarity(clean_body))
        X.append(features)
    return np.array(X)
def ngrams(text, n):
    # Token n-grams: every window of n consecutive whitespace-split tokens.
    # (Parameter renamed to avoid shadowing the `input` builtin.)
    tokens = text.split(' ')
    output = []
    for i in range(len(tokens) - n + 1):
        output.append(tokens[i:i + n])
    return output


def chargrams(text, n):
    # Character n-grams: every substring of length n.
    output = []
    for i in range(len(text) - n + 1):
        output.append(text[i:i + n])
    return output
def append_chargrams(features, text_headline, text_body, size):
    # Count stopword-filtered headline chargrams that appear anywhere in
    # the body, in its first 255 characters, and in its first 100 characters.
    # Note: ' '.join(x) inserts spaces between the characters of each
    # chargram, so a gram like "ab" is matched against the body as "a b".
    grams = [' '.join(x) for x in chargrams(" ".join(remove_stopwords(text_headline.split())), size)]
    grams_hits = 0
    grams_early_hits = 0
    grams_first_hits = 0
    for gram in grams:
        if gram in text_body:
            grams_hits += 1
        if gram in text_body[:255]:
            grams_early_hits += 1
        if gram in text_body[:100]:
            grams_first_hits += 1
    features.append(grams_hits)
    features.append(grams_early_hits)
    features.append(grams_first_hits)
    return features


def append_ngrams(features, text_headline, text_body, size):
    # Count headline token n-grams that appear anywhere in the body and
    # in its first 255 characters.
    grams = [' '.join(x) for x in ngrams(text_headline, size)]
    grams_hits = 0
    grams_early_hits = 0
    for gram in grams:
        if gram in text_body:
            grams_hits += 1
        if gram in text_body[:255]:
            grams_early_hits += 1
    features.append(grams_hits)
    features.append(grams_early_hits)
    return features
def hand_features(headlines, bodies):

    def binary_co_occurence(headline, body):
        # Count how many times a token in the title
        # appears in the body text, and in its first 255 characters.
        bin_count = 0
        bin_count_early = 0
        clean_body = clean(body)
        for headline_token in clean(headline).split(" "):
            if headline_token in clean_body:
                bin_count += 1
            if headline_token in clean_body[:255]:
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def binary_co_occurence_stops(headline, body):
        # Count how many times a token in the title
        # appears in the body text. Stopwords in the title
        # are ignored; the early count checks only the first
        # 255 characters of the body, as in binary_co_occurence.
        bin_count = 0
        bin_count_early = 0
        clean_body = clean(body)
        for headline_token in remove_stopwords(clean(headline).split(" ")):
            if headline_token in clean_body:
                bin_count += 1
                if headline_token in clean_body[:255]:
                    bin_count_early += 1
        return [bin_count, bin_count_early]

    def count_grams(headline, body):
        # Count how many times an n-gram of the title
        # appears in the entire body, and in the intro paragraph.
        clean_body = clean(body)
        clean_headline = clean(headline)
        features = []
        features = append_chargrams(features, clean_headline, clean_body, 2)
        features = append_chargrams(features, clean_headline, clean_body, 8)
        features = append_chargrams(features, clean_headline, clean_body, 4)
        features = append_chargrams(features, clean_headline, clean_body, 16)
        features = append_ngrams(features, clean_headline, clean_body, 2)
        features = append_ngrams(features, clean_headline, clean_body, 3)
        features = append_ngrams(features, clean_headline, clean_body, 4)
        features = append_ngrams(features, clean_headline, clean_body, 5)
        features = append_ngrams(features, clean_headline, clean_body, 6)
        return features

    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        X.append(binary_co_occurence(headline, body)
                 + binary_co_occurence_stops(headline, body)
                 + count_grams(headline, body))

    return X
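
A minimal usage sketch of these extractors, assuming the new module is saved as feature_engineering.py (the scraped page does not show file names); the sample headline/body pairs, cache file names, and column ordering are illustrative only, not part of this commit:

import numpy as np
from feature_engineering import (gen_or_load_feats, word_overlap_features,
                                 refuting_features, polarity_features,
                                 hand_features)

headlines = ["Scientists deny viral photo is real",
             "New planet discovered by astronomers"]
bodies = ["Researchers called the viral photo a hoax and a fraud.",
          "Astronomers report the discovery of a new planet outside the solar system."]

# Each extractor caches its output to its own .npy file on first use.
X_overlap = gen_or_load_feats(word_overlap_features, headlines, bodies, "overlap.npy")
X_refuting = gen_or_load_feats(refuting_features, headlines, bodies, "refuting.npy")
X_polarity = gen_or_load_feats(polarity_features, headlines, bodies, "polarity.npy")
X_hand = gen_or_load_feats(hand_features, headlines, bodies, "hand.npy")

# Per pair: 1 overlap + 15 refuting + 2 polarity + 26 hand-crafted = 44 features.
X = np.c_[X_overlap, X_refuting, X_polarity, X_hand]
print(X.shape)  # (2, 44)

Because gen_or_load_feats caches each feature family separately, deleting a single .npy file forces only that family to be recomputed.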