
Commit

27-mar: contents of dl_text
GauravBh1010tt committed Mar 27, 2018
1 parent 61dfaf2 commit 88899b8
Showing 10 changed files with 912 additions and 9 deletions.
2 changes: 2 additions & 0 deletions dl_text/__init__.py
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-

Binary file added dl_text/__init__.pyc
18 changes: 9 additions & 9 deletions dl.py → dl_text/dl.py
@@ -47,11 +47,11 @@ def clean(text):
def tokenize(sent):
return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]

-def process_data(sent_Q,sent_A=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300):
+def process_data(sent_l,sent_r=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300):
    sent1 = []
-    sent1.extend(sent_Q)
-    if sent_A:
-        sent1.extend(sent_A)
+    sent1.extend(sent_l)
+    if sent_r:
+        sent1.extend(sent_r)
# sent1 = [' '.join(i) for i in sent1]
sentence = ["%s %s %s" % (START,x,END) for x in sent1]
tokenize_sent = [regexp_tokenize(x,
@@ -69,7 +69,7 @@ def process_data(sent_Q,sent_A=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300):
for i,sent in enumerate(tokenize_sent):
tokenize_sent[i] = [w if w in word_to_index else unk_token for w in sent]

-    len_train = len(sent_Q)
+    len_train = len(sent_l)
text=[]
for i in tokenize_sent:
text.extend(i)
@@ -93,7 +93,7 @@ def process_data(sent_Q,sent_A=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300):

X_data = np.array(X_data)

-    if sent_A:
+    if sent_r:
for sent in tokenize_sent[len_train:]:
temp = [START for i in range(dimy)]
for ind,word in enumerate(sent[0:dimy]):
@@ -125,9 +125,9 @@ def process_data(sent_Q,sent_A=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300):



-    if sent_A and wordVec_model:
+    if sent_r and wordVec_model:
return X_data,y_data,embedding_matrix
-    elif sent_A:
+    elif sent_r:
return X_data,y_data
elif wordVec_model:
return X_data,embedding_matrix
@@ -160,4 +160,4 @@ def loadGloveModel(glovefile):
model[word] = embedding
print 'Loaded Word2Vec GloVe Model.....'
print len(model), ' words loaded.....'
-    return model
+    return model
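The only functional change in dl.py is the rename of the sent_Q/sent_A arguments to the more generic sent_l/sent_r; the processing logic itself is untouched. A minimal usage sketch under the renamed signature (the sentence lists and keyword values are illustrative, not taken from the repository):

# Illustrative sketch, not part of this commit: calling the relocated
# dl_text.dl.process_data with the renamed left/right sentence arguments.
from dl_text import dl

sent_l = ["what is the capital of france", "who wrote hamlet"]
sent_r = ["paris is the capital of france", "hamlet was written by shakespeare"]

# With sentence pairs but no word-vector model, the elif sent_r branch
# returns only the padded index matrices for the left and right lists.
X_data, y_data = dl.process_data(sent_l, sent_r, dimx=50, dimy=50,
                                 vocab_size=1000, embedding_dim=300)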
Binary file added dl_text/dl.pyc
209 changes: 209 additions & 0 deletions dl_text/hnd_ft.py
@@ -0,0 +1,209 @@
import os
import re
import nltk
import numpy as np
from sklearn import feature_extraction
from tqdm import tqdm


_wnl = nltk.WordNetLemmatizer()


def normalize_word(w):
return _wnl.lemmatize(w).lower()


def get_tokenized_lemmas(s):
return [normalize_word(t) for t in nltk.word_tokenize(s)]


def clean(s):
# Cleans a string: Lowercasing, trimming, removing non-alphanumeric

return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()


def remove_stopwords(l):
# Removes stopwords from a list of tokens
return [w for w in l if w not in feature_extraction.text.ENGLISH_STOP_WORDS]


def gen_or_load_feats(feat_fn, headlines, bodies, feature_file):
if not os.path.isfile(feature_file):
feats = feat_fn(headlines, bodies)
np.save(feature_file, feats)

return np.load(feature_file)




def word_overlap_features(headlines, bodies):
X = []
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
clean_headline = clean(headline)
clean_body = clean(body)
clean_headline = get_tokenized_lemmas(clean_headline)
clean_body = get_tokenized_lemmas(clean_body)
features = [
len(set(clean_headline).intersection(clean_body)) / float(len(set(clean_headline).union(clean_body)))]
X.append(features)
return X


def refuting_features(headlines, bodies):
_refuting_words = [
'fake',
'fraud',
'hoax',
'false',
'deny', 'denies',
# 'refute',
'not',
'despite',
'nope',
'doubt', 'doubts',
'bogus',
'debunk',
'pranks',
'retract'
]
X = []
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
clean_headline = clean(headline)
clean_headline = get_tokenized_lemmas(clean_headline)
features = [1 if word in clean_headline else 0 for word in _refuting_words]
X.append(features)
return X


def polarity_features(headlines, bodies):
_refuting_words = [
'fake',
'fraud',
'hoax',
'false',
'deny', 'denies',
'not',
'despite',
'nope',
'doubt', 'doubts',
'bogus',
'debunk',
'pranks',
'retract'
]

def calculate_polarity(text):
tokens = get_tokenized_lemmas(text)
return sum([t in _refuting_words for t in tokens]) % 2
X = []
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
clean_headline = clean(headline)
clean_body = clean(body)
features = []
features.append(calculate_polarity(clean_headline))
features.append(calculate_polarity(clean_body))
X.append(features)
return np.array(X)


def ngrams(input, n):
input = input.split(' ')
output = []
for i in range(len(input) - n + 1):
output.append(input[i:i + n])
return output


def chargrams(input, n):
output = []
for i in range(len(input) - n + 1):
output.append(input[i:i + n])
return output


def append_chargrams(features, text_headline, text_body, size):
grams = [' '.join(x) for x in chargrams(" ".join(remove_stopwords(text_headline.split())), size)]
grams_hits = 0
grams_early_hits = 0
grams_first_hits = 0
for gram in grams:
if gram in text_body:
grams_hits += 1
if gram in text_body[:255]:
grams_early_hits += 1
if gram in text_body[:100]:
grams_first_hits += 1
features.append(grams_hits)
features.append(grams_early_hits)
features.append(grams_first_hits)
return features


def append_ngrams(features, text_headline, text_body, size):
grams = [' '.join(x) for x in ngrams(text_headline, size)]
grams_hits = 0
grams_early_hits = 0
for gram in grams:
if gram in text_body:
grams_hits += 1
if gram in text_body[:255]:
grams_early_hits += 1
features.append(grams_hits)
features.append(grams_early_hits)
return features


def hand_features(headlines, bodies):

def binary_co_occurence(headline, body):
# Count how many times a token in the title
# appears in the body text.
bin_count = 0
bin_count_early = 0
for headline_token in clean(headline).split(" "):
if headline_token in clean(body):
bin_count += 1
if headline_token in clean(body)[:255]:
bin_count_early += 1
return [bin_count, bin_count_early]

def binary_co_occurence_stops(headline, body):
# Count how many times a token in the title
# appears in the body text. Stopwords in the title
# are ignored.
bin_count = 0
bin_count_early = 0
for headline_token in remove_stopwords(clean(headline).split(" ")):
if headline_token in clean(body):
bin_count += 1
bin_count_early += 1
return [bin_count, bin_count_early]

def count_grams(headline, body):
# Count how many times an n-gram of the title
# appears in the entire body, and intro paragraph

clean_body = clean(body)
clean_headline = clean(headline)
features = []
features = append_chargrams(features, clean_headline, clean_body, 2)
features = append_chargrams(features, clean_headline, clean_body, 8)
features = append_chargrams(features, clean_headline, clean_body, 4)
features = append_chargrams(features, clean_headline, clean_body, 16)
features = append_ngrams(features, clean_headline, clean_body, 2)
features = append_ngrams(features, clean_headline, clean_body, 3)
features = append_ngrams(features, clean_headline, clean_body, 4)
features = append_ngrams(features, clean_headline, clean_body, 5)
features = append_ngrams(features, clean_headline, clean_body, 6)
return features

X = []
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
X.append(binary_co_occurence(headline, body)
+ binary_co_occurence_stops(headline, body)
+ count_grams(headline, body))


return X
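dl_text/hnd_ft.py adds hand-crafted headline/body features: lemma overlap, refuting-word indicators, a parity-based polarity score, and character/word n-gram hit counts, with gen_or_load_feats caching any computed feature matrix to disk via np.save and reloading it on later runs. A minimal sketch of wiring hand_features through that cache (the example strings and the cache file name are illustrative):

# Illustrative sketch, not part of this commit: computing and caching the
# hand-crafted co-occurrence and n-gram features for one headline/body pair.
from dl_text import hnd_ft

headlines = ["Robert Plant ripped up Led Zeppelin reunion contract"]
bodies = ["Led Zeppelin singer Robert Plant reportedly turned down a reunion offer."]

# gen_or_load_feats only recomputes when the file is missing; an explicit
# .npy extension keeps np.save and np.load pointing at the same path.
feats = hnd_ft.gen_or_load_feats(hnd_ft.hand_features, headlines, bodies,
                                 "hand_feats.npy")
print feats.shape  # one row of counts per headline/body pair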
