
Commit

27-mar: contents of dl_text
GauravBh1010tt committed Mar 27, 2018
1 parent 61dfaf2 commit 88899b8
Showing 10 changed files with 912 additions and 9 deletions.
2 changes: 2 additions & 0 deletions dl_text/__init__.py
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-

Binary file added dl_text/__init__.pyc
18 changes: 9 additions & 9 deletions dl.py → dl_text/dl.py
@@ -47,11 +47,11 @@ def clean(text):
def tokenize(sent):
return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]

-def process_data(sent_Q,sent_A=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300):
+def process_data(sent_l,sent_r=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300):
    sent1 = []
-    sent1.extend(sent_Q)
-    if sent_A:
-        sent1.extend(sent_A)
+    sent1.extend(sent_l)
+    if sent_r:
+        sent1.extend(sent_r)
# sent1 = [' '.join(i) for i in sent1]
sentence = ["%s %s %s" % (START,x,END) for x in sent1]
tokenize_sent = [regexp_tokenize(x,
@@ -69,7 +69,7 @@ def process_data(sent_Q,sent_A=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300):
for i,sent in enumerate(tokenize_sent):
tokenize_sent[i] = [w if w in word_to_index else unk_token for w in sent]

-    len_train = len(sent_Q)
+    len_train = len(sent_l)
text=[]
for i in tokenize_sent:
text.extend(i)
@@ -93,7 +93,7 @@ def process_data(sent_Q,sent_A=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300):

X_data = np.array(X_data)

-    if sent_A:
+    if sent_r:
for sent in tokenize_sent[len_train:]:
temp = [START for i in range(dimy)]
for ind,word in enumerate(sent[0:dimy]):
@@ -125,9 +125,9 @@ def process_data(sent_Q,sent_A=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300):



-    if sent_A and wordVec_model:
+    if sent_r and wordVec_model:
return X_data,y_data,embedding_matrix
-    elif sent_A:
+    elif sent_r:
return X_data,y_data
elif wordVec_model:
return X_data,embedding_matrix
@@ -160,4 +160,4 @@ def loadGloveModel(glovefile):
model[word] = embedding
print 'Loaded Word2Vec GloVe Model.....'
print len(model), ' words loaded.....'
-    return model
+    return model
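The only functional change in dl.py is the rename of the sent_Q/sent_A arguments to the more generic sent_l/sent_r; the processing logic itself is untouched. A minimal usage sketch under the renamed signature (the sentence lists and keyword values are illustrative, not taken from the repository):

# Illustrative sketch, not part of this commit: calling the relocated
# dl_text.dl.process_data with the renamed left/right sentence arguments.
from dl_text import dl

sent_l = ["what is the capital of france", "who wrote hamlet"]
sent_r = ["paris is the capital of france", "hamlet was written by shakespeare"]

# With sentence pairs but no word-vector model, the elif sent_r branch
# returns only the padded index matrices for the left and right lists.
X_data, y_data = dl.process_data(sent_l, sent_r, dimx=50, dimy=50,
                                 vocab_size=1000, embedding_dim=300)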
Binary file added dl_text/dl.pyc
209 changes: 209 additions & 0 deletions dl_text/hnd_ft.py
@@ -0,0 +1,209 @@
import os
import re
import nltk
import numpy as np
from sklearn import feature_extraction
from tqdm import tqdm


_wnl = nltk.WordNetLemmatizer()


def normalize_word(w):
return _wnl.lemmatize(w).lower()


def get_tokenized_lemmas(s):
return [normalize_word(t) for t in nltk.word_tokenize(s)]


def clean(s):
# Cleans a string: Lowercasing, trimming, removing non-alphanumeric

return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()


def remove_stopwords(l):
# Removes stopwords from a list of tokens
return [w for w in l if w not in feature_extraction.text.ENGLISH_STOP_WORDS]


def gen_or_load_feats(feat_fn, headlines, bodies, feature_file):
if not os.path.isfile(feature_file):
feats = feat_fn(headlines, bodies)
np.save(feature_file, feats)

return np.load(feature_file)




def word_overlap_features(headlines, bodies):
X = []
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
clean_headline = clean(headline)
clean_body = clean(body)
clean_headline = get_tokenized_lemmas(clean_headline)
clean_body = get_tokenized_lemmas(clean_body)
features = [
len(set(clean_headline).intersection(clean_body)) / float(len(set(clean_headline).union(clean_body)))]
X.append(features)
return X


def refuting_features(headlines, bodies):
_refuting_words = [
'fake',
'fraud',
'hoax',
'false',
'deny', 'denies',
# 'refute',
'not',
'despite',
'nope',
'doubt', 'doubts',
'bogus',
'debunk',
'pranks',
'retract'
]
X = []
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
clean_headline = clean(headline)
clean_headline = get_tokenized_lemmas(clean_headline)
features = [1 if word in clean_headline else 0 for word in _refuting_words]
X.append(features)
return X


def polarity_features(headlines, bodies):
_refuting_words = [
'fake',
'fraud',
'hoax',
'false',
'deny', 'denies',
'not',
'despite',
'nope',
'doubt', 'doubts',
'bogus',
'debunk',
'pranks',
'retract'
]

def calculate_polarity(text):
tokens = get_tokenized_lemmas(text)
return sum([t in _refuting_words for t in tokens]) % 2
X = []
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
clean_headline = clean(headline)
clean_body = clean(body)
features = []
features.append(calculate_polarity(clean_headline))
features.append(calculate_polarity(clean_body))
X.append(features)
return np.array(X)


def ngrams(input, n):
input = input.split(' ')
output = []
for i in range(len(input) - n + 1):
output.append(input[i:i + n])
return output


def chargrams(input, n):
output = []
for i in range(len(input) - n + 1):
output.append(input[i:i + n])
return output


def append_chargrams(features, text_headline, text_body, size):
grams = [' '.join(x) for x in chargrams(" ".join(remove_stopwords(text_headline.split())), size)]
grams_hits = 0
grams_early_hits = 0
grams_first_hits = 0
for gram in grams:
if gram in text_body:
grams_hits += 1
if gram in text_body[:255]:
grams_early_hits += 1
if gram in text_body[:100]:
grams_first_hits += 1
features.append(grams_hits)
features.append(grams_early_hits)
features.append(grams_first_hits)
return features


def append_ngrams(features, text_headline, text_body, size):
grams = [' '.join(x) for x in ngrams(text_headline, size)]
grams_hits = 0
grams_early_hits = 0
for gram in grams:
if gram in text_body:
grams_hits += 1
if gram in text_body[:255]:
grams_early_hits += 1
features.append(grams_hits)
features.append(grams_early_hits)
return features


def hand_features(headlines, bodies):

def binary_co_occurence(headline, body):
# Count how many times a token in the title
# appears in the body text.
bin_count = 0
bin_count_early = 0
for headline_token in clean(headline).split(" "):
if headline_token in clean(body):
bin_count += 1
if headline_token in clean(body)[:255]:
bin_count_early += 1
return [bin_count, bin_count_early]

def binary_co_occurence_stops(headline, body):
# Count how many times a token in the title
# appears in the body text. Stopwords in the title
# are ignored.
bin_count = 0
bin_count_early = 0
for headline_token in remove_stopwords(clean(headline).split(" ")):
if headline_token in clean(body):
bin_count += 1
bin_count_early += 1
return [bin_count, bin_count_early]

def count_grams(headline, body):
# Count how many times an n-gram of the title
# appears in the entire body, and intro paragraph

clean_body = clean(body)
clean_headline = clean(headline)
features = []
features = append_chargrams(features, clean_headline, clean_body, 2)
features = append_chargrams(features, clean_headline, clean_body, 8)
features = append_chargrams(features, clean_headline, clean_body, 4)
features = append_chargrams(features, clean_headline, clean_body, 16)
features = append_ngrams(features, clean_headline, clean_body, 2)
features = append_ngrams(features, clean_headline, clean_body, 3)
features = append_ngrams(features, clean_headline, clean_body, 4)
features = append_ngrams(features, clean_headline, clean_body, 5)
features = append_ngrams(features, clean_headline, clean_body, 6)
return features

X = []
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
X.append(binary_co_occurence(headline, body)
+ binary_co_occurence_stops(headline, body)
+ count_grams(headline, body))


return X
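dl_text/hnd_ft.py adds hand-crafted headline/body features: lemma overlap, refuting-word indicators, a parity-based polarity score, and character/word n-gram hit counts, with gen_or_load_feats caching any computed feature matrix to disk via np.save and reloading it on later runs. A minimal sketch of wiring hand_features through that cache (the example strings and the cache file name are illustrative):

# Illustrative sketch, not part of this commit: computing and caching the
# hand-crafted co-occurrence and n-gram features for one headline/body pair.
from dl_text import hnd_ft

headlines = ["Robert Plant ripped up Led Zeppelin reunion contract"]
bodies = ["Led Zeppelin singer Robert Plant reportedly turned down a reunion offer."]

# gen_or_load_feats only recomputes when the file is missing; an explicit
# .npy extension keeps np.save and np.load pointing at the same path.
feats = hnd_ft.gen_or_load_feats(hnd_ft.hand_features, headlines, bodies,
                                 "hand_feats.npy")
print feats.shape  # one row of counts per headline/body pair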
