# code/15_natural_language_processing_nb.py

# # Natural Language Processing (NLP)


# ## Part 1: Introduction
# *Adapted from [NLP Crash Course](http://files.meetup.com/7616132/DC-NLP-2013-09%20Charlie%20Greenbacker.pdf) by Charlie Greenbacker and [Introduction to NLP](http://spark-public.s3.amazonaws.com/nlp/slides/intro.pdf) by Dan Jurafsky*

# ### What is NLP?
# - Using computers to process (analyze, understand, generate) natural human languages

# ### Why is NLP useful?
# - Most knowledge created by humans is unstructured text
# - Need some way to make sense of it
# - Enables quantitative analysis of text data

# ### What are some of the higher level task areas?
# - **Speech recognition and generation**: Apple Siri
#     - Speech to text
#     - Text to speech
# - **Question answering**: IBM Watson
#     - Match query with knowledge base
#     - Reasoning about intent of question
# - **Machine translation**: Google Translate
#     - One language to another
# - **Information retrieval**: Google
#     - Finding relevant results
#     - Finding similar results
# - **Information extraction**: Gmail
#     - Structured information from unstructured documents
# - **Assistive technologies**: Google autocompletion
#     - Predictive text input
#     - Text simplification
# - **Natural Language Generation**: computer-generated articles
#     - Generating text from data
# - **Automatic summarization**: Google News
#     - Extractive summarization
#     - Abstractive summarization
# - **Sentiment analysis**: Twitter analysis
#     - Attitude of speaker

# ### What are some of the lower level components?
# - **Tokenization**: breaking text into tokens (words, sentences, n-grams)
# - **Stopword removal**: a/an/the
# - **Stemming and lemmatization**: root word
# - **TF-IDF**: word importance
# - **Part-of-speech tagging**: noun/verb/adjective
# - **Named entity recognition**: person/organization/location
# - **Spelling correction**: "New Yrok City"
# - **Word sense disambiguation**: "buy a mouse"
# - **Segmentation**: "New York City subway"
# - **Language detection**: "translate this page"
# - **Machine learning**

# ### Why is NLP hard?
# - **Ambiguity**:
#     - Teacher Strikes Idle Kids
#     - Red Tape Holds Up New Bridges
#     - Hospitals are Sued by 7 Foot Doctors
#     - Juvenile Court to Try Shooting Defendant
#     - Local High School Dropouts Cut in Half
# - **Non-standard English**: tweets/text messages
# - **Idioms**: "throw in the towel"
# - **Newly coined words**: "retweet"
# - **Tricky entity names**: "Where is A Bug's Life playing?"
# - **World knowledge**: "Mary and Sue are sisters", "Mary and Sue are mothers"

# ### How does NLP work?
# - Build probabilistic model using data about a language
# - Requires an understanding of the language
# - Requires an understanding of the world (or a particular domain)


# ## Part 2: Reading in the Yelp Reviews

# - "corpus" = collection of documents
# - "corpora" = plural form of corpus

import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer

# read yelp.csv into a DataFrame
# (the file is fetched over HTTPS from the course repository, so this step
# needs network access)
url = 'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/yelp.csv'
yelp = pd.read_csv(url)

# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

# split the new DataFrame into training and testing sets
# (review text is the feature, star rating is the response; the fixed
# random_state makes the split reproducible from run to run)
X_train, X_test, y_train, y_test = train_test_split(yelp_best_worst.text, yelp_best_worst.stars, random_state=1)


# ## Part 3: Tokenization
# - **What:** Separate text into units such as sentences or words
# - **Why:** Gives structure to previously unstructured text
# - **Notes:** Relatively easy with English language text, not easy with some languages

# use CountVectorizer to create document-term matrices from X_train and X_test
# (the vocabulary is learned from the training text only; the testing text is
# transformed with that same vocabulary so the columns of both matrices match)
vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train)
test_dtm = vect.transform(X_test)

# rows are documents, columns are terms (aka "tokens" or "features")
# NOTE: a bare expression like the one below only displays a value when run
# interactively (e.g. in a notebook cell); in a plain script it shows nothing
train_dtm.shape

# last 50 features
print vect.get_feature_names()[-50:]

# show vectorizer options
vect

# [CountVectorizer documentation](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

# - **lowercase:** boolean, True by default
# - Convert all characters to lowercase before tokenizing.

# don't convert to lowercase
# ("Call" and "call" now count as two different features)
vect = CountVectorizer(lowercase=False)
train_dtm = vect.fit_transform(X_train)
train_dtm.shape

# - **token_pattern:** string
# - Regular expression denoting what constitutes a "token". The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).

# allow tokens of one character
# (\w+ matches one or more word characters)
vect = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
train_dtm = vect.fit_transform(X_train)
train_dtm.shape

# - **ngram_range:** tuple (min_n, max_n)
# - The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.

# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))
train_dtm = vect.fit_transform(X_train)
train_dtm.shape

# last 50 features
print vect.get_feature_names()[-50:]

# **Predicting the star rating:**

# use default options for CountVectorizer
vect = CountVectorizer()

# create document-term matrices
train_dtm = vect.fit_transform(X_train)
test_dtm = vect.transform(X_test)

# use Naive Bayes to predict the star rating
nb = MultinomialNB()
nb.fit(train_dtm, y_train)
y_pred_class = nb.predict(test_dtm)

# calculate accuracy
print metrics.accuracy_score(y_test, y_pred_class)

# calculate null accuracy
# (the mean of the 0/1 array is the fraction of 5-star reviews in the testing
# set, i.e. the accuracy of always predicting 5 stars; this is the null
# accuracy only as long as 5-star is the majority class)
y_test_binary = np.where(y_test==5, 1, 0)
y_test_binary.mean()

# define a function that accepts a vectorizer and returns the accuracy
def tokenize_test(vect):
    train_dtm = vect.fit_transform(X_train)
    print 'Features: ', train_dtm.shape[1]
    test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(train_dtm, y_train)
    y_pred_class = nb.predict(test_dtm)
    print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)

# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)


# ## Part 4: Stopword Removal
# - **What:** Remove common words that will likely appear in any text
# - **Why:** They don't tell you much about your text

# show vectorizer options
# (bare expression: only displays something when run interactively)
vect

# - **stop_words:** string {'english'}, list, or None (default)
# - If 'english', a built-in stop word list for English is used.
# - If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
# - If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms.

# remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

# set of stop words
print vect.get_stop_words()


# ## Part 5: Other CountVectorizer Options
# - **max_features:** int or None, default=None
# - If not None, build a vocabulary that only considers the top max_features ordered by term frequency across the corpus.

# remove English stop words and only keep 100 features
vect = CountVectorizer(stop_words='english', max_features=100)
tokenize_test(vect)

# all 100 features
print vect.get_feature_names()

# include 1-grams and 2-grams, and limit the number of features
vect = CountVectorizer(ngram_range=(1, 2), max_features=100000)
tokenize_test(vect)

# - **min_df:** float in range [0.0, 1.0] or int, default=1
# - When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts.

# include 1-grams and 2-grams, and only include terms that appear in at least 2 documents
# (min_df is a document frequency, so repeats within a single review do not count)
vect = CountVectorizer(ngram_range=(1, 2), min_df=2)
tokenize_test(vect)


# ## Part 6: Introduction to TextBlob
# TextBlob: "Simplified Text Processing"

# print the first review
print yelp_best_worst.text[0]

# save it as a TextBlob object
review = TextBlob(yelp_best_worst.text[0])

# list the words
review.words

# list the sentences
review.sentences

# some string methods are available
review.lower()


# ## Part 7: Stemming and Lemmatization

# **Stemming:**
# - **What:** Reduce a word to its base/stem/root form
# - **Why:** Often makes sense to treat related words the same way
# - **Notes:**
#     - Uses a "simple" and fast rule-based approach
#     - Stemmed words are usually not shown to users (used for analysis/indexing)
#     - Some search engines treat words with the same stem as synonyms

# initialize stemmer
stemmer = SnowballStemmer('english')

# stem each word
# (review is the TextBlob built from the first review in Part 6)
print [stemmer.stem(word) for word in review.words]

# **Lemmatization**
# - **What:** Derive the canonical form ('lemma') of a word
# - **Why:** Can be better than stemming
# - **Notes:** Uses a dictionary-based approach (slower than stemming)

# assume every word is a noun
# (lemmatize() is called without a part-of-speech argument here)
print [word.lemmatize() for word in review.words]

# assume every word is a verb
# (pos='v' asks for the verb lemma of each word)
print [word.lemmatize(pos='v') for word in review.words]

# define a function that turns one UTF-8 encoded document into a list of
# lowercase lemmas
def split_into_lemmas(text):
    """Return the lemma of every word in `text`, in order of appearance.

    `text` is decoded from UTF-8 and lowercased, split into words by TextBlob,
    and each word is lemmatized with TextBlob's default settings.
    """
    decoded = unicode(text, 'utf-8')
    blob = TextBlob(decoded.lower())
    lemmas = []
    for token in blob.words:
        lemmas.append(token.lemmatize())
    return lemmas

# use split_into_lemmas as the feature extraction function
# (a callable analyzer receives each raw document and its return value is used
# as that document's list of features, so the lemmas become the columns)
vect = CountVectorizer(analyzer=split_into_lemmas)
tokenize_test(vect)

# last 50 features
print vect.get_feature_names()[-50:]


# ## Part 8: Term Frequency - Inverse Document Frequency (TF-IDF)
# - **What:** Computes "relative frequency" that a word appears in a document compared to its frequency across all documents
# - **Why:** More useful than "term frequency" for identifying "important" words in each document (high frequency in that document, low frequency in other documents)
# - **Notes:** Used for search engine scoring, text summarization, document clustering

# example documents
# (three tiny "documents" that mix case and punctuation on purpose)
train_simple = ['call you tonight',
                'Call me a cab',
                'please call me... PLEASE!']

# CountVectorizer
# (one row per document, one column per term, cells hold raw term counts;
# the DataFrame is only displayed when run interactively)
vect = CountVectorizer()
pd.DataFrame(vect.fit_transform(train_simple).toarray(), columns=vect.get_feature_names())

# TfidfVectorizer
# (same layout, but cells hold TF-IDF weights instead of raw counts)
vect = TfidfVectorizer()
pd.DataFrame(vect.fit_transform(train_simple).toarray(), columns=vect.get_feature_names())


# ## Part 9: Using TF-IDF to Summarize a Yelp Review

# create a document-term matrix using TF-IDF
# (fit on every review in yelp, not just the 5-star and 1-star subset;
# features[i] is the term for column i of dtm, and both names are read as
# globals by summarize() below)
vect = TfidfVectorizer(stop_words='english')
dtm = vect.fit_transform(yelp.text)
features = vect.get_feature_names()
dtm.shape

def summarize():
    
    # choose a random review that is at least 300 characters
    review_length = 0
    while review_length < 300:
        review_id = np.random.randint(0, len(yelp))
        review_text = unicode(yelp.text[review_id], 'utf-8')
        review_length = len(review_text)
    
    # create a dictionary of words and their TF-IDF scores
    word_scores = {}
    for word in TextBlob(review_text).words:
        word = word.lower()
        if word in features:
            word_scores[word] = dtm[review_id, features.index(word)]
    
    # print words with the top 5 TF-IDF scores
    print 'TOP SCORING WORDS:'
    top_scores = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[:5]
    for word, score in top_scores:
        print word
    
    # print 5 random words
    print '\n' + 'RANDOM WORDS:'
    random_words = np.random.choice(word_scores.keys(), size=5, replace=False)
    for word in random_words:
        print word
    
    # print the review
    print '\n' + review_text

# print the summary of one randomly chosen review
# (no random seed is set here, so the chosen review differs from run to run)
summarize()


# ## Part 10: Sentiment Analysis

# (review is the TextBlob built from the first review in Part 6)
print review

# polarity ranges from -1 (most negative) to 1 (most positive)
review.sentiment.polarity

# understanding the apply method
# (apply calls len on each review's text and the results are stored in a new
# 'length' column; presumably the values are byte strings here, in which case
# len counts bytes rather than characters -- TODO confirm)
yelp['length'] = yelp.text.apply(len)

# define a function that scores the sentiment of one UTF-8 encoded text
def detect_sentiment(text):
    """Return the polarity TextBlob assigns to `text`.

    `text` is decoded from UTF-8 before it is wrapped in a TextBlob, so it
    must provide a ``decode`` method.
    """
    decoded = text.decode('utf-8')
    blob = TextBlob(decoded)
    return blob.sentiment.polarity

# create a new DataFrame column for sentiment
# (detect_sentiment is called once per review, so this walks the whole dataset)
yelp['sentiment'] = yelp.text.apply(detect_sentiment)

# boxplot of sentiment grouped by stars
yelp.boxplot(column='sentiment', by='stars')

# reviews with most positive sentiment
# (exact comparison: only reviews whose polarity is exactly 1)
yelp[yelp.sentiment == 1].text.head()

# reviews with most negative sentiment
# (exact comparison: only reviews whose polarity is exactly -1)
yelp[yelp.sentiment == -1].text.head()

# widen the column display
pd.set_option('max_colwidth', 500)

# negative sentiment in a 5-star review
yelp[(yelp.stars == 5) & (yelp.sentiment < -0.3)].head()

# positive sentiment in a 1-star review
yelp[(yelp.stars == 1) & (yelp.sentiment > 0.5)].head()

# reset the column display width
pd.reset_option('max_colwidth')


# ## Part 11: Adding Features to a Document-Term Matrix

# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

# split the new DataFrame into training and testing sets
feature_cols = ['text', 'sentiment', 'cool', 'useful', 'funny']
X = yelp_best_worst[feature_cols]
y = yelp_best_worst.stars
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# use CountVectorizer with text column only
vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train[:, 0])
test_dtm = vect.transform(X_test[:, 0])
print train_dtm.shape
print test_dtm.shape

# shape of other four feature columns
X_train[:, 1:].shape

# cast other feature columns to float and convert to a sparse matrix
extra = sp.sparse.csr_matrix(X_train[:, 1:].astype(float))
extra.shape

# combine sparse matrices
train_dtm_extra = sp.sparse.hstack((train_dtm, extra))
train_dtm_extra.shape

# repeat for testing set
extra = sp.sparse.csr_matrix(X_test[:, 1:].astype(float))
test_dtm_extra = sp.sparse.hstack((test_dtm, extra))
test_dtm_extra.shape

# use logistic regression with text column only
logreg = LogisticRegression(C=1e9)
logreg.fit(train_dtm, y_train)
y_pred_class = logreg.predict(test_dtm)
print metrics.accuracy_score(y_test, y_pred_class)

# use logistic regression with all features
logreg = LogisticRegression(C=1e9)
logreg.fit(train_dtm_extra, y_train)
y_pred_class = logreg.predict(test_dtm_extra)
print metrics.accuracy_score(y_test, y_pred_class)


# ## Part 12: Fun TextBlob Features

# spelling correction
# (these are bare expressions: their results are only displayed when run
# interactively, e.g. in a notebook cell)
TextBlob('15 minuets late').correct()

# spellcheck
Word('parot').spellcheck()

# definitions
# ('v' restricts the lookup to verb senses of the word)
Word('bank').define('v')

# language identification
# NOTE(review): presumably this calls an online translation service and thus
# needs network access; confirm it is still offered by the installed TextBlob
TextBlob('Hola amigos').detect_language()