from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re
# Module-level WordNet lemmatizer, shared by every call to preprocess_text().
normalizer = WordNetLemmatizer()
# Maps the first letter of a Treebank-style POS tag (as produced by
# nltk.pos_tag) to the one-letter part-of-speech code passed to
# WordNetLemmatizer.lemmatize().
treebank_wordnet_pos = {
    'J': 'a',  # adjective
    'V': 'v',  # verb
    'N': 'n',  # noun
    'R': 'r',  # adverb
}


def get_wordnet_pos(treebank_pos, default='n'):
    """Translate a Treebank POS tag into a WordNet POS code.

    Only the first character of the tag is significant, so 'VBD', 'VBG'
    and 'VB' all map to 'v'.

    Args:
        treebank_pos: POS tag string, e.g. 'NN' or 'JJR'. May be empty.
        default: Code returned when the tag has no WordNet equivalent
            (or is empty). Defaults to 'n' (noun).

    Returns:
        One of 'a', 'v', 'n', 'r', or ``default``.
    """
    # Slice instead of indexing: an empty tag yields '' and falls through to
    # the default rather than raising IndexError.
    return treebank_wordnet_pos.get(treebank_pos[:1], default)
def preprocess_text(txt):
    """Normalize raw text into a list of lowercase lemmas.

    Runs of non-word characters are collapsed to a single space, the text is
    lowercased and tokenized, each token is POS-tagged, and the tag is used
    to pick the WordNet part of speech for lemmatization.

    Args:
        txt: Raw input string.

    Returns:
        List of lemmatized tokens, in their original order.
    """
    cleaned = re.sub(r'\W+', ' ', txt).lower()
    tagged_words = pos_tag(word_tokenize(cleaned))
    lemmas = []
    for word, tag in tagged_words:
        lemmas.append(normalizer.lemmatize(word, get_wordnet_pos(tag)))
    return lemmas
One of the most common ways to implement the BoW model in Python is as a dictionary where each word appearing in the document (key) is associated with the number of times it appears (value).
def text_to_bow(txt):
    """Build a bag-of-words dictionary for a piece of text.

    Args:
        txt: Raw input string; it is normalized with preprocess_text().

    Returns:
        Dict mapping each preprocessed token to the number of times it
        occurs, with keys in order of first appearance.
    """
    bow_dictionary = {}
    for word in preprocess_text(txt):
        # A missing word counts as 0, so first sightings start at 1.
        bow_dictionary[word] = bow_dictionary.get(word, 0) + 1
    return bow_dictionary
# Demo: build the bag-of-words dictionary for a short sample text.
txt = "I love fantastic flying fish. These flying fish are just ok, so maybe I will find another few fantastic fish..."
# Bare expression: the resulting dict is shown only in an interactive session.
text_to_bow(txt)
Sometimes a dictionary just won’t fit the bill. Topic modelling applications, for example, require an implementation of bag-of-words that is a bit more mathematical: feature vectors. Turning text into a BoW vector is known as feature extraction or vectorization.
When building BoW vectors, we generally:
1. Create a features dictionary of all vocabulary in our training data (usually several documents) mapped to indices. In other words, assign an index to each word in the corpus.
2. Using this dictionary, convert new documents into vectors using a vectorization function: create a vector of 0s whose length equals the number of known words, then count how many times each word appears in the document.
def create_features_dictionary(documents):
    """Assign a vector index to every distinct token in a corpus.

    The documents are joined with single spaces and preprocessed as one
    text; indices are handed out in order of first appearance, from 0.

    Args:
        documents: Iterable of raw document strings.

    Returns:
        Dict mapping each preprocessed token to its integer index.
    """
    features_dictionary = {}
    for word in preprocess_text(" ".join(documents)):
        # The next free index always equals the current vocabulary size;
        # setdefault leaves already-indexed words untouched.
        features_dictionary.setdefault(word, len(features_dictionary))
    return features_dictionary
# Toy training corpus used to build the vocabulary for the vectorization demos.
training_documents = [
"Five fantastic fish flew off to find faraway functions.",
"Maybe find another five fantastic fish?",
"Find my fish with a function please!"
]
# Token -> index mapping learned from the toy corpus.
features_dictionary = create_features_dictionary(training_documents)
# Bare expression: the mapping is shown only in an interactive session.
features_dictionary
def text_to_bow_vector(txt, features_dictionary):
    """Convert text into a bag-of-words count vector.

    Args:
        txt: Raw input string; it is normalized with preprocess_text().
        features_dictionary: Dict mapping known tokens to vector indices,
            e.g. the result of create_features_dictionary().

    Returns:
        List of ints with one slot per entry in ``features_dictionary``;
        slot ``i`` holds the number of occurrences of the token whose index
        is ``i``. Tokens missing from ``features_dictionary`` are ignored.
    """
    bow_vector = [0] * len(features_dictionary)
    for token in preprocess_text(txt):
        index = features_dictionary.get(token)
        if index is None:
            # Out-of-vocabulary word: there is no slot for it, so skip it
            # instead of raising KeyError on unseen input.
            continue
        bow_vector[index] += 1
    return bow_vector
# Demo: vectorize a new sentence against the vocabulary built from training_documents.
txt = "Another five fish find another faraway fish."
# Bare expression: the vector is shown only in an interactive session.
text_to_bow_vector(txt, features_dictionary)
For text_to_bow(), we can approximate the functionality with the collections module’s Counter class:
from collections import Counter
# Demo: the same word counts as text_to_bow(), produced with collections.Counter.
txt = "I love fantastic flying fish. These flying fish are just ok, so maybe I will find another few fantastic fish..."
tokens = preprocess_text(txt)
# Bare expression: the Counter is shown only in an interactive session.
Counter(tokens)
For vectorization, we can use CountVectorizer from the machine learning library scikit-learn. Use fit() to train the features dictionary and then transform() to transform text into a vector:
from sklearn.feature_extraction.text import CountVectorizer
class LemmaTokenizer:
    """Callable that tokenizes text with preprocess_text().

    Instances are passed as the ``tokenizer`` argument of CountVectorizer so
    that scikit-learn uses the same normalization as the hand-written
    bag-of-words functions.
    """

    def __call__(self, txt):
        """Return the list of lemmatized tokens for ``txt``."""
        lemmas = preprocess_text(txt)
        return lemmas
# Vectorizer that tokenizes with our own lemmatizing preprocessor instead of
# scikit-learn's default tokenizer.
bow_vectorizer = CountVectorizer(
    tokenizer=LemmaTokenizer()
)
# Learn the vocabulary (token -> column index) from the toy corpus.
bow_vectorizer.fit(training_documents)
# Bare expressions: shown only in an interactive session.
bow_vectorizer.vocabulary_
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
bow_vectorizer.get_feature_names_out()
Note that the words in sklearn's feature dictionary are sorted in alphabetical order, so the BoW vector here won't be in the same order as ours.
# Demo: vectorize a new sentence with the fitted CountVectorizer.
txt = "Another five fish find another faraway fish."
# transform() is given a list of documents, hence the one-element list.
bow_vector = bow_vectorizer.transform([txt])
# toarray() converts the result to a dense array for printing.
print(bow_vector.toarray())
BoW also has several advantages over other language models.
It’s an easier model to get started with and a few Python libraries already have built-in support for it.
Because bag-of-words relies on single words, rather than sequences of words, there are more examples of each unit of language in the training corpus. More examples mean the model has less data sparsity (i.e., it has more training knowledge to draw from).
While BoW still suffers from overfitting in terms of vocabulary, it overfits less than other statistical models, allowing for more flexibility in grammar and word choice.
Alas, there is a trade-off for all the brilliance BoW brings to the table.
Unless you want sentences that look like “the a but for the”, BoW is NOT a great primary model for text prediction — the predicted next word is always just one of the most frequently used words.
The BoW model’s word tokens lack context, which can make a word’s intended meaning unclear. For example, a document whose BoW contains many occurrences of “good” may look positive, but if you look at the original text you may find that in fact every “good” was preceded by a “not.”
Like all statistical models, BoW suffers from overfitting when it comes to vocabulary. What happens if the model comes across a new word that wasn’t in the training data?
Dataset: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
import csv

import pandas as pd

# Load the SMS Spam Collection: one message per line, as "<category>\t<text>".
# The file is raw tab-separated text with no quoting convention, so quote
# characters inside messages must be read literally. With pandas' default
# quote handling, a message containing a stray '"' can swallow the following
# line(s) into the same field.
df = pd.read_csv(
    'data/SMSSpamCollection.tsv',
    delimiter='\t',
    header=None,
    names=['category','text'],
    quoting=csv.QUOTE_NONE,
)
# Bare expression: the preview is shown only in an interactive session.
df.head()
from sklearn.model_selection import train_test_split
# Split messages (X) and their categories (y) into training and test sets.
# random_state=0 is a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
df['text'], df['category'], random_state=0
)
# Bare expression: the training-set size is shown only in an interactive session.
len(X_train)
class LemmaTokenizer:
    """Callable tokenizer for CountVectorizer, backed by preprocess_text()."""

    def __call__(self, txt):
        """Return the list of lemmatized tokens for ``txt``."""
        lemmas = preprocess_text(txt)
        return lemmas
# Fresh vectorizer for the SMS data, using the lemmatizing tokenizer.
bow_vectorizer = CountVectorizer(
    tokenizer=LemmaTokenizer()
)
# Learn the vocabulary from the training messages only, then reuse that same
# vocabulary to vectorize the test messages.
training_vectors = bow_vectorizer.fit_transform(X_train)
test_vectors = bow_vectorizer.transform(X_test)
# Peek at the first five vocabulary entries (shown only interactively).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
bow_vectorizer.get_feature_names_out()[:5]
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier fitted on the BoW vectors of the
# training messages and their categories.
spam_classifier = MultinomialNB()
spam_classifier.fit(training_vectors, y_train)
from sklearn.metrics import accuracy_score, f1_score
def spam_or_not(label):
    """Return the display name ("spam" or "ham") for a class label.

    Args:
        label: Either a category string or a boolean/numeric flag where a
            truthy value means spam.

    Returns:
        "ham" for the string 'ham' (ignoring case and surrounding
        whitespace) and for any falsy value; "spam" otherwise.
    """
    # The dataset's labels are the strings 'ham' / 'spam'. A non-empty string
    # is always truthy, so 'ham' must be recognized explicitly; a plain
    # truthiness test would report every message as spam.
    if isinstance(label, str) and label.strip().lower() == "ham":
        return "ham"
    return "spam" if label else "ham"
# Predict a category for every test message and score the predictions.
y_predict = spam_classifier.predict(test_vectors)
accuracy = accuracy_score(y_test, y_predict)
# NOTE(review): F1 is computed with 'ham' as the positive class; confirm that
# this (rather than 'spam') is the class of interest.
f1score = f1_score(y_test, y_predict, pos_label='ham')
print("The predictions for the test data were {:.2f}% accurate."
      .format(accuracy * 100))
print("f1-score: {:.2f}%"
      .format(f1score * 100))

# Locate examples by POSITION within the test set. y_test keeps the original
# DataFrame index labels after the shuffled split, so those labels are not
# valid offsets into X_test.iloc[...] or into the y_predict array.
is_wrong = y_predict != y_test.to_numpy()
errors = is_wrong.nonzero()[0]
correct = (~is_wrong).nonzero()[0]

# Guard both examples: a perfect (or completely wrong) classifier has no
# example of one of the two kinds.
if errors.size:
    idx = errors[0]
    print("For example, '{:s}' was classified as {:s} (wrongfully)."
          .format(X_test.iloc[idx], spam_or_not(y_predict[idx])))
if correct.size:
    idx = correct[0]
    print("\nMeanwhile, '{:s}' was classified as {:s} (rightfully)."
          .format(X_test.iloc[idx], spam_or_not(y_predict[idx])))