In [1]:
corpus = [
    "another five fish find another faraway fish",
    "i love fantastic flying fish"
]

Bag of words

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
In [3]:
vectorizer = CountVectorizer(
    tokenizer=None,                # use the default regexp tokenizer
    token_pattern=r"(?u)\b\w+\b"   # keep single-character tokens such as "i"
)
vectorizer.fit_transform(corpus).toarray()
Out[3]:
array([[2, 0, 1, 1, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 1, 1, 1]])
In [4]:
vectorizer.vocabulary_
Out[4]:
{'another': 0,
 'five': 5,
 'fish': 4,
 'find': 3,
 'faraway': 2,
 'i': 7,
 'love': 8,
 'fantastic': 1,
 'flying': 6}
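
The integers in vocabulary_ are the column indices of the count matrix above. Assuming scikit-learn >= 1.0, the same ordering can also be read directly from the fitted vectorizer (older releases expose get_feature_names() instead):

vectorizer.get_feature_names_out()
# array(['another', 'fantastic', 'faraway', 'find', 'fish', 'five',
#        'flying', 'i', 'love'], dtype=object)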

Tf-idf

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [6]:
vectorizer = TfidfVectorizer(
    norm=None,                     # skip L2 normalization so the raw tf-idf weights are visible
    token_pattern=r"(?u)\b\w+\b"   # keep single-character tokens such as "i"
)
vectorizer.fit_transform(corpus).toarray()
Out[6]:
array([[2.81093022, 0.        , 1.40546511, 1.40546511, 2.        ,
        1.40546511, 0.        , 0.        , 0.        ],
       [0.        , 1.40546511, 0.        , 0.        , 1.        ,
        0.        , 1.40546511, 1.40546511, 1.40546511]])
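
A quick sanity check on these weights, assuming the vectorizer's defaults (smooth_idf=True, sublinear_tf=False): with norm=None each entry is simply tf(t, d) * idf(t), where idf(t) = ln((1 + n) / (1 + df(t))) + 1 and n is the number of documents.

import numpy as np

n_docs = 2
idf_rare   = np.log((1 + n_docs) / (1 + 1)) + 1   # term in one document  -> ~1.4055
idf_common = np.log((1 + n_docs) / (1 + 2)) + 1   # 'fish', in both docs  -> 1.0
idf_rare, 2 * idf_rare, idf_common                # 1.4055, 2.8109 ('another' appears twice), 1.0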

Integer encoding

In [7]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
In [8]:
tokens = [txt.split(' ') for txt in corpus]
tokens
Out[8]:
[['another', 'five', 'fish', 'find', 'another', 'faraway', 'fish'],
 ['i', 'love', 'fantastic', 'flying', 'fish']]
In [9]:
vocab = np.unique(np.concatenate(tokens))
vocab
Out[9]:
array(['another', 'fantastic', 'faraway', 'find', 'fish', 'five',
       'flying', 'i', 'love'], dtype='<U9')
In [10]:
vectorizer = LabelEncoder()    # assigns each vocabulary term an integer id
vectorizer.fit(vocab)
[vectorizer.transform(x) for x in tokens]
Out[10]:
[array([0, 5, 4, 3, 0, 2, 4]), array([7, 8, 1, 6, 4])]
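
The integer codes are reversible; for illustration, inverse_transform maps them back to the original tokens:

vectorizer.inverse_transform([0, 5, 4, 3, 0, 2, 4])
# array(['another', 'five', 'fish', 'find', 'another', 'faraway', 'fish'], dtype='<U9')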

One-hot encoding

In [11]:
from sklearn.preprocessing import OneHotEncoder
In [12]:
vocab.reshape(-1,1)
Out[12]:
array([['another'],
       ['fantastic'],
       ['faraway'],
       ['find'],
       ['fish'],
       ['five'],
       ['flying'],
       ['i'],
       ['love']], dtype='<U9')
In [13]:
# NOTE: scikit-learn >= 1.2 renames the sparse argument to sparse_output
vectorizer = OneHotEncoder(handle_unknown='ignore', sparse=False)
vectorizer.fit(vocab.reshape(-1,1))
[vectorizer.transform(np.array(x).reshape(-1,1)) for x in tokens]
Out[13]:
[array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0.]]),
 array([[0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0.]])]
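
For illustration, summing the one-hot rows of each document recovers the bag-of-words count vectors from Out[3], since both encoders use the same alphabetically sorted vocabulary:

one_hot = [vectorizer.transform(np.array(x).reshape(-1, 1)) for x in tokens]
[m.sum(axis=0) for m in one_hot]
# [array([2., 0., 1., 1., 2., 1., 0., 0., 0.]),
#  array([0., 1., 0., 0., 1., 0., 1., 1., 1.])]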