In [1]:
corpus = [
    "another five fish find another faraway fish",
    "i love fantastic flying fish"
]

Bag of words

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
In [3]:
vectorizer = CountVectorizer(
    tokenizer=None,                # use the default regexp tokenizer
    token_pattern=r"(?u)\b\w+\b"   # keep single-character tokens such as "i"
)
vectorizer.fit_transform(corpus).toarray()
Out[3]:
array([[2, 0, 1, 1, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 1, 1, 1]])
In [4]:
vectorizer.vocabulary_
Out[4]:
{'another': 0,
 'five': 5,
 'fish': 4,
 'find': 3,
 'faraway': 2,
 'i': 7,
 'love': 8,
 'fantastic': 1,
 'flying': 6}
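
The integers in vocabulary_ are the column indices of the count matrix above. Assuming scikit-learn >= 1.0, the same ordering can also be read directly from the fitted vectorizer (older releases expose get_feature_names() instead):

vectorizer.get_feature_names_out()
# array(['another', 'fantastic', 'faraway', 'find', 'fish', 'five',
#        'flying', 'i', 'love'], dtype=object)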

Tf-idf

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [6]:
vectorizer = TfidfVectorizer(
    norm=None,                     # skip L2 normalization so the raw tf-idf weights are visible
    token_pattern=r"(?u)\b\w+\b"   # keep single-character tokens such as "i"
)
vectorizer.fit_transform(corpus).toarray()
Out[6]:
array([[2.81093022, 0.        , 1.40546511, 1.40546511, 2.        ,
        1.40546511, 0.        , 0.        , 0.        ],
       [0.        , 1.40546511, 0.        , 0.        , 1.        ,
        0.        , 1.40546511, 1.40546511, 1.40546511]])
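
A quick sanity check on these weights, assuming the vectorizer's defaults (smooth_idf=True, sublinear_tf=False): with norm=None each entry is simply tf(t, d) * idf(t), where idf(t) = ln((1 + n) / (1 + df(t))) + 1 and n is the number of documents.

import numpy as np

n_docs = 2
idf_rare   = np.log((1 + n_docs) / (1 + 1)) + 1   # term in one document  -> ~1.4055
idf_common = np.log((1 + n_docs) / (1 + 2)) + 1   # 'fish', in both docs  -> 1.0
idf_rare, 2 * idf_rare, idf_common                # 1.4055, 2.8109 ('another' appears twice), 1.0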

Integer encoding

In [7]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
In [8]:
tokens = [txt.split(' ') for txt in corpus]
tokens
Out[8]:
[['another', 'five', 'fish', 'find', 'another', 'faraway', 'fish'],
 ['i', 'love', 'fantastic', 'flying', 'fish']]
In [9]:
vocab = np.unique(np.concatenate(tokens))
vocab
Out[9]:
array(['another', 'fantastic', 'faraway', 'find', 'fish', 'five',
       'flying', 'i', 'love'], dtype='<U9')
In [10]:
vectorizer = LabelEncoder()    # assigns each vocabulary term an integer id
vectorizer.fit(vocab)
[vectorizer.transform(x) for x in tokens]
Out[10]:
[array([0, 5, 4, 3, 0, 2, 4]), array([7, 8, 1, 6, 4])]
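
The integer codes are reversible; for illustration, inverse_transform maps them back to the original tokens:

vectorizer.inverse_transform([0, 5, 4, 3, 0, 2, 4])
# array(['another', 'five', 'fish', 'find', 'another', 'faraway', 'fish'], dtype='<U9')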

One-hot encoding

In [11]:
from sklearn.preprocessing import OneHotEncoder
In [12]:
vocab.reshape(-1,1)
Out[12]:
array([['another'],
       ['fantastic'],
       ['faraway'],
       ['find'],
       ['fish'],
       ['five'],
       ['flying'],
       ['i'],
       ['love']], dtype='<U9')
In [13]:
# NOTE: scikit-learn >= 1.2 renames the sparse argument to sparse_output
vectorizer = OneHotEncoder(handle_unknown='ignore', sparse=False)
vectorizer.fit(vocab.reshape(-1,1))
[vectorizer.transform(np.array(x).reshape(-1,1)) for x in tokens]
Out[13]:
[array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0.]]),
 array([[0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0.]])]
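
For illustration, summing the one-hot rows of each document recovers the bag-of-words count vectors from Out[3], since both encoders use the same alphabetically sorted vocabulary:

one_hot = [vectorizer.transform(np.array(x).reshape(-1, 1)) for x in tokens]
[m.sum(axis=0) for m in one_hot]
# [array([2., 0., 1., 1., 2., 1., 0., 0., 0.]),
#  array([0., 1., 0., 0., 1., 0., 1., 1., 1.])]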