# Tiny two-document corpus used by every vectorization demo below.
corpus = [
"another five fish find another faraway fish",
"i love fantastic flying fish"
]
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts.  The explicit token_pattern keeps
# single-character tokens such as "i", which the sklearn default
# pattern (two-or-more word chars) would silently drop.
count_vec = CountVectorizer(
    tokenizer=None,
    token_pattern=r"(?u)\b\w+\b",
)
# Dense document-term count matrix (notebook-style: value is displayed,
# not assigned).
count_vec.fit_transform(corpus).toarray()
# Mapping token -> column index in the matrix above.
count_vec.vocabulary_
from sklearn.feature_extraction.text import TfidfVectorizer

# Same corpus, but tf-idf weights instead of raw counts.  norm=None
# leaves the rows unnormalised so the raw weights are visible; the
# token_pattern again preserves single-character tokens.
tfidf_vec = TfidfVectorizer(
    norm=None,
    token_pattern=r"(?u)\b\w+\b",
)
# Dense tf-idf matrix (notebook-style display line).
tfidf_vec.fit_transform(corpus).toarray()
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Split each document on single spaces into its token list.
# NOTE: `tokens` and `vocab` are reused by the one-hot demo below,
# so their names must not change.
tokens = [doc.split(' ') for doc in corpus]
tokens
# Sorted unique vocabulary across all documents.
vocab = np.unique(np.concatenate(tokens))
vocab
# LabelEncoder maps each token to its integer index within the
# sorted vocabulary.
label_enc = LabelEncoder()
label_enc.fit(vocab)
# One integer id per token, per document (notebook-style display line).
[label_enc.transform(doc_tokens) for doc_tokens in tokens]
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder expects 2-D input of shape (n_samples, n_features),
# hence reshape(-1, 1): one token per row, one "feature" column.
vocab.reshape(-1,1)
# FIX: the `sparse` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4, where it raises TypeError.  `sparse_output=False`
# is the supported spelling for requesting a dense ndarray.
vectorizer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
vectorizer.fit(vocab.reshape(-1,1))
# One one-hot row per token in each document (notebook-style display
# line; each element is a (n_tokens, vocab_size) dense array).
[vectorizer.transform(np.array(x).reshape(-1,1)) for x in tokens]