N-gram

Qu’est-ce que c’est ?

Markov assumption

Start & end tokens

Définition mathématique

python

from nltk.util import ngrams
from collections import Counter

# Token sequence for the text; produced by preprocess_text, which is defined
# outside this snippet.
# NOTE(review): `tokens` is iterated twice below, so it is presumably a
# re-iterable sequence (e.g. a list) rather than a one-shot generator — confirm.
tokens = preprocess_text(txt)

# Unigram (Bag-of-Words) approach: tally each token independently of its
# neighbours, then report the ten highest counts.
bag_of_words = Counter()
bag_of_words.update(tokens)
print(
    "Most frequent words according to Bag-of-Words:\n",
    bag_of_words.most_common(10),
)

# Bigram approach: tally each run of two consecutive tokens, then report the
# ten highest counts.
bigrams = Counter()
bigrams.update(ngrams(tokens, 2))
print(
    "Most frequent word sequences according to Bigrams:\n",
    bigrams.most_common(10),
    "\n",
)

Choisir n