Load data

In [1]:
import numpy as np
import pandas as pd
In [2]:
from IPython.core.display import display, HTML
display(HTML('''
<style>
.dataframe td, .dataframe th {
  border: 1px solid black;
  background: white;
}
.dataframe td {
  text-align: left;
}
</style>
'''))
In [3]:
df = pd.DataFrame({
    'Outlook': ['sunny', 'sunny', 'overcast', 'rain', 'rain', 'rain', 'overcast', 'sunny', 'sunny', 'rain', 'sunny', 'overcast', 'overcast', 'rain'],
    'Temperature': ['hot', 'hot', 'hot', 'mild', 'cool', 'cool', 'cool', 'mild', 'cool', 'mild', 'mild', 'mild', 'hot', 'mild'],
    'Humidity': ['high', 'high', 'high', 'high', 'normal', 'normal', 'normal', 'high', 'normal', 'normal', 'normal', 'high', 'normal','high'],
    'Wind': ['weak', 'strong', 'weak', 'weak', 'weak', 'strong', 'strong', 'weak', 'weak', 'weak', 'strong', 'strong', 'weak', 'strong'],
    'Play': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
})
HTML(df.to_html(index=False))
Out[3]:
Outlook Temperature Humidity Wind Play
sunny hot high weak no
sunny hot high strong no
overcast hot high weak yes
rain mild high weak yes
rain cool normal weak yes
rain cool normal strong no
overcast cool normal strong yes
sunny mild high weak no
sunny cool normal weak yes
rain mild normal weak yes
sunny mild normal strong yes
overcast mild high strong yes
overcast hot normal weak yes
rain mild high strong no

Explore data

In [4]:
val, count = np.unique(df['Play'], return_counts=True)
n = np.sum(count)

for i,v in enumerate(val):
    print('P(Play={:<3s}) = {:d}/{:d}'.format(v, count[i], n))
P(Play=no ) = 5/14
P(Play=yes) = 9/14
In [5]:
for column in df.drop('Play', axis=1).columns:
    dftmp = pd.crosstab(df[column], df['Play'],
                        margins=False,
                        rownames=[None],colnames=[column])

    dftmp.columns = 'Play=' + dftmp.columns

    for i,v in enumerate(val):
        dftmp.iloc[:,i] = dftmp.iloc[:,i].astype('string') + '/' + str(count[i])

    display(HTML(dftmp.to_html()))
Outlook Play=no Play=yes
overcast 0/5 4/9
rain 2/5 3/9
sunny 3/5 2/9
Temperature Play=no Play=yes
cool 1/5 3/9
hot 2/5 2/9
mild 2/5 4/9
Humidity Play=no Play=yes
high 4/5 3/9
normal 1/5 6/9
Wind Play=no Play=yes
strong 3/5 3/9
weak 2/5 6/9

From scratch

In [6]:
dfYes = df[df['Play'] == 'yes']
dfNo  = df[df['Play'] == 'no']
In [7]:
nYes = len(dfYes)
nNo  = len(dfNo)

print(nYes, nNo)
9 5
In [8]:
pYes = (dfYes['Outlook']     == 'sunny').sum()/nYes  \
    * (dfYes['Temperature'] == 'cool').sum()/nYes   \
    * (dfYes['Humidity']    == 'high').sum()/nYes   \
    * (dfYes['Wind']        == 'strong').sum()/nYes \
    * nYes/len(df)
pYes
Out[8]:
0.005291005291005291
In [9]:
pNo = (dfNo['Outlook']     == 'sunny').sum()/nNo  \
    * (dfNo['Temperature'] == 'cool').sum()/nNo   \
    * (dfNo['Humidity']    == 'high').sum()/nNo   \
    * (dfNo['Wind']        == 'strong').sum()/nNo \
    * nNo/len(df)
pNo
Out[9]:
0.020571428571428574
In [10]:
print('Prediction:', ('yes' if pYes > pNo else 'no'))
Prediction: no

With sklearn

On a le choix entre

  • BernoulliNB: si toutes les caractéristiques sont binaires ({0,1})
  • MultinomialNB: si les données sont discrètes (ex {1,2,3})
  • GaussianNB: si les données sont continues (ex [1..5])
In [11]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.preprocessing import LabelEncoder

# Encode labels to integers
encoder = LabelEncoder()
c = {}

Y = encoder.fit_transform(df['Play'])
c['Play'] = list(encoder.classes_)

X = df.drop('Play', axis=1)
for column in X.columns:
    X[column] = encoder.fit_transform(X[column])
    c[column] = list(encoder.classes_)
In [12]:
# Pre-compute likelihood tables
model = MultinomialNB()
model.fit(X, Y)
Out[12]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
In [13]:
# Predict most likely outcome
res = model.predict([[
    c['Outlook'].index('sunny'),
    c['Temperature'].index('cool'),
    c['Humidity'].index('high'),
    c['Wind'].index('strong'),
]])[0]

print('Prediction:', c['Play'][res])
Prediction: no
In [14]:
'''
# Evaluate
from sklearn.metrics import accuracy_score, confusion_matrix

y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred, normalize=True)
confusion_matrix(y_test, y_pred)
'''
Out[14]:
'\n# Evaluate\nfrom sklearn.metrics import accuracy_score, confusion_matrix\n\ny_pred = model.predict(X_test)\naccuracy_score(y_test, y_pred, normalize=True)\nconfusion_matrix(y_test, y_pred)\n'