import numpy as np
import pandas as pd
from IPython.core.display import display, HTML
# Inject inline CSS so that rendered DataFrames get solid black borders,
# white cell backgrounds, and left-aligned data cells.
table_css = '''
<style>
.dataframe td, .dataframe th {
border: 1px solid black;
background: white;
}
.dataframe td {
text-align: left;
}
</style>
'''
display(HTML(table_css))
# Classic "play tennis" toy dataset: 14 observations, four categorical
# features and the binary target 'Play'. Built row-wise for readability.
records = [
    ('sunny',    'hot',  'high',   'weak',   'no'),
    ('sunny',    'hot',  'high',   'strong', 'no'),
    ('overcast', 'hot',  'high',   'weak',   'yes'),
    ('rain',     'mild', 'high',   'weak',   'yes'),
    ('rain',     'cool', 'normal', 'weak',   'yes'),
    ('rain',     'cool', 'normal', 'strong', 'no'),
    ('overcast', 'cool', 'normal', 'strong', 'yes'),
    ('sunny',    'mild', 'high',   'weak',   'no'),
    ('sunny',    'cool', 'normal', 'weak',   'yes'),
    ('rain',     'mild', 'normal', 'weak',   'yes'),
    ('sunny',    'mild', 'normal', 'strong', 'yes'),
    ('overcast', 'mild', 'high',   'strong', 'yes'),
    ('overcast', 'hot',  'normal', 'weak',   'yes'),
    ('rain',     'mild', 'normal', 'strong', 'no'),
]
df = pd.DataFrame(records, columns=['Outlook', 'Temperature', 'Humidity', 'Wind', 'Play'])
HTML(df.to_html(index=False))
# Class priors: show P(Play=yes) and P(Play=no) as raw fractions.
# NOTE: val, count and n are reused by the likelihood-table cell below,
# so their names must stay as-is.
val, count = np.unique(df['Play'], return_counts=True)
n = np.sum(count)
for v, cnt in zip(val, count):
    print(f'P(Play={v:<3s}) = {cnt:d}/{n:d}')
# Conditional likelihood tables: for each feature, a crosstab of
# feature-value vs class, shown as "count / class-total" fractions.
for feature in df.columns.drop('Play'):
    table = pd.crosstab(df[feature], df['Play'],
                        margins=False,
                        rownames=[None], colnames=[feature])
    table.columns = 'Play=' + table.columns
    # Turn each count column into a "count/total" string for display.
    for j, _ in enumerate(val):
        table.iloc[:, j] = table.iloc[:, j].astype('string') + '/' + str(count[j])
    display(HTML(table.to_html()))
# Manual Naive Bayes score for the query instance
# (Outlook=sunny, Temperature=cool, Humidity=high, Wind=strong):
# product of per-feature likelihoods times the class prior.
dfYes = df[df['Play'] == 'yes']
dfNo = df[df['Play'] == 'no']
nYes = len(dfYes)
nNo = len(dfNo)
print(nYes, nNo)
query = [('Outlook', 'sunny'), ('Temperature', 'cool'),
         ('Humidity', 'high'), ('Wind', 'strong')]
pYes = 1.0
for feature, value in query:
    pYes *= (dfYes[feature] == value).sum() / nYes
pYes *= nYes / len(df)
pYes
pNo = 1.0
for feature, value in query:
    pNo *= (dfNo[feature] == value).sum() / nNo
pNo *= nNo / len(df)
pNo
# The class with the larger (unnormalized) posterior wins.
print('Prediction:', ('yes' if pYes > pNo else 'no'))
# We can choose between several scikit-learn Naive Bayes variants:
# scikit-learn version of the same classifier.
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.preprocessing import LabelEncoder
# Encode each categorical column to integer codes; keep each column's
# class list in `c` so codes can be mapped back to labels afterwards.
encoder = LabelEncoder()
c = {}
Y = encoder.fit_transform(df['Play'])
c['Play'] = list(encoder.classes_)
X = df.drop('Play', axis=1)
for column in X.columns:
    X[column] = encoder.fit_transform(X[column])
    c[column] = list(encoder.classes_)
# Pre-compute likelihood tables.
# NOTE(review): MultinomialNB treats the integer codes as event counts;
# for label-encoded categorical features, CategoricalNB is the proper model.
model = MultinomialNB()
model.fit(X, Y)
# Predict the most likely outcome for (sunny, cool, high, strong).
# The query is wrapped in a DataFrame with the training column names:
# passing a bare nested list after fitting on a DataFrame triggers
# scikit-learn's "X does not have valid feature names" warning.
query = pd.DataFrame([[
    c['Outlook'].index('sunny'),
    c['Temperature'].index('cool'),
    c['Humidity'].index('high'),
    c['Wind'].index('strong'),
]], columns=X.columns)
res = model.predict(query)[0]
print('Prediction:', c['Play'][res])
# NOTE(review): illustrative hold-out evaluation, deliberately left inside a
# no-op string literal so it does not run — X_test and y_test are never
# defined anywhere in this script.
'''
# Evaluate
from sklearn.metrics import accuracy_score, confusion_matrix
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred, normalize=True)
confusion_matrix(y_test, y_pred)
'''