import pandas as pd
import numpy as np
# Toy training set for a hand-rolled k-nearest-neighbours example: one row per
# person, with 'Response' as the label to predict.
df = pd.DataFrame(
    data=[
        ['Jay', 35, 35, 3, 'No'],
        ['Rina', 22, 50, 2, 'Yes'],
        ['Emma', 63, 200, 1, 'No'],
        ['Tommy', 59, 170, 1, 'No'],
        ['Neil', 25, 40, 4, 'Yes'],
    ],
    columns=['Name', 'Age', 'Income', 'Credit cards', 'Response'],
)
df

# Value to predict: same columns as df, minus the trailing 'Response' label.
x = pd.Series(['David', 37, 50, 2], index=df.columns[:-1])
x

# Normalize data: divide every numeric feature by its column maximum.
# The query is scaled with the training maxima (not its own values) so that
# both live on the same scale.
df2 = df.copy()
x2 = x.copy()
# Select numeric columns by inclusion rather than "everything that is not
# object", so non-numeric dtypes (bool, datetime, category, string) are never
# fed to the division below.
feature_cols = df.select_dtypes(include='number').columns
col_max = df[feature_cols].max()
df2[feature_cols] = df[feature_cols] / col_max
# x mixes a name with numbers (object dtype), so cast the features to float
# before dividing.
x2.loc[feature_cols] = x[feature_cols].astype(float) / col_max
df2
x2

# Compute the euclidean distance between the query and every training row,
# over the same feature columns that were normalized above.
deltas = df2[feature_cols] - x2[feature_cols].astype(float)
distances = np.sqrt((deltas ** 2).sum(axis=1))
distances = distances.sort_values(ascending=True)
distances

# 3 closest records, and the most common response among them (the prediction).
# distances.index[:3] is an explicitly positional slice of the sorted labels.
closest = df.loc[distances.index[:3]]
closest
closest['Response'].mode()
# Rina, Jay & Neil are the closest to David.
# We predict the most common response among them: Yes.
# Cross-check the prediction with scikit-learn's k-nearest-neighbours
# classifier, using the same k=3.
from sklearn import neighbors

# Features are every column except the identifier and the label.
X = df.drop(['Name', 'Response'], axis=1)
Y = df['Response']
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
knn.fit(X, Y)
# NOTE(review): the features are used unscaled here, so 'Income' (largest
# range) weighs most in the distance. For this data the three nearest
# neighbours are still Rina, Jay and Neil, but that is not guaranteed in
# general -- consider scaling X and the query the same way before fitting.

# Query with a one-row numeric DataFrame whose columns match X (same names,
# same order). The model was fitted with feature names, and passing a bare
# list of an object-dtype Series makes scikit-learn warn about missing
# feature names and rely on implicit object-to-float coercion.
query = x[X.columns].astype(float).to_frame().T
knn.predict(query)