In [2]:
import pandas as pd
import numpy as np

Load data

In [3]:
# Toy training set, written column by column: one entry per customer, with
# 'Response' as the label to predict.
df = pd.DataFrame({
    'Name':         ['Jay', 'Rina', 'Emma', 'Tommy', 'Neil'],
    'Age':          [35, 22, 63, 59, 25],
    'Income':       [35, 50, 200, 170, 40],
    'Credit cards': [3, 2, 1, 1, 4],
    'Response':     ['No', 'Yes', 'No', 'No', 'Yes'],
})

df
Out[3]:
Name Age Income Credit cards Response
0 Jay 35 35 3 No
1 Rina 22 50 2 Yes
2 Emma 63 200 1 No
3 Tommy 59 170 1 No
4 Neil 25 40 4 Yes
In [4]:
# Record to predict: it carries every column of df except the last one
# ('Response'), which is the value we want to infer.
query_fields = df.columns[:-1]
query_values = ['David', 37, 50, 2]
x = pd.Series(query_values, index=query_fields)
x
Out[4]:
Name            David
Age                37
Income             50
Credit cards        2
dtype: object

From scratch

In [5]:
# Normalize data: divide every numeric column by its maximum in the training
# set, and scale the query record x with the same maxima so that both live on
# the same scale. The originals (df, x) are left untouched.
df2 = df.copy()
x2  = x.copy()

# Select numeric columns explicitly. Excluding only 'object' would also let
# through bool/datetime/category columns, or text columns stored with a
# dedicated string dtype, none of which can be divided by their maximum.
for col in df2.select_dtypes(include='number').columns:
    _max     = df2[col].max()
    df2[col] = df2[col] / _max
    x2[col]  = x2[col] / _max   # x2 stays object-dtype because it also holds the name
df2
Out[5]:
Name Age Income Credit cards Response
0 Jay 0.555556 0.175 0.75 No
1 Rina 0.349206 0.250 0.50 Yes
2 Emma 1.000000 1.000 0.25 No
3 Tommy 0.936508 0.850 0.25 No
4 Neil 0.396825 0.200 1.00 Yes
In [6]:
# Display the query record after it was scaled with the training-set maxima.
x2
Out[6]:
Name               David
Age             0.587302
Income              0.25
Credit cards         0.5
dtype: object
In [7]:
# Compute the euclidean distance between the scaled query record (x2) and
# every scaled training row (df2), in one vectorized pass.
feature_cols = ['Age', 'Income', 'Credit cards']

# x2 is an object-dtype Series (it also stores the name), so cast the feature
# values to float before doing arithmetic with them.
query = x2[feature_cols].astype(float)

# DataFrame - Series aligns the Series labels on the column names, i.e. the
# query is subtracted from every row.
deltas = df2[feature_cols].astype(float) - query
distances = np.sqrt((deltas ** 2).sum(axis=1))

# Closest rows first; the index still refers to the row labels of df/df2.
distances = distances.sort_values(ascending=True)
distances
Out[7]:
1    0.238095
0    0.262931
4    0.537384
3    0.737865
2    0.891807
dtype: float64
In [8]:
# distances is sorted in ascending order, so its first three labels identify
# the 3 closest records; look them up in the original (unscaled) frame.
nearest_labels = distances.iloc[:3].index
df.loc[nearest_labels]
Out[8]:
Name Age Income Credit cards Response
1 Rina 22 50 2 Yes
0 Jay 35 35 3 No
4 Neil 25 40 4 Yes
In [9]:
# Majority vote: most frequent 'Response' among the 3 closest records.
df.loc[distances.iloc[:3].index, 'Response'].mode()
Out[9]:
0    Yes
dtype: object

Rina, Jay et Neil sont les trois plus proches voisins de David.
On prédit la réponse majoritaire parmi eux : Yes (2 votes sur 3).

Using sklearn

In [10]:
from sklearn import neighbors
In [11]:
# Feature matrix: every column except the identifier and the label.
# NOTE: these are the raw values of df, not the max-scaled ones of df2 used in
# the from-scratch section.
X = df.drop(columns=['Name', 'Response'])
# Target vector: the label column.
Y = df.loc[:, 'Response']
In [12]:
# 3-nearest-neighbours classifier; only n_neighbors is set explicitly, every
# other hyper-parameter keeps the library default.
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
# Fit on the feature matrix X and labels Y; the fitted estimator is the cell output.
knn.fit(X=X, y=Y)
Out[12]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
In [13]:
# Predict the response for the query record x.
# The model was fitted on a DataFrame, so pass a one-row DataFrame with the
# same columns in the same order (taken from X) and a numeric dtype, instead of
# a bare list wrapping an object-dtype Series: this keeps feature names and
# order explicit and avoids scikit-learn's missing-feature-names warning.
knn.predict(x[X.columns].astype(float).to_frame().T)
Out[13]:
array(['Yes'], dtype=object)