import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Toy data set: 19 two-dimensional points, stored column-wise as 'x' and 'y'.
df = pd.DataFrame({
    'x': [12, 20, 28, 18, 29, 33, 24, 45, 45, 52,
          51, 52, 55, 53, 55, 61, 64, 69, 72],
    'y': [39, 39, 30, 52, 54, 46, 55, 99, 63, 70,
          66, 63, 58, 23, 14, 8, 19, 7, 24],
})
df.shape
# Quick look at the raw points before any clustering is applied.
plt.scatter(df['x'], df['y'])
# Fixed seed so the random starting centroids are the same on every run.
np.random.seed(200)
k = 3
# One [x, y] starting position per cluster, keyed 1..k. Each coordinate is
# drawn from the data's own range (randint excludes the upper bound); the
# x coordinate is drawn before the y coordinate for every centroid.
centroids = {
    label: [
        np.random.randint(df['x'].min(), df['x'].max()),
        np.random.randint(df['y'].min(), df['y'].max()),
    ]
    for label in range(1, k + 1)
}
centroids
# Plot the data points with centroids
# Colour used for each cluster label (also used for that cluster's centroid).
colmap = {1: 'r', 2: 'y', 3: 'b'}
fig = plt.figure(figsize=(5, 5))
plt.scatter(df['x'], df['y'], color='k')
# Centroids are drawn as squares in their cluster colour.
for label, center in centroids.items():
    plt.scatter(center[0], center[1], color=colmap[label], marker='s')
plt.show()
def compute_distance():
    """Store each point's Euclidean distance to every centroid.

    Reads the module-level ``df`` and ``centroids`` and writes one column
    named ``distance_from_<label>`` into ``df`` per centroid, overwriting
    any column of that name from a previous call.
    """
    for label, center in centroids.items():
        dx = df['x'] - center[0]
        dy = df['y'] - center[1]
        df['distance_from_{}'.format(label)] = np.sqrt(dx ** 2 + dy ** 2)
# First pass: fill in the distance columns for the starting centroids.
compute_distance()
df.head()
# Names of the per-centroid distance columns, in centroid order.
cols = list(map('distance_from_{}'.format, centroids))
cols
def assign_centroids():
    """Label every point with its nearest centroid.

    For each row of the module-level ``df``, finds which of the distance
    columns listed in ``cols`` holds the smallest value and stores that
    column's name, with the ``distance_from_`` prefix removed, in
    ``df['closest']``. The labels are therefore strings such as ``'1'``.
    """
    nearest_column = df[cols].idxmin(axis=1)
    df['closest'] = nearest_column.str.replace('distance_from_', '')
# Label each point with its nearest centroid, then preview the result.
assign_centroids()
df.head()
# Plot classified data points
def plot_df():
    """Draw the points coloured by cluster, plus the current centroids.

    Opens a new 5x5 figure. Each point takes the ``colmap`` colour of its
    ``df['closest']`` label (converted from string to int for the lookup)
    and is drawn half-transparent; each centroid is drawn as a square in
    its cluster's colour.
    """
    plt.figure(figsize=(5, 5))
    point_colors = df['closest'].map(lambda label: colmap[int(label)])
    plt.scatter(df['x'], df['y'], color=point_colors, alpha=0.5)
    # Plot centroids
    for label, center in centroids.items():
        plt.scatter(center[0], center[1], color=colmap[label], marker='s')
# Draw the points coloured by their current cluster assignment.
plot_df()
def update_centroids():
    """Move every centroid to the mean position of its assigned points.

    Reads ``df['closest']`` (string labels) from the module-level ``df``
    and updates the ``[x, y]`` lists stored in ``centroids`` in place.

    A centroid that currently has no assigned points keeps its previous
    position: the mean of an empty selection is NaN, and writing that
    back would leave the centroid at NaN coordinates.
    """
    for label, center in centroids.items():
        members = df[df['closest'] == str(label)]
        if members.empty:
            # Nothing to average; leave this centroid where it is.
            continue
        center[0] = np.mean(members['x'])
        center[1] = np.mean(members['y'])
# Move each centroid to the mean of its assigned points, then show the new
# positions and redraw the plot.
update_centroids()
centroids
plot_df()
# Repeat the assignment and update steps until the centroids don't move anymore.
# Alternate the K-means assignment and update steps until no point changes
# cluster, or until the iteration cap is hit.
max_iterations = 100
i = 0  # number of centroid updates performed by this loop
converged = False
while True:
    compute_distance()
    before = df['closest'].copy()
    assign_centroids()
    if not (df['closest'] != before).any():
        # No point switched cluster, so recomputing the means would leave
        # every centroid where it is.
        converged = True
        break
    if i == max_iterations:
        break
    update_centroids()
    i += 1

# Report the outcome truthfully: this is K-means (not KNN), and stopping at
# the cap is not the same as converging.
if converged:
    print('K-means converged after {:d} iteration(s)'.format(i))
else:
    print('K-means did not converge within {:d} iteration(s)'.format(i))
centroids
plot_df()
# Cross-check the hand-written loop against scikit-learn's KMeans on the
# same two feature columns.
from sklearn.cluster import KMeans

X = df[['x', 'y']]
# fit() returns the estimator itself, so construction and fitting can chain.
kmeans = KMeans(n_clusters=3, max_iter=100).fit(X)
kmeans.cluster_centers_
kmeans.labels_