Load libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [3]:
!pip install imbalanced-learn
Requirement already satisfied: imbalanced-learn in /home/myself/anaconda3/lib/python3.7/site-packages (0.7.0)
Requirement already satisfied: joblib>=0.11 in /home/myself/anaconda3/lib/python3.7/site-packages (from imbalanced-learn) (0.13.2)
Requirement already satisfied: scipy>=0.19.1 in /home/myself/anaconda3/lib/python3.7/site-packages (from imbalanced-learn) (1.3.0)
Requirement already satisfied: numpy>=1.13.3 in /home/myself/anaconda3/lib/python3.7/site-packages (from imbalanced-learn) (1.18.5)
Requirement already satisfied: scikit-learn>=0.23 in /home/myself/anaconda3/lib/python3.7/site-packages (from imbalanced-learn) (0.23.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /home/myself/anaconda3/lib/python3.7/site-packages (from scikit-learn>=0.23->imbalanced-learn) (2.1.0)
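Note that the package installs as imbalanced-learn but is imported under the name imblearn. A quick way to confirm the installed version (0.7.0 here, per the pip output above):

import imblearn
print(imblearn.__version__)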

Load data

In [4]:
from sklearn.datasets import make_classification
In [5]:
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1)

X[:5]
Out[5]:
array([[0.22201371, 0.54020654],
       [1.34743875, 1.41282352],
       [0.53723817, 0.37273043],
       [2.13446179, 1.40481867],
       [2.31582718, 1.35685798]])
In [6]:
y[:5]
Out[6]:
array([0, 0, 0, 0, 0])
In [7]:
df      = pd.DataFrame(data=X)
df['y'] = y
df.head()
Out[7]:
0 1 y
0 0.222014 0.540207 0
1 1.347439 1.412824 0
2 0.537238 0.372730 0
3 2.134462 1.404819 0
4 2.315827 1.356858 0
In [8]:
df.y.value_counts()
Out[8]:
0    9900
1     100
Name: y, dtype: int64
In [9]:
df0 = df[df.y == 0]
df1 = df[df.y == 1]

plt.scatter(df0[0], df0[1], color='blue')
plt.scatter(df1[0], df1[1], color='orange')
[Figure: scatter plot of the original data; class 0 in blue, class 1 in orange]
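One practical caveat before resampling anything: resampling is normally applied only to the training portion of the data, so the held-out set keeps its natural 99:1 class ratio for honest evaluation. A minimal sketch using scikit-learn's train_test_split (the 80/20 split is illustrative):

from sklearn.model_selection import train_test_split

# Stratified split preserves the 99:1 ratio in both parts;
# any up/down-sampling below would then be fit on the training arrays only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)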

Up sampling

In [10]:
from sklearn.utils import resample
In [11]:
df0 = df[df.y == 0]
df1 = df[df.y == 1]

df1_up = resample(
  df1,
  replace=True,
  n_samples=len(df0),
  random_state=0
)

df_up = pd.concat([df0, df1_up])
df_up.y.value_counts()
Out[11]:
1    9900
0    9900
Name: y, dtype: int64
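The same random duplication of minority rows is available directly in imbalanced-learn as RandomOverSampler; a minimal sketch (the random_state is illustrative):

from imblearn.over_sampling import RandomOverSampler

# Duplicate randomly chosen minority rows until both classes have 9900 samples
ros = RandomOverSampler(random_state=0)
X_ros, y_ros = ros.fit_resample(df.drop('y', axis=1), df.y)
pd.Series(y_ros).value_counts()   # expect 9900 / 9900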
In [12]:
df0 = df_up[df_up.y == 0]
df1 = df_up[df_up.y == 1]

plt.scatter(df0[0], df0[1], color='blue')
plt.scatter(df1[0], df1[1], color='orange')
[Figure: scatter plot after random up-sampling; class 0 in blue, class 1 in orange]

SMOTE

In [13]:
from imblearn.over_sampling import SMOTE
In [14]:
smote      = SMOTE()
X_up, y_up = smote.fit_resample(df.drop('y', axis=1), df.y)
In [15]:
df_up      = pd.DataFrame(data=X_up)
df_up['y'] = y_up
df_up.y.value_counts()
Out[15]:
1    9900
0    9900
Name: y, dtype: int64
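Unlike random up-sampling, SMOTE does not duplicate rows: each synthetic sample is an interpolation between a minority point and one of its nearest minority-class neighbours, so the new orange points fill in the minority region in the plot below. The target ratio and neighbourhood size can be tuned; a minimal sketch with illustrative values:

# Generate synthetic samples until the minority class is half the size of the majority
# (9900 * 0.5 = 4950), interpolating between each point and its 5 nearest minority neighbours.
smote_half = SMOTE(sampling_strategy=0.5, k_neighbors=5, random_state=0)
X_half, y_half = smote_half.fit_resample(df.drop('y', axis=1), df.y)
pd.Series(y_half).value_counts()   # expect 9900 / 4950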
In [16]:
df0 = df_up[df_up.y == 0]
df1 = df_up[df_up.y == 1]

plt.scatter(df0[0], df0[1], color='blue')
plt.scatter(df1[0], df1[1], color='orange')
[Figure: scatter plot after SMOTE; class 0 in blue, original and synthetic class 1 points in orange]

Down sampling

In [17]:
df0 = df[df.y == 0]
df1 = df[df.y == 1]

df0_down = resample(
  df0,
  replace=False,
  n_samples=len(df1),
  random_state=0
)

df_down = pd.concat([df0_down, df1])
df_down.y.value_counts()
Out[17]:
1    100
0    100
Name: y, dtype: int64
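The random drop of majority rows has a direct imbalanced-learn counterpart, RandomUnderSampler; a minimal sketch:

from imblearn.under_sampling import RandomUnderSampler

# Keep a random subset of the majority class so both classes end up with 100 samples
rus = RandomUnderSampler(random_state=0)
X_rus, y_rus = rus.fit_resample(df.drop('y', axis=1), df.y)
pd.Series(y_rus).value_counts()   # expect 100 / 100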
In [18]:
df0 = df_down[df_down.y == 0]
df1 = df_down[df_down.y == 1]

plt.scatter(df0[0], df0[1], color='blue')
plt.scatter(df1[0], df1[1], color='orange')
[Figure: scatter plot after random down-sampling; the 100 retained class 0 points in blue, class 1 in orange]

Near Miss

In [19]:
from imblearn.under_sampling import NearMiss
In [20]:
nm = NearMiss(n_neighbors=5)
X_down, y_down = nm.fit_resample(df.drop('y', axis=1), df.y)
In [21]:
df_down      = pd.DataFrame(data=X_down)
df_down['y'] = y_down
df_down.y.value_counts()
Out[21]:
1    100
0    100
Name: y, dtype: int64
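Rather than dropping majority rows at random, NearMiss picks the ones considered informative: version 1 (the default used above) keeps the majority samples with the smallest average distance to their nearest minority neighbours, which is why the retained blue points hug the minority class in the plot below. Versions 2 and 3 use different distance heuristics, selected with the version parameter; a minimal sketch with illustrative values:

# NearMiss-2 keeps the majority samples closest (on average) to the farthest minority points;
# NearMiss-3 is a two-step variant that additionally takes n_neighbors_ver3.
nm2 = NearMiss(version=2, n_neighbors=3)
X_nm2, y_nm2 = nm2.fit_resample(df.drop('y', axis=1), df.y)
pd.Series(y_nm2).value_counts()   # expect 100 / 100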
In [22]:
df0 = df_down[df_down.y == 0]
df1 = df_down[df_down.y == 1]

plt.scatter(df0[0], df0[1], color='blue')
plt.scatter(df1[0], df1[1], color='orange')

plt.ylim((-0.81, 3.27))
plt.xlim((-3.47, 4.82))
[Figure: scatter plot after NearMiss, drawn with the same axis limits as the original data; class 0 in blue, class 1 in orange]
In [23]:
df0 = df[df.y == 0]
df1 = df[df.y == 1]

plt.scatter(df0[0], df0[1], color='blue')
plt.scatter(df1[0], df1[1], color='orange')
[Figure: the original data replotted for comparison; class 0 in blue, class 1 in orange]

ClusterCentroids

In [24]:
from imblearn.under_sampling import ClusterCentroids
In [25]:
X = df.drop('y', axis=1)
y = df.y

_object = ClusterCentroids()
_X, _y  = _object.fit_resample(X, y)

# Plot
X0 = _X[_y == 0]
X1 = _X[_y == 1]

plt.scatter(X0[0], X0[1], color='blue')
plt.scatter(X1[0], X1[1], color='orange')

plt.ylim((-0.81, 3.27))
plt.xlim((-3.47, 4.82))
plt.title('{:s} ({:d}/{:d})'.format(str(_object), len(X0), len(X1)))
plt.show()
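ClusterCentroids is the one under-sampler here that does not keep original rows: it clusters the majority class with KMeans and replaces it with the cluster centroids, which is why the blue points above no longer coincide with the raw data. Both the target counts and the clustering estimator can be changed; a minimal sketch with illustrative values:

from sklearn.cluster import MiniBatchKMeans

# Keep 500 centroids for class 0 instead of shrinking it all the way down to 100,
# and use MiniBatchKMeans to speed up the clustering step.
cc = ClusterCentroids(
    sampling_strategy={0: 500},
    estimator=MiniBatchKMeans(n_init=1, random_state=0),
    random_state=0,
)
X_cc, y_cc = cc.fit_resample(X, y)
pd.Series(y_cc).value_counts()   # expect 500 / 100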

Combining over & under sampling

In [26]:
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SVMSMOTE
In [27]:
pipeline = Pipeline(steps=[
    ('o', SVMSMOTE(sampling_strategy=0.1)),   # over-sample minority to 10% of the majority (~990 samples)
    ('u', RandomUnderSampler())               # then under-sample majority down to a 1:1 ratio
])
_X, _y = pipeline.fit_resample(X, y)

# Plot
X0 = _X[_y == 0]
X1 = _X[_y == 1]

plt.scatter(X0[0], X0[1], color='blue')
plt.scatter(X1[0], X1[1], color='orange')

plt.ylim((-0.81, 3.27))
plt.xlim((-3.47, 4.82))
plt.title('{:s} ({:d}/{:d})'.format(str(pipeline), len(X0), len(X1)))
plt.show()
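Because the imbalanced-learn Pipeline applies its samplers only during fit, a classifier can be added as the final step and the whole pipeline cross-validated without the resampling leaking into the validation folds. A minimal sketch, with the decision tree and the sampling ratios chosen only for illustration:

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

model = Pipeline(steps=[
    ('o', SVMSMOTE(sampling_strategy=0.1, random_state=0)),            # minority up to ~990
    ('u', RandomUnderSampler(sampling_strategy=0.5, random_state=0)),  # majority down to ~1980
    ('m', DecisionTreeClassifier(random_state=0)),
])

# Resampling happens on the training folds only; the validation folds keep the true 99:1 ratio
scores = cross_val_score(model, X, y, scoring='f1', cv=5)
print(scores.mean())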