import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install imbalanced-learn
from sklearn.datasets import make_classification
# Generate a synthetic two-feature classification problem that is heavily
# imbalanced; the seed is fixed so the same samples are produced every run.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],  # share of class 0; the remaining share goes to class 1
    flip_y=0,  # do not randomly reassign any labels
    random_state=1,
)
# Peek at the first five feature rows, then the first five labels.
X[:5]
y[:5]
# Put the feature matrix into a DataFrame (feature columns are named 0 and 1)
# and store the class label next to it in column 'y'.
df = pd.DataFrame(X)
df['y'] = y
df.head()
# Number of samples per class in the raw data.
df['y'].value_counts()
# Scatter the original data with one colour per class: class 0 in blue,
# class 1 in orange. Columns 0 and 1 are the two feature columns.
df0 = df.loc[df['y'] == 0]
df1 = df.loc[df['y'] == 1]
for subset, colour in ((df0, 'blue'), (df1, 'orange')):
    plt.scatter(subset[0], subset[1], color=colour)
from sklearn.utils import resample
# Random over-sampling: draw class-1 rows with replacement until there are as
# many of them as class-0 rows, then stack both classes into one frame.
df0, df1 = df[df['y'] == 0], df[df['y'] == 1]
df1_up = resample(df1, replace=True, n_samples=len(df0), random_state=0)
df_up = pd.concat([df0, df1_up])
# Class counts after over-sampling.
df_up['y'].value_counts()
# Visualise the randomly over-sampled frame (df_up). The added class-1 rows
# are exact copies of existing rows, so they land on top of the points they
# were drawn from; only the class counts differ from the raw data.
df0 = df_up[df_up.y == 0]
df1 = df_up[df_up.y == 1]
plt.scatter(df0[0], df0[1], color='blue')  # class 0
plt.scatter(df1[0], df1[1], color='orange')  # class 1
from imblearn.over_sampling import SMOTE
# SMOTE over-sampling: synthesise new class-1 samples instead of duplicating
# existing rows. The seed is pinned so the synthetic points are reproducible,
# like the seeded dataset generation and resample() calls in this script.
smote = SMOTE(random_state=0)
X_up, y_up = smote.fit_resample(df.drop('y', axis=1), df.y)
# Rebuild a single frame holding the resampled features plus the label column.
df_up = pd.DataFrame(data=X_up)
df_up['y'] = y_up
# Class counts after SMOTE.
df_up.y.value_counts()
# Scatter the frame currently held in df_up, split by class label:
# class 0 in blue, class 1 in orange.
is_class0 = df_up['y'] == 0
is_class1 = df_up['y'] == 1
df0 = df_up[is_class0]
df1 = df_up[is_class1]
plt.scatter(x=df0[0], y=df0[1], color='blue')
plt.scatter(x=df1[0], y=df1[1], color='orange')
# Random under-sampling: keep a random subset of class-0 rows (drawn without
# replacement) that is the same size as class 1, then recombine the classes.
df0, df1 = df[df['y'] == 0], df[df['y'] == 1]
df0_down = resample(df0, replace=False, n_samples=len(df1), random_state=0)
df_down = pd.concat([df0_down, df1])
# Class counts after under-sampling.
df_down['y'].value_counts()
# Scatter the under-sampled frame, class 0 in blue and class 1 in orange.
df0 = df_down.loc[df_down['y'] == 0]
df1 = df_down.loc[df_down['y'] == 1]
for subset, colour in ((df0, 'blue'), (df1, 'orange')):
    plt.scatter(subset[0], subset[1], color=colour)
from imblearn.under_sampling import NearMiss
# NearMiss under-sampling of the original data, using 5 neighbours.
nm = NearMiss(n_neighbors=5)
features = df.drop(columns='y')
X_down, y_down = nm.fit_resample(features, df['y'])
# Rebuild a single frame holding the resampled features plus the label column.
df_down = pd.DataFrame(X_down)
df_down['y'] = y_down
# Class counts after NearMiss.
df_down['y'].value_counts()
# Scatter the frame currently held in df_down (class 0 blue, class 1 orange)
# inside a fixed axis window.
keep0 = df_down['y'] == 0
keep1 = df_down['y'] == 1
df0 = df_down[keep0]
df1 = df_down[keep1]
plt.scatter(x=df0[0], y=df0[1], color='blue')
plt.scatter(x=df1[0], y=df1[1], color='orange')
# Hard-coded axis limits, the same values used by the later plots in this script.
plt.ylim(-0.81, 3.27)
plt.xlim(-3.47, 4.82)
# Scatter the original (un-resampled) frame again: class 0 blue, class 1 orange.
# NOTE(review): presumably a reference picture for the resampled plots - confirm.
df0 = df.loc[df['y'] == 0]
df1 = df.loc[df['y'] == 1]
for subset, colour in ((df0, 'blue'), (df1, 'orange')):
    plt.scatter(subset[0], subset[1], color=colour)
from imblearn.under_sampling import ClusterCentroids
# Cluster-centroid under-sampling of the original data.
# From here on X / y refer to the DataFrame features and label Series
# (they replace the NumPy arrays returned by make_classification).
X = df.drop('y', axis=1)
y = df.y
# Seeded so the clustering-based resampling is reproducible, like the seeded
# dataset generation and resample() calls in this script.
_object = ClusterCentroids(random_state=0)
_X, _y = _object.fit_resample(X, y)
# Plot the resampled classes (class 0 blue, class 1 orange) in a fixed window.
X0 = _X[_y == 0]
X1 = _X[_y == 1]
plt.scatter(X0[0], X0[1], color='blue')
plt.scatter(X1[0], X1[1], color='orange')
plt.ylim((-0.81, 3.27))
plt.xlim((-3.47, 4.82))
# Title: the sampler's repr followed by the per-class sample counts.
plt.title(f'{_object!s} ({len(X0):d}/{len(X1):d})')
plt.show()
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SVMSMOTE
# Combined resampling: first over-sample, then under-sample. Both steps are
# seeded so the result is reproducible, like the seeded dataset generation
# and resample() calls in this script.
pipeline = Pipeline(steps=[
    # Over-sample class 1 until it reaches 10% of the class-0 count.
    ('o', SVMSMOTE(sampling_strategy=0.1, random_state=0)),
    # Randomly under-sample with the sampler's default strategy.
    ('u', RandomUnderSampler(random_state=0)),
])
_X, _y = pipeline.fit_resample(X, y)
# Plot the resampled classes (class 0 blue, class 1 orange) in a fixed window.
X0 = _X[_y == 0]
X1 = _X[_y == 1]
plt.scatter(X0[0], X0[1], color='blue')
plt.scatter(X1[0], X1[1], color='orange')
plt.ylim((-0.81, 3.27))
plt.xlim((-3.47, 4.82))
# Title: the pipeline's repr followed by the per-class sample counts.
plt.title(f'{pipeline!s} ({len(X0):d}/{len(X1):d})')
plt.show()