import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Toy data set: six observations described by two numeric features, A and B.
df = pd.DataFrame.from_dict(
    {
        'A': [10, 11, 8, 3, 2, 1],
        'B': [6, 4, 5, 3, 2.8, 1],
    }
)
df
# Scatter plot of the raw features. A fresh figure is opened explicitly so
# that, when this file runs as a plain script, the plot is not drawn on top
# of (or underneath) any other plot sharing the implicit current axes.
plt.figure()
plt.scatter(df.A, df.B)
plt.xlabel('Caractéristique A')
plt.ylabel('Caractéristique B')

# Standardize every column: subtract its mean and divide by its standard
# deviation (pandas' default sample standard deviation, ddof=1).
df_norm = (df - df.mean()) / df.std()

# Same scatter plot for the standardized features, again on its own figure.
plt.figure()
plt.scatter(df_norm.A, df_norm.B)
plt.xlabel('Caractéristique A')
plt.ylabel('Caractéristique B')
from sklearn.decomposition import PCA
# Fit a two-component PCA on the standardized data; `res` holds the samples
# expressed in the principal-component basis.
pca = PCA(n_components=2)
res = pca.fit_transform(df_norm)
res

# Singular values of the fitted decomposition.
np.round(pca.singular_values_, 2)

# Eigenvalues of the covariance matrix (variance explained per component).
np.round(pca.explained_variance_, 2)

# Each eigenvalue divided by the sum of all eigenvalues.
np.round(pca.explained_variance_ratio_, 2)

# Eigenvectors: one principal axis per row.
pca.components_
# Bar chart of the share of variance carried by each principal component.
# The tick labels are generated from the number of fitted components rather
# than hard-coded, and the chart gets its own figure so it is not drawn over
# an earlier plot when the file runs as a script.
plt.figure()
plt.bar(
    [f'PC{i}' for i in range(1, len(pca.explained_variance_ratio_) + 1)],
    pca.explained_variance_ratio_,
)

# Keep only the first k principal axes and project the standardized samples
# onto them; the result has one row per retained component.
k = 1
df_reduced = np.dot(pca.components_[:k], df_norm.T)

# 1-D view of the projection: every point is placed at the same height (1)
# so that only its coordinate along PC1 varies.
plt.figure()
plt.scatter(df_reduced[0], np.ones_like(df_reduced[0]))
# Covariance matrix of the standardized features. np.cov expects one
# variable per row, hence the transpose.
cov = np.cov(df_norm.T)
cov

# The same matrix written out by hand: X^T X / (n - 1).
df_norm.T @ df_norm / (df_norm.shape[0] - 1)
# `cov` is a covariance matrix and therefore symmetric, so use the symmetric
# solver: np.linalg.eigh always returns real eigenvalues/eigenvectors, whereas
# the general-purpose np.linalg.eig may produce complex output from round-off.
# Note that eigh returns the eigenvalues in ascending order; the explicit
# descending sort below makes the final result independent of that order.
eigenvalues, eigenvectors = np.linalg.eigh(cov)

# The solver returns eigenvectors as columns: eigenvectors[:, i] belongs to
# eigenvalues[i]. Transpose so that each ROW is an eigenvector instead,
# i.e. eigenvectors[i, :] belongs to eigenvalues[i].
eigenvectors = eigenvectors.T

print("Eigenvalues (explained variance):\n", eigenvalues, "\n")
print("Eigenvectors (components):\n", eigenvectors)
print("Explained variance ratio:\n", eigenvalues/eigenvalues.sum())

# Indices that order the eigenvalues from largest to smallest.
rsort_eigenvalues_idx = eigenvalues.argsort()[::-1]
rsort_eigenvalues_idx

# Explained variance ratio, largest component first.
eigenvalues[rsort_eigenvalues_idx]/eigenvalues.sum()

# Eigenvectors (rows) reordered to match the descending eigenvalues.
rsort_eigenvectors = eigenvectors[rsort_eigenvalues_idx]
rsort_eigenvectors
PC1 is the principal component that captures 92% of the variance in the data, using a linear combination of A and B (-0.71⋅A - 0.71⋅B; the overall sign of a component is arbitrary). This means that a 1-D plot using only PC1 is a good approximation of the 2-D plot, since it accounts for 92% of the variation in the data. Such a projection can be used to identify clusters in the data.
# Project the standardized samples onto the k leading eigenvectors obtained
# from the manual eigendecomposition (rows of rsort_eigenvectors, largest
# eigenvalue first). The result has one row per retained component.
k = 1
df_reduced = np.dot(rsort_eigenvectors[:k], df_norm.T)
df_reduced

# 1-D view of the projection on its own figure, so that it is not overlaid
# on an earlier plot when the file runs as a script. All points share the
# same height (1); only the coordinate along the first component varies.
plt.figure()
plt.scatter(df_reduced[0], np.ones_like(df_reduced[0]))
# Third route to the same quantities: a thin SVD of the standardized data.
# np.linalg.svd returns the right singular vectors already transposed, so
# the ROWS of V are the singular vectors.
U, s, V = np.linalg.svd(df_norm, full_matrices=False)

# Left singular vectors
U

# Singular values
np.round(s, 2)

# Right singular vectors (rows) = eigenvectors of the covariance matrix
V

# Eigenvalues recovered from the singular values: s^2 / (n - 1)
n_sample = df_norm.shape[0]
np.round(np.square(s) / (n_sample - 1), 2)

# Transformed data: the first k left singular vectors, each column scaled by
# its singular value
k = 2
np.multiply(U[:, :k], s[:k])