Kernel PCA
# Split the dataset
# Feature scaling
# Apply Kernel
from sklearn.decomposition import KernelPCA
# Commented reference example (cf. the scikit-learn KernelPCA docs): RBF kernel PCA
# with an approximate inverse mapping back to the original feature space
# kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
# X_kpca = kpca.fit_transform(X)
# X_back = kpca.inverse_transform(X_kpca)
# Plain (linear) PCA on the same data for comparison
# pca = PCA()
# X_pca = pca.fit_transform(X)
kpca = KernelPCA(n_components = 2, kernel = 'rbf')  # gamma defaults to 1 / n_features
X_train = kpca.fit_transform(X_train)  # fit on the training set only
X_test = kpca.transform(X_test)        # apply the same mapping to the test set
# Fit Logistic Regression to the Training set
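A minimal sketch of the elided fit step (the classifier choice and the y_train name are assumptions here, carried over from the split above):

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)  # train on the two kernel-PCA components
y_pred = classifier.predict(X_test)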
Linear Discriminant Analysis (sklearn)
Supervised Feature Extraction
# Split the dataset
# Feature scaling
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components = 2)  # capped at min(n_classes - 1, n_features)
# Docs equivalent: X_r2 = lda.fit(X, y).transform(X)
X_train = lda.fit_transform(X_train, y_train)  # supervised: needs the class labels
X_test = lda.transform(X_test)
# Fit Logistic Regression to the Training set
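The fitted lda object also reports how much between-class variance each discriminant captures; a quick check (sketch, using the lda fitted above):

print('discriminant explained variance ratio:', lda.explained_variance_ratio_)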
Principal Component Analysis (sklearn)
Unsupervised Feature Extraction
# Split the dataset
# Feature scaling
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
# Docs equivalent: X_r = pca.fit(X).transform(X)
X_train = pca.fit_transform(X_train)  # fit on the training set only
X_test = pca.transform(X_test)        # reuse the fitted components on the test set
explained_variance = pca.explained_variance_ratio_
# Percentage of variance explained by each component
print('explained variance ratio (first two components): %s'
% str(pca.explained_variance_ratio_))
# Fit Logistic Regression to the Training set
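A sketch of evaluating the elided classifier on the PCA features (classifier and y_pred are the assumed names from the Kernel PCA sketch above):

from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
print('accuracy:', accuracy_score(y_test, y_pred))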
Explained Variance Example
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import make_classification
import tensorflow as tf
%matplotlib inline
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
print(cancer.keys())
print('target names ', cancer['target_names'])
# print(cancer['DESCR'])
df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
features_for_pca = df.columns.shape[0]
df['target'] = cancer['target']
print(df.groupby('target')['target'].count())
scaler = StandardScaler()
scaler.fit(df.drop('target', axis = 1))
scaled_data = scaler.transform(df.drop('target', axis = 1))
# Replace spaces in column names with underscores
column_names = []
for name in df.columns:
column_names.append(name.replace(" ", "_"))
df.columns = column_names
# print(df.columns)
df.head()
# https://www.analyticsvidhya.com/blog/2016/03/practical-guide-principal-component-analysis-python/
pca_full = PCA(n_components=features_for_pca)
pca_full.fit(scaled_data)
# Amount of variance for each component
var = pca_full.explained_variance_ratio_
# Cumulative sum
cumsum = np.cumsum(np.round(pca_full.explained_variance_ratio_, decimals=4)*100)
# Find the number of PCA components that account for 95% of the variance (~10 here)
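# A minimal sketch, assuming cumsum (in percent) from above:
# first index where the running total reaches 95%, plus one for the count
n_components_95 = int(np.argmax(cumsum >= 95) + 1)
print('components needed for 95% variance:', n_components_95)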
plt.figure(figsize=(12, 4))
plt.subplot(121)
plt.plot(var * 100, 'b-')  # convert ratio to percent to match the axis label
plt.title('Variance')
plt.xlabel('Components')
plt.ylabel('% Explained Variance')
plt.grid()
plt.subplot(122)
plt.plot(cumsum, 'r-')
plt.title('Cumulative Sum')
plt.xlabel('Components')
plt.ylabel('Sum % Explained Variance')
plt.grid()
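With %matplotlib inline the figure renders on its own; outside a notebook you would likely finish with:

plt.tight_layout()
plt.show()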