Readings
https://machinelearningmastery.com/feature-selection-machine-learning-python/
https://scikit-learn.org/stable/modules/feature_selection.html
Libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, SelectKBest, chi2, f_regression, VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, mean_squared_error
import tensorflow as tf
%matplotlib inline
Get Data
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
print(cancer.keys())
print('target names ', cancer['target_names'])
# print(cancer['DESCR'])
df = pd.DataFrame(cancer['data'],columns=cancer['feature_names'])
features_for_pca = df.columns.shape[0]
df['target'] = cancer['target']
print(df.groupby('target')['target'].count())
scaler = StandardScaler()
scaler.fit(df.drop('target', axis = 1))
scaled_data = scaler.transform(df.drop('target', axis = 1))
# Replace spaces
column_names = []
for name in df.columns:
    column_names.append(name.replace(" ", "_"))  # '_' added for Atom
df.columns = column_names
# print(df.columns)
df.head()
Logistic Regression L1
X = df.drop('target', axis = 1)
y = df['target']
model = LogisticRegression(penalty='l1', solver='liblinear')  # liblinear (or saga) is needed for the L1 penalty
model.fit(X, y)
coef_dict = {}
for coef, feat in zip(model.coef_[0, :], X):
    if coef != 0:
        coef_dict[feat] = coef
print(pd.DataFrame.from_dict(coef_dict, orient='index', columns=['Coef']).sort_values(by=['Coef']))
# or
l1_coefs = pd.DataFrame({
    'Name': df.drop('target', axis=1).columns,
    'Coef': model.coef_[0]
})
print(l1_coefs[l1_coefs['Coef'] != 0].sort_values(by=['Coef']).Name)
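The coefficients above come from the unscaled features, so their magnitudes are not directly comparable; a minimal sketch of the same selection on the scaled_data computed earlier (scaling here is my assumption, not part of the original run):
# sketch: refit the L1 logistic regression on the standardized features
model_scaled = LogisticRegression(penalty='l1', solver='liblinear')
model_scaled.fit(scaled_data, y)
l1_scaled = pd.DataFrame({
    'Name': X.columns,
    'Coef': model_scaled.coef_[0]
})
print(l1_scaled[l1_scaled['Coef'] != 0].sort_values(by=['Coef']))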
VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(df.drop('target', axis=1))
print(pd.DataFrame(df.drop('target', axis=1).columns[sel.get_support(indices=True)], columns=['Feature']))
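To see why each feature passed or failed, the fitted selector exposes the per-feature variances it compared against the threshold; a short sketch:
# sketch: inspect the raw variances used by VarianceThreshold
variances = pd.DataFrame({
    'Name': df.drop('target', axis=1).columns,
    'Variance': sel.variances_
})
print(variances.sort_values(by='Variance', ascending=False))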
SelectKBest
X = df.drop('target', axis = 1)
y = df['target']
selector = SelectKBest(chi2, k=10)
fit_skb = selector.fit(X, y)
skb = pd.DataFrame({
    'Name': df.drop('target', axis=1).columns,
    'SKBScore': fit_skb.scores_
})
print(skb.sort_values(by='SKBScore', ascending=False).head(10))
# cols = selector.get_support(indices=True)
# print(cols)
# print(df.columns[cols])
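A short sketch of actually reducing X to the 10 selected columns while keeping their names (equivalent to the commented get_support lines above):
# sketch: keep only the k best columns as a named DataFrame
X_kbest = pd.DataFrame(fit_skb.transform(X), columns=X.columns[fit_skb.get_support()])
print(X_kbest.shape)
print(X_kbest.columns)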
Recursive Feature Elimination
model = LogisticRegression(solver='liblinear')  # liblinear avoids convergence issues on the unscaled features
rfe = RFE(model, n_features_to_select=10)
fit = rfe.fit(X, y)
rfe = pd.DataFrame({
    'Name': df.drop('target', axis=1).columns,
    'Rank': fit.ranking_,
    'Support': fit.support_
})
print(rfe.sort_values(by=['Rank']).head(10).sort_index())
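And a one-line sketch listing just the columns RFE kept (the rows with Support == True above):
# sketch: the 10 features selected by RFE
print(X.columns[fit.support_])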
Extra Trees
extrees = ExtraTreesClassifier()
extrees.fit(X, y)
extrees = pd.DataFrame({
    'Name': df.drop('target', axis=1).columns,
    'ExTrees': extrees.feature_importances_
})
print(extrees.sort_values(by=['ExTrees'], ascending=False).head(10).sort_index())
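Since matplotlib and seaborn are already imported, a quick sketch of plotting the top importances (figure size and styling are arbitrary choices):
# sketch: bar plot of the 10 largest Extra Trees importances
top10 = extrees.sort_values(by='ExTrees', ascending=False).head(10)
plt.figure(figsize=(8, 4))
sns.barplot(x='ExTrees', y='Name', data=top10)
plt.title('Extra Trees feature importances (top 10)')
plt.show()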
Hat tip to Soledad Galli for the methods from here onward.
In all feature selection procedures, it is good practice to select features by examining only the training set, in order to avoid overfitting.
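A minimal sketch of such a split (test_size and random_state are arbitrary choices here); the X_train / X_test used in the snippets below assume something like this has been run:
# sketch: split first, then run every selection step on the training set only
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
print(X_train.shape, X_test.shape)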
Misc Methods
# remove constant features
constant_features = [
    feat for feat in X_train.columns if X_train[feat].std() == 0
]
X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)
X_train.shape, X_test.shape
# remove quasi-constant features
sel = VarianceThreshold(threshold=0.01)  # for (near-)binary features, 0.01 removes those showing the same value in ~99% of observations
sel.fit(X_train)  # fit finds the features with low variance
sum(sel.get_support())
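The fit above only counts the surviving features; a short sketch of actually applying the filter while keeping the column names:
# sketch: keep only the retained columns as DataFrames (column names preserved)
features_to_keep = X_train.columns[sel.get_support()]
X_train = X_train[features_to_keep]
X_test = X_test[features_to_keep]
X_train.shape, X_test.shape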
# check for duplicated features in the training set
duplicated_feat = []
for i in range(0, len(X_train.columns)):
    if i % 10 == 0:  # progress indicator so we can see how far the loop has got
        print(i)
    col_1 = X_train.columns[i]
    for col_2 in X_train.columns[i + 1:]:
        if X_train[col_1].equals(X_train[col_2]):
            duplicated_feat.append(col_2)
len(duplicated_feat)
# remove duplicated features
X_train.drop(labels=duplicated_feat, axis=1, inplace=True)
X_test.drop(labels=duplicated_feat, axis=1, inplace=True)
X_train.shape, X_test.shape
# find and remove correlated features
# to reduce the feature space
def correlation(dataset, threshold):
    col_corr = set()  # set of the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:  # we are interested in the absolute coefficient value
                colname = corr_matrix.columns[i]  # name of the correlated column
                col_corr.add(colname)
    return col_corr
corr_features = correlation(X_train, 0.8)
# remove the correlated features found above
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)
X_train.shape, X_test.shape
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import roc_auc_score
# from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# Step backward greedy selection algorithm
# sfs1 = SFS(RandomForestRegressor(),
# k_features=10,
# forward=False,
# floating=False,
# verbose=2,
# scoring='r2',
# cv=3)
# sfs1 = sfs1.fit(np.array(X_train), y_train)
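The commented sketch above pairs a regressor with the 'r2' score, which suits a regression target; for the binary target used here, a hedged variant (assuming mlxtend is installed; the classifier, scoring and n_estimators are my choices) might look like:
# sketch: step-backward selection with a classifier and roc_auc scoring
# from mlxtend; small n_estimators just to keep the run time down
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
sfs = SFS(RandomForestClassifier(n_estimators=10, random_state=0),
          k_features=10,
          forward=False,
          floating=False,
          scoring='roc_auc',
          cv=3)
sfs = sfs.fit(np.array(X_train), y_train)
print(sfs.k_feature_idx_)  # positional indices of the 10 selected features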
# Exhaustive feature selector
# from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
# efs1 = EFS(RandomForestClassifier(n_jobs=4, random_state=0),
# min_features=1,
# max_features=4,
# scoring='roc_auc',
# print_progress=True,
# cv=2)
# efs1 = efs1.fit(np.array(X_train[X_train.columns[0:4]].fillna(0)), y_train)
# find important features using univariate roc-auc
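No code is shown for this one; a sketch of one common way to do it (my assumption about the intended method) is to fit a tiny model per feature on the training set and rank the features by their roc-auc on the test set:
# sketch: univariate roc-auc screening, one shallow tree per feature
from sklearn.tree import DecisionTreeClassifier
roc_values = []
for feature in X_train.columns:
    clf = DecisionTreeClassifier(max_depth=2)
    clf.fit(X_train[[feature]], y_train)
    y_scored = clf.predict_proba(X_test[[feature]])[:, 1]
    roc_values.append(roc_auc_score(y_test, y_scored))
univariate_roc = pd.Series(roc_values, index=X_train.columns)
print(univariate_roc.sort_values(ascending=False).head(10))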
# select features using the coefficients of a non-regularised logistic regression
# from sklearn.feature_selection import SelectFromModel
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html
# sfm = SelectFromModel(LogisticRegression(C=1000))
# sfm.fit(scaler.transform(X), y)
# SelectFromModel(RandomForestClassifier(n_estimators=400))
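A runnable sketch of the SelectFromModel idea outlined above (C=1000 keeps the regularisation weak; refitting a fresh StandardScaler on X_train is my assumption, since the earlier scaler was fitted on all 30 columns):
# sketch: select features whose coefficients survive in a weakly regularised logistic regression
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(LogisticRegression(C=1000, max_iter=5000))
X_train_scaled = StandardScaler().fit_transform(X_train)
sfm.fit(X_train_scaled, y_train)
print(X_train.columns[sfm.get_support()])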