Checklist

  • Is there an expert on the data?
  • Supervised or unsupervised?
    • Is there a target variable?
  • Regression or classification?
    • Continuous values or discrete values (class labels)?
    • Predicting a value or identifying group membership?
  • Feature Selection
    • # Remove null features
      df.dropna(how='all', axis='columns', inplace=True)
    • # Remove null observations
      df.isnull().sum()  # Inspect null counts per column first
      null_rows = df[df.isnull().any(axis=1)].index
      df.drop(null_rows, inplace=True)
    • # Check for data leakage with expert
    • # Check for features that have only one value
      constant_features = [
          feat for feat in df.columns if len(df[feat].unique()) == 1
      ]
    • # Check for features that are constant once nulls are filled
      # (e.g., all zeros plus NaNs)
      constant_with_nulls = [
          feat for feat in df.columns if len(df[feat].fillna(0).unique()) == 1
      ]
    • # Check for quasi-constant features
      # (value_counts does not count nulls by default)
      for col in df.columns.sort_values():
          if len(df[col].unique()) < 4:
              print(df[col].value_counts())
    • # Check for duplicate features
      duplicated_feat = []
      for i in range(len(df.columns)):
          if i % 10 == 0:  # Keep track of the loop
              print('loop tracker', i)
      
          col_1 = df.columns[i]
      
          for col_2 in df.columns[i + 1:]:
              if df[col_1].equals(df[col_2]):
                  duplicated_feat.append(col_2)
  • # Does the target need engineering (e.g., the log transform sketched below)?
    # Check for other features that need preliminary engineering
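    # A minimal sketch, assuming a pandas DataFrame df with a skewed,
    # non-negative continuous 'target' column:
    import numpy as np
    
    # log1p handles zero values; invert predictions later with np.expm1
    df['target'] = np.log1p(df['target'])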
  • # Separate dataset into train, validate, and test
    # Good practice to select features by examining only the training set
    # to avoid overfitting
    from sklearn.model_selection import train_test_split
    
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(labels=['target'], axis=1),
        df['target'],
        test_size=0.2,
        random_state=0)
    # Split the training set again if a separate validation set is needed
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=0)
  • More feature selection (see the sketch after this list)
    • Check for correlated features
    • Feature importance
    • Mutual information
    • SelectKBest, SelectPercentile
    • Fisher Score - Chi-Square
    • ANOVA
    • ROC / AUC
    • Coefficients (Lasso)
    • Selection by model
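    • # A minimal sketch: drop highly correlated features, then keep the
      # k best by mutual information (the 0.9 threshold and k=10 are
      # placeholders; use mutual_info_regression for regression targets)
      from sklearn.feature_selection import SelectKBest, mutual_info_classif
      
      corr = X_train.corr().abs()
      correlated = [
          col for i, col in enumerate(corr.columns)
          if (corr.iloc[:i][col] > 0.9).any()
      ]
      X_train_reduced = X_train.drop(columns=correlated)
      
      selector = SelectKBest(mutual_info_classif, k=10)
      X_train_selected = selector.fit_transform(X_train_reduced, y_train)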
  • Feature Engineering (see the rare-label / one-hot sketch after this list)
    • Missing Data / Complete Case Analysis
    • Convert percentages to numerics
    • Outliers
    • Imputation
    • Model comparison
    • Reduce the cardinality of categorical features (fewer labels)
    • Check for rare values
    • One hot encoding
    • Weight of evidence
    • Collinearity
    • Regularization
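    • # A minimal sketch: group rare categories, then one-hot encode
      # (the column name 'cat' and the 1% cutoff are placeholders)
      import pandas as pd
      
      freq = df['cat'].value_counts(normalize=True)
      rare = freq[freq < 0.01].index
      df['cat'] = df['cat'].replace(list(rare), 'Rare')
      df = pd.get_dummies(df, columns=['cat'], drop_first=True)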
  • Imbalanced Classes (see the resampling sketch after this list)
    • SMOTE
    • Up-Sample
    • Down-Sample
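    • # A minimal SMOTE sketch; requires the separate imbalanced-learn
      # package, and resampling should touch only the training set
      from imblearn.over_sampling import SMOTE
      
      X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)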
  • Regression Models
    • # Linear Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
      from sklearn.linear_model import LinearRegression
      
      model = LinearRegression()
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Polynomial Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
      from sklearn.preprocessing import PolynomialFeatures
      from sklearn.linear_model import LinearRegression
      
      poly = PolynomialFeatures(degree=4)
      X_poly = poly.fit_transform(X_train)
      lin_reg = LinearRegression()
      lin_reg.fit(X_poly, y_train)
      # Transform (not fit) the test set to avoid leakage
      predictions = lin_reg.predict(poly.transform(X_test))
    • # Support Vector Regression
      # https://scikit-learn.org/stable/modules/svm.html#regression
      from sklearn.svm import SVR
      
      model = SVR()
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Decision Tree Regression
      # https://scikit-learn.org/stable/modules/tree.html#regression
      from sklearn.tree import DecisionTreeRegressor
      
      model = DecisionTreeRegressor()
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Random Forest Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
      from sklearn.ensemble import RandomForestRegressor
      
      model = RandomForestRegressor(n_estimators = 50)
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # LassoCV Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html
      from sklearn.linear_model import LassoCV
      
      # normalize was removed from scikit-learn; scale features beforehand
      model = LassoCV(cv=5, alphas=[0.0005])
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # AdaBoost Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html
      from sklearn.ensemble import AdaBoostRegressor
      
      model = AdaBoostRegressor(n_estimators=100, loss='linear', learning_rate=0.005)
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Ridge Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html
      from sklearn.linear_model import Ridge
      
      # normalize was removed from scikit-learn; scale features beforehand
      model = Ridge(random_state=10, alpha=0.001)
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # ElasticNet Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
      from sklearn.linear_model import ElasticNet
      
      model = ElasticNet(alpha=1, l1_ratio=0.5)
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
  • Classification Models
    • # Logistic Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
      from sklearn.linear_model import LogisticRegression
      
      model = LogisticRegression()
      model.fit(X_train,y_train)
      predictions = model.predict(X_test)
    • # K-Nearest Neighbor Classification
      # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
      from sklearn.neighbors import KNeighborsClassifier
      
      model = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Support Vector Classification
      # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
      from sklearn.svm import SVC
      
      model = SVC(kernel = 'linear')
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Kernel SVM Classification
      # https://scikit-learn.org/stable/auto_examples/svm/plot_custom_kernel.html#sphx-glr-auto-examples-svm-plot-custom-kernel-py
      from sklearn.svm import SVC
      
      model = SVC(kernel = 'rbf')
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Naive Bayes Classification
      # https://scikit-learn.org/stable/modules/naive_bayes.html
      from sklearn.naive_bayes import GaussianNB
      
      model = GaussianNB()
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Decision Tree Classification
      # https://scikit-learn.org/stable/modules/tree.html#classification
      from sklearn.tree import DecisionTreeClassifier
      
      model = DecisionTreeClassifier(criterion = 'entropy')
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Random Forest Classification
      # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
      from sklearn.ensemble import RandomForestClassifier
      
      model = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', random_state = 0)
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
  • Deep Learning Models
    • # Artificial Neural Network
      # https://keras.io/getting-started/sequential-model-guide/
      from keras.models import Sequential
      from keras.layers import Dense
      
      model = Sequential()
      model.add(Dense(12, input_dim=4, activation='relu'))
      model.add(Dense(8, activation='relu'))
      model.add(Dense(1, activation='sigmoid'))
      model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
      model.fit(X_train, y_train, epochs=100, batch_size=10)
      
      # TensorFlow 2.x
      # https://www.tensorflow.org/beta/guide/keras/overview
      try:
        # %tensorflow_version only exists in Colab.
        %tensorflow_version 2.x
      except Exception:
        pass
      
      import tensorflow as tf
      
      # Regression
      # D = number of input features
      model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, input_shape=(D,), activation='relu'),
        # Regression doesn't need an activation on the output layer
        tf.keras.layers.Dense(1)
      ])
      
      opt = tf.keras.optimizers.Adam(0.01)
      model.compile(optimizer=opt, loss='mae')
      model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100)
      
      # Classification
      model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, input_shape=(D,), activation='relu'),
        # Sigmoid is good for binary classes
        tf.keras.layers.Dense(1, activation='sigmoid')
      ])
      
      model.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=['accuracy'])
      
      model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100)
    • # Convolutional Neural Network
      
      import tensorflow as tf
      from tensorflow.keras.layers import Input, Conv2D, Dense, Flatten, Dropout, MaxPooling2D
      from tensorflow.keras.models import Model, Sequential
      
      model = Sequential()
      model.add(Conv2D(32, (3, 3), input_shape = (64, 64, 3), activation = 'relu'))
      model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
      model.add(Conv2D(32, (3, 3), activation = 'relu'))
      model.add(MaxPooling2D(pool_size = (2, 2)))
      model.add(Flatten())
      model.add(Dense(number_of_units, activation='relu'))  # e.g., 128
      model.add(Dense(number_of_classes, activation='softmax'))
      # A softmax multiclass output pairs with categorical_crossentropy
      # (use sparse_categorical_crossentropy for integer labels)
      model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
      model.fit(X_train, y_train, epochs=100, batch_size=32)
      
      # TensorFlow 2.x
      i = Input(shape=X_train[0].shape)
      x = Conv2D(32, (3, 3), strides=2, activation='relu')(i)
      x = Conv2D(64, (3, 3), strides=2, activation='relu')(x)
      x = Conv2D(128, (3, 3), strides=2, activation='relu')(x)
      x = Flatten()(x)
      x = Dropout(0.2)(x)
      x = Dense(512, activation='relu')(x)
      x = Dropout(0.2)(x)
      x = Dense(K, activation='softmax')(x)  # K = number of classes
      
      model = Model(i, x)
      model.compile(optimizer='adam',
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
      
      model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100)
    • # Recurrent Neural Network
      import tensorflow as tf
      from tensorflow.keras.layers import Input, SimpleRNN, LSTM, Dense, Dropout
      from tensorflow.keras.models import Model, Sequential
      from tensorflow.keras.optimizers import Adam
      
      model = Sequential()
      # return_sequences is only needed when stacking recurrent layers
      model.add(LSTM(units=number_of_units, input_shape=(X.shape[1], X.shape[2])))
      model.add(Dropout(0.2))
      model.add(Dense(y.shape[1], activation='softmax'))
      # A softmax output pairs with categorical_crossentropy, not MSE
      model.compile(optimizer='adam', loss='categorical_crossentropy')
      model.fit(X_train, y_train, epochs=100, batch_size=32)
      
      # TensorFlow 2.x
      i = Input(shape=X_train[0].shape)
      x = SimpleRNN(5, activation='relu')(i)
      # x = LSTM(128)(i)
      # Softmax good for multiclass
      x = Dense(10, activation='softmax')(x)
      model = Model(i, x)
      model.compile(
        loss='sparse_categorical_crossentropy',  # matches the softmax output
        optimizer=Adam(learning_rate=0.1),  # lr was renamed to learning_rate
      )
      
      model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100)
  • Cross Validation
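    • # A minimal sketch, assuming the X_train/y_train split above and any
      # scikit-learn estimator `model`
      from sklearn.model_selection import cross_val_score
      
      scores = cross_val_score(model, X_train, y_train, cv=5)
      print(scores.mean(), scores.std())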
  • Grid Search
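    • # A minimal sketch tuning a random forest; the grid values are
      # placeholders
      from sklearn.ensemble import RandomForestClassifier
      from sklearn.model_selection import GridSearchCV
      
      param_grid = {'n_estimators': [50, 100], 'max_depth': [None, 10]}
      grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
      grid.fit(X_train, y_train)
      print(grid.best_params_, grid.best_score_)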
  • Metrics - https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
  • Regression Metrics (see the usage sketch after this list)
    • R Squared
    • Explained Variance
    • Mean Absolute Error
    • Mean Squared Error
    • Mean Squared Log Error
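    • # A minimal sketch, assuming y_test and predictions from one of the
      # regression models above
      from sklearn.metrics import (r2_score, explained_variance_score,
                                   mean_absolute_error, mean_squared_error,
                                   mean_squared_log_error)
      
      print(r2_score(y_test, predictions))
      print(explained_variance_score(y_test, predictions))
      print(mean_absolute_error(y_test, predictions))
      print(mean_squared_error(y_test, predictions))
      # Requires non-negative targets and predictions
      print(mean_squared_log_error(y_test, predictions))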
  • Classification Metrics (see the usage sketch after this list)
    • Accuracy
    • Classification Report
    • Confusion Matrix
    • Matthews Correlation Coefficient
    • ROC / AUC
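    • # A minimal sketch, assuming y_test and predictions from one of the
      # classifiers above; roc_auc_score needs probabilities (binary case
      # shown) rather than class labels
      from sklearn.metrics import (accuracy_score, classification_report,
                                   confusion_matrix, matthews_corrcoef,
                                   roc_auc_score)
      
      print(accuracy_score(y_test, predictions))
      print(classification_report(y_test, predictions))
      print(confusion_matrix(y_test, predictions))
      print(matthews_corrcoef(y_test, predictions))
      print(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))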