Checklist

  • Is there an expert on the data?
  • Supervised or unsupervised?
    • Is there a target variable?
  • Regression or classification?
    • Continuous values or discrete values (class labels)?
    • Predicting a value or identifying group membership?
  • Feature Selection
    • # Remove null features
      df.dropna(how='all', axis='columns', inplace=True)
    • # Remove null observations
      df.isnull().sum()  # Inspect null counts per column first
      null_rows = df[df.isnull().any(axis=1)].index
      df.drop(null_rows, inplace=True)
    • # Check for data leakage with expert
    • # Check for features that have only one value
      constant_features = [
          feat for feat in df.columns if len(df[feat].unique()) == 1
      ]
    • # Check for features that are constant once nulls are filled
      # (e.g., all zeros plus NaNs)
      constant_with_nulls = [
          feat for feat in df.columns if len(df[feat].fillna(0).unique()) == 1
      ]
    • # Check for quasi-constant features
      # (value_counts does not count nulls by default)
      for col in df.columns.sort_values():
          if len(df[col].unique()) < 4:
              print(df[col].value_counts())
    • # Check for duplicate features
      duplicated_feat = []
      for i in range(len(df.columns)):
          if i % 10 == 0:  # Keep track of the loop
              print('loop tracker', i)
      
          col_1 = df.columns[i]
      
          for col_2 in df.columns[i + 1:]:
              if df[col_1].equals(df[col_2]):
                  duplicated_feat.append(col_2)
  • # Does the target need engineering (e.g., the log transform sketched below)?
    # Check for other features that need preliminary engineering
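    # A minimal sketch, assuming a pandas DataFrame df with a skewed,
    # non-negative continuous 'target' column:
    import numpy as np
    
    # log1p handles zero values; invert predictions later with np.expm1
    df['target'] = np.log1p(df['target'])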
  • # Separate dataset into train, validate, and test
    # Good practice to select features by examining only the training set
    # to avoid overfitting
    from sklearn.model_selection import train_test_split
    
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(labels=['target'], axis=1),
        df['target'],
        test_size=0.2,
        random_state=0)
    # Split the training set again if a separate validation set is needed
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=0)
  • More feature selection (see the sketch after this list)
    • Check for correlated features
    • Feature importance
    • Mutual information
    • SelectKBest, SelectPercentile
    • Fisher Score - Chi-Square
    • ANOVA
    • ROC / AUC
    • Coefficients (Lasso)
    • Selection by model
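    • # A minimal sketch: drop highly correlated features, then keep the
      # k best by mutual information (the 0.9 threshold and k=10 are
      # placeholders; use mutual_info_regression for regression targets)
      from sklearn.feature_selection import SelectKBest, mutual_info_classif
      
      corr = X_train.corr().abs()
      correlated = [
          col for i, col in enumerate(corr.columns)
          if (corr.iloc[:i][col] > 0.9).any()
      ]
      X_train_reduced = X_train.drop(columns=correlated)
      
      selector = SelectKBest(mutual_info_classif, k=10)
      X_train_selected = selector.fit_transform(X_train_reduced, y_train)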
  • Feature Engineering (see the rare-label / one-hot sketch after this list)
    • Missing Data / Complete Case Analysis
    • Convert percentages to numerics
    • Outliers
    • Imputation
    • Model comparison
    • Reduce the cardinality of categorical features (fewer labels)
    • Check for rare values
    • One hot encoding
    • Weight of evidence
    • Collinearity
    • Regularization
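    • # A minimal sketch: group rare categories, then one-hot encode
      # (the column name 'cat' and the 1% cutoff are placeholders)
      import pandas as pd
      
      freq = df['cat'].value_counts(normalize=True)
      rare = freq[freq < 0.01].index
      df['cat'] = df['cat'].replace(list(rare), 'Rare')
      df = pd.get_dummies(df, columns=['cat'], drop_first=True)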
  • Imbalanced Classes (see the resampling sketch after this list)
    • SMOTE
    • Up-Sample
    • Down-Sample
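    • # A minimal SMOTE sketch; requires the separate imbalanced-learn
      # package, and resampling should touch only the training set
      from imblearn.over_sampling import SMOTE
      
      X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)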
  • Regression Models
    • # Linear Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
      from sklearn.linear_model import LinearRegression
      
      model = LinearRegression()
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Polynomial Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
      from sklearn.preprocessing import PolynomialFeatures
      from sklearn.linear_model import LinearRegression
      
      poly = PolynomialFeatures(degree=4)
      X_poly = poly.fit_transform(X_train)
      lin_reg = LinearRegression()
      lin_reg.fit(X_poly, y_train)
      # Transform (not fit) the test set to avoid leakage
      predictions = lin_reg.predict(poly.transform(X_test))
    • # Support Vector Regression
      # https://scikit-learn.org/stable/modules/svm.html#regression
      from sklearn.svm import SVR
      
      model = SVR()
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Decision Tree Regression
      # https://scikit-learn.org/stable/modules/tree.html#regression
      from sklearn.tree import DecisionTreeRegressor
      
      model = DecisionTreeRegressor()
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Random Forest Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
      from sklearn.ensemble import RandomForestRegressor
      
      model = RandomForestRegressor(n_estimators = 50)
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # LassoCV Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html
      from sklearn.linear_model import LassoCV
      
      # normalize was removed from scikit-learn; scale features beforehand
      model = LassoCV(cv=5, alphas=[0.0005])
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # AdaBoost Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html
      from sklearn.ensemble import AdaBoostRegressor
      
      model = AdaBoostRegressor(n_estimators=100, loss='linear', learning_rate=0.005)
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Ridge Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html
      from sklearn.linear_model import Ridge
      
      # normalize was removed from scikit-learn; scale features beforehand
      model = Ridge(random_state=10, alpha=0.001)
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # ElasticNet Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
      from sklearn.linear_model import ElasticNet
      
      model = ElasticNet(alpha=1, l1_ratio=0.5)
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
  • Classification Models
    • # Logistic Regression
      # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
      from sklearn.linear_model import LogisticRegression
      
      model = LogisticRegression()
      model.fit(X_train,y_train)
      predictions = model.predict(X_test)
    • # K-Nearest Neighbor Classification
      # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
      from sklearn.neighbors import KNeighborsClassifier
      
      model = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Support Vector Classification
      # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
      from sklearn.svm import SVC
      
      model = SVC(kernel = 'linear')
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Kernel SVM Classification
      # https://scikit-learn.org/stable/auto_examples/svm/plot_custom_kernel.html#sphx-glr-auto-examples-svm-plot-custom-kernel-py
      from sklearn.svm import SVC
      
      model = SVC(kernel = 'rbf')
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Naive Bayes Classification
      # https://scikit-learn.org/stable/modules/naive_bayes.html
      from sklearn.naive_bayes import GaussianNB
      
      model = GaussianNB()
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Decision Tree Classification
      # https://scikit-learn.org/stable/modules/tree.html#classification
      from sklearn.tree import DecisionTreeClassifier
      
      model = DecisionTreeClassifier(criterion = 'entropy')
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
    • # Random Forest Classification
      # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
      from sklearn.ensemble import RandomForestClassifier
      
      model = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', random_state = 0)
      model.fit(X_train, y_train)
      predictions = model.predict(X_test)
  • Deep Learning Models
    • # Artificial Neural Network
      # https://keras.io/getting-started/sequential-model-guide/
      from keras.models import Sequential
      from keras.layers import Dense
      
      model = Sequential()
      model.add(Dense(12, input_dim=4, activation='relu'))
      model.add(Dense(8, activation='relu'))
      model.add(Dense(1, activation='sigmoid'))
      model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
      model.fit(X_train, y_train, epochs=100, batch_size=10)
      
      # TensorFlow 2.x
      # https://www.tensorflow.org/beta/guide/keras/overview
      try:
        # %tensorflow_version only exists in Colab.
        %tensorflow_version 2.x
      except Exception:
        pass
      
      import tensorflow as tf
      
      # Regression
      # D = number of input features
      model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, input_shape=(D,), activation='relu'),
        # Regression doesn't need an activation on the output layer
        tf.keras.layers.Dense(1)
      ])
      
      opt = tf.keras.optimizers.Adam(0.01)
      model.compile(optimizer=opt, loss='mae')
      model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100)
      
      # Classification
      model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, input_shape=(D,), activation='relu'),
        # Sigmoid is good for binary classes
        tf.keras.layers.Dense(1, activation='sigmoid')
      ])
      
      model.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=['accuracy'])
      
      model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100)
    • # Convolutional Neural Network
      
      import tensorflow as tf
      from tensorflow.keras.layers import Input, Conv2D, Dense, Flatten, Dropout, MaxPooling2D
      from tensorflow.keras.models import Model, Sequential
      
      model = Sequential()
      model.add(Conv2D(32, (3, 3), input_shape = (64, 64, 3), activation = 'relu'))
      model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
      model.add(Conv2D(32, (3, 3), activation = 'relu'))
      model.add(MaxPooling2D(pool_size = (2, 2)))
      model.add(Flatten())
      model.add(Dense(number_of_units, activation='relu'))  # e.g., 128
      model.add(Dense(number_of_classes, activation='softmax'))
      # A softmax multiclass output pairs with categorical_crossentropy
      # (use sparse_categorical_crossentropy for integer labels)
      model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
      model.fit(X_train, y_train, epochs=100, batch_size=32)
      
      # TensorFlow 2.x
      i = Input(shape=X_train[0].shape)
      x = Conv2D(32, (3, 3), strides=2, activation='relu')(i)
      x = Conv2D(64, (3, 3), strides=2, activation='relu')(x)
      x = Conv2D(128, (3, 3), strides=2, activation='relu')(x)
      x = Flatten()(x)
      x = Dropout(0.2)(x)
      x = Dense(512, activation='relu')(x)
      x = Dropout(0.2)(x)
      x = Dense(K, activation='softmax')(x)  # K = number of classes
      
      model = Model(i, x)
      model.compile(optimizer='adam',
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
      
      model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100)
    • # Recurrent Neural Network
      import tensorflow as tf
      from tensorflow.keras.layers import Input, SimpleRNN, LSTM, Dense, Dropout
      from tensorflow.keras.models import Model, Sequential
      from tensorflow.keras.optimizers import Adam
      
      model = Sequential()
      # return_sequences is only needed when stacking recurrent layers
      model.add(LSTM(units=number_of_units, input_shape=(X.shape[1], X.shape[2])))
      model.add(Dropout(0.2))
      model.add(Dense(y.shape[1], activation='softmax'))
      # A softmax output pairs with categorical_crossentropy, not MSE
      model.compile(optimizer='adam', loss='categorical_crossentropy')
      model.fit(X_train, y_train, epochs=100, batch_size=32)
      
      # TensorFlow 2.x
      i = Input(shape=X_train[0].shape)
      x = SimpleRNN(5, activation='relu')(i)
      # x = LSTM(128)(i)
      # Softmax good for multiclass
      x = Dense(10, activation='softmax')(x)
      model = Model(i, x)
      model.compile(
        loss='sparse_categorical_crossentropy',  # matches the softmax output
        optimizer=Adam(learning_rate=0.1),  # lr was renamed to learning_rate
      )
      
      model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100)
  • Cross Validation
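    • # A minimal sketch, assuming the X_train/y_train split above and any
      # scikit-learn estimator `model`
      from sklearn.model_selection import cross_val_score
      
      scores = cross_val_score(model, X_train, y_train, cv=5)
      print(scores.mean(), scores.std())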
  • Grid Search
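    • # A minimal sketch tuning a random forest; the grid values are
      # placeholders
      from sklearn.ensemble import RandomForestClassifier
      from sklearn.model_selection import GridSearchCV
      
      param_grid = {'n_estimators': [50, 100], 'max_depth': [None, 10]}
      grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
      grid.fit(X_train, y_train)
      print(grid.best_params_, grid.best_score_)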
  • Metrics - https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
  • Regression Metrics (see the usage sketch after this list)
    • R Squared
    • Explained Variance
    • Mean Absolute Error
    • Mean Squared Error
    • Mean Squared Log Error
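    • # A minimal sketch, assuming y_test and predictions from one of the
      # regression models above
      from sklearn.metrics import (r2_score, explained_variance_score,
                                   mean_absolute_error, mean_squared_error,
                                   mean_squared_log_error)
      
      print(r2_score(y_test, predictions))
      print(explained_variance_score(y_test, predictions))
      print(mean_absolute_error(y_test, predictions))
      print(mean_squared_error(y_test, predictions))
      # Requires non-negative targets and predictions
      print(mean_squared_log_error(y_test, predictions))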
  • Classification Metrics (see the usage sketch after this list)
    • Accuracy
    • Classification Report
    • Confusion Matrix
    • Matthews Correlation Coefficient
    • ROC / AUC
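    • # A minimal sketch, assuming y_test and predictions from one of the
      # classifiers above; roc_auc_score needs probabilities (binary case
      # shown) rather than class labels
      from sklearn.metrics import (accuracy_score, classification_report,
                                   confusion_matrix, matthews_corrcoef,
                                   roc_auc_score)
      
      print(accuracy_score(y_test, predictions))
      print(classification_report(y_test, predictions))
      print(confusion_matrix(y_test, predictions))
      print(matthews_corrcoef(y_test, predictions))
      print(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))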