Hat tip to Soledad Galli

Learn replacement values from the training set, then apply them to both the training and test sets
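
A minimal sketch of that principle, assuming a DataFrame called dataframe with a numeric 'Column' (placeholder names): the replacement value is computed from the training split only and reused on the test split.

# Learn the fill value on the training set, apply it to both sets
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(dataframe, test_size=0.3, random_state=0)
fill_value = X_train['Column'].median()                   # learned from training data only
X_train['Column'] = X_train['Column'].fillna(fill_value)
X_test['Column'] = X_test['Column'].fillna(fill_value)    # same value reused on the test set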


Missing Values

Missing Completely at Random, Missing at Random, Missing Not at Random

# Sample data (use_cols is a predefined list of columns to read)
import pandas as pd
import numpy as np

dataframe = pd.read_csv('some.csv', usecols=use_cols).sample(10000, random_state=1)

# Inspect unique values
dataframe.column.dropna().unique()

# Values distribution
dataframe.column.hist(bins=100)

# Creating a binary variable based on values in another column
dataframe['binary_column'] = np.where(dataframe.column.isin(['Some Value']), 1, 0)

# Count and plot values
dataframe['some column'].value_counts().plot.bar()

# Value counts as a percentage of all rows
dataframe.Column.value_counts() / len(dataframe)

# Count isnull per column
dataframe.isnull().sum()

# Mean of a feature per target class (a percentage when the feature is binary)
dataframe.groupby(['Target'])['Feature'].mean()

# Share of Column's missing values falling in each category of ColumnToCompare
missing = dataframe.Column.isnull().sum()
dataframe[dataframe.Column.isnull()].groupby(['ColumnToCompare']).size().sort_values() / missing


Outliers

# Simulate an outlier by filling missing ages with an arbitrary extreme value
import seaborn as sns

outlier_value = 500  # placeholder: any value far outside the normal Age range
sns.distplot(dataframe.Age.fillna(outlier_value))

# Show distribution of values
fig = dataframe.Column.hist(bins=100)
fig.set_title('Distribution')
fig.set_xlabel('X')
fig.set_ylabel('Y')

# Boxplot of distribution
fig = dataframe.boxplot(column='Column')
fig.set_title('Boxplot')
fig.set_xlabel('X')
fig.set_ylabel('Y')

# Describe and define IQR, Lower_fence, Upper_fence
dataframe.Column.describe()
IQR = dataframe.Column.quantile(0.75) - dataframe.Column.quantile(0.25)
Lower_fence = dataframe.Column.quantile(0.25) - (IQR * 1.5) # sometimes 3 is used
Upper_fence = dataframe.Column.quantile(0.75) + (IQR * 1.5) # instead of 1.5
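
A short follow-up sketch, using the fences just defined to flag and count outliers:

# Flag rows outside the fences and count them
outlier_mask = (dataframe.Column < Lower_fence) | (dataframe.Column > Upper_fence)
print('Outliers:', outlier_mask.sum(), 'of', len(dataframe))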

# Add a column with redefined outliers and run algorithms to see impact
# Upper boundary values, capping, top-coding
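
A minimal capping / top-coding sketch using the same fences (column name is a placeholder):

# Cap (top-code / bottom-code) values at the fences instead of dropping them
dataframe['Column_capped'] = dataframe.Column.clip(lower=Lower_fence, upper=Upper_fence)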


Labels

# Reduce cardinality of a variable, e.g. by keeping only the first character of each label
dataframe['New Column'] = dataframe['Column'].astype(str).str[0]


Rare Values

# Label frequencies (share of rows per label)
relationships = (dataframe['Feature'].value_counts() / len(dataframe)).reset_index()
relationships.columns = ['Feature', 'Feature_Percent']

# Mean of a non-binary target per label, merged with the frequencies
means = dataframe.groupby(['Feature'])['NonBinaryTarget'].mean().reset_index()
relationships = relationships.merge(means, on='Feature', how='left')

# Regroup rare labels (frequency below 10%) into a single 'rare' category
freqs = dataframe['Feature'].value_counts() / len(dataframe)
frequent_labels = freqs[freqs >= 0.1].index
group = {
    label: (label if label in frequent_labels else 'rare')
    for label in freqs.index
}
dataframe['Grouped_Feature'] = dataframe['Feature'].map(group)

# Labels present in the training set but not in the test set
unique_train = [
    x for x in X_train['Feature'].unique() if x not in X_test['Feature'].unique()
]
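
A hedged follow-up sketch (an illustrative handling choice, not stated above): labels in the test set that never appear in training can be folded into the 'rare' group before encoding.

# Replace test-set labels unseen during training with 'rare'
seen = set(X_train['Feature'].unique())
X_test['Feature'] = X_test['Feature'].where(X_test['Feature'].isin(seen), 'rare')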


Missingness

Capture the importance of missingness by creating an additional variable indicating whether the data was missing for that observation (1) or not (0)

X_train['Feature_NaN'] = np.where(X_train['Feature'].isnull(), 1, 0)


Imputation

Imputation alters the variance of the original distribution; compute imputation values from the training set only, then propagate them to the test set.

# Missing values
dataframe.isnull().mean()

# Separate into training and testing sets first
# Impute the training-set median and zero (use the mean if the variable is Gaussian, the median if skewed)
train_median = X_train[variable].median()
X_train[variable+' median'] = X_train[variable].fillna(train_median)
X_train[variable+' zero'] = X_train[variable].fillna(0)
X_test[variable+' median'] = X_test[variable].fillna(train_median)  # reuse the training median

# Random sample imputation
# Use random_state so the same values are drawn each time
random_sample = X_train[variable].dropna().sample(
    X_train[variable].isnull().sum(), random_state=0)
random_sample.index = X_train[X_train[variable].isnull()].index  # align with the missing rows
X_train.loc[X_train[variable].isnull(), variable] = random_sample

# End-of-distribution imputation (mean + k * std; 3 is also commonly used instead of 1.5)
end_value = X_train.Column.mean() + 1.5 * X_train.Column.std()
X_train['Column_end'] = X_train.Column.fillna(end_value)

# Arbitrary value
# Adding a missingness category
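
Minimal sketches of those two ideas; the arbitrary value and the categorical column name are assumptions:

# Arbitrary value imputation (any value clearly outside the normal range, e.g. -999)
X_train['Column_arbitrary'] = X_train.Column.fillna(-999)

# Add an explicit 'Missing' category to a categorical variable
X_train['Category_imputed'] = X_train.Category.fillna('Missing')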


Encoding

# One hot encoding
# Ordinal encoding
# Count or frequency encoding
# Target guided encoding
# Mean encoding
# Weight of evidence encoding
# Probability ratio encoding
prob_df = X_train.groupby(['Feature'])['Target'].mean()        # P(Target = 1) per label
prob_df = pd.DataFrame(prob_df)
prob_df['TargetComplement'] = 1 - prob_df.Target               # P(Target = 0) per label
prob_df['ratio'] = prob_df.Target / prob_df.TargetComplement
label_ratios = prob_df['ratio'].to_dict()
X_train['Feature_ratio'] = X_train.Feature.map(label_ratios)
X_test['Feature_ratio'] = X_test.Feature.map(label_ratios)
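
For comparison, minimal sketches of two other techniques from the list above (one hot and count/frequency encoding), with the same placeholder column names:

# One hot encoding
one_hot = pd.get_dummies(X_train['Feature'], prefix='Feature')

# Count or frequency encoding: map each label to its frequency in the training set
freq_map = (X_train['Feature'].value_counts() / len(X_train)).to_dict()
X_train['Feature_freq'] = X_train['Feature'].map(freq_map)
X_test['Feature_freq'] = X_test['Feature'].map(freq_map)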


Misc Functions

# Convert a Series to a DataFrame with to_frame
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.to_frame.html
Series.to_frame()
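
A one-line usage example (column name assumed):

counts_df = dataframe.Column.value_counts().to_frame(name='counts')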