Imbalanced classification (or classification problems with low prevalence (low number of instances in one of the classes) can be challenging. In this post, I have discussed how we can model a problem with prevalence of 0.09% for positive class using gradient boosting and generalized linear model.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy import interp
from sklearn.preprocessing import scale
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score, roc_curve, auc, confusion_matrix
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, train_test_split
from xgboost import XGBClassifier
import itertools
import glmnet
import xgboost as xgb
import shap
import seaborn as sns
sns.set_style("ticks")
mpl.rcParams['axes.linewidth'] = 3 
mpl.rcParams['lines.linewidth'] =7
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
import warnings 
warnings.filterwarnings("ignore")
%matplotlib inline

# Unused Libraries here
# import keras
# from bayes_opt import BayesianOptimization
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# from fancyimpute import SoftImpute, IterativeImputer

Functions

#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _x_y_maker

def _x_y_maker(df, target, to_drop):
    """
    A function to receive the dataframe and make the X, and Y matrices:
    ----------
    parameters
    
    input: dataframe df
           string target (class label)
           list[string] to_drop (list of columns need to be dropped)
           
    output: dataframe Feature matrix (df_X)
            Target vector (Y)
    """    
    Targets = df[target]
    to_drop.append(target)
    Features = df.drop(to_drop, axis = 1)
    Scaled_Features = scale(Features.values)
    df_X = pd.DataFrame(data = Scaled_Features, columns = Features.columns.tolist())
    Y = Targets.values
    
    return df_X, Y

#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _plot_roc_nfolds_xgboost

def _plot_roc_nfolds_xgboost(df_X, Y,
                                     n_folds = 10,
                                     n_estimators = 100,
                                     learning_rate = 0.05,
                                     max_depth = 3,
                                     min_child_weight = 5.0,
                                     gamma = 0.5,
                                     reg_alpha = 0.0,
                                     reg_lambda = 1.0,
                                     subsample = 0.9,
                                     objective = "binary:logistic",
                                     scale_pos_weight = 1.0,
                                     shuffle = True,
                                     random_state = 1367
                            ):
    """
    a function to plot k-fold cv ROC using xgboost
    input parameters:
                     df_X : features : pandas dataframe (numpy array will be built inside the function)
                     Y : targets
                     n_folds : number of cv folds (default = 10)
                     n_estimators = number of trees (default = 100)
                     learning_rate : step size of xgboost (default = 0.05)
                     max_depth : maximum tree depth for xgboost (default = 3)
                     min_child_weight : (default = 5.0)
                     gamma : (default = 0.5)
                     reg_alpha : lasso penalty (L1) (default = 0.0)
                     reg_lambda : ridge penalty (L2) (default = 1.0)
                     subsample : subsample fraction (default = 0.9)
                     objective : objective function for ML (default = "binary:logistic" for classification)
                     scale_pos_weight : (default = 1.0)
                     shuffle : shuffle flag for cv (default = True)
                     random_state : (default = 1367)
    
    """

    # Defining the data
    X = scale(df_X.values)
    y = Y
    n_samples, n_features = X.shape

    # Run classifier with cross-validation and plot ROC curves
    cv = StratifiedKFold(n_splits = n_folds , shuffle = shuffle , random_state = random_state)
    classifier = XGBClassifier(learning_rate = learning_rate,
                               n_estimators = n_estimators,
                               max_depth = max_depth,
                               min_child_weight = min_child_weight,
                               gamma = gamma,
                               reg_alpha = reg_alpha,
                               reg_lambda = reg_lambda,
                               subsample = subsample,
                               objective = objective,
                               nthread = 4,
                               scale_pos_weight = 1.,
                               base_score = np.mean(y),
                               seed = random_state,
                               random_state = random_state)
    

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    plt.figure(figsize=(18 , 13))
    i = 0
    for train, test in cv.split(X, y):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=3, alpha=0.5, 
                 label="ROC Fold %d (AUC = %0.2f)" % (i+1, roc_auc))

        i += 1
    plt.plot([0, 1], [0, 1], linestyle="--", lw=3, color="k",
             label="Luck", alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, color="navy",
             label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
             lw=4)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color="grey", alpha=.4,
                     label=r"$\pm$ 1 Standard Deviation")

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel("False Positive Rate" ,  fontweight = "bold" , fontsize=30)
    plt.ylabel("True Positive Rate",fontweight = "bold" , fontsize=30)
    plt.tick_params(axis="both", which="major", labelsize=20)
    plt.legend( prop={"size":20} , loc = 4)
    # plt.savefig("./roc_xgboost.pdf" ,bbox_inches="tight")
    plt.show()

#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _permute

def _permute(df, random_state):
    """
    a funtion to permute the rows of features and add them
    to the dataframe as noisy features to explore stability
    input parameters:
                    df : pandas dataframe
                    random_state : random seed for noisy features
    """
    normal_df = df.copy().reset_index(drop = True)
    noisy_df = df.copy()
    noisy_df.rename(columns = {col : "noisy_" + col for col in noisy_df.columns}, inplace = True)
    np.random.seed(seed = random_state)
    noisy_df = noisy_df.reindex(np.random.permutation(noisy_df.index))
    merged_df = pd.concat([normal_df, noisy_df.reset_index(drop = True)] , axis = 1)
    
    return merged_df
    
#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _my_feature_selector

def _my_feature_selector(X, Y, n_iter = 1,
                                  num_boost_round = 1000,
                                  nfold = 10,
                                  stratified = True,
                                  metrics = ("auc"),
                                  early_stopping_rounds = 50,
                                  seed = 1367,
                                  shuffle = True,
                                  show_stdv = False,
                                  params = None,
                                  importance_type = "total_gain",
                                  callbacks = False,
                                  verbose_eval = False):
    """
    a function to run xgboost kfolds cv and train the model based on the best boosting round of each iteration.
    for different number of iterations. at each iteration noisy features are added as well. at each iteration
    internal and external cv calculated.
    NOTE: it is recommended to use ad-hoc parameters to make sure the model is under-fitted for the sake of feature selection
    input parameters:
                    X: features (pandas dataframe or numpy array)
                    Y: targets (1D array or list)
                    n_iter: total number of iterations (default = 1)
                    num_boost_rounds: max number of boosting rounds, (default = 1000)
                    stratified: stratificaiton of the targets (default = True)
                    metrics: classification/regression metrics (default = ("auc))
                    early_stopping_rounds: the criteria for stopping if the test metric is not improved (default = 20)
                    seed: random seed (default = 1367)
                    shuffle: shuffling the data (default = True)
                    show_stdv = showing standard deviation of cv results (default = False)
                    params = set of parameters for xgboost cv
                            (default_params = {
                                               "eval_metric" : "auc",
                                               "tree_method": "hist",
                                               "objective" : "binary:logistic",
                                               "learning_rate" : 0.05,
                                               "max_depth": 2,
                                               "min_child_weight": 1,
                                               "gamma" : 0.0,
                                               "reg_alpha" : 0.0,
                                               "reg_lambda" : 1.0,
                                               "subsample" : 0.9,
                                               "max_delta_step": 1,
                                               "silent" : 1,
                                               "nthread" : 4,
                                               "scale_pos_weight" : 1
                                               }
                            )
                    importance_type = importance type of xgboost as string (default = "total_gain")
                                      the other options will be "weight", "gain", "cover", and "total_cover"
                    callbacks = printing callbacks for xgboost cv
                                (defaults = False, if True: [xgb.callback.print_evaluation(show_stdv = show_stdv),
                                                             xgb.callback.early_stop(early_stopping_rounds)])
                    verbose_eval : a flag to show the result during train on train/test sets (default = False)
    outputs:
            outputs_dict: a dict contains the fitted models, internal and external cv results for train/test sets
            df_selected: a dataframe contains feature names with their frequency of being selected at each run of folds
    """
    # callback flag
    if(callbacks == True):
        callbacks = [xgb.callback.print_evaluation(show_stdv = show_stdv),
                     xgb.callback.early_stop(early_stopping_rounds)]
    else:
        callbacks = None
    
    # params
    default_params = {
                      "eval_metric" : "auc",
                      "tree_method": "hist",
                      "objective" : "binary:logistic",
                      "learning_rate" : 0.05,
                      "max_depth": 2,
                      "min_child_weight": 1,
                      "gamma" : 0.0,
                      "reg_alpha" : 0.0,
                      "reg_lambda" : 1.0,
                      "subsample" : 0.9,
                      "max_delta_step": 1,
                      "silent" : 1,
                      "nthread" : 4,
                      "scale_pos_weight" : 1,
                      "base_score" : np.mean(Y)
                     }
    # updating the default parameters with the input
    if params is not None:
        for key, val in params.items():
            default_params[key] = val
            
    # total results list
    int_cv_train = []
    int_cv_test = []
    ext_cv_train = []
    ext_cv_test = []
    bst_list = []
    
    # main loop
    for iteration in range(n_iter):
        
        # results list at iteration
        int_cv_train2 = []
        int_cv_test2 = []
        ext_cv_train2 = []
        ext_cv_test2 = []
        
        # update random state 
        random_state = seed * iteration 
    
        # adding noise to data
        X_permuted = _permute(df = X, random_state = random_state)
        cols = X_permuted.columns.tolist()
        Xval = X_permuted.values

        # building DMatrix for training/testing + kfolds cv
        cv = StratifiedKFold(n_splits = nfold , shuffle = shuffle , random_state = random_state)

        for train_index, test_index in cv.split(Xval, Y):
            Xval_train = Xval[train_index]
            Xval_test = Xval[test_index]
            Y_train = Y[train_index]
            Y_test = Y[test_index]

            X_train = pd.DataFrame(data = Xval_train, columns = cols)
            X_test = pd.DataFrame(data = Xval_test, columns = cols)
            
            dtrain = xgb.DMatrix(data = X_train, label = Y_train)
            dtest = xgb.DMatrix(data = X_test, label = Y_test)

            # watchlist during final training
            watchlist = [(dtrain,"train"), (dtest,"eval")]
            
            # a dict to store training results
            evals_result = {}
            
            # xgb cv
            cv_results = xgb.cv(params = default_params,
                                dtrain = dtrain,
                                num_boost_round = num_boost_round,
                                nfold = nfold,
                                stratified = stratified,
                                metrics = metrics,
                                early_stopping_rounds = early_stopping_rounds,
                                seed = random_state,
                                verbose_eval = verbose_eval,
                                shuffle = shuffle,
                                callbacks = callbacks)
            int_cv_train.append(cv_results.iloc[-1][0])
            int_cv_test.append(cv_results.iloc[-1][2])
            int_cv_train2.append(cv_results.iloc[-1][0])
            int_cv_test2.append(cv_results.iloc[-1][2])
            
            # xgb train
            bst = xgb.train(params = default_params,
                            dtrain = dtrain,
                            num_boost_round = len(cv_results) - 1,
                            evals = watchlist,
                            evals_result = evals_result,
                            verbose_eval = verbose_eval)
            # appending outputs
            bst_list.append(bst)
            ext_cv_train.append(evals_result["train"]["auc"][-1])
            ext_cv_test.append(evals_result["eval"]["auc"][-1])
            ext_cv_train2.append(evals_result["train"]["auc"][-1])
            ext_cv_test2.append(evals_result["eval"]["auc"][-1])
        
        print(F"-*-*-*-*-*-*-*-*- Iteration {iteration + 1} -*-*-*-*-*-*-*-*")
        print(F"-*-*- Internal-{nfold} Folds-CV-Train = {np.mean(int_cv_train2):.3} +/- {np.std(int_cv_train2):.3} -*-*- Internal-{nfold} Folds-CV-Test = {np.mean(int_cv_test2):.3} +/- {np.std(int_cv_test2):.3} -*-*")
        print(F"-*-*- External-{nfold} Folds-CV-Train = {np.mean(ext_cv_train2):.3} +/- {np.std(ext_cv_train2):.3} -*-*- External-{nfold} Folds-CV-Test = {np.mean(ext_cv_test2):.3} +/- {np.std(ext_cv_test2):.3} -*-*")
    
    # putting together the outputs in one dict
    outputs = {}
    outputs["bst"] = bst_list
    outputs["int_cv_train"] = int_cv_train
    outputs["int_cv_test"] = int_cv_test
    outputs["ext_cv_train"] = ext_cv_train
    outputs["ext_cv_test"] = ext_cv_test
    

    pruned_features = []
    for bst_ in outputs["bst"]:
        features_gain = bst_.get_score(importance_type = importance_type)
        for key, value in features_gain.items():
            pruned_features.append(key)

    unique_elements, counts_elements = np.unique(pruned_features, return_counts = True)
    counts_elements = [float(i) for i in list(counts_elements)]
    df_features = pd.DataFrame(data = {"columns" : list(unique_elements) , "count" : counts_elements})
    df_features.sort_values(by = ["count"], ascending = False, inplace = True)

    # plotting
    plt.figure()
    df_features.sort_values(by = ["count"]).plot.barh(x = "columns", y = "count", color = "pink" , figsize = (8,8))
    plt.show()

    plt.figure(figsize = (16,12))

    plt.subplot(2,2,1)
    plt.title(F"Internal {nfold} CV {metrics.upper()} - Train", fontsize = 20)
    plt.hist(outputs["int_cv_train"], bins = 20, color = "lightgreen")

    plt.subplot(2,2,2)
    plt.title(F"Internal {nfold} CV {metrics.upper()} - Test", fontsize = 20)
    plt.hist(outputs["int_cv_test"], bins = 20, color = "lightgreen")

    plt.subplot(2,2,3)
    plt.title(F"External {nfold} CV {metrics.upper()} - Train", fontsize = 20)
    plt.hist(outputs["ext_cv_train"], bins = 20, color = "lightblue")

    plt.subplot(2,2,4)
    plt.title(F"External {nfold} CV {metrics.upper()} - Test", fontsize = 20)
    plt.hist(outputs["ext_cv_test"], bins = 20, color = "lightblue")

    plt.show()
    
    return outputs, df_features
#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: Confidence Interval calculator for 95% significance

def _ConfidenceInterval(score, n):
    """
    confidence interval for 95% significance level
    input: score such as accuracy, auc
           the size of test set n
    """
    
    CI_range = 1.96 * np.sqrt( (score * (1 - score)) / n)
    CI_pos = score + CI_range
    CI_neg = score - CI_range
    if(CI_neg < 0):
        CI_neg = 0
        
    return CI_neg, CI_pos

#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _bst_shap

def _bst_shap(X, Y, num_boost_round = 1000,
                                    nfold = 10,
                                    stratified = True,
                                    metrics = ("auc"),
                                    early_stopping_rounds = 50,
                                    seed = 1367,
                                    shuffle = True,
                                    show_stdv = False,
                                    params = None,
                                    callbacks = False):
    """
    a function to run xgboost kfolds cv with the params and train the model based on best boosting rounds
    input parameters:
                    X: features
                    Y: targets
                    num_boost_rounds: max number of boosting rounds, (default = 1000)
                    stratified: stratificaiton of the targets (default = True)
                    metrics: classification/regression metrics (default = ("auc))
                    early_stopping_rounds: the criteria for stopping if the test metric is not improved (default = 20)
                    seed: random seed (default = 1367)
                    shuffle: shuffling the data (default = True)
                    show_stdv = showing standard deviation of cv results (default = False)
                    params = set of parameters for xgboost cv ( default_params = {
                                                                                  "eval_metric" : "auc",
                                                                                  "tree_method": "hist",
                                                                                  "objective" : "binary:logistic",
                                                                                  "learning_rate" : 0.05,
                                                                                  "max_depth": 2,
                                                                                  "min_child_weight": 5,
                                                                                  "gamma" : 0.0,
                                                                                  "reg_alpha" : 0.0,
                                                                                  "reg_lambda" : 1.0,
                                                                                  "subsample" : 0.9,
                                                                                  "silent" : 1,
                                                                                  "nthread" : 4,
                                                                                  "scale_pos_weight" : 1
                                                                                 }
                                                                    )
                   callbacks = printing callbacks for xgboost cv (defaults: [xgb.callback.print_evaluation(show_stdv = show_stdv),
                                                                             xgb.callback.early_stop(early_stopping_rounds)])
    outputs:
            bst: the trained model based on optimum number of boosting rounds
            cv_results: a dataframe contains the cv-results (train/test + std of train/test) for metric
    """
    # callback flag
    if(callbacks == True):
        callbacks = [xgb.callback.print_evaluation(show_stdv = show_stdv),
                     xgb.callback.early_stop(early_stopping_rounds)]
    else:
        callbacks = None
    
    # params
    default_params = {
                      "eval_metric" : "auc",
                      "tree_method": "hist",
                      "objective" : "binary:logistic",
                      "learning_rate" : 0.05,
                      "max_depth": 2,
                      "min_child_weight": 1,
                      "gamma" : 0.0,
                      "reg_alpha" : 0.0,
                      "reg_lambda" : 1.0,
                      "subsample" : 0.9,
                      "max_delta_step": 1,
                      "silent" : 1,
                      "nthread" : 4
                     }
    # updating the default parameters with the input
    if params is not None:
        for key, val in params.items():
            default_params[key] = val

    # building DMatrix for training
    dtrain = xgb.DMatrix(data = X, label = Y)
    print("-*-*-*-*-*-*-* CV Started *-*-*-*-*-*-*-")
    cv_results = xgb.cv(params = default_params,
                        dtrain = dtrain,
                        num_boost_round = num_boost_round,
                        nfold = nfold,
                        stratified = stratified,
                        metrics = metrics,
                        early_stopping_rounds = early_stopping_rounds,
                        seed = seed,
                        shuffle = shuffle,
                        callbacks = callbacks)
    print(F"*-*- Boosting Round = {len(cv_results) - 1} -*-*- Train = {cv_results.iloc[-1][0]:.3} -*-*- Test = {cv_results.iloc[-1][2]:.3} -*-*")
    print("-*-*-*-*-*-*-* Train Started *-*-*-*-*-*-*-")
    bst = xgb.train(params = default_params,
                    dtrain = dtrain,
                    num_boost_round = len(cv_results) - 1,
                   )
    
    return bst, cv_results

#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _plot_xgboost_cv_score

def _plot_xgboost_cv_score(cv_results):
    """
    a function to plot train/test cv score for 
    """

    import matplotlib as mpl
    mpl.rcParams['axes.linewidth'] = 3 
    mpl.rcParams['lines.linewidth'] = 2
    plt.figure(figsize=(10,8))
    plt.errorbar(range(cv_results.shape[0]), cv_results["train-auc-mean"],
                yerr=cv_results["train-auc-std"], fmt = "--", ecolor="lightgreen", c = "navy", label = "Train (9-Folds)")

    plt.errorbar(range(cv_results.shape[0]), cv_results["test-auc-mean"],
                yerr=cv_results["test-auc-std"], fmt = "--", ecolor="lightblue", c = "red", label = "Test (1-Fold)")


    plt.xlabel("# of Boosting Rounds" ,  fontweight = "bold" , fontsize=30)
    plt.ylabel("AUC",fontweight = "bold" , fontsize=30)
    plt.tick_params(axis='both', which='major', labelsize=20)
    plt.legend(loc = 4, prop={'size': 20})
    plt.show()
    
#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _plot_importance

def _plot_importance(bst, importance_type = "total_gain", color = "pink", figsize = (10,10)):
    """
    a function to plot feature importance in xgboost
    """
    from xgboost import plot_importance
    from pylab import rcParams
    rcParams['figure.figsize'] = figsize
    plot_importance(bst, importance_type = importance_type, color = color, xlabel = importance_type)
    plt.show()

#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _plot_confusion_matrix
def _plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize = 20)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize = 20)
    plt.xlabel('Predicted label', fontsize = 20)
    plt.tick_params(axis='both', which='major', labelsize=20)
    plt.tight_layout()

#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _clf_xgboost

def _clf_xgboost(df_X, Y, test_size, n_folds):
    X = df_X.values
    clf = XGBClassifier(learning_rate = 0.05,
                                n_estimators = 100,
                                max_depth = 3,
                                min_child_weight = 5.,
                               # max_delta_step= 1,
                                gamma = 0.5,
                                reg_alpha = 0.0,
                                reg_lambda = 1.0,
                                subsample = 0.9,
                                colsample_bytree = 0.9,
                                objective = "binary:logistic",
                                nthread = 4,
                                #scale_pos_weight = (len(Y) - sum(Y)) / sum(Y),
                                base_score = np.mean(Y),
                                seed = 1367,
                                random_state = 1367)
    
    X_train, X_test, y_train, y_test = train_test_split(df_X, Y, test_size = test_size, shuffle = True, random_state = 1367, stratify = Y)
    clf.fit(X_train, y_train, eval_metric="auc", early_stopping_rounds = 50, verbose = False, eval_set=[(X_test, y_test)])
    accuracy_score = clf.score(X_test, y_test)
    cv_roc_score = cross_val_score(clf, X, Y, cv = n_folds, scoring = "roc_auc")
    y_pred_prob = clf.predict_proba(X_test)
    roc_score = roc_auc_score(y_test, y_pred_prob[:,1])
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob[:,1])
    optimal_idx = np.argmax(np.abs(tpr - fpr))
    optimal_threshold = thresholds[optimal_idx]
    
    return accuracy_score, roc_score, cv_roc_score, y_pred_prob, X_test, y_test, optimal_threshold

#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _xgboost_auc_hist
def _xgboost_auc_hist(df_X, Y, n_iterations = 100, alpha = 0.95):
    """
    plot histogram of ROC AUC for N stratified classification for 95% significance level
    """

    n_size = int(len(Y) * 0.30)

    # Main Loop
    scores = list()
    for i in range(n_iterations):
        # split the data into train/test sets
        X_train, X_test, y_train, y_test = train_test_split(df_X, Y, test_size = n_size, shuffle = True, random_state = 1367 * i , stratify = Y)
        # fit the xgboost classifier
        clf = XGBClassifier(learning_rate = 0.05,
                                    n_estimators = 100,
                                    max_depth = 3,
                                    min_child_weight = 5.,
                                   # max_delta_step= 1,
                                    gamma = 0.5,
                                    reg_alpha = 0.0,
                                    reg_lambda = 1.0,
                                    subsample = 0.9,
                                    colsample_bytree = 0.9,
                                    objective = "binary:logistic",
                                    nthread = 4,
                                    #scale_pos_weight = (len(Y) - sum(Y)) / sum(Y),
                                    base_score = np.mean(Y),
                                    seed = 1367 * i ,
                                    random_state = 1367 * i)
        clf.fit(X_train, y_train, eval_metric = "auc", early_stopping_rounds = 50, verbose = False, eval_set = [(X_test, y_test)])
        # evaluate model
        predictions = clf.predict_proba(X_test)[:,1]
        pred_auc = roc_auc_score(y_test, predictions)
        scores.append(pred_auc)

    # Ploting the scores
    plt.figure(figsize=(10,5))
    plt.hist(scores, color = "pink")
    plt.xlabel("ROC AUC", fontsize = 30)
    plt.tick_params(axis = "both", which = "major", labelsize = 15)
    plt.show()

    # confidence intervals
    alpha = 0.95
    p = ((1.0 - alpha)/2.0) * 100
    lower = max(0.0, np.percentile(scores, p))
    p = (alpha + ((1.0 - alpha)/2.0)) * 100
    upper = min(1.0, np.percentile(scores, p))
    print(F"{alpha*100}% Significance Level - ROC AUC Confidence Interval= [{lower:.3f} , {upper:.3f}]")
    
#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _glmnet

def _glmnet(X, Y, alpha = 0.1, n_splits = 10, scoring = "roc_auc"):
    """
    a function for standard glmnet
    input parameters: pandas dataframe X
                      class labels Y
                      alpha = 0.1
                      n_splits = 10
                      scoring = "roc_auc"
    output: glmnet model 

    """  
    model = glmnet.LogitNet(alpha = alpha,
                     n_lambda = 100,
                     n_splits = n_splits,
                     cut_point = 1.0,
                     scoring = scoring,
                     n_jobs = -1,
                     random_state = 1367)
    model.fit(X, Y)
    
    return model

#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _plot_glmnet_cv_score

def _plot_glmnet_cv_score(model):
    """
    a function to plot cv scores vs lambda
    parameters:
               model : a fitted glmnet object
    """
    mpl.rcParams['axes.linewidth'] = 3 
    mpl.rcParams['lines.linewidth'] =2
    plt.figure(figsize=(10,6))
    plt.errorbar(-np.log(model.lambda_path_), model.cv_mean_score_, yerr=model.cv_standard_error_ , c = "r", ecolor="k", marker = "o" )
    plt.vlines(-np.log(model.lambda_best_), ymin = min(model.cv_mean_score_) - 0.05 , ymax = max(model.cv_mean_score_) + 0.05, lw = 3, linestyles = "--", colors = "b" ,label = "best $\lambda$")
    plt.vlines(-np.log(model.lambda_max_), ymin = min(model.cv_mean_score_) - 0.05 , ymax = max(model.cv_mean_score_) + 0.05, lw = 3, linestyles = "--", colors = "c" ,label = "max $\lambda$")
    plt.tick_params(axis='both', which='major', labelsize = 12)
    plt.grid(True)
    plt.ylim([min(model.cv_mean_score_) - 0.05, max(model.cv_mean_score_) + 0.05])
    plt.legend(loc = 4, prop={'size': 20})
    plt.xlabel("$-Log(\lambda)$" , fontsize = 20)
    plt.ylabel(F"Mean {model.n_splits} Folds CV {(model.scoring).upper()}", fontsize = 20)
    plt.title(F"Best $\lambda$ = {model.lambda_best_[0]:.2} with {len(np.nonzero(  model.coef_)[1])} Features" , fontsize = 20)
    plt.show()

#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _plot_glmnet_coeff_path

def _plot_glmnet_coeff_path(model, df):
    """
    a function to plot coefficients vs lambda
    parameters:
               model : a fitted glmnet object
               df: in case that the input is the pandas dataframe,
                   the column names of the coeff. will appear as a legend
    """
    mpl.rcParams['axes.linewidth'] = 3 
    mpl.rcParams['lines.linewidth'] =2
    plt.figure(figsize=(10,6))
    if not df.empty:
        for i in list(np.nonzero(np.reshape(model.coef_, (1,-1)))[1]):
            plt.plot(-np.log(model.lambda_path_) ,(model.coef_path_.reshape(-1,model.coef_path_.shape[-1]))[i,:], label = df.columns.values.tolist()[i]);
        plt.legend(loc= "right", bbox_to_anchor=(1.2 , .5), ncol=1, fancybox=True, shadow=True)

    else:
        for i in list(np.nonzero(np.reshape(model.coef_, (1,-1)))[1]):
            plt.plot(-np.log(model.lambda_path_) ,(model.coef_path_.reshape(-1, model.coef_path_.shape[-1]))[i,:]);
            
    plt.tick_params(axis='both', which='major', labelsize = 12)    
    plt.ylabel("Coefficients", fontsize = 20)
    plt.xlabel("-$Log(\lambda)$", fontsize = 20)
    plt.title(F"Best $\lambda$ = {model.lambda_best_[0]:.2} with {len(np.nonzero(model.coef_)[1])} Features" , fontsize = 20)
    plt.grid(True)
    plt.show()
    
#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _df_glmnet_coeff_path

def _df_glmnet_coeff_path(model, df):
    """
    a function to build a dataframe for nonzero coeff.
    parameters:
               model : a fitted glmnet object
               df: in case that the input is the pandas dataframe,
                   the column names of the coeff. will appear as a legend
                   
    """
    idx = list(np.nonzero(np.reshape(model.coef_, (1,-1)))[1])
    dct = dict( zip([df.columns.tolist()[i] for i in idx], [model.coef_[0][i] for i in idx]))
        
    return pd.DataFrame(data = dct.items() , columns = ["Features", "Coeffs"]).sort_values(by = "Coeffs", ascending = False).reset_index(drop = True)

#*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# Function: _glmnet_best_score_alpha

def _glmnet_best_score_alpha(X, Y):
    """
    a function to run glmnet for different alpha
    """
    
    def logitnet_nonzero_coef(g):
        idx = np.where(g.coef_[0] != 0)[0]
        return pd.DataFrame({"idx": idx, "coef": g.coef_[0][idx]})

    alpha_list = np.arange(0.0, 0.5, 0.05)
    mean_auc = []
    std_auc = []
    n_features = []
    for alpha in alpha_list:
        model = _glmnet(X, Y, alpha = alpha)
        mean_auc.append(model.cv_mean_score_[model.lambda_best_inx_])
        std_auc.append(model.cv_standard_error_[model.lambda_best_inx_])
        fe = logitnet_nonzero_coef(model)
        n_features.append(fe.shape[0])
        
    mpl.rcParams['axes.linewidth'] = 3 
    mpl.rcParams['lines.linewidth'] =2
    plt.figure(figsize=(12,6))

    plt.subplot(1,2,1)
    plt.errorbar(alpha_list, mean_auc, yerr = std_auc, c = "r", ecolor="k", marker = "o" )
    plt.tick_params(axis='both', which='major', labelsize = 12)
    plt.xlabel(r"$\alpha$" , fontsize = 20)
    plt.ylabel("Mean 10 Folds CV AUC", fontsize = 20)
    plt.grid(True)

    plt.subplot(1,2,2)
    plt.plot(alpha_list, n_features, c = "r",  marker = "o" )
    plt.tick_params(axis='both', which='major', labelsize = 12)
    plt.xlabel(r"$\alpha$" , fontsize = 20)
    plt.ylabel("Number of Features", fontsize = 20)
    plt.grid(True)

    plt.show()

   
    

First, I loaded the data into a pandas dataframe to get some idea.

# readin the data into a dataframe
dateparser = lambda x: pd.datetime.strptime(x, "%Y-%m-%d")
df_raw = pd.read_csv("./device_failure.csv",
                     parse_dates = ["date"],
                     date_parser = dateparser,
                     encoding = "cp1252")

print("Shape: {}".format(df_raw.shape))
print("Prevalence = {:.3f}%".format(df_raw["failure"].sum()/df_raw.shape[0] * 100))

Shape: (124494, 12)
Prevalence = 0.085%

df_raw.head()

	date	device	attribute1	attribute2	attribute3	attribute4	attribute5	attribute6	attribute9
0	2015-01-01	S1F01085	215630672	56	0	52	6	407438	7
1	2015-01-01	S1F0166B	61370680	0	3	0	6	403174	0
2	2015-01-01	S1F01E6Y	173295968	0	0	0	12	237394	0
3	2015-01-01	S1F01JE0	79694024	0	0	0	6	410186	0
4	2015-01-01	S1F01R2B	135970480	0	0	0	15	313173	3

Printing out the statistical description of the data. It would help us to have idea about the variation of the each column in the data.

df_raw.describe()

	failure	attribute1	attribute2	attribute3	attribute4	attribute5	attribute6	attribute7	attribute8	attribute9
count	124494.000000	1.244940e+05	124494.000000	124494.000000	124494.000000	124494.000000	124494.000000	124494.000000	124494.000000	124494.000000
mean	0.000851	1.223881e+08	159.484762	9.940455	1.741120	14.222669	260172.657726	0.292528	0.292528	12.451524
std	0.029167	7.045933e+07	2179.657730	185.747321	22.908507	15.943028	99151.078547	7.436924	7.436924	191.425623
min	0.000000	0.000000e+00	0.000000	0.000000	0.000000	1.000000	8.000000	0.000000	0.000000	0.000000
25%	0.000000	6.128476e+07	0.000000	0.000000	0.000000	8.000000	221452.000000	0.000000	0.000000	0.000000
50%	0.000000	1.227974e+08	0.000000	0.000000	0.000000	10.000000	249799.500000	0.000000	0.000000	0.000000
75%	0.000000	1.833096e+08	0.000000	0.000000	0.000000	12.000000	310266.000000	0.000000	0.000000	0.000000
max	1.000000	2.441405e+08	64968.000000	24929.000000	1666.000000	98.000000	689161.000000	832.000000	832.000000	18701.000000

Printing out the number of Null values in each column. These numbers should be imputed before feeding into the classifier.

df_raw.isnull().sum()

date          0
device        0
failure       0
attribute1    0
attribute2    0
attribute3    0
attribute4    0
attribute5    0
attribute6    0
attribute7    0
attribute8    0
attribute9    0
dtype: int64

Checking the data types of the columns to make sure all the columns have continuous values as it is mentioned in the meta data, except date and device

df_raw.dtypes

date          datetime64[ns]
device                object
failure                int64
attribute1             int64
attribute2             int64
attribute3             int64
attribute4             int64
attribute5             int64
attribute6             int64
attribute7             int64
attribute8             int64
attribute9             int64
dtype: object

Checking the pair-wise correlation between features

methods = {‘pearson’, ‘kendall’, ‘spearman’}

df_raw.corr(method = "kendall")

	failure	attribute1	attribute2	attribute3	attribute4	attribute5	attribute6	attribute7	attribute8	attribute9
failure	1.000000	0.001611	0.053361	0.003208	0.057295	0.003475	0.002337	0.096926	0.096926	0.004633
attribute1	0.001611	1.000000	-0.001004	0.001960	0.001289	-0.003713	-0.001779	-0.002030	-0.002030	-0.002662
attribute2	0.053361	-0.001004	1.000000	-0.018322	0.219415	-0.022125	-0.062126	0.107491	0.107491	-0.027759
attribute3	0.003208	0.001960	-0.018322	1.000000	0.117459	0.089091	0.056736	-0.009622	-0.009622	0.369092
attribute4	0.057295	0.001289	0.219415	0.117459	1.000000	-0.017980	0.009706	0.160216	0.160216	0.046189
attribute5	0.003475	-0.003713	-0.022125	0.089091	-0.017980	1.000000	0.057295	-0.016603	-0.016603	0.026998
attribute6	0.002337	-0.001779	-0.062126	0.056736	0.009706	0.057295	1.000000	-0.013186	-0.013186	0.069585
attribute7	0.096926	-0.002030	0.107491	-0.009622	0.160216	-0.016603	-0.013186	1.000000	1.000000	-0.017097
attribute8	0.096926	-0.002030	0.107491	-0.009622	0.160216	-0.016603	-0.013186	1.000000	1.000000	-0.017097
attribute9	0.004633	-0.002662	-0.027759	0.369092	0.046189	0.026998	0.069585	-0.017097	-0.017097	1.000000

# Number of unique devices: this can be used as feauture based on the demand
df_raw["device"].nunique()

Preprocessing: attribute7 and attribute8 are totally correlated (the same feature). For now, I am gonna drop date and device as well. Device has 1169 unique values that can be used as features.

df = df_raw.copy()
target = "failure"
to_drop = ["date", "device", "attribute8"]
df_X, Y = _x_y_maker(df, target, to_drop)

Printing the first 5 rows of the sclaed features.

df_X.head()

	attribute1	attribute2	attribute3	attribute4	attribute5	attribute6	attribute7	attribute9
0	1.323358	-0.047478	-0.053516	2.193905	-0.515755	1.485268	-0.039335	-0.028479
1	-0.865998	-0.073170	-0.037365	-0.076004	-0.515755	1.442263	-0.039335	-0.065047
2	0.722517	-0.073170	-0.053516	-0.076004	-0.139414	-0.229738	-0.039335	-0.065047
3	-0.605942	-0.073170	-0.053516	-0.076004	-0.515755	1.512983	-0.039335	-0.065047
4	0.192770	-0.073170	-0.053516	-0.076004	0.048757	0.534543	-0.039335	-0.049375

Checking the correlation again.

df_X.corr()

	attribute1	attribute2	attribute3	attribute4	attribute5	attribute6	attribute7	attribute9
attribute1	1.000000	-0.004250	0.003701	0.001836	-0.003376	-0.001522	0.000151	0.001121
attribute2	-0.004250	1.000000	-0.002617	0.146593	-0.013999	-0.026350	0.141367	-0.002736
attribute3	0.003701	-0.002617	1.000000	0.097452	-0.006696	0.009027	-0.001884	0.532366
attribute4	0.001836	0.146593	0.097452	1.000000	-0.009773	0.024870	0.045631	0.036069
attribute5	-0.003376	-0.013999	-0.006696	-0.009773	1.000000	-0.017049	-0.009384	0.005949
attribute6	-0.001522	-0.026350	0.009027	0.024870	-0.017049	1.000000	-0.012207	0.021152
attribute7	0.000151	0.141367	-0.001884	0.045631	-0.009384	-0.012207	1.000000	0.006861
attribute9	0.001121	-0.002736	0.532366	0.036069	0.005949	0.021152	0.006861	1.000000

Running my feature selector along with noise with 10-folds CV for multiple iterations to make sure about the selected features. This would help to have a simpler model with the same performance

# choosing params to under-fit the model for feature selection + noisy features. This would help to have simpler model
params = {
          "learning_rate" : 0.05,
          "max_depth": 2,
          "min_child_weight": 5,
          "gamma" : 0.5
          
         }
o_, d_ = _my_feature_selector(df_X, Y, n_iter = 10,
                                       params = params,
                                       seed = 1367,
                                       callbacks = False)

-*-*-*-*-*-*-*-*- Iteration 1 -*-*-*-*-*-*-*-*
-*-*- Internal-10 Folds-CV-Train = 0.935 +/- 0.00485 -*-*- Internal-10 Folds-CV-Test = 0.897 +/- 0.00849 -*-*
-*-*- External-10 Folds-CV-Train = 0.933 +/- 0.005 -*-*- External-10 Folds-CV-Test = 0.898 +/- 0.0435 -*-*
-*-*-*-*-*-*-*-*- Iteration 2 -*-*-*-*-*-*-*-*
-*-*- Internal-10 Folds-CV-Train = 0.932 +/- 0.00891 -*-*- Internal-10 Folds-CV-Test = 0.896 +/- 0.00948 -*-*
-*-*- External-10 Folds-CV-Train = 0.931 +/- 0.00924 -*-*- External-10 Folds-CV-Test = 0.892 +/- 0.0413 -*-*
-*-*-*-*-*-*-*-*- Iteration 3 -*-*-*-*-*-*-*-*
-*-*- Internal-10 Folds-CV-Train = 0.934 +/- 0.0065 -*-*- Internal-10 Folds-CV-Test = 0.896 +/- 0.00591 -*-*
-*-*- External-10 Folds-CV-Train = 0.933 +/- 0.00634 -*-*- External-10 Folds-CV-Test = 0.892 +/- 0.0571 -*-*
-*-*-*-*-*-*-*-*- Iteration 4 -*-*-*-*-*-*-*-*
-*-*- Internal-10 Folds-CV-Train = 0.935 +/- 0.00669 -*-*- Internal-10 Folds-CV-Test = 0.897 +/- 0.00905 -*-*
-*-*- External-10 Folds-CV-Train = 0.933 +/- 0.00776 -*-*- External-10 Folds-CV-Test = 0.894 +/- 0.0548 -*-*
-*-*-*-*-*-*-*-*- Iteration 5 -*-*-*-*-*-*-*-*
-*-*- Internal-10 Folds-CV-Train = 0.934 +/- 0.00684 -*-*- Internal-10 Folds-CV-Test = 0.897 +/- 0.00607 -*-*
-*-*- External-10 Folds-CV-Train = 0.932 +/- 0.00653 -*-*- External-10 Folds-CV-Test = 0.894 +/- 0.052 -*-*
-*-*-*-*-*-*-*-*- Iteration 6 -*-*-*-*-*-*-*-*
-*-*- Internal-10 Folds-CV-Train = 0.932 +/- 0.0068 -*-*- Internal-10 Folds-CV-Test = 0.898 +/- 0.0082 -*-*
-*-*- External-10 Folds-CV-Train = 0.931 +/- 0.00678 -*-*- External-10 Folds-CV-Test = 0.892 +/- 0.0493 -*-*
-*-*-*-*-*-*-*-*- Iteration 7 -*-*-*-*-*-*-*-*
-*-*- Internal-10 Folds-CV-Train = 0.931 +/- 0.0071 -*-*- Internal-10 Folds-CV-Test = 0.896 +/- 0.00643 -*-*
-*-*- External-10 Folds-CV-Train = 0.93 +/- 0.00858 -*-*- External-10 Folds-CV-Test = 0.892 +/- 0.0339 -*-*
-*-*-*-*-*-*-*-*- Iteration 8 -*-*-*-*-*-*-*-*
-*-*- Internal-10 Folds-CV-Train = 0.933 +/- 0.00951 -*-*- Internal-10 Folds-CV-Test = 0.898 +/- 0.00782 -*-*
-*-*- External-10 Folds-CV-Train = 0.932 +/- 0.00995 -*-*- External-10 Folds-CV-Test = 0.892 +/- 0.0766 -*-*
-*-*-*-*-*-*-*-*- Iteration 9 -*-*-*-*-*-*-*-*
-*-*- Internal-10 Folds-CV-Train = 0.929 +/- 0.00658 -*-*- Internal-10 Folds-CV-Test = 0.895 +/- 0.00973 -*-*
-*-*- External-10 Folds-CV-Train = 0.928 +/- 0.00671 -*-*- External-10 Folds-CV-Test = 0.896 +/- 0.0523 -*-*
-*-*-*-*-*-*-*-*- Iteration 10 -*-*-*-*-*-*-*-*
-*-*- Internal-10 Folds-CV-Train = 0.935 +/- 0.00538 -*-*- Internal-10 Folds-CV-Test = 0.898 +/- 0.00774 -*-*
-*-*- External-10 Folds-CV-Train = 0.933 +/- 0.0058 -*-*- External-10 Folds-CV-Test = 0.883 +/- 0.0507 -*-*




<Figure size 432x288 with 0 Axes>

png

params = {
          "learning_rate" : 0.05,
          "max_depth": 2,
          "min_child_weight": 5.,
          "gamma" : 0.5
         }
bst, cvr = _bst_shap(df_X, Y, params = params, callbacks = True)

-*-*-*-*-*-*-* CV Started *-*-*-*-*-*-*-
[22:09:38] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[22:09:38] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[22:09:38] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[22:09:39] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[22:09:39] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[22:09:39] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[22:09:39] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[22:09:39] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[22:09:39] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[22:09:39] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.5	test-auc:0.5
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
[1]	train-auc:0.5	test-auc:0.5
[2]	train-auc:0.5	test-auc:0.5
[3]	train-auc:0.5	test-auc:0.5
[4]	train-auc:0.5	test-auc:0.5
[5]	train-auc:0.5	test-auc:0.5
[6]	train-auc:0.5	test-auc:0.5
[7]	train-auc:0.5	test-auc:0.5
[8]	train-auc:0.5	test-auc:0.5
[9]	train-auc:0.5	test-auc:0.5
[10]	train-auc:0.5	test-auc:0.5
[11]	train-auc:0.5	test-auc:0.5
[12]	train-auc:0.5	test-auc:0.5
[13]	train-auc:0.5	test-auc:0.5
[14]	train-auc:0.5	test-auc:0.5
[15]	train-auc:0.5	test-auc:0.5
[16]	train-auc:0.5	test-auc:0.5
[17]	train-auc:0.5	test-auc:0.5
[18]	train-auc:0.502591	test-auc:0.499988
[19]	train-auc:0.507799	test-auc:0.499972
[20]	train-auc:0.515655	test-auc:0.499924
[21]	train-auc:0.515655	test-auc:0.499924
[22]	train-auc:0.521354	test-auc:0.504899
[23]	train-auc:0.527633	test-auc:0.509397
[24]	train-auc:0.544239	test-auc:0.51922
[25]	train-auc:0.547894	test-auc:0.519192
[26]	train-auc:0.562889	test-auc:0.528576
[27]	train-auc:0.562888	test-auc:0.528573
[28]	train-auc:0.575345	test-auc:0.551115
[29]	train-auc:0.589815	test-auc:0.570037
[30]	train-auc:0.589813	test-auc:0.570032
[31]	train-auc:0.589815	test-auc:0.570038
[32]	train-auc:0.594991	test-auc:0.579028
[33]	train-auc:0.603815	test-auc:0.588018
[34]	train-auc:0.607991	test-auc:0.587983
[35]	train-auc:0.607994	test-auc:0.587977
[36]	train-auc:0.609034	test-auc:0.587952
[37]	train-auc:0.611603	test-auc:0.58792
[38]	train-auc:0.611606	test-auc:0.58792
[39]	train-auc:0.621348	test-auc:0.592728
[40]	train-auc:0.630541	test-auc:0.59751
[41]	train-auc:0.641352	test-auc:0.610898
[42]	train-auc:0.645456	test-auc:0.619846
[43]	train-auc:0.649023	test-auc:0.624774
[44]	train-auc:0.649023	test-auc:0.629293
[45]	train-auc:0.652603	test-auc:0.638277
[46]	train-auc:0.656164	test-auc:0.638207
[47]	train-auc:0.656163	test-auc:0.638208
[48]	train-auc:0.660153	test-auc:0.638036
[49]	train-auc:0.660154	test-auc:0.638033
[50]	train-auc:0.668212	test-auc:0.637722
[51]	train-auc:0.683198	test-auc:0.651949
[52]	train-auc:0.68676	test-auc:0.65183
[53]	train-auc:0.702173	test-auc:0.664918
[54]	train-auc:0.705216	test-auc:0.673904
[55]	train-auc:0.716932	test-auc:0.688166
[56]	train-auc:0.746947	test-auc:0.705966
[57]	train-auc:0.766238	test-auc:0.732417
[58]	train-auc:0.76625	test-auc:0.732301
[59]	train-auc:0.781952	test-auc:0.749171
[60]	train-auc:0.785332	test-auc:0.753368
[61]	train-auc:0.788794	test-auc:0.753104
[62]	train-auc:0.788767	test-auc:0.753255
[63]	train-auc:0.789759	test-auc:0.757569
[64]	train-auc:0.789792	test-auc:0.757407
[65]	train-auc:0.791717	test-auc:0.762459
[66]	train-auc:0.791741	test-auc:0.762543
[67]	train-auc:0.794139	test-auc:0.771179
[68]	train-auc:0.79414	test-auc:0.771128
[69]	train-auc:0.798393	test-auc:0.78537
[70]	train-auc:0.799311	test-auc:0.790212
[71]	train-auc:0.799362	test-auc:0.790179
[72]	train-auc:0.799436	test-auc:0.790156
[73]	train-auc:0.804061	test-auc:0.789724
[74]	train-auc:0.809716	test-auc:0.793472
[75]	train-auc:0.81023	test-auc:0.793571
[76]	train-auc:0.810291	test-auc:0.793676
[77]	train-auc:0.810273	test-auc:0.794129
[78]	train-auc:0.815368	test-auc:0.793779
[79]	train-auc:0.818557	test-auc:0.798271
[80]	train-auc:0.818553	test-auc:0.798253
[81]	train-auc:0.818556	test-auc:0.798244
[82]	train-auc:0.819048	test-auc:0.798204
[83]	train-auc:0.819046	test-auc:0.798256
[84]	train-auc:0.819084	test-auc:0.798343
[85]	train-auc:0.819094	test-auc:0.798331
[86]	train-auc:0.826427	test-auc:0.810713
[87]	train-auc:0.827297	test-auc:0.815134
[88]	train-auc:0.827317	test-auc:0.820203
[89]	train-auc:0.82735	test-auc:0.820224
[90]	train-auc:0.835594	test-auc:0.836472
[91]	train-auc:0.839335	test-auc:0.83642
[92]	train-auc:0.839334	test-auc:0.836343
[93]	train-auc:0.844798	test-auc:0.840614
[94]	train-auc:0.845237	test-auc:0.840554
[95]	train-auc:0.845206	test-auc:0.840589
[96]	train-auc:0.845202	test-auc:0.840541
[97]	train-auc:0.845634	test-auc:0.840411
[98]	train-auc:0.846109	test-auc:0.840341
[99]	train-auc:0.8461	test-auc:0.840213
[100]	train-auc:0.846579	test-auc:0.840174
[101]	train-auc:0.847455	test-auc:0.840033
[102]	train-auc:0.848847	test-auc:0.839937
[103]	train-auc:0.848889	test-auc:0.839934
[104]	train-auc:0.851638	test-auc:0.839058
[105]	train-auc:0.851658	test-auc:0.839094
[106]	train-auc:0.851696	test-auc:0.839025
[107]	train-auc:0.851761	test-auc:0.839107
[108]	train-auc:0.855548	test-auc:0.838203
[109]	train-auc:0.855952	test-auc:0.838216
[110]	train-auc:0.859843	test-auc:0.837271
[111]	train-auc:0.859896	test-auc:0.837218
[112]	train-auc:0.862772	test-auc:0.844197
[113]	train-auc:0.866681	test-auc:0.847177
[114]	train-auc:0.866765	test-auc:0.847206
[115]	train-auc:0.867214	test-auc:0.847115
[116]	train-auc:0.868626	test-auc:0.855585
[117]	train-auc:0.869564	test-auc:0.859661
[118]	train-auc:0.869588	test-auc:0.859604
[119]	train-auc:0.870008	test-auc:0.859496
[120]	train-auc:0.870083	test-auc:0.859439
[121]	train-auc:0.870515	test-auc:0.859276
[122]	train-auc:0.871057	test-auc:0.859007
[123]	train-auc:0.871131	test-auc:0.859277
[124]	train-auc:0.871211	test-auc:0.859541
[125]	train-auc:0.871218	test-auc:0.859699
[126]	train-auc:0.871588	test-auc:0.863687
[127]	train-auc:0.871767	test-auc:0.863805
[128]	train-auc:0.87185	test-auc:0.864008
[129]	train-auc:0.871903	test-auc:0.863906
[130]	train-auc:0.871946	test-auc:0.863755
[131]	train-auc:0.871992	test-auc:0.863533
[132]	train-auc:0.87198	test-auc:0.863675
[133]	train-auc:0.872139	test-auc:0.863535
[134]	train-auc:0.872082	test-auc:0.86365
[135]	train-auc:0.872188	test-auc:0.863516
[136]	train-auc:0.872232	test-auc:0.86354
[137]	train-auc:0.872228	test-auc:0.863652
[138]	train-auc:0.872266	test-auc:0.863696
[139]	train-auc:0.872255	test-auc:0.863683
[140]	train-auc:0.872303	test-auc:0.863636
[141]	train-auc:0.872336	test-auc:0.863547
[142]	train-auc:0.872402	test-auc:0.863523
[143]	train-auc:0.872441	test-auc:0.863512
[144]	train-auc:0.876281	test-auc:0.865899
[145]	train-auc:0.876307	test-auc:0.865908
[146]	train-auc:0.876324	test-auc:0.86593
[147]	train-auc:0.876275	test-auc:0.870426
[148]	train-auc:0.876311	test-auc:0.8704
[149]	train-auc:0.876318	test-auc:0.870282
[150]	train-auc:0.876344	test-auc:0.870302
[151]	train-auc:0.876323	test-auc:0.87028
[152]	train-auc:0.876347	test-auc:0.870184
[153]	train-auc:0.876402	test-auc:0.87025
[154]	train-auc:0.876392	test-auc:0.870155
[155]	train-auc:0.876437	test-auc:0.870074
[156]	train-auc:0.876463	test-auc:0.870078
[157]	train-auc:0.884578	test-auc:0.870697
[158]	train-auc:0.884583	test-auc:0.87077
[159]	train-auc:0.888683	test-auc:0.867303
[160]	train-auc:0.88869	test-auc:0.867468
[161]	train-auc:0.89149	test-auc:0.863448
[162]	train-auc:0.891448	test-auc:0.86335
[163]	train-auc:0.89525	test-auc:0.865754
[164]	train-auc:0.895239	test-auc:0.865826
[165]	train-auc:0.902117	test-auc:0.873953
[166]	train-auc:0.902968	test-auc:0.872901
[167]	train-auc:0.906869	test-auc:0.881087
[168]	train-auc:0.908143	test-auc:0.880633
[169]	train-auc:0.910264	test-auc:0.890346
[170]	train-auc:0.911123	test-auc:0.888702
[171]	train-auc:0.912856	test-auc:0.890256
[172]	train-auc:0.912859	test-auc:0.890457
[173]	train-auc:0.914933	test-auc:0.885244
[174]	train-auc:0.917162	test-auc:0.888691
[175]	train-auc:0.919827	test-auc:0.893754
[176]	train-auc:0.921293	test-auc:0.898306
[177]	train-auc:0.921269	test-auc:0.898513
[178]	train-auc:0.922025	test-auc:0.900192
[179]	train-auc:0.922283	test-auc:0.898478
[180]	train-auc:0.922729	test-auc:0.896203
[181]	train-auc:0.922026	test-auc:0.89636
[182]	train-auc:0.922845	test-auc:0.89517
[183]	train-auc:0.922994	test-auc:0.89481
[184]	train-auc:0.923685	test-auc:0.893755
[185]	train-auc:0.924463	test-auc:0.89445
[186]	train-auc:0.925612	test-auc:0.895708
[187]	train-auc:0.925986	test-auc:0.896097
[188]	train-auc:0.926308	test-auc:0.898049
[189]	train-auc:0.926701	test-auc:0.89994
[190]	train-auc:0.926804	test-auc:0.900076
[191]	train-auc:0.927169	test-auc:0.899289
[192]	train-auc:0.927667	test-auc:0.898406
[193]	train-auc:0.927906	test-auc:0.898717
[194]	train-auc:0.928189	test-auc:0.897986
[195]	train-auc:0.928687	test-auc:0.897934
[196]	train-auc:0.928905	test-auc:0.897401
[197]	train-auc:0.929011	test-auc:0.899055
[198]	train-auc:0.929382	test-auc:0.898587
[199]	train-auc:0.929621	test-auc:0.897067
[200]	train-auc:0.930354	test-auc:0.897016
[201]	train-auc:0.930419	test-auc:0.895062
[202]	train-auc:0.930577	test-auc:0.894952
[203]	train-auc:0.93098	test-auc:0.89376
[204]	train-auc:0.930982	test-auc:0.893814
[205]	train-auc:0.931073	test-auc:0.893492
[206]	train-auc:0.931204	test-auc:0.893421
[207]	train-auc:0.931199	test-auc:0.892662
[208]	train-auc:0.931229	test-auc:0.893241
[209]	train-auc:0.931645	test-auc:0.892992
[210]	train-auc:0.932143	test-auc:0.893321
[211]	train-auc:0.932168	test-auc:0.89339
[212]	train-auc:0.932059	test-auc:0.892893
[213]	train-auc:0.932261	test-auc:0.892952
[214]	train-auc:0.932471	test-auc:0.892118
[215]	train-auc:0.932655	test-auc:0.892261
[216]	train-auc:0.932859	test-auc:0.891813
[217]	train-auc:0.932999	test-auc:0.892246
[218]	train-auc:0.933072	test-auc:0.892004
[219]	train-auc:0.933236	test-auc:0.891923
[220]	train-auc:0.93351	test-auc:0.891635
[221]	train-auc:0.93367	test-auc:0.891286
[222]	train-auc:0.933816	test-auc:0.891612
[223]	train-auc:0.934	test-auc:0.891521
[224]	train-auc:0.934035	test-auc:0.890987
[225]	train-auc:0.934061	test-auc:0.890868
[226]	train-auc:0.934116	test-auc:0.890957
[227]	train-auc:0.934202	test-auc:0.890346
[228]	train-auc:0.934334	test-auc:0.890627
Stopping. Best iteration:
[178]	train-auc:0.922025+0.00538528	test-auc:0.900192+0.0420388

*-*- Boosting Round = 178 -*-*- Train = 0.922 -*-*- Test = 0.9 -*-*
-*-*-*-*-*-*-* Train Started *-*-*-*-*-*-*-
[22:09:58] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

_plot_xgboost_cv_score(cvr)

png

_plot_importance(bst, figsize=(10,6), importance_type = "total_gain")

png

Tree_explainer = shap.TreeExplainer(bst)
shap_Treevalues = Tree_explainer.shap_values(df_X)
shap.summary_plot(shap_Treevalues, df_X, plot_type = "bar", color = "pink", max_display = 10)

png

As seen, attribute3 has not played an important role through the feature selection process. Therefore, I dropped it and build the final model based on that.

df_pruned = df_X.drop(["attribute3"], axis = 1)

As seen, the total gain for attribute4, attirbute7, and attribute 2 are higher than the rest. This would say that, for linear models such as regulirized linear models including elastic net, these features will be pruned. Now, we can see, how these features perform:

_plot_roc_nfolds_xgboost(df_pruned, Y)

png

test_size = 0.3
n_folds = 10
accuracy_score, roc_score, cv_roc_score, y_pred_prob, X_test, y_test, optimal_threshold = _clf_xgboost(df_pruned, Y, test_size, n_folds)
CI_neg, CI_pos = _ConfidenceInterval(roc_score, X_test.shape[0])
print(F"Accuracy for test size {test_size}  = {accuracy_score:.5f}")
print(F"ROC for test size {test_size} = {roc_score:.3f}")
print(F"Mean ROC for {n_folds} folds CV = {cv_roc_score.mean():.3f}")
print(F"Confidence Interval Accuracy = [{CI_neg:.3f} , {CI_pos:.3f}]")

class_names = [0, 1]
#optimal threshold based on Youden Index
y_pred = (y_pred_prob[:,1] > optimal_threshold).astype(int)
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
# Plot normalized confusion matrix
plt.figure(figsize=(6,6))
_plot_confusion_matrix(cnf_matrix, classes=class_names, normalize = True)
plt.show()

Accuracy for test size 0.3  = 0.99914
ROC for test size 0.3 = 0.890
Mean ROC for 10 folds CV = 0.896
Confidence Interval Accuracy = [0.887 , 0.893]
Normalized confusion matrix
[[0.91 0.09]
 [0.25 0.75]]

png

Discussion:

As seen, the xgboost approach showed a good performance in terms of classification of the individuals in terms of Failure and Non-failure. The classification problem is an imbalanced problem (prevalence < 1%). Therefore, the classification accuracy by itself cannot be trusted and the other classification metrics such as area under ROC curve and Precision-Recalls or F-1 score which are robust to imbalanced classes should be used. The other challenge is the size of the dataset. In fact, we are looking for a model that can be generalized to a larger population. In this regard, it is also better to report the classification error with a confidence interval at a statistical significance level. For instance, to have 95% significance, we can calculate the classification error from the following formula using the Z-score of 1.96:

$ConfidenceInterval = Score \pm 1.96 \times \sqrt{ \frac{Score \times (1 - Score)}{N}}$

For instance, for the approach , we have N= 37349 (30% of the data) individuals in the testing set (unseen individuals) with an AUC of 0.89 which is within the calculated 95% confidence interval. Now, we can add the prediction pribabilities for both class 0 and 1 to the test data.

X_test["pred_proba_class_0"] = pd.Series(y_pred_prob[:,0], index = X_test.index)
X_test["pred_proba_class_1"] = pd.Series(y_pred_prob[:,1], index = X_test.index)

print(F"Test Size = {X_test.shape}")
X_test.head()

Test Size = (37349, 9)

	attribute1	attribute2	attribute4	attribute5	attribute6	attribute7	attribute9	pred_proba_class_0	pred_proba_class_1
22999	0.394775	-0.073170	-0.076004	-0.202137	-0.414557	-0.039335	-0.065047	0.999829	0.000171
26237	-1.097957	0.844409	-0.076004	0.613269	-2.623549	-0.039335	-0.065047	0.997224	0.002776
123435	-1.179217	-0.073170	-0.076004	-0.515755	0.573101	-0.039335	-0.065047	0.999710	0.000290
120834	0.404221	-0.073170	-0.076004	-0.515755	1.461849	-0.039335	-0.065047	0.999801	0.000199
31117	0.243279	-0.073170	-0.076004	1.428676	0.208837	-0.039335	-0.065047	0.999160	0.000840

Now we can repeat this process for N times (let’s do 100 times) and check the histograms of auc for 95% significance level.

_xgboost_auc_hist(df_pruned, Y, n_iterations = 100, alpha = 0.95)

png

95.0% Significance Level - ROC AUC Confidence Interval= [0.849 , 0.934]

Note that, in this example I have chosen the hyper-parameters based on rule of thumb and my experience. However, for finalizing the model, an exhaustive GridSearch or Bayesian Optimization is needed. I have not done it here due to my short time to finish this project. However, for all of my model that went to production so far, I do use hyper-parameter tuning.

2nd Approach: Regularized Linear Model

# Ridge
alpha = 0.0

glmnet_model = _glmnet(df_X, Y, alpha = alpha)
_plot_glmnet_coeff_path(glmnet_model, df_X)
_plot_glmnet_cv_score(glmnet_model)
_df_glmnet_coeff_path(glmnet_model, df_X)

png

	Features	Coeffs
0	attribute7	0.001592
1	attribute4	0.000901
2	attribute2	0.000707
3	attribute5	0.000030
4	attribute1	0.000027
5	attribute9	0.000022
6	attribute6	-0.000007
7	attribute3	-0.000013

_glmnet_best_score_alpha(df_X, Y)

png

# Elastic-Net
alpha = 0.1

glmnet_model = _glmnet(df_X, Y, alpha = alpha)
_plot_glmnet_coeff_path(glmnet_model, df_X)
_plot_glmnet_cv_score(glmnet_model)
_df_glmnet_coeff_path(glmnet_model, df_X)

png

	Features	Coeffs
0	attribute7	0.052871
1	attribute4	0.018768

As seen, glmnet has added more bias to the model (we lost around 4% AUC). However, the model is way more simpler. Generally, both methods along with each other would result in a more generalized model.

Notes could be considered:

Other than, hyper parameter optimization (using grid search or bayesian optimization), I have not implemented any neural network models here that could be used as well (neural network without hyper-parameter optimizaiton is technically useless). Another thing that could be used is the time at event. For example, any useful information such as some specific day-of-the-month that failure happened can be engineered as a new feature (attribute). In addition to this, the device include 1169 unique values where failure happened for 106 of them.

g = df_raw.groupby("device").agg({"failure" : "mean"}).sort_values(by = ["failure"], ascending = False)
g = g.reset_index(level=["device"])
print(F"Number of groups with non-zero failure rate = {g[g['failure'] > 0].shape[0]}")
g.head(10)

Number of groups with non-zero failure rate = 106

	device	failure
0	S1F0RRB1	0.200000
1	S1F10E6M	0.142857
2	S1F11MB0	0.142857
3	S1F0CTDN	0.142857
4	Z1F1AG5N	0.111111
5	W1F0PNA5	0.111111
6	W1F13SRV	0.076923
7	W1F03DP4	0.071429
8	W1F1230J	0.071429
9	W1F0T034	0.058824

For instance, the rows that include the device “S1F0RRB1” has 20% failure rate. In better words, these sets of devices (106) can be used as binary features. However, I was not sure that this is the goal or not. In fact, I am not sure, if would ruin the model since it brings the information from future with knowing the nature of the device. The other note would be for k-folds cross-validation. This devices can be used to group the data into stratified-group-k-folds for classificaiton. In better words, all the rows for specific groups go into the same folds.