jmquintana79
6/1/2018 - 4:29 AM

VALIDATION CURVE

Plot the validation curve of an estimator over a range of values for one of its hyper-parameters.

## scorer
from sklearn.metrics import make_scorer
def RMSE(y_true,y_pred):
    """
    Root Mean Squared Error between predictions and observations.
    y_true -- array-like of observed target values.
    y_pred -- array-like of predicted values (same shape as y_true, or broadcastable).
    return -- square root of the mean of the squared element-wise differences.
    """
    # Imported locally so the function does not depend on a module-level
    # numpy import being present.
    import numpy as np

    # Converting to float arrays lets plain lists/tuples be subtracted and
    # avoids wrap-around (unsigned ints) or TypeError (booleans) on the difference.
    error = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    mse = np.mean(error**2)
    return np.sqrt(mse)
def scorer():
    """
    Build a Scikit Learn scorer object around the RMSE function.
    return -- result of make_scorer for RMSE, declared with greater_is_better=False
              because RMSE is an error measure (smaller values are better).
    """
    rmse_scorer = make_scorer(RMSE, greater_is_better=False)
    return rmse_scorer
## PLOT THE VALIDATION CURVE
def _resolve_scoring(scorer):
    """
    Turn the user supplied scorer into a value usable as `scoring=` in Scikit Learn.
    scorer -- None, a scoring name (str), a scorer callable, or a zero-argument
              factory returning one of the former.
    return -- the scorer unchanged, or the result of calling it when it is a
              zero-argument factory.
    """
    import inspect

    if scorer is None or isinstance(scorer, str) or not callable(scorer):
        return scorer
    try:
        parameters = inspect.signature(scorer).parameters
    except (TypeError, ValueError):
        # Signature cannot be introspected: assume it is already a scorer.
        return scorer
    # A real scorer is called with (estimator, X, y), so it always declares
    # parameters; a callable declaring none can only be a factory.
    return scorer() if not parameters else scorer


def plot_validation_curve(X:'array',y:'array',estimator:'scikit clf',param_name:str,param_values:list,scorer:'scikit scorer',
                          njobs:int=2,ncv:int=10,issemilog:bool=True)->tuple:
    """
    Plot the Validation Curve.
    X,y -- features / target arrays.
    estimator -- Scikit Learn estimator.
    param_name -- name of the hyperparameter to be evaluated.
    param_values -- array of values of the hyperparameter to be evaluated (used as x axis).
    scorer -- Scikit Learn scoring: a suitable string, a scorer object (e.g. built with
              make_scorer), or a zero-argument function returning one of them.
    njobs -- number of CPUs to be used (default 2).
    ncv -- number of cv folds to be used (default 10).
    issemilog -- use a logarithmic x axis (semilogx) instead of a linear one (default True).
    return -- tuple: (training scores, test (cv) scores) exactly as returned by
              validation_curve (not averaged).
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.model_selection import validation_curve

    # The scorer is resolved instead of being called unconditionally, so the
    # string and scorer-object forms are accepted as well as a factory.
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_values,
        scoring=_resolve_scoring(scorer), n_jobs=njobs, cv=ncv)

    # Pick the line-drawing function once instead of branching per curve.
    draw_line = plt.semilogx if issemilog else plt.plot
    lw = 2

    plt.title("Validation Curve")
    plt.xlabel("%s"%param_name)
    plt.ylabel("Score")

    curves = (
        (train_scores, "Training score", "darkorange"),
        (test_scores, "Cross-validation score", "navy"),
    )
    for scores, label, color in curves:
        # Aggregate along axis 1 to get one mean / std per parameter value.
        scores_mean = np.mean(scores, axis=1)
        scores_std = np.std(scores, axis=1)
        draw_line(param_values, scores_mean, label=label, color=color, lw=lw)
        # Shaded band of one standard deviation around the mean.
        plt.fill_between(param_values, scores_mean - scores_std,
                         scores_mean + scores_std, alpha=0.2,
                         color=color, lw=lw)

    plt.legend(loc="best")
    plt.show()
    # return
    return (train_scores,test_scores)