Algorithm sklearn validation

3/30/2018 - 6:35 AM

Algorithm sklearn validation

Validation routine for a scikit-learn estimator: it estimates the training, cross-validation (CV) and test error (RMSE). The features are scaled with scikit-learn's MinMaxScaler before they are passed to the estimator.

model_validation.py

## estimator validation
def model_validation(estimator:'sklearn estimator',X_train:'array',y_train:'array',X_test:'array',y_test:'array',isplot:bool=True,outerror:bool=True):
    """
    Estimator validation: training, cross-validation and test error (RMSE).

    The estimator is wrapped in a pipeline that first rescales every feature
    to [0, 1] with MinMaxScaler; the pipeline (and therefore the estimator
    object passed in) is fitted on the training data.

    Test predictions below 0 or above twice the largest training target are
    treated as "strange": they are replaced by NaN and left out of the test
    RMSE.
    NOTE(review): this filter presumably assumes non-negative targets --
    confirm before using it on data with negative target values.

    estimator -- estimator to be validated.
    X_train, X_test -- features for training and test datasets (dim[m,n]).
    y_train, y_test -- targets for training and test datasets (dim[m]).
    isplot -- display plots of targets vs. predictions (train, cv, test).
    outerror -- return errors (=True) or predictions (False).
    return -- dict with keys 'train', 'cv' and 'test' holding either the RMSE
              values or the predictions, according to "outerror". The test
              RMSE is NaN when every test prediction was discarded.
    """
    # Imported locally (like the sklearn imports) so the function does not
    # depend on module-level names that the file never defines.
    import numpy as np
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import cross_val_predict
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import MinMaxScaler

    # scaling + estimator
    scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
    pestimator = Pipeline([('scaler', scaler), ('estimator', estimator)])

    # prediction: a single fit serves both the train and the test predictions
    # (cross_val_predict works on clones, so it does not disturb this fit).
    pestimator.fit(X_train, y_train)
    yhat_train = pestimator.predict(X_train)
    yhat_cv = cross_val_predict(pestimator, X_train, y_train, cv=10, n_jobs=1, method='predict')
    # Float copy: NaN cannot be stored in an integer prediction array.
    yhat_test = np.array(pestimator.predict(X_test), dtype=float)

    # validate test prediction: one boolean mask, so a prediction that breaks
    # both bounds is only counted (and removed) once.
    upper_bound = np.max(y_train) * 2
    strange = (yhat_test > upper_bound) | (yhat_test < 0)
    n_strange = int(strange.sum())
    yhat_test[strange] = np.nan
    if n_strange > 0:
        print('[warning] there are "%s" strange predictions.' % n_strange)

    # validation
    error_train = np.sqrt(mean_squared_error(y_train, yhat_train))
    error_cv = np.sqrt(mean_squared_error(y_train, yhat_cv))
    valid = ~strange
    if valid.any():
        error_test = np.sqrt(mean_squared_error(np.asarray(y_test)[valid], yhat_test[valid]))
    else:
        # Nothing left to score: report NaN instead of crashing on empty input.
        error_test = np.nan
    print('RMSE: training = %.3f validation = %.3f test = %.3f '%(error_train,error_cv,error_test))

    # plot
    if isplot:
        import pandas as pd
        panels = (
            ('Train Error (rmse)', y_train, yhat_train),
            ('CV Error (rmse)', y_train, yhat_cv),
            ('Test Error (rmse)', y_test, yhat_test),
        )
        for title, y_true, y_pred in panels:
            pd.DataFrame({'y': y_true, 'yhat': y_pred}).plot(figsize=(20, 3), title=title)

    # return
    if outerror:
        return {'train': error_train, 'cv': error_cv, 'test': error_test}
    return {'train': yhat_train, 'cv': yhat_cv, 'test': yhat_test}

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

Algorithm sklearn validation