Validation routine for a scikit-learn estimator: it estimates the training, cross-validation (CV) and test errors. The features are scaled with scikit-learn's MinMaxScaler before they are passed to the estimator.
## estimator validation
def model_validation(estimator:'sklearn estimator',X_train:'array',y_train:'array',X_test:'array',y_test:'array',isplot:bool=True,outerror:bool=True):
    """
    Estimator validation: training, cv and test error (RMSE).

    The estimator is wrapped in a pipeline that first rescales the features
    to [0, 1] with MinMaxScaler. The pipeline is fitted on the training data
    and used to predict the training and test sets; the cv predictions come
    from cross_val_predict with cv=10 on the training data.

    Test predictions that are negative or larger than twice the maximum
    training target are treated as "strange": they are replaced by NaN and
    excluded from the test error.

    estimator -- estimator to be validated (it is fitted in place as the last pipeline step).
    X_train, X_test -- features for training and test datasets (dim[m,n]).
    y_train, y_test -- targets for training and test datasets (dim[m]).
    isplot -- display plots of targets vs. predictions (train, cv, test).
    outerror -- return errors (=True) or predictions (False).
    return -- dict with keys 'train', 'cv' and 'test' holding either the RMSE
              values or the predictions according to "outerror". The test
              RMSE is NaN when every test prediction was discarded; returned
              test predictions contain NaN at the discarded positions.
    """
    # Local imports, matching the style of the rest of the function, so the
    # block does not depend on module-level aliases being defined elsewhere.
    import numpy as np
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import cross_val_predict
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import MinMaxScaler

    # scaling + estimator
    scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
    pestimator = Pipeline([('scaler', scaler), ('estimator', estimator)])

    # prediction
    # Fit once: cross_val_predict works on clones of the pipeline, so the
    # fitted pipeline is still valid for the test prediction afterwards and
    # the train/test errors are guaranteed to come from the same model.
    pestimator.fit(X_train, y_train)
    yhat_train = pestimator.predict(X_train)
    yhat_cv = cross_val_predict(pestimator, X_train, y_train, cv=10, n_jobs=1, method='predict')
    # Float copy so that NaN can be written even if the estimator returns integers.
    yhat_test = np.array(pestimator.predict(X_test), dtype=float)

    # validate test prediction
    upper_bound = np.max(y_train) * 2
    # A boolean mask avoids counting a prediction twice when it violates both limits.
    is_strange = (yhat_test > upper_bound) | (yhat_test < 0)
    n_strange = int(np.count_nonzero(is_strange))
    yhat_test[is_strange] = np.nan
    if n_strange > 0:
        print('[warning] there are "%s" strange predictions.'%n_strange)

    # validation
    error_train = np.sqrt(mean_squared_error(y_train, yhat_train))
    error_cv = np.sqrt(mean_squared_error(y_train, yhat_cv))
    y_test_valid = np.asarray(y_test)[~is_strange]
    yhat_test_valid = yhat_test[~is_strange]
    if y_test_valid.size > 0:
        error_test = np.sqrt(mean_squared_error(y_test_valid, yhat_test_valid))
    else:
        # mean_squared_error rejects empty input; report an undefined error instead of crashing.
        print('[warning] no valid test predictions left, test error is undefined.')
        error_test = np.nan
    print('RMSE: training = %.3f validation = %.3f test = %.3f '%(error_train,error_cv,error_test))

    # plot
    if isplot:
        import pandas as pd
        panels = (
            ('Train Error (rmse)', y_train, yhat_train),
            ('CV Error (rmse)', y_train, yhat_cv),
            ('Test Error (rmse)', y_test, yhat_test),
        )
        for title, y_true, y_pred in panels:
            pd.DataFrame({'y': y_true, 'yhat': y_pred}).plot(figsize=(20, 3), title=title)

    # return
    if outerror:
        return {'train': error_train, 'cv': error_cv, 'test': error_test}
    return {'train': yhat_train, 'cv': yhat_cv, 'test': yhat_test}