jmquintana79
3/30/2018 - 6:42 AM

Learning Curve

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, make_scorer

try:
    # `learning_curve` lives in `sklearn.model_selection` since scikit-learn 0.18;
    # the legacy `sklearn.learning_curve` module was removed in 0.20.
    from sklearn.model_selection import learning_curve
except ImportError:
    from sklearn.learning_curve import learning_curve

## prepare data
# Select the training rows once and reuse the subset for features and target.
data_train = data[data.index.isin(ldt_train)]
# `.values` is used for both arrays: `DataFrame.as_matrix()` was removed in
# pandas 1.0, and `.values` returns the same NumPy array on every version.
X = data_train[lcol_features].values
y = data_train[starget].values
# polynomial features generation: expand the features with all degree-2 terms
# (squares and pairwise products) plus a constant bias column
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
X = poly.fit_transform(X)

## custom scorer
def frmse(y_true, y_pred):
    """Return the negated root-mean-squared error of the predictions.

    The value is always <= 0, so a larger (closer to zero) result means a
    better fit.
    """
    return -np.sqrt(mean_squared_error(y_true, y_pred))
# `frmse` is already negated, i.e. it is a "greater is better" score.  Passing
# greater_is_better=False here would flip the sign a second time and make the
# scorer reward the *largest* error, so the flag must be True.
srmse = make_scorer(frmse, greater_is_better=True)

## scaling + estimator
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
estimator = Lasso(alpha=0.16)
# Scaling is part of the pipeline, so it is re-fitted on the training portion
# of every cross-validation split.
clf = Pipeline([('scaler', scaler), ('estimator', estimator)])

## build learning curve
train_sizes, train_scores, valid_scores = learning_curve(
    clf,
    X,
    y,
    train_sizes=np.array([0.1, 0.33, 0.55, 0.78, 1.]),
    cv=10,
    scoring=srmse,
)
# store results in a pandas df to be plotted
# The score arrays hold one row per training size and one column per CV fold.
# Scores are negated RMSE values, so the sign is flipped after averaging over
# the folds in order to plot the (positive) RMSE.
LC = pd.DataFrame({
    'sizes': train_sizes,
    'train': -np.mean(train_scores, axis=1),
    'cv': -np.mean(valid_scores, axis=1),
}).set_index('sizes')
LC.plot()