# Cross-Validation Sample
# 10-fold stratified cross-validation of a two-feature logistic regression.
#
# For every fold: rank the columns by absolute correlation with 'target' on
# the training rows only, keep the two strongest, fit a LogisticRegression on
# them and record the mean squared error of its predictions on the held-out
# rows. The mean of the per-fold errors is printed at the end.
#
# NOTE(review): this is the legacy `sklearn.cross_validation` call signature
# (labels passed to the constructor, `n_folds=`). Confirm against the file's
# imports before porting it to `sklearn.model_selection`.
kf = StratifiedKFold(df['target'], n_folds=10)
mse = []
# Kept as a module-level counter so that its final value (the number of folds
# processed) is still available to any code that follows the loop.
fold_count = 0
for train, test in kf:
    print("Processing fold %s" % fold_count)

    # The splitter yields row *positions*, so select positionally. `.ix`
    # silently switches to label lookup when the index holds integers.
    train_fold = df.iloc[train]
    test_fold = df.iloc[test]

    # Find best features: absolute correlation with the target, computed once
    # per fold. The target is removed by label; comparing its self-correlation
    # against 1.0 is fragile under floating-point rounding and would also
    # discard a feature that is perfectly correlated with the target.
    target_corr = train_fold.corr()['target'].drop('target').abs()
    # nlargest skips NaN correlations (e.g. from constant columns).
    features = target_corr.nlargest(2).index.values

    # Get training examples
    train_fold_input = train_fold[features].values
    train_fold_output = train_fold['target']

    # Fit logistic regression
    logreg = LogisticRegression()
    logreg.fit(train_fold_input, train_fold_output)

    # Check MSE on test set. Pass a raw array, exactly as was done for fit().
    pred = logreg.predict(test_fold[features].values)
    mse.append(mean_squared_error(test_fold['target'], pred))

    # Done with the fold
    fold_count += 1

print(DataFrame(mse).mean())
# Processing fold 0
# Processing fold 1
# Processing fold 2
# Processing fold 3
# Processing fold 4
# Processing fold 5
# Processing fold 6
# Processing fold 7
# Processing fold 8
# Processing fold 9
# Mean of the per-fold errors, evaluated as a bare expression. Run as a script
# this statement displays nothing; presumably it was pasted from an
# interactive session (REPL/notebook), where its echo is the output below.
DataFrame(mse).mean()
# 0 0.441212
# dtype: float64