Keiku
5/2/2017 - 4:30 AM

Split K-fold validation dataset.

Split K-fold validation dataset.

import string
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold

# Demo 1: collect out-of-fold values with a plain (unshuffled) K-fold split.
# Toy data: 10 samples with 2 random features and a binary target.
X_train = np.random.random((10, 2))
y_train = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

column = "pred"
n_fold = 5
# One row per training sample; the "pred" column starts empty (NaN) and is
# filled in one validation fold at a time.
p_train = pd.DataFrame(index=range(len(X_train)), columns=[column])

# shuffle is left at its default (False), so the folds are contiguous and
# deterministic. random_state is intentionally not passed: recent
# scikit-learn releases raise ValueError when random_state is set while
# shuffle=False, because it would have no effect.
kf = KFold(n_splits=n_fold)
for tr, te in kf.split(X_train):
    print(tr, te)
    # Train/validation slices for this fold.
    X_tra, y_tra, X_val, y_val = X_train[tr], y_train[tr], X_train[te], y_train[te]

    # Stand-in for predictions: the validation labels themselves are used.
    p_val = y_val
    # The index is 0..n-1, so the positional fold indices in `te` are also
    # valid row labels for .loc.
    p_train.loc[te, column] = p_val
    print(p_train)

# [2 3 4 5 6 7 8 9] [0 1]
#   pred
# 0    1
# 1    1
# 2  NaN
# 3  NaN
# 4  NaN
# 5  NaN
# 6  NaN
# 7  NaN
# 8  NaN
# 9  NaN
# [0 1 4 5 6 7 8 9] [2 3]
#   pred
# 0    1
# 1    1
# 2    1
# 3    1
# 4  NaN
# 5  NaN
# 6  NaN
# 7  NaN
# 8  NaN
# 9  NaN
# [0 1 2 3 6 7 8 9] [4 5]
#   pred
# 0    1
# 1    1
# 2    1
# 3    1
# 4    1
# 5    0
# 6  NaN
# 7  NaN
# 8  NaN
# 9  NaN
# [0 1 2 3 4 5 8 9] [6 7]
#   pred
# 0    1
# 1    1
# 2    1
# 3    1
# 4    1
# 5    0
# 6    0
# 7    0
# 8  NaN
# 9  NaN
# [0 1 2 3 4 5 6 7] [8 9]
#    pred
# 0     1
# 1     1
# 2     1
# 3     1
# 4     1
# 5     0
# 6     0
# 7     0
# 8     0
# 9     0

# Demo 2: same out-of-fold bookkeeping, but with a stratified split so each
# validation fold is drawn from both classes of y_train.
X_train = np.random.random((10, 2))
y_train = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

column = "pred"
n_fold = 5
# One row per training sample; the "pred" column starts empty (NaN) and is
# filled in one validation fold at a time.
p_train = pd.DataFrame(index=range(len(X_train)), columns=[column])

# shuffle is left at its default (False), so the split is deterministic.
# random_state is intentionally not passed: recent scikit-learn releases
# raise ValueError when random_state is set while shuffle=False.
skf = StratifiedKFold(n_splits=n_fold)
# Stratification needs the labels, hence y_train is passed to split().
for tr, te in skf.split(X_train, y_train):
    print(tr, te)
    # Train/validation slices for this fold.
    X_tra, y_tra, X_val, y_val = X_train[tr], y_train[tr], X_train[te], y_train[te]

    # Stand-in for predictions: the validation labels themselves are used.
    p_val = y_val
    # The index is 0..n-1, so the positional fold indices in `te` are also
    # valid row labels for .loc.
    p_train.loc[te, column] = p_val
    print(p_train)

# [1 2 3 4 6 7 8 9] [0 5]
#   pred
# 0    1
# 1  NaN
# 2  NaN
# 3  NaN
# 4  NaN
# 5    0
# 6  NaN
# 7  NaN
# 8  NaN
# 9  NaN
# [0 2 3 4 5 7 8 9] [1 6]
#   pred
# 0    1
# 1    1
# 2  NaN
# 3  NaN
# 4  NaN
# 5    0
# 6    0
# 7  NaN
# 8  NaN
# 9  NaN
# [0 1 3 4 5 6 8 9] [2 7]
#   pred
# 0    1
# 1    1
# 2    1
# 3  NaN
# 4  NaN
# 5    0
# 6    0
# 7    0
# 8  NaN
# 9  NaN
# [0 1 2 4 5 6 7 9] [3 8]
#   pred
# 0    1
# 1    1
# 2    1
# 3    1
# 4  NaN
# 5    0
# 6    0
# 7    0
# 8    0
# 9  NaN
# [0 1 2 3 5 6 7 8] [4 9]
#    pred
# 0     1
# 1     1
# 2     1
# 3     1
# 4     1
# 5     0
# 6     0
# 7     0
# 8     0
# 9     0

# Demo 3: out-of-fold values written into a frame that already has another
# column ("ID"), so only the "pred" column is assigned.
X_train = np.random.random((10, 2))
y_train = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

column = "pred"
n_fold = 5
# "ID" holds one uppercase letter per sample (A, B, C, ...); "pred" is not in
# the data dict, so it is created empty (NaN) and filled fold by fold.
p_train = pd.DataFrame({"ID": list(string.ascii_uppercase[:len(X_train)])},
                       index=range(len(X_train)),
                       columns=["ID", column])

# shuffle is left at its default (False), so the folds are contiguous and
# deterministic. random_state is intentionally not passed: recent
# scikit-learn releases raise ValueError when random_state is set while
# shuffle=False, because it would have no effect.
kf = KFold(n_splits=n_fold)
for tr, te in kf.split(X_train):
    print(tr, te)
    # Train/validation slices for this fold.
    X_tra, y_tra, X_val, y_val = X_train[tr], y_train[tr], X_train[te], y_train[te]

    # Stand-in for predictions: the validation labels themselves are used.
    p_val = y_val
    # Assign only the prediction column; "ID" is left untouched. The index is
    # 0..n-1, so the positional fold indices in `te` are valid row labels.
    p_train.loc[te, column] = p_val
    print(p_train)

# [2 3 4 5 6 7 8 9] [0 1]
#   ID pred
# 0  A    1
# 1  B    1
# 2  C  NaN
# 3  D  NaN
# 4  E  NaN
# 5  F  NaN
# 6  G  NaN
# 7  H  NaN
# 8  I  NaN
# 9  J  NaN
# [0 1 4 5 6 7 8 9] [2 3]
#   ID pred
# 0  A    1
# 1  B    1
# 2  C    1
# 3  D    1
# 4  E  NaN
# 5  F  NaN
# 6  G  NaN
# 7  H  NaN
# 8  I  NaN
# 9  J  NaN
# [0 1 2 3 6 7 8 9] [4 5]
#   ID pred
# 0  A    1
# 1  B    1
# 2  C    1
# 3  D    1
# 4  E    1
# 5  F    0
# 6  G  NaN
# 7  H  NaN
# 8  I  NaN
# 9  J  NaN
# [0 1 2 3 4 5 8 9] [6 7]
#   ID pred
# 0  A    1
# 1  B    1
# 2  C    1
# 3  D    1
# 4  E    1
# 5  F    0
# 6  G    0
# 7  H    0
# 8  I  NaN
# 9  J  NaN
# [0 1 2 3 4 5 6 7] [8 9]
#   ID pred
# 0  A    1
# 1  B    1
# 2  C    1
# 3  D    1
# 4  E    1
# 5  F    0
# 6  G    0
# 7  H    0
# 8  I    0
# 9  J    0