JokullJohanns
5/30/2016 - 9:08 AM

Split drivers in Kaggle

Split drivers in Kaggle

import pandas as pd
from shutil import move
import os


def main():
	driversList = readDriversList()
	trainSubjects, validateSubjects = splitDrivers(driversList)

	moveFiles(driversList, validateSubjects, 'validation')

	valImages = driversList[driversList['subject'].isin(validateSubjects)][['img', 'classname']]
	trainImages = driversList[driversList['subject'].isin(trainSubjects)][['img', 'classname']]

	trainImages['classname'] = trainImages['classname'].str.replace("c", "").astype(int)
	valImages['classname'] = valImages['classname'].str.replace("c", "").astype(int)

	trainImages.to_csv('trainset.csv', index=False, header=False, sep=" ")
	valImages.to_csv('validationset.csv', index=False, header=False, sep=" ")


def readDriversList():
	return pd.read_csv('driver_imgs_list.csv', delimiter=",")


def splitDrivers(drivers, trainSize=0.8):
	subjectList = drivers['subject'].drop_duplicates()
	trainCount = int(len(subjectList) * trainSize)
	trainDrivers = subjectList[:trainCount]
	validateDrivers = subjectList[trainCount + 1:]

	return trainDrivers.values, validateDrivers.values


def moveFiles(drivers, setList, setType):
	for line in drivers[drivers['subject'].isin(setList)].values:
		if not os.path.exists('imgs/' + setType + '/' + line[1]):
			os.makedirs('imgs/' + setType + '/' + line[1])
		move('imgs/train/' + line[1] + '/' + line[2], 'imgs/' + setType + '/' + line[1] + '/' + line[2])


if __name__ == "__main__":

	main()