# Prediction model for Titanic survival, based on the Kaggle Titanic data files
# coding: utf-8
# # Introduction to Programming - Data Fellowship
# In[14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# In[15]:
# Load the training split (it carries the Survived column used below) and
# echo it for a quick visual check.
train = pd.read_csv("titanic/train.csv")
print(train)
# In[16]:
# Load the test split and echo it as well.
test = pd.read_csv("titanic/test.csv")
print(test)
# ## Activity 2: Exploratory analysis
# In[17]:
# Store Pclass and Survived in the training data with the generic object
# dtype instead of an integer dtype.
train['Pclass'] = train['Pclass'].astype(object)
train['Survived'] = train['Survived'].astype(object)
# NOTE: bare expressions such as the one below only render output inside a
# notebook cell; they are no-ops when this file runs as a script.
train.dtypes
# In[18]:
# Inspect the column names of the test dataset.
test.columns
# In[19]:
# Inspect the column dtypes of the test dataset.
test.dtypes
# In[20]:
# Apply the same Pclass conversion to the test dataset.
test['Pclass'] = test['Pclass'].astype(object)
# Drop fully duplicated rows (the first occurrence is kept) from both splits.
test = test.drop_duplicates()
train = train.drop_duplicates()
# In[21]:
# Per (Sex, Survived) group, count the non-null values in every other column.
t_group = train.groupby(['Sex', 'Survived']).count()
# In[22]:
# Convert the PassengerId counts into proportions of the whole table.
T = t_group['PassengerId']
T = T / T.sum()
# In[23]:
def compareAgainst(dataset, group_by_array=None):
    """Show a stacked bar chart of row counts in ``dataset`` for a grouping.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The frame to summarise. The chart is built from this argument, not
        from any module-level frame.
    group_by_array : list of str, optional
        Column names to group by. Defaults to ``['Sex', 'Survived']``. The
        last column is unstacked, so it becomes the stacked segments of each
        bar while the remaining column(s) label the bars.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Use a None sentinel rather than a mutable list as the default value.
    if group_by_array is None:
        group_by_array = ['Sex', 'Survived']
    counts = dataset.groupby(group_by_array).size().unstack()
    counts.plot(kind='bar', stacked=True)
    plt.show()
# In[24]:
## Show the barplot
# Uses the function's default grouping (Sex and Survived).
compareAgainst(train)
# ## Train the first submodel
# In[25]:
# NOTE: a bare expression only renders output inside a notebook cell; it is
# a no-op when this file runs as a script.
train
# Model 1 (baseline): predict that no passenger survives.
train['ModelPrediction'] = 0
#train.head()
#tT = train.groupby(['Sex','Survived']).count()
#myT = train.groupby(['Sex','ModelPrediction']).count()
#sum(train.loc[:,'ModelPrediction'])
#males_survival_p = N(train.loc[:,'ModelPrediction']).count()
#female_survival_p = N(train.loc[:,'ModelPrediction']).count()
# In[26]:
#acc1_female()
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or NaN if the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return float(numerator) / denominator


def DF_analysis(train):
    """Print empirical and predicted survival rates per sex, plus a per-sex score.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns ``Sex`` (values ``'female'`` / ``'male'``),
        ``Survived`` and ``ModelPrediction`` (a value of 1 marks a survivor).

    Returns
    -------
    None
        Six lines are written to standard output.

    Notes
    -----
    The "accuracy" printed here is ``1 - |empirical - predicted| / empirical``,
    i.e. one minus the relative error between the two group-level *rates*.
    It is not a per-passenger classification accuracy. Any ratio whose
    denominator is zero (an empty group, or an empirical rate of zero) is
    reported as NaN.
    """
    is_female = train['Sex'] == 'female'
    is_male = train['Sex'] == 'male'
    survived = train['Survived'] == 1
    predicted = train['ModelPrediction'] == 1

    # Summing a boolean mask counts the matching rows directly. This avoids
    # depending on the first column being free of missing values and avoids
    # positional integer indexing into a label-indexed Series.
    n_females = int(is_female.sum())
    n_males = int(is_male.sum())
    fs_train = int((is_female & survived).sum())
    ms_train = int((is_male & survived).sum())
    fs_model = int((is_female & predicted).sum())
    ms_model = int((is_male & predicted).sum())

    male_srate_train = _safe_ratio(ms_train, n_males)
    male_srate_model = _safe_ratio(ms_model, n_males)
    female_srate_train = _safe_ratio(fs_train, n_females)
    female_srate_model = _safe_ratio(fs_model, n_females)

    male_score = 1 - _safe_ratio(abs(male_srate_train - male_srate_model), male_srate_train)
    female_score = 1 - _safe_ratio(abs(female_srate_train - female_srate_model), female_srate_train)

    print("Male empirical survival rate %.3f" % (male_srate_train))
    print("Male predicted survival rate %.3f" % (male_srate_model))
    print("Female empirical rate %.3f" % (female_srate_train))
    print("Female predicted survival rate %.3f" % (female_srate_model))
    print("Male survival accuracy model: %.3f" % (male_score))
    print("Female survival accuracy model: %.3f" % (female_score))
# Model 1: the all-zero baseline already stored in train['ModelPrediction'].
print("Model 1 results")
DF_analysis(train)
# Model 2: predict survival for every female passenger and for no male one.
train2 = train.copy()  # work on a copy so the baseline column in train is kept
b = (train['Sex'] == 'female').astype(int)  # True/False -> 1/0
train2['ModelPrediction'] = b
print("Model 2 results")
DF_analysis(train2)
# In[27]:
# Sanity check: number of surviving females, number of females, and their ratio.
# Rows are counted by summing boolean masks, which does not depend on the
# first column being free of missing values and avoids positional integer
# indexing into a label-indexed Series.
fs_train = ((train['Sex'] == 'female') & (train['Survived'] == 1)).sum()
print(fs_train)
n_females = (train['Sex'] == 'female').sum()
print(n_females)
print(fs_train/n_females)
# Part 2: Analysing the effect of class on survival
# In[28]:
# We use groupby method to see aggregates
# 1. Number of passengers in each (Pclass, Survived) combination.
T = train.groupby(['Pclass', 'Survived']).size()
# 2. Proportions of the whole table. NOTE: the result is not stored, so this
#    line only shows output inside a notebook cell.
T / T.sum()
# In[29]:
# 3. Plot the counts with one bar per Survived value, stacked by Pclass.
compareAgainst(train, ['Survived', 'Pclass'])
# In[30]:
## Third model: combine two boolean conditions.
# Predict survival only for "posh" females, defined here as female passengers
# who do not travel in 3rd class.
train_3 = train.copy()  # work on a copy so train itself is left untouched
is_PoshFemale = ((train['Sex'] == 'female') & (train['Pclass'] != 3)).astype(int)  # True/False -> 1/0
train_3['ModelPrediction'] = is_PoshFemale
DF_analysis(train_3)
# Data Fellowship Challenge
# In[31]:
# Build the submission from the TEST split: a submission holds one prediction
# per test passenger, so both the passenger ids and the predictions must come
# from `test`, not from the training rows whose outcome is already known.
# The third model's rule is applied: a passenger is predicted to survive when
# she is female and does not travel in 3rd class.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': ((test['Sex'] == 'female') & (test['Pclass'] != 3)).astype(int),
})
# index=False keeps the DataFrame index out of the file, leaving exactly the
# two columns PassengerId and Survived.
submission.to_csv("titanic_in_python_submission.csv", index=False)
# Use pwd (print working directory) to know where the file was saved.