Prediction model for Titanic survival based on the Kaggle data files

11/22/2018 - 10:40 AM
Prediction model for Titanic survival based on the Kaggle data files

# coding: utf-8

# # Introduction to Programming - Data Fellowship

# In[14]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# In[15]:


# Read titanic/train.csv (path relative to the working directory) into a
# DataFrame and print it for a quick visual check.
train = pd.read_csv("titanic/train.csv")
print(train)


# In[16]:


# Read titanic/test.csv the same way and print it.
test = pd.read_csv("titanic/test.csv")
print(test)


# ## Activity 2: Exploratory analysis

# In[17]:


# Store Pclass and Survived of the training data as object dtype instead of
# integers, converting both columns in a single astype() call.
train = train.astype({'Pclass': object, 'Survived': object})
train.dtypes


# In[18]:


## Explore the column names and data types of the test dataset
test.columns ## Gives me the names of the columns


# In[19]:


test.dtypes  # per-column dtypes; a bare expression, so its value is discarded when run as a script


# In[20]:


# Store Pclass of the test data as object dtype instead of integers.
test['Pclass'] = test['Pclass'].astype(object)

# Drop fully duplicated rows from both datasets, keeping the first occurrence.
test = test.drop_duplicates()
train = train.drop_duplicates()


# In[21]:


# Count the non-null entries of every column for each (Sex, Survived) pair.
t_group = train.groupby(['Sex', 'Survived']).count()


# In[22]:


## Turn the PassengerId counts into proportions of the overall total
passenger_counts = t_group['PassengerId']
T = passenger_counts / passenger_counts.sum()


# In[23]:


def compareAgainst(dataset, group_by_array=None):
    """Show a stacked bar chart of row counts of ``dataset`` per group.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to summarise. It is grouped by ``group_by_array`` and the size
        of every group is plotted.
    group_by_array : list of str, optional
        Column names to group by. The last one is unstacked, so it becomes
        the stacked segments of each bar. Defaults to ``['Sex', 'Survived']``.

    Returns
    -------
    None
        The chart is displayed with ``plt.show()``.
    """
    # A None sentinel replaces the former mutable list default so the default
    # grouping can never be altered between calls.
    if group_by_array is None:
        group_by_array = ['Sex', 'Survived']

    # Plot the frame that was passed in; previously the module-level ``train``
    # was used regardless of the ``dataset`` argument.
    group_sizes = dataset.groupby(group_by_array).size().unstack()
    group_sizes.plot(kind='bar', stacked=True)
    plt.show()


# In[24]:


## Show the stacked barplot of passenger counts using the function's default grouping (Sex, Survived)
compareAgainst(train)


# ## Train the first submodel

# In[25]:


# Model 1 (baseline): every passenger gets a ModelPrediction of 0.
train = train.assign(ModelPrediction=0)


# In[26]:



#acc1_female()

def DF_analysis(train):
    """Print empirical vs. predicted survival rates per sex for one model.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide the columns ``Sex`` (compared against ``'female'`` and
        ``'male'``), ``Survived`` and ``ModelPrediction`` (both compared
        against ``1``).

    Returns
    -------
    None
        Six summary lines are written to standard output.

    Notes
    -----
    The printed "accuracy" is ``1 - |empirical - predicted| / empirical``,
    i.e. one minus the relative error of the predicted survival *rate*. It is
    not a per-passenger classification accuracy. Whenever a denominator is
    zero (no passengers of that sex, or an empirical rate of zero) the
    affected figure is printed as ``nan``.
    """

    def _ratio(numerator, denominator):
        # A zero denominator has no meaningful rate: report NaN instead of
        # raising ZeroDivisionError or printing inf.
        return numerator / denominator if denominator else float("nan")

    def _rate_accuracy(empirical, predicted):
        # One minus the relative error of the predicted rate.
        return 1 - _ratio(abs(empirical - predicted), empirical)

    is_female = train.Sex == 'female'
    is_male = train.Sex == 'male'
    survived = train.Survived == 1
    predicted_alive = train.ModelPrediction == 1

    # Summing a boolean mask counts the matching rows directly. The former
    # ``frame.count()[0]`` counted non-null values of whichever column came
    # first and relied on positional indexing of a label-indexed Series.
    n_females = int(is_female.sum())
    n_males = int(is_male.sum())
    fs_train = int((is_female & survived).sum())
    ms_train = int((is_male & survived).sum())
    fs_model = int((is_female & predicted_alive).sum())
    ms_model = int((is_male & predicted_alive).sum())

    male_srate_train = _ratio(ms_train, n_males)
    male_srate_model = _ratio(ms_model, n_males)
    female_srate_train = _ratio(fs_train, n_females)
    female_srate_model = _ratio(fs_model, n_females)

    print("Male empirical survival rate %.3f" % (male_srate_train) )
    print("Male predicted survival rate %.3f" % (male_srate_model) )
    print("Female empirical rate %.3f" % (female_srate_train))
    print("Female predicted survival rate %.3f" % (female_srate_model) )

    print("Male survival accuracy model: %.3f" % (_rate_accuracy(male_srate_train, male_srate_model)) )
    print("Female survival accuracy model: %.3f" % (_rate_accuracy(female_srate_train, female_srate_model)) )

# Model 1: evaluate the ModelPrediction column currently stored on train.
print("Model 1 results")
DF_analysis(train)

# Model 2: predict 1 for every female passenger and 0 for everyone else.
b = (train.Sex == 'female').astype(int)
# assign() returns a copy of train with ModelPrediction replaced, so train
# itself keeps its own predictions.
train2 = train.assign(ModelPrediction=b)
print("Model 2 results")
DF_analysis(train2)


# In[27]:


# Recompute the empirical female survival rate step by step.
# Summing a boolean mask counts matching rows; this replaces
# ``frame.count()[0]``, which indexed a label-indexed Series by position
# (deprecated in pandas) and counted non-null values of the first column only.
fs_train = ((train.Sex == 'female') & (train.Survived == 1)).sum()
print(fs_train)
n_females = (train.Sex == 'female').sum()
print(n_females)
print(fs_train/n_females)


# Part 2: Analysing the effect of class on survival

# In[28]:



# 1. Number of passengers in every (Pclass, Survived) combination.
T = train.groupby(['Pclass', 'Survived']).size()
# 2. The same counts as shares of the total (bare expression: the value is
#    discarded when this file runs as a script).
T.div(T.sum())


# In[29]:


# 3. Reuse compareAgainst to plot the counts: one bar per Survived value,
#    stacked by Pclass.
compareAgainst(train, group_by_array=['Survived', 'Pclass'])


# In[30]:


## Third model results
# Predict 1 only for "posh" females, defined as the women who do not travel
# in 3rd class; everyone else gets 0.
is_PoshFemale = ((train.Sex == 'female') & (train.Pclass != 3)).astype(int)
# Copy of train carrying the model-3 predictions.
train_3 = train.assign(ModelPrediction=is_PoshFemale)
DF_analysis(train_3)


# Data Fellowship Challenge

# In[31]:


# Build the submission from the *test* passengers. Kaggle scores predictions
# for the rows of test.csv, so the posh-female rule (female and not travelling
# in 3rd class) is applied to ``test``. The earlier version exported the
# training-set PassengerIds and predictions instead.
is_posh_female_test = (test.Sex == 'female') & (test.Pclass != 3)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    # Cast the boolean rule to 0/1, stored under the column name Survived.
    'Survived': is_posh_female_test.astype(int),
})
submission.to_csv("titanic_in_python_submission.csv", index=False)
# The file is written relative to the current working directory
# (use pwd to find out where that is).
Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

Prediction model for Titanic survival based on the Kaggle data files