felipehuerta17
11/22/2018 - 10:40 AM

Prediction model for Titanic survival, based on the Kaggle Titanic dataset

Prediction model for Titanic survival, based on the Kaggle Titanic dataset

# coding: utf-8

# # Introduction to Programming - Data Fellowship

# In[14]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# In[15]:


# Read the training CSV (path is relative to the working directory) and echo
# the resulting DataFrame as a quick sanity check.
train = pd.read_csv("titanic/train.csv")
print(train)


# In[16]:


# Read the test CSV from the same directory and echo it as well.
test = pd.read_csv("titanic/test.csv")
print(test)


# ## Activity 2: Exploratory analysis

# In[17]:


# Recast Pclass and Survived in the training data from integers to the
# generic object dtype, using the .astype(typename) method.

train["Pclass"] = train["Pclass"].astype(object)
train["Survived"] = train["Survived"].astype(object)
train.dtypes  # bare expression: shows the column dtypes when run as a notebook cell


# In[18]:


## Explore the structure of the test dataset
test.columns ## Names of the columns (bare expression; has no effect when run as a script)


# In[19]:


# Data type of every column in the test dataset (bare expression as well).
test.dtypes


# In[20]:


# Recast Pclass in the testing dataset from integers to objects
test["Pclass"] = test["Pclass"].astype(object)

# Drop rows that are exact duplicates of an earlier row (first occurrence is kept)
test = test.drop_duplicates()
train = train.drop_duplicates()


# In[21]:


# For every (Sex, Survived) pair, count the non-null entries of each other column
t_group = train.groupby(['Sex', 'Survived']).count()


# In[22]:


## Turn the PassengerId counts into each pair's share of the total
T = t_group['PassengerId']
T = T.div(T.sum())


# In[23]:


def compareAgainst(dataset, group_by_array = None):
    """Show a stacked bar chart of row counts in ``dataset`` per group.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to summarise. It must contain every column named in
        ``group_by_array``.
    group_by_array : list of str, optional
        Columns to group by. The last one is unstacked into the stacked bar
        segments; the remaining one(s) label the bars. Defaults to
        ``['Sex', 'Survived']``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Resolve the default here instead of using a mutable list as the default.
    if group_by_array is None:
        group_by_array = ['Sex', 'Survived']

    # Group the frame that was passed in, not the module-level `train`.
    group_sizes = dataset.groupby(group_by_array).size().unstack()
    group_sizes.plot(kind='bar', stacked=True)
    plt.show()


# In[24]:


## Show the bar plot for the training data, using the function's default grouping
compareAgainst(dataset=train)


# ## Train the first submodel

# In[25]:


train  # bare expression; has no effect when this file runs as a script
# Baseline for the first model: predict 0 (did not survive) for every row.
train['ModelPrediction'] = 0
# Earlier experiments, left disabled:
#train.head()
#tT = train.groupby(['Sex','Survived']).count()
#myT = train.groupby(['Sex','ModelPrediction']).count()

#sum(train.loc[:,'ModelPrediction'])
#males_survival_p = N(train.loc[:,'ModelPrediction']).count()
#female_survival_p = N(train.loc[:,'ModelPrediction']).count()


# In[26]:



#acc1_female()

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float("nan")
    return numerator / denominator


def DF_analysis(train):
    """Print empirical and predicted survival rates per sex, plus a relative accuracy.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide the columns ``Sex`` (compared with 'female' / 'male'),
        ``Survived`` and ``ModelPrediction`` (both compared with 1).

    Returns
    -------
    None
        Six summary lines are printed to stdout. A rate whose group is empty,
        or an accuracy whose empirical rate is zero, is printed as ``nan``.
    """
    is_female = train.Sex == 'female'
    is_male = train.Sex == 'male'
    survived = train.Survived == 1
    predicted_alive = train.ModelPrediction == 1

    # Count rows by summing boolean masks. Filtering the frame and taking
    # `.count()[0]` would only count non-null values of whichever column
    # happens to be first, and indexes a label-indexed Series by position.
    n_females = int(is_female.sum())
    n_males = int(is_male.sum())

    male_srate_train = _safe_ratio(int((is_male & survived).sum()), n_males)
    male_srate_model = _safe_ratio(int((is_male & predicted_alive).sum()), n_males)
    female_srate_train = _safe_ratio(int((is_female & survived).sum()), n_females)
    female_srate_model = _safe_ratio(int((is_female & predicted_alive).sum()), n_females)

    print("Male empirical survival rate %.3f" % (male_srate_train) )
    print("Male predicted survival rate %.3f" % (male_srate_model) )
    print("Female empirical rate %.3f" % (female_srate_train))
    print("Female predicted survival rate %.3f" % (female_srate_model) )

    # Accuracy = 1 - relative error of the predicted rate w.r.t. the empirical rate.
    male_accuracy = 1 - _safe_ratio(abs(male_srate_train - male_srate_model), male_srate_train)
    female_accuracy = 1 - _safe_ratio(abs(female_srate_train - female_srate_model), female_srate_train)

    print("Male survival accuracy model: %.3f" % (male_accuracy) )
    print("Female survival accuracy model: %.3f" % (female_accuracy) )

# Model 1: evaluate the predictions currently stored in train
print("Model 1 results")
DF_analysis(train)

# Model 2: predict survival (1) for every female and 0 for everyone else
b = (train['Sex'] == 'female') * 1  # multiplying by 1 turns the booleans into integers
train2 = train.copy()  # separate copy, so train keeps its own predictions
train2['ModelPrediction'] = b
print("Model 2 results")
DF_analysis(train2)


# In[27]:


# Manual check of the empirical female survival rate.
# Rows are counted by summing boolean masks instead of `.count()[0]`, which
# indexes a label-indexed Series by position (deprecated in pandas) and only
# counts the non-null values of the first column.
fs_train = ((train.Sex == 'female') & (train.Survived == 1)).sum()
print(fs_train)
n_females = (train.Sex == 'female').sum()
print(n_females)
print(fs_train/n_females)


# Part 2: Analysing the effect of class on survival

# In[28]:



# Aggregate with groupby to see how survival is distributed across classes
T = train.groupby(['Pclass', 'Survived']).size()  # 1. Get the distribution
T.div(T.sum())  # 2. Calculate probabilities (bare expression; result is not stored)


# In[29]:


# 3. Reuse compareAgainst to plot the group counts
compareAgainst(train, group_by_array=['Survived', 'Pclass'])


# In[30]:


## One more boolean rule
# Third model results
# Assign a 1 only to "posh" females, defined as the ones who don't travel in 3rd class
is_PoshFemale = ((train['Sex'] == 'female') & (train['Pclass'] != 3)) * 1  # boolean -> integer

train_3 = train.copy()  # separate copy, so train keeps its own predictions
train_3['ModelPrediction'] = is_PoshFemale
DF_analysis(train_3)


# Data Fellowship Challenge

# In[31]:


# Build the submission from the TEST passengers: a submission made of training
# PassengerIds would only repeat rows whose outcome is already known, and the
# test frame loaded above would never be used.
# NOTE(review): assumes the challenge expects the standard Kaggle format
# (one row per test passenger, columns PassengerId and Survived) - confirm.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    # Same rule as the third model: survive (1) only for females outside 3rd class.
    'Survived': ((test['Sex'] == 'female') & (test['Pclass'] != 3)) * 1,
})
submission.to_csv("titanic_in_python_submission.csv", index=False)
# The file is written relative to the current working directory.