In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sklearn
import sklearn.model_selection
from sklearn import metrics, preprocessing
import pickle
import math
import tensorflow as tf

In [2]:
# Create dataframes from our csv files and set indeces

df = pd.read_csv('data/train.csv')
df.set_index('PassengerId', inplace=True)

testdf = pd.read_csv('data/test.csv')
PassengerId = testdf['PassengerId']
testdf.set_index('PassengerId', inplace=True)

data = [df, testdf]

In [3]:
# Preprocess the data by converting non numerical features into numerical categorical features 
# and applying mean imputation to deal with NaN values

for dataframe in data:
    le = preprocessing.LabelEncoder()
    dataframe["Sex"] = le.fit_transform(list(dataframe["Sex"]))
    dataframe["Cabin"] = le.fit_transform(list(dataframe["Cabin"]))
    dataframe["Embarked"] = le.fit_transform(list(dataframe["Embarked"]))
    dataframe.fillna(dataframe.mean(), inplace=True)
    
print(df.head)

<bound method NDFrame.head of              Survived  Pclass  \
PassengerId                     
1                   0       3   
2                   1       1   
3                   1       3   
4                   1       1   
5                   0       3   
...               ...     ...   
887                 0       2   
888                 1       1   
889                 0       3   
890                 1       1   
891                 0       3   

                                                          Name  Sex  \
PassengerId                                                           
1                                      Braund, Mr. Owen Harris    1   
2            Cumings, Mrs. John Bradley (Florence Briggs Th...    0   
3                                       Heikkinen, Miss. Laina    0   
4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)    0   
5                                     Allen, Mr. William Henry    1   
...                                        

In [4]:
# Create our input matrix, label vector, and test input matrix

X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)
y = df['Survived']
X_test = testdf.drop(['Name', 'Ticket'], axis=1)
print(X.head)

<bound method NDFrame.head of              Pclass  Sex        Age  SibSp  Parch     Fare  Cabin  Embarked
PassengerId                                                                
1                 3    1  22.000000      1      0   7.2500    147         2
2                 1    0  38.000000      1      0  71.2833     81         0
3                 3    0  26.000000      0      0   7.9250    147         2
4                 1    0  35.000000      1      0  53.1000     55         2
5                 3    1  35.000000      0      0   8.0500    147         2
...             ...  ...        ...    ...    ...      ...    ...       ...
887               2    1  27.000000      0      0  13.0000    147         2
888               1    0  19.000000      0      0  30.0000     30         2
889               3    0  29.699118      1      2  23.4500    147         2
890               1    1  26.000000      0      0  30.0000     60         0
891               3    1  32.000000      0      0   7.7500

In [5]:
# Normalize the data

X=(X-X.mean())/X.std()
X_test=(X_test-X_test.mean())/X_test.std()

In [19]:
# Setup our model

model = tf.keras.models.Sequential([
    # Flatten out our input
    tf.keras.layers.Flatten(),
    
    # Setup our hidden layer
    tf.keras.layers.Dense(1028, activation=tf.nn.relu),
    
    # Setup output layer
    tf.keras.layers.Dense(2, activation=tf.nn.softmax)
])

# Compile our model
model.compile(optimizer='adam', 
              loss = 'sparse_categorical_crossentropy', 
             metrics=['accuracy'])

# Fit model
history = model.fit(
    X.values,
    y.values,
    epochs=20
)

# Model summary
print(model.summary())

Train on 891 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_7 (Flatten)          multiple                  0         
_________________________________________________________________
dense_19 (Dense)             multiple                  9252      
_________________________________________________________________
dense_20 (Dense)             multiple                  2058      
Total params: 11,310
Trainable params: 11,310
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
test_pred = model.predict(X_test.values) # Note that we need to feed our model the values or our dataframe X_test
predictions = np.c_[PassengerId, np.argmax(test_pred, axis=1)] # Note that we take the argmax over the collumns to use our softmax output
submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])
print(submission.head)
submission.to_csv("submissions/NNSubmission.csv", index=False)

<bound method NDFrame.head of      PassengerId  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]>


Using this model, we get a score of 0.79425