{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Neural-network classifier for `Survived`\n",
    "\n",
    "Loads `data/train.csv` and `data/test.csv`, encodes and standardizes the features, trains a small Keras network on the training rows, and writes the test-set predictions to `submissions/NNSubmission.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import sklearn\n",
    "import sklearn.model_selection\n",
    "from sklearn import metrics, preprocessing\n",
    "import pickle\n",
    "import math\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings a reader might want to tune\n",
    "\n",
    "SEED = 42            # seeds NumPy and TensorFlow right before the model is built\n",
    "HIDDEN_UNITS = 1028  # width of the single hidden layer\n",
    "EPOCHS = 20          # number of passes over the training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dataframes from our csv files and set indices\n",
    "\n",
    "df = pd.read_csv('data/train.csv').set_index('PassengerId')\n",
    "\n",
    "testdf = pd.read_csv('data/test.csv')\n",
    "PassengerId = testdf['PassengerId']  # kept as a plain column for the submission file\n",
    "testdf = testdf.set_index('PassengerId')\n",
    "\n",
    "data = [df, testdf]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocess the data by converting non-numerical features into numerical categorical features\n",
    "# and applying mean imputation to deal with NaN values\n",
    "\n",
    "# One encoder per column, fitted on the train and test values together, so a given\n",
    "# category (e.g. a particular cabin) gets the same integer code in both dataframes.\n",
    "# Missing entries become their own 'nan' category.\n",
    "for column in ['Sex', 'Cabin', 'Embarked']:\n",
    "    le = preprocessing.LabelEncoder()\n",
    "    le.fit(pd.concat([dataframe[column] for dataframe in data]).fillna('nan'))\n",
    "    for dataframe in data:\n",
    "        dataframe[column] = le.transform(dataframe[column].fillna('nan'))\n",
    "\n",
    "# Fill the remaining NaNs with the TRAINING means so both dataframes are imputed with the\n",
    "# same values; numeric_only skips the text columns (Name, Ticket).\n",
    "train_means = df.drop(columns=['Survived']).mean(numeric_only=True)\n",
    "for dataframe in data:\n",
    "    dataframe.fillna(train_means, inplace=True)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create our input matrix, label vector, and test input matrix\n",
    "\n",
    "X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)\n",
    "y = df['Survived']\n",
    "X_test = testdf.drop(['Name', 'Ticket'], axis=1)\n",
    "\n",
    "# The network is fed raw arrays, so train and test columns must line up one-to-one\n",
    "assert list(X.columns) == list(X_test.columns), 'train/test feature columns differ'\n",
    "\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize the data\n",
    "#\n",
    "# Both matrices are standardized with the TRAINING mean and standard deviation so the\n",
    "# test inputs are on the same scale the network is trained on.\n",
    "\n",
    "train_mean = X.mean()\n",
    "train_std = X.std()\n",
    "X = (X - train_mean) / train_std\n",
    "X_test = (X_test - train_mean) / train_std\n",
    "\n",
    "# A NaN here (missing value left over, or a zero-variance column) would poison training/prediction\n",
    "assert not X.isna().any().any(), 'NaN in training features'\n",
    "assert not X_test.isna().any().any(), 'NaN in test features'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup our model\n",
    "\n",
    "# Seed both libraries so weight initialization and shuffling start from the same state\n",
    "# each time this cell runs (exact repeatability can still depend on hardware / TF version)\n",
    "np.random.seed(SEED)\n",
    "tf.random.set_seed(SEED)\n",
    "\n",
    "model = tf.keras.models.Sequential([\n",
    "    # Flatten out our input\n",
    "    tf.keras.layers.Flatten(),\n",
    "    \n",
    "    # Setup our hidden layer\n",
    "    tf.keras.layers.Dense(HIDDEN_UNITS, activation=tf.nn.relu),\n",
    "    \n",
    "    # Setup output layer: one softmax unit per class (0 / 1)\n",
    "    tf.keras.layers.Dense(2, activation=tf.nn.softmax)\n",
    "])\n",
    "\n",
    "# Compile our model\n",
    "model.compile(optimizer='adam', \n",
    "              loss='sparse_categorical_crossentropy', \n",
    "              metrics=['accuracy'])\n",
    "\n",
    "# Fit model\n",
    "history = model.fit(\n",
    "    X.values,\n",
    "    y.values,\n",
    "    epochs=EPOCHS\n",
    ")\n",
    "\n",
    "# Model summary (summary() prints the table itself and returns None, so it is not wrapped in print)\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_pred = model.predict(X_test.values)  # Note that we need to feed our model the values of our dataframe X_test\n",
    "predictions = np.c_[PassengerId, np.argmax(test_pred, axis=1)]  # Note that we take the argmax over the columns to use our softmax output\n",
    "submission = pd.DataFrame(predictions, columns=['PassengerId', 'Survived'])\n",
    "\n",
    "# Make sure the output folder exists so to_csv does not fail on a fresh checkout\n",
    "os.makedirs('submissions', exist_ok=True)\n",
    "submission.to_csv('submissions/NNSubmission.csv', index=False)\n",
    "\n",
    "submission.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using the original version of this model, we got a score of 0.79425. That number predates the preprocessing fixes in this notebook (encoders shared between train and test, training-set statistics for imputation and scaling), so re-run the notebook and re-score the submission to refresh it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}