MLProjects/kaggle_titanic/NeuralNetwork.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import sklearn\n",
    "import sklearn.model_selection\n",
    "from sklearn import metrics, preprocessing\n",
    "import pickle\n",
    "import math\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dataframes from our csv files and set indeces\n",
    "\n",
    "df = pd.read_csv('data/train.csv')\n",
    "df.set_index('PassengerId', inplace=True)\n",
    "\n",
    "testdf = pd.read_csv('data/test.csv')\n",
    "PassengerId = testdf['PassengerId']\n",
    "testdf.set_index('PassengerId', inplace=True)\n",
    "\n",
    "data = [df, testdf]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<bound method NDFrame.head of              Survived  Pclass  \\\n",
      "PassengerId                     \n",
      "1                   0       3   \n",
      "2                   1       1   \n",
      "3                   1       3   \n",
      "4                   1       1   \n",
      "5                   0       3   \n",
      "...               ...     ...   \n",
      "887                 0       2   \n",
      "888                 1       1   \n",
      "889                 0       3   \n",
      "890                 1       1   \n",
      "891                 0       3   \n",
      "\n",
      "                                                          Name  Sex  \\\n",
      "PassengerId                                                           \n",
      "1                                      Braund, Mr. Owen Harris    1   \n",
      "2            Cumings, Mrs. John Bradley (Florence Briggs Th...    0   \n",
      "3                                       Heikkinen, Miss. Laina    0   \n",
      "4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)    0   \n",
      "5                                     Allen, Mr. William Henry    1   \n",
      "...                                                        ...  ...   \n",
      "887                                      Montvila, Rev. Juozas    1   \n",
      "888                               Graham, Miss. Margaret Edith    0   \n",
      "889                   Johnston, Miss. Catherine Helen \"Carrie\"    0   \n",
      "890                                      Behr, Mr. Karl Howell    1   \n",
      "891                                        Dooley, Mr. Patrick    1   \n",
      "\n",
      "                   Age  SibSp  Parch            Ticket     Fare  Cabin  \\\n",
      "PassengerId                                                              \n",
      "1            22.000000      1      0         A/5 21171   7.2500    147   \n",
      "2            38.000000      1      0          PC 17599  71.2833     81   \n",
      "3            26.000000      0      0  STON/O2. 3101282   7.9250    147   \n",
      "4            35.000000      1      0            113803  53.1000     55   \n",
      "5            35.000000      0      0            373450   8.0500    147   \n",
      "...                ...    ...    ...               ...      ...    ...   \n",
      "887          27.000000      0      0            211536  13.0000    147   \n",
      "888          19.000000      0      0            112053  30.0000     30   \n",
      "889          29.699118      1      2        W./C. 6607  23.4500    147   \n",
      "890          26.000000      0      0            111369  30.0000     60   \n",
      "891          32.000000      0      0            370376   7.7500    147   \n",
      "\n",
      "             Embarked  \n",
      "PassengerId            \n",
      "1                   2  \n",
      "2                   0  \n",
      "3                   2  \n",
      "4                   2  \n",
      "5                   2  \n",
      "...               ...  \n",
      "887                 2  \n",
      "888                 2  \n",
      "889                 2  \n",
      "890                 0  \n",
      "891                 1  \n",
      "\n",
      "[891 rows x 11 columns]>\n"
     ]
    }
   ],
   "source": [
    "# Preprocess the data by converting non numerical features into numerical categorical features \n",
    "# and applying mean imputation to deal with NaN values\n",
    "\n",
    "for dataframe in data:\n",
    "    le = preprocessing.LabelEncoder()\n",
    "    dataframe[\"Sex\"] = le.fit_transform(list(dataframe[\"Sex\"]))\n",
    "    dataframe[\"Cabin\"] = le.fit_transform(list(dataframe[\"Cabin\"]))\n",
    "    dataframe[\"Embarked\"] = le.fit_transform(list(dataframe[\"Embarked\"]))\n",
    "    dataframe.fillna(dataframe.mean(), inplace=True)\n",
    "    \n",
    "print(df.head)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<bound method NDFrame.head of              Pclass  Sex        Age  SibSp  Parch     Fare  Cabin  Embarked\n",
      "PassengerId                                                                \n",
      "1                 3    1  22.000000      1      0   7.2500    147         2\n",
      "2                 1    0  38.000000      1      0  71.2833     81         0\n",
      "3                 3    0  26.000000      0      0   7.9250    147         2\n",
      "4                 1    0  35.000000      1      0  53.1000     55         2\n",
      "5                 3    1  35.000000      0      0   8.0500    147         2\n",
      "...             ...  ...        ...    ...    ...      ...    ...       ...\n",
      "887               2    1  27.000000      0      0  13.0000    147         2\n",
      "888               1    0  19.000000      0      0  30.0000     30         2\n",
      "889               3    0  29.699118      1      2  23.4500    147         2\n",
      "890               1    1  26.000000      0      0  30.0000     60         0\n",
      "891               3    1  32.000000      0      0   7.7500    147         1\n",
      "\n",
      "[891 rows x 8 columns]>\n"
     ]
    }
   ],
   "source": [
    "# Create our input matrix, label vector, and test input matrix\n",
    "\n",
    "X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)\n",
    "y = df['Survived']\n",
    "X_test = testdf.drop(['Name', 'Ticket'], axis=1)\n",
    "print(X.head)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize the data\n",
    "\n",
    "X=(X-X.mean())/X.std()\n",
    "X_test=(X_test-X_test.mean())/X_test.std()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 891 samples\n",
      "Epoch 1/20\n",
      "891/891 [==============================] - 1s 642us/sample - loss: 0.5211 - accuracy: 0.7778\n",
      "Epoch 2/20\n",
      "891/891 [==============================] - 0s 88us/sample - loss: 0.4308 - accuracy: 0.8103\n",
      "Epoch 3/20\n",
      "891/891 [==============================] - 0s 89us/sample - loss: 0.4153 - accuracy: 0.8182\n",
      "Epoch 4/20\n",
      "891/891 [==============================] - 0s 95us/sample - loss: 0.4099 - accuracy: 0.8316\n",
      "Epoch 5/20\n",
      "891/891 [==============================] - 0s 96us/sample - loss: 0.4052 - accuracy: 0.8249\n",
      "Epoch 6/20\n",
      "891/891 [==============================] - 0s 91us/sample - loss: 0.4029 - accuracy: 0.8272\n",
      "Epoch 7/20\n",
      "891/891 [==============================] - 0s 93us/sample - loss: 0.4014 - accuracy: 0.8339\n",
      "Epoch 8/20\n",
      "891/891 [==============================] - 0s 91us/sample - loss: 0.3934 - accuracy: 0.8384\n",
      "Epoch 9/20\n",
      "891/891 [==============================] - 0s 88us/sample - loss: 0.3905 - accuracy: 0.8339\n",
      "Epoch 10/20\n",
      "891/891 [==============================] - 0s 93us/sample - loss: 0.3905 - accuracy: 0.8260\n",
      "Epoch 11/20\n",
      "891/891 [==============================] - 0s 95us/sample - loss: 0.3896 - accuracy: 0.8328\n",
      "Epoch 12/20\n",
      "891/891 [==============================] - 0s 92us/sample - loss: 0.3832 - accuracy: 0.8440\n",
      "Epoch 13/20\n",
      "891/891 [==============================] - 0s 114us/sample - loss: 0.3822 - accuracy: 0.8429\n",
      "Epoch 14/20\n",
      "891/891 [==============================] - 0s 228us/sample - loss: 0.3810 - accuracy: 0.8361\n",
      "Epoch 15/20\n",
      "891/891 [==============================] - 0s 180us/sample - loss: 0.3781 - accuracy: 0.8418\n",
      "Epoch 16/20\n",
      "891/891 [==============================] - 0s 153us/sample - loss: 0.3771 - accuracy: 0.8384\n",
      "Epoch 17/20\n",
      "891/891 [==============================] - 0s 178us/sample - loss: 0.3731 - accuracy: 0.8395\n",
      "Epoch 18/20\n",
      "891/891 [==============================] - 0s 189us/sample - loss: 0.3721 - accuracy: 0.8429\n",
      "Epoch 19/20\n",
      "891/891 [==============================] - 0s 166us/sample - loss: 0.3729 - accuracy: 0.8440\n",
      "Epoch 20/20\n",
      "891/891 [==============================] - 0s 158us/sample - loss: 0.3747 - accuracy: 0.8406\n",
      "Model: \"sequential_5\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "flatten_7 (Flatten)          multiple                  0         \n",
      "_________________________________________________________________\n",
      "dense_19 (Dense)             multiple                  9252      \n",
      "_________________________________________________________________\n",
      "dense_20 (Dense)             multiple                  2058      \n",
      "=================================================================\n",
      "Total params: 11,310\n",
      "Trainable params: 11,310\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Setup our model\n",
    "\n",
    "model = tf.keras.models.Sequential([\n",
    "    # Flatten out our input\n",
    "    tf.keras.layers.Flatten(),\n",
    "    \n",
    "    # Setup our hidden layer\n",
    "    tf.keras.layers.Dense(1028, activation=tf.nn.relu),\n",
    "    \n",
    "    # Setup output layer\n",
    "    tf.keras.layers.Dense(2, activation=tf.nn.softmax)\n",
    "])\n",
    "\n",
    "# Compile our model\n",
    "model.compile(optimizer='adam', \n",
    "              loss = 'sparse_categorical_crossentropy', \n",
    "             metrics=['accuracy'])\n",
    "\n",
    "# Fit model\n",
    "history = model.fit(\n",
    "    X.values,\n",
    "    y.values,\n",
    "    epochs=20\n",
    ")\n",
    "\n",
    "# Model summary\n",
    "print(model.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<bound method NDFrame.head of      PassengerId  Survived\n",
      "0            892         0\n",
      "1            893         1\n",
      "2            894         0\n",
      "3            895         0\n",
      "4            896         0\n",
      "..           ...       ...\n",
      "413         1305         0\n",
      "414         1306         1\n",
      "415         1307         0\n",
      "416         1308         0\n",
      "417         1309         0\n",
      "\n",
      "[418 rows x 2 columns]>\n"
     ]
    }
   ],
   "source": [
    "test_pred = model.predict(X_test.values) # Note that we need to feed our model the values or our dataframe X_test\n",
    "predictions = np.c_[PassengerId, np.argmax(test_pred, axis=1)] # Note that we take the argmax over the collumns to use our softmax output\n",
    "submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n",
    "print(submission.head)\n",
    "submission.to_csv(\"submissions/NNSubmission.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using this model, we get a score of 0.79425"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}