{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Survival classifier: a small dense neural network\n",
    "\n",
    "Trains a Keras model on `data/train.csv` to predict the `Survived` column, then writes predictions for `data/test.csv` to `submissions/NNSubmission.csv`.\n",
    "\n",
    "Pipeline: load, encode and impute, standardize, train, predict. Category codes are fitted once on both frames together, and imputation means and scaling statistics are taken from the training frame and reused for the test frame, so the model sees test features encoded and scaled the same way as its training data.\n",
    "\n",
    "*Note: the column names look like the Kaggle Titanic dataset; confirm against the data source.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "import os\n",
    "import pickle\n",
    "from os import path, getcwd, chdir\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn\n",
    "import sklearn.model_selection\n",
    "import tensorflow as tf\n",
    "from sklearn import metrics, preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings a reader might want to tune, kept in one place\n",
    "SEED = 42\n",
    "BATCH_SIZE = 32\n",
    "EPOCHS = 10\n",
    "CATEGORICAL_COLS = ['Sex', 'Cabin', 'Embarked']\n",
    "MISSING_LABEL = 'Missing'  # stand-in category for NaN in categorical columns\n",
    "\n",
    "# Seed the NumPy and TensorFlow random generators\n",
    "np.random.seed(SEED)\n",
    "tf.random.set_seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dataframes from our csv files and set indices\n",
    "\n",
    "train_raw = pd.read_csv('data/train.csv').set_index('PassengerId')\n",
    "\n",
    "test_raw = pd.read_csv('data/test.csv')\n",
    "# Keep the ids as a column-like Series; they are needed again for the submission file\n",
    "PassengerId = test_raw['PassengerId']\n",
    "test_raw = test_raw.set_index('PassengerId')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Encode categorical columns and impute missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocess the data by converting non-numerical features into integer category codes\n",
    "# and applying mean imputation to deal with NaN values\n",
    "\n",
    "# Work on copies so this cell can be re-run without re-encoding already-encoded columns\n",
    "df = train_raw.copy()\n",
    "testdf = test_raw.copy()\n",
    "\n",
    "for col in CATEGORICAL_COLS:\n",
    "    # NaN becomes its own category so the encoder only ever sees strings\n",
    "    train_labels = df[col].fillna(MISSING_LABEL).astype(str)\n",
    "    test_labels = testdf[col].fillna(MISSING_LABEL).astype(str)\n",
    "\n",
    "    # Fit on the union of both frames: a category must map to the same integer\n",
    "    # in train and test, and transform() rejects labels it was not fitted on\n",
    "    le = preprocessing.LabelEncoder()\n",
    "    le.fit(pd.concat([train_labels, test_labels]))\n",
    "    df[col] = le.transform(train_labels)\n",
    "    testdf[col] = le.transform(test_labels)\n",
    "\n",
    "# Mean imputation for the remaining NaNs, using training-set means for both frames.\n",
    "# numeric_only=True skips any non-numeric columns, which have no mean\n",
    "train_means = df.mean(numeric_only=True)\n",
    "df = df.fillna(train_means)\n",
    "testdf = testdf.fillna(train_means)\n",
    "\n",
    "data = [df, testdf]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build feature matrices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create our input matrix, label vector, and test input matrix\n",
    "\n",
    "X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)\n",
    "y = df['Survived']\n",
    "X_test = testdf.drop(['Name', 'Ticket'], axis=1)\n",
    "\n",
    "# The model is fed positional arrays, so both frames must list the same features in the same order\n",
    "assert list(X.columns) == list(X_test.columns), 'train/test feature columns differ'\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Standardize features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize the data\n",
    "\n",
    "# Statistics come from the training features only and are reused for the test features,\n",
    "# so both end up on the scale the model is trained on\n",
    "feature_mean = X.mean()\n",
    "# A constant column has std 0 and would divide to NaN; dividing by 1 leaves it at 0 instead\n",
    "feature_std = X.std().replace(0, 1)\n",
    "\n",
    "X = (X - feature_mean) / feature_std\n",
    "X_test = (X_test - feature_mean) / feature_std\n",
    "\n",
    "assert not X.isna().any().any(), 'NaN left in training features'\n",
    "assert not X_test.isna().any().any(), 'NaN left in test features'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the training dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup our data to be fed into our model\n",
    "\n",
    "# Cast to float32 up front, the dtype Keras layers use by default\n",
    "dataset = tf.data.Dataset.from_tensor_slices((X.values.astype('float32'), y.values))\n",
    "\n",
    "# Shuffle over the whole training set, then group samples into batches for fit()\n",
    "train_dataset = dataset.shuffle(buffer_size=len(X), seed=SEED).batch(BATCH_SIZE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define and train the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup our model\n",
    "\n",
    "model = tf.keras.models.Sequential([\n",
    "    # Flatten out our input\n",
    "    tf.keras.layers.Flatten(),\n",
    "\n",
    "    # Setup our first layer\n",
    "    tf.keras.layers.Dense(1024, activation=tf.nn.relu),\n",
    "\n",
    "    # Setup output layer: one probability per class (0 or 1)\n",
    "    tf.keras.layers.Dense(2, activation=tf.nn.softmax)\n",
    "])\n",
    "\n",
    "# Compile our model; the sparse loss takes integer class labels directly\n",
    "model.compile(optimizer='adam',\n",
    "              loss='sparse_categorical_crossentropy',\n",
    "              metrics=['accuracy'])\n",
    "\n",
    "# Fit model\n",
    "history = model.fit(\n",
    "    train_dataset,\n",
    "    epochs=EPOCHS\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict and write the submission file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# predict() is fed the underlying array of X_test (.values), not the dataframe itself\n",
    "test_pred = model.predict(X_test.values.astype('float32'))\n",
    "\n",
    "# Pair each passenger id with the index of its highest-probability class\n",
    "predictions = np.c_[PassengerId, np.argmax(test_pred, axis=1)]\n",
    "submission = pd.DataFrame(predictions, columns=['PassengerId', 'Survived'])\n",
    "\n",
    "# Create the output directory first so to_csv does not fail on a fresh checkout\n",
    "os.makedirs('submissions', exist_ok=True)\n",
    "submission.to_csv('submissions/NNSubmission.csv', index=False)\n",
    "submission.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}