MLProjects/kaggle_titanic/.ipynb_checkpoints/NeuralNetwork-checkpoint.ipynb

339 lines
13 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"import sklearn\n",
"import sklearn.model_selection\n",
"from sklearn import metrics, preprocessing\n",
"import pickle\n",
"import math\n",
"import tensorflow as tf"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the train and test CSV files and index both frames by PassengerId\n",
"\n",
"df = pd.read_csv('data/train.csv').set_index('PassengerId')\n",
"\n",
"# Grab the test ids as a Series before the column is moved into the index;\n",
"# the submission cell pairs them with the predictions.\n",
"testdf = pd.read_csv('data/test.csv')\n",
"PassengerId = testdf['PassengerId']\n",
"testdf = testdf.set_index('PassengerId')\n",
"\n",
"# Both frames are preprocessed together in the next cell\n",
"data = [df, testdf]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of Survived Pclass \\\n",
"PassengerId \n",
"1 0 3 \n",
"2 1 1 \n",
"3 1 3 \n",
"4 1 1 \n",
"5 0 3 \n",
"... ... ... \n",
"887 0 2 \n",
"888 1 1 \n",
"889 0 3 \n",
"890 1 1 \n",
"891 0 3 \n",
"\n",
" Name Sex \\\n",
"PassengerId \n",
"1 Braund, Mr. Owen Harris 1 \n",
"2 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 \n",
"3 Heikkinen, Miss. Laina 0 \n",
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 \n",
"5 Allen, Mr. William Henry 1 \n",
"... ... ... \n",
"887 Montvila, Rev. Juozas 1 \n",
"888 Graham, Miss. Margaret Edith 0 \n",
"889 Johnston, Miss. Catherine Helen \"Carrie\" 0 \n",
"890 Behr, Mr. Karl Howell 1 \n",
"891 Dooley, Mr. Patrick 1 \n",
"\n",
" Age SibSp Parch Ticket Fare Cabin \\\n",
"PassengerId \n",
"1 22.000000 1 0 A/5 21171 7.2500 147 \n",
"2 38.000000 1 0 PC 17599 71.2833 81 \n",
"3 26.000000 0 0 STON/O2. 3101282 7.9250 147 \n",
"4 35.000000 1 0 113803 53.1000 55 \n",
"5 35.000000 0 0 373450 8.0500 147 \n",
"... ... ... ... ... ... ... \n",
"887 27.000000 0 0 211536 13.0000 147 \n",
"888 19.000000 0 0 112053 30.0000 30 \n",
"889 29.699118 1 2 W./C. 6607 23.4500 147 \n",
"890 26.000000 0 0 111369 30.0000 60 \n",
"891 32.000000 0 0 370376 7.7500 147 \n",
"\n",
" Embarked \n",
"PassengerId \n",
"1 2 \n",
"2 0 \n",
"3 2 \n",
"4 2 \n",
"5 2 \n",
"... ... \n",
"887 2 \n",
"888 2 \n",
"889 2 \n",
"890 0 \n",
"891 1 \n",
"\n",
"[891 rows x 11 columns]>\n"
]
}
],
"source": [
"# Preprocess the data by converting non numerical features into numerical categorical features\n",
"# and applying mean imputation to deal with NaN values\n",
"\n",
"CATEGORICAL_COLUMNS = ['Sex', 'Cabin', 'Embarked']\n",
"MISSING = 'Missing'  # missing categorical values are kept as their own category\n",
"\n",
"# Column means of the training frame; numeric_only skips the text columns\n",
"# (Name, Ticket), which have no mean.\n",
"train_means = df.mean(numeric_only=True)\n",
"\n",
"for column in CATEGORICAL_COLUMNS:\n",
"    # Fit ONE encoder on the values of both frames so that a category is mapped\n",
"    # to the same integer code in the training and the test data.\n",
"    le = preprocessing.LabelEncoder()\n",
"    le.fit(pd.concat([dataframe[column] for dataframe in data]).fillna(MISSING))\n",
"    for dataframe in data:\n",
"        dataframe[column] = le.transform(dataframe[column].fillna(MISSING))\n",
"\n",
"for dataframe in data:\n",
"    # Fill the remaining NaN values in both frames from the training means;\n",
"    # entries of train_means without a matching column (Survived in the test frame) are ignored.\n",
"    dataframe.fillna(train_means, inplace=True)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of Pclass Sex Age SibSp Parch Fare Cabin Embarked\n",
"PassengerId \n",
"1 3 1 22.000000 1 0 7.2500 147 2\n",
"2 1 0 38.000000 1 0 71.2833 81 0\n",
"3 3 0 26.000000 0 0 7.9250 147 2\n",
"4 1 0 35.000000 1 0 53.1000 55 2\n",
"5 3 1 35.000000 0 0 8.0500 147 2\n",
"... ... ... ... ... ... ... ... ...\n",
"887 2 1 27.000000 0 0 13.0000 147 2\n",
"888 1 0 19.000000 0 0 30.0000 30 2\n",
"889 3 0 29.699118 1 2 23.4500 147 2\n",
"890 1 1 26.000000 0 0 30.0000 60 0\n",
"891 3 1 32.000000 0 0 7.7500 147 1\n",
"\n",
"[891 rows x 8 columns]>\n"
]
}
],
"source": [
"# Create our input matrix, label vector, and test input matrix\n",
"\n",
"TEXT_COLUMNS = ['Name', 'Ticket']  # free-text columns that are not used as features\n",
"\n",
"X = df.drop(columns=TEXT_COLUMNS + ['Survived'])\n",
"y = df['Survived']\n",
"X_test = testdf.drop(columns=TEXT_COLUMNS)\n",
"\n",
"# The model is fed the raw .values arrays, so the feature columns must line up by position\n",
"assert list(X.columns) == list(X_test.columns), 'train/test feature columns differ'\n",
"\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Normalize the data (z-score per column).\n",
"# Both matrices are scaled with the TRAINING mean and standard deviation so that\n",
"# a given raw value is mapped to the same model input in train and test.\n",
"\n",
"train_mean = X.mean()\n",
"train_std = X.std()\n",
"\n",
"X = (X - train_mean) / train_std\n",
"X_test = (X_test - train_mean) / train_std"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train on 891 samples\n",
"Epoch 1/20\n",
"891/891 [==============================] - 1s 642us/sample - loss: 0.5211 - accuracy: 0.7778\n",
"Epoch 2/20\n",
"891/891 [==============================] - 0s 88us/sample - loss: 0.4308 - accuracy: 0.8103\n",
"Epoch 3/20\n",
"891/891 [==============================] - 0s 89us/sample - loss: 0.4153 - accuracy: 0.8182\n",
"Epoch 4/20\n",
"891/891 [==============================] - 0s 95us/sample - loss: 0.4099 - accuracy: 0.8316\n",
"Epoch 5/20\n",
"891/891 [==============================] - 0s 96us/sample - loss: 0.4052 - accuracy: 0.8249\n",
"Epoch 6/20\n",
"891/891 [==============================] - 0s 91us/sample - loss: 0.4029 - accuracy: 0.8272\n",
"Epoch 7/20\n",
"891/891 [==============================] - 0s 93us/sample - loss: 0.4014 - accuracy: 0.8339\n",
"Epoch 8/20\n",
"891/891 [==============================] - 0s 91us/sample - loss: 0.3934 - accuracy: 0.8384\n",
"Epoch 9/20\n",
"891/891 [==============================] - 0s 88us/sample - loss: 0.3905 - accuracy: 0.8339\n",
"Epoch 10/20\n",
"891/891 [==============================] - 0s 93us/sample - loss: 0.3905 - accuracy: 0.8260\n",
"Epoch 11/20\n",
"891/891 [==============================] - 0s 95us/sample - loss: 0.3896 - accuracy: 0.8328\n",
"Epoch 12/20\n",
"891/891 [==============================] - 0s 92us/sample - loss: 0.3832 - accuracy: 0.8440\n",
"Epoch 13/20\n",
"891/891 [==============================] - 0s 114us/sample - loss: 0.3822 - accuracy: 0.8429\n",
"Epoch 14/20\n",
"891/891 [==============================] - 0s 228us/sample - loss: 0.3810 - accuracy: 0.8361\n",
"Epoch 15/20\n",
"891/891 [==============================] - 0s 180us/sample - loss: 0.3781 - accuracy: 0.8418\n",
"Epoch 16/20\n",
"891/891 [==============================] - 0s 153us/sample - loss: 0.3771 - accuracy: 0.8384\n",
"Epoch 17/20\n",
"891/891 [==============================] - 0s 178us/sample - loss: 0.3731 - accuracy: 0.8395\n",
"Epoch 18/20\n",
"891/891 [==============================] - 0s 189us/sample - loss: 0.3721 - accuracy: 0.8429\n",
"Epoch 19/20\n",
"891/891 [==============================] - 0s 166us/sample - loss: 0.3729 - accuracy: 0.8440\n",
"Epoch 20/20\n",
"891/891 [==============================] - 0s 158us/sample - loss: 0.3747 - accuracy: 0.8406\n",
"Model: \"sequential_5\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"flatten_7 (Flatten) multiple 0 \n",
"_________________________________________________________________\n",
"dense_19 (Dense) multiple 9252 \n",
"_________________________________________________________________\n",
"dense_20 (Dense) multiple 2058 \n",
"=================================================================\n",
"Total params: 11,310\n",
"Trainable params: 11,310\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n",
"None\n"
]
}
],
"source": [
"# Setup our model\n",
"\n",
"# Seed NumPy and TensorFlow so that repeated runs of this cell are comparable\n",
"SEED = 42\n",
"np.random.seed(SEED)\n",
"tf.random.set_seed(SEED)\n",
"\n",
"model = tf.keras.models.Sequential([\n",
"    # Flatten out our input\n",
"    tf.keras.layers.Flatten(),\n",
"\n",
"    # Setup our hidden layer\n",
"    tf.keras.layers.Dense(1028, activation=tf.nn.relu),\n",
"\n",
"    # Setup output layer: two softmax units, one per value of the Survived label\n",
"    tf.keras.layers.Dense(2, activation=tf.nn.softmax)\n",
"])\n",
"\n",
"# Compile our model\n",
"# The sparse loss is used because y holds integer class labels rather than one-hot vectors\n",
"model.compile(optimizer='adam',\n",
"              loss='sparse_categorical_crossentropy',\n",
"              metrics=['accuracy'])\n",
"\n",
"# Fit model\n",
"history = model.fit(\n",
"    X.values,\n",
"    y.values,\n",
"    epochs=20\n",
")\n",
"\n",
"# Model summary: summary() prints the table itself and returns None,\n",
"# so wrapping it in print() only adds a stray 'None' line\n",
"model.summary()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of PassengerId Survived\n",
"0 892 0\n",
"1 893 1\n",
"2 894 0\n",
"3 895 0\n",
"4 896 0\n",
".. ... ...\n",
"413 1305 0\n",
"414 1306 1\n",
"415 1307 0\n",
"416 1308 0\n",
"417 1309 0\n",
"\n",
"[418 rows x 2 columns]>\n"
]
}
],
"source": [
"# Feed the model the underlying values array of the X_test dataframe\n",
"test_pred = model.predict(X_test.values)\n",
"\n",
"# Take the argmax over the columns of the softmax output to get one class label per passenger\n",
"predictions = np.c_[PassengerId, np.argmax(test_pred, axis=1)]\n",
"submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n",
"\n",
"# to_csv does not create missing directories, so make sure the target folder exists\n",
"os.makedirs(\"submissions\", exist_ok=True)\n",
"submission.to_csv(\"submissions/NNSubmission.csv\", index=False)\n",
"\n",
"submission.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Using this model, we get a score of 0.79425"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}