339 lines
13 KiB
Plaintext
339 lines
13 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import os\n",
|
|
"import sklearn\n",
|
|
"import sklearn.model_selection\n",
|
|
"from sklearn import metrics, preprocessing\n",
|
|
"import pickle\n",
|
|
"import math\n",
|
|
"import tensorflow as tf"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Create dataframes from our csv files and set indeces\n",
|
|
"\n",
|
|
"df = pd.read_csv('data/train.csv')\n",
|
|
"df.set_index('PassengerId', inplace=True)\n",
|
|
"\n",
|
|
"testdf = pd.read_csv('data/test.csv')\n",
|
|
"PassengerId = testdf['PassengerId']\n",
|
|
"testdf.set_index('PassengerId', inplace=True)\n",
|
|
"\n",
|
|
"data = [df, testdf]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<bound method NDFrame.head of Survived Pclass \\\n",
|
|
"PassengerId \n",
|
|
"1 0 3 \n",
|
|
"2 1 1 \n",
|
|
"3 1 3 \n",
|
|
"4 1 1 \n",
|
|
"5 0 3 \n",
|
|
"... ... ... \n",
|
|
"887 0 2 \n",
|
|
"888 1 1 \n",
|
|
"889 0 3 \n",
|
|
"890 1 1 \n",
|
|
"891 0 3 \n",
|
|
"\n",
|
|
" Name Sex \\\n",
|
|
"PassengerId \n",
|
|
"1 Braund, Mr. Owen Harris 1 \n",
|
|
"2 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 \n",
|
|
"3 Heikkinen, Miss. Laina 0 \n",
|
|
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 \n",
|
|
"5 Allen, Mr. William Henry 1 \n",
|
|
"... ... ... \n",
|
|
"887 Montvila, Rev. Juozas 1 \n",
|
|
"888 Graham, Miss. Margaret Edith 0 \n",
|
|
"889 Johnston, Miss. Catherine Helen \"Carrie\" 0 \n",
|
|
"890 Behr, Mr. Karl Howell 1 \n",
|
|
"891 Dooley, Mr. Patrick 1 \n",
|
|
"\n",
|
|
" Age SibSp Parch Ticket Fare Cabin \\\n",
|
|
"PassengerId \n",
|
|
"1 22.000000 1 0 A/5 21171 7.2500 147 \n",
|
|
"2 38.000000 1 0 PC 17599 71.2833 81 \n",
|
|
"3 26.000000 0 0 STON/O2. 3101282 7.9250 147 \n",
|
|
"4 35.000000 1 0 113803 53.1000 55 \n",
|
|
"5 35.000000 0 0 373450 8.0500 147 \n",
|
|
"... ... ... ... ... ... ... \n",
|
|
"887 27.000000 0 0 211536 13.0000 147 \n",
|
|
"888 19.000000 0 0 112053 30.0000 30 \n",
|
|
"889 29.699118 1 2 W./C. 6607 23.4500 147 \n",
|
|
"890 26.000000 0 0 111369 30.0000 60 \n",
|
|
"891 32.000000 0 0 370376 7.7500 147 \n",
|
|
"\n",
|
|
" Embarked \n",
|
|
"PassengerId \n",
|
|
"1 2 \n",
|
|
"2 0 \n",
|
|
"3 2 \n",
|
|
"4 2 \n",
|
|
"5 2 \n",
|
|
"... ... \n",
|
|
"887 2 \n",
|
|
"888 2 \n",
|
|
"889 2 \n",
|
|
"890 0 \n",
|
|
"891 1 \n",
|
|
"\n",
|
|
"[891 rows x 11 columns]>\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Preprocess the data by converting non numerical features into numerical categorical features \n",
|
|
"# and applying mean imputation to deal with NaN values\n",
|
|
"\n",
|
|
"for dataframe in data:\n",
|
|
" le = preprocessing.LabelEncoder()\n",
|
|
" dataframe[\"Sex\"] = le.fit_transform(list(dataframe[\"Sex\"]))\n",
|
|
" dataframe[\"Cabin\"] = le.fit_transform(list(dataframe[\"Cabin\"]))\n",
|
|
" dataframe[\"Embarked\"] = le.fit_transform(list(dataframe[\"Embarked\"]))\n",
|
|
" dataframe.fillna(dataframe.mean(), inplace=True)\n",
|
|
" \n",
|
|
"print(df.head)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<bound method NDFrame.head of Pclass Sex Age SibSp Parch Fare Cabin Embarked\n",
|
|
"PassengerId \n",
|
|
"1 3 1 22.000000 1 0 7.2500 147 2\n",
|
|
"2 1 0 38.000000 1 0 71.2833 81 0\n",
|
|
"3 3 0 26.000000 0 0 7.9250 147 2\n",
|
|
"4 1 0 35.000000 1 0 53.1000 55 2\n",
|
|
"5 3 1 35.000000 0 0 8.0500 147 2\n",
|
|
"... ... ... ... ... ... ... ... ...\n",
|
|
"887 2 1 27.000000 0 0 13.0000 147 2\n",
|
|
"888 1 0 19.000000 0 0 30.0000 30 2\n",
|
|
"889 3 0 29.699118 1 2 23.4500 147 2\n",
|
|
"890 1 1 26.000000 0 0 30.0000 60 0\n",
|
|
"891 3 1 32.000000 0 0 7.7500 147 1\n",
|
|
"\n",
|
|
"[891 rows x 8 columns]>\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Create our input matrix, label vector, and test input matrix\n",
|
|
"\n",
|
|
"X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)\n",
|
|
"y = df['Survived']\n",
|
|
"X_test = testdf.drop(['Name', 'Ticket'], axis=1)\n",
|
|
"print(X.head)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Normalize the data\n",
|
|
"\n",
|
|
"X=(X-X.mean())/X.std()\n",
|
|
"X_test=(X_test-X_test.mean())/X_test.std()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Train on 891 samples\n",
|
|
"Epoch 1/20\n",
|
|
"891/891 [==============================] - 1s 642us/sample - loss: 0.5211 - accuracy: 0.7778\n",
|
|
"Epoch 2/20\n",
|
|
"891/891 [==============================] - 0s 88us/sample - loss: 0.4308 - accuracy: 0.8103\n",
|
|
"Epoch 3/20\n",
|
|
"891/891 [==============================] - 0s 89us/sample - loss: 0.4153 - accuracy: 0.8182\n",
|
|
"Epoch 4/20\n",
|
|
"891/891 [==============================] - 0s 95us/sample - loss: 0.4099 - accuracy: 0.8316\n",
|
|
"Epoch 5/20\n",
|
|
"891/891 [==============================] - 0s 96us/sample - loss: 0.4052 - accuracy: 0.8249\n",
|
|
"Epoch 6/20\n",
|
|
"891/891 [==============================] - 0s 91us/sample - loss: 0.4029 - accuracy: 0.8272\n",
|
|
"Epoch 7/20\n",
|
|
"891/891 [==============================] - 0s 93us/sample - loss: 0.4014 - accuracy: 0.8339\n",
|
|
"Epoch 8/20\n",
|
|
"891/891 [==============================] - 0s 91us/sample - loss: 0.3934 - accuracy: 0.8384\n",
|
|
"Epoch 9/20\n",
|
|
"891/891 [==============================] - 0s 88us/sample - loss: 0.3905 - accuracy: 0.8339\n",
|
|
"Epoch 10/20\n",
|
|
"891/891 [==============================] - 0s 93us/sample - loss: 0.3905 - accuracy: 0.8260\n",
|
|
"Epoch 11/20\n",
|
|
"891/891 [==============================] - 0s 95us/sample - loss: 0.3896 - accuracy: 0.8328\n",
|
|
"Epoch 12/20\n",
|
|
"891/891 [==============================] - 0s 92us/sample - loss: 0.3832 - accuracy: 0.8440\n",
|
|
"Epoch 13/20\n",
|
|
"891/891 [==============================] - 0s 114us/sample - loss: 0.3822 - accuracy: 0.8429\n",
|
|
"Epoch 14/20\n",
|
|
"891/891 [==============================] - 0s 228us/sample - loss: 0.3810 - accuracy: 0.8361\n",
|
|
"Epoch 15/20\n",
|
|
"891/891 [==============================] - 0s 180us/sample - loss: 0.3781 - accuracy: 0.8418\n",
|
|
"Epoch 16/20\n",
|
|
"891/891 [==============================] - 0s 153us/sample - loss: 0.3771 - accuracy: 0.8384\n",
|
|
"Epoch 17/20\n",
|
|
"891/891 [==============================] - 0s 178us/sample - loss: 0.3731 - accuracy: 0.8395\n",
|
|
"Epoch 18/20\n",
|
|
"891/891 [==============================] - 0s 189us/sample - loss: 0.3721 - accuracy: 0.8429\n",
|
|
"Epoch 19/20\n",
|
|
"891/891 [==============================] - 0s 166us/sample - loss: 0.3729 - accuracy: 0.8440\n",
|
|
"Epoch 20/20\n",
|
|
"891/891 [==============================] - 0s 158us/sample - loss: 0.3747 - accuracy: 0.8406\n",
|
|
"Model: \"sequential_5\"\n",
|
|
"_________________________________________________________________\n",
|
|
"Layer (type) Output Shape Param # \n",
|
|
"=================================================================\n",
|
|
"flatten_7 (Flatten) multiple 0 \n",
|
|
"_________________________________________________________________\n",
|
|
"dense_19 (Dense) multiple 9252 \n",
|
|
"_________________________________________________________________\n",
|
|
"dense_20 (Dense) multiple 2058 \n",
|
|
"=================================================================\n",
|
|
"Total params: 11,310\n",
|
|
"Trainable params: 11,310\n",
|
|
"Non-trainable params: 0\n",
|
|
"_________________________________________________________________\n",
|
|
"None\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Setup our model\n",
|
|
"\n",
|
|
"model = tf.keras.models.Sequential([\n",
|
|
" # Flatten out our input\n",
|
|
" tf.keras.layers.Flatten(),\n",
|
|
" \n",
|
|
" # Setup our hidden layer\n",
|
|
" tf.keras.layers.Dense(1028, activation=tf.nn.relu),\n",
|
|
" \n",
|
|
" # Setup output layer\n",
|
|
" tf.keras.layers.Dense(2, activation=tf.nn.softmax)\n",
|
|
"])\n",
|
|
"\n",
|
|
"# Compile our model\n",
|
|
"model.compile(optimizer='adam', \n",
|
|
" loss = 'sparse_categorical_crossentropy', \n",
|
|
" metrics=['accuracy'])\n",
|
|
"\n",
|
|
"# Fit model\n",
|
|
"history = model.fit(\n",
|
|
" X.values,\n",
|
|
" y.values,\n",
|
|
" epochs=20\n",
|
|
")\n",
|
|
"\n",
|
|
"# Model summary\n",
|
|
"print(model.summary())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<bound method NDFrame.head of PassengerId Survived\n",
|
|
"0 892 0\n",
|
|
"1 893 1\n",
|
|
"2 894 0\n",
|
|
"3 895 0\n",
|
|
"4 896 0\n",
|
|
".. ... ...\n",
|
|
"413 1305 0\n",
|
|
"414 1306 1\n",
|
|
"415 1307 0\n",
|
|
"416 1308 0\n",
|
|
"417 1309 0\n",
|
|
"\n",
|
|
"[418 rows x 2 columns]>\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"test_pred = model.predict(X_test.values) # Note that we need to feed our model the values or our dataframe X_test\n",
|
|
"predictions = np.c_[PassengerId, np.argmax(test_pred, axis=1)] # Note that we take the argmax over the collumns to use our softmax output\n",
|
|
"submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n",
|
|
"print(submission.head)\n",
|
|
"submission.to_csv(\"submissions/NNSubmission.csv\", index=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Using this model, we get a score of 0.79425"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.4"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|