MLProjects/kaggle_titanic/.ipynb_checkpoints/NeuralNetwork-Copy1-checkpo...

{
"cells": [
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"import sklearn\n",
"import sklearn.model_selection\n",
"from sklearn import metrics, preprocessing\n",
"import pickle\n",
"import math\n",
"import tensorflow as tf\n",
"from os import path, getcwd, chdir"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"# Create dataframes from our csv files and set indeces\n",
"\n",
"df = pd.read_csv('data/train.csv')\n",
"df.set_index('PassengerId', inplace=True)\n",
"\n",
"testdf = pd.read_csv('data/test.csv')\n",
"PassengerId = testdf['PassengerId']\n",
"testdf.set_index('PassengerId', inplace=True)\n",
"\n",
"data = [df, testdf]"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of Survived Pclass \\\n",
"PassengerId \n",
"1 0 3 \n",
"2 1 1 \n",
"3 1 3 \n",
"4 1 1 \n",
"5 0 3 \n",
"... ... ... \n",
"887 0 2 \n",
"888 1 1 \n",
"889 0 3 \n",
"890 1 1 \n",
"891 0 3 \n",
"\n",
" Name Sex \\\n",
"PassengerId \n",
"1 Braund, Mr. Owen Harris 1 \n",
"2 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 \n",
"3 Heikkinen, Miss. Laina 0 \n",
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 \n",
"5 Allen, Mr. William Henry 1 \n",
"... ... ... \n",
"887 Montvila, Rev. Juozas 1 \n",
"888 Graham, Miss. Margaret Edith 0 \n",
"889 Johnston, Miss. Catherine Helen \"Carrie\" 0 \n",
"890 Behr, Mr. Karl Howell 1 \n",
"891 Dooley, Mr. Patrick 1 \n",
"\n",
" Age SibSp Parch Ticket Fare Cabin \\\n",
"PassengerId \n",
"1 22.000000 1 0 A/5 21171 7.2500 147 \n",
"2 38.000000 1 0 PC 17599 71.2833 81 \n",
"3 26.000000 0 0 STON/O2. 3101282 7.9250 147 \n",
"4 35.000000 1 0 113803 53.1000 55 \n",
"5 35.000000 0 0 373450 8.0500 147 \n",
"... ... ... ... ... ... ... \n",
"887 27.000000 0 0 211536 13.0000 147 \n",
"888 19.000000 0 0 112053 30.0000 30 \n",
"889 29.699118 1 2 W./C. 6607 23.4500 147 \n",
"890 26.000000 0 0 111369 30.0000 60 \n",
"891 32.000000 0 0 370376 7.7500 147 \n",
"\n",
" Embarked \n",
"PassengerId \n",
"1 2 \n",
"2 0 \n",
"3 2 \n",
"4 2 \n",
"5 2 \n",
"... ... \n",
"887 2 \n",
"888 2 \n",
"889 2 \n",
"890 0 \n",
"891 1 \n",
"\n",
"[891 rows x 11 columns]>\n"
]
}
],
"source": [
"# Preprocess the data by converting non numerical features into numerical categorical features \n",
"# and applying mean imputation to deal with NaN values\n",
"\n",
"for dataframe in data:\n",
" le = preprocessing.LabelEncoder()\n",
" dataframe[\"Sex\"] = le.fit_transform(list(dataframe[\"Sex\"]))\n",
" dataframe[\"Cabin\"] = le.fit_transform(list(dataframe[\"Cabin\"]))\n",
" dataframe[\"Embarked\"] = le.fit_transform(list(dataframe[\"Embarked\"]))\n",
" dataframe.fillna(dataframe.mean(), inplace=True)\n",
" \n",
"print(df.head)"
]
},
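{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch, not part of the original pipeline: fitting a separate LabelEncoder on\n",
"# the train and test frames (as above) can assign different integer codes to the same\n",
"# category. One way to keep the codes consistent is to fit each encoder on the combined\n",
"# column first. consistent_encoders is a hypothetical helper introduced for illustration.\n",
"\n",
"def consistent_encoders(train, test, columns):\n",
"    encoders = {}\n",
"    for col in columns:\n",
"        le = preprocessing.LabelEncoder()\n",
"        combined = pd.concat([train[col], test[col]]).astype(str)\n",
"        le.fit(combined)\n",
"        encoders[col] = le\n",
"    return encoders\n",
"\n",
"# Example usage (commented out because the columns above are already encoded in place):\n",
"# encoders = consistent_encoders(df, testdf, ['Sex', 'Embarked'])\n",
"# df['Sex'] = encoders['Sex'].transform(df['Sex'].astype(str))"
]
},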
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of Pclass Sex Age SibSp Parch Fare Cabin Embarked\n",
"PassengerId \n",
"1 3 1 22.000000 1 0 7.2500 147 2\n",
"2 1 0 38.000000 1 0 71.2833 81 0\n",
"3 3 0 26.000000 0 0 7.9250 147 2\n",
"4 1 0 35.000000 1 0 53.1000 55 2\n",
"5 3 1 35.000000 0 0 8.0500 147 2\n",
"... ... ... ... ... ... ... ... ...\n",
"887 2 1 27.000000 0 0 13.0000 147 2\n",
"888 1 0 19.000000 0 0 30.0000 30 2\n",
"889 3 0 29.699118 1 2 23.4500 147 2\n",
"890 1 1 26.000000 0 0 30.0000 60 0\n",
"891 3 1 32.000000 0 0 7.7500 147 1\n",
"\n",
"[891 rows x 8 columns]>\n"
]
}
],
"source": [
"# Create our input matrix, label vector, and test input matrix\n",
"\n",
"X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)\n",
"y = df['Survived']\n",
"X_test = testdf.drop(['Name', 'Ticket'], axis=1)\n",
"print(X.head)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"# Normalize the data\n",
"\n",
"X=(X-X.mean())/X.std()\n",
"X_test=(X_test-X_test.mean())/X_test.std()"
]
},
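{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Note (sketch only): the cell above standardizes the test set with its own mean and std.\n",
"# Common practice is to reuse the training-set statistics for both frames, roughly:\n",
"#\n",
"#     train_mean, train_std = X.mean(), X.std()\n",
"#     X = (X - train_mean) / train_std\n",
"#     X_test = (X_test - train_mean) / train_std\n",
"#\n",
"# Left as comments so the original normalization above is unchanged."
]
},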
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"# Setup our data to be fed into our model\n",
"\n",
"dataset = tf.data.Dataset.from_tensor_slices((X.values, y.values))"
]
},
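{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check (sketch): pull one batch from the pipeline to confirm each batch\n",
"# holds a (batch_size, 8) feature tensor and a matching label tensor.\n",
"for features, label in train_dataset.take(1):\n",
"    print(features.shape, label.shape)"
]
},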
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:Layer sequential_11 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2. The layer has dtype float32 because it's dtype defaults to floatx.\n",
"\n",
"If you intended to run this layer in float32, you can safely ignore this warning. If in doubt, this warning is likely only an issue if you are porting a TensorFlow 1.X model to TensorFlow 2.\n",
"\n",
"To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.\n",
"\n",
"Epoch 1/10\n",
"891/891 [==============================] - 5s 5ms/step - loss: 0.4820 - accuracy: 0.7778\n",
"Epoch 2/10\n",
"891/891 [==============================] - 2s 2ms/step - loss: 0.4380 - accuracy: 0.8148\n",
"Epoch 3/10\n",
"891/891 [==============================] - 2s 2ms/step - loss: 0.4248 - accuracy: 0.8148\n",
"Epoch 4/10\n",
"891/891 [==============================] - 2s 2ms/step - loss: 0.4261 - accuracy: 0.8148\n",
"Epoch 5/10\n",
"891/891 [==============================] - 2s 2ms/step - loss: 0.4186 - accuracy: 0.8283\n",
"Epoch 6/10\n",
"891/891 [==============================] - 2s 2ms/step - loss: 0.4074 - accuracy: 0.8182\n",
"Epoch 7/10\n",
"891/891 [==============================] - 2s 2ms/step - loss: 0.4068 - accuracy: 0.8238\n",
"Epoch 8/10\n",
"891/891 [==============================] - 2s 2ms/step - loss: 0.4059 - accuracy: 0.8249\n",
"Epoch 9/10\n",
"891/891 [==============================] - 2s 2ms/step - loss: 0.3979 - accuracy: 0.8272\n",
"Epoch 10/10\n",
"891/891 [==============================] - 3s 4ms/step - loss: 0.3924 - accuracy: 0.8272\n"
]
}
],
"source": [
"# Setup our model\n",
"\n",
"model = tf.keras.models.Sequential([\n",
" # Flatten out our input\n",
" tf.keras.layers.Flatten(),\n",
" \n",
" # Setup our first layer\n",
" tf.keras.layers.Dense(1024, activation=tf.nn.relu),\n",
" \n",
" # Setup output layer\n",
" tf.keras.layers.Dense(2, activation=tf.nn.softmax)\n",
"])\n",
"\n",
"# Compile our model\n",
"model.compile(optimizer='adam', \n",
" loss = 'sparse_categorical_crossentropy', \n",
" metrics=['accuracy'])\n",
"\n",
"# Fit model\n",
"history = model.fit(\n",
" train_dataset, \n",
" epochs=10\n",
")"
]
},
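{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (sketch): report the trained model's loss/accuracy on the batched training\n",
"# data. This is training accuracy, not an estimate of leaderboard performance; a hold-out\n",
"# split made before training, e.g. with sklearn.model_selection.train_test_split, would\n",
"# give a less optimistic estimate.\n",
"train_loss, train_acc = model.evaluate(train_dataset, verbose=0)\n",
"print('train loss:', train_loss, 'train accuracy:', train_acc)"
]
},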
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of PassengerId Survived\n",
"0 892 0\n",
"1 893 0\n",
"2 894 0\n",
"3 895 0\n",
"4 896 0\n",
".. ... ...\n",
"413 1305 0\n",
"414 1306 1\n",
"415 1307 0\n",
"416 1308 0\n",
"417 1309 0\n",
"\n",
"[418 rows x 2 columns]>\n"
]
}
],
"source": [
"test_pred = model.predict(X_test.values) # Note that we need to feed our model the values or our dataframe X_test\n",
"predictions = np.c_[PassengerId, np.argmax(test_pred, axis=1)]\n",
"submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n",
"print(submission.head)\n",
"submission.to_csv(\"submissions/NNSubmission.csv\", index=False)"
]
},
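{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick check (sketch): the predicted class balance should be roughly comparable to the\n",
"# survival rate in the training labels (about 38%); a wildly different split usually\n",
"# signals a preprocessing mismatch between train and test.\n",
"print(submission['Survived'].value_counts(normalize=True))\n",
"print(y.value_counts(normalize=True))"
]
},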
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}