{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import os\n", "import sklearn\n", "from sklearn import metrics, preprocessing\n", "from sklearn.svm import SVC\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.preprocessing import normalize\n", "import pickle\n", "import math" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Create dataframes from our csv files and set indeces\n", "\n", "df = pd.read_csv('data/train.csv')\n", "df.set_index('PassengerId', inplace=True)\n", "\n", "testdf = pd.read_csv('data/test.csv')\n", "PassengerId = testdf['PassengerId']\n", "testdf.set_index('PassengerId', inplace=True)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Preprocess the data by converting non numerical features into numerical categorical features \n", "# and applying mean imputation to deal with NaN values\n", "\n", "le = preprocessing.LabelEncoder()\n", "df[\"Sex\"] = le.fit_transform(list(df[\"Sex\"]))\n", "df[\"Cabin\"] = le.fit_transform(list(df[\"Cabin\"]))\n", "df[\"Embarked\"] = le.fit_transform(list(df[\"Embarked\"]))\n", "df.fillna(df.mean(), inplace=True)\n", "\n", "testdf[\"Sex\"] = le.fit_transform(list(testdf[\"Sex\"]))\n", "testdf[\"Cabin\"] = le.fit_transform(list(testdf[\"Cabin\"]))\n", "testdf[\"Embarked\"] = le.fit_transform(list(testdf[\"Embarked\"]))\n", "testdf.fillna(testdf.mean(), inplace=True)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# Create our input matrix, label vector, and test input matrix\n", "\n", "X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)\n", "y = df['Survived']\n", "X_test = testdf.drop(['Name', 'Ticket'], axis=1)\n", "print(X.head)\n", "\n", "X_test = testdf.drop(['Name', 'Ticket'], axis=1)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Normalize our data\n", "\n", "X=(X-X.mean())/X.std()\n", "X_test=(X_test-X_test.mean())/X_test.std()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.8383838383838383\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", " \"avoid this warning.\", FutureWarning)\n" ] } ], "source": [ "# Setup basic SVM for classification\n", "clfRAW = SVC().fit(X, y)\n", "print(clfRAW.score(X,y))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Create submission matrix and save to csv file\n", "\n", "predictions = np.c_[PassengerId, clfRAW.predict(X_test)]\n", "submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n", "submission['PassengerId'] = PassengerId\n", "submission['Survived'] = clfRAW.predict(X_test)\n", "submission.to_csv(\"submissions/SVMSubmissionRAW.csv\", index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In submitting these predictions, though we got high training accuracy, we achieve a submission score of only 0.378. This reveals a massive issue of variance. That is, we have severely overfit this training set. We now move on to a more nuanced approach of splitting our data into train and dev sets to properly tune our parameters and avoid overfitting." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GridSearchCV(cv=5, error_score='raise-deprecating',\n", " estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", " decision_function_shape='ovr', degree=3,\n", " gamma='scale', kernel='rbf', max_iter=-1,\n", " probability=False, random_state=None, shrinking=True,\n", " tol=0.001, verbose=False),\n", " iid='warn', n_jobs=None,\n", " param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],\n", " 'kernel': ['rbf']},\n", " {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],\n", " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", " scoring=None, verbose=0)\n" ] } ], "source": [ "# Split our labeled data into train and dev sets\n", "\n", "X_train, X_dev, y_train, y_dev = sklearn.model_selection.train_test_split(\n", " X,y,test_size=0.2)\n", "\n", "# Set the parameters by cross-validation\n", "parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],\n", " 'C': [1, 10, 100, 1000]},\n", " {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]\n", "\n", "svc = SVC(gamma=\"scale\")\n", "clf = GridSearchCV(svc, parameters, cv=5)\n", "print(clf)\n", "clf = clf.fit(X_train, y_train)\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.8033707865168539\n", "0.8603351955307262\n" ] } ], "source": [ "print(clf.score(X_train, y_train))\n", "print(clf.score(X_dev, y_dev))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Save our tuned model\n", "\n", "with open('models/SVM.model','wb') as f:\n", " pickle.dump(clf,f)\n", "\n", "# Create submission matrix and save to csv file\n", "\n", "predictions = np.c_[PassengerId, clf.predict(X_test)]\n", "submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n", "submission['PassengerId'] = PassengerId\n", "submission['Survived'] = clf.predict(X_test)\n", "submission.to_csv(\"submissions/SVMSubmission.csv\", index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "With this tuning process we achieve a submission score of 0.77511" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "TODO\n", "Create some features such as fair per person and age * class or convert cabin number to deck category (A, B, C, ...)\n", "\"\"\"" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }