MLProjects/kaggle_titanic/SVM.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import sklearn\n",
    "from sklearn import metrics, preprocessing\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.preprocessing import normalize\n",
    "import pickle\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dataframes from our csv files and set indeces\n",
    "\n",
    "df = pd.read_csv('data/train.csv')\n",
    "df.set_index('PassengerId', inplace=True)\n",
    "\n",
    "testdf = pd.read_csv('data/test.csv')\n",
    "PassengerId = testdf['PassengerId']\n",
    "testdf.set_index('PassengerId', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocess the data by converting non numerical features into numerical categorical features \n",
    "# and applying mean imputation to deal with NaN values\n",
    "\n",
    "le = preprocessing.LabelEncoder()\n",
    "df[\"Sex\"] = le.fit_transform(list(df[\"Sex\"]))\n",
    "df[\"Cabin\"] = le.fit_transform(list(df[\"Cabin\"]))\n",
    "df[\"Embarked\"] = le.fit_transform(list(df[\"Embarked\"]))\n",
    "df.fillna(df.mean(), inplace=True)\n",
    "\n",
    "testdf[\"Sex\"] = le.fit_transform(list(testdf[\"Sex\"]))\n",
    "testdf[\"Cabin\"] = le.fit_transform(list(testdf[\"Cabin\"]))\n",
    "testdf[\"Embarked\"] = le.fit_transform(list(testdf[\"Embarked\"]))\n",
    "testdf.fillna(testdf.mean(), inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<bound method NDFrame.head of              Pclass  Sex        Age  SibSp  Parch     Fare  Cabin  Embarked\n",
      "PassengerId                                                                \n",
      "1                 3    1  22.000000      1      0   7.2500    147         2\n",
      "2                 1    0  38.000000      1      0  71.2833     81         0\n",
      "3                 3    0  26.000000      0      0   7.9250    147         2\n",
      "4                 1    0  35.000000      1      0  53.1000     55         2\n",
      "5                 3    1  35.000000      0      0   8.0500    147         2\n",
      "...             ...  ...        ...    ...    ...      ...    ...       ...\n",
      "887               2    1  27.000000      0      0  13.0000    147         2\n",
      "888               1    0  19.000000      0      0  30.0000     30         2\n",
      "889               3    0  29.699118      1      2  23.4500    147         2\n",
      "890               1    1  26.000000      0      0  30.0000     60         0\n",
      "891               3    1  32.000000      0      0   7.7500    147         1\n",
      "\n",
      "[891 rows x 8 columns]>\n"
     ]
    }
   ],
   "source": [
    "# Create our input matrix, label vector, and test input matrix\n",
    "\n",
    "X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)\n",
    "y = df['Survived']\n",
    "X_test = testdf.drop(['Name', 'Ticket'], axis=1)\n",
    "print(X.head)\n",
    "\n",
    "X_test = testdf.drop(['Name', 'Ticket'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize our data\n",
    "\n",
    "X=(X-X.mean())/X.std()\n",
    "X_test=(X_test-X_test.mean())/X_test.std()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8383838383838383\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n",
      "  \"avoid this warning.\", FutureWarning)\n"
     ]
    }
   ],
   "source": [
    "# Setup basic SVM for classification\n",
    "clfRAW = SVC().fit(X, y)\n",
    "print(clfRAW.score(X,y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create submission matrix and save to csv file\n",
    "\n",
    "predictions = np.c_[PassengerId, clfRAW.predict(X_test)]\n",
    "submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n",
    "submission['PassengerId'] = PassengerId\n",
    "submission['Survived'] = clfRAW.predict(X_test)\n",
    "submission.to_csv(\"submissions/SVMSubmissionRAW.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In submitting these predictions, though we got high training accuracy, we achieve a submission score of only 0.378. This reveals a massive issue of variance. That is, we have severely overfit this training set. We now move on to a more nuanced approach of splitting our data into train and dev sets to properly tune our parameters and avoid overfitting."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
      "             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
      "                           decision_function_shape='ovr', degree=3,\n",
      "                           gamma='scale', kernel='rbf', max_iter=-1,\n",
      "                           probability=False, random_state=None, shrinking=True,\n",
      "                           tol=0.001, verbose=False),\n",
      "             iid='warn', n_jobs=None,\n",
      "             param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],\n",
      "                          'kernel': ['rbf']},\n",
      "                         {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],\n",
      "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
      "             scoring=None, verbose=0)\n"
     ]
    }
   ],
   "source": [
    "# Split our labeled data into train and dev sets\n",
    "\n",
    "X_train, X_dev, y_train, y_dev = sklearn.model_selection.train_test_split(\n",
    "    X,y,test_size=0.2)\n",
    "\n",
    "# Set the parameters by cross-validation\n",
    "parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],\n",
    "                     'C': [1, 10, 100, 1000]},\n",
    "                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]\n",
    "\n",
    "svc = SVC(gamma=\"scale\")\n",
    "clf = GridSearchCV(svc, parameters, cv=5)\n",
    "print(clf)\n",
    "clf = clf.fit(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8033707865168539\n",
      "0.8603351955307262\n"
     ]
    }
   ],
   "source": [
    "print(clf.score(X_train, y_train))\n",
    "print(clf.score(X_dev, y_dev))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save our tuned model\n",
    "\n",
    "with open('models/SVM.model','wb') as f:\n",
    "            pickle.dump(clf,f)\n",
    "\n",
    "# Create submission matrix and save to csv file\n",
    "\n",
    "predictions = np.c_[PassengerId, clf.predict(X_test)]\n",
    "submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n",
    "submission['PassengerId'] = PassengerId\n",
    "submission['Survived'] = clf.predict(X_test)\n",
    "submission.to_csv(\"submissions/SVMSubmission.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With this tuning process we achieve a submission score of 0.77511"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "TODO\n",
    "Create some features such as fair per person and age * class or convert cabin number to deck category (A, B, C, ...)\n",
    "\"\"\""
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}