MLProjects/kaggle_titanic/.ipynb_checkpoints/Logistic_Regression-checkpo...

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import sklearn\n",
    "from sklearn import metrics, preprocessing\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "import pickle\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dataframes from our csv files and set indeces\n",
    "\n",
    "df = pd.read_csv('data/train.csv')\n",
    "df.set_index('PassengerId', inplace=True)\n",
    "\n",
    "testdf = pd.read_csv('data/test.csv')\n",
    "PassengerId = testdf['PassengerId']\n",
    "testdf.set_index('PassengerId', inplace=True)\n",
    "data = [df, testdf]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocess the data by converting non numerical features into numerical categorical features \n",
    "# and applying mean imputation to deal with NaN values\n",
    "\n",
    "for dataframe in data:\n",
    "    le = preprocessing.LabelEncoder()\n",
    "    dataframe[\"Sex\"] = le.fit_transform(list(dataframe[\"Sex\"]))\n",
    "    dataframe[\"Cabin\"] = le.fit_transform(list(dataframe[\"Cabin\"]))\n",
    "    dataframe[\"Embarked\"] = le.fit_transform(list(dataframe[\"Embarked\"]))\n",
    "    dataframe.fillna(dataframe.mean(), inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<bound method NDFrame.head of              Pclass  Sex        Age  SibSp  Parch     Fare  Cabin  Embarked\n",
      "PassengerId                                                                \n",
      "1                 3    1  22.000000      1      0   7.2500    147         2\n",
      "2                 1    0  38.000000      1      0  71.2833     81         0\n",
      "3                 3    0  26.000000      0      0   7.9250    147         2\n",
      "4                 1    0  35.000000      1      0  53.1000     55         2\n",
      "5                 3    1  35.000000      0      0   8.0500    147         2\n",
      "...             ...  ...        ...    ...    ...      ...    ...       ...\n",
      "887               2    1  27.000000      0      0  13.0000    147         2\n",
      "888               1    0  19.000000      0      0  30.0000     30         2\n",
      "889               3    0  29.699118      1      2  23.4500    147         2\n",
      "890               1    1  26.000000      0      0  30.0000     60         0\n",
      "891               3    1  32.000000      0      0   7.7500    147         1\n",
      "\n",
      "[891 rows x 8 columns]>\n"
     ]
    }
   ],
   "source": [
    "# Create our input matrix, label vector, and test input matrix\n",
    "\n",
    "X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)\n",
    "y = df['Survived']\n",
    "X_test = testdf.drop(['Name', 'Ticket'], axis=1)\n",
    "print(X.head)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize the data\n",
    "\n",
    "X=(X-X.mean())/X.std()\n",
    "X_test=(X_test-X_test.mean())/X_test.std()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7957351290684624"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create a classifier using logistic regression, opting for liblinear solver\n",
    "# because of how small our dataset is\n",
    "\n",
    "clfRAW = LogisticRegression(solver='liblinear', max_iter = 1000).fit(X, y)\n",
    "clfRAW.score(X,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create our predictions matrix and save to csv\n",
    "predictions = np.c_[PassengerId, clfRAW.predict(X_test)]\n",
    "submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n",
    "submission['PassengerId'] = PassengerId\n",
    "submission['Survived'] = clfRAW.predict(X_test)\n",
    "submission.to_csv(\"submissions/LogisticSubmissionRAW.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "For C =  0.001 , acc =  0.8156424581005587\n",
      "For C =  0.01 , acc =  0.8268156424581006\n",
      "For C =  0.1 , acc =  0.8491620111731844\n",
      "For C =  1.0 , acc =  0.8491620111731844\n",
      "For C =  10.0 , acc =  0.8491620111731844\n",
      "For C =  100.0 , acc =  0.8491620111731844\n",
      "For C =  1000.0 , acc =  0.8491620111731844\n",
      "For C =  10000.0 , acc =  0.8491620111731844\n",
      "For C =  100000.0 , acc =  0.8491620111731844\n"
     ]
    }
   ],
   "source": [
    "# Split our labeled data into train and dev sets\n",
    "\n",
    "X_train, X_dev, y_train, y_dev = sklearn.model_selection.train_test_split(\n",
    "    X,y,test_size=0.2)\n",
    "\n",
    "# Setup range of values for tuning of C\n",
    "n=np.arange(-3,6)\n",
    "r=pow(float(10),n)\n",
    "\n",
    "# Tune C\n",
    "best = 0\n",
    "for C in r:\n",
    "    clf = LogisticRegression(solver='liblinear', max_iter = 1000, C = C).fit(X_train, y_train)\n",
    "    acc = clf.score(X_dev, y_dev)\n",
    "    print(\"For C = \", C, \", acc = \", acc)\n",
    "    if acc > best:\n",
    "        best = acc\n",
    "        with open('models/liblinearLogisticRegression.model','wb') as f:\n",
    "            pickle.dump(clf,f)\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7893258426966292\n",
      "0.8491620111731844\n"
     ]
    }
   ],
   "source": [
    "# Load in our best performing model and check train/dev accuracy\n",
    "\n",
    "pickle_in = open('models/liblinearLogisticRegression.model','rb')\n",
    "clf = pickle.load(pickle_in)\n",
    "print(clf.score(X_train, y_train))\n",
    "print(clf.score(X_dev, y_dev))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create submission matrix and save to csv file\n",
    "\n",
    "predictions = np.c_[PassengerId, clf.predict(X_test)]\n",
    "submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n",
    "submission['PassengerId'] = PassengerId\n",
    "submission['Survived'] = clf.predict(X_test)\n",
    "submission.to_csv(\"submissions/LogisticSubmission.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that in submitting our normal submission (with a train/dev split and tuning of C) to kaggle, we perform worse (0.75) than our RAW submission with no tuning of C (0.77990). Likely as a result of how small the dataset is."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}