{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Logistic regression survival classifier\n",
    "\n",
    "Fits a liblinear logistic regression to `data/train.csv`, predicts `Survived` for `data/test.csv`, and writes two Kaggle submission files: an untuned baseline (RAW) and a model whose regularisation strength `C` is tuned on a held-out dev split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import sklearn\n",
    "from sklearn import metrics, preprocessing\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "import pickle\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to tune lives here.\n",
    "\n",
    "SEED = 42  # fixes the train/dev split so the tuning result is reproducible\n",
    "\n",
    "TRAIN_CSV = 'data/train.csv'\n",
    "TEST_CSV = 'data/test.csv'\n",
    "MODEL_PATH = 'models/liblinearLogisticRegression.model'\n",
    "RAW_SUBMISSION_CSV = 'submissions/LogisticSubmissionRAW.csv'\n",
    "TUNED_SUBMISSION_CSV = 'submissions/LogisticSubmission.csv'\n",
    "\n",
    "# Columns that are turned into integer category codes\n",
    "CATEGORICAL_COLUMNS = ['Sex', 'Cabin', 'Embarked', 'Title']\n",
    "\n",
    "# A title is the first run of letters that follows a space and ends with a period\n",
    "TITLE_PATTERN = r' ([A-Za-z]+)\\.'\n",
    "# Titles that are grouped together under the single label 'Rare'\n",
    "RARE_TITLES = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',\n",
    "               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']\n",
    "# Variant spellings that are folded into a more common title\n",
    "TITLE_SYNONYMS = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data\n",
    "\n",
    "Create dataframes from the csv files and index them by `PassengerId`. The test ids are kept separately because they are needed for the submission files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(TRAIN_CSV).set_index('PassengerId')\n",
    "\n",
    "testdf = pd.read_csv(TEST_CSV)\n",
    "PassengerId = testdf['PassengerId']\n",
    "testdf = testdf.set_index('PassengerId')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explore the titles found in the training names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_titles = df['Name'].str.extract(TITLE_PATTERN, expand=False)\n",
    "train_titles.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocess\n",
    "\n",
    "- Extract a `Title` feature from `Name`, then drop `Name`.\n",
    "- Convert the non-numerical features into integer category codes. One encoder is fitted on the train and test values together, so a given string maps to the same code in both frames. Missing values become their own `'nan'` category.\n",
    "- Apply mean imputation to the remaining NaN values, using the training means for both frames."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def label_encode_shared(train_col, test_col):\n",
    "    \"\"\"Integer-encode one column using a single vocabulary for both frames.\n",
    "\n",
    "    Missing values are encoded as their own 'nan' category. Fitting one\n",
    "    encoder on the combined values guarantees that a given string receives\n",
    "    the same code in the train and test frames.\n",
    "\n",
    "    Returns a (train_codes, test_codes) pair of integer arrays.\n",
    "    \"\"\"\n",
    "    train_vals = train_col.fillna('nan').astype(str)\n",
    "    test_vals = test_col.fillna('nan').astype(str)\n",
    "    encoder = preprocessing.LabelEncoder().fit(pd.concat([train_vals, test_vals]))\n",
    "    return encoder.transform(train_vals), encoder.transform(test_vals)\n",
    "\n",
    "\n",
    "def extract_title(names):\n",
    "    \"\"\"Return the title found in each name, with uncommon titles collapsed.\n",
    "\n",
    "    Titles listed in RARE_TITLES become 'Rare', the variants in\n",
    "    TITLE_SYNONYMS are folded into 'Miss'/'Mrs', and names without a\n",
    "    recognisable title get 'NA'.\n",
    "    \"\"\"\n",
    "    found = names.str.extract(TITLE_PATTERN, expand=False)\n",
    "    found = found.replace(RARE_TITLES, 'Rare').replace(TITLE_SYNONYMS)\n",
    "    return found.fillna('NA')\n",
    "\n",
    "\n",
    "def preprocess(train, test):\n",
    "    \"\"\"Return encoded, imputed copies of the train and test frames.\n",
    "\n",
    "    The inputs are not modified, so this can be re-run safely.\n",
    "    \"\"\"\n",
    "    train, test = train.copy(), test.copy()\n",
    "\n",
    "    for frame in (train, test):\n",
    "        frame['Title'] = extract_title(frame['Name'])\n",
    "\n",
    "    for col in CATEGORICAL_COLUMNS:\n",
    "        train[col], test[col] = label_encode_shared(train[col], test[col])\n",
    "\n",
    "    train = train.drop(columns=['Name'])\n",
    "    test = test.drop(columns=['Name'])\n",
    "\n",
    "    # numeric_only is required because the string column Ticket is still present;\n",
    "    # the training means are reused for the test frame so both are imputed alike.\n",
    "    train_means = train.mean(numeric_only=True)\n",
    "    return train.fillna(train_means), test.fillna(train_means)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_prepared, testdf_prepared = preprocess(df, testdf)\n",
    "df_prepared.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the model inputs\n",
    "\n",
    "Create the input matrix, label vector, and test input matrix. `Ticket` is not used as a feature."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_raw = df_prepared.drop(columns=['Survived', 'Ticket'])\n",
    "y = df_prepared['Survived']\n",
    "X_test_raw = testdf_prepared.drop(columns=['Ticket'])\n",
    "\n",
    "assert list(X_raw.columns) == list(X_test_raw.columns), 'train/test feature columns differ'\n",
    "X_raw.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**TODO**\n",
    "\n",
    "- Replace the letters and numbers of each `Cabin` value with just the deck letter, and deal with NaNs. This has to happen before `Cabin` is label-encoded in `preprocess`.\n",
    "- Clean up and fill in NaNs with the mode."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standardise every feature with the training mean and standard deviation.\n",
    "# The same statistics are applied to the test matrix so that the model sees\n",
    "# test features on the scale it was fitted on.\n",
    "feature_mean = X_raw.mean()\n",
    "feature_std = X_raw.std().replace(0, 1)  # a constant column would otherwise divide by zero\n",
    "\n",
    "X = (X_raw - feature_mean) / feature_std\n",
    "X_test = (X_test_raw - feature_mean) / feature_std"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Baseline model (RAW)\n",
    "\n",
    "Logistic regression with the liblinear solver, chosen because of how small the dataset is. The value shown is the accuracy on the training data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clfRAW = LogisticRegression(solver='liblinear', max_iter=1000).fit(X, y)\n",
    "clfRAW.score(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_submission(model, features, passenger_ids, path):\n",
    "    \"\"\"Predict with `model` on `features` and write a submission csv.\n",
    "\n",
    "    The csv has the columns PassengerId and Survived. The parent directory\n",
    "    of `path` is created if it does not exist yet. Returns the submission\n",
    "    dataframe that was written.\n",
    "    \"\"\"\n",
    "    submission = pd.DataFrame({\n",
    "        'PassengerId': passenger_ids.to_numpy(),\n",
    "        'Survived': model.predict(features),\n",
    "    })\n",
    "    directory = os.path.dirname(path)\n",
    "    if directory:\n",
    "        os.makedirs(directory, exist_ok=True)\n",
    "    submission.to_csv(path, index=False)\n",
    "    return submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_submission(clfRAW, X_test, PassengerId, RAW_SUBMISSION_CSV).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tune C on a dev split\n",
    "\n",
    "Split the labeled data into train and dev sets, fit one model per candidate `C`, and keep the model with the highest dev accuracy."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_dev, y_train, y_dev = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=SEED)\n",
    "\n",
    "# Candidate values for C: 10^-3, 10^-2, ..., 10^5\n",
    "c_grid = 10.0 ** np.arange(-3, 6)\n",
    "\n",
    "best_acc, best_clf = -1.0, None\n",
    "for C in c_grid:\n",
    "    candidate = LogisticRegression(solver='liblinear', max_iter=1000, C=C).fit(X_train, y_train)\n",
    "    acc = candidate.score(X_dev, y_dev)\n",
    "    print(\"For C = \", C, \", acc = \", acc)\n",
    "    if acc > best_acc:\n",
    "        best_acc, best_clf = acc, candidate\n",
    "\n",
    "# Persist only the winning model, once, after the search has finished\n",
    "os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)\n",
    "with open(MODEL_PATH, 'wb') as f:\n",
    "    pickle.dump(best_clf, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the best performing model back in and check train/dev accuracy.\n",
    "# NOTE: pickle.load executes arbitrary code; only load model files that this\n",
    "# notebook wrote itself.\n",
    "with open(MODEL_PATH, 'rb') as pickle_in:\n",
    "    clf = pickle.load(pickle_in)\n",
    "\n",
    "print(clf.score(X_train, y_train))\n",
    "print(clf.score(X_dev, y_dev))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_submission(clf, X_test, PassengerId, TUNED_SUBMISSION_CSV).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results\n",
    "\n",
    "When submitted to Kaggle, the tuned submission (train/dev split with tuning of `C`) scored worse (0.75) than the RAW submission with no tuning of `C` (0.77990). This is likely a result of how small the dataset is.\n",
    "\n",
    "These scores were recorded before the preprocessing in this notebook was revised; re-submit both files to refresh them."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}