MLProjects/kaggle_titanic/.ipynb_checkpoints/Logistic_Regression-checkpo...

259 lines
7.7 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"import sklearn\n",
"from sklearn import metrics, preprocessing\n",
"from sklearn.linear_model import LogisticRegression\n",
"import pickle\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Load the training and test CSVs, indexing each frame by passenger id.\n",
"\n",
"df = pd.read_csv('data/train.csv').set_index('PassengerId')\n",
"\n",
"# Grab the test ids as a standalone Series before the column becomes the\n",
"# index; the submission cells further down write them out next to the predictions.\n",
"testdf = pd.read_csv('data/test.csv')\n",
"PassengerId = testdf['PassengerId']\n",
"testdf = testdf.set_index('PassengerId')\n",
"\n",
"# Both frames in one list so preprocessing can loop over them\n",
"data = [df, testdf]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# Preprocess the data by converting non numerical features into numerical categorical features\n",
"# and applying mean imputation to deal with NaN values.\n",
"#\n",
"# Each encoder is fitted once on the values of both frames, so a given category\n",
"# gets the same integer code in the training and the test data. (An encoder\n",
"# fitted per frame numbers the categories of each frame independently.)\n",
"# Encoding runs before imputation, so missing values in these three columns\n",
"# are encoded like any other value rather than imputed.\n",
"\n",
"for column in [\"Sex\", \"Cabin\", \"Embarked\"]:\n",
"    le = preprocessing.LabelEncoder()\n",
"    le.fit([value for dataframe in data for value in dataframe[column]])\n",
"    for dataframe in data:\n",
"        dataframe[column] = le.transform(list(dataframe[column]))\n",
"\n",
"# Impute the remaining NaNs with the training-set column means, so the test\n",
"# data is filled with the same values the model sees during fitting.\n",
"# numeric_only=True is needed because the frames still hold text columns\n",
"# (Name, Ticket) here; newer pandas versions raise on those instead of skipping them.\n",
"train_means = df.mean(numeric_only=True)\n",
"for dataframe in data:\n",
"    dataframe.fillna(train_means, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of Pclass Sex Age SibSp Parch Fare Cabin Embarked\n",
"PassengerId \n",
"1 3 1 22.000000 1 0 7.2500 147 2\n",
"2 1 0 38.000000 1 0 71.2833 81 0\n",
"3 3 0 26.000000 0 0 7.9250 147 2\n",
"4 1 0 35.000000 1 0 53.1000 55 2\n",
"5 3 1 35.000000 0 0 8.0500 147 2\n",
"... ... ... ... ... ... ... ... ...\n",
"887 2 1 27.000000 0 0 13.0000 147 2\n",
"888 1 0 19.000000 0 0 30.0000 30 2\n",
"889 3 0 29.699118 1 2 23.4500 147 2\n",
"890 1 1 26.000000 0 0 30.0000 60 0\n",
"891 3 1 32.000000 0 0 7.7500 147 1\n",
"\n",
"[891 rows x 8 columns]>\n"
]
}
],
"source": [
"# Create our input matrix, label vector, and test input matrix.\n",
"# Name and Ticket are dropped from the features; Survived is the label.\n",
"\n",
"X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)\n",
"y = df['Survived']\n",
"X_test = testdf.drop(['Name', 'Ticket'], axis=1)\n",
"\n",
"# Preview the first rows; head has to be called, a bare `X.head` only\n",
"# shows the bound method's repr.\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# Normalize the data (per-column standardization).\n",
"#\n",
"# The mean and standard deviation are taken from the training features only\n",
"# and reused for the test features, so both matrices go through exactly the\n",
"# same transformation. Scaling the test set with its own statistics would\n",
"# shift it relative to the data the model is fitted on.\n",
"\n",
"feature_mean = X.mean()\n",
"feature_std = X.std()\n",
"\n",
"X = (X - feature_mean) / feature_std\n",
"X_test = (X_test - feature_mean) / feature_std"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7957351290684624"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Baseline model: logistic regression fitted on all of the labelled data.\n",
"# The liblinear solver is chosen because the dataset is small.\n",
"\n",
"clfRAW = LogisticRegression(max_iter=1000, solver='liblinear')\n",
"clfRAW.fit(X, y)\n",
"\n",
"# Accuracy on the same data the model was fitted on\n",
"clfRAW.score(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# Create our predictions matrix and save to csv.\n",
"# The two-column matrix (id, predicted label) already holds everything the\n",
"# submission needs, so the frame is built from it directly and the model is\n",
"# only asked to predict once.\n",
"\n",
"predictions = np.c_[PassengerId, clfRAW.predict(X_test)]\n",
"submission = pd.DataFrame(predictions, columns=['PassengerId', 'Survived'])\n",
"\n",
"# to_csv does not create missing directories, so make sure the target exists\n",
"os.makedirs('submissions', exist_ok=True)\n",
"submission.to_csv(\"submissions/LogisticSubmissionRAW.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"For C = 0.001 , acc = 0.8156424581005587\n",
"For C = 0.01 , acc = 0.8268156424581006\n",
"For C = 0.1 , acc = 0.8491620111731844\n",
"For C = 1.0 , acc = 0.8491620111731844\n",
"For C = 10.0 , acc = 0.8491620111731844\n",
"For C = 100.0 , acc = 0.8491620111731844\n",
"For C = 1000.0 , acc = 0.8491620111731844\n",
"For C = 10000.0 , acc = 0.8491620111731844\n",
"For C = 100000.0 , acc = 0.8491620111731844\n"
]
}
],
"source": [
"# Split our labeled data into train and dev sets.\n",
"# A fixed random_state makes the split, and therefore the accuracies printed\n",
"# below and the model that ends up pickled, reproducible across runs.\n",
"# NOTE(review): train_test_split is reached through the `sklearn` package\n",
"# attribute; consider importing it explicitly in the imports cell.\n",
"\n",
"X_train, X_dev, y_train, y_dev = sklearn.model_selection.train_test_split(\n",
"    X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Setup range of values for tuning of C: powers of ten from 1e-3 to 1e5\n",
"n = np.arange(-3, 6)\n",
"r = pow(float(10), n)\n",
"\n",
"# Make sure the output directory exists before the first model is saved\n",
"os.makedirs('models', exist_ok=True)\n",
"\n",
"# Tune C: fit one model per candidate and pickle it whenever it beats the\n",
"# best dev accuracy seen so far. The comparison is strict, so on a tie the\n",
"# earlier (smaller) C stays on disk.\n",
"best = 0\n",
"for C in r:\n",
"    clf = LogisticRegression(solver='liblinear', max_iter=1000, C=C).fit(X_train, y_train)\n",
"    acc = clf.score(X_dev, y_dev)\n",
"    print(\"For C = \", C, \", acc = \", acc)\n",
"    if acc > best:\n",
"        best = acc\n",
"        with open('models/liblinearLogisticRegression.model', 'wb') as f:\n",
"            pickle.dump(clf, f)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.7893258426966292\n",
"0.8491620111731844\n"
]
}
],
"source": [
"# Load in our best performing model and check train/dev accuracy.\n",
"# The file is read inside a `with` block so the handle is closed as soon as\n",
"# the model has been unpickled. The pickle is the one written by the tuning\n",
"# loop in this notebook; pickle.load must not be used on untrusted files.\n",
"\n",
"with open('models/liblinearLogisticRegression.model', 'rb') as pickle_in:\n",
"    clf = pickle.load(pickle_in)\n",
"\n",
"print(clf.score(X_train, y_train))\n",
"print(clf.score(X_dev, y_dev))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Create submission matrix and save to csv file.\n",
"# The two-column matrix (id, predicted label) already holds everything the\n",
"# submission needs, so the frame is built from it directly and the model is\n",
"# only asked to predict once.\n",
"\n",
"predictions = np.c_[PassengerId, clf.predict(X_test)]\n",
"submission = pd.DataFrame(predictions, columns=['PassengerId', 'Survived'])\n",
"\n",
"# to_csv does not create missing directories, so make sure the target exists\n",
"os.makedirs('submissions', exist_ok=True)\n",
"submission.to_csv(\"submissions/LogisticSubmission.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that when we submit our normal submission (with a train/dev split and tuning of C) to Kaggle, it performs worse (0.75) than our RAW submission with no tuning of C (0.77990). This is likely a result of how small the dataset is."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}