280 lines
8.9 KiB
Plaintext
280 lines
8.9 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import os\n",
|
|
"import sklearn\n",
|
|
"from sklearn import metrics, preprocessing\n",
|
|
"from sklearn.svm import SVC\n",
|
|
"from sklearn.model_selection import GridSearchCV\n",
|
|
"from sklearn.preprocessing import normalize\n",
|
|
"import pickle\n",
|
|
"import math"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Create dataframes from our CSV files and set indices\n",
|
|
"\n",
|
|
"df = pd.read_csv('data/train.csv')\n",
|
|
"df.set_index('PassengerId', inplace=True)\n",
|
|
"\n",
|
|
"testdf = pd.read_csv('data/test.csv')\n",
|
|
"PassengerId = testdf['PassengerId']\n",
|
|
"testdf.set_index('PassengerId', inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Preprocess the data by converting non-numerical features into numerical categorical features\n",
|
|
"# and applying mean imputation to deal with NaN values\n",
|
|
"\n",
|
|
"le = preprocessing.LabelEncoder()\n",
|
|
"df[\"Sex\"] = le.fit_transform(list(df[\"Sex\"]))\n",
|
|
"df[\"Cabin\"] = le.fit_transform(list(df[\"Cabin\"]))\n",
|
|
"df[\"Embarked\"] = le.fit_transform(list(df[\"Embarked\"]))\n",
|
|
"df.fillna(df.mean(), inplace=True)\n",
|
|
"\n",
|
|
"testdf[\"Sex\"] = le.fit_transform(list(testdf[\"Sex\"]))\n",
|
|
"testdf[\"Cabin\"] = le.fit_transform(list(testdf[\"Cabin\"]))\n",
|
|
"testdf[\"Embarked\"] = le.fit_transform(list(testdf[\"Embarked\"]))\n",
|
|
"testdf.fillna(testdf.mean(), inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<bound method NDFrame.head of Pclass Sex Age SibSp Parch Fare Cabin Embarked\n",
|
|
"PassengerId \n",
|
|
"1 3 1 22.000000 1 0 7.2500 147 2\n",
|
|
"2 1 0 38.000000 1 0 71.2833 81 0\n",
|
|
"3 3 0 26.000000 0 0 7.9250 147 2\n",
|
|
"4 1 0 35.000000 1 0 53.1000 55 2\n",
|
|
"5 3 1 35.000000 0 0 8.0500 147 2\n",
|
|
"... ... ... ... ... ... ... ... ...\n",
|
|
"887 2 1 27.000000 0 0 13.0000 147 2\n",
|
|
"888 1 0 19.000000 0 0 30.0000 30 2\n",
|
|
"889 3 0 29.699118 1 2 23.4500 147 2\n",
|
|
"890 1 1 26.000000 0 0 30.0000 60 0\n",
|
|
"891 3 1 32.000000 0 0 7.7500 147 1\n",
|
|
"\n",
|
|
"[891 rows x 8 columns]>\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Create our input matrix, label vector, and test input matrix\n",
|
|
"\n",
|
|
"X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)\n",
|
|
"y = df['Survived']\n",
|
|
"X_test = testdf.drop(['Name', 'Ticket'], axis=1)\n",
|
|
"print(X.head)\n",
|
|
"\n",
|
|
"X_test = testdf.drop(['Name', 'Ticket'], axis=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Normalize our data\n",
|
|
"\n",
|
|
"X=(X-X.mean())/X.std()\n",
|
|
"X_test=(X_test-X_test.mean())/X_test.std()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"0.8383838383838383\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n",
|
|
" \"avoid this warning.\", FutureWarning)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Setup basic SVM for classification\n",
|
|
"clfRAW = SVC().fit(X, y)\n",
|
|
"print(clfRAW.score(X,y))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Create submission matrix and save to csv file\n",
|
|
"\n",
|
|
"predictions = np.c_[PassengerId, clfRAW.predict(X_test)]\n",
|
|
"submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n",
|
|
"submission['PassengerId'] = PassengerId\n",
|
|
"submission['Survived'] = clfRAW.predict(X_test)\n",
|
|
"submission.to_csv(\"submissions/SVMSubmissionRAW.csv\", index=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"After submitting these predictions, we achieved a submission score of only 0.378, even though we got high training accuracy. This reveals a massive issue of variance; that is, we have severely overfit the training set. We now move on to a more nuanced approach of splitting our data into train and dev sets to properly tune our parameters and avoid overfitting."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"GridSearchCV(cv=5, error_score='raise-deprecating',\n",
|
|
" estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
|
|
" decision_function_shape='ovr', degree=3,\n",
|
|
" gamma='scale', kernel='rbf', max_iter=-1,\n",
|
|
" probability=False, random_state=None, shrinking=True,\n",
|
|
" tol=0.001, verbose=False),\n",
|
|
" iid='warn', n_jobs=None,\n",
|
|
" param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],\n",
|
|
" 'kernel': ['rbf']},\n",
|
|
" {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],\n",
|
|
" pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
|
|
" scoring=None, verbose=0)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Split our labeled data into train and dev sets\n",
|
|
"\n",
|
|
"X_train, X_dev, y_train, y_dev = sklearn.model_selection.train_test_split(\n",
|
|
" X,y,test_size=0.2)\n",
|
|
"\n",
|
|
"# Set the parameters by cross-validation\n",
|
|
"parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],\n",
|
|
" 'C': [1, 10, 100, 1000]},\n",
|
|
" {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]\n",
|
|
"\n",
|
|
"svc = SVC(gamma=\"scale\")\n",
|
|
"clf = GridSearchCV(svc, parameters, cv=5)\n",
|
|
"print(clf)\n",
|
|
"clf = clf.fit(X_train, y_train)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"0.8033707865168539\n",
|
|
"0.8603351955307262\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(clf.score(X_train, y_train))\n",
|
|
"print(clf.score(X_dev, y_dev))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Save our tuned model\n",
|
|
"\n",
|
|
"with open('models/SVM.model','wb') as f:\n",
|
|
" pickle.dump(clf,f)\n",
|
|
"\n",
|
|
"# Create submission matrix and save to csv file\n",
|
|
"\n",
|
|
"predictions = np.c_[PassengerId, clf.predict(X_test)]\n",
|
|
"submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n",
|
|
"submission['PassengerId'] = PassengerId\n",
|
|
"submission['Survived'] = clf.predict(X_test)\n",
|
|
"submission.to_csv(\"submissions/SVMSubmission.csv\", index=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"With this tuning process, we achieve a submission score of 0.77511."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\"\"\"\n",
|
|
"TODO\n",
|
|
"Create some features such as fare per person and age * class, or convert cabin number to deck category (A, B, C, ...)\n",
|
|
"\"\"\""
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.4"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|