In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn import metrics, preprocessing
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import normalize
import pickle
import math

In [2]:
# Create dataframes from our csv files and set indeces

df = pd.read_csv('data/train.csv')
df.set_index('PassengerId', inplace=True)

testdf = pd.read_csv('data/test.csv')
PassengerId = testdf['PassengerId']
testdf.set_index('PassengerId', inplace=True)

In [3]:
# Preprocess the data by converting non numerical features into numerical categorical features 
# and applying mean imputation to deal with NaN values

le = preprocessing.LabelEncoder()
df["Sex"] = le.fit_transform(list(df["Sex"]))
df["Cabin"] = le.fit_transform(list(df["Cabin"]))
df["Embarked"] = le.fit_transform(list(df["Embarked"]))
df.fillna(df.mean(), inplace=True)

testdf["Sex"] = le.fit_transform(list(testdf["Sex"]))
testdf["Cabin"] = le.fit_transform(list(testdf["Cabin"]))
testdf["Embarked"] = le.fit_transform(list(testdf["Embarked"]))
testdf.fillna(testdf.mean(), inplace=True)

In [4]:
# Create our input matrix, label vector, and test input matrix

X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)
y = df['Survived']
X_test = testdf.drop(['Name', 'Ticket'], axis=1)
print(X.head)

X_test = testdf.drop(['Name', 'Ticket'], axis=1)




In [5]:
# Normalize our data

X=(X-X.mean())/X.std()
X_test=(X_test-X_test.mean())/X_test.std()

In [6]:
# Setup basic SVM for classification
clfRAW = SVC().fit(X, y)
print(clfRAW.score(X,y))

0.8383838383838383




In [7]:
# Create submission matrix and save to csv file

predictions = np.c_[PassengerId, clfRAW.predict(X_test)]
submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])
submission['PassengerId'] = PassengerId
submission['Survived'] = clfRAW.predict(X_test)
submission.to_csv("submissions/SVMSubmissionRAW.csv", index=False)

In submitting these predictions, though we got high training accuracy, we achieve a submission score of only 0.378. This reveals a massive issue of variance. That is, we have severely overfit this training set. We now move on to a more nuanced approach of splitting our data into train and dev sets to properly tune our parameters and avoid overfitting.

In [8]:
# Split our labeled data into train and dev sets

X_train, X_dev, y_train, y_dev = sklearn.model_selection.train_test_split(
 X,y,test_size=0.2)

# Set the parameters by cross-validation
parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
 'C': [1, 10, 100, 1000]},
 {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

svc = SVC(gamma="scale")
clf = GridSearchCV(svc, parameters, cv=5)
print(clf)
clf = clf.fit(X_train, y_train)


GridSearchCV(cv=5, error_score='raise-deprecating',
 estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
 decision_function_shape='ovr', degree=3,
 gamma='scale', kernel='rbf', max_iter=-1,
 probability=False, random_state=None, shrinking=True,
 tol=0.001, verbose=False),
 iid='warn', n_jobs=None,
 param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
 'kernel': ['rbf']},
 {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
 pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
 scoring=None, verbose=0)


In [9]:
print(clf.score(X_train, y_train))
print(clf.score(X_dev, y_dev))

0.8033707865168539
0.8603351955307262


In [10]:
# Save our tuned model

with open('models/SVM.model','wb') as f:
 pickle.dump(clf,f)

# Create submission matrix and save to csv file

predictions = np.c_[PassengerId, clf.predict(X_test)]
submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])
submission['PassengerId'] = PassengerId
submission['Survived'] = clf.predict(X_test)
submission.to_csv("submissions/SVMSubmission.csv", index=False)

With this tuning process we achieve a submission score of 0.77511

In [None]:
"""
TODO
Create some features such as fair per person and age * class or convert cabin number to deck category (A, B, C, ...)
"""