In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn import metrics, preprocessing
from sklearn.linear_model import LogisticRegression
import pickle
import math

In [30]:
# Create dataframes from our csv files and set indeces

df = pd.read_csv('data/train.csv')
df.set_index('PassengerId', inplace=True)

testdf = pd.read_csv('data/test.csv')
PassengerId = testdf['PassengerId']
testdf.set_index('PassengerId', inplace=True)
data = [df, testdf]

In [31]:
# Preprocess the data by converting non numerical features into numerical categorical features 
# and applying mean imputation to deal with NaN values

for dataframe in data:
 le = preprocessing.LabelEncoder()
 dataframe["Sex"] = le.fit_transform(list(dataframe["Sex"]))
 dataframe["Cabin"] = le.fit_transform(list(dataframe["Cabin"]))
 dataframe["Embarked"] = le.fit_transform(list(dataframe["Embarked"]))
 dataframe.fillna(dataframe.mean(), inplace=True)

In [32]:
# Create our input matrix, label vector, and test input matrix

X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)
y = df['Survived']
X_test = testdf.drop(['Name', 'Ticket'], axis=1)
print(X.head)




In [38]:
# Normalize the data

X=(X-X.mean())/X.std()
X_test=(X_test-X_test.mean())/X_test.std()

In [34]:
# Create a classifier using logistic regression, opting for liblinear solver
# because of how small our dataset is

clfRAW = LogisticRegression(solver='liblinear', max_iter = 1000).fit(X, y)
clfRAW.score(X,y)

0.7957351290684624

In [35]:
# Create our predictions matrix and save to csv
predictions = np.c_[PassengerId, clfRAW.predict(X_test)]
submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])
submission['PassengerId'] = PassengerId
submission['Survived'] = clfRAW.predict(X_test)
submission.to_csv("submissions/LogisticSubmissionRAW.csv", index=False)

In [36]:
# Split our labeled data into train and dev sets

X_train, X_dev, y_train, y_dev = sklearn.model_selection.train_test_split(
 X,y,test_size=0.2)

# Setup range of values for tuning of C
n=np.arange(-3,6)
r=pow(float(10),n)

# Tune C
best = 0
for C in r:
 clf = LogisticRegression(solver='liblinear', max_iter = 1000, C = C).fit(X_train, y_train)
 acc = clf.score(X_dev, y_dev)
 print("For C = ", C, ", acc = ", acc)
 if acc > best:
 best = acc
 with open('models/liblinearLogisticRegression.model','wb') as f:
 pickle.dump(clf,f)
 

For C = 0.001 , acc = 0.8156424581005587
For C = 0.01 , acc = 0.8268156424581006
For C = 0.1 , acc = 0.8491620111731844
For C = 1.0 , acc = 0.8491620111731844
For C = 10.0 , acc = 0.8491620111731844
For C = 100.0 , acc = 0.8491620111731844
For C = 1000.0 , acc = 0.8491620111731844
For C = 10000.0 , acc = 0.8491620111731844
For C = 100000.0 , acc = 0.8491620111731844


In [37]:
# Load in our best performing model and check train/dev accuracy

pickle_in = open('models/liblinearLogisticRegression.model','rb')
clf = pickle.load(pickle_in)
print(clf.score(X_train, y_train))
print(clf.score(X_dev, y_dev))

0.7893258426966292
0.8491620111731844


In [10]:
# Create submission matrix and save to csv file

predictions = np.c_[PassengerId, clf.predict(X_test)]
submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])
submission['PassengerId'] = PassengerId
submission['Survived'] = clf.predict(X_test)
submission.to_csv("submissions/LogisticSubmission.csv", index=False)

Note that in submitting our normal submission (with a train/dev split and tuning of C) to kaggle, we perform worse (0.75) than our RAW submission with no tuning of C (0.77990). Likely as a result of how small the dataset is.