In [None]:
import math
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Load the train and test CSV files and index each frame by PassengerId

df = pd.read_csv('data/train.csv').set_index('PassengerId')

testdf = pd.read_csv('data/test.csv')
# Keep the ids as a standalone Series before the column is moved into the index;
# they are needed later when writing the submission files.
PassengerId = testdf['PassengerId']
testdf = testdf.set_index('PassengerId')

In [None]:
# Preprocess the data by converting non numerical features into numerical categorical
# features and applying mean imputation to deal with NaN values.
#
# - Each encoder is fit on the union of the train and test values, so a given category
#   receives the same integer code in both frames (fitting separately per frame can
#   assign different codes to the same category).
# - Missing values in the categorical columns become an explicit placeholder category
#   before encoding, so the encoder only ever sees strings.
# - Means are computed over numeric columns only (Name/Ticket are still text here) and
#   the training means are used to impute both frames.

MISSING_LABEL = "Missing"

le = preprocessing.LabelEncoder()
for column in ["Sex", "Cabin", "Embarked"]:
    train_labels = df[column].fillna(MISSING_LABEL).astype(str)
    test_labels = testdf[column].fillna(MISSING_LABEL).astype(str)
    le.fit(pd.concat([train_labels, test_labels]))
    df[column] = le.transform(train_labels)
    testdf[column] = le.transform(test_labels)

# Entries of train_means for columns that testdf lacks (e.g. Survived) are not applied.
train_means = df.mean(numeric_only=True)
df.fillna(train_means, inplace=True)
testdf.fillna(train_means, inplace=True)

In [None]:
# Inspect how often each title (the word followed by a period) occurs in the training names.
# The pattern is a raw string so that `\.` reaches the regex engine without relying on
# Python tolerating an invalid string escape.
train_titles = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
train_titles.value_counts()

In [None]:
# Extract titles from the names and create a feature vector

data = [df, testdf]
# NOTE(review): `titles` is not referenced anywhere else in the visible notebook; kept
# in case another cell relies on it.
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

# Infrequent titles that are collapsed into a single "Rare" bucket
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Alternative spellings mapped onto their common equivalent
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for dataset in data:
    # extract titles (raw string: `\.` is a regex escape, not a string escape)
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    # replace titles with a more common title or as Rare
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = dataset['Title'].replace(title_aliases)
    # names without a recognisable title get the placeholder "NA"
    dataset['Title'] = dataset['Title'].fillna("NA")

# The raw names are no longer needed once the title has been extracted
df = df.drop(['Name'], axis=1)
testdf = testdf.drop(['Name'], axis=1)

In [None]:
# Encode the Title strings as integers. The encoder is fit once on the titles from both
# frames so that the same title maps to the same code in train and test.
le.fit(pd.concat([df["Title"], testdf["Title"]]))
df["Title"] = le.transform(df["Title"])
testdf["Title"] = le.transform(testdf["Title"])
# Preview the first rows (head must be called; printing `df.head` shows the bound method)
df.head()


In [None]:
# Create our input matrix, label vector, and test input matrix.
# Ticket is dropped from both feature matrices; Survived is the label.

X = df.drop(columns=['Survived', 'Ticket'])
y = df['Survived']
X_test = testdf.drop(columns=['Ticket'])
# Preview the first rows (head must be called; printing `X.head` shows the bound method)
X.head()

In [None]:
"""
TODO
REPLACE THE LETTERS AND NUMBERS OF A DECK WITH JUST DECK LETTER
DEAL WITH NANS
"""


# Disabled draft: reduce each Cabin value to its deck letter (see the TODO notes around
# this block). Nothing here executes.
# NOTE(review): earlier in the notebook Cabin is label-encoded to integers, so the
# `deck in value` string test below would not work as written — confirm before enabling.
# NOTE(review): `X['Cabin'][i] = deck` is chained indexing; `X.loc[i, 'Cabin'] = deck`
# would be the explicit form.
# X['Cabin'] = X.Cabin.fillna('')
# print(X['Cabin'])
# print(X['Cabin'].shape)
# temp = X['Cabin']
# for i, row in X.iterrows():
#     value = row['Cabin']
#     decks = {"A", "B", "C", "D", "E", "F", "G", "U"}
#     for deck in decks:
#         if deck in value:
#             X['Cabin'][i] = deck
            
# print(X['Cabin'])

"""
TODO
Clean up and fill in nans with mode
"""

In [None]:
# Standardize the features to zero mean and unit variance.
# The statistics come from the training matrix only and are applied to both matrices,
# so train and test features are on the same scale as seen by the model.
train_mean = X.mean()
# A constant column has std 0; dividing by 1 instead leaves it at 0 rather than NaN.
train_std = X.std().replace(0, 1)

X = (X - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

In [None]:
# Baseline ("RAW") logistic regression classifier trained on all labeled data.
# The liblinear solver is chosen because of how small our dataset is.

clfRAW = LogisticRegression(solver='liblinear', max_iter=1000)
clfRAW.fit(X, y)

# Accuracy on the same data the model was trained on
clfRAW.score(X, y)

In [None]:
# Create our predictions matrix and save to csv.
# The model is queried once; the two-column matrix (id, predicted label) is turned
# directly into the submission frame.
survived = clfRAW.predict(X_test)
predictions = np.c_[PassengerId, survived]
submission = pd.DataFrame(predictions, columns=['PassengerId', 'Survived'])

# Make sure the output directory exists so the write does not fail on a fresh checkout
os.makedirs("submissions", exist_ok=True)
submission.to_csv("submissions/LogisticSubmissionRAW.csv", index=False)

In [None]:
# Split our labeled data into train and dev sets.
# A fixed random_state makes the split, and therefore the tuning results below,
# reproducible across runs.

X_train, X_dev, y_train, y_dev = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Setup range of values for tuning of C: powers of ten from 10^-3 to 10^5
n = np.arange(-3, 6)
r = 10.0 ** n

# Tune C: fit one model per candidate and persist whichever scores best on the dev set
MODEL_PATH = 'models/liblinearLogisticRegression.model'
# Make sure the output directory exists so the write does not fail on a fresh checkout
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

best = 0
for C in r:
    clf = LogisticRegression(solver='liblinear', max_iter=1000, C=C).fit(X_train, y_train)
    acc = clf.score(X_dev, y_dev)
    print("For C = ", C, ", acc = ", acc)
    # Strict comparison: on ties the earlier (smaller) C is kept
    if acc > best:
        best = acc
        with open(MODEL_PATH, 'wb') as f:
            pickle.dump(clf, f)
        

In [None]:
# Load in our best performing model and check train/dev accuracy.
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code; only load model files written by this notebook.

with open('models/liblinearLogisticRegression.model', 'rb') as pickle_in:
    clf = pickle.load(pickle_in)

print(clf.score(X_train, y_train))
print(clf.score(X_dev, y_dev))

In [None]:
# Create submission matrix and save to csv file.
# The model is queried once; the two-column matrix (id, predicted label) is turned
# directly into the submission frame.

survived = clf.predict(X_test)
predictions = np.c_[PassengerId, survived]
submission = pd.DataFrame(predictions, columns=['PassengerId', 'Survived'])

# Make sure the output directory exists so the write does not fail on a fresh checkout
os.makedirs("submissions", exist_ok=True)
submission.to_csv("submissions/LogisticSubmission.csv", index=False)

Note that when we submit our normal submission (with a train/dev split and tuning of C) to Kaggle, it performs worse (0.75) than our RAW submission with no tuning of C (0.77990). This is likely a result of how small the dataset is.