In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn import metrics, preprocessing
from sklearn.linear_model import LogisticRegression
import pickle
import math

In [50]:
# Create dataframes from our csv files and set indeces

df = pd.read_csv('data/train.csv')
df.set_index('PassengerId', inplace=True)

testdf = pd.read_csv('data/test.csv')
PassengerId = testdf['PassengerId']
testdf.set_index('PassengerId', inplace=True)
data = [df, testdf]

In [51]:
# Preprocess the data by converting non numerical features into numerical categorical features 
# and applying mean imputation to deal with NaN values

for dataframe in data:
 le = preprocessing.LabelEncoder()
 dataframe["Sex"] = le.fit_transform(list(dataframe["Sex"]))
 dataframe["Cabin"] = le.fit_transform(list(dataframe["Cabin"]))
 dataframe["Embarked"] = le.fit_transform(list(dataframe["Embarked"]))
 dataframe.fillna(dataframe.mean(), inplace=True)

In [52]:
# Create our input matrix, label vector, and test input matrix

X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)
y = df['Survived']
X_test = testdf.drop(['Name', 'Ticket'], axis=1)
print(X.head)




In [53]:
# Normalize the data

X=(X-X.mean())/X.std()
X_test=(X_test-X_test.mean())/X_test.std()

In [54]:
# Create a classifier using logistic regression, opting for liblinear solver
# because of how small our dataset is

clfRAW = LogisticRegression(solver='liblinear', max_iter = 1000).fit(X, y)
clfRAW.score(X,y)

0.7957351290684624

In [55]:
# Create our predictions matrix and save to csv
predictions = np.c_[PassengerId, clfRAW.predict(X_test)]
submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])
submission['PassengerId'] = PassengerId
submission['Survived'] = clfRAW.predict(X_test)
submission.to_csv("submissions/LogisticSubmissionRAW.csv", index=False)

In [56]:
# Split our labeled data into train and dev sets

X_train, X_dev, y_train, y_dev = sklearn.model_selection.train_test_split(
 X,y,test_size=0.2)

# Setup range of values for tuning of C
n=np.arange(-3,6)
r=pow(float(10),n)

# Tune C
best = 0
for C in r:
 clf = LogisticRegression(solver='liblinear', max_iter = 1000, C = C).fit(X_train, y_train)
 acc = clf.score(X_dev, y_dev)
 print("For C = ", C, ", acc = ", acc)
 if acc > best:
 best = acc
 with open('models/liblinearLogisticRegression.model','wb') as f:
 pickle.dump(clf,f)
 

For C = 0.001 , acc = 0.6983240223463687
For C = 0.01 , acc = 0.7430167597765364
For C = 0.1 , acc = 0.7541899441340782
For C = 1.0 , acc = 0.7486033519553073
For C = 10.0 , acc = 0.7541899441340782
For C = 100.0 , acc = 0.7541899441340782
For C = 1000.0 , acc = 0.7541899441340782
For C = 10000.0 , acc = 0.7541899441340782
For C = 100000.0 , acc = 0.7541899441340782


In [47]:
# Load in our best performing model and check train/dev accuracy

pickle_in = open('models/liblinearLogisticRegression.model','rb')
clf = pickle.load(pickle_in)
print(clf.score(X_train, y_train))
print(clf.score(X_dev, y_dev))

0.7837078651685393
0.8212290502793296


In [48]:
# Create submission matrix and save to csv file

predictions = np.c_[PassengerId, clf.predict(X_test)]
submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])
submission['PassengerId'] = PassengerId
submission['Survived'] = clf.predict(X_test)
submission.to_csv("submissions/LogisticSubmission.csv", index=False)

Note that in submitting our normal submission (with a train/dev split and tuning of C) to kaggle, we perform worse (0.75) than our RAW submission with no tuning of C (0.77990). Likely as a result of how small the dataset is.

In [57]:
print(predictions)

[[ 892 0]
 [ 893 0]
 [ 894 0]
 [ 895 0]
 [ 896 1]
 [ 897 0]
 [ 898 1]
 [ 899 0]
 [ 900 1]
 [ 901 0]
 [ 902 0]
 [ 903 0]
 [ 904 1]
 [ 905 0]
 [ 906 1]
 [ 907 1]
 [ 908 0]
 [ 909 0]
 [ 910 1]
 [ 911 1]
 [ 912 0]
 [ 913 0]
 [ 914 1]
 [ 915 1]
 [ 916 1]
 [ 917 0]
 [ 918 1]
 [ 919 0]
 [ 920 0]
 [ 921 0]
 [ 922 0]
 [ 923 0]
 [ 924 0]
 [ 925 0]
 [ 926 0]
 [ 927 0]
 [ 928 1]
 [ 929 1]
 [ 930 0]
 [ 931 0]
 [ 932 0]
 [ 933 0]
 [ 934 0]
 [ 935 1]
 [ 936 1]
 [ 937 0]
 [ 938 0]
 [ 939 0]
 [ 940 1]
 [ 941 0]
 [ 942 0]
 [ 943 0]
 [ 944 1]
 [ 945 1]
 [ 946 0]
 [ 947 0]
 [ 948 0]
 [ 949 0]
 [ 950 0]
 [ 951 1]
 [ 952 0]
 [ 953 0]
 [ 954 0]
 [ 955 1]
 [ 956 1]
 [ 957 1]
 [ 958 1]
 [ 959 0]
 [ 960 1]
 [ 961 1]
 [ 962 1]
 [ 963 0]
 [ 964 1]
 [ 965 1]
 [ 966 1]
 [ 967 1]
 [ 968 0]
 [ 969 1]
 [ 970 0]
 [ 971 1]
 [ 972 0]
 [ 973 0]
 [ 974 0]
 [ 975 0]
 [ 976 0]
 [ 977 0]
 [ 978 1]
 [ 979 1]
 [ 980 1]
 [ 981 0]
 [ 982 1]
 [ 983 0]
 [ 984 1]
 [ 985 0]
 [ 986 1]
 [ 987 0]
 [ 988 1]
 [ 989 0]
 [ 990 1]
 [ 991 0]
