{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "import pandas as pd\n",
 "import numpy as np\n",
 "import matplotlib.pyplot as plt\n",
 "import os\n",
 "import sklearn\n",
 "from sklearn import metrics, model_selection, preprocessing\n",
 "from sklearn.linear_model import LogisticRegression\n",
 "import pickle\n",
 "import math"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Create dataframes from our csv files and set indices\n",
 "\n",
 "df = pd.read_csv('data/train.csv')\n",
 "df.set_index('PassengerId', inplace=True)\n",
 "\n",
 "testdf = pd.read_csv('data/test.csv')\n",
 "PassengerId = testdf['PassengerId']\n",
 "testdf.set_index('PassengerId', inplace=True)\n",
 "data = [df, testdf]"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Convert non-numerical features into numerical categorical features and\n",
 "# apply mean imputation to deal with the remaining NaN values.\n",
 "# The encoder is fit on the union of train and test values so that the\n",
 "# same category maps to the same integer in both dataframes; fitting a\n",
 "# fresh encoder per dataframe can produce inconsistent encodings.\n",
 "# .astype(str) folds NaN into a 'nan' category and avoids mixed-type errors.\n",
 "\n",
 "for col in [\"Sex\", \"Cabin\", \"Embarked\"]:\n",
 "    le = preprocessing.LabelEncoder()\n",
 "    le.fit(pd.concat([df[col], testdf[col]]).astype(str))\n",
 "    for dataframe in data:\n",
 "        dataframe[col] = le.transform(dataframe[col].astype(str))\n",
 "\n",
 "for dataframe in data:\n",
 "    dataframe.fillna(dataframe.mean(), inplace=True)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Create our input matrix, label vector, and test input matrix\n",
 "\n",
 "X = df.drop(['Name', 'Survived', 'Ticket'], axis=1)\n",
 "y = df['Survived']\n",
 "X_test = testdf.drop(['Name', 'Ticket'], axis=1)\n",
 "print(X.head())"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Normalize with the *training* set statistics so that train and test\n",
 "# are scaled identically (previously X_test was scaled with its own\n",
 "# mean/std, creating a train/test distribution mismatch).\n",
 "\n",
 "mu = X.mean()\n",
 "sigma = X.std()\n",
 "X = (X - mu) / sigma\n",
 "X_test = (X_test - mu) / sigma"
] }, { "cell_type": "code",
"execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Create a classifier using logistic regression, opting for liblinear solver\n",
 "# because of how small our dataset is\n",
 "\n",
 "clfRAW = LogisticRegression(solver='liblinear', max_iter = 1000).fit(X, y)\n",
 "clfRAW.score(X,y)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Create our predictions matrix and save to csv.\n",
 "# (Previously the two columns were re-predicted and reassigned with\n",
 "# identical values right after construction — redundant work, removed.)\n",
 "\n",
 "predictions = np.c_[PassengerId, clfRAW.predict(X_test)]\n",
 "submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n",
 "submission.to_csv(\"submissions/LogisticSubmissionRAW.csv\", index=False)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Split our labeled data into train and dev sets; a fixed random_state\n",
 "# makes the split (and therefore the tuning results) reproducible.\n",
 "\n",
 "X_train, X_dev, y_train, y_dev = sklearn.model_selection.train_test_split(\n",
 "    X, y, test_size=0.2, random_state=0)\n",
 "\n",
 "# Candidate values for the inverse regularization strength C: 10^-3 .. 10^5\n",
 "r = 10.0 ** np.arange(-3, 6)\n",
 "\n",
 "# Tune C, pickling the model with the best dev accuracy so far.\n",
 "# NOTE(review): model selection and the later evaluation both use the same\n",
 "# dev set, so the reported dev accuracy is optimistically biased.\n",
 "best = 0\n",
 "for C in r:\n",
 "    clf = LogisticRegression(solver='liblinear', max_iter=1000, C=C).fit(X_train, y_train)\n",
 "    acc = clf.score(X_dev, y_dev)\n",
 "    print(\"For C = \", C, \", acc = \", acc)\n",
 "    if acc > best:\n",
 "        best = acc\n",
 "        with open('models/liblinearLogisticRegression.model', 'wb') as f:\n",
 "            pickle.dump(clf, f)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Load in our best performing model and check train/dev accuracy.\n",
 "# The with-statement closes the file handle (the old code leaked it).\n",
 "\n",
 "with open('models/liblinearLogisticRegression.model', 'rb') as f:\n",
 "    clf = pickle.load(f)\n",
 "print(clf.score(X_train, y_train))\n",
 "print(clf.score(X_dev, y_dev))"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Create submission matrix and save to csv file.\n",
 "# `predictions` is displayed by a later cell, so keep it as a variable.\n",
 "\n",
 "predictions = np.c_[PassengerId, clf.predict(X_test)]\n",
 "submission = pd.DataFrame(predictions, columns = ['PassengerId', 'Survived'])\n",
 "submission.to_csv(\"submissions/LogisticSubmission.csv\", index=False)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [
 "Note that in submitting our normal submission (with a train/dev split and tuning of C) to kaggle, we perform worse (0.75) than our RAW submission with no tuning of C (0.77990), likely as a result of how small the dataset is."
] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 892 0]\n", " [ 893 0]\n", " [ 894 0]\n", " [ 895 0]\n", " [ 896 1]\n", " [ 897 0]\n", " [ 898 1]\n", " [ 899 0]\n", " [ 900 1]\n", " [ 901 0]\n", " [ 902 0]\n", " [ 903 0]\n", " [ 904 1]\n", " [ 905 0]\n", " [ 906 1]\n", " [ 907 1]\n", " [ 908 0]\n", " [ 909 0]\n", " [ 910 1]\n", " [ 911 1]\n", " [ 912 0]\n", " [ 913 0]\n", " [ 914 1]\n", " [ 915 1]\n", " [ 916 1]\n", " [ 917 0]\n", " [ 918 1]\n", " [ 919 0]\n", " [ 920 0]\n", " [ 921 0]\n", " [ 922 0]\n", " [ 923 0]\n", " [ 924 0]\n", " [ 925 0]\n", " [ 926 0]\n", " [ 927 0]\n", " [ 928 1]\n", " [ 929 1]\n", " [ 930 0]\n", " [ 931 0]\n", " [ 932 0]\n", " [ 933 0]\n", " [ 934 0]\n", " [ 935 1]\n", " [ 936 1]\n", " [ 937 0]\n", " [ 938 0]\n", " [ 939 0]\n", " [ 940 1]\n", " [ 941 0]\n", " [ 942 0]\n", " [ 943 0]\n", " [ 944 1]\n", " [ 945 1]\n", " [ 946 0]\n", " [ 947 0]\n", " [ 948 0]\n", " [ 949 0]\n", " [ 950 0]\n", " [ 951 1]\n", " [ 952 0]\n", " [ 953 0]\n", " [ 954 0]\n", " [ 955 1]\n", " [ 956 1]\n", " [ 957 1]\n", " [ 958 1]\n", " [ 959 0]\n", " [ 960 1]\n", " [ 961 1]\n", " [ 962 1]\n", " [ 963 0]\n", " [ 964 1]\n", " [ 965 1]\n", " [ 966 1]\n", " [ 967 1]\n", " [ 968 0]\n", " [ 969 1]\n", " [ 970 0]\n", " [ 971 1]\n", " [ 972 0]\n", " [ 973 0]\n", " [ 974 0]\n", " [ 975 0]\n", " [ 976 0]\n", " [ 977 0]\n", " [ 978 1]\n", " [ 979 1]\n", " [ 980 1]\n", " [ 981 0]\n", " [ 982 1]\n", " [ 983 0]\n", " [ 984 1]\n", " [ 985 0]\n", " [ 986 1]\n", " [ 987 0]\n", " [ 988 1]\n", " [ 989 0]\n", " [ 990 1]\n", " [ 991 0]\n", " [ 992 1]\n", " [ 993 0]\n", " [ 994 0]\n", " [ 995 0]\n", " [ 996 1]\n", " [ 997 0]\n", " [ 998 0]\n", " [ 999 0]\n", " [1000 0]\n", " [1001 0]\n", " [1002 0]\n", " [1003 1]\n", " [1004 1]\n", " [1005 1]\n", " [1006 1]\n", " [1007 0]\n", " [1008 0]\n", " [1009 1]\n", " [1010 1]\n", " [1011 1]\n", " [1012 1]\n", " [1013 0]\n", " [1014 1]\n", " [1015 0]\n", " 
[1016 0]\n", " [1017 1]\n", " [1018 0]\n", " [1019 0]\n", " [1020 0]\n", " [1021 0]\n", " [1022 0]\n", " [1023 0]\n", " [1024 1]\n", " [1025 0]\n", " [1026 0]\n", " [1027 0]\n", " [1028 0]\n", " [1029 0]\n", " [1030 1]\n", " [1031 0]\n", " [1032 0]\n", " [1033 1]\n", " [1034 0]\n", " [1035 0]\n", " [1036 0]\n", " [1037 0]\n", " [1038 0]\n", " [1039 0]\n", " [1040 0]\n", " [1041 0]\n", " [1042 1]\n", " [1043 0]\n", " [1044 0]\n", " [1045 0]\n", " [1046 0]\n", " [1047 0]\n", " [1048 1]\n", " [1049 1]\n", " [1050 0]\n", " [1051 1]\n", " [1052 1]\n", " [1053 0]\n", " [1054 1]\n", " [1055 0]\n", " [1056 0]\n", " [1057 1]\n", " [1058 0]\n", " [1059 0]\n", " [1060 1]\n", " [1061 1]\n", " [1062 0]\n", " [1063 0]\n", " [1064 0]\n", " [1065 0]\n", " [1066 0]\n", " [1067 1]\n", " [1068 1]\n", " [1069 0]\n", " [1070 1]\n", " [1071 1]\n", " [1072 0]\n", " [1073 0]\n", " [1074 1]\n", " [1075 0]\n", " [1076 1]\n", " [1077 0]\n", " [1078 1]\n", " [1079 0]\n", " [1080 0]\n", " [1081 0]\n", " [1082 0]\n", " [1083 0]\n", " [1084 0]\n", " [1085 0]\n", " [1086 0]\n", " [1087 0]\n", " [1088 1]\n", " [1089 1]\n", " [1090 0]\n", " [1091 1]\n", " [1092 1]\n", " [1093 0]\n", " [1094 0]\n", " [1095 1]\n", " [1096 0]\n", " [1097 1]\n", " [1098 1]\n", " [1099 0]\n", " [1100 1]\n", " [1101 0]\n", " [1102 0]\n", " [1103 0]\n", " [1104 0]\n", " [1105 0]\n", " [1106 0]\n", " [1107 0]\n", " [1108 1]\n", " [1109 0]\n", " [1110 1]\n", " [1111 0]\n", " [1112 1]\n", " [1113 0]\n", " [1114 1]\n", " [1115 0]\n", " [1116 1]\n", " [1117 1]\n", " [1118 0]\n", " [1119 1]\n", " [1120 0]\n", " [1121 0]\n", " [1122 0]\n", " [1123 1]\n", " [1124 0]\n", " [1125 0]\n", " [1126 0]\n", " [1127 0]\n", " [1128 0]\n", " [1129 0]\n", " [1130 1]\n", " [1131 1]\n", " [1132 1]\n", " [1133 1]\n", " [1134 0]\n", " [1135 0]\n", " [1136 0]\n", " [1137 0]\n", " [1138 1]\n", " [1139 0]\n", " [1140 1]\n", " [1141 1]\n", " [1142 1]\n", " [1143 0]\n", " [1144 1]\n", " [1145 0]\n", " [1146 0]\n", " [1147 0]\n", " [1148 0]\n", " 
[1149 0]\n", " [1150 1]\n", " [1151 0]\n", " [1152 0]\n", " [1153 0]\n", " [1154 1]\n", " [1155 1]\n", " [1156 0]\n", " [1157 0]\n", " [1158 0]\n", " [1159 0]\n", " [1160 1]\n", " [1161 0]\n", " [1162 0]\n", " [1163 0]\n", " [1164 1]\n", " [1165 1]\n", " [1166 0]\n", " [1167 1]\n", " [1168 0]\n", " [1169 0]\n", " [1170 0]\n", " [1171 0]\n", " [1172 1]\n", " [1173 0]\n", " [1174 1]\n", " [1175 1]\n", " [1176 1]\n", " [1177 0]\n", " [1178 0]\n", " [1179 0]\n", " [1180 0]\n", " [1181 0]\n", " [1182 0]\n", " [1183 1]\n", " [1184 0]\n", " [1185 0]\n", " [1186 0]\n", " [1187 0]\n", " [1188 1]\n", " [1189 0]\n", " [1190 0]\n", " [1191 0]\n", " [1192 0]\n", " [1193 0]\n", " [1194 0]\n", " [1195 0]\n", " [1196 1]\n", " [1197 1]\n", " [1198 0]\n", " [1199 0]\n", " [1200 0]\n", " [1201 0]\n", " [1202 0]\n", " [1203 0]\n", " [1204 0]\n", " [1205 1]\n", " [1206 1]\n", " [1207 1]\n", " [1208 0]\n", " [1209 0]\n", " [1210 0]\n", " [1211 0]\n", " [1212 0]\n", " [1213 0]\n", " [1214 0]\n", " [1215 0]\n", " [1216 1]\n", " [1217 0]\n", " [1218 1]\n", " [1219 0]\n", " [1220 0]\n", " [1221 0]\n", " [1222 1]\n", " [1223 1]\n", " [1224 0]\n", " [1225 1]\n", " [1226 0]\n", " [1227 0]\n", " [1228 0]\n", " [1229 0]\n", " [1230 0]\n", " [1231 0]\n", " [1232 0]\n", " [1233 0]\n", " [1234 0]\n", " [1235 1]\n", " [1236 0]\n", " [1237 1]\n", " [1238 0]\n", " [1239 1]\n", " [1240 0]\n", " [1241 1]\n", " [1242 1]\n", " [1243 0]\n", " [1244 0]\n", " [1245 0]\n", " [1246 1]\n", " [1247 0]\n", " [1248 1]\n", " [1249 0]\n", " [1250 0]\n", " [1251 0]\n", " [1252 0]\n", " [1253 1]\n", " [1254 1]\n", " [1255 0]\n", " [1256 1]\n", " [1257 0]\n", " [1258 0]\n", " [1259 1]\n", " [1260 1]\n", " [1261 0]\n", " [1262 0]\n", " [1263 1]\n", " [1264 0]\n", " [1265 0]\n", " [1266 1]\n", " [1267 1]\n", " [1268 0]\n", " [1269 0]\n", " [1270 0]\n", " [1271 0]\n", " [1272 0]\n", " [1273 0]\n", " [1274 1]\n", " [1275 1]\n", " [1276 0]\n", " [1277 1]\n", " [1278 0]\n", " [1279 0]\n", " [1280 0]\n", " [1281 0]\n", " 
[1282 1]\n", " [1283 1]\n", " [1284 0]\n", " [1285 0]\n", " [1286 0]\n", " [1287 1]\n", " [1288 0]\n", " [1289 1]\n", " [1290 0]\n", " [1291 0]\n", " [1292 1]\n", " [1293 0]\n", " [1294 1]\n", " [1295 1]\n", " [1296 0]\n", " [1297 0]\n", " [1298 0]\n", " [1299 0]\n", " [1300 1]\n", " [1301 1]\n", " [1302 1]\n", " [1303 1]\n", " [1304 1]\n", " [1305 0]\n", " [1306 1]\n", " [1307 0]\n", " [1308 0]\n", " [1309 0]]\n" ] } ], "source": [ "print(predictions)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }