{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "from sklearn.tree import DecisionTreeRegressor\n", "from sklearn.ensemble import RandomForestRegressor\n", "from math import sqrt\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn.metrics import mean_absolute_error" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# The first CSV column is an unnamed row counter (it loads as 'Unnamed: 0').\n", "# Read it as the index so it does not end up among the model features.\n", "train = pd.read_csv(os.path.join('data', 'clean_train.csv'), index_col=0)\n", "test = pd.read_csv(os.path.join('data', 'clean_test.csv'), index_col=0)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0PCA0PCA1PCA2PCA3PCA4PCA5PCA6PCA7PCA8...PCA164PCA165PCA166PCA167PCA168PCA169PCA170PCA171IdSalePrice
004.3451091.619386-0.739617-2.080179-0.9850881.999117-1.231870-0.1317821.316470...0.1607330.0713330.1554680.172801-0.169568-0.1443260.391713-0.0133571208500
110.019142-3.1069590.168223-0.5533410.9407120.200719-0.4689540.235082-0.838022...-1.063234-0.3345560.361166-1.218397-0.346191-0.962753-0.1388631.0831032181500
224.8511491.242811-0.351815-1.484957-0.7582002.181179-1.8439490.2961941.299142...0.0883340.2386240.3272800.325285-0.704900-0.036388-0.5405160.0217113223500
33-1.7716410.039500-1.3586231.920760-2.5508170.209519-0.7563870.700109-1.408543...-0.172186-0.5189220.231498-0.074296-0.034287-0.8777350.028065-0.3210094140000
446.4637471.0644730.2094720.448906-1.5553013.215822-0.946356-0.8052042.112526...-0.2701890.375297-0.396732-0.1090840.317305-0.145975-0.674692-0.3784585250000
\n", "

5 rows × 175 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 \\\n", "0 0 4.345109 1.619386 -0.739617 -2.080179 -0.985088 1.999117 \n", "1 1 0.019142 -3.106959 0.168223 -0.553341 0.940712 0.200719 \n", "2 2 4.851149 1.242811 -0.351815 -1.484957 -0.758200 2.181179 \n", "3 3 -1.771641 0.039500 -1.358623 1.920760 -2.550817 0.209519 \n", "4 4 6.463747 1.064473 0.209472 0.448906 -1.555301 3.215822 \n", "\n", " PCA6 PCA7 PCA8 ... PCA164 PCA165 PCA166 PCA167 \\\n", "0 -1.231870 -0.131782 1.316470 ... 0.160733 0.071333 0.155468 0.172801 \n", "1 -0.468954 0.235082 -0.838022 ... -1.063234 -0.334556 0.361166 -1.218397 \n", "2 -1.843949 0.296194 1.299142 ... 0.088334 0.238624 0.327280 0.325285 \n", "3 -0.756387 0.700109 -1.408543 ... -0.172186 -0.518922 0.231498 -0.074296 \n", "4 -0.946356 -0.805204 2.112526 ... -0.270189 0.375297 -0.396732 -0.109084 \n", "\n", " PCA168 PCA169 PCA170 PCA171 Id SalePrice \n", "0 -0.169568 -0.144326 0.391713 -0.013357 1 208500 \n", "1 -0.346191 -0.962753 -0.138863 1.083103 2 181500 \n", "2 -0.704900 -0.036388 -0.540516 0.021711 3 223500 \n", "3 -0.034287 -0.877735 0.028065 -0.321009 4 140000 \n", "4 0.317305 -0.145975 -0.674692 -0.378458 5 250000 \n", "\n", "[5 rows x 175 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0PCA0PCA1PCA2PCA3PCA4PCA5PCA6PCA7PCA8...PCA163PCA164PCA165PCA166PCA167PCA168PCA169PCA170PCA171Id
00-3.208086-2.987338-0.327066-1.6092060.016879-1.514939-0.417889-0.988173-0.653363...-0.0273640.653222-0.201973-0.769946-0.3448340.5142571.1141060.337765-0.6396171461
11-1.403753-4.2618510.1075270.9359810.165777-0.299485-0.524918-2.3321210.031044...3.8561170.7879960.2152210.4582751.1351090.3789720.953559-1.0082404.4454351462
222.2570020.427951-0.610464-1.301125-1.0583272.674177-1.500824-0.2239990.403440...-0.117138-0.378473-0.0316130.090593-0.173914-0.150098-0.0066120.190780-0.1524861463
333.2536180.537318-0.796079-0.851716-1.2096432.388795-1.340676-0.8763220.421183...-0.4415860.020066-0.1517090.4448260.008218-0.161705-0.4534820.4723520.0461411464
442.876409-0.075909-0.154959-2.4698701.4078200.4875320.0721902.4144461.667224...0.2690620.651172-0.050461-0.526448-0.8437010.574770-0.2278281.0714231.3626381465
\n", "

5 rows × 174 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 \\\n", "0 0 -3.208086 -2.987338 -0.327066 -1.609206 0.016879 -1.514939 \n", "1 1 -1.403753 -4.261851 0.107527 0.935981 0.165777 -0.299485 \n", "2 2 2.257002 0.427951 -0.610464 -1.301125 -1.058327 2.674177 \n", "3 3 3.253618 0.537318 -0.796079 -0.851716 -1.209643 2.388795 \n", "4 4 2.876409 -0.075909 -0.154959 -2.469870 1.407820 0.487532 \n", "\n", " PCA6 PCA7 PCA8 ... PCA163 PCA164 PCA165 PCA166 \\\n", "0 -0.417889 -0.988173 -0.653363 ... -0.027364 0.653222 -0.201973 -0.769946 \n", "1 -0.524918 -2.332121 0.031044 ... 3.856117 0.787996 0.215221 0.458275 \n", "2 -1.500824 -0.223999 0.403440 ... -0.117138 -0.378473 -0.031613 0.090593 \n", "3 -1.340676 -0.876322 0.421183 ... -0.441586 0.020066 -0.151709 0.444826 \n", "4 0.072190 2.414446 1.667224 ... 0.269062 0.651172 -0.050461 -0.526448 \n", "\n", " PCA167 PCA168 PCA169 PCA170 PCA171 Id \n", "0 -0.344834 0.514257 1.114106 0.337765 -0.639617 1461 \n", "1 1.135109 0.378972 0.953559 -1.008240 4.445435 1462 \n", "2 -0.173914 -0.150098 -0.006612 0.190780 -0.152486 1463 \n", "3 0.008218 -0.161705 -0.453482 0.472352 0.046141 1464 \n", "4 -0.843701 0.574770 -0.227828 1.071423 1.362638 1465 \n", "\n", "[5 rows x 174 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Set aside unnecessary features\n", "\n", "trainId = train['Id'].astype(int)\n", "testId = test['Id'].astype(int)\n", "\n", "# Bind the Id-free frames to new names instead of overwriting train/test,\n", "# so this cell can be re-run without a KeyError on the missing 'Id' column.\n", "train_features = train.drop(columns='Id')\n", "test_features = test.drop(columns='Id')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Features are every remaining column except the target; the test set has no target column\n", "X = train_features.drop(columns='SalePrice')\n", "y = train_features['SalePrice']\n", "X_test = test_features" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 208500\n", "1 181500\n", "2 223500\n", "3 
140000\n", "4 250000\n", " ... \n", "1455 175000\n", "1456 210000\n", "1457 266500\n", "1458 142125\n", "1459 147500\n", "Name: SalePrice, Length: 1460, dtype: int64\n", "(1460, 173)\n", "(1459, 173)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0PCA0PCA1PCA2PCA3PCA4PCA5PCA6PCA7PCA8...PCA162PCA163PCA164PCA165PCA166PCA167PCA168PCA169PCA170PCA171
004.3451091.619386-0.739617-2.080179-0.9850881.999117-1.231870-0.1317821.316470...-0.276936-0.1282600.1607330.0713330.1554680.172801-0.169568-0.1443260.391713-0.013357
110.019142-3.1069590.168223-0.5533410.9407120.200719-0.4689540.235082-0.838022...0.140974-0.224535-1.063234-0.3345560.361166-1.218397-0.346191-0.962753-0.1388631.083103
224.8511491.242811-0.351815-1.484957-0.7582002.181179-1.8439490.2961941.299142...-0.289024-0.2825630.0883340.2386240.3272800.325285-0.704900-0.036388-0.5405160.021711
33-1.7716410.039500-1.3586231.920760-2.5508170.209519-0.7563870.700109-1.408543...0.2867900.672251-0.172186-0.5189220.231498-0.074296-0.034287-0.8777350.028065-0.321009
446.4637471.0644730.2094720.448906-1.5553013.215822-0.946356-0.8052042.112526...-0.2355850.019570-0.2701890.375297-0.396732-0.1090840.317305-0.145975-0.674692-0.378458
\n", "

5 rows × 173 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 \\\n", "0 0 4.345109 1.619386 -0.739617 -2.080179 -0.985088 1.999117 \n", "1 1 0.019142 -3.106959 0.168223 -0.553341 0.940712 0.200719 \n", "2 2 4.851149 1.242811 -0.351815 -1.484957 -0.758200 2.181179 \n", "3 3 -1.771641 0.039500 -1.358623 1.920760 -2.550817 0.209519 \n", "4 4 6.463747 1.064473 0.209472 0.448906 -1.555301 3.215822 \n", "\n", " PCA6 PCA7 PCA8 ... PCA162 PCA163 PCA164 PCA165 \\\n", "0 -1.231870 -0.131782 1.316470 ... -0.276936 -0.128260 0.160733 0.071333 \n", "1 -0.468954 0.235082 -0.838022 ... 0.140974 -0.224535 -1.063234 -0.334556 \n", "2 -1.843949 0.296194 1.299142 ... -0.289024 -0.282563 0.088334 0.238624 \n", "3 -0.756387 0.700109 -1.408543 ... 0.286790 0.672251 -0.172186 -0.518922 \n", "4 -0.946356 -0.805204 2.112526 ... -0.235585 0.019570 -0.270189 0.375297 \n", "\n", " PCA166 PCA167 PCA168 PCA169 PCA170 PCA171 \n", "0 0.155468 0.172801 -0.169568 -0.144326 0.391713 -0.013357 \n", "1 0.361166 -1.218397 -0.346191 -0.962753 -0.138863 1.083103 \n", "2 0.327280 0.325285 -0.704900 -0.036388 -0.540516 0.021711 \n", "3 0.231498 -0.074296 -0.034287 -0.877735 0.028065 -0.321009 \n", "4 -0.396732 -0.109084 0.317305 -0.145975 -0.674692 -0.378458 \n", "\n", "[5 rows x 173 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(y)\n", "print(X.shape)\n", "print(X_test.shape)\n", "X.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Initialize our models\n", "\n", "# Seed both estimators so that repeated runs produce the same fitted models\n", "SEED = 42\n", "\n", "tree_model = DecisionTreeRegressor(random_state=SEED)\n", "# n_estimators is set explicitly because its default differs between\n", "# scikit-learn versions (10 before 0.22, 100 from 0.22 on)\n", "rf_model = RandomForestRegressor(n_estimators=100, random_state=SEED)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", " 
\"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" ] }, { "data": { "text/plain": [ "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n", " max_features='auto', max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, n_estimators=10,\n", " n_jobs=None, oob_score=False, random_state=None,\n", " verbose=0, warm_start=False)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit our models to the training data\n", "\n", "tree_model.fit(X, y)\n", "rf_model.fit(X, y)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Calculate error metrics for each model\n", "\n", "# Predict once per model and reuse the result for both metrics\n", "tree_train_pred = tree_model.predict(X)\n", "rf_train_pred = rf_model.predict(X)\n", "\n", "tree_mse = mean_squared_error(y, tree_train_pred)\n", "tree_mae = mean_absolute_error(y, tree_train_pred)\n", "rf_mse = mean_squared_error(y, rf_train_pred)\n", "rf_mae = mean_absolute_error(y, rf_train_pred)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Decision Tree training mse = 0.0 & mae = 0.0 & rmse = 0.0\n", "Random Forest training mse = 202420995.41813016 & mae = 8265.452260273973 & rmse = 14227.473261901783\n" ] } ], "source": [ "# These are errors on the rows the models were fitted on, not on held-out data.\n", "# The decision tree scores exactly 0.0 here, i.e. it reproduces the training targets,\n", "# so the numbers do not show how well either model generalizes.\n", "print(\"Decision Tree training mse = \",tree_mse,\" & mae = \",tree_mae,\" & rmse = \", sqrt(tree_mse))\n", "print(\"Random Forest training mse = \",rf_mse,\" & mae = \",rf_mae,\" & rmse = \", sqrt(rf_mse))" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# Setup and save Tree prediction\n", "\n", "# Predict on the DataFrame itself so the model receives the same columns it was fitted on;\n", "# pair each predicted price with the Id that was set aside earlier.\n", "tree_submission = pd.DataFrame({'Id': testId.values, 'SalePrice': tree_model.predict(X_test)})\n", "print(tree_submission.head())\n", "os.makedirs(\"submissions\", exist_ok=True)  # to_csv does not create missing directories\n", "tree_submission.to_csv(\"submissions/TreeSubmission.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# Setup and save Random Forest prediction\n", "\n", "# Same layout as the tree submission: one row per test Id with its predicted price\n", "rf_submission = pd.DataFrame({'Id': testId.values, 'SalePrice': rf_model.predict(X_test)})\n", "print(rf_submission.head())\n", "os.makedirs(\"submissions\", exist_ok=True)  # to_csv does not create missing directories\n", "rf_submission.to_csv(\"submissions/RFSubmission.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# Set up and save Ensemble prediction\n", "\n", "# Build a fresh frame: binding tree_submission to a second name would only alias it,\n", "# and writing the averaged column would then overwrite the tree predictions.\n", "ensemble_submission = pd.DataFrame({\n", "    'Id': tree_submission['Id'],\n", "    # Unweighted mean of the two models' predictions, row by row\n", "    'SalePrice': (tree_submission['SalePrice'] + rf_submission['SalePrice']) / 2,\n", "})" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdSalePrice
01461133457.50
11462163187.50
21463184355.00
31464190472.40
41465185546.15
\n", "
" ], "text/plain": [ " Id SalePrice\n", "0 1461 133457.50\n", "1 1462 163187.50\n", "2 1463 184355.00\n", "3 1464 190472.40\n", "4 1465 185546.15" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ensemble_submission.head()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "ensemble_submission.to_csv(\"submissions/EnsembleSubmission.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }