MLProjects/kaggle_house_prices/.ipynb_checkpoints/DecisionTrees-checkpoint.ipynb

995 lines
32 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from math import sqrt\n",
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.metrics import mean_absolute_error"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the cleaned train/test sets.\n",
"# The CSVs store their row index as the first, unnamed column (it shows up as\n",
"# 'Unnamed: 0' when read naively). Read it back as the index so the row number\n",
"# is not passed to the models as if it were a feature.\n",
"train = pd.read_csv(os.path.join('data', 'clean_train.csv'), index_col=0)\n",
"test = pd.read_csv(os.path.join('data', 'clean_test.csv'), index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>PCA0</th>\n",
" <th>PCA1</th>\n",
" <th>PCA2</th>\n",
" <th>PCA3</th>\n",
" <th>PCA4</th>\n",
" <th>PCA5</th>\n",
" <th>PCA6</th>\n",
" <th>PCA7</th>\n",
" <th>PCA8</th>\n",
" <th>...</th>\n",
" <th>PCA164</th>\n",
" <th>PCA165</th>\n",
" <th>PCA166</th>\n",
" <th>PCA167</th>\n",
" <th>PCA168</th>\n",
" <th>PCA169</th>\n",
" <th>PCA170</th>\n",
" <th>PCA171</th>\n",
" <th>Id</th>\n",
" <th>SalePrice</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4.345109</td>\n",
" <td>1.619386</td>\n",
" <td>-0.739617</td>\n",
" <td>-2.080179</td>\n",
" <td>-0.985088</td>\n",
" <td>1.999117</td>\n",
" <td>-1.231870</td>\n",
" <td>-0.131782</td>\n",
" <td>1.316470</td>\n",
" <td>...</td>\n",
" <td>0.160733</td>\n",
" <td>0.071333</td>\n",
" <td>0.155468</td>\n",
" <td>0.172801</td>\n",
" <td>-0.169568</td>\n",
" <td>-0.144326</td>\n",
" <td>0.391713</td>\n",
" <td>-0.013357</td>\n",
" <td>1</td>\n",
" <td>208500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0.019142</td>\n",
" <td>-3.106959</td>\n",
" <td>0.168223</td>\n",
" <td>-0.553341</td>\n",
" <td>0.940712</td>\n",
" <td>0.200719</td>\n",
" <td>-0.468954</td>\n",
" <td>0.235082</td>\n",
" <td>-0.838022</td>\n",
" <td>...</td>\n",
" <td>-1.063234</td>\n",
" <td>-0.334556</td>\n",
" <td>0.361166</td>\n",
" <td>-1.218397</td>\n",
" <td>-0.346191</td>\n",
" <td>-0.962753</td>\n",
" <td>-0.138863</td>\n",
" <td>1.083103</td>\n",
" <td>2</td>\n",
" <td>181500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>4.851149</td>\n",
" <td>1.242811</td>\n",
" <td>-0.351815</td>\n",
" <td>-1.484957</td>\n",
" <td>-0.758200</td>\n",
" <td>2.181179</td>\n",
" <td>-1.843949</td>\n",
" <td>0.296194</td>\n",
" <td>1.299142</td>\n",
" <td>...</td>\n",
" <td>0.088334</td>\n",
" <td>0.238624</td>\n",
" <td>0.327280</td>\n",
" <td>0.325285</td>\n",
" <td>-0.704900</td>\n",
" <td>-0.036388</td>\n",
" <td>-0.540516</td>\n",
" <td>0.021711</td>\n",
" <td>3</td>\n",
" <td>223500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>-1.771641</td>\n",
" <td>0.039500</td>\n",
" <td>-1.358623</td>\n",
" <td>1.920760</td>\n",
" <td>-2.550817</td>\n",
" <td>0.209519</td>\n",
" <td>-0.756387</td>\n",
" <td>0.700109</td>\n",
" <td>-1.408543</td>\n",
" <td>...</td>\n",
" <td>-0.172186</td>\n",
" <td>-0.518922</td>\n",
" <td>0.231498</td>\n",
" <td>-0.074296</td>\n",
" <td>-0.034287</td>\n",
" <td>-0.877735</td>\n",
" <td>0.028065</td>\n",
" <td>-0.321009</td>\n",
" <td>4</td>\n",
" <td>140000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>6.463747</td>\n",
" <td>1.064473</td>\n",
" <td>0.209472</td>\n",
" <td>0.448906</td>\n",
" <td>-1.555301</td>\n",
" <td>3.215822</td>\n",
" <td>-0.946356</td>\n",
" <td>-0.805204</td>\n",
" <td>2.112526</td>\n",
" <td>...</td>\n",
" <td>-0.270189</td>\n",
" <td>0.375297</td>\n",
" <td>-0.396732</td>\n",
" <td>-0.109084</td>\n",
" <td>0.317305</td>\n",
" <td>-0.145975</td>\n",
" <td>-0.674692</td>\n",
" <td>-0.378458</td>\n",
" <td>5</td>\n",
" <td>250000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 175 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 \\\n",
"0 0 4.345109 1.619386 -0.739617 -2.080179 -0.985088 1.999117 \n",
"1 1 0.019142 -3.106959 0.168223 -0.553341 0.940712 0.200719 \n",
"2 2 4.851149 1.242811 -0.351815 -1.484957 -0.758200 2.181179 \n",
"3 3 -1.771641 0.039500 -1.358623 1.920760 -2.550817 0.209519 \n",
"4 4 6.463747 1.064473 0.209472 0.448906 -1.555301 3.215822 \n",
"\n",
" PCA6 PCA7 PCA8 ... PCA164 PCA165 PCA166 PCA167 \\\n",
"0 -1.231870 -0.131782 1.316470 ... 0.160733 0.071333 0.155468 0.172801 \n",
"1 -0.468954 0.235082 -0.838022 ... -1.063234 -0.334556 0.361166 -1.218397 \n",
"2 -1.843949 0.296194 1.299142 ... 0.088334 0.238624 0.327280 0.325285 \n",
"3 -0.756387 0.700109 -1.408543 ... -0.172186 -0.518922 0.231498 -0.074296 \n",
"4 -0.946356 -0.805204 2.112526 ... -0.270189 0.375297 -0.396732 -0.109084 \n",
"\n",
" PCA168 PCA169 PCA170 PCA171 Id SalePrice \n",
"0 -0.169568 -0.144326 0.391713 -0.013357 1 208500 \n",
"1 -0.346191 -0.962753 -0.138863 1.083103 2 181500 \n",
"2 -0.704900 -0.036388 -0.540516 0.021711 3 223500 \n",
"3 -0.034287 -0.877735 0.028065 -0.321009 4 140000 \n",
"4 0.317305 -0.145975 -0.674692 -0.378458 5 250000 \n",
"\n",
"[5 rows x 175 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>PCA0</th>\n",
" <th>PCA1</th>\n",
" <th>PCA2</th>\n",
" <th>PCA3</th>\n",
" <th>PCA4</th>\n",
" <th>PCA5</th>\n",
" <th>PCA6</th>\n",
" <th>PCA7</th>\n",
" <th>PCA8</th>\n",
" <th>...</th>\n",
" <th>PCA163</th>\n",
" <th>PCA164</th>\n",
" <th>PCA165</th>\n",
" <th>PCA166</th>\n",
" <th>PCA167</th>\n",
" <th>PCA168</th>\n",
" <th>PCA169</th>\n",
" <th>PCA170</th>\n",
" <th>PCA171</th>\n",
" <th>Id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>-3.208086</td>\n",
" <td>-2.987338</td>\n",
" <td>-0.327066</td>\n",
" <td>-1.609206</td>\n",
" <td>0.016879</td>\n",
" <td>-1.514939</td>\n",
" <td>-0.417889</td>\n",
" <td>-0.988173</td>\n",
" <td>-0.653363</td>\n",
" <td>...</td>\n",
" <td>-0.027364</td>\n",
" <td>0.653222</td>\n",
" <td>-0.201973</td>\n",
" <td>-0.769946</td>\n",
" <td>-0.344834</td>\n",
" <td>0.514257</td>\n",
" <td>1.114106</td>\n",
" <td>0.337765</td>\n",
" <td>-0.639617</td>\n",
" <td>1461</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>-1.403753</td>\n",
" <td>-4.261851</td>\n",
" <td>0.107527</td>\n",
" <td>0.935981</td>\n",
" <td>0.165777</td>\n",
" <td>-0.299485</td>\n",
" <td>-0.524918</td>\n",
" <td>-2.332121</td>\n",
" <td>0.031044</td>\n",
" <td>...</td>\n",
" <td>3.856117</td>\n",
" <td>0.787996</td>\n",
" <td>0.215221</td>\n",
" <td>0.458275</td>\n",
" <td>1.135109</td>\n",
" <td>0.378972</td>\n",
" <td>0.953559</td>\n",
" <td>-1.008240</td>\n",
" <td>4.445435</td>\n",
" <td>1462</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2.257002</td>\n",
" <td>0.427951</td>\n",
" <td>-0.610464</td>\n",
" <td>-1.301125</td>\n",
" <td>-1.058327</td>\n",
" <td>2.674177</td>\n",
" <td>-1.500824</td>\n",
" <td>-0.223999</td>\n",
" <td>0.403440</td>\n",
" <td>...</td>\n",
" <td>-0.117138</td>\n",
" <td>-0.378473</td>\n",
" <td>-0.031613</td>\n",
" <td>0.090593</td>\n",
" <td>-0.173914</td>\n",
" <td>-0.150098</td>\n",
" <td>-0.006612</td>\n",
" <td>0.190780</td>\n",
" <td>-0.152486</td>\n",
" <td>1463</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3.253618</td>\n",
" <td>0.537318</td>\n",
" <td>-0.796079</td>\n",
" <td>-0.851716</td>\n",
" <td>-1.209643</td>\n",
" <td>2.388795</td>\n",
" <td>-1.340676</td>\n",
" <td>-0.876322</td>\n",
" <td>0.421183</td>\n",
" <td>...</td>\n",
" <td>-0.441586</td>\n",
" <td>0.020066</td>\n",
" <td>-0.151709</td>\n",
" <td>0.444826</td>\n",
" <td>0.008218</td>\n",
" <td>-0.161705</td>\n",
" <td>-0.453482</td>\n",
" <td>0.472352</td>\n",
" <td>0.046141</td>\n",
" <td>1464</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>2.876409</td>\n",
" <td>-0.075909</td>\n",
" <td>-0.154959</td>\n",
" <td>-2.469870</td>\n",
" <td>1.407820</td>\n",
" <td>0.487532</td>\n",
" <td>0.072190</td>\n",
" <td>2.414446</td>\n",
" <td>1.667224</td>\n",
" <td>...</td>\n",
" <td>0.269062</td>\n",
" <td>0.651172</td>\n",
" <td>-0.050461</td>\n",
" <td>-0.526448</td>\n",
" <td>-0.843701</td>\n",
" <td>0.574770</td>\n",
" <td>-0.227828</td>\n",
" <td>1.071423</td>\n",
" <td>1.362638</td>\n",
" <td>1465</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 174 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 \\\n",
"0 0 -3.208086 -2.987338 -0.327066 -1.609206 0.016879 -1.514939 \n",
"1 1 -1.403753 -4.261851 0.107527 0.935981 0.165777 -0.299485 \n",
"2 2 2.257002 0.427951 -0.610464 -1.301125 -1.058327 2.674177 \n",
"3 3 3.253618 0.537318 -0.796079 -0.851716 -1.209643 2.388795 \n",
"4 4 2.876409 -0.075909 -0.154959 -2.469870 1.407820 0.487532 \n",
"\n",
" PCA6 PCA7 PCA8 ... PCA163 PCA164 PCA165 PCA166 \\\n",
"0 -0.417889 -0.988173 -0.653363 ... -0.027364 0.653222 -0.201973 -0.769946 \n",
"1 -0.524918 -2.332121 0.031044 ... 3.856117 0.787996 0.215221 0.458275 \n",
"2 -1.500824 -0.223999 0.403440 ... -0.117138 -0.378473 -0.031613 0.090593 \n",
"3 -1.340676 -0.876322 0.421183 ... -0.441586 0.020066 -0.151709 0.444826 \n",
"4 0.072190 2.414446 1.667224 ... 0.269062 0.651172 -0.050461 -0.526448 \n",
"\n",
" PCA167 PCA168 PCA169 PCA170 PCA171 Id \n",
"0 -0.344834 0.514257 1.114106 0.337765 -0.639617 1461 \n",
"1 1.135109 0.378972 0.953559 -1.008240 4.445435 1462 \n",
"2 -0.173914 -0.150098 -0.006612 0.190780 -0.152486 1463 \n",
"3 0.008218 -0.161705 -0.453482 0.472352 0.046141 1464 \n",
"4 -0.843701 0.574770 -0.227828 1.071423 1.362638 1465 \n",
"\n",
"[5 rows x 174 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Keep the Id columns aside (as ints) and remove them from both frames;\n",
"# testId is needed later to label the submission rows.\n",
"\n",
"trainId, testId = train['Id'].astype(int), test['Id'].astype(int)\n",
"\n",
"train = train.drop(columns=['Id'])\n",
"test = test.drop(columns=['Id'])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Separate the target from the training features; the test frame is used as-is.\n",
"y = train['SalePrice']\n",
"X = train.drop(columns=['SalePrice'])\n",
"X_test = test"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 208500\n",
"1 181500\n",
"2 223500\n",
"3 140000\n",
"4 250000\n",
" ... \n",
"1455 175000\n",
"1456 210000\n",
"1457 266500\n",
"1458 142125\n",
"1459 147500\n",
"Name: SalePrice, Length: 1460, dtype: int64\n",
"(1460, 173)\n",
"(1459, 173)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>PCA0</th>\n",
" <th>PCA1</th>\n",
" <th>PCA2</th>\n",
" <th>PCA3</th>\n",
" <th>PCA4</th>\n",
" <th>PCA5</th>\n",
" <th>PCA6</th>\n",
" <th>PCA7</th>\n",
" <th>PCA8</th>\n",
" <th>...</th>\n",
" <th>PCA162</th>\n",
" <th>PCA163</th>\n",
" <th>PCA164</th>\n",
" <th>PCA165</th>\n",
" <th>PCA166</th>\n",
" <th>PCA167</th>\n",
" <th>PCA168</th>\n",
" <th>PCA169</th>\n",
" <th>PCA170</th>\n",
" <th>PCA171</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4.345109</td>\n",
" <td>1.619386</td>\n",
" <td>-0.739617</td>\n",
" <td>-2.080179</td>\n",
" <td>-0.985088</td>\n",
" <td>1.999117</td>\n",
" <td>-1.231870</td>\n",
" <td>-0.131782</td>\n",
" <td>1.316470</td>\n",
" <td>...</td>\n",
" <td>-0.276936</td>\n",
" <td>-0.128260</td>\n",
" <td>0.160733</td>\n",
" <td>0.071333</td>\n",
" <td>0.155468</td>\n",
" <td>0.172801</td>\n",
" <td>-0.169568</td>\n",
" <td>-0.144326</td>\n",
" <td>0.391713</td>\n",
" <td>-0.013357</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0.019142</td>\n",
" <td>-3.106959</td>\n",
" <td>0.168223</td>\n",
" <td>-0.553341</td>\n",
" <td>0.940712</td>\n",
" <td>0.200719</td>\n",
" <td>-0.468954</td>\n",
" <td>0.235082</td>\n",
" <td>-0.838022</td>\n",
" <td>...</td>\n",
" <td>0.140974</td>\n",
" <td>-0.224535</td>\n",
" <td>-1.063234</td>\n",
" <td>-0.334556</td>\n",
" <td>0.361166</td>\n",
" <td>-1.218397</td>\n",
" <td>-0.346191</td>\n",
" <td>-0.962753</td>\n",
" <td>-0.138863</td>\n",
" <td>1.083103</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>4.851149</td>\n",
" <td>1.242811</td>\n",
" <td>-0.351815</td>\n",
" <td>-1.484957</td>\n",
" <td>-0.758200</td>\n",
" <td>2.181179</td>\n",
" <td>-1.843949</td>\n",
" <td>0.296194</td>\n",
" <td>1.299142</td>\n",
" <td>...</td>\n",
" <td>-0.289024</td>\n",
" <td>-0.282563</td>\n",
" <td>0.088334</td>\n",
" <td>0.238624</td>\n",
" <td>0.327280</td>\n",
" <td>0.325285</td>\n",
" <td>-0.704900</td>\n",
" <td>-0.036388</td>\n",
" <td>-0.540516</td>\n",
" <td>0.021711</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>-1.771641</td>\n",
" <td>0.039500</td>\n",
" <td>-1.358623</td>\n",
" <td>1.920760</td>\n",
" <td>-2.550817</td>\n",
" <td>0.209519</td>\n",
" <td>-0.756387</td>\n",
" <td>0.700109</td>\n",
" <td>-1.408543</td>\n",
" <td>...</td>\n",
" <td>0.286790</td>\n",
" <td>0.672251</td>\n",
" <td>-0.172186</td>\n",
" <td>-0.518922</td>\n",
" <td>0.231498</td>\n",
" <td>-0.074296</td>\n",
" <td>-0.034287</td>\n",
" <td>-0.877735</td>\n",
" <td>0.028065</td>\n",
" <td>-0.321009</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>6.463747</td>\n",
" <td>1.064473</td>\n",
" <td>0.209472</td>\n",
" <td>0.448906</td>\n",
" <td>-1.555301</td>\n",
" <td>3.215822</td>\n",
" <td>-0.946356</td>\n",
" <td>-0.805204</td>\n",
" <td>2.112526</td>\n",
" <td>...</td>\n",
" <td>-0.235585</td>\n",
" <td>0.019570</td>\n",
" <td>-0.270189</td>\n",
" <td>0.375297</td>\n",
" <td>-0.396732</td>\n",
" <td>-0.109084</td>\n",
" <td>0.317305</td>\n",
" <td>-0.145975</td>\n",
" <td>-0.674692</td>\n",
" <td>-0.378458</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 173 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 \\\n",
"0 0 4.345109 1.619386 -0.739617 -2.080179 -0.985088 1.999117 \n",
"1 1 0.019142 -3.106959 0.168223 -0.553341 0.940712 0.200719 \n",
"2 2 4.851149 1.242811 -0.351815 -1.484957 -0.758200 2.181179 \n",
"3 3 -1.771641 0.039500 -1.358623 1.920760 -2.550817 0.209519 \n",
"4 4 6.463747 1.064473 0.209472 0.448906 -1.555301 3.215822 \n",
"\n",
" PCA6 PCA7 PCA8 ... PCA162 PCA163 PCA164 PCA165 \\\n",
"0 -1.231870 -0.131782 1.316470 ... -0.276936 -0.128260 0.160733 0.071333 \n",
"1 -0.468954 0.235082 -0.838022 ... 0.140974 -0.224535 -1.063234 -0.334556 \n",
"2 -1.843949 0.296194 1.299142 ... -0.289024 -0.282563 0.088334 0.238624 \n",
"3 -0.756387 0.700109 -1.408543 ... 0.286790 0.672251 -0.172186 -0.518922 \n",
"4 -0.946356 -0.805204 2.112526 ... -0.235585 0.019570 -0.270189 0.375297 \n",
"\n",
" PCA166 PCA167 PCA168 PCA169 PCA170 PCA171 \n",
"0 0.155468 0.172801 -0.169568 -0.144326 0.391713 -0.013357 \n",
"1 0.361166 -1.218397 -0.346191 -0.962753 -0.138863 1.083103 \n",
"2 0.327280 0.325285 -0.704900 -0.036388 -0.540516 0.021711 \n",
"3 0.231498 -0.074296 -0.034287 -0.877735 0.028065 -0.321009 \n",
"4 -0.396732 -0.109084 0.317305 -0.145975 -0.674692 -0.378458 \n",
"\n",
"[5 rows x 173 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(y)\n",
"print(X.shape)\n",
"print(X_test.shape)\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Initialize our models\n",
"\n",
"# Both estimators accept random_state; fixing it makes a fresh\n",
"# Restart & Run All reproduce the same fits and submissions.\n",
"SEED = 42\n",
"\n",
"tree_model = DecisionTreeRegressor(random_state=SEED)\n",
"\n",
"# n_estimators is set explicitly because its default depends on the installed\n",
"# scikit-learn release (10 before 0.22, 100 from 0.22 on, per the FutureWarning\n",
"# this notebook used to emit). 100 is the value newer releases use.\n",
"rf_model = RandomForestRegressor(n_estimators=100, random_state=SEED)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
" \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
]
},
{
"data": {
"text/plain": [
"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
" max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=10,\n",
" n_jobs=None, oob_score=False, random_state=None,\n",
" verbose=0, warm_start=False)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit our models to the training data\n",
"\n",
"tree_model.fit(X, y)\n",
"rf_model.fit(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Calculate error metrics for each model.\n",
"# NOTE: these are in-sample (training) errors. The models are scored on the same\n",
"# X, y they were fitted on, so the numbers do not measure performance on unseen data.\n",
"\n",
"# Predict once per model and reuse the result for both metrics.\n",
"tree_train_pred = tree_model.predict(X)\n",
"rf_train_pred = rf_model.predict(X)\n",
"\n",
"tree_mse = mean_squared_error(y, tree_train_pred)\n",
"tree_mae = mean_absolute_error(y, tree_train_pred)\n",
"rf_mse = mean_squared_error(y, rf_train_pred)\n",
"rf_mae = mean_absolute_error(y, rf_train_pred)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Decision Tree training mse = 0.0 & mae = 0.0 & rmse = 0.0\n",
"Random Forest training mse = 202420995.41813016 & mae = 8265.452260273973 & rmse = 14227.473261901783\n"
]
}
],
"source": [
"print(\"Decision Tree training mse = \",tree_mse,\" & mae = \",tree_mae,\" & rmse = \", sqrt(tree_mse))\n",
"print(\"Random Forest training mse = \",rf_mse,\" & mae = \",rf_mae,\" & rmse = \", sqrt(rf_mse))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of Id SalePrice\n",
"0 1461 125500.0\n",
"1 1462 149500.0\n",
"2 1463 185000.0\n",
"3 1464 201000.0\n",
"4 1465 176000.0\n",
"... ... ...\n",
"1454 2915 89000.0\n",
"1455 2916 80000.0\n",
"1456 2917 167900.0\n",
"1457 2918 135000.0\n",
"1458 2919 181000.0\n",
"\n",
"[1459 rows x 2 columns]>\n"
]
}
],
"source": [
"# Setup and save Tree prediction\n",
"\n",
"test_pred = tree_model.predict(X_test.values) # Note that we need to feed our model the values or our dataframe X_test\n",
"tree_predictions = np.c_[testId, test_pred] # Note that we take the argmax over the collumns to use our softmax output\n",
"tree_submission = pd.DataFrame(tree_predictions, columns = ['Id', 'SalePrice'])\n",
"tree_submission['Id'] = tree_submission['Id'].astype(int)\n",
"print(tree_submission.head)\n",
"tree_submission.to_csv(\"submissions/TreeSubmission.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of Id SalePrice\n",
"0 1461 136110.0\n",
"1 1462 167750.0\n",
"2 1463 184140.0\n",
"3 1464 186963.2\n",
"4 1465 188728.2\n",
"... ... ...\n",
"1454 2915 95030.0\n",
"1455 2916 99045.9\n",
"1456 2917 168480.0\n",
"1457 2918 114850.0\n",
"1458 2919 196190.0\n",
"\n",
"[1459 rows x 2 columns]>\n"
]
}
],
"source": [
"# Setup and save Random Forest prediction\n",
"\n",
"test_pred = rf_model.predict(X_test.values) # Note that we need to feed our model the values or our dataframe X_test\n",
"rf_predictions = np.c_[testId, test_pred] # Note that we take the argmax over the collumns to use our softmax output\n",
"rf_submission = pd.DataFrame(rf_predictions, columns = ['Id', 'SalePrice'])\n",
"rf_submission['Id'] = rf_submission['Id'].astype(int)\n",
"print(rf_submission.head)\n",
"rf_submission.to_csv(\"submissions/RFSubmission.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Set up Ensemble prediction: the row-wise mean of the Tree and Forest predictions.\n",
"#\n",
"# A fresh frame is built here on purpose. Assigning tree_submission to\n",
"# ensemble_submission only creates an alias, so writing the averaged SalePrice\n",
"# overwrote tree_submission as well; re-running the cell then averaged the\n",
"# already-averaged price with the Forest price again and skewed the result.\n",
"# This version leaves tree_submission and rf_submission untouched and gives\n",
"# the same answer no matter how often it is run.\n",
"\n",
"ensemble_price = pd.concat(\n",
"    [tree_submission['SalePrice'], rf_submission['SalePrice']], axis=1\n",
").mean(axis=1)\n",
"\n",
"ensemble_submission = pd.DataFrame({\n",
"    'Id': tree_submission['Id'],\n",
"    'SalePrice': ensemble_price,\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>SalePrice</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1461</td>\n",
" <td>133457.50</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1462</td>\n",
" <td>163187.50</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>1463</td>\n",
" <td>184355.00</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>1464</td>\n",
" <td>190472.40</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>1465</td>\n",
" <td>185546.15</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Id SalePrice\n",
"0 1461 133457.50\n",
"1 1462 163187.50\n",
"2 1463 184355.00\n",
"3 1464 190472.40\n",
"4 1465 185546.15"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ensemble_submission.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Write the ensemble submission to disk without the row index\n",
"ensemble_submission.to_csv(path_or_buf=\"submissions/EnsembleSubmission.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}