995 lines
32 KiB
Plaintext
995 lines
32 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"import os\n",
|
||
"from sklearn.tree import DecisionTreeRegressor\n",
|
||
"from sklearn.ensemble import RandomForestRegressor\n",
|
||
"from math import sqrt\n",
|
||
"from sklearn.metrics import mean_squared_error\n",
|
||
"from sklearn.metrics import mean_absolute_error"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"train = pd.read_csv(os.path.join('data', 'clean_train.csv'))\n",
|
||
"test = pd.read_csv(os.path.join('data', 'clean_test.csv'))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Unnamed: 0</th>\n",
|
||
" <th>PCA0</th>\n",
|
||
" <th>PCA1</th>\n",
|
||
" <th>PCA2</th>\n",
|
||
" <th>PCA3</th>\n",
|
||
" <th>PCA4</th>\n",
|
||
" <th>PCA5</th>\n",
|
||
" <th>PCA6</th>\n",
|
||
" <th>PCA7</th>\n",
|
||
" <th>PCA8</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>PCA164</th>\n",
|
||
" <th>PCA165</th>\n",
|
||
" <th>PCA166</th>\n",
|
||
" <th>PCA167</th>\n",
|
||
" <th>PCA168</th>\n",
|
||
" <th>PCA169</th>\n",
|
||
" <th>PCA170</th>\n",
|
||
" <th>PCA171</th>\n",
|
||
" <th>Id</th>\n",
|
||
" <th>SalePrice</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>4.345109</td>\n",
|
||
" <td>1.619386</td>\n",
|
||
" <td>-0.739617</td>\n",
|
||
" <td>-2.080179</td>\n",
|
||
" <td>-0.985088</td>\n",
|
||
" <td>1.999117</td>\n",
|
||
" <td>-1.231870</td>\n",
|
||
" <td>-0.131782</td>\n",
|
||
" <td>1.316470</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.160733</td>\n",
|
||
" <td>0.071333</td>\n",
|
||
" <td>0.155468</td>\n",
|
||
" <td>0.172801</td>\n",
|
||
" <td>-0.169568</td>\n",
|
||
" <td>-0.144326</td>\n",
|
||
" <td>0.391713</td>\n",
|
||
" <td>-0.013357</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>208500</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0.019142</td>\n",
|
||
" <td>-3.106959</td>\n",
|
||
" <td>0.168223</td>\n",
|
||
" <td>-0.553341</td>\n",
|
||
" <td>0.940712</td>\n",
|
||
" <td>0.200719</td>\n",
|
||
" <td>-0.468954</td>\n",
|
||
" <td>0.235082</td>\n",
|
||
" <td>-0.838022</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-1.063234</td>\n",
|
||
" <td>-0.334556</td>\n",
|
||
" <td>0.361166</td>\n",
|
||
" <td>-1.218397</td>\n",
|
||
" <td>-0.346191</td>\n",
|
||
" <td>-0.962753</td>\n",
|
||
" <td>-0.138863</td>\n",
|
||
" <td>1.083103</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>181500</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>4.851149</td>\n",
|
||
" <td>1.242811</td>\n",
|
||
" <td>-0.351815</td>\n",
|
||
" <td>-1.484957</td>\n",
|
||
" <td>-0.758200</td>\n",
|
||
" <td>2.181179</td>\n",
|
||
" <td>-1.843949</td>\n",
|
||
" <td>0.296194</td>\n",
|
||
" <td>1.299142</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.088334</td>\n",
|
||
" <td>0.238624</td>\n",
|
||
" <td>0.327280</td>\n",
|
||
" <td>0.325285</td>\n",
|
||
" <td>-0.704900</td>\n",
|
||
" <td>-0.036388</td>\n",
|
||
" <td>-0.540516</td>\n",
|
||
" <td>0.021711</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>223500</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>-1.771641</td>\n",
|
||
" <td>0.039500</td>\n",
|
||
" <td>-1.358623</td>\n",
|
||
" <td>1.920760</td>\n",
|
||
" <td>-2.550817</td>\n",
|
||
" <td>0.209519</td>\n",
|
||
" <td>-0.756387</td>\n",
|
||
" <td>0.700109</td>\n",
|
||
" <td>-1.408543</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.172186</td>\n",
|
||
" <td>-0.518922</td>\n",
|
||
" <td>0.231498</td>\n",
|
||
" <td>-0.074296</td>\n",
|
||
" <td>-0.034287</td>\n",
|
||
" <td>-0.877735</td>\n",
|
||
" <td>0.028065</td>\n",
|
||
" <td>-0.321009</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>140000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>6.463747</td>\n",
|
||
" <td>1.064473</td>\n",
|
||
" <td>0.209472</td>\n",
|
||
" <td>0.448906</td>\n",
|
||
" <td>-1.555301</td>\n",
|
||
" <td>3.215822</td>\n",
|
||
" <td>-0.946356</td>\n",
|
||
" <td>-0.805204</td>\n",
|
||
" <td>2.112526</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.270189</td>\n",
|
||
" <td>0.375297</td>\n",
|
||
" <td>-0.396732</td>\n",
|
||
" <td>-0.109084</td>\n",
|
||
" <td>0.317305</td>\n",
|
||
" <td>-0.145975</td>\n",
|
||
" <td>-0.674692</td>\n",
|
||
" <td>-0.378458</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>250000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 175 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Unnamed: 0 PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 \\\n",
|
||
"0 0 4.345109 1.619386 -0.739617 -2.080179 -0.985088 1.999117 \n",
|
||
"1 1 0.019142 -3.106959 0.168223 -0.553341 0.940712 0.200719 \n",
|
||
"2 2 4.851149 1.242811 -0.351815 -1.484957 -0.758200 2.181179 \n",
|
||
"3 3 -1.771641 0.039500 -1.358623 1.920760 -2.550817 0.209519 \n",
|
||
"4 4 6.463747 1.064473 0.209472 0.448906 -1.555301 3.215822 \n",
|
||
"\n",
|
||
" PCA6 PCA7 PCA8 ... PCA164 PCA165 PCA166 PCA167 \\\n",
|
||
"0 -1.231870 -0.131782 1.316470 ... 0.160733 0.071333 0.155468 0.172801 \n",
|
||
"1 -0.468954 0.235082 -0.838022 ... -1.063234 -0.334556 0.361166 -1.218397 \n",
|
||
"2 -1.843949 0.296194 1.299142 ... 0.088334 0.238624 0.327280 0.325285 \n",
|
||
"3 -0.756387 0.700109 -1.408543 ... -0.172186 -0.518922 0.231498 -0.074296 \n",
|
||
"4 -0.946356 -0.805204 2.112526 ... -0.270189 0.375297 -0.396732 -0.109084 \n",
|
||
"\n",
|
||
" PCA168 PCA169 PCA170 PCA171 Id SalePrice \n",
|
||
"0 -0.169568 -0.144326 0.391713 -0.013357 1 208500 \n",
|
||
"1 -0.346191 -0.962753 -0.138863 1.083103 2 181500 \n",
|
||
"2 -0.704900 -0.036388 -0.540516 0.021711 3 223500 \n",
|
||
"3 -0.034287 -0.877735 0.028065 -0.321009 4 140000 \n",
|
||
"4 0.317305 -0.145975 -0.674692 -0.378458 5 250000 \n",
|
||
"\n",
|
||
"[5 rows x 175 columns]"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"train.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Unnamed: 0</th>\n",
|
||
" <th>PCA0</th>\n",
|
||
" <th>PCA1</th>\n",
|
||
" <th>PCA2</th>\n",
|
||
" <th>PCA3</th>\n",
|
||
" <th>PCA4</th>\n",
|
||
" <th>PCA5</th>\n",
|
||
" <th>PCA6</th>\n",
|
||
" <th>PCA7</th>\n",
|
||
" <th>PCA8</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>PCA163</th>\n",
|
||
" <th>PCA164</th>\n",
|
||
" <th>PCA165</th>\n",
|
||
" <th>PCA166</th>\n",
|
||
" <th>PCA167</th>\n",
|
||
" <th>PCA168</th>\n",
|
||
" <th>PCA169</th>\n",
|
||
" <th>PCA170</th>\n",
|
||
" <th>PCA171</th>\n",
|
||
" <th>Id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>-3.208086</td>\n",
|
||
" <td>-2.987338</td>\n",
|
||
" <td>-0.327066</td>\n",
|
||
" <td>-1.609206</td>\n",
|
||
" <td>0.016879</td>\n",
|
||
" <td>-1.514939</td>\n",
|
||
" <td>-0.417889</td>\n",
|
||
" <td>-0.988173</td>\n",
|
||
" <td>-0.653363</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.027364</td>\n",
|
||
" <td>0.653222</td>\n",
|
||
" <td>-0.201973</td>\n",
|
||
" <td>-0.769946</td>\n",
|
||
" <td>-0.344834</td>\n",
|
||
" <td>0.514257</td>\n",
|
||
" <td>1.114106</td>\n",
|
||
" <td>0.337765</td>\n",
|
||
" <td>-0.639617</td>\n",
|
||
" <td>1461</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>-1.403753</td>\n",
|
||
" <td>-4.261851</td>\n",
|
||
" <td>0.107527</td>\n",
|
||
" <td>0.935981</td>\n",
|
||
" <td>0.165777</td>\n",
|
||
" <td>-0.299485</td>\n",
|
||
" <td>-0.524918</td>\n",
|
||
" <td>-2.332121</td>\n",
|
||
" <td>0.031044</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>3.856117</td>\n",
|
||
" <td>0.787996</td>\n",
|
||
" <td>0.215221</td>\n",
|
||
" <td>0.458275</td>\n",
|
||
" <td>1.135109</td>\n",
|
||
" <td>0.378972</td>\n",
|
||
" <td>0.953559</td>\n",
|
||
" <td>-1.008240</td>\n",
|
||
" <td>4.445435</td>\n",
|
||
" <td>1462</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2.257002</td>\n",
|
||
" <td>0.427951</td>\n",
|
||
" <td>-0.610464</td>\n",
|
||
" <td>-1.301125</td>\n",
|
||
" <td>-1.058327</td>\n",
|
||
" <td>2.674177</td>\n",
|
||
" <td>-1.500824</td>\n",
|
||
" <td>-0.223999</td>\n",
|
||
" <td>0.403440</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.117138</td>\n",
|
||
" <td>-0.378473</td>\n",
|
||
" <td>-0.031613</td>\n",
|
||
" <td>0.090593</td>\n",
|
||
" <td>-0.173914</td>\n",
|
||
" <td>-0.150098</td>\n",
|
||
" <td>-0.006612</td>\n",
|
||
" <td>0.190780</td>\n",
|
||
" <td>-0.152486</td>\n",
|
||
" <td>1463</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3.253618</td>\n",
|
||
" <td>0.537318</td>\n",
|
||
" <td>-0.796079</td>\n",
|
||
" <td>-0.851716</td>\n",
|
||
" <td>-1.209643</td>\n",
|
||
" <td>2.388795</td>\n",
|
||
" <td>-1.340676</td>\n",
|
||
" <td>-0.876322</td>\n",
|
||
" <td>0.421183</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.441586</td>\n",
|
||
" <td>0.020066</td>\n",
|
||
" <td>-0.151709</td>\n",
|
||
" <td>0.444826</td>\n",
|
||
" <td>0.008218</td>\n",
|
||
" <td>-0.161705</td>\n",
|
||
" <td>-0.453482</td>\n",
|
||
" <td>0.472352</td>\n",
|
||
" <td>0.046141</td>\n",
|
||
" <td>1464</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.876409</td>\n",
|
||
" <td>-0.075909</td>\n",
|
||
" <td>-0.154959</td>\n",
|
||
" <td>-2.469870</td>\n",
|
||
" <td>1.407820</td>\n",
|
||
" <td>0.487532</td>\n",
|
||
" <td>0.072190</td>\n",
|
||
" <td>2.414446</td>\n",
|
||
" <td>1.667224</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.269062</td>\n",
|
||
" <td>0.651172</td>\n",
|
||
" <td>-0.050461</td>\n",
|
||
" <td>-0.526448</td>\n",
|
||
" <td>-0.843701</td>\n",
|
||
" <td>0.574770</td>\n",
|
||
" <td>-0.227828</td>\n",
|
||
" <td>1.071423</td>\n",
|
||
" <td>1.362638</td>\n",
|
||
" <td>1465</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 174 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Unnamed: 0 PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 \\\n",
|
||
"0 0 -3.208086 -2.987338 -0.327066 -1.609206 0.016879 -1.514939 \n",
|
||
"1 1 -1.403753 -4.261851 0.107527 0.935981 0.165777 -0.299485 \n",
|
||
"2 2 2.257002 0.427951 -0.610464 -1.301125 -1.058327 2.674177 \n",
|
||
"3 3 3.253618 0.537318 -0.796079 -0.851716 -1.209643 2.388795 \n",
|
||
"4 4 2.876409 -0.075909 -0.154959 -2.469870 1.407820 0.487532 \n",
|
||
"\n",
|
||
" PCA6 PCA7 PCA8 ... PCA163 PCA164 PCA165 PCA166 \\\n",
|
||
"0 -0.417889 -0.988173 -0.653363 ... -0.027364 0.653222 -0.201973 -0.769946 \n",
|
||
"1 -0.524918 -2.332121 0.031044 ... 3.856117 0.787996 0.215221 0.458275 \n",
|
||
"2 -1.500824 -0.223999 0.403440 ... -0.117138 -0.378473 -0.031613 0.090593 \n",
|
||
"3 -1.340676 -0.876322 0.421183 ... -0.441586 0.020066 -0.151709 0.444826 \n",
|
||
"4 0.072190 2.414446 1.667224 ... 0.269062 0.651172 -0.050461 -0.526448 \n",
|
||
"\n",
|
||
" PCA167 PCA168 PCA169 PCA170 PCA171 Id \n",
|
||
"0 -0.344834 0.514257 1.114106 0.337765 -0.639617 1461 \n",
|
||
"1 1.135109 0.378972 0.953559 -1.008240 4.445435 1462 \n",
|
||
"2 -0.173914 -0.150098 -0.006612 0.190780 -0.152486 1463 \n",
|
||
"3 0.008218 -0.161705 -0.453482 0.472352 0.046141 1464 \n",
|
||
"4 -0.843701 0.574770 -0.227828 1.071423 1.362638 1465 \n",
|
||
"\n",
|
||
"[5 rows x 174 columns]"
|
||
]
|
||
},
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"test.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Set aside unnecessary features\n",
|
||
"\n",
|
||
"trainId = train['Id'].astype(int)\n",
|
||
"testId = test['Id'].astype(int)\n",
|
||
"\n",
|
||
"train = train.drop('Id', axis=1)\n",
|
||
"test = test.drop('Id', axis=1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"X = train.drop('SalePrice', axis=1)\n",
|
||
"y = train['SalePrice']\n",
|
||
"X_test = test"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"0 208500\n",
|
||
"1 181500\n",
|
||
"2 223500\n",
|
||
"3 140000\n",
|
||
"4 250000\n",
|
||
" ... \n",
|
||
"1455 175000\n",
|
||
"1456 210000\n",
|
||
"1457 266500\n",
|
||
"1458 142125\n",
|
||
"1459 147500\n",
|
||
"Name: SalePrice, Length: 1460, dtype: int64\n",
|
||
"(1460, 173)\n",
|
||
"(1459, 173)\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Unnamed: 0</th>\n",
|
||
" <th>PCA0</th>\n",
|
||
" <th>PCA1</th>\n",
|
||
" <th>PCA2</th>\n",
|
||
" <th>PCA3</th>\n",
|
||
" <th>PCA4</th>\n",
|
||
" <th>PCA5</th>\n",
|
||
" <th>PCA6</th>\n",
|
||
" <th>PCA7</th>\n",
|
||
" <th>PCA8</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>PCA162</th>\n",
|
||
" <th>PCA163</th>\n",
|
||
" <th>PCA164</th>\n",
|
||
" <th>PCA165</th>\n",
|
||
" <th>PCA166</th>\n",
|
||
" <th>PCA167</th>\n",
|
||
" <th>PCA168</th>\n",
|
||
" <th>PCA169</th>\n",
|
||
" <th>PCA170</th>\n",
|
||
" <th>PCA171</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>4.345109</td>\n",
|
||
" <td>1.619386</td>\n",
|
||
" <td>-0.739617</td>\n",
|
||
" <td>-2.080179</td>\n",
|
||
" <td>-0.985088</td>\n",
|
||
" <td>1.999117</td>\n",
|
||
" <td>-1.231870</td>\n",
|
||
" <td>-0.131782</td>\n",
|
||
" <td>1.316470</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.276936</td>\n",
|
||
" <td>-0.128260</td>\n",
|
||
" <td>0.160733</td>\n",
|
||
" <td>0.071333</td>\n",
|
||
" <td>0.155468</td>\n",
|
||
" <td>0.172801</td>\n",
|
||
" <td>-0.169568</td>\n",
|
||
" <td>-0.144326</td>\n",
|
||
" <td>0.391713</td>\n",
|
||
" <td>-0.013357</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0.019142</td>\n",
|
||
" <td>-3.106959</td>\n",
|
||
" <td>0.168223</td>\n",
|
||
" <td>-0.553341</td>\n",
|
||
" <td>0.940712</td>\n",
|
||
" <td>0.200719</td>\n",
|
||
" <td>-0.468954</td>\n",
|
||
" <td>0.235082</td>\n",
|
||
" <td>-0.838022</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.140974</td>\n",
|
||
" <td>-0.224535</td>\n",
|
||
" <td>-1.063234</td>\n",
|
||
" <td>-0.334556</td>\n",
|
||
" <td>0.361166</td>\n",
|
||
" <td>-1.218397</td>\n",
|
||
" <td>-0.346191</td>\n",
|
||
" <td>-0.962753</td>\n",
|
||
" <td>-0.138863</td>\n",
|
||
" <td>1.083103</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>4.851149</td>\n",
|
||
" <td>1.242811</td>\n",
|
||
" <td>-0.351815</td>\n",
|
||
" <td>-1.484957</td>\n",
|
||
" <td>-0.758200</td>\n",
|
||
" <td>2.181179</td>\n",
|
||
" <td>-1.843949</td>\n",
|
||
" <td>0.296194</td>\n",
|
||
" <td>1.299142</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.289024</td>\n",
|
||
" <td>-0.282563</td>\n",
|
||
" <td>0.088334</td>\n",
|
||
" <td>0.238624</td>\n",
|
||
" <td>0.327280</td>\n",
|
||
" <td>0.325285</td>\n",
|
||
" <td>-0.704900</td>\n",
|
||
" <td>-0.036388</td>\n",
|
||
" <td>-0.540516</td>\n",
|
||
" <td>0.021711</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>-1.771641</td>\n",
|
||
" <td>0.039500</td>\n",
|
||
" <td>-1.358623</td>\n",
|
||
" <td>1.920760</td>\n",
|
||
" <td>-2.550817</td>\n",
|
||
" <td>0.209519</td>\n",
|
||
" <td>-0.756387</td>\n",
|
||
" <td>0.700109</td>\n",
|
||
" <td>-1.408543</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.286790</td>\n",
|
||
" <td>0.672251</td>\n",
|
||
" <td>-0.172186</td>\n",
|
||
" <td>-0.518922</td>\n",
|
||
" <td>0.231498</td>\n",
|
||
" <td>-0.074296</td>\n",
|
||
" <td>-0.034287</td>\n",
|
||
" <td>-0.877735</td>\n",
|
||
" <td>0.028065</td>\n",
|
||
" <td>-0.321009</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>6.463747</td>\n",
|
||
" <td>1.064473</td>\n",
|
||
" <td>0.209472</td>\n",
|
||
" <td>0.448906</td>\n",
|
||
" <td>-1.555301</td>\n",
|
||
" <td>3.215822</td>\n",
|
||
" <td>-0.946356</td>\n",
|
||
" <td>-0.805204</td>\n",
|
||
" <td>2.112526</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.235585</td>\n",
|
||
" <td>0.019570</td>\n",
|
||
" <td>-0.270189</td>\n",
|
||
" <td>0.375297</td>\n",
|
||
" <td>-0.396732</td>\n",
|
||
" <td>-0.109084</td>\n",
|
||
" <td>0.317305</td>\n",
|
||
" <td>-0.145975</td>\n",
|
||
" <td>-0.674692</td>\n",
|
||
" <td>-0.378458</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 173 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Unnamed: 0 PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 \\\n",
|
||
"0 0 4.345109 1.619386 -0.739617 -2.080179 -0.985088 1.999117 \n",
|
||
"1 1 0.019142 -3.106959 0.168223 -0.553341 0.940712 0.200719 \n",
|
||
"2 2 4.851149 1.242811 -0.351815 -1.484957 -0.758200 2.181179 \n",
|
||
"3 3 -1.771641 0.039500 -1.358623 1.920760 -2.550817 0.209519 \n",
|
||
"4 4 6.463747 1.064473 0.209472 0.448906 -1.555301 3.215822 \n",
|
||
"\n",
|
||
" PCA6 PCA7 PCA8 ... PCA162 PCA163 PCA164 PCA165 \\\n",
|
||
"0 -1.231870 -0.131782 1.316470 ... -0.276936 -0.128260 0.160733 0.071333 \n",
|
||
"1 -0.468954 0.235082 -0.838022 ... 0.140974 -0.224535 -1.063234 -0.334556 \n",
|
||
"2 -1.843949 0.296194 1.299142 ... -0.289024 -0.282563 0.088334 0.238624 \n",
|
||
"3 -0.756387 0.700109 -1.408543 ... 0.286790 0.672251 -0.172186 -0.518922 \n",
|
||
"4 -0.946356 -0.805204 2.112526 ... -0.235585 0.019570 -0.270189 0.375297 \n",
|
||
"\n",
|
||
" PCA166 PCA167 PCA168 PCA169 PCA170 PCA171 \n",
|
||
"0 0.155468 0.172801 -0.169568 -0.144326 0.391713 -0.013357 \n",
|
||
"1 0.361166 -1.218397 -0.346191 -0.962753 -0.138863 1.083103 \n",
|
||
"2 0.327280 0.325285 -0.704900 -0.036388 -0.540516 0.021711 \n",
|
||
"3 0.231498 -0.074296 -0.034287 -0.877735 0.028065 -0.321009 \n",
|
||
"4 -0.396732 -0.109084 0.317305 -0.145975 -0.674692 -0.378458 \n",
|
||
"\n",
|
||
"[5 rows x 173 columns]"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"print(y)\n",
|
||
"print(X.shape)\n",
|
||
"print(X_test.shape)\n",
|
||
"X.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Initialize our models\n",
|
||
"\n",
|
||
"tree_model = DecisionTreeRegressor()\n",
|
||
"rf_model = RandomForestRegressor()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
|
||
" \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
|
||
" max_features='auto', max_leaf_nodes=None,\n",
|
||
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
|
||
" min_samples_leaf=1, min_samples_split=2,\n",
|
||
" min_weight_fraction_leaf=0.0, n_estimators=10,\n",
|
||
" n_jobs=None, oob_score=False, random_state=None,\n",
|
||
" verbose=0, warm_start=False)"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Fit our models to the training data\n",
|
||
"\n",
|
||
"tree_model.fit(X, y)\n",
|
||
"rf_model.fit(X, y)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Calculate error metrics for each model\n",
|
||
"\n",
|
||
"tree_mse = mean_squared_error(y, tree_model.predict(X))\n",
|
||
"tree_mae = mean_absolute_error(y, tree_model.predict(X))\n",
|
||
"rf_mse = mean_squared_error(y, rf_model.predict(X))\n",
|
||
"rf_mae = mean_absolute_error(y, rf_model.predict(X))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Decision Tree training mse = 0.0 & mae = 0.0 & rmse = 0.0\n",
|
||
"Random Forest training mse = 202420995.41813016 & mae = 8265.452260273973 & rmse = 14227.473261901783\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(\"Decision Tree training mse = \",tree_mse,\" & mae = \",tree_mae,\" & rmse = \", sqrt(tree_mse))\n",
|
||
"print(\"Random Forest training mse = \",rf_mse,\" & mae = \",rf_mae,\" & rmse = \", sqrt(rf_mse))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<bound method NDFrame.head of Id SalePrice\n",
|
||
"0 1461 125500.0\n",
|
||
"1 1462 149500.0\n",
|
||
"2 1463 185000.0\n",
|
||
"3 1464 201000.0\n",
|
||
"4 1465 176000.0\n",
|
||
"... ... ...\n",
|
||
"1454 2915 89000.0\n",
|
||
"1455 2916 80000.0\n",
|
||
"1456 2917 167900.0\n",
|
||
"1457 2918 135000.0\n",
|
||
"1458 2919 181000.0\n",
|
||
"\n",
|
||
"[1459 rows x 2 columns]>\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Setup and save Tree prediction\n",
|
||
"\n",
|
||
"test_pred = tree_model.predict(X_test.values) # Note that we need to feed our model the values or our dataframe X_test\n",
|
||
"tree_predictions = np.c_[testId, test_pred] # Note that we take the argmax over the collumns to use our softmax output\n",
|
||
"tree_submission = pd.DataFrame(tree_predictions, columns = ['Id', 'SalePrice'])\n",
|
||
"tree_submission['Id'] = tree_submission['Id'].astype(int)\n",
|
||
"print(tree_submission.head)\n",
|
||
"tree_submission.to_csv(\"submissions/TreeSubmission.csv\", index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<bound method NDFrame.head of Id SalePrice\n",
|
||
"0 1461 136110.0\n",
|
||
"1 1462 167750.0\n",
|
||
"2 1463 184140.0\n",
|
||
"3 1464 186963.2\n",
|
||
"4 1465 188728.2\n",
|
||
"... ... ...\n",
|
||
"1454 2915 95030.0\n",
|
||
"1455 2916 99045.9\n",
|
||
"1456 2917 168480.0\n",
|
||
"1457 2918 114850.0\n",
|
||
"1458 2919 196190.0\n",
|
||
"\n",
|
||
"[1459 rows x 2 columns]>\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Setup and save Random Forest prediction\n",
|
||
"\n",
|
||
"test_pred = rf_model.predict(X_test.values) # Note that we need to feed our model the values or our dataframe X_test\n",
|
||
"rf_predictions = np.c_[testId, test_pred] # Note that we take the argmax over the collumns to use our softmax output\n",
|
||
"rf_submission = pd.DataFrame(rf_predictions, columns = ['Id', 'SalePrice'])\n",
|
||
"rf_submission['Id'] = rf_submission['Id'].astype(int)\n",
|
||
"print(rf_submission.head)\n",
|
||
"rf_submission.to_csv(\"submissions/RFSubmission.csv\", index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Set up and save Ensemble prediction\n",
|
||
"\n",
|
||
"ensemble_submission = tree_submission\n",
|
||
"ensemble_submission['TreeSalePrice'] = tree_submission['SalePrice']\n",
|
||
"ensemble_submission['ForestSalePrice'] = rf_submission['SalePrice']\n",
|
||
"ensemble_submission['SalePrice'] = ensemble_submission[['TreeSalePrice', 'ForestSalePrice']].mean(axis=1)\n",
|
||
"ensemble_submission = ensemble_submission.drop(['TreeSalePrice', 'ForestSalePrice'], axis=1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Id</th>\n",
|
||
" <th>SalePrice</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1461</td>\n",
|
||
" <td>133457.50</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1462</td>\n",
|
||
" <td>163187.50</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1463</td>\n",
|
||
" <td>184355.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1464</td>\n",
|
||
" <td>190472.40</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1465</td>\n",
|
||
" <td>185546.15</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Id SalePrice\n",
|
||
"0 1461 133457.50\n",
|
||
"1 1462 163187.50\n",
|
||
"2 1463 184355.00\n",
|
||
"3 1464 190472.40\n",
|
||
"4 1465 185546.15"
|
||
]
|
||
},
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ensemble_submission.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"ensemble_submission.to_csv(\"submissions/EnsembleSubmission.csv\", index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.7.4"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|