2762 lines
92 KiB
Plaintext
2762 lines
92 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"import os\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.decomposition import PCA"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>MSSubClass</th>\n",
|
||
" <th>MSZoning</th>\n",
|
||
" <th>LotFrontage</th>\n",
|
||
" <th>LotArea</th>\n",
|
||
" <th>Street</th>\n",
|
||
" <th>Alley</th>\n",
|
||
" <th>LotShape</th>\n",
|
||
" <th>LandContour</th>\n",
|
||
" <th>Utilities</th>\n",
|
||
" <th>LotConfig</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>ScreenPorch</th>\n",
|
||
" <th>PoolArea</th>\n",
|
||
" <th>PoolQC</th>\n",
|
||
" <th>Fence</th>\n",
|
||
" <th>MiscFeature</th>\n",
|
||
" <th>MiscVal</th>\n",
|
||
" <th>MoSold</th>\n",
|
||
" <th>YrSold</th>\n",
|
||
" <th>SaleType</th>\n",
|
||
" <th>SaleCondition</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Id</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>1461</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>RH</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>11622</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Reg</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Inside</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>120</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>MnPrv</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2010</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1462</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>81.0</td>\n",
|
||
" <td>14267</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>IR1</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Corner</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Gar2</td>\n",
|
||
" <td>12500</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2010</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1463</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>74.0</td>\n",
|
||
" <td>13830</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>IR1</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Inside</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>MnPrv</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2010</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1464</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>78.0</td>\n",
|
||
" <td>9978</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>IR1</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Inside</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2010</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1465</td>\n",
|
||
" <td>120</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>43.0</td>\n",
|
||
" <td>5005</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>IR1</td>\n",
|
||
" <td>HLS</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Inside</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>144</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2010</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2915</td>\n",
|
||
" <td>160</td>\n",
|
||
" <td>RM</td>\n",
|
||
" <td>21.0</td>\n",
|
||
" <td>1936</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Reg</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Inside</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2916</td>\n",
|
||
" <td>160</td>\n",
|
||
" <td>RM</td>\n",
|
||
" <td>21.0</td>\n",
|
||
" <td>1894</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Reg</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Inside</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Abnorml</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2917</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>160.0</td>\n",
|
||
" <td>20000</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Reg</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Inside</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Abnorml</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2918</td>\n",
|
||
" <td>85</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>62.0</td>\n",
|
||
" <td>10441</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Reg</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Inside</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>MnPrv</td>\n",
|
||
" <td>Shed</td>\n",
|
||
" <td>700</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2919</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>74.0</td>\n",
|
||
" <td>9627</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Reg</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Inside</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>1459 rows × 79 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
|
||
"Id \n",
|
||
"1461 20 RH 80.0 11622 Pave NaN Reg \n",
|
||
"1462 20 RL 81.0 14267 Pave NaN IR1 \n",
|
||
"1463 60 RL 74.0 13830 Pave NaN IR1 \n",
|
||
"1464 60 RL 78.0 9978 Pave NaN IR1 \n",
|
||
"1465 120 RL 43.0 5005 Pave NaN IR1 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"2915 160 RM 21.0 1936 Pave NaN Reg \n",
|
||
"2916 160 RM 21.0 1894 Pave NaN Reg \n",
|
||
"2917 20 RL 160.0 20000 Pave NaN Reg \n",
|
||
"2918 85 RL 62.0 10441 Pave NaN Reg \n",
|
||
"2919 60 RL 74.0 9627 Pave NaN Reg \n",
|
||
"\n",
|
||
" LandContour Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence \\\n",
|
||
"Id ... \n",
|
||
"1461 Lvl AllPub Inside ... 120 0 NaN MnPrv \n",
|
||
"1462 Lvl AllPub Corner ... 0 0 NaN NaN \n",
|
||
"1463 Lvl AllPub Inside ... 0 0 NaN MnPrv \n",
|
||
"1464 Lvl AllPub Inside ... 0 0 NaN NaN \n",
|
||
"1465 HLS AllPub Inside ... 144 0 NaN NaN \n",
|
||
"... ... ... ... ... ... ... ... ... \n",
|
||
"2915 Lvl AllPub Inside ... 0 0 NaN NaN \n",
|
||
"2916 Lvl AllPub Inside ... 0 0 NaN NaN \n",
|
||
"2917 Lvl AllPub Inside ... 0 0 NaN NaN \n",
|
||
"2918 Lvl AllPub Inside ... 0 0 NaN MnPrv \n",
|
||
"2919 Lvl AllPub Inside ... 0 0 NaN NaN \n",
|
||
"\n",
|
||
" MiscFeature MiscVal MoSold YrSold SaleType SaleCondition \n",
|
||
"Id \n",
|
||
"1461 NaN 0 6 2010 WD Normal \n",
|
||
"1462 Gar2 12500 6 2010 WD Normal \n",
|
||
"1463 NaN 0 3 2010 WD Normal \n",
|
||
"1464 NaN 0 6 2010 WD Normal \n",
|
||
"1465 NaN 0 1 2010 WD Normal \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"2915 NaN 0 6 2006 WD Normal \n",
|
||
"2916 NaN 0 4 2006 WD Abnorml \n",
|
||
"2917 NaN 0 9 2006 WD Abnorml \n",
|
||
"2918 Shed 700 7 2006 WD Normal \n",
|
||
"2919 NaN 0 11 2006 WD Normal \n",
|
||
"\n",
|
||
"[1459 rows x 79 columns]"
|
||
]
|
||
},
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"train = pd.read_csv(os.path.join('data', 'train.csv'))\n",
|
||
"train.set_index('Id')\n",
|
||
"test = pd.read_csv(os.path.join('data', 'test.csv'))\n",
|
||
"test.set_index('Id')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
|
||
"0 1 60 RL 65.0 8450 Pave NaN Reg \n",
|
||
"1 2 20 RL 80.0 9600 Pave NaN Reg \n",
|
||
"2 3 60 RL 68.0 11250 Pave NaN IR1 \n",
|
||
"3 4 70 RL 60.0 9550 Pave NaN IR1 \n",
|
||
"4 5 60 RL 84.0 14260 Pave NaN IR1 \n",
|
||
"\n",
|
||
" LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold \\\n",
|
||
"0 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n",
|
||
"1 Lvl AllPub ... 0 NaN NaN NaN 0 5 \n",
|
||
"2 Lvl AllPub ... 0 NaN NaN NaN 0 9 \n",
|
||
"3 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n",
|
||
"4 Lvl AllPub ... 0 NaN NaN NaN 0 12 \n",
|
||
"\n",
|
||
" YrSold SaleType SaleCondition SalePrice \n",
|
||
"0 2008 WD Normal 208500 \n",
|
||
"1 2007 WD Normal 181500 \n",
|
||
"2 2008 WD Normal 223500 \n",
|
||
"3 2006 WD Abnorml 140000 \n",
|
||
"4 2008 WD Normal 250000 \n",
|
||
"\n",
|
||
"[5 rows x 81 columns]\n",
|
||
"(1460, 81)\n",
|
||
"(1459, 80)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(train.head())\n",
|
||
"print(train.shape)\n",
|
||
"print(test.shape)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"(1460, 19)\n",
|
||
"LotFrontage 259\n",
|
||
"Alley 1369\n",
|
||
"MasVnrType 8\n",
|
||
"MasVnrArea 8\n",
|
||
"BsmtQual 37\n",
|
||
"BsmtCond 37\n",
|
||
"BsmtExposure 38\n",
|
||
"BsmtFinType1 37\n",
|
||
"BsmtFinType2 38\n",
|
||
"Electrical 1\n",
|
||
"FireplaceQu 690\n",
|
||
"GarageType 81\n",
|
||
"GarageYrBlt 81\n",
|
||
"GarageFinish 81\n",
|
||
"GarageQual 81\n",
|
||
"GarageCond 81\n",
|
||
"PoolQC 1453\n",
|
||
"Fence 1179\n",
|
||
"MiscFeature 1406\n",
|
||
"dtype: int64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# All features with null values and their amounts\n",
|
||
"\n",
|
||
"train_null = train.loc[:, train.isnull().any()]\n",
|
||
"train_null.head()\n",
|
||
"print(train_null.shape)\n",
|
||
"print(train_null.isnull().sum())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Drop features with too many null values\n",
|
||
"\n",
|
||
"train = train.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu'], axis=1)\n",
|
||
"test = test.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu'], axis=1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Id</th>\n",
|
||
" <th>MSSubClass</th>\n",
|
||
" <th>MSZoning</th>\n",
|
||
" <th>LotFrontage</th>\n",
|
||
" <th>LotArea</th>\n",
|
||
" <th>Street</th>\n",
|
||
" <th>LotShape</th>\n",
|
||
" <th>LandContour</th>\n",
|
||
" <th>Utilities</th>\n",
|
||
" <th>LotConfig</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>EnclosedPorch</th>\n",
|
||
" <th>3SsnPorch</th>\n",
|
||
" <th>ScreenPorch</th>\n",
|
||
" <th>PoolArea</th>\n",
|
||
" <th>MiscVal</th>\n",
|
||
" <th>MoSold</th>\n",
|
||
" <th>YrSold</th>\n",
|
||
" <th>SaleType</th>\n",
|
||
" <th>SaleCondition</th>\n",
|
||
" <th>SalePrice</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" <td>8450</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>Reg</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Inside</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2008</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" <td>208500</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>9600</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>Reg</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>FR2</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2007</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" <td>181500</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>68.0</td>\n",
|
||
" <td>11250</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>IR1</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Inside</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>2008</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" <td>223500</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>70</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>60.0</td>\n",
|
||
" <td>9550</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>IR1</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Corner</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>272</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Abnorml</td>\n",
|
||
" <td>140000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>84.0</td>\n",
|
||
" <td>14260</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>IR1</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>FR2</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>2008</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" <td>250000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 76 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Id MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour \\\n",
|
||
"0 1 60 RL 65.0 8450 Pave Reg Lvl \n",
|
||
"1 2 20 RL 80.0 9600 Pave Reg Lvl \n",
|
||
"2 3 60 RL 68.0 11250 Pave IR1 Lvl \n",
|
||
"3 4 70 RL 60.0 9550 Pave IR1 Lvl \n",
|
||
"4 5 60 RL 84.0 14260 Pave IR1 Lvl \n",
|
||
"\n",
|
||
" Utilities LotConfig ... EnclosedPorch 3SsnPorch ScreenPorch PoolArea \\\n",
|
||
"0 AllPub Inside ... 0 0 0 0 \n",
|
||
"1 AllPub FR2 ... 0 0 0 0 \n",
|
||
"2 AllPub Inside ... 0 0 0 0 \n",
|
||
"3 AllPub Corner ... 272 0 0 0 \n",
|
||
"4 AllPub FR2 ... 0 0 0 0 \n",
|
||
"\n",
|
||
" MiscVal MoSold YrSold SaleType SaleCondition SalePrice \n",
|
||
"0 0 2 2008 WD Normal 208500 \n",
|
||
"1 0 5 2007 WD Normal 181500 \n",
|
||
"2 0 9 2008 WD Normal 223500 \n",
|
||
"3 0 2 2006 WD Abnorml 140000 \n",
|
||
"4 0 12 2008 WD Normal 250000 \n",
|
||
"\n",
|
||
"[5 rows x 76 columns]"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"train.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"(1460, 76)\n",
|
||
"(1459, 75)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(train.shape)\n",
|
||
"print(test.shape)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"(1460, 14)\n",
|
||
"LotFrontage 259\n",
|
||
"MasVnrType 8\n",
|
||
"MasVnrArea 8\n",
|
||
"BsmtQual 37\n",
|
||
"BsmtCond 37\n",
|
||
"BsmtExposure 38\n",
|
||
"BsmtFinType1 37\n",
|
||
"BsmtFinType2 38\n",
|
||
"Electrical 1\n",
|
||
"GarageType 81\n",
|
||
"GarageYrBlt 81\n",
|
||
"GarageFinish 81\n",
|
||
"GarageQual 81\n",
|
||
"GarageCond 81\n",
|
||
"dtype: int64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# All features with null values and their amounts\n",
|
||
"\n",
|
||
"train_null = train.loc[:, train.isnull().any()]\n",
|
||
"train_null.head()\n",
|
||
"print(train_null.shape)\n",
|
||
"print(train_null.isnull().sum())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Fill in object nans with 'NAN' and numerical nans with mean (mean imputation)\n",
|
||
"# Note that we use the train mean for mean imputation on the test df as well\n",
|
||
"\n",
|
||
"data = [train, test]\n",
|
||
"\n",
|
||
"for df in data:\n",
|
||
" for column in df:\n",
|
||
" if df[column].isna().any():\n",
|
||
" if df[column].dtype == object:\n",
|
||
" df[column] = df[column].replace(np.nan, 'NAN')\n",
|
||
" else:\n",
|
||
" mean = train[column].mean()\n",
|
||
" df[column] = df[column].replace(np.nan, mean)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>MSZoning</th>\n",
|
||
" <th>Street</th>\n",
|
||
" <th>LotShape</th>\n",
|
||
" <th>LandContour</th>\n",
|
||
" <th>Utilities</th>\n",
|
||
" <th>LotConfig</th>\n",
|
||
" <th>LandSlope</th>\n",
|
||
" <th>Neighborhood</th>\n",
|
||
" <th>Condition1</th>\n",
|
||
" <th>Condition2</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>Electrical</th>\n",
|
||
" <th>KitchenQual</th>\n",
|
||
" <th>Functional</th>\n",
|
||
" <th>GarageType</th>\n",
|
||
" <th>GarageFinish</th>\n",
|
||
" <th>GarageQual</th>\n",
|
||
" <th>GarageCond</th>\n",
|
||
" <th>PavedDrive</th>\n",
|
||
" <th>SaleType</th>\n",
|
||
" <th>SaleCondition</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>Reg</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Inside</td>\n",
|
||
" <td>Gtl</td>\n",
|
||
" <td>CollgCr</td>\n",
|
||
" <td>Norm</td>\n",
|
||
" <td>Norm</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>SBrkr</td>\n",
|
||
" <td>Gd</td>\n",
|
||
" <td>Typ</td>\n",
|
||
" <td>Attchd</td>\n",
|
||
" <td>RFn</td>\n",
|
||
" <td>TA</td>\n",
|
||
" <td>TA</td>\n",
|
||
" <td>Y</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>Reg</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>FR2</td>\n",
|
||
" <td>Gtl</td>\n",
|
||
" <td>Veenker</td>\n",
|
||
" <td>Feedr</td>\n",
|
||
" <td>Norm</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>SBrkr</td>\n",
|
||
" <td>TA</td>\n",
|
||
" <td>Typ</td>\n",
|
||
" <td>Attchd</td>\n",
|
||
" <td>RFn</td>\n",
|
||
" <td>TA</td>\n",
|
||
" <td>TA</td>\n",
|
||
" <td>Y</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>IR1</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Inside</td>\n",
|
||
" <td>Gtl</td>\n",
|
||
" <td>CollgCr</td>\n",
|
||
" <td>Norm</td>\n",
|
||
" <td>Norm</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>SBrkr</td>\n",
|
||
" <td>Gd</td>\n",
|
||
" <td>Typ</td>\n",
|
||
" <td>Attchd</td>\n",
|
||
" <td>RFn</td>\n",
|
||
" <td>TA</td>\n",
|
||
" <td>TA</td>\n",
|
||
" <td>Y</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>IR1</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>Corner</td>\n",
|
||
" <td>Gtl</td>\n",
|
||
" <td>Crawfor</td>\n",
|
||
" <td>Norm</td>\n",
|
||
" <td>Norm</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>SBrkr</td>\n",
|
||
" <td>Gd</td>\n",
|
||
" <td>Typ</td>\n",
|
||
" <td>Detchd</td>\n",
|
||
" <td>Unf</td>\n",
|
||
" <td>TA</td>\n",
|
||
" <td>TA</td>\n",
|
||
" <td>Y</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Abnorml</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>RL</td>\n",
|
||
" <td>Pave</td>\n",
|
||
" <td>IR1</td>\n",
|
||
" <td>Lvl</td>\n",
|
||
" <td>AllPub</td>\n",
|
||
" <td>FR2</td>\n",
|
||
" <td>Gtl</td>\n",
|
||
" <td>NoRidge</td>\n",
|
||
" <td>Norm</td>\n",
|
||
" <td>Norm</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>SBrkr</td>\n",
|
||
" <td>Gd</td>\n",
|
||
" <td>Typ</td>\n",
|
||
" <td>Attchd</td>\n",
|
||
" <td>RFn</td>\n",
|
||
" <td>TA</td>\n",
|
||
" <td>TA</td>\n",
|
||
" <td>Y</td>\n",
|
||
" <td>WD</td>\n",
|
||
" <td>Normal</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 38 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" MSZoning Street LotShape LandContour Utilities LotConfig LandSlope \\\n",
|
||
"0 RL Pave Reg Lvl AllPub Inside Gtl \n",
|
||
"1 RL Pave Reg Lvl AllPub FR2 Gtl \n",
|
||
"2 RL Pave IR1 Lvl AllPub Inside Gtl \n",
|
||
"3 RL Pave IR1 Lvl AllPub Corner Gtl \n",
|
||
"4 RL Pave IR1 Lvl AllPub FR2 Gtl \n",
|
||
"\n",
|
||
" Neighborhood Condition1 Condition2 ... Electrical KitchenQual Functional \\\n",
|
||
"0 CollgCr Norm Norm ... SBrkr Gd Typ \n",
|
||
"1 Veenker Feedr Norm ... SBrkr TA Typ \n",
|
||
"2 CollgCr Norm Norm ... SBrkr Gd Typ \n",
|
||
"3 Crawfor Norm Norm ... SBrkr Gd Typ \n",
|
||
"4 NoRidge Norm Norm ... SBrkr Gd Typ \n",
|
||
"\n",
|
||
" GarageType GarageFinish GarageQual GarageCond PavedDrive SaleType \\\n",
|
||
"0 Attchd RFn TA TA Y WD \n",
|
||
"1 Attchd RFn TA TA Y WD \n",
|
||
"2 Attchd RFn TA TA Y WD \n",
|
||
"3 Detchd Unf TA TA Y WD \n",
|
||
"4 Attchd RFn TA TA Y WD \n",
|
||
"\n",
|
||
" SaleCondition \n",
|
||
"0 Normal \n",
|
||
"1 Normal \n",
|
||
"2 Normal \n",
|
||
"3 Abnorml \n",
|
||
"4 Normal \n",
|
||
"\n",
|
||
"[5 rows x 38 columns]"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# All object type features to convert to numerical\n",
|
||
"\n",
|
||
"train_obj = train.select_dtypes(include=['object']).copy()\n",
|
||
"test_obj = test.select_dtypes(include=['object']).copy()\n",
|
||
"\n",
|
||
"train_obj.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Id</th>\n",
|
||
" <th>MSSubClass</th>\n",
|
||
" <th>LotFrontage</th>\n",
|
||
" <th>LotArea</th>\n",
|
||
" <th>OverallQual</th>\n",
|
||
" <th>OverallCond</th>\n",
|
||
" <th>YearBuilt</th>\n",
|
||
" <th>YearRemodAdd</th>\n",
|
||
" <th>MasVnrArea</th>\n",
|
||
" <th>BsmtFinSF1</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>SaleType_ConLw</th>\n",
|
||
" <th>SaleType_New</th>\n",
|
||
" <th>SaleType_Oth</th>\n",
|
||
" <th>SaleType_WD</th>\n",
|
||
" <th>SaleCondition_Abnorml</th>\n",
|
||
" <th>SaleCondition_AdjLand</th>\n",
|
||
" <th>SaleCondition_Alloca</th>\n",
|
||
" <th>SaleCondition_Family</th>\n",
|
||
" <th>SaleCondition_Normal</th>\n",
|
||
" <th>SaleCondition_Partial</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" <td>8450</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2003</td>\n",
|
||
" <td>2003</td>\n",
|
||
" <td>196.0</td>\n",
|
||
" <td>706</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>9600</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>1976</td>\n",
|
||
" <td>1976</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>978</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>68.0</td>\n",
|
||
" <td>11250</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2001</td>\n",
|
||
" <td>2002</td>\n",
|
||
" <td>162.0</td>\n",
|
||
" <td>486</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>70</td>\n",
|
||
" <td>60.0</td>\n",
|
||
" <td>9550</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1915</td>\n",
|
||
" <td>1970</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>216</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>84.0</td>\n",
|
||
" <td>14260</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2000</td>\n",
|
||
" <td>2000</td>\n",
|
||
" <td>350.0</td>\n",
|
||
" <td>655</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 283 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt \\\n",
|
||
"0 1 60 65.0 8450 7 5 2003 \n",
|
||
"1 2 20 80.0 9600 6 8 1976 \n",
|
||
"2 3 60 68.0 11250 7 5 2001 \n",
|
||
"3 4 70 60.0 9550 7 5 1915 \n",
|
||
"4 5 60 84.0 14260 8 5 2000 \n",
|
||
"\n",
|
||
" YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw SaleType_New \\\n",
|
||
"0 2003 196.0 706 ... 0 0 \n",
|
||
"1 1976 0.0 978 ... 0 0 \n",
|
||
"2 2002 162.0 486 ... 0 0 \n",
|
||
"3 1970 0.0 216 ... 0 0 \n",
|
||
"4 2000 350.0 655 ... 0 0 \n",
|
||
"\n",
|
||
" SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand \\\n",
|
||
"0 0 1 0 0 \n",
|
||
"1 0 1 0 0 \n",
|
||
"2 0 1 0 0 \n",
|
||
"3 0 1 1 0 \n",
|
||
"4 0 1 0 0 \n",
|
||
"\n",
|
||
" SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal \\\n",
|
||
"0 0 0 1 \n",
|
||
"1 0 0 1 \n",
|
||
"2 0 0 1 \n",
|
||
"3 0 0 0 \n",
|
||
"4 0 0 1 \n",
|
||
"\n",
|
||
" SaleCondition_Partial \n",
|
||
"0 0 \n",
|
||
"1 0 \n",
|
||
"2 0 \n",
|
||
"3 0 \n",
|
||
"4 0 \n",
|
||
"\n",
|
||
"[5 rows x 283 columns]"
|
||
]
|
||
},
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"print(list(train_obj))\n",
|
||
"\n",
|
||
"one_hot_train = pd.get_dummies(train, columns=list(train_obj))\n",
|
||
"one_hot_train.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# One hot encoding\n",
|
||
"\n",
|
||
"train = pd.get_dummies(train, columns=list(train_obj))\n",
|
||
"test = pd.get_dummies(test, columns=list(test_obj))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"(1460, 283)\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Id</th>\n",
|
||
" <th>MSSubClass</th>\n",
|
||
" <th>LotFrontage</th>\n",
|
||
" <th>LotArea</th>\n",
|
||
" <th>OverallQual</th>\n",
|
||
" <th>OverallCond</th>\n",
|
||
" <th>YearBuilt</th>\n",
|
||
" <th>YearRemodAdd</th>\n",
|
||
" <th>MasVnrArea</th>\n",
|
||
" <th>BsmtFinSF1</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>SaleType_ConLw</th>\n",
|
||
" <th>SaleType_New</th>\n",
|
||
" <th>SaleType_Oth</th>\n",
|
||
" <th>SaleType_WD</th>\n",
|
||
" <th>SaleCondition_Abnorml</th>\n",
|
||
" <th>SaleCondition_AdjLand</th>\n",
|
||
" <th>SaleCondition_Alloca</th>\n",
|
||
" <th>SaleCondition_Family</th>\n",
|
||
" <th>SaleCondition_Normal</th>\n",
|
||
" <th>SaleCondition_Partial</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" <td>8450</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2003</td>\n",
|
||
" <td>2003</td>\n",
|
||
" <td>196.0</td>\n",
|
||
" <td>706</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>9600</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>1976</td>\n",
|
||
" <td>1976</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>978</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>68.0</td>\n",
|
||
" <td>11250</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2001</td>\n",
|
||
" <td>2002</td>\n",
|
||
" <td>162.0</td>\n",
|
||
" <td>486</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>70</td>\n",
|
||
" <td>60.0</td>\n",
|
||
" <td>9550</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1915</td>\n",
|
||
" <td>1970</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>216</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>84.0</td>\n",
|
||
" <td>14260</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2000</td>\n",
|
||
" <td>2000</td>\n",
|
||
" <td>350.0</td>\n",
|
||
" <td>655</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 283 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt \\\n",
|
||
"0 1 60 65.0 8450 7 5 2003 \n",
|
||
"1 2 20 80.0 9600 6 8 1976 \n",
|
||
"2 3 60 68.0 11250 7 5 2001 \n",
|
||
"3 4 70 60.0 9550 7 5 1915 \n",
|
||
"4 5 60 84.0 14260 8 5 2000 \n",
|
||
"\n",
|
||
" YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw SaleType_New \\\n",
|
||
"0 2003 196.0 706 ... 0 0 \n",
|
||
"1 1976 0.0 978 ... 0 0 \n",
|
||
"2 2002 162.0 486 ... 0 0 \n",
|
||
"3 1970 0.0 216 ... 0 0 \n",
|
||
"4 2000 350.0 655 ... 0 0 \n",
|
||
"\n",
|
||
" SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand \\\n",
|
||
"0 0 1 0 0 \n",
|
||
"1 0 1 0 0 \n",
|
||
"2 0 1 0 0 \n",
|
||
"3 0 1 1 0 \n",
|
||
"4 0 1 0 0 \n",
|
||
"\n",
|
||
" SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal \\\n",
|
||
"0 0 0 1 \n",
|
||
"1 0 0 1 \n",
|
||
"2 0 0 1 \n",
|
||
"3 0 0 0 \n",
|
||
"4 0 0 1 \n",
|
||
"\n",
|
||
" SaleCondition_Partial \n",
|
||
"0 0 \n",
|
||
"1 0 \n",
|
||
"2 0 \n",
|
||
"3 0 \n",
|
||
"4 0 \n",
|
||
"\n",
|
||
"[5 rows x 283 columns]"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"print(train.shape)\n",
|
||
"train.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"(1459, 272)\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Id</th>\n",
|
||
" <th>MSSubClass</th>\n",
|
||
" <th>LotFrontage</th>\n",
|
||
" <th>LotArea</th>\n",
|
||
" <th>OverallQual</th>\n",
|
||
" <th>OverallCond</th>\n",
|
||
" <th>YearBuilt</th>\n",
|
||
" <th>YearRemodAdd</th>\n",
|
||
" <th>MasVnrArea</th>\n",
|
||
" <th>BsmtFinSF1</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>SaleType_NAN</th>\n",
|
||
" <th>SaleType_New</th>\n",
|
||
" <th>SaleType_Oth</th>\n",
|
||
" <th>SaleType_WD</th>\n",
|
||
" <th>SaleCondition_Abnorml</th>\n",
|
||
" <th>SaleCondition_AdjLand</th>\n",
|
||
" <th>SaleCondition_Alloca</th>\n",
|
||
" <th>SaleCondition_Family</th>\n",
|
||
" <th>SaleCondition_Normal</th>\n",
|
||
" <th>SaleCondition_Partial</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1461</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>11622</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1961</td>\n",
|
||
" <td>1961</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>468.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1462</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>81.0</td>\n",
|
||
" <td>14267</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1958</td>\n",
|
||
" <td>1958</td>\n",
|
||
" <td>108.0</td>\n",
|
||
" <td>923.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1463</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>74.0</td>\n",
|
||
" <td>13830</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1997</td>\n",
|
||
" <td>1998</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>791.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1464</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>78.0</td>\n",
|
||
" <td>9978</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1998</td>\n",
|
||
" <td>1998</td>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>602.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1465</td>\n",
|
||
" <td>120</td>\n",
|
||
" <td>43.0</td>\n",
|
||
" <td>5005</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1992</td>\n",
|
||
" <td>1992</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>263.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 272 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Id MSSubClass LotFrontage LotArea OverallQual OverallCond \\\n",
|
||
"0 1461 20 80.0 11622 5 6 \n",
|
||
"1 1462 20 81.0 14267 6 6 \n",
|
||
"2 1463 60 74.0 13830 5 5 \n",
|
||
"3 1464 60 78.0 9978 6 6 \n",
|
||
"4 1465 120 43.0 5005 8 5 \n",
|
||
"\n",
|
||
" YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_NAN \\\n",
|
||
"0 1961 1961 0.0 468.0 ... 0 \n",
|
||
"1 1958 1958 108.0 923.0 ... 0 \n",
|
||
"2 1997 1998 0.0 791.0 ... 0 \n",
|
||
"3 1998 1998 20.0 602.0 ... 0 \n",
|
||
"4 1992 1992 0.0 263.0 ... 0 \n",
|
||
"\n",
|
||
" SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml \\\n",
|
||
"0 0 0 1 0 \n",
|
||
"1 0 0 1 0 \n",
|
||
"2 0 0 1 0 \n",
|
||
"3 0 0 1 0 \n",
|
||
"4 0 0 1 0 \n",
|
||
"\n",
|
||
" SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family \\\n",
|
||
"0 0 0 0 \n",
|
||
"1 0 0 0 \n",
|
||
"2 0 0 0 \n",
|
||
"3 0 0 0 \n",
|
||
"4 0 0 0 \n",
|
||
"\n",
|
||
" SaleCondition_Normal SaleCondition_Partial \n",
|
||
"0 1 0 \n",
|
||
"1 1 0 \n",
|
||
"2 1 0 \n",
|
||
"3 1 0 \n",
|
||
"4 1 0 \n",
|
||
"\n",
|
||
"[5 rows x 272 columns]"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"print(test.shape)\n",
|
||
"test.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"{'Exterior1st_Stone', 'Condition2_RRNn', 'Condition2_RRAn', 'Electrical_NAN', 'HouseStyle_2.5Fin', 'Exterior1st_ImStucc', 'Electrical_Mix', 'RoofMatl_Roll', 'SalePrice', 'RoofMatl_Membran', 'Heating_Floor', 'RoofMatl_Metal', 'Condition2_RRAe', 'RoofMatl_ClyTile', 'GarageQual_Ex', 'Heating_OthW', 'Exterior2nd_Other', 'Utilities_NoSeWa'}\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Check for missing columns\n",
|
||
"\n",
|
||
"missing_cols = set( train.columns ) - set( test.columns )\n",
|
||
"print(missing_cols)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Note that we are missing some columns. This is because in the one-hot encoding process, the test set did not have any\n",
|
||
"# samples with a certain categorical output. For example, in RoofMatl, there were no samples in test where the value was\n",
|
||
"# ClyTile, which means no column was create to one hot encode it. As such, we will create zero-valued columns to fill\n",
|
||
"\n",
|
||
"for col in missing_cols:\n",
|
||
" test[col] = 0\n",
|
||
"\n",
|
||
"# Maintain same ordering for the df\n",
|
||
"test = test[train.columns]\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Drop zeroed out SalePrice column from test set\n",
|
||
"\n",
|
||
"test = test.drop('SalePrice', axis=1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Id</th>\n",
|
||
" <th>MSSubClass</th>\n",
|
||
" <th>LotFrontage</th>\n",
|
||
" <th>LotArea</th>\n",
|
||
" <th>OverallQual</th>\n",
|
||
" <th>OverallCond</th>\n",
|
||
" <th>YearBuilt</th>\n",
|
||
" <th>YearRemodAdd</th>\n",
|
||
" <th>MasVnrArea</th>\n",
|
||
" <th>BsmtFinSF1</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>SaleType_ConLw</th>\n",
|
||
" <th>SaleType_New</th>\n",
|
||
" <th>SaleType_Oth</th>\n",
|
||
" <th>SaleType_WD</th>\n",
|
||
" <th>SaleCondition_Abnorml</th>\n",
|
||
" <th>SaleCondition_AdjLand</th>\n",
|
||
" <th>SaleCondition_Alloca</th>\n",
|
||
" <th>SaleCondition_Family</th>\n",
|
||
" <th>SaleCondition_Normal</th>\n",
|
||
" <th>SaleCondition_Partial</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" <td>8450</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2003</td>\n",
|
||
" <td>2003</td>\n",
|
||
" <td>196.0</td>\n",
|
||
" <td>706</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>9600</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>1976</td>\n",
|
||
" <td>1976</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>978</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>68.0</td>\n",
|
||
" <td>11250</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2001</td>\n",
|
||
" <td>2002</td>\n",
|
||
" <td>162.0</td>\n",
|
||
" <td>486</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>70</td>\n",
|
||
" <td>60.0</td>\n",
|
||
" <td>9550</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1915</td>\n",
|
||
" <td>1970</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>216</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>84.0</td>\n",
|
||
" <td>14260</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2000</td>\n",
|
||
" <td>2000</td>\n",
|
||
" <td>350.0</td>\n",
|
||
" <td>655</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 283 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt \\\n",
|
||
"0 1 60 65.0 8450 7 5 2003 \n",
|
||
"1 2 20 80.0 9600 6 8 1976 \n",
|
||
"2 3 60 68.0 11250 7 5 2001 \n",
|
||
"3 4 70 60.0 9550 7 5 1915 \n",
|
||
"4 5 60 84.0 14260 8 5 2000 \n",
|
||
"\n",
|
||
" YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw SaleType_New \\\n",
|
||
"0 2003 196.0 706 ... 0 0 \n",
|
||
"1 1976 0.0 978 ... 0 0 \n",
|
||
"2 2002 162.0 486 ... 0 0 \n",
|
||
"3 1970 0.0 216 ... 0 0 \n",
|
||
"4 2000 350.0 655 ... 0 0 \n",
|
||
"\n",
|
||
" SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand \\\n",
|
||
"0 0 1 0 0 \n",
|
||
"1 0 1 0 0 \n",
|
||
"2 0 1 0 0 \n",
|
||
"3 0 1 1 0 \n",
|
||
"4 0 1 0 0 \n",
|
||
"\n",
|
||
" SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal \\\n",
|
||
"0 0 0 1 \n",
|
||
"1 0 0 1 \n",
|
||
"2 0 0 1 \n",
|
||
"3 0 0 0 \n",
|
||
"4 0 0 1 \n",
|
||
"\n",
|
||
" SaleCondition_Partial \n",
|
||
"0 0 \n",
|
||
"1 0 \n",
|
||
"2 0 \n",
|
||
"3 0 \n",
|
||
"4 0 \n",
|
||
"\n",
|
||
"[5 rows x 283 columns]"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"train.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Id</th>\n",
|
||
" <th>MSSubClass</th>\n",
|
||
" <th>LotFrontage</th>\n",
|
||
" <th>LotArea</th>\n",
|
||
" <th>OverallQual</th>\n",
|
||
" <th>OverallCond</th>\n",
|
||
" <th>YearBuilt</th>\n",
|
||
" <th>YearRemodAdd</th>\n",
|
||
" <th>MasVnrArea</th>\n",
|
||
" <th>BsmtFinSF1</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>SaleType_ConLw</th>\n",
|
||
" <th>SaleType_New</th>\n",
|
||
" <th>SaleType_Oth</th>\n",
|
||
" <th>SaleType_WD</th>\n",
|
||
" <th>SaleCondition_Abnorml</th>\n",
|
||
" <th>SaleCondition_AdjLand</th>\n",
|
||
" <th>SaleCondition_Alloca</th>\n",
|
||
" <th>SaleCondition_Family</th>\n",
|
||
" <th>SaleCondition_Normal</th>\n",
|
||
" <th>SaleCondition_Partial</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1461</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>11622</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1961</td>\n",
|
||
" <td>1961</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>468.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1462</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>81.0</td>\n",
|
||
" <td>14267</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1958</td>\n",
|
||
" <td>1958</td>\n",
|
||
" <td>108.0</td>\n",
|
||
" <td>923.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1463</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>74.0</td>\n",
|
||
" <td>13830</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1997</td>\n",
|
||
" <td>1998</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>791.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1464</td>\n",
|
||
" <td>60</td>\n",
|
||
" <td>78.0</td>\n",
|
||
" <td>9978</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1998</td>\n",
|
||
" <td>1998</td>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>602.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1465</td>\n",
|
||
" <td>120</td>\n",
|
||
" <td>43.0</td>\n",
|
||
" <td>5005</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1992</td>\n",
|
||
" <td>1992</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>263.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 282 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Id MSSubClass LotFrontage LotArea OverallQual OverallCond \\\n",
|
||
"0 1461 20 80.0 11622 5 6 \n",
|
||
"1 1462 20 81.0 14267 6 6 \n",
|
||
"2 1463 60 74.0 13830 5 5 \n",
|
||
"3 1464 60 78.0 9978 6 6 \n",
|
||
"4 1465 120 43.0 5005 8 5 \n",
|
||
"\n",
|
||
" YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw \\\n",
|
||
"0 1961 1961 0.0 468.0 ... 0 \n",
|
||
"1 1958 1958 108.0 923.0 ... 0 \n",
|
||
"2 1997 1998 0.0 791.0 ... 0 \n",
|
||
"3 1998 1998 20.0 602.0 ... 0 \n",
|
||
"4 1992 1992 0.0 263.0 ... 0 \n",
|
||
"\n",
|
||
" SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml \\\n",
|
||
"0 0 0 1 0 \n",
|
||
"1 0 0 1 0 \n",
|
||
"2 0 0 1 0 \n",
|
||
"3 0 0 1 0 \n",
|
||
"4 0 0 1 0 \n",
|
||
"\n",
|
||
" SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family \\\n",
|
||
"0 0 0 0 \n",
|
||
"1 0 0 0 \n",
|
||
"2 0 0 0 \n",
|
||
"3 0 0 0 \n",
|
||
"4 0 0 0 \n",
|
||
"\n",
|
||
" SaleCondition_Normal SaleCondition_Partial \n",
|
||
"0 1 0 \n",
|
||
"1 1 0 \n",
|
||
"2 1 0 \n",
|
||
"3 1 0 \n",
|
||
"4 1 0 \n",
|
||
"\n",
|
||
"[5 rows x 282 columns]"
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"test.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Store columns we don't want to scale\n",
|
||
"\n",
|
||
"testId = test['Id']\n",
|
||
"trainSalePrice = train['SalePrice']\n",
|
||
"trainId = train['Id']\n",
|
||
"test = test.drop('Id', axis=1)\n",
|
||
"train = train.drop(['Id', 'SalePrice'], axis=1)\n",
|
||
"\n",
|
||
"# Store labels to remake our dataframes\n",
|
||
"\n",
|
||
"labels = list(train)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Scale the data and reformat as pandas dataframe\n",
|
||
"scaler = StandardScaler()\n",
|
||
"\n",
|
||
"scaler.fit(train)\n",
|
||
"\n",
|
||
"train = pd.DataFrame(scaler.transform(train), columns = labels)\n",
|
||
"test = pd.DataFrame(scaler.transform(test), columns = labels)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,\n",
|
||
" svd_solver='auto', tol=0.0, whiten=False)"
|
||
]
|
||
},
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pca = PCA(.95)\n",
|
||
"pca.fit(train)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# create pd dataframe from pca transformation\n",
|
||
"\n",
|
||
"n_components = 172\n",
|
||
"train = pd.DataFrame(pca.transform(train), columns=['PCA%i' % i for i in range(n_components)])\n",
|
||
"test = pd.DataFrame(pca.transform(test), columns=['PCA%i' % i for i in range(n_components)])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Bring back columns we set aside\n",
|
||
"\n",
|
||
"test['Id'] = testId\n",
|
||
"train['Id'] = trainId\n",
|
||
"train['SalePrice'] = trainSalePrice"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>PCA0</th>\n",
|
||
" <th>PCA1</th>\n",
|
||
" <th>PCA2</th>\n",
|
||
" <th>PCA3</th>\n",
|
||
" <th>PCA4</th>\n",
|
||
" <th>PCA5</th>\n",
|
||
" <th>PCA6</th>\n",
|
||
" <th>PCA7</th>\n",
|
||
" <th>PCA8</th>\n",
|
||
" <th>PCA9</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>PCA164</th>\n",
|
||
" <th>PCA165</th>\n",
|
||
" <th>PCA166</th>\n",
|
||
" <th>PCA167</th>\n",
|
||
" <th>PCA168</th>\n",
|
||
" <th>PCA169</th>\n",
|
||
" <th>PCA170</th>\n",
|
||
" <th>PCA171</th>\n",
|
||
" <th>Id</th>\n",
|
||
" <th>SalePrice</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>4.345109</td>\n",
|
||
" <td>1.619386</td>\n",
|
||
" <td>-0.739617</td>\n",
|
||
" <td>-2.080179</td>\n",
|
||
" <td>-0.985088</td>\n",
|
||
" <td>1.999117</td>\n",
|
||
" <td>-1.231870</td>\n",
|
||
" <td>-0.131782</td>\n",
|
||
" <td>1.316470</td>\n",
|
||
" <td>-1.336446</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.160733</td>\n",
|
||
" <td>0.071333</td>\n",
|
||
" <td>0.155468</td>\n",
|
||
" <td>0.172801</td>\n",
|
||
" <td>-0.169568</td>\n",
|
||
" <td>-0.144326</td>\n",
|
||
" <td>0.391713</td>\n",
|
||
" <td>-0.013357</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>208500</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0.019142</td>\n",
|
||
" <td>-3.106959</td>\n",
|
||
" <td>0.168223</td>\n",
|
||
" <td>-0.553341</td>\n",
|
||
" <td>0.940712</td>\n",
|
||
" <td>0.200719</td>\n",
|
||
" <td>-0.468954</td>\n",
|
||
" <td>0.235082</td>\n",
|
||
" <td>-0.838022</td>\n",
|
||
" <td>-1.273833</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-1.063234</td>\n",
|
||
" <td>-0.334556</td>\n",
|
||
" <td>0.361166</td>\n",
|
||
" <td>-1.218397</td>\n",
|
||
" <td>-0.346191</td>\n",
|
||
" <td>-0.962753</td>\n",
|
||
" <td>-0.138863</td>\n",
|
||
" <td>1.083103</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>181500</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>4.851149</td>\n",
|
||
" <td>1.242811</td>\n",
|
||
" <td>-0.351815</td>\n",
|
||
" <td>-1.484957</td>\n",
|
||
" <td>-0.758200</td>\n",
|
||
" <td>2.181179</td>\n",
|
||
" <td>-1.843949</td>\n",
|
||
" <td>0.296194</td>\n",
|
||
" <td>1.299142</td>\n",
|
||
" <td>-1.391358</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.088334</td>\n",
|
||
" <td>0.238624</td>\n",
|
||
" <td>0.327280</td>\n",
|
||
" <td>0.325285</td>\n",
|
||
" <td>-0.704900</td>\n",
|
||
" <td>-0.036388</td>\n",
|
||
" <td>-0.540516</td>\n",
|
||
" <td>0.021711</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>223500</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>-1.771641</td>\n",
|
||
" <td>0.039500</td>\n",
|
||
" <td>-1.358623</td>\n",
|
||
" <td>1.920760</td>\n",
|
||
" <td>-2.550817</td>\n",
|
||
" <td>0.209519</td>\n",
|
||
" <td>-0.756387</td>\n",
|
||
" <td>0.700109</td>\n",
|
||
" <td>-1.408543</td>\n",
|
||
" <td>0.025023</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.172186</td>\n",
|
||
" <td>-0.518922</td>\n",
|
||
" <td>0.231498</td>\n",
|
||
" <td>-0.074296</td>\n",
|
||
" <td>-0.034287</td>\n",
|
||
" <td>-0.877735</td>\n",
|
||
" <td>0.028065</td>\n",
|
||
" <td>-0.321009</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>140000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>6.463747</td>\n",
|
||
" <td>1.064473</td>\n",
|
||
" <td>0.209472</td>\n",
|
||
" <td>0.448906</td>\n",
|
||
" <td>-1.555301</td>\n",
|
||
" <td>3.215822</td>\n",
|
||
" <td>-0.946356</td>\n",
|
||
" <td>-0.805204</td>\n",
|
||
" <td>2.112526</td>\n",
|
||
" <td>-1.821083</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.270189</td>\n",
|
||
" <td>0.375297</td>\n",
|
||
" <td>-0.396732</td>\n",
|
||
" <td>-0.109084</td>\n",
|
||
" <td>0.317305</td>\n",
|
||
" <td>-0.145975</td>\n",
|
||
" <td>-0.674692</td>\n",
|
||
" <td>-0.378458</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>250000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 174 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 PCA6 \\\n",
|
||
"0 4.345109 1.619386 -0.739617 -2.080179 -0.985088 1.999117 -1.231870 \n",
|
||
"1 0.019142 -3.106959 0.168223 -0.553341 0.940712 0.200719 -0.468954 \n",
|
||
"2 4.851149 1.242811 -0.351815 -1.484957 -0.758200 2.181179 -1.843949 \n",
|
||
"3 -1.771641 0.039500 -1.358623 1.920760 -2.550817 0.209519 -0.756387 \n",
|
||
"4 6.463747 1.064473 0.209472 0.448906 -1.555301 3.215822 -0.946356 \n",
|
||
"\n",
|
||
" PCA7 PCA8 PCA9 ... PCA164 PCA165 PCA166 PCA167 \\\n",
|
||
"0 -0.131782 1.316470 -1.336446 ... 0.160733 0.071333 0.155468 0.172801 \n",
|
||
"1 0.235082 -0.838022 -1.273833 ... -1.063234 -0.334556 0.361166 -1.218397 \n",
|
||
"2 0.296194 1.299142 -1.391358 ... 0.088334 0.238624 0.327280 0.325285 \n",
|
||
"3 0.700109 -1.408543 0.025023 ... -0.172186 -0.518922 0.231498 -0.074296 \n",
|
||
"4 -0.805204 2.112526 -1.821083 ... -0.270189 0.375297 -0.396732 -0.109084 \n",
|
||
"\n",
|
||
" PCA168 PCA169 PCA170 PCA171 Id SalePrice \n",
|
||
"0 -0.169568 -0.144326 0.391713 -0.013357 1 208500 \n",
|
||
"1 -0.346191 -0.962753 -0.138863 1.083103 2 181500 \n",
|
||
"2 -0.704900 -0.036388 -0.540516 0.021711 3 223500 \n",
|
||
"3 -0.034287 -0.877735 0.028065 -0.321009 4 140000 \n",
|
||
"4 0.317305 -0.145975 -0.674692 -0.378458 5 250000 \n",
|
||
"\n",
|
||
"[5 rows x 174 columns]"
|
||
]
|
||
},
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"train.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>PCA0</th>\n",
|
||
" <th>PCA1</th>\n",
|
||
" <th>PCA2</th>\n",
|
||
" <th>PCA3</th>\n",
|
||
" <th>PCA4</th>\n",
|
||
" <th>PCA5</th>\n",
|
||
" <th>PCA6</th>\n",
|
||
" <th>PCA7</th>\n",
|
||
" <th>PCA8</th>\n",
|
||
" <th>PCA9</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>PCA163</th>\n",
|
||
" <th>PCA164</th>\n",
|
||
" <th>PCA165</th>\n",
|
||
" <th>PCA166</th>\n",
|
||
" <th>PCA167</th>\n",
|
||
" <th>PCA168</th>\n",
|
||
" <th>PCA169</th>\n",
|
||
" <th>PCA170</th>\n",
|
||
" <th>PCA171</th>\n",
|
||
" <th>Id</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>-3.208086</td>\n",
|
||
" <td>-2.987338</td>\n",
|
||
" <td>-0.327066</td>\n",
|
||
" <td>-1.609206</td>\n",
|
||
" <td>0.016879</td>\n",
|
||
" <td>-1.514939</td>\n",
|
||
" <td>-0.417889</td>\n",
|
||
" <td>-0.988173</td>\n",
|
||
" <td>-0.653363</td>\n",
|
||
" <td>0.647642</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.027364</td>\n",
|
||
" <td>0.653222</td>\n",
|
||
" <td>-0.201973</td>\n",
|
||
" <td>-0.769946</td>\n",
|
||
" <td>-0.344834</td>\n",
|
||
" <td>0.514257</td>\n",
|
||
" <td>1.114106</td>\n",
|
||
" <td>0.337765</td>\n",
|
||
" <td>-0.639617</td>\n",
|
||
" <td>1461</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>-1.403753</td>\n",
|
||
" <td>-4.261851</td>\n",
|
||
" <td>0.107527</td>\n",
|
||
" <td>0.935981</td>\n",
|
||
" <td>0.165777</td>\n",
|
||
" <td>-0.299485</td>\n",
|
||
" <td>-0.524918</td>\n",
|
||
" <td>-2.332121</td>\n",
|
||
" <td>0.031044</td>\n",
|
||
" <td>-0.820514</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>3.856117</td>\n",
|
||
" <td>0.787996</td>\n",
|
||
" <td>0.215221</td>\n",
|
||
" <td>0.458275</td>\n",
|
||
" <td>1.135109</td>\n",
|
||
" <td>0.378972</td>\n",
|
||
" <td>0.953559</td>\n",
|
||
" <td>-1.008240</td>\n",
|
||
" <td>4.445435</td>\n",
|
||
" <td>1462</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2.257002</td>\n",
|
||
" <td>0.427951</td>\n",
|
||
" <td>-0.610464</td>\n",
|
||
" <td>-1.301125</td>\n",
|
||
" <td>-1.058327</td>\n",
|
||
" <td>2.674177</td>\n",
|
||
" <td>-1.500824</td>\n",
|
||
" <td>-0.223999</td>\n",
|
||
" <td>0.403440</td>\n",
|
||
" <td>-0.198229</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.117138</td>\n",
|
||
" <td>-0.378473</td>\n",
|
||
" <td>-0.031613</td>\n",
|
||
" <td>0.090593</td>\n",
|
||
" <td>-0.173914</td>\n",
|
||
" <td>-0.150098</td>\n",
|
||
" <td>-0.006612</td>\n",
|
||
" <td>0.190780</td>\n",
|
||
" <td>-0.152486</td>\n",
|
||
" <td>1463</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3.253618</td>\n",
|
||
" <td>0.537318</td>\n",
|
||
" <td>-0.796079</td>\n",
|
||
" <td>-0.851716</td>\n",
|
||
" <td>-1.209643</td>\n",
|
||
" <td>2.388795</td>\n",
|
||
" <td>-1.340676</td>\n",
|
||
" <td>-0.876322</td>\n",
|
||
" <td>0.421183</td>\n",
|
||
" <td>-0.692292</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.441586</td>\n",
|
||
" <td>0.020066</td>\n",
|
||
" <td>-0.151709</td>\n",
|
||
" <td>0.444826</td>\n",
|
||
" <td>0.008218</td>\n",
|
||
" <td>-0.161705</td>\n",
|
||
" <td>-0.453482</td>\n",
|
||
" <td>0.472352</td>\n",
|
||
" <td>0.046141</td>\n",
|
||
" <td>1464</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.876409</td>\n",
|
||
" <td>-0.075909</td>\n",
|
||
" <td>-0.154959</td>\n",
|
||
" <td>-2.469870</td>\n",
|
||
" <td>1.407820</td>\n",
|
||
" <td>0.487532</td>\n",
|
||
" <td>0.072190</td>\n",
|
||
" <td>2.414446</td>\n",
|
||
" <td>1.667224</td>\n",
|
||
" <td>-0.621508</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.269062</td>\n",
|
||
" <td>0.651172</td>\n",
|
||
" <td>-0.050461</td>\n",
|
||
" <td>-0.526448</td>\n",
|
||
" <td>-0.843701</td>\n",
|
||
" <td>0.574770</td>\n",
|
||
" <td>-0.227828</td>\n",
|
||
" <td>1.071423</td>\n",
|
||
" <td>1.362638</td>\n",
|
||
" <td>1465</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 173 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 PCA6 \\\n",
|
||
"0 -3.208086 -2.987338 -0.327066 -1.609206 0.016879 -1.514939 -0.417889 \n",
|
||
"1 -1.403753 -4.261851 0.107527 0.935981 0.165777 -0.299485 -0.524918 \n",
|
||
"2 2.257002 0.427951 -0.610464 -1.301125 -1.058327 2.674177 -1.500824 \n",
|
||
"3 3.253618 0.537318 -0.796079 -0.851716 -1.209643 2.388795 -1.340676 \n",
|
||
"4 2.876409 -0.075909 -0.154959 -2.469870 1.407820 0.487532 0.072190 \n",
|
||
"\n",
|
||
" PCA7 PCA8 PCA9 ... PCA163 PCA164 PCA165 PCA166 \\\n",
|
||
"0 -0.988173 -0.653363 0.647642 ... -0.027364 0.653222 -0.201973 -0.769946 \n",
|
||
"1 -2.332121 0.031044 -0.820514 ... 3.856117 0.787996 0.215221 0.458275 \n",
|
||
"2 -0.223999 0.403440 -0.198229 ... -0.117138 -0.378473 -0.031613 0.090593 \n",
|
||
"3 -0.876322 0.421183 -0.692292 ... -0.441586 0.020066 -0.151709 0.444826 \n",
|
||
"4 2.414446 1.667224 -0.621508 ... 0.269062 0.651172 -0.050461 -0.526448 \n",
|
||
"\n",
|
||
" PCA167 PCA168 PCA169 PCA170 PCA171 Id \n",
|
||
"0 -0.344834 0.514257 1.114106 0.337765 -0.639617 1461 \n",
|
||
"1 1.135109 0.378972 0.953559 -1.008240 4.445435 1462 \n",
|
||
"2 -0.173914 -0.150098 -0.006612 0.190780 -0.152486 1463 \n",
|
||
"3 0.008218 -0.161705 -0.453482 0.472352 0.046141 1464 \n",
|
||
"4 -0.843701 0.574770 -0.227828 1.071423 1.362638 1465 \n",
|
||
"\n",
|
||
"[5 rows x 173 columns]"
|
||
]
|
||
},
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"test.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Write to csv\n",
|
||
"test.to_csv(r'data\\clean_test.csv')\n",
|
||
"train.to_csv(r'data\\clean_train.csv')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.7.4"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|