MLProjects/kaggle_house_prices/cleanup.ipynb

2762 lines
92 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.decomposition import PCA"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MSSubClass</th>\n",
" <th>MSZoning</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>Street</th>\n",
" <th>Alley</th>\n",
" <th>LotShape</th>\n",
" <th>LandContour</th>\n",
" <th>Utilities</th>\n",
" <th>LotConfig</th>\n",
" <th>...</th>\n",
" <th>ScreenPorch</th>\n",
" <th>PoolArea</th>\n",
" <th>PoolQC</th>\n",
" <th>Fence</th>\n",
" <th>MiscFeature</th>\n",
" <th>MiscVal</th>\n",
" <th>MoSold</th>\n",
" <th>YrSold</th>\n",
" <th>SaleType</th>\n",
" <th>SaleCondition</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>1461</td>\n",
" <td>20</td>\n",
" <td>RH</td>\n",
" <td>80.0</td>\n",
" <td>11622</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>...</td>\n",
" <td>120</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>MnPrv</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1462</td>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>81.0</td>\n",
" <td>14267</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Corner</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Gar2</td>\n",
" <td>12500</td>\n",
" <td>6</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1463</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>74.0</td>\n",
" <td>13830</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>MnPrv</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1464</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>78.0</td>\n",
" <td>9978</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1465</td>\n",
" <td>120</td>\n",
" <td>RL</td>\n",
" <td>43.0</td>\n",
" <td>5005</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>HLS</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>...</td>\n",
" <td>144</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2915</td>\n",
" <td>160</td>\n",
" <td>RM</td>\n",
" <td>21.0</td>\n",
" <td>1936</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>2006</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2916</td>\n",
" <td>160</td>\n",
" <td>RM</td>\n",
" <td>21.0</td>\n",
" <td>1894</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>2006</td>\n",
" <td>WD</td>\n",
" <td>Abnorml</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2917</td>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>160.0</td>\n",
" <td>20000</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>2006</td>\n",
" <td>WD</td>\n",
" <td>Abnorml</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2918</td>\n",
" <td>85</td>\n",
" <td>RL</td>\n",
" <td>62.0</td>\n",
" <td>10441</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>MnPrv</td>\n",
" <td>Shed</td>\n",
" <td>700</td>\n",
" <td>7</td>\n",
" <td>2006</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2919</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>74.0</td>\n",
" <td>9627</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>11</td>\n",
" <td>2006</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1459 rows × 79 columns</p>\n",
"</div>"
],
"text/plain": [
" MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
"Id \n",
"1461 20 RH 80.0 11622 Pave NaN Reg \n",
"1462 20 RL 81.0 14267 Pave NaN IR1 \n",
"1463 60 RL 74.0 13830 Pave NaN IR1 \n",
"1464 60 RL 78.0 9978 Pave NaN IR1 \n",
"1465 120 RL 43.0 5005 Pave NaN IR1 \n",
"... ... ... ... ... ... ... ... \n",
"2915 160 RM 21.0 1936 Pave NaN Reg \n",
"2916 160 RM 21.0 1894 Pave NaN Reg \n",
"2917 20 RL 160.0 20000 Pave NaN Reg \n",
"2918 85 RL 62.0 10441 Pave NaN Reg \n",
"2919 60 RL 74.0 9627 Pave NaN Reg \n",
"\n",
" LandContour Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence \\\n",
"Id ... \n",
"1461 Lvl AllPub Inside ... 120 0 NaN MnPrv \n",
"1462 Lvl AllPub Corner ... 0 0 NaN NaN \n",
"1463 Lvl AllPub Inside ... 0 0 NaN MnPrv \n",
"1464 Lvl AllPub Inside ... 0 0 NaN NaN \n",
"1465 HLS AllPub Inside ... 144 0 NaN NaN \n",
"... ... ... ... ... ... ... ... ... \n",
"2915 Lvl AllPub Inside ... 0 0 NaN NaN \n",
"2916 Lvl AllPub Inside ... 0 0 NaN NaN \n",
"2917 Lvl AllPub Inside ... 0 0 NaN NaN \n",
"2918 Lvl AllPub Inside ... 0 0 NaN MnPrv \n",
"2919 Lvl AllPub Inside ... 0 0 NaN NaN \n",
"\n",
" MiscFeature MiscVal MoSold YrSold SaleType SaleCondition \n",
"Id \n",
"1461 NaN 0 6 2010 WD Normal \n",
"1462 Gar2 12500 6 2010 WD Normal \n",
"1463 NaN 0 3 2010 WD Normal \n",
"1464 NaN 0 6 2010 WD Normal \n",
"1465 NaN 0 1 2010 WD Normal \n",
"... ... ... ... ... ... ... \n",
"2915 NaN 0 6 2006 WD Normal \n",
"2916 NaN 0 4 2006 WD Abnorml \n",
"2917 NaN 0 9 2006 WD Abnorml \n",
"2918 Shed 700 7 2006 WD Normal \n",
"2919 NaN 0 11 2006 WD Normal \n",
"\n",
"[1459 rows x 79 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train = pd.read_csv(os.path.join('data', 'train.csv'))\n",
"train.set_index('Id')\n",
"test = pd.read_csv(os.path.join('data', 'test.csv'))\n",
"test.set_index('Id')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
"0 1 60 RL 65.0 8450 Pave NaN Reg \n",
"1 2 20 RL 80.0 9600 Pave NaN Reg \n",
"2 3 60 RL 68.0 11250 Pave NaN IR1 \n",
"3 4 70 RL 60.0 9550 Pave NaN IR1 \n",
"4 5 60 RL 84.0 14260 Pave NaN IR1 \n",
"\n",
" LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold \\\n",
"0 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n",
"1 Lvl AllPub ... 0 NaN NaN NaN 0 5 \n",
"2 Lvl AllPub ... 0 NaN NaN NaN 0 9 \n",
"3 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n",
"4 Lvl AllPub ... 0 NaN NaN NaN 0 12 \n",
"\n",
" YrSold SaleType SaleCondition SalePrice \n",
"0 2008 WD Normal 208500 \n",
"1 2007 WD Normal 181500 \n",
"2 2008 WD Normal 223500 \n",
"3 2006 WD Abnorml 140000 \n",
"4 2008 WD Normal 250000 \n",
"\n",
"[5 rows x 81 columns]\n",
"(1460, 81)\n",
"(1459, 80)\n"
]
}
],
"source": [
"print(train.head())\n",
"print(train.shape)\n",
"print(test.shape)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1460, 19)\n",
"LotFrontage 259\n",
"Alley 1369\n",
"MasVnrType 8\n",
"MasVnrArea 8\n",
"BsmtQual 37\n",
"BsmtCond 37\n",
"BsmtExposure 38\n",
"BsmtFinType1 37\n",
"BsmtFinType2 38\n",
"Electrical 1\n",
"FireplaceQu 690\n",
"GarageType 81\n",
"GarageYrBlt 81\n",
"GarageFinish 81\n",
"GarageQual 81\n",
"GarageCond 81\n",
"PoolQC 1453\n",
"Fence 1179\n",
"MiscFeature 1406\n",
"dtype: int64\n"
]
}
],
"source": [
"# All features with null values and their amounts\n",
"\n",
"train_null = train.loc[:, train.isnull().any()]\n",
"train_null.head()\n",
"print(train_null.shape)\n",
"print(train_null.isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Drop features with too many null values\n",
"\n",
"train = train.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu'], axis=1)\n",
"test = test.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>MSZoning</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>Street</th>\n",
" <th>LotShape</th>\n",
" <th>LandContour</th>\n",
" <th>Utilities</th>\n",
" <th>LotConfig</th>\n",
" <th>...</th>\n",
" <th>EnclosedPorch</th>\n",
" <th>3SsnPorch</th>\n",
" <th>ScreenPorch</th>\n",
" <th>PoolArea</th>\n",
" <th>MiscVal</th>\n",
" <th>MoSold</th>\n",
" <th>YrSold</th>\n",
" <th>SaleType</th>\n",
" <th>SaleCondition</th>\n",
" <th>SalePrice</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>65.0</td>\n",
" <td>8450</td>\n",
" <td>Pave</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>208500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>80.0</td>\n",
" <td>9600</td>\n",
" <td>Pave</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>FR2</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>2007</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>181500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>68.0</td>\n",
" <td>11250</td>\n",
" <td>Pave</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>223500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>RL</td>\n",
" <td>60.0</td>\n",
" <td>9550</td>\n",
" <td>Pave</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Corner</td>\n",
" <td>...</td>\n",
" <td>272</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2006</td>\n",
" <td>WD</td>\n",
" <td>Abnorml</td>\n",
" <td>140000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>84.0</td>\n",
" <td>14260</td>\n",
" <td>Pave</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>FR2</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>250000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 76 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour \\\n",
"0 1 60 RL 65.0 8450 Pave Reg Lvl \n",
"1 2 20 RL 80.0 9600 Pave Reg Lvl \n",
"2 3 60 RL 68.0 11250 Pave IR1 Lvl \n",
"3 4 70 RL 60.0 9550 Pave IR1 Lvl \n",
"4 5 60 RL 84.0 14260 Pave IR1 Lvl \n",
"\n",
" Utilities LotConfig ... EnclosedPorch 3SsnPorch ScreenPorch PoolArea \\\n",
"0 AllPub Inside ... 0 0 0 0 \n",
"1 AllPub FR2 ... 0 0 0 0 \n",
"2 AllPub Inside ... 0 0 0 0 \n",
"3 AllPub Corner ... 272 0 0 0 \n",
"4 AllPub FR2 ... 0 0 0 0 \n",
"\n",
" MiscVal MoSold YrSold SaleType SaleCondition SalePrice \n",
"0 0 2 2008 WD Normal 208500 \n",
"1 0 5 2007 WD Normal 181500 \n",
"2 0 9 2008 WD Normal 223500 \n",
"3 0 2 2006 WD Abnorml 140000 \n",
"4 0 12 2008 WD Normal 250000 \n",
"\n",
"[5 rows x 76 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1460, 76)\n",
"(1459, 75)\n"
]
}
],
"source": [
"print(train.shape)\n",
"print(test.shape)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1460, 14)\n",
"LotFrontage 259\n",
"MasVnrType 8\n",
"MasVnrArea 8\n",
"BsmtQual 37\n",
"BsmtCond 37\n",
"BsmtExposure 38\n",
"BsmtFinType1 37\n",
"BsmtFinType2 38\n",
"Electrical 1\n",
"GarageType 81\n",
"GarageYrBlt 81\n",
"GarageFinish 81\n",
"GarageQual 81\n",
"GarageCond 81\n",
"dtype: int64\n"
]
}
],
"source": [
"# All features with null values and their amounts\n",
"\n",
"train_null = train.loc[:, train.isnull().any()]\n",
"train_null.head()\n",
"print(train_null.shape)\n",
"print(train_null.isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Fill in object nans with 'NAN' and numerical nans with mean (mean imputation)\n",
"# Note that we use the train mean for mean imputation on the test df as well\n",
"\n",
"data = [train, test]\n",
"\n",
"for df in data:\n",
" for column in df:\n",
" if df[column].isna().any():\n",
" if df[column].dtype == object:\n",
" df[column] = df[column].replace(np.nan, 'NAN')\n",
" else:\n",
" mean = train[column].mean()\n",
" df[column] = df[column].replace(np.nan, mean)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MSZoning</th>\n",
" <th>Street</th>\n",
" <th>LotShape</th>\n",
" <th>LandContour</th>\n",
" <th>Utilities</th>\n",
" <th>LotConfig</th>\n",
" <th>LandSlope</th>\n",
" <th>Neighborhood</th>\n",
" <th>Condition1</th>\n",
" <th>Condition2</th>\n",
" <th>...</th>\n",
" <th>Electrical</th>\n",
" <th>KitchenQual</th>\n",
" <th>Functional</th>\n",
" <th>GarageType</th>\n",
" <th>GarageFinish</th>\n",
" <th>GarageQual</th>\n",
" <th>GarageCond</th>\n",
" <th>PavedDrive</th>\n",
" <th>SaleType</th>\n",
" <th>SaleCondition</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>RL</td>\n",
" <td>Pave</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>Gtl</td>\n",
" <td>CollgCr</td>\n",
" <td>Norm</td>\n",
" <td>Norm</td>\n",
" <td>...</td>\n",
" <td>SBrkr</td>\n",
" <td>Gd</td>\n",
" <td>Typ</td>\n",
" <td>Attchd</td>\n",
" <td>RFn</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>RL</td>\n",
" <td>Pave</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>FR2</td>\n",
" <td>Gtl</td>\n",
" <td>Veenker</td>\n",
" <td>Feedr</td>\n",
" <td>Norm</td>\n",
" <td>...</td>\n",
" <td>SBrkr</td>\n",
" <td>TA</td>\n",
" <td>Typ</td>\n",
" <td>Attchd</td>\n",
" <td>RFn</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>RL</td>\n",
" <td>Pave</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>Gtl</td>\n",
" <td>CollgCr</td>\n",
" <td>Norm</td>\n",
" <td>Norm</td>\n",
" <td>...</td>\n",
" <td>SBrkr</td>\n",
" <td>Gd</td>\n",
" <td>Typ</td>\n",
" <td>Attchd</td>\n",
" <td>RFn</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>RL</td>\n",
" <td>Pave</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Corner</td>\n",
" <td>Gtl</td>\n",
" <td>Crawfor</td>\n",
" <td>Norm</td>\n",
" <td>Norm</td>\n",
" <td>...</td>\n",
" <td>SBrkr</td>\n",
" <td>Gd</td>\n",
" <td>Typ</td>\n",
" <td>Detchd</td>\n",
" <td>Unf</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>WD</td>\n",
" <td>Abnorml</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>RL</td>\n",
" <td>Pave</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>FR2</td>\n",
" <td>Gtl</td>\n",
" <td>NoRidge</td>\n",
" <td>Norm</td>\n",
" <td>Norm</td>\n",
" <td>...</td>\n",
" <td>SBrkr</td>\n",
" <td>Gd</td>\n",
" <td>Typ</td>\n",
" <td>Attchd</td>\n",
" <td>RFn</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 38 columns</p>\n",
"</div>"
],
"text/plain": [
" MSZoning Street LotShape LandContour Utilities LotConfig LandSlope \\\n",
"0 RL Pave Reg Lvl AllPub Inside Gtl \n",
"1 RL Pave Reg Lvl AllPub FR2 Gtl \n",
"2 RL Pave IR1 Lvl AllPub Inside Gtl \n",
"3 RL Pave IR1 Lvl AllPub Corner Gtl \n",
"4 RL Pave IR1 Lvl AllPub FR2 Gtl \n",
"\n",
" Neighborhood Condition1 Condition2 ... Electrical KitchenQual Functional \\\n",
"0 CollgCr Norm Norm ... SBrkr Gd Typ \n",
"1 Veenker Feedr Norm ... SBrkr TA Typ \n",
"2 CollgCr Norm Norm ... SBrkr Gd Typ \n",
"3 Crawfor Norm Norm ... SBrkr Gd Typ \n",
"4 NoRidge Norm Norm ... SBrkr Gd Typ \n",
"\n",
" GarageType GarageFinish GarageQual GarageCond PavedDrive SaleType \\\n",
"0 Attchd RFn TA TA Y WD \n",
"1 Attchd RFn TA TA Y WD \n",
"2 Attchd RFn TA TA Y WD \n",
"3 Detchd Unf TA TA Y WD \n",
"4 Attchd RFn TA TA Y WD \n",
"\n",
" SaleCondition \n",
"0 Normal \n",
"1 Normal \n",
"2 Normal \n",
"3 Abnorml \n",
"4 Normal \n",
"\n",
"[5 rows x 38 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# All object type features to convert to numerical\n",
"\n",
"train_obj = train.select_dtypes(include=['object']).copy()\n",
"test_obj = test.select_dtypes(include=['object']).copy()\n",
"\n",
"train_obj.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>OverallQual</th>\n",
" <th>OverallCond</th>\n",
" <th>YearBuilt</th>\n",
" <th>YearRemodAdd</th>\n",
" <th>MasVnrArea</th>\n",
" <th>BsmtFinSF1</th>\n",
" <th>...</th>\n",
" <th>SaleType_ConLw</th>\n",
" <th>SaleType_New</th>\n",
" <th>SaleType_Oth</th>\n",
" <th>SaleType_WD</th>\n",
" <th>SaleCondition_Abnorml</th>\n",
" <th>SaleCondition_AdjLand</th>\n",
" <th>SaleCondition_Alloca</th>\n",
" <th>SaleCondition_Family</th>\n",
" <th>SaleCondition_Normal</th>\n",
" <th>SaleCondition_Partial</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>60</td>\n",
" <td>65.0</td>\n",
" <td>8450</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>2003</td>\n",
" <td>2003</td>\n",
" <td>196.0</td>\n",
" <td>706</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>20</td>\n",
" <td>80.0</td>\n",
" <td>9600</td>\n",
" <td>6</td>\n",
" <td>8</td>\n",
" <td>1976</td>\n",
" <td>1976</td>\n",
" <td>0.0</td>\n",
" <td>978</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>60</td>\n",
" <td>68.0</td>\n",
" <td>11250</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>2001</td>\n",
" <td>2002</td>\n",
" <td>162.0</td>\n",
" <td>486</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>60.0</td>\n",
" <td>9550</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>1915</td>\n",
" <td>1970</td>\n",
" <td>0.0</td>\n",
" <td>216</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>60</td>\n",
" <td>84.0</td>\n",
" <td>14260</td>\n",
" <td>8</td>\n",
" <td>5</td>\n",
" <td>2000</td>\n",
" <td>2000</td>\n",
" <td>350.0</td>\n",
" <td>655</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 283 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt \\\n",
"0 1 60 65.0 8450 7 5 2003 \n",
"1 2 20 80.0 9600 6 8 1976 \n",
"2 3 60 68.0 11250 7 5 2001 \n",
"3 4 70 60.0 9550 7 5 1915 \n",
"4 5 60 84.0 14260 8 5 2000 \n",
"\n",
" YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw SaleType_New \\\n",
"0 2003 196.0 706 ... 0 0 \n",
"1 1976 0.0 978 ... 0 0 \n",
"2 2002 162.0 486 ... 0 0 \n",
"3 1970 0.0 216 ... 0 0 \n",
"4 2000 350.0 655 ... 0 0 \n",
"\n",
" SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand \\\n",
"0 0 1 0 0 \n",
"1 0 1 0 0 \n",
"2 0 1 0 0 \n",
"3 0 1 1 0 \n",
"4 0 1 0 0 \n",
"\n",
" SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal \\\n",
"0 0 0 1 \n",
"1 0 0 1 \n",
"2 0 0 1 \n",
"3 0 0 0 \n",
"4 0 0 1 \n",
"\n",
" SaleCondition_Partial \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
"[5 rows x 283 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(list(train_obj))\n",
"\n",
"one_hot_train = pd.get_dummies(train, columns=list(train_obj))\n",
"one_hot_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# One hot encoding\n",
"\n",
"train = pd.get_dummies(train, columns=list(train_obj))\n",
"test = pd.get_dummies(test, columns=list(test_obj))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1460, 283)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>OverallQual</th>\n",
" <th>OverallCond</th>\n",
" <th>YearBuilt</th>\n",
" <th>YearRemodAdd</th>\n",
" <th>MasVnrArea</th>\n",
" <th>BsmtFinSF1</th>\n",
" <th>...</th>\n",
" <th>SaleType_ConLw</th>\n",
" <th>SaleType_New</th>\n",
" <th>SaleType_Oth</th>\n",
" <th>SaleType_WD</th>\n",
" <th>SaleCondition_Abnorml</th>\n",
" <th>SaleCondition_AdjLand</th>\n",
" <th>SaleCondition_Alloca</th>\n",
" <th>SaleCondition_Family</th>\n",
" <th>SaleCondition_Normal</th>\n",
" <th>SaleCondition_Partial</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>60</td>\n",
" <td>65.0</td>\n",
" <td>8450</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>2003</td>\n",
" <td>2003</td>\n",
" <td>196.0</td>\n",
" <td>706</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>20</td>\n",
" <td>80.0</td>\n",
" <td>9600</td>\n",
" <td>6</td>\n",
" <td>8</td>\n",
" <td>1976</td>\n",
" <td>1976</td>\n",
" <td>0.0</td>\n",
" <td>978</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>60</td>\n",
" <td>68.0</td>\n",
" <td>11250</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>2001</td>\n",
" <td>2002</td>\n",
" <td>162.0</td>\n",
" <td>486</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>60.0</td>\n",
" <td>9550</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>1915</td>\n",
" <td>1970</td>\n",
" <td>0.0</td>\n",
" <td>216</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>60</td>\n",
" <td>84.0</td>\n",
" <td>14260</td>\n",
" <td>8</td>\n",
" <td>5</td>\n",
" <td>2000</td>\n",
" <td>2000</td>\n",
" <td>350.0</td>\n",
" <td>655</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 283 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt \\\n",
"0 1 60 65.0 8450 7 5 2003 \n",
"1 2 20 80.0 9600 6 8 1976 \n",
"2 3 60 68.0 11250 7 5 2001 \n",
"3 4 70 60.0 9550 7 5 1915 \n",
"4 5 60 84.0 14260 8 5 2000 \n",
"\n",
" YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw SaleType_New \\\n",
"0 2003 196.0 706 ... 0 0 \n",
"1 1976 0.0 978 ... 0 0 \n",
"2 2002 162.0 486 ... 0 0 \n",
"3 1970 0.0 216 ... 0 0 \n",
"4 2000 350.0 655 ... 0 0 \n",
"\n",
" SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand \\\n",
"0 0 1 0 0 \n",
"1 0 1 0 0 \n",
"2 0 1 0 0 \n",
"3 0 1 1 0 \n",
"4 0 1 0 0 \n",
"\n",
" SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal \\\n",
"0 0 0 1 \n",
"1 0 0 1 \n",
"2 0 0 1 \n",
"3 0 0 0 \n",
"4 0 0 1 \n",
"\n",
" SaleCondition_Partial \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
"[5 rows x 283 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(train.shape)\n",
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1459, 272)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>OverallQual</th>\n",
" <th>OverallCond</th>\n",
" <th>YearBuilt</th>\n",
" <th>YearRemodAdd</th>\n",
" <th>MasVnrArea</th>\n",
" <th>BsmtFinSF1</th>\n",
" <th>...</th>\n",
" <th>SaleType_NAN</th>\n",
" <th>SaleType_New</th>\n",
" <th>SaleType_Oth</th>\n",
" <th>SaleType_WD</th>\n",
" <th>SaleCondition_Abnorml</th>\n",
" <th>SaleCondition_AdjLand</th>\n",
" <th>SaleCondition_Alloca</th>\n",
" <th>SaleCondition_Family</th>\n",
" <th>SaleCondition_Normal</th>\n",
" <th>SaleCondition_Partial</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1461</td>\n",
" <td>20</td>\n",
" <td>80.0</td>\n",
" <td>11622</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>1961</td>\n",
" <td>1961</td>\n",
" <td>0.0</td>\n",
" <td>468.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1462</td>\n",
" <td>20</td>\n",
" <td>81.0</td>\n",
" <td>14267</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>1958</td>\n",
" <td>1958</td>\n",
" <td>108.0</td>\n",
" <td>923.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>1463</td>\n",
" <td>60</td>\n",
" <td>74.0</td>\n",
" <td>13830</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>1997</td>\n",
" <td>1998</td>\n",
" <td>0.0</td>\n",
" <td>791.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>1464</td>\n",
" <td>60</td>\n",
" <td>78.0</td>\n",
" <td>9978</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>1998</td>\n",
" <td>1998</td>\n",
" <td>20.0</td>\n",
" <td>602.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>1465</td>\n",
" <td>120</td>\n",
" <td>43.0</td>\n",
" <td>5005</td>\n",
" <td>8</td>\n",
" <td>5</td>\n",
" <td>1992</td>\n",
" <td>1992</td>\n",
" <td>0.0</td>\n",
" <td>263.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 272 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass LotFrontage LotArea OverallQual OverallCond \\\n",
"0 1461 20 80.0 11622 5 6 \n",
"1 1462 20 81.0 14267 6 6 \n",
"2 1463 60 74.0 13830 5 5 \n",
"3 1464 60 78.0 9978 6 6 \n",
"4 1465 120 43.0 5005 8 5 \n",
"\n",
" YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_NAN \\\n",
"0 1961 1961 0.0 468.0 ... 0 \n",
"1 1958 1958 108.0 923.0 ... 0 \n",
"2 1997 1998 0.0 791.0 ... 0 \n",
"3 1998 1998 20.0 602.0 ... 0 \n",
"4 1992 1992 0.0 263.0 ... 0 \n",
"\n",
" SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml \\\n",
"0 0 0 1 0 \n",
"1 0 0 1 0 \n",
"2 0 0 1 0 \n",
"3 0 0 1 0 \n",
"4 0 0 1 0 \n",
"\n",
" SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 0 0 \n",
"3 0 0 0 \n",
"4 0 0 0 \n",
"\n",
" SaleCondition_Normal SaleCondition_Partial \n",
"0 1 0 \n",
"1 1 0 \n",
"2 1 0 \n",
"3 1 0 \n",
"4 1 0 \n",
"\n",
"[5 rows x 272 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(test.shape)\n",
"test.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'Exterior1st_Stone', 'Condition2_RRNn', 'Condition2_RRAn', 'Electrical_NAN', 'HouseStyle_2.5Fin', 'Exterior1st_ImStucc', 'Electrical_Mix', 'RoofMatl_Roll', 'SalePrice', 'RoofMatl_Membran', 'Heating_Floor', 'RoofMatl_Metal', 'Condition2_RRAe', 'RoofMatl_ClyTile', 'GarageQual_Ex', 'Heating_OthW', 'Exterior2nd_Other', 'Utilities_NoSeWa'}\n"
]
}
],
"source": [
"# Check for missing columns\n",
"\n",
"missing_cols = set( train.columns ) - set( test.columns )\n",
"print(missing_cols)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Note that we are missing some columns. This is because in the one-hot encoding process, the test set did not have any\n",
"# samples with a certain categorical output. For example, in RoofMatl, there were no samples in test where the value was\n",
"# ClyTile, which means no column was create to one hot encode it. As such, we will create zero-valued columns to fill\n",
"\n",
"for col in missing_cols:\n",
" test[col] = 0\n",
"\n",
"# Maintain same ordering for the df\n",
"test = test[train.columns]\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Drop zeroed out SalePrice column from test set\n",
"\n",
"test = test.drop('SalePrice', axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>OverallQual</th>\n",
" <th>OverallCond</th>\n",
" <th>YearBuilt</th>\n",
" <th>YearRemodAdd</th>\n",
" <th>MasVnrArea</th>\n",
" <th>BsmtFinSF1</th>\n",
" <th>...</th>\n",
" <th>SaleType_ConLw</th>\n",
" <th>SaleType_New</th>\n",
" <th>SaleType_Oth</th>\n",
" <th>SaleType_WD</th>\n",
" <th>SaleCondition_Abnorml</th>\n",
" <th>SaleCondition_AdjLand</th>\n",
" <th>SaleCondition_Alloca</th>\n",
" <th>SaleCondition_Family</th>\n",
" <th>SaleCondition_Normal</th>\n",
" <th>SaleCondition_Partial</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>60</td>\n",
" <td>65.0</td>\n",
" <td>8450</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>2003</td>\n",
" <td>2003</td>\n",
" <td>196.0</td>\n",
" <td>706</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>20</td>\n",
" <td>80.0</td>\n",
" <td>9600</td>\n",
" <td>6</td>\n",
" <td>8</td>\n",
" <td>1976</td>\n",
" <td>1976</td>\n",
" <td>0.0</td>\n",
" <td>978</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>60</td>\n",
" <td>68.0</td>\n",
" <td>11250</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>2001</td>\n",
" <td>2002</td>\n",
" <td>162.0</td>\n",
" <td>486</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>60.0</td>\n",
" <td>9550</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>1915</td>\n",
" <td>1970</td>\n",
" <td>0.0</td>\n",
" <td>216</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>60</td>\n",
" <td>84.0</td>\n",
" <td>14260</td>\n",
" <td>8</td>\n",
" <td>5</td>\n",
" <td>2000</td>\n",
" <td>2000</td>\n",
" <td>350.0</td>\n",
" <td>655</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 283 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt \\\n",
"0 1 60 65.0 8450 7 5 2003 \n",
"1 2 20 80.0 9600 6 8 1976 \n",
"2 3 60 68.0 11250 7 5 2001 \n",
"3 4 70 60.0 9550 7 5 1915 \n",
"4 5 60 84.0 14260 8 5 2000 \n",
"\n",
" YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw SaleType_New \\\n",
"0 2003 196.0 706 ... 0 0 \n",
"1 1976 0.0 978 ... 0 0 \n",
"2 2002 162.0 486 ... 0 0 \n",
"3 1970 0.0 216 ... 0 0 \n",
"4 2000 350.0 655 ... 0 0 \n",
"\n",
" SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand \\\n",
"0 0 1 0 0 \n",
"1 0 1 0 0 \n",
"2 0 1 0 0 \n",
"3 0 1 1 0 \n",
"4 0 1 0 0 \n",
"\n",
" SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal \\\n",
"0 0 0 1 \n",
"1 0 0 1 \n",
"2 0 0 1 \n",
"3 0 0 0 \n",
"4 0 0 1 \n",
"\n",
" SaleCondition_Partial \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
"[5 rows x 283 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>OverallQual</th>\n",
" <th>OverallCond</th>\n",
" <th>YearBuilt</th>\n",
" <th>YearRemodAdd</th>\n",
" <th>MasVnrArea</th>\n",
" <th>BsmtFinSF1</th>\n",
" <th>...</th>\n",
" <th>SaleType_ConLw</th>\n",
" <th>SaleType_New</th>\n",
" <th>SaleType_Oth</th>\n",
" <th>SaleType_WD</th>\n",
" <th>SaleCondition_Abnorml</th>\n",
" <th>SaleCondition_AdjLand</th>\n",
" <th>SaleCondition_Alloca</th>\n",
" <th>SaleCondition_Family</th>\n",
" <th>SaleCondition_Normal</th>\n",
" <th>SaleCondition_Partial</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1461</td>\n",
" <td>20</td>\n",
" <td>80.0</td>\n",
" <td>11622</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>1961</td>\n",
" <td>1961</td>\n",
" <td>0.0</td>\n",
" <td>468.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1462</td>\n",
" <td>20</td>\n",
" <td>81.0</td>\n",
" <td>14267</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>1958</td>\n",
" <td>1958</td>\n",
" <td>108.0</td>\n",
" <td>923.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>1463</td>\n",
" <td>60</td>\n",
" <td>74.0</td>\n",
" <td>13830</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>1997</td>\n",
" <td>1998</td>\n",
" <td>0.0</td>\n",
" <td>791.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>1464</td>\n",
" <td>60</td>\n",
" <td>78.0</td>\n",
" <td>9978</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>1998</td>\n",
" <td>1998</td>\n",
" <td>20.0</td>\n",
" <td>602.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>1465</td>\n",
" <td>120</td>\n",
" <td>43.0</td>\n",
" <td>5005</td>\n",
" <td>8</td>\n",
" <td>5</td>\n",
" <td>1992</td>\n",
" <td>1992</td>\n",
" <td>0.0</td>\n",
" <td>263.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 282 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass LotFrontage LotArea OverallQual OverallCond \\\n",
"0 1461 20 80.0 11622 5 6 \n",
"1 1462 20 81.0 14267 6 6 \n",
"2 1463 60 74.0 13830 5 5 \n",
"3 1464 60 78.0 9978 6 6 \n",
"4 1465 120 43.0 5005 8 5 \n",
"\n",
" YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw \\\n",
"0 1961 1961 0.0 468.0 ... 0 \n",
"1 1958 1958 108.0 923.0 ... 0 \n",
"2 1997 1998 0.0 791.0 ... 0 \n",
"3 1998 1998 20.0 602.0 ... 0 \n",
"4 1992 1992 0.0 263.0 ... 0 \n",
"\n",
" SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml \\\n",
"0 0 0 1 0 \n",
"1 0 0 1 0 \n",
"2 0 0 1 0 \n",
"3 0 0 1 0 \n",
"4 0 0 1 0 \n",
"\n",
" SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 0 0 \n",
"3 0 0 0 \n",
"4 0 0 0 \n",
"\n",
" SaleCondition_Normal SaleCondition_Partial \n",
"0 1 0 \n",
"1 1 0 \n",
"2 1 0 \n",
"3 1 0 \n",
"4 1 0 \n",
"\n",
"[5 rows x 282 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Store columns we don't want to scale\n",
"\n",
"testId = test['Id']\n",
"trainSalePrice = train['SalePrice']\n",
"trainId = train['Id']\n",
"test = test.drop('Id', axis=1)\n",
"train = train.drop(['Id', 'SalePrice'], axis=1)\n",
"\n",
"# Store labels to remake our dataframes\n",
"\n",
"labels = list(train)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Scale the data and reformat as pandas dataframe\n",
"scaler = StandardScaler()\n",
"\n",
"scaler.fit(train)\n",
"\n",
"train = pd.DataFrame(scaler.transform(train), columns = labels)\n",
"test = pd.DataFrame(scaler.transform(test), columns = labels)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,\n",
" svd_solver='auto', tol=0.0, whiten=False)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pca = PCA(.95)\n",
"pca.fit(train)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# create pd dataframe from pca transformation\n",
"\n",
"n_components = 172\n",
"train = pd.DataFrame(pca.transform(train), columns=['PCA%i' % i for i in range(n_components)])\n",
"test = pd.DataFrame(pca.transform(test), columns=['PCA%i' % i for i in range(n_components)])"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Bring back columns we set aside\n",
"\n",
"test['Id'] = testId\n",
"train['Id'] = trainId\n",
"train['SalePrice'] = trainSalePrice"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PCA0</th>\n",
" <th>PCA1</th>\n",
" <th>PCA2</th>\n",
" <th>PCA3</th>\n",
" <th>PCA4</th>\n",
" <th>PCA5</th>\n",
" <th>PCA6</th>\n",
" <th>PCA7</th>\n",
" <th>PCA8</th>\n",
" <th>PCA9</th>\n",
" <th>...</th>\n",
" <th>PCA164</th>\n",
" <th>PCA165</th>\n",
" <th>PCA166</th>\n",
" <th>PCA167</th>\n",
" <th>PCA168</th>\n",
" <th>PCA169</th>\n",
" <th>PCA170</th>\n",
" <th>PCA171</th>\n",
" <th>Id</th>\n",
" <th>SalePrice</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>4.345109</td>\n",
" <td>1.619386</td>\n",
" <td>-0.739617</td>\n",
" <td>-2.080179</td>\n",
" <td>-0.985088</td>\n",
" <td>1.999117</td>\n",
" <td>-1.231870</td>\n",
" <td>-0.131782</td>\n",
" <td>1.316470</td>\n",
" <td>-1.336446</td>\n",
" <td>...</td>\n",
" <td>0.160733</td>\n",
" <td>0.071333</td>\n",
" <td>0.155468</td>\n",
" <td>0.172801</td>\n",
" <td>-0.169568</td>\n",
" <td>-0.144326</td>\n",
" <td>0.391713</td>\n",
" <td>-0.013357</td>\n",
" <td>1</td>\n",
" <td>208500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.019142</td>\n",
" <td>-3.106959</td>\n",
" <td>0.168223</td>\n",
" <td>-0.553341</td>\n",
" <td>0.940712</td>\n",
" <td>0.200719</td>\n",
" <td>-0.468954</td>\n",
" <td>0.235082</td>\n",
" <td>-0.838022</td>\n",
" <td>-1.273833</td>\n",
" <td>...</td>\n",
" <td>-1.063234</td>\n",
" <td>-0.334556</td>\n",
" <td>0.361166</td>\n",
" <td>-1.218397</td>\n",
" <td>-0.346191</td>\n",
" <td>-0.962753</td>\n",
" <td>-0.138863</td>\n",
" <td>1.083103</td>\n",
" <td>2</td>\n",
" <td>181500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>4.851149</td>\n",
" <td>1.242811</td>\n",
" <td>-0.351815</td>\n",
" <td>-1.484957</td>\n",
" <td>-0.758200</td>\n",
" <td>2.181179</td>\n",
" <td>-1.843949</td>\n",
" <td>0.296194</td>\n",
" <td>1.299142</td>\n",
" <td>-1.391358</td>\n",
" <td>...</td>\n",
" <td>0.088334</td>\n",
" <td>0.238624</td>\n",
" <td>0.327280</td>\n",
" <td>0.325285</td>\n",
" <td>-0.704900</td>\n",
" <td>-0.036388</td>\n",
" <td>-0.540516</td>\n",
" <td>0.021711</td>\n",
" <td>3</td>\n",
" <td>223500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>-1.771641</td>\n",
" <td>0.039500</td>\n",
" <td>-1.358623</td>\n",
" <td>1.920760</td>\n",
" <td>-2.550817</td>\n",
" <td>0.209519</td>\n",
" <td>-0.756387</td>\n",
" <td>0.700109</td>\n",
" <td>-1.408543</td>\n",
" <td>0.025023</td>\n",
" <td>...</td>\n",
" <td>-0.172186</td>\n",
" <td>-0.518922</td>\n",
" <td>0.231498</td>\n",
" <td>-0.074296</td>\n",
" <td>-0.034287</td>\n",
" <td>-0.877735</td>\n",
" <td>0.028065</td>\n",
" <td>-0.321009</td>\n",
" <td>4</td>\n",
" <td>140000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>6.463747</td>\n",
" <td>1.064473</td>\n",
" <td>0.209472</td>\n",
" <td>0.448906</td>\n",
" <td>-1.555301</td>\n",
" <td>3.215822</td>\n",
" <td>-0.946356</td>\n",
" <td>-0.805204</td>\n",
" <td>2.112526</td>\n",
" <td>-1.821083</td>\n",
" <td>...</td>\n",
" <td>-0.270189</td>\n",
" <td>0.375297</td>\n",
" <td>-0.396732</td>\n",
" <td>-0.109084</td>\n",
" <td>0.317305</td>\n",
" <td>-0.145975</td>\n",
" <td>-0.674692</td>\n",
" <td>-0.378458</td>\n",
" <td>5</td>\n",
" <td>250000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 174 columns</p>\n",
"</div>"
],
"text/plain": [
" PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 PCA6 \\\n",
"0 4.345109 1.619386 -0.739617 -2.080179 -0.985088 1.999117 -1.231870 \n",
"1 0.019142 -3.106959 0.168223 -0.553341 0.940712 0.200719 -0.468954 \n",
"2 4.851149 1.242811 -0.351815 -1.484957 -0.758200 2.181179 -1.843949 \n",
"3 -1.771641 0.039500 -1.358623 1.920760 -2.550817 0.209519 -0.756387 \n",
"4 6.463747 1.064473 0.209472 0.448906 -1.555301 3.215822 -0.946356 \n",
"\n",
" PCA7 PCA8 PCA9 ... PCA164 PCA165 PCA166 PCA167 \\\n",
"0 -0.131782 1.316470 -1.336446 ... 0.160733 0.071333 0.155468 0.172801 \n",
"1 0.235082 -0.838022 -1.273833 ... -1.063234 -0.334556 0.361166 -1.218397 \n",
"2 0.296194 1.299142 -1.391358 ... 0.088334 0.238624 0.327280 0.325285 \n",
"3 0.700109 -1.408543 0.025023 ... -0.172186 -0.518922 0.231498 -0.074296 \n",
"4 -0.805204 2.112526 -1.821083 ... -0.270189 0.375297 -0.396732 -0.109084 \n",
"\n",
" PCA168 PCA169 PCA170 PCA171 Id SalePrice \n",
"0 -0.169568 -0.144326 0.391713 -0.013357 1 208500 \n",
"1 -0.346191 -0.962753 -0.138863 1.083103 2 181500 \n",
"2 -0.704900 -0.036388 -0.540516 0.021711 3 223500 \n",
"3 -0.034287 -0.877735 0.028065 -0.321009 4 140000 \n",
"4 0.317305 -0.145975 -0.674692 -0.378458 5 250000 \n",
"\n",
"[5 rows x 174 columns]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PCA0</th>\n",
" <th>PCA1</th>\n",
" <th>PCA2</th>\n",
" <th>PCA3</th>\n",
" <th>PCA4</th>\n",
" <th>PCA5</th>\n",
" <th>PCA6</th>\n",
" <th>PCA7</th>\n",
" <th>PCA8</th>\n",
" <th>PCA9</th>\n",
" <th>...</th>\n",
" <th>PCA163</th>\n",
" <th>PCA164</th>\n",
" <th>PCA165</th>\n",
" <th>PCA166</th>\n",
" <th>PCA167</th>\n",
" <th>PCA168</th>\n",
" <th>PCA169</th>\n",
" <th>PCA170</th>\n",
" <th>PCA171</th>\n",
" <th>Id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>-3.208086</td>\n",
" <td>-2.987338</td>\n",
" <td>-0.327066</td>\n",
" <td>-1.609206</td>\n",
" <td>0.016879</td>\n",
" <td>-1.514939</td>\n",
" <td>-0.417889</td>\n",
" <td>-0.988173</td>\n",
" <td>-0.653363</td>\n",
" <td>0.647642</td>\n",
" <td>...</td>\n",
" <td>-0.027364</td>\n",
" <td>0.653222</td>\n",
" <td>-0.201973</td>\n",
" <td>-0.769946</td>\n",
" <td>-0.344834</td>\n",
" <td>0.514257</td>\n",
" <td>1.114106</td>\n",
" <td>0.337765</td>\n",
" <td>-0.639617</td>\n",
" <td>1461</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>-1.403753</td>\n",
" <td>-4.261851</td>\n",
" <td>0.107527</td>\n",
" <td>0.935981</td>\n",
" <td>0.165777</td>\n",
" <td>-0.299485</td>\n",
" <td>-0.524918</td>\n",
" <td>-2.332121</td>\n",
" <td>0.031044</td>\n",
" <td>-0.820514</td>\n",
" <td>...</td>\n",
" <td>3.856117</td>\n",
" <td>0.787996</td>\n",
" <td>0.215221</td>\n",
" <td>0.458275</td>\n",
" <td>1.135109</td>\n",
" <td>0.378972</td>\n",
" <td>0.953559</td>\n",
" <td>-1.008240</td>\n",
" <td>4.445435</td>\n",
" <td>1462</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2.257002</td>\n",
" <td>0.427951</td>\n",
" <td>-0.610464</td>\n",
" <td>-1.301125</td>\n",
" <td>-1.058327</td>\n",
" <td>2.674177</td>\n",
" <td>-1.500824</td>\n",
" <td>-0.223999</td>\n",
" <td>0.403440</td>\n",
" <td>-0.198229</td>\n",
" <td>...</td>\n",
" <td>-0.117138</td>\n",
" <td>-0.378473</td>\n",
" <td>-0.031613</td>\n",
" <td>0.090593</td>\n",
" <td>-0.173914</td>\n",
" <td>-0.150098</td>\n",
" <td>-0.006612</td>\n",
" <td>0.190780</td>\n",
" <td>-0.152486</td>\n",
" <td>1463</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>3.253618</td>\n",
" <td>0.537318</td>\n",
" <td>-0.796079</td>\n",
" <td>-0.851716</td>\n",
" <td>-1.209643</td>\n",
" <td>2.388795</td>\n",
" <td>-1.340676</td>\n",
" <td>-0.876322</td>\n",
" <td>0.421183</td>\n",
" <td>-0.692292</td>\n",
" <td>...</td>\n",
" <td>-0.441586</td>\n",
" <td>0.020066</td>\n",
" <td>-0.151709</td>\n",
" <td>0.444826</td>\n",
" <td>0.008218</td>\n",
" <td>-0.161705</td>\n",
" <td>-0.453482</td>\n",
" <td>0.472352</td>\n",
" <td>0.046141</td>\n",
" <td>1464</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>2.876409</td>\n",
" <td>-0.075909</td>\n",
" <td>-0.154959</td>\n",
" <td>-2.469870</td>\n",
" <td>1.407820</td>\n",
" <td>0.487532</td>\n",
" <td>0.072190</td>\n",
" <td>2.414446</td>\n",
" <td>1.667224</td>\n",
" <td>-0.621508</td>\n",
" <td>...</td>\n",
" <td>0.269062</td>\n",
" <td>0.651172</td>\n",
" <td>-0.050461</td>\n",
" <td>-0.526448</td>\n",
" <td>-0.843701</td>\n",
" <td>0.574770</td>\n",
" <td>-0.227828</td>\n",
" <td>1.071423</td>\n",
" <td>1.362638</td>\n",
" <td>1465</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 173 columns</p>\n",
"</div>"
],
"text/plain": [
" PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 PCA6 \\\n",
"0 -3.208086 -2.987338 -0.327066 -1.609206 0.016879 -1.514939 -0.417889 \n",
"1 -1.403753 -4.261851 0.107527 0.935981 0.165777 -0.299485 -0.524918 \n",
"2 2.257002 0.427951 -0.610464 -1.301125 -1.058327 2.674177 -1.500824 \n",
"3 3.253618 0.537318 -0.796079 -0.851716 -1.209643 2.388795 -1.340676 \n",
"4 2.876409 -0.075909 -0.154959 -2.469870 1.407820 0.487532 0.072190 \n",
"\n",
" PCA7 PCA8 PCA9 ... PCA163 PCA164 PCA165 PCA166 \\\n",
"0 -0.988173 -0.653363 0.647642 ... -0.027364 0.653222 -0.201973 -0.769946 \n",
"1 -2.332121 0.031044 -0.820514 ... 3.856117 0.787996 0.215221 0.458275 \n",
"2 -0.223999 0.403440 -0.198229 ... -0.117138 -0.378473 -0.031613 0.090593 \n",
"3 -0.876322 0.421183 -0.692292 ... -0.441586 0.020066 -0.151709 0.444826 \n",
"4 2.414446 1.667224 -0.621508 ... 0.269062 0.651172 -0.050461 -0.526448 \n",
"\n",
" PCA167 PCA168 PCA169 PCA170 PCA171 Id \n",
"0 -0.344834 0.514257 1.114106 0.337765 -0.639617 1461 \n",
"1 1.135109 0.378972 0.953559 -1.008240 4.445435 1462 \n",
"2 -0.173914 -0.150098 -0.006612 0.190780 -0.152486 1463 \n",
"3 0.008218 -0.161705 -0.453482 0.472352 0.046141 1464 \n",
"4 -0.843701 0.574770 -0.227828 1.071423 1.362638 1465 \n",
"\n",
"[5 rows x 173 columns]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# Write to csv\n",
"test.to_csv(r'data\\clean_test.csv')\n",
"train.to_csv(r'data\\clean_train.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}