In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Load the raw train/test splits. 'Id' is kept as a regular column because
# later cells read train['Id'] and test['Id'] directly.
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

# Display only: set_index returns a new frame and the result is not assigned,
# so `test` itself is left unchanged.
test.set_index('Id')

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [3]:
# First rows of train, followed by the (rows, columns) shape of each split
print(train.head(), train.shape, test.shape, sep='\n')

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [4]:
# All features with null values and their amounts

# Keep only the train columns that contain at least one missing value,
# then report how many such columns there are and the null count of each.
train_null = train.loc[:, train.isnull().any()]
print(train_null.shape)
print(train_null.isnull().sum())

(1460, 19)
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [5]:
# Drop features with too many null values (same column list for both splits)

sparse_features = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu']
train = train.drop(columns=sparse_features)
test = test.drop(columns=sparse_features)

In [6]:
# Show the first five rows of train
train.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [7]:
# (rows, columns) of train, then of test
print(train.shape, test.shape, sep='\n')

(1460, 76)
(1459, 75)


In [8]:
# All features with null values and their amounts

# Keep only the train columns that contain at least one missing value,
# then report how many such columns there are and the null count of each.
train_null = train.loc[:, train.isnull().any()]
print(train_null.shape)
print(train_null.isnull().sum())

(1460, 14)
LotFrontage     259
MasVnrType        8
MasVnrArea        8
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Electrical        1
GarageType       81
GarageYrBlt      81
GarageFinish     81
GarageQual       81
GarageCond       81
dtype: int64


In [9]:
# Impute missing values on both splits:
#   * object columns get the literal placeholder 'NAN'
#   * every other column gets that column's mean (mean imputation)
# The mean is always taken from train, so test is filled with train statistics.

data = [train, test]

for frame in data:
    # Only visit the columns of this frame that actually contain NaNs
    for name in frame.columns[frame.isna().any()]:
        if frame[name].dtype == object:
            fill_value = 'NAN'
        else:
            fill_value = train[name].mean()
        frame[name] = frame[name].fillna(fill_value)

In [10]:
# Copy out the object-dtype columns of each split; these are the features
# that still have to be converted to numerical form

train_obj = train.select_dtypes(include='object').copy()
test_obj = test.select_dtypes(include='object').copy()

train_obj.head(n=5)

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,SBrkr,Gd,Typ,Detchd,Unf,TA,TA,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal


In [11]:
# List the object columns, then one-hot encode them into a separate frame
# (one_hot_train); train itself is not modified by this cell
categorical_cols = train_obj.columns.tolist()
print(categorical_cols)

one_hot_train = pd.get_dummies(train, columns=categorical_cols)
one_hot_train.head(n=5)

['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [12]:
# One hot encoding

# Each split is encoded on its own, using the object columns found in that split
train = pd.get_dummies(train, columns=train_obj.columns.tolist())
test = pd.get_dummies(test, columns=test_obj.columns.tolist())

In [13]:
# Shape of train, then its first five rows
print(train.shape)
train.head(n=5)

(1460, 283)


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [14]:
# Shape of test, then its first five rows
print(test.shape)
test.head(n=5)

(1459, 272)


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_NAN,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,1,0,0,0,0,1,0


In [15]:
# Check for missing columns

# Column labels that exist in train but not in test
missing_cols = set(train.columns).difference(set(test.columns))
print(missing_cols)

{'Exterior1st_Stone', 'Condition2_RRNn', 'Condition2_RRAn', 'Electrical_NAN', 'HouseStyle_2.5Fin', 'Exterior1st_ImStucc', 'Electrical_Mix', 'RoofMatl_Roll', 'SalePrice', 'RoofMatl_Membran', 'Heating_Floor', 'RoofMatl_Metal', 'Condition2_RRAe', 'RoofMatl_ClyTile', 'GarageQual_Ex', 'Heating_OthW', 'Exterior2nd_Other', 'Utilities_NoSeWa'}


In [16]:
# Note that test is missing some of train's columns. This is because the two splits
# were one-hot encoded separately, and test had no samples for certain categorical
# values. For example, in RoofMatl there were no samples in test where the value was
# ClyTile, so no column was created to one-hot encode it.
#
# Reindex test onto train's columns in a single step:
#   * columns that test lacks are created and filled with 0
#     (this also adds an all-zero SalePrice column to test)
#   * columns that exist only in test are dropped
#   * the column order ends up identical to train's
test = test.reindex(columns=train.columns, fill_value=0)


In [17]:
# test has no real target values, so remove its zero-filled SalePrice column

test = test.drop(columns='SalePrice')

In [18]:
# Show the first five rows of train
train.head(n=5)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [19]:
# Show the first five rows of test
test.head(n=5)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,1,0,0,0,0,1,0


In [20]:
# Set aside the columns that must not be scaled (identifiers and the target),
# then remove them from the feature frames

trainId, trainSalePrice = train['Id'], train['SalePrice']
testId = test['Id']
train = train.drop(columns=['Id', 'SalePrice'])
test = test.drop(columns='Id')

# Remember the remaining feature names so the frames can be rebuilt later

labels = train.columns.tolist()

In [21]:
# Fit the scaler on train only, apply it to both splits, and wrap the
# transformed arrays back into DataFrames using the saved column labels
scaler = StandardScaler().fit(train)

train = pd.DataFrame(scaler.transform(train), columns=labels)
test = pd.DataFrame(scaler.transform(test), columns=labels)

In [22]:
# Fit PCA on train; a fractional n_components asks for as many components
# as are needed to reach that share of explained variance
pca = PCA(n_components=0.95)
pca.fit(train)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [23]:
# create pd dataframe from pca transformation

# Read the component count from the fitted PCA instead of hard-coding it.
# pca was built as PCA(.95), so the count is decided during fit; a fixed
# number stops matching the transformed width (and the DataFrame
# constructor raises) as soon as the input data changes.
n_components = pca.n_components_
pca_labels = ['PCA%i' % i for i in range(n_components)]

train = pd.DataFrame(pca.transform(train), columns=pca_labels)
test = pd.DataFrame(pca.transform(test), columns=pca_labels)

In [24]:
# Re-attach the identifier (and, for train, the target) that were set aside
# before scaling; the new columns are appended at the end of each frame

train = train.assign(Id=trainId, SalePrice=trainSalePrice)
test = test.assign(Id=testId)

In [25]:
# Show the first five rows of train
train.head(n=5)

Unnamed: 0,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,...,PCA164,PCA165,PCA166,PCA167,PCA168,PCA169,PCA170,PCA171,Id,SalePrice
0,4.345109,1.619386,-0.739617,-2.080179,-0.985088,1.999117,-1.23187,-0.131782,1.31647,-1.336446,...,0.160733,0.071333,0.155468,0.172801,-0.169568,-0.144326,0.391713,-0.013357,1,208500
1,0.019142,-3.106959,0.168223,-0.553341,0.940712,0.200719,-0.468954,0.235082,-0.838022,-1.273833,...,-1.063234,-0.334556,0.361166,-1.218397,-0.346191,-0.962753,-0.138863,1.083103,2,181500
2,4.851149,1.242811,-0.351815,-1.484957,-0.7582,2.181179,-1.843949,0.296194,1.299142,-1.391358,...,0.088334,0.238624,0.32728,0.325285,-0.7049,-0.036388,-0.540516,0.021711,3,223500
3,-1.771641,0.0395,-1.358623,1.92076,-2.550817,0.209519,-0.756387,0.700109,-1.408543,0.025023,...,-0.172186,-0.518922,0.231498,-0.074296,-0.034287,-0.877735,0.028065,-0.321009,4,140000
4,6.463747,1.064473,0.209472,0.448906,-1.555301,3.215822,-0.946356,-0.805204,2.112526,-1.821083,...,-0.270189,0.375297,-0.396732,-0.109084,0.317305,-0.145975,-0.674692,-0.378458,5,250000


In [26]:
# Show the first five rows of test
test.head(n=5)

Unnamed: 0,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,...,PCA163,PCA164,PCA165,PCA166,PCA167,PCA168,PCA169,PCA170,PCA171,Id
0,-3.208086,-2.987338,-0.327066,-1.609206,0.016879,-1.514939,-0.417889,-0.988173,-0.653363,0.647642,...,-0.027364,0.653222,-0.201973,-0.769946,-0.344834,0.514257,1.114106,0.337765,-0.639617,1461
1,-1.403753,-4.261851,0.107527,0.935981,0.165777,-0.299485,-0.524918,-2.332121,0.031044,-0.820514,...,3.856117,0.787996,0.215221,0.458275,1.135109,0.378972,0.953559,-1.00824,4.445435,1462
2,2.257002,0.427951,-0.610464,-1.301125,-1.058327,2.674177,-1.500824,-0.223999,0.40344,-0.198229,...,-0.117138,-0.378473,-0.031613,0.090593,-0.173914,-0.150098,-0.006612,0.19078,-0.152486,1463
3,3.253618,0.537318,-0.796079,-0.851716,-1.209643,2.388795,-1.340676,-0.876322,0.421183,-0.692292,...,-0.441586,0.020066,-0.151709,0.444826,0.008218,-0.161705,-0.453482,0.472352,0.046141,1464
4,2.876409,-0.075909,-0.154959,-2.46987,1.40782,0.487532,0.07219,2.414446,1.667224,-0.621508,...,0.269062,0.651172,-0.050461,-0.526448,-0.843701,0.57477,-0.227828,1.071423,1.362638,1465


In [27]:
# Write to csv
# Paths are built with os.path.join (as for the input files) so the output
# lands inside the data directory on every OS; a literal backslash is only a
# path separator on Windows.
test.to_csv(os.path.join('data', 'clean_test.csv'))
train.to_csv(os.path.join('data', 'clean_train.csv'))