{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import os\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.decomposition import PCA" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilitiesLotConfig...ScreenPorchPoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleCondition
Id
146120RH80.011622PaveNaNRegLvlAllPubInside...1200NaNMnPrvNaN062010WDNormal
146220RL81.014267PaveNaNIR1LvlAllPubCorner...00NaNNaNGar21250062010WDNormal
146360RL74.013830PaveNaNIR1LvlAllPubInside...00NaNMnPrvNaN032010WDNormal
146460RL78.09978PaveNaNIR1LvlAllPubInside...00NaNNaNNaN062010WDNormal
1465120RL43.05005PaveNaNIR1HLSAllPubInside...1440NaNNaNNaN012010WDNormal
..................................................................
2915160RM21.01936PaveNaNRegLvlAllPubInside...00NaNNaNNaN062006WDNormal
2916160RM21.01894PaveNaNRegLvlAllPubInside...00NaNNaNNaN042006WDAbnorml
291720RL160.020000PaveNaNRegLvlAllPubInside...00NaNNaNNaN092006WDAbnorml
291885RL62.010441PaveNaNRegLvlAllPubInside...00NaNMnPrvShed70072006WDNormal
291960RL74.09627PaveNaNRegLvlAllPubInside...00NaNNaNNaN0112006WDNormal
\n", "

1459 rows × 79 columns

\n", "
" ], "text/plain": [ " MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", "Id \n", "1461 20 RH 80.0 11622 Pave NaN Reg \n", "1462 20 RL 81.0 14267 Pave NaN IR1 \n", "1463 60 RL 74.0 13830 Pave NaN IR1 \n", "1464 60 RL 78.0 9978 Pave NaN IR1 \n", "1465 120 RL 43.0 5005 Pave NaN IR1 \n", "... ... ... ... ... ... ... ... \n", "2915 160 RM 21.0 1936 Pave NaN Reg \n", "2916 160 RM 21.0 1894 Pave NaN Reg \n", "2917 20 RL 160.0 20000 Pave NaN Reg \n", "2918 85 RL 62.0 10441 Pave NaN Reg \n", "2919 60 RL 74.0 9627 Pave NaN Reg \n", "\n", " LandContour Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence \\\n", "Id ... \n", "1461 Lvl AllPub Inside ... 120 0 NaN MnPrv \n", "1462 Lvl AllPub Corner ... 0 0 NaN NaN \n", "1463 Lvl AllPub Inside ... 0 0 NaN MnPrv \n", "1464 Lvl AllPub Inside ... 0 0 NaN NaN \n", "1465 HLS AllPub Inside ... 144 0 NaN NaN \n", "... ... ... ... ... ... ... ... ... \n", "2915 Lvl AllPub Inside ... 0 0 NaN NaN \n", "2916 Lvl AllPub Inside ... 0 0 NaN NaN \n", "2917 Lvl AllPub Inside ... 0 0 NaN NaN \n", "2918 Lvl AllPub Inside ... 0 0 NaN MnPrv \n", "2919 Lvl AllPub Inside ... 0 0 NaN NaN \n", "\n", " MiscFeature MiscVal MoSold YrSold SaleType SaleCondition \n", "Id \n", "1461 NaN 0 6 2010 WD Normal \n", "1462 Gar2 12500 6 2010 WD Normal \n", "1463 NaN 0 3 2010 WD Normal \n", "1464 NaN 0 6 2010 WD Normal \n", "1465 NaN 0 1 2010 WD Normal \n", "... ... ... ... ... ... ... \n", "2915 NaN 0 6 2006 WD Normal \n", "2916 NaN 0 4 2006 WD Abnorml \n", "2917 NaN 0 9 2006 WD Abnorml \n", "2918 Shed 700 7 2006 WD Normal \n", "2919 NaN 0 11 2006 WD Normal \n", "\n", "[1459 rows x 79 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train = pd.read_csv(os.path.join('data', 'train.csv'))\n", "train.set_index('Id')\n", "test = pd.read_csv(os.path.join('data', 'test.csv'))\n", "test.set_index('Id')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", "0 1 60 RL 65.0 8450 Pave NaN Reg \n", "1 2 20 RL 80.0 9600 Pave NaN Reg \n", "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", "\n", " LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold \\\n", "0 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n", "1 Lvl AllPub ... 0 NaN NaN NaN 0 5 \n", "2 Lvl AllPub ... 0 NaN NaN NaN 0 9 \n", "3 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n", "4 Lvl AllPub ... 0 NaN NaN NaN 0 12 \n", "\n", " YrSold SaleType SaleCondition SalePrice \n", "0 2008 WD Normal 208500 \n", "1 2007 WD Normal 181500 \n", "2 2008 WD Normal 223500 \n", "3 2006 WD Abnorml 140000 \n", "4 2008 WD Normal 250000 \n", "\n", "[5 rows x 81 columns]\n", "(1460, 81)\n", "(1459, 80)\n" ] } ], "source": [ "print(train.head())\n", "print(train.shape)\n", "print(test.shape)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1460, 19)\n", "LotFrontage 259\n", "Alley 1369\n", "MasVnrType 8\n", "MasVnrArea 8\n", "BsmtQual 37\n", "BsmtCond 37\n", "BsmtExposure 38\n", "BsmtFinType1 37\n", "BsmtFinType2 38\n", "Electrical 1\n", "FireplaceQu 690\n", "GarageType 81\n", "GarageYrBlt 81\n", "GarageFinish 81\n", "GarageQual 81\n", "GarageCond 81\n", "PoolQC 1453\n", "Fence 1179\n", "MiscFeature 1406\n", "dtype: int64\n" ] } ], "source": [ "# All features with null values and their amounts\n", "\n", "train_null = train.loc[:, train.isnull().any()]\n", "train_null.head()\n", "print(train_null.shape)\n", "print(train_null.isnull().sum())" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Drop features with too many null values\n", "\n", "train = train.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu'], axis=1)\n", "test = test.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu'], axis=1)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassMSZoningLotFrontageLotAreaStreetLotShapeLandContourUtilitiesLotConfig...EnclosedPorch3SsnPorchScreenPorchPoolAreaMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveRegLvlAllPubInside...0000022008WDNormal208500
1220RL80.09600PaveRegLvlAllPubFR2...0000052007WDNormal181500
2360RL68.011250PaveIR1LvlAllPubInside...0000092008WDNormal223500
3470RL60.09550PaveIR1LvlAllPubCorner...272000022006WDAbnorml140000
4560RL84.014260PaveIR1LvlAllPubFR2...00000122008WDNormal250000
\n", "

5 rows × 76 columns

\n", "
" ], "text/plain": [ " Id MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour \\\n", "0 1 60 RL 65.0 8450 Pave Reg Lvl \n", "1 2 20 RL 80.0 9600 Pave Reg Lvl \n", "2 3 60 RL 68.0 11250 Pave IR1 Lvl \n", "3 4 70 RL 60.0 9550 Pave IR1 Lvl \n", "4 5 60 RL 84.0 14260 Pave IR1 Lvl \n", "\n", " Utilities LotConfig ... EnclosedPorch 3SsnPorch ScreenPorch PoolArea \\\n", "0 AllPub Inside ... 0 0 0 0 \n", "1 AllPub FR2 ... 0 0 0 0 \n", "2 AllPub Inside ... 0 0 0 0 \n", "3 AllPub Corner ... 272 0 0 0 \n", "4 AllPub FR2 ... 0 0 0 0 \n", "\n", " MiscVal MoSold YrSold SaleType SaleCondition SalePrice \n", "0 0 2 2008 WD Normal 208500 \n", "1 0 5 2007 WD Normal 181500 \n", "2 0 9 2008 WD Normal 223500 \n", "3 0 2 2006 WD Abnorml 140000 \n", "4 0 12 2008 WD Normal 250000 \n", "\n", "[5 rows x 76 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1460, 76)\n", "(1459, 75)\n" ] } ], "source": [ "print(train.shape)\n", "print(test.shape)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1460, 14)\n", "LotFrontage 259\n", "MasVnrType 8\n", "MasVnrArea 8\n", "BsmtQual 37\n", "BsmtCond 37\n", "BsmtExposure 38\n", "BsmtFinType1 37\n", "BsmtFinType2 38\n", "Electrical 1\n", "GarageType 81\n", "GarageYrBlt 81\n", "GarageFinish 81\n", "GarageQual 81\n", "GarageCond 81\n", "dtype: int64\n" ] } ], "source": [ "# All features with null values and their amounts\n", "\n", "train_null = train.loc[:, train.isnull().any()]\n", "train_null.head()\n", "print(train_null.shape)\n", "print(train_null.isnull().sum())" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Fill in object nans with 'NAN' and numerical nans with mean (mean imputation)\n", "# Note that we use the train mean for mean imputation on the test df as well\n", "\n", "data = [train, test]\n", "\n", "for df in data:\n", " for column in df:\n", " if df[column].isna().any():\n", " if df[column].dtype == object:\n", " df[column] = df[column].replace(np.nan, 'NAN')\n", " else:\n", " mean = train[column].mean()\n", " df[column] = df[column].replace(np.nan, mean)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MSZoningStreetLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1Condition2...ElectricalKitchenQualFunctionalGarageTypeGarageFinishGarageQualGarageCondPavedDriveSaleTypeSaleCondition
0RLPaveRegLvlAllPubInsideGtlCollgCrNormNorm...SBrkrGdTypAttchdRFnTATAYWDNormal
1RLPaveRegLvlAllPubFR2GtlVeenkerFeedrNorm...SBrkrTATypAttchdRFnTATAYWDNormal
2RLPaveIR1LvlAllPubInsideGtlCollgCrNormNorm...SBrkrGdTypAttchdRFnTATAYWDNormal
3RLPaveIR1LvlAllPubCornerGtlCrawforNormNorm...SBrkrGdTypDetchdUnfTATAYWDAbnorml
4RLPaveIR1LvlAllPubFR2GtlNoRidgeNormNorm...SBrkrGdTypAttchdRFnTATAYWDNormal
\n", "

5 rows × 38 columns

\n", "
" ], "text/plain": [ " MSZoning Street LotShape LandContour Utilities LotConfig LandSlope \\\n", "0 RL Pave Reg Lvl AllPub Inside Gtl \n", "1 RL Pave Reg Lvl AllPub FR2 Gtl \n", "2 RL Pave IR1 Lvl AllPub Inside Gtl \n", "3 RL Pave IR1 Lvl AllPub Corner Gtl \n", "4 RL Pave IR1 Lvl AllPub FR2 Gtl \n", "\n", " Neighborhood Condition1 Condition2 ... Electrical KitchenQual Functional \\\n", "0 CollgCr Norm Norm ... SBrkr Gd Typ \n", "1 Veenker Feedr Norm ... SBrkr TA Typ \n", "2 CollgCr Norm Norm ... SBrkr Gd Typ \n", "3 Crawfor Norm Norm ... SBrkr Gd Typ \n", "4 NoRidge Norm Norm ... SBrkr Gd Typ \n", "\n", " GarageType GarageFinish GarageQual GarageCond PavedDrive SaleType \\\n", "0 Attchd RFn TA TA Y WD \n", "1 Attchd RFn TA TA Y WD \n", "2 Attchd RFn TA TA Y WD \n", "3 Detchd Unf TA TA Y WD \n", "4 Attchd RFn TA TA Y WD \n", "\n", " SaleCondition \n", "0 Normal \n", "1 Normal \n", "2 Normal \n", "3 Abnorml \n", "4 Normal \n", "\n", "[5 rows x 38 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# All object type features to convert to numerical\n", "\n", "train_obj = train.select_dtypes(include=['object']).copy()\n", "test_obj = test.select_dtypes(include=['object']).copy()\n", "\n", "train_obj.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAddMasVnrAreaBsmtFinSF1...SaleType_ConLwSaleType_NewSaleType_OthSaleType_WDSaleCondition_AbnormlSaleCondition_AdjLandSaleCondition_AllocaSaleCondition_FamilySaleCondition_NormalSaleCondition_Partial
016065.084507520032003196.0706...0001000010
122080.0960068197619760.0978...0001000010
236068.0112507520012002162.0486...0001000010
347060.0955075191519700.0216...0001100000
456084.0142608520002000350.0655...0001000010
\n", "

5 rows × 283 columns

\n", "
" ], "text/plain": [ " Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt \\\n", "0 1 60 65.0 8450 7 5 2003 \n", "1 2 20 80.0 9600 6 8 1976 \n", "2 3 60 68.0 11250 7 5 2001 \n", "3 4 70 60.0 9550 7 5 1915 \n", "4 5 60 84.0 14260 8 5 2000 \n", "\n", " YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw SaleType_New \\\n", "0 2003 196.0 706 ... 0 0 \n", "1 1976 0.0 978 ... 0 0 \n", "2 2002 162.0 486 ... 0 0 \n", "3 1970 0.0 216 ... 0 0 \n", "4 2000 350.0 655 ... 0 0 \n", "\n", " SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand \\\n", "0 0 1 0 0 \n", "1 0 1 0 0 \n", "2 0 1 0 0 \n", "3 0 1 1 0 \n", "4 0 1 0 0 \n", "\n", " SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal \\\n", "0 0 0 1 \n", "1 0 0 1 \n", "2 0 0 1 \n", "3 0 0 0 \n", "4 0 0 1 \n", "\n", " SaleCondition_Partial \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "\n", "[5 rows x 283 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(list(train_obj))\n", "\n", "one_hot_train = pd.get_dummies(train, columns=list(train_obj))\n", "one_hot_train.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# One hot encoding\n", "\n", "train = pd.get_dummies(train, columns=list(train_obj))\n", "test = pd.get_dummies(test, columns=list(test_obj))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1460, 283)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAddMasVnrAreaBsmtFinSF1...SaleType_ConLwSaleType_NewSaleType_OthSaleType_WDSaleCondition_AbnormlSaleCondition_AdjLandSaleCondition_AllocaSaleCondition_FamilySaleCondition_NormalSaleCondition_Partial
016065.084507520032003196.0706...0001000010
122080.0960068197619760.0978...0001000010
236068.0112507520012002162.0486...0001000010
347060.0955075191519700.0216...0001100000
456084.0142608520002000350.0655...0001000010
\n", "

5 rows × 283 columns

\n", "
" ], "text/plain": [ " Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt \\\n", "0 1 60 65.0 8450 7 5 2003 \n", "1 2 20 80.0 9600 6 8 1976 \n", "2 3 60 68.0 11250 7 5 2001 \n", "3 4 70 60.0 9550 7 5 1915 \n", "4 5 60 84.0 14260 8 5 2000 \n", "\n", " YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw SaleType_New \\\n", "0 2003 196.0 706 ... 0 0 \n", "1 1976 0.0 978 ... 0 0 \n", "2 2002 162.0 486 ... 0 0 \n", "3 1970 0.0 216 ... 0 0 \n", "4 2000 350.0 655 ... 0 0 \n", "\n", " SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand \\\n", "0 0 1 0 0 \n", "1 0 1 0 0 \n", "2 0 1 0 0 \n", "3 0 1 1 0 \n", "4 0 1 0 0 \n", "\n", " SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal \\\n", "0 0 0 1 \n", "1 0 0 1 \n", "2 0 0 1 \n", "3 0 0 0 \n", "4 0 0 1 \n", "\n", " SaleCondition_Partial \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "\n", "[5 rows x 283 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(train.shape)\n", "train.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1459, 272)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAddMasVnrAreaBsmtFinSF1...SaleType_NANSaleType_NewSaleType_OthSaleType_WDSaleCondition_AbnormlSaleCondition_AdjLandSaleCondition_AllocaSaleCondition_FamilySaleCondition_NormalSaleCondition_Partial
014612080.01162256196119610.0468.0...0001000010
114622081.0142676619581958108.0923.0...0001000010
214636074.01383055199719980.0791.0...0001000010
314646078.09978661998199820.0602.0...0001000010
4146512043.0500585199219920.0263.0...0001000010
\n", "

5 rows × 272 columns

\n", "
" ], "text/plain": [ " Id MSSubClass LotFrontage LotArea OverallQual OverallCond \\\n", "0 1461 20 80.0 11622 5 6 \n", "1 1462 20 81.0 14267 6 6 \n", "2 1463 60 74.0 13830 5 5 \n", "3 1464 60 78.0 9978 6 6 \n", "4 1465 120 43.0 5005 8 5 \n", "\n", " YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_NAN \\\n", "0 1961 1961 0.0 468.0 ... 0 \n", "1 1958 1958 108.0 923.0 ... 0 \n", "2 1997 1998 0.0 791.0 ... 0 \n", "3 1998 1998 20.0 602.0 ... 0 \n", "4 1992 1992 0.0 263.0 ... 0 \n", "\n", " SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml \\\n", "0 0 0 1 0 \n", "1 0 0 1 0 \n", "2 0 0 1 0 \n", "3 0 0 1 0 \n", "4 0 0 1 0 \n", "\n", " SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " SaleCondition_Normal SaleCondition_Partial \n", "0 1 0 \n", "1 1 0 \n", "2 1 0 \n", "3 1 0 \n", "4 1 0 \n", "\n", "[5 rows x 272 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(test.shape)\n", "test.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'Exterior1st_Stone', 'Condition2_RRNn', 'Condition2_RRAn', 'Electrical_NAN', 'HouseStyle_2.5Fin', 'Exterior1st_ImStucc', 'Electrical_Mix', 'RoofMatl_Roll', 'SalePrice', 'RoofMatl_Membran', 'Heating_Floor', 'RoofMatl_Metal', 'Condition2_RRAe', 'RoofMatl_ClyTile', 'GarageQual_Ex', 'Heating_OthW', 'Exterior2nd_Other', 'Utilities_NoSeWa'}\n" ] } ], "source": [ "# Check for missing columns\n", "\n", "missing_cols = set( train.columns ) - set( test.columns )\n", "print(missing_cols)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Note that we are missing some columns. This is because in the one-hot encoding process, the test set did not have any\n", "# samples with a certain categorical output. For example, in RoofMatl, there were no samples in test where the value was\n", "# ClyTile, which means no column was create to one hot encode it. As such, we will create zero-valued columns to fill\n", "\n", "for col in missing_cols:\n", " test[col] = 0\n", "\n", "# Maintain same ordering for the df\n", "test = test[train.columns]\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Drop zeroed out SalePrice column from test set\n", "\n", "test = test.drop('SalePrice', axis=1)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAddMasVnrAreaBsmtFinSF1...SaleType_ConLwSaleType_NewSaleType_OthSaleType_WDSaleCondition_AbnormlSaleCondition_AdjLandSaleCondition_AllocaSaleCondition_FamilySaleCondition_NormalSaleCondition_Partial
016065.084507520032003196.0706...0001000010
122080.0960068197619760.0978...0001000010
236068.0112507520012002162.0486...0001000010
347060.0955075191519700.0216...0001100000
456084.0142608520002000350.0655...0001000010
\n", "

5 rows × 283 columns

\n", "
" ], "text/plain": [ " Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt \\\n", "0 1 60 65.0 8450 7 5 2003 \n", "1 2 20 80.0 9600 6 8 1976 \n", "2 3 60 68.0 11250 7 5 2001 \n", "3 4 70 60.0 9550 7 5 1915 \n", "4 5 60 84.0 14260 8 5 2000 \n", "\n", " YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw SaleType_New \\\n", "0 2003 196.0 706 ... 0 0 \n", "1 1976 0.0 978 ... 0 0 \n", "2 2002 162.0 486 ... 0 0 \n", "3 1970 0.0 216 ... 0 0 \n", "4 2000 350.0 655 ... 0 0 \n", "\n", " SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand \\\n", "0 0 1 0 0 \n", "1 0 1 0 0 \n", "2 0 1 0 0 \n", "3 0 1 1 0 \n", "4 0 1 0 0 \n", "\n", " SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal \\\n", "0 0 0 1 \n", "1 0 0 1 \n", "2 0 0 1 \n", "3 0 0 0 \n", "4 0 0 1 \n", "\n", " SaleCondition_Partial \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "\n", "[5 rows x 283 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAddMasVnrAreaBsmtFinSF1...SaleType_ConLwSaleType_NewSaleType_OthSaleType_WDSaleCondition_AbnormlSaleCondition_AdjLandSaleCondition_AllocaSaleCondition_FamilySaleCondition_NormalSaleCondition_Partial
014612080.01162256196119610.0468.0...0001000010
114622081.0142676619581958108.0923.0...0001000010
214636074.01383055199719980.0791.0...0001000010
314646078.09978661998199820.0602.0...0001000010
4146512043.0500585199219920.0263.0...0001000010
\n", "

5 rows × 282 columns

\n", "
" ], "text/plain": [ " Id MSSubClass LotFrontage LotArea OverallQual OverallCond \\\n", "0 1461 20 80.0 11622 5 6 \n", "1 1462 20 81.0 14267 6 6 \n", "2 1463 60 74.0 13830 5 5 \n", "3 1464 60 78.0 9978 6 6 \n", "4 1465 120 43.0 5005 8 5 \n", "\n", " YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw \\\n", "0 1961 1961 0.0 468.0 ... 0 \n", "1 1958 1958 108.0 923.0 ... 0 \n", "2 1997 1998 0.0 791.0 ... 0 \n", "3 1998 1998 20.0 602.0 ... 0 \n", "4 1992 1992 0.0 263.0 ... 0 \n", "\n", " SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml \\\n", "0 0 0 1 0 \n", "1 0 0 1 0 \n", "2 0 0 1 0 \n", "3 0 0 1 0 \n", "4 0 0 1 0 \n", "\n", " SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " SaleCondition_Normal SaleCondition_Partial \n", "0 1 0 \n", "1 1 0 \n", "2 1 0 \n", "3 1 0 \n", "4 1 0 \n", "\n", "[5 rows x 282 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test.head()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Store columns we don't want to scale\n", "\n", "testId = test['Id']\n", "trainSalePrice = train['SalePrice']\n", "trainId = train['Id']\n", "test = test.drop('Id', axis=1)\n", "train = train.drop(['Id', 'SalePrice'], axis=1)\n", "\n", "# Store labels to remake our dataframes\n", "\n", "labels = list(train)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# Scale the data and reformat as pandas dataframe\n", "scaler = StandardScaler()\n", "\n", "scaler.fit(train)\n", "\n", "train = pd.DataFrame(scaler.transform(train), columns = labels)\n", "test = pd.DataFrame(scaler.transform(test), columns = labels)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,\n", " svd_solver='auto', tol=0.0, whiten=False)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pca = PCA(.95)\n", "pca.fit(train)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# create pd dataframe from pca transformation\n", "\n", "n_components = 172\n", "train = pd.DataFrame(pca.transform(train), columns=['PCA%i' % i for i in range(n_components)])\n", "test = pd.DataFrame(pca.transform(test), columns=['PCA%i' % i for i in range(n_components)])" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# Bring back columns we set aside\n", "\n", "test['Id'] = testId\n", "train['Id'] = trainId\n", "train['SalePrice'] = trainSalePrice" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PCA0PCA1PCA2PCA3PCA4PCA5PCA6PCA7PCA8PCA9...PCA164PCA165PCA166PCA167PCA168PCA169PCA170PCA171IdSalePrice
04.3451091.619386-0.739617-2.080179-0.9850881.999117-1.231870-0.1317821.316470-1.336446...0.1607330.0713330.1554680.172801-0.169568-0.1443260.391713-0.0133571208500
10.019142-3.1069590.168223-0.5533410.9407120.200719-0.4689540.235082-0.838022-1.273833...-1.063234-0.3345560.361166-1.218397-0.346191-0.962753-0.1388631.0831032181500
24.8511491.242811-0.351815-1.484957-0.7582002.181179-1.8439490.2961941.299142-1.391358...0.0883340.2386240.3272800.325285-0.704900-0.036388-0.5405160.0217113223500
3-1.7716410.039500-1.3586231.920760-2.5508170.209519-0.7563870.700109-1.4085430.025023...-0.172186-0.5189220.231498-0.074296-0.034287-0.8777350.028065-0.3210094140000
46.4637471.0644730.2094720.448906-1.5553013.215822-0.946356-0.8052042.112526-1.821083...-0.2701890.375297-0.396732-0.1090840.317305-0.145975-0.674692-0.3784585250000
\n", "

5 rows × 174 columns

\n", "
" ], "text/plain": [ " PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 PCA6 \\\n", "0 4.345109 1.619386 -0.739617 -2.080179 -0.985088 1.999117 -1.231870 \n", "1 0.019142 -3.106959 0.168223 -0.553341 0.940712 0.200719 -0.468954 \n", "2 4.851149 1.242811 -0.351815 -1.484957 -0.758200 2.181179 -1.843949 \n", "3 -1.771641 0.039500 -1.358623 1.920760 -2.550817 0.209519 -0.756387 \n", "4 6.463747 1.064473 0.209472 0.448906 -1.555301 3.215822 -0.946356 \n", "\n", " PCA7 PCA8 PCA9 ... PCA164 PCA165 PCA166 PCA167 \\\n", "0 -0.131782 1.316470 -1.336446 ... 0.160733 0.071333 0.155468 0.172801 \n", "1 0.235082 -0.838022 -1.273833 ... -1.063234 -0.334556 0.361166 -1.218397 \n", "2 0.296194 1.299142 -1.391358 ... 0.088334 0.238624 0.327280 0.325285 \n", "3 0.700109 -1.408543 0.025023 ... -0.172186 -0.518922 0.231498 -0.074296 \n", "4 -0.805204 2.112526 -1.821083 ... -0.270189 0.375297 -0.396732 -0.109084 \n", "\n", " PCA168 PCA169 PCA170 PCA171 Id SalePrice \n", "0 -0.169568 -0.144326 0.391713 -0.013357 1 208500 \n", "1 -0.346191 -0.962753 -0.138863 1.083103 2 181500 \n", "2 -0.704900 -0.036388 -0.540516 0.021711 3 223500 \n", "3 -0.034287 -0.877735 0.028065 -0.321009 4 140000 \n", "4 0.317305 -0.145975 -0.674692 -0.378458 5 250000 \n", "\n", "[5 rows x 174 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.head()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PCA0PCA1PCA2PCA3PCA4PCA5PCA6PCA7PCA8PCA9...PCA163PCA164PCA165PCA166PCA167PCA168PCA169PCA170PCA171Id
0-3.208086-2.987338-0.327066-1.6092060.016879-1.514939-0.417889-0.988173-0.6533630.647642...-0.0273640.653222-0.201973-0.769946-0.3448340.5142571.1141060.337765-0.6396171461
1-1.403753-4.2618510.1075270.9359810.165777-0.299485-0.524918-2.3321210.031044-0.820514...3.8561170.7879960.2152210.4582751.1351090.3789720.953559-1.0082404.4454351462
22.2570020.427951-0.610464-1.301125-1.0583272.674177-1.500824-0.2239990.403440-0.198229...-0.117138-0.378473-0.0316130.090593-0.173914-0.150098-0.0066120.190780-0.1524861463
33.2536180.537318-0.796079-0.851716-1.2096432.388795-1.340676-0.8763220.421183-0.692292...-0.4415860.020066-0.1517090.4448260.008218-0.161705-0.4534820.4723520.0461411464
42.876409-0.075909-0.154959-2.4698701.4078200.4875320.0721902.4144461.667224-0.621508...0.2690620.651172-0.050461-0.526448-0.8437010.574770-0.2278281.0714231.3626381465
\n", "

5 rows × 173 columns

\n", "
" ], "text/plain": [ " PCA0 PCA1 PCA2 PCA3 PCA4 PCA5 PCA6 \\\n", "0 -3.208086 -2.987338 -0.327066 -1.609206 0.016879 -1.514939 -0.417889 \n", "1 -1.403753 -4.261851 0.107527 0.935981 0.165777 -0.299485 -0.524918 \n", "2 2.257002 0.427951 -0.610464 -1.301125 -1.058327 2.674177 -1.500824 \n", "3 3.253618 0.537318 -0.796079 -0.851716 -1.209643 2.388795 -1.340676 \n", "4 2.876409 -0.075909 -0.154959 -2.469870 1.407820 0.487532 0.072190 \n", "\n", " PCA7 PCA8 PCA9 ... PCA163 PCA164 PCA165 PCA166 \\\n", "0 -0.988173 -0.653363 0.647642 ... -0.027364 0.653222 -0.201973 -0.769946 \n", "1 -2.332121 0.031044 -0.820514 ... 3.856117 0.787996 0.215221 0.458275 \n", "2 -0.223999 0.403440 -0.198229 ... -0.117138 -0.378473 -0.031613 0.090593 \n", "3 -0.876322 0.421183 -0.692292 ... -0.441586 0.020066 -0.151709 0.444826 \n", "4 2.414446 1.667224 -0.621508 ... 0.269062 0.651172 -0.050461 -0.526448 \n", "\n", " PCA167 PCA168 PCA169 PCA170 PCA171 Id \n", "0 -0.344834 0.514257 1.114106 0.337765 -0.639617 1461 \n", "1 1.135109 0.378972 0.953559 -1.008240 4.445435 1462 \n", "2 -0.173914 -0.150098 -0.006612 0.190780 -0.152486 1463 \n", "3 0.008218 -0.161705 -0.453482 0.472352 0.046141 1464 \n", "4 -0.843701 0.574770 -0.227828 1.071423 1.362638 1465 \n", "\n", "[5 rows x 173 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test.head()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# Write to csv\n", "test.to_csv(r'data\\clean_test.csv')\n", "train.to_csv(r'data\\clean_train.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }