# --- Configuration -----------------------------------------------------------
# Remote location of the housing archive and the local directory it is
# downloaded into and unpacked in.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    The archive is saved as ``housing.tgz`` inside ``housing_path`` and then
    extracted into that same directory.

    Parameters
    ----------
    housing_url : str
        URL of the gzip-compressed tar archive to download.
    housing_path : str
        Directory that receives both the archive and its extracted contents.
        It is created (including parents) when it does not exist yet.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If the archive cannot be retrieved from ``housing_url``.
    tarfile.TarError
        If the downloaded file is not a readable tar archive, or (when
        extraction filters are available) a member is rejected as unsafe.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule; without
    # this import the call below only works when some other package happened
    # to import ``urllib.request`` first.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # escape ``housing_path`` (absolute paths, "..", unsafe links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older interpreters have no extraction filters; fall back to the
            # unfiltered behaviour.
            housing_tgz.extractall(path=housing_path)


# Inside a notebook kernel ``__name__`` is "__main__", so Run All still performs
# the download; the guard only prevents network access when these helpers are
# imported from another module.
if __name__ == "__main__":
    fetch_housing_data()


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory containing ``housing.csv`` (the directory that
        ``fetch_housing_data`` extracts into).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``housing.csv``.
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
| \n", " | longitude | \n", "latitude | \n", "housing_median_age | \n", "total_rooms | \n", "total_bedrooms | \n", "population | \n", "households | \n", "median_income | \n", "median_house_value | \n", "ocean_proximity | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "-122.23 | \n", "37.88 | \n", "41.0 | \n", "880.0 | \n", "129.0 | \n", "322.0 | \n", "126.0 | \n", "8.3252 | \n", "452600.0 | \n", "NEAR BAY | \n", "
| 1 | \n", "-122.22 | \n", "37.86 | \n", "21.0 | \n", "7099.0 | \n", "1106.0 | \n", "2401.0 | \n", "1138.0 | \n", "8.3014 | \n", "358500.0 | \n", "NEAR BAY | \n", "
| 2 | \n", "-122.24 | \n", "37.85 | \n", "52.0 | \n", "1467.0 | \n", "190.0 | \n", "496.0 | \n", "177.0 | \n", "7.2574 | \n", "352100.0 | \n", "NEAR BAY | \n", "
| 3 | \n", "-122.25 | \n", "37.85 | \n", "52.0 | \n", "1274.0 | \n", "235.0 | \n", "558.0 | \n", "219.0 | \n", "5.6431 | \n", "341300.0 | \n", "NEAR BAY | \n", "
| 4 | \n", "-122.25 | \n", "37.85 | \n", "52.0 | \n", "1627.0 | \n", "280.0 | \n", "565.0 | \n", "259.0 | \n", "3.8462 | \n", "342200.0 | \n", "NEAR BAY | \n", "