Fetch the data:

```python
import os
```
Load the data using Pandas:

```python
import pandas as pd

housing = load_housing_data()
housing.head()
```
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
The info() method is useful to get a quick description of the data, in particular the total number of rows, and each attribute's type and number of non-null values:

```python
housing.info()
```
```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
```
```python
housing["ocean_proximity"].value_counts()
```

```
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64
```
The describe() method shows a summary of the numerical attributes:

```python
housing.describe()
```
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value |
|---|---|---|---|---|---|---|---|---|---|
| count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20433.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
| mean | -119.569704 | 35.631861 | 28.639486 | 2635.763081 | 537.870553 | 1425.476744 | 499.539680 | 3.870671 | 206855.816909 |
| std | 2.003532 | 2.135952 | 12.585558 | 2181.615252 | 421.385070 | 1132.462122 | 382.329753 | 1.899822 | 115395.615874 |
| min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
| 25% | -121.800000 | 33.930000 | 18.000000 | 1447.750000 | 296.000000 | 787.000000 | 280.000000 | 2.563400 | 119600.000000 |
| 50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534800 | 179700.000000 |
| 75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743250 | 264725.000000 |
| max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
```python
%matplotlib inline
```

Create a Test Set
The simplest approach is to pick some instances at random, typically 20% of the dataset, and set them aside:

```python
import numpy as np
```

```
16512 train + 4128 test
```
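The truncated cell above presumably defined a naive random splitter along these lines (a sketch; the function name `split_train_test` and the demo frame are assumptions, not the notebook's exact code):

```python
import numpy as np
import pandas as pd

def split_train_test(data, test_ratio):
    # shuffle the row positions, then slice off the first test_ratio fraction
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# demo on a small frame; on the real dataset this yields 16512 train + 4128 test
demo = pd.DataFrame({"x": range(100)})
train_set, test_set = split_train_test(demo, 0.2)
```

Re-running this produces a different split each time, which is why hashing instance identifiers (described next) is preferable.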
A common solution is to use each instance's identifier to decide whether or not it should go in the test set. For example, you could compute a hash of each instance's identifier, keep only the last byte of the hash, and put the instance in the test set if this value is lower than or equal to 51 (about 20% of 256).
```python
import hashlib

# housing_with_id = housing.reset_index()  # adds an 'index' column
```
```python
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)  # cap at category 5
```
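A tiny worked example of the binning (the cap at category 5 matches the 1.0–5.0 categories in the proportions below; the sample incomes are made up):

```python
import numpy as np

# median_income is expressed in tens of thousands of dollars
incomes = np.array([0.5, 1.4, 2.9, 3.1, 5.0, 15.0])
income_cat = np.ceil(incomes / 1.5)          # bin width of 1.5
income_cat = np.minimum(income_cat, 5.0)     # merge everything above category 5 into it
```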
Now perform stratified sampling based on the income category:

```python
from sklearn.model_selection import StratifiedShuffleSplit
```

Income category proportions in the full dataset:

```
3.0    0.350581
2.0    0.318847
4.0    0.176308
5.0    0.114438
1.0    0.039826
Name: income_cat, dtype: float64
```
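The truncated StratifiedShuffleSplit cell presumably looked roughly like this; here is a self-contained sketch on synthetic data (the frame and its proportions are made up):

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# synthetic stand-in: a frame with a skewed category column
rng = np.random.RandomState(42)
df = pd.DataFrame({"income_cat": rng.choice([1, 2, 3], size=1000, p=[0.1, 0.3, 0.6]),
                   "value": rng.randn(1000)})

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df["income_cat"]):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]
```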
The test set generated using stratified sampling has income category proportions almost identical to those in the full dataset, whereas the test set generated using purely random sampling is quite skewed.
```python
print(strat_test_set["income_cat"].value_counts() / len(strat_test_set))
```

Income category proportions in the stratified test set:

```
3.0    0.350533
2.0    0.318798
4.0    0.176357
5.0    0.114583
1.0    0.039729
Name: income_cat, dtype: float64
```

And in a purely random test set:

```
3.0    0.358527
2.0    0.324370
4.0    0.167393
5.0    0.109496
1.0    0.040213
Name: income_cat, dtype: float64
```
```python
for set_ in (strat_train_set, strat_test_set):  # don't shadow the built-in `set`
    set_.drop("income_cat", axis=1, inplace=True)  # remove income_cat to restore the data
```
Discover and Visualize the Data to Gain Insights
Let’s create a copy so you can play with it without harming the training set:
```python
housing = strat_train_set.copy()
```
Visualizing Geographical Data
```python
housing.plot(kind="scatter", x="longitude", y="latitude")
```

(scatter plot of district locations)
```python
# alpha=0.1 makes it much easier to see the places with a high density of data points
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
```

(the same scatter plot with alpha=0.1, highlighting high-density areas)
The radius of each circle represents the district's population (option s), and the color represents the price (option c). We will use a predefined color map (option cmap) called jet, which ranges from blue (low prices) to red (high prices):

```python
import matplotlib.pyplot as plt

housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["population"]/100, label="population",
             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True)
plt.legend()
```

(scatter plot of housing prices: circle size = population, color = median house value)
```python
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
```

```
median_house_value    1.000000
median_income         0.687160
total_rooms           0.135097
housing_median_age    0.114110
households            0.064506
total_bedrooms        0.047689
population           -0.026920
longitude            -0.047432
latitude             -0.142724
Name: median_house_value, dtype: float64
```
Another way to check for correlation between attributes is to use Pandas' scatter_matrix function, which plots every numerical attribute against every other numerical attribute.

```python
from pandas.plotting import scatter_matrix  # pandas.tools.plotting is deprecated
```
(4×4 scatter matrix of the most promising numerical attributes)
The most promising attribute for predicting the median house value is the median income:

```python
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
```

(scatter plot of median_house_value vs. median_income)
The plot reveals horizontal lines near $500,000, $450,000, and $350,000; to keep these quirks from misleading the algorithm, the affected districts should be removed.
Experimenting with Attribute Combinations
```python
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
```

```
median_house_value          1.000000
median_income               0.687160
rooms_per_household         0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64
```
Prepare the Data for Machine Learning Algorithms
First, separate the predictors and the labels:

```python
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
```
Data Cleaning
The total_bedrooms attribute has some missing values. You have three options:
- Get rid of the corresponding districts.
- Get rid of the whole attribute.
- Set the missing values to some value (zero, the mean, the median, etc.).
```python
# housing.dropna(subset=["total_bedrooms"])  # option 1
```

The medians learned by the imputer (`imputer.statistics_`):

```
array([ -118.51  ,    34.26  ,    29.    ,  2119.5   ,   433.    ,
        1164.    ,   408.    ,     3.5409])
```
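A sketch of the imputation that produces such medians. The original notebook used sklearn.preprocessing.Imputer, which later Scikit-Learn versions renamed to SimpleImputer (the demo frame is made up):

```python
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

df = pd.DataFrame({"total_rooms": [2.0, 4.0, np.nan, 6.0],
                   "households": [1.0, 2.0, 3.0, np.nan]})

imputer = SimpleImputer(strategy="median")
imputer.fit(df)
medians = imputer.statistics_   # per-column medians, like the array above
filled = imputer.transform(df)  # NaNs replaced by the column median
```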
```python
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
```
Handling Text and Categorical Attributes
ocean_proximity is a text attribute; let's convert these text labels to numbers:

```python
from sklearn.preprocessing import LabelEncoder
```

```
array([0, 0, 4, ..., 1, 0, 3])
```
```python
print(encoder.classes_)
```

```
['<1H OCEAN' 'INLAND' 'ISLAND' 'NEAR BAY' 'NEAR OCEAN']
```
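A self-contained sketch of LabelEncoder on a few made-up labels (classes_ holds the categories in sorted order, and each encoded value is an index into it):

```python
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
labels = ["NEAR BAY", "INLAND", "NEAR BAY", "<1H OCEAN"]
encoded = encoder.fit_transform(labels)     # integer codes
decoded = encoder.inverse_transform(encoded)  # back to the original strings
```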
Here 0 maps to '<1H OCEAN' and 4 maps to 'NEAR OCEAN': two similar categories end up numerically far apart, which is clearly not what we want.
one-hot encoding
To fix this issue, a common solution is to create one binary attribute per category: one attribute equal to 1 when the category is “<1H OCEAN” (and 0 otherwise), another attribute equal to 1 when the category is “INLAND” (and 0 otherwise), and so on. This is called one-hot encoding, because only one attribute will be equal to 1 (hot), while the others will be 0 (cold).
```python
from sklearn.preprocessing import OneHotEncoder
```

```
<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>
```
```python
# convert it to a dense NumPy array
housing_cat_1hot.toarray()
```

```
array([[ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.],
       ...,
       [ 0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.]])
```
```python
# LabelBinarizer performs both steps above (text categories -> integer -> one-hot) at once
from sklearn.preprocessing import LabelBinarizer
```

```
array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ...,
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])
```
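A self-contained sketch of LabelBinarizer doing both steps at once (the sample labels are made up; pass sparse_output=True to get a sparse matrix instead):

```python
from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()
# classes are sorted: '<1H OCEAN', 'INLAND', 'NEAR BAY'
onehot = encoder.fit_transform(["NEAR BAY", "INLAND", "<1H OCEAN", "INLAND"])
```

Note that newer Scikit-Learn versions let OneHotEncoder handle string categories directly, which is the preferred route for feature columns.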
Custom Transformers
```python
from sklearn.base import BaseEstimator, TransformerMixin
```
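A sketch of such a custom transformer, adding the combined attributes from earlier (the column indices assume the numeric columns keep their original order; the class name is an assumption):

```python
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

# indices of the columns in the numeric array (assumed order)
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn

    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

# demo row with the first district's rooms/bedrooms/population/households
X = np.array([[0, 0, 0, 880.0, 129.0, 322.0, 126.0, 0]])
X_extra = CombinedAttributesAdder().transform(X)
```

Inheriting from BaseEstimator and TransformerMixin gives the class get_params/set_params and fit_transform for free, so it plugs straight into a Pipeline.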
Feature Scaling
- Min-max scaling: shift and rescale values so they end up in the 0–1 range.
- Standardization: subtract the mean, then divide by the standard deviation.
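Both scalers in a tiny worked example (the data is made up):

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

X = np.array([[1.0], [2.0], [3.0], [4.0]])

minmax = MinMaxScaler().fit_transform(X)      # (x - min) / (max - min), range [0, 1]
standard = StandardScaler().fit_transform(X)  # (x - mean) / std, mean 0 and std 1
```

Standardization does not bound values to a fixed range, but it is much less affected by outliers than min-max scaling.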
Transformation Pipelines
A small pipeline for the numerical attributes:

```python
from sklearn.pipeline import Pipeline
```
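A minimal sketch of such a pipeline, using the modern SimpleImputer name (the notebook's version also inserts the custom attribute adder between the two steps):

```python
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),  # fill NaNs with the median
    ("std_scaler", StandardScaler()),               # then standardize
])

X = np.array([[1.0], [np.nan], [3.0]])
X_tr = num_pipeline.fit_transform(X)
```

fit_transform runs the steps in order, feeding each step's output to the next; only the last step may be a predictor.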
Bug fix: combine the numerical and categorical pipelines with FeatureUnion (newer Scikit-Learn versions changed LabelBinarizer's fit_transform signature so it no longer works directly inside a pipeline, so the categorical branch needs a small wrapper):

```python
from sklearn.pipeline import FeatureUnion
```
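FeatureUnion runs several transformers on the same input and concatenates their outputs side by side; a toy sketch (the identity branch is only for illustration):

```python
import numpy as np
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer

union = FeatureUnion(transformer_list=[
    ("scaled", StandardScaler()),                  # first output column: standardized
    ("raw", FunctionTransformer(lambda X: X)),     # second output column: unchanged
])

X = np.array([[1.0], [2.0], [3.0]])
X_union = union.fit_transform(X)  # both branches' columns, side by side
```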
The fully prepared training data:

```
array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])
```
```python
housing_prepared.shape
```

```
(16512, 16)
```
Select and Train a Model
Training and Evaluating on the Training Set
Train a Linear Regression model:

```python
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
```

```
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
```
```python
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
```

```
Predictions: [ 210644.60459286  317768.80697211  210956.43331178   59218.98886849
  189747.55849878]
Labels: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]
```
```python
from sklearn.metrics import mean_squared_error
```

The linear model's RMSE over the whole training set:

```
68628.198198489219
```
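The RMSE computation itself, as a tiny worked example (the labels and predictions are made up):

```python
import numpy as np
from sklearn.metrics import mean_squared_error

labels = np.array([3.0, -0.5, 2.0, 7.0])
predictions = np.array([2.5, 0.0, 2.0, 8.0])

mse = mean_squared_error(labels, predictions)  # mean of squared errors
rmse = np.sqrt(mse)                            # back in the units of the target
```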
```python
from sklearn.tree import DecisionTreeRegressor
```

```
0.0
```

A training RMSE of 0.0 suggests the model has badly overfit the data. We don't want to touch the test set yet, so we need cross-validation instead.
Better Evaluation Using Cross-Validation
Use Scikit-Learn's cross-validation feature. It performs K-fold cross-validation: it randomly splits the training set into 10 distinct subsets called folds, then trains and evaluates the Decision Tree model 10 times, picking a different fold for evaluation each time and training on the other 9 folds.
Decision Tree
```python
# the result is an array containing the 10 evaluation scores
```

```python
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
```

```
Scores: [ 68598.67166286  65608.52457119  70810.0869789   69211.66220381
  72014.87961395  75011.86954035  71534.25262773  70335.79903809
  76849.90225183  70211.15023877]
Mean: 71018.6798727
Standard deviation: 3012.64703648
```
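The cross-validation call can be sketched on synthetic data. Scikit-Learn's scoring functions are utilities (greater is better), hence neg_mean_squared_error and the sign flip before the square root:

```python
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# synthetic linear data with a little noise
rng = np.random.RandomState(42)
X = rng.randn(200, 3)
y = X @ np.array([1.0, 2.0, 3.0]) + rng.randn(200) * 0.1

scores = cross_val_score(LinearRegression(), X, y,
                         scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)  # flip the sign, then take the square root
```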
Linear Regression
```python
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
```

```
Scores: [ 66782.73843989  66960.118071    70361.18285107  74742.02956966
  68022.09224176  71193.07033936  64969.63056405  68276.69992785
  71543.69797334  67665.10082067]
Mean: 69051.6360799
Standard deviation: 2732.39242562
```
RandomForestRegressor
```python
from sklearn.ensemble import RandomForestRegressor
```

```
Scores: [ 50896.82385485  49692.14156589  52932.15059225  54728.28076321
  52398.83293126  55051.19425456  51192.98572404  50816.1459606
  55622.24945824  54250.92937433]
Mean: 52758.1734479
Standard deviation: 1970.1870403
```
Fine-Tune Your Model
Grid Search
Tuning hyperparameters by hand is tedious and inefficient. Scikit-Learn's GridSearchCV can do it for you: just tell it which hyperparameters to experiment with and which values to try, and it evaluates every combination of hyperparameter values using cross-validation.
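As a self-contained sketch of the API on synthetic data (this small grid is an assumption for the demo, not the notebook's grid):

```python
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(42)
X = rng.randn(100, 3)
y = X[:, 0] + rng.randn(100) * 0.1

param_grid = [{"n_estimators": [3, 10], "max_features": [1, 2]}]
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid,
                           cv=3, scoring="neg_mean_squared_error")
grid_search.fit(X, y)  # trains 2 x 2 combinations x 3 folds = 12 models
best = grid_search.best_params_
```

After fitting, best_params_, best_estimator_, and cv_results_ hold the winning combination, a refit model, and the per-combination scores.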
```python
# best combination of hyperparameter values for the RandomForestRegressor
```
```
GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
                   {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)
```
```python
grid_search.best_params_
```

```
{'max_features': 8, 'n_estimators': 30}
```
```python
grid_search.best_estimator_
```

```
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=8, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
```
```python
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
```

```
64365.7732082 {'max_features': 2, 'n_estimators': 3}
55624.8696251 {'max_features': 2, 'n_estimators': 10}
52747.3739976 {'max_features': 2, 'n_estimators': 30}
61352.3593931 {'max_features': 4, 'n_estimators': 3}
52884.9953945 {'max_features': 4, 'n_estimators': 10}
50459.6268283 {'max_features': 4, 'n_estimators': 30}
59149.2863452 {'max_features': 6, 'n_estimators': 3}
51761.6995007 {'max_features': 6, 'n_estimators': 10}
50131.7157433 {'max_features': 6, 'n_estimators': 30}
58754.4065217 {'max_features': 8, 'n_estimators': 3}
52138.1112373 {'max_features': 8, 'n_estimators': 10}
49947.2015882 {'max_features': 8, 'n_estimators': 30}
62303.8222151 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54700.9998416 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60095.3114005 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52884.0961431 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
57626.9358603 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
51655.3895683 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}
```
Analyze the Best Models and Their Errors
```python
feature_importances = grid_search.best_estimator_.feature_importances_
```

```
array([  7.33548881e-02,   6.83026577e-02,   4.30563234e-02,
         1.59958729e-02,   1.44271213e-02,   1.61089781e-02,
         1.37680834e-02,   3.30419243e-01,   4.99376357e-02,
         1.13037983e-01,   8.09265844e-02,   7.21802653e-03,
         1.68103579e-01,   5.94523572e-05,   1.75085991e-03,
         3.53271167e-03])
```
```python
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
```

```
[(0.33041924268078282, 'median_income'),
 (0.16810357872016857, 'INLAND'),
 (0.11303798307184716, 'pop_per_hhold'),
 (0.080926584412980271, 'bedrooms_per_room'),
 (0.073354888107514407, 'longitude'),
 (0.068302657661592367, 'latitude'),
 (0.049937635687884216, 'rooms_per_hhold'),
 (0.043056323384962278, 'housing_median_age'),
 (0.016108978092674749, 'population'),
 (0.015995872935832368, 'total_rooms'),
 (0.014427121335475118, 'total_bedrooms'),
 (0.013768083439830915, 'households'),
 (0.0072180265259504878, '<1H OCEAN'),
 (0.0035327116730775876, 'NEAR OCEAN'),
 (0.0017508599122762194, 'NEAR BAY'),
 (5.9452357150607577e-05, 'ISLAND')]
```
Evaluate Your System on the Test Set

```python
final_model = grid_search.best_estimator_
```

The final RMSE on the test set:

```
47964.376820929021
```