End-to-End Machine Learning Project

fetch the data:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download housing.tgz from *housing_url* and extract it into *housing_path*.

    Creates the target directory if it does not exist. Propagates whatever
    urllib/tarfile raise on network or archive errors.
    """
    # exist_ok makes this idempotent and avoids the isdir/makedirs race
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # context manager guarantees the archive is closed even if extractall raises
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

fetch_housing_data()

load data using Pandas:

1
2
3
4
5
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read housing.csv from *housing_path* and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
1
2
housing = load_housing_data()
housing.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY

The info() method is useful to get a quick description of the data, in particular the total number of rows, and each attribute’s type and number of non-null values

1
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
1
housing["ocean_proximity"].value_counts()
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

The describe() method shows a summary of the numerical attributes

1
housing.describe()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000
1
2
3
4
%matplotlib inline 
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20, 15))
plt.show()

png

Create a Test Set

just pick some instances randomly, typically 20% of the dataset, and set them aside:

1
2
3
4
5
6
7
8
9
10
11
import numpy as np

def split_train_test(data, test_ratio):
    """Randomly split *data* into (train_set, test_set).

    *test_ratio* is the fraction of rows assigned to the test set.
    NOTE(review): uses the global NumPy RNG, so every call yields a
    different split unless the seed is fixed beforehand.
    """
    n_test = int(len(data) * test_ratio)
    order = np.random.permutation(len(data))
    test_idx, train_idx = order[:n_test], order[n_test:]
    return data.iloc[train_idx], data.iloc[test_idx]

train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")
16512 train + 4128 test

A common solution is to use each instance’s identifier to decide whether or not it should go in the test set. For example, you could compute a hash of each instance’s identifier, keep only the last byte of the hash, and put the instance in the test set if this value is lower or equal to 51.

1
2
3
4
5
6
7
8
9
import hashlib

def test_set_check(identifier, test_ratio, hash):
return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_radio, id_column, hash=hashlib.md5):
ids = data[id_column]
in_test_set = ids.apply(lambda id_: test_set_check(id_, test_radio, hash))
return data.loc[~in_test_set], data.loc[in_test_set]
1
2
3
4
5
6
7
8
# housing_with_id = housing.reset_index() # adds an 'index' column
# train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

# housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
# train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
# from sklearn.model_selection import train_test_split
#
# train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
1
2
3
4
5
6
# Create an income-category attribute: divide median income by 1.5 to limit
# the number of categories, round up, then merge everything above 5 into
# category 5.0 (categories end up as 1.0 .. 5.0).
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)

from sklearn.model_selection import train_test_split

# Purely random 80/20 split, for comparison with stratified sampling.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

stratified sampling based on the income category

1
2
3
4
5
6
7
8
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split, keyed on the income category so both sets
# preserve the income-category proportions of the full dataset.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in splitter.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

housing["income_cat"].value_counts() / len(housing)
3.0    0.350581
2.0    0.318847
4.0    0.176308
5.0    0.114438
1.0    0.039826
Name: income_cat, dtype: float64

The test set generated using stratified sampling has income category proportions almost identical to those in the full dataset, whereas the test set generated using purely random sampling is quite skewed.

1
2
print(strat_test_set["income_cat"].value_counts() / len(strat_test_set))
test_set["income_cat"].value_counts() / len(test_set["income_cat"])
3.0    0.350533
2.0    0.318798
4.0    0.176357
5.0    0.114583
1.0    0.039729
Name: income_cat, dtype: float64





3.0    0.358527
2.0    0.324370
4.0    0.167393
5.0    0.109496
1.0    0.040213
Name: income_cat, dtype: float64
1
2
# Remove the temporary income_cat attribute from both splits.
# (loop variable renamed: the original `set` shadowed the builtin)
for subset in (strat_train_set, strat_test_set):
    subset.drop(["income_cat"], axis=1, inplace=True)

Discover and Visualize the Data to Gain Insights

Let’s create a copy so you can play with it without harming the training set:

1
housing = strat_train_set.copy()

Visualizing Geographical Data

1
housing.plot(kind="scatter", x="longitude", y="latitude")
<matplotlib.axes._subplots.AxesSubplot at 0x112340b38>

png

1
2
3
# setting the alpha option to 0.1 makes it much easier to visualize the places where there is 
# a high density of data points
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x112a02f28>

png

The radius of each circle represents the district’s population (option s), and the color represents the price (option c). We will use a predefined color map (option cmap) called jet, which ranges from blue (low values) to red (high values):

1
2
3
4
5
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
s=housing["population"]/100, label="population",
c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True
)
plt.legend()
<matplotlib.legend.Legend at 0x112c5dba8>

png

1
2
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
median_house_value    1.000000
median_income         0.687160
total_rooms           0.135097
housing_median_age    0.114110
households            0.064506
total_bedrooms        0.047689
population           -0.026920
longitude            -0.047432
latitude             -0.142724
Name: median_house_value, dtype: float64

Another way to check for correlation between attributes is to use Pandas’ scatter_matrix function, which plots every numerical attribute against every other numerical attribute.

1
2
3
4
5
from pandas.tools.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
"housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
/Users/tweinyan/.pyenv/versions/3.6.2/envs/venv36/lib/python3.6/site-packages/ipykernel_launcher.py:5: FutureWarning: 'pandas.tools.plotting.scatter_matrix' is deprecated, import 'pandas.plotting.scatter_matrix' instead.
  """





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x112b8ea58>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10f875390>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x110007080>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10f82b2e8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10f77fbe0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10f77fba8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10f736400>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10f774f60>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10f6cf400>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10f6a8a20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10f642da0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10f688668>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10f5c5d30>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10f623588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10ffca908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10ff60ac8>]], dtype=object)

png

The most promising attribute to predict the median house value is the median income

1
2
housing.plot(kind="scatter", x="median_income", y="median_house_value",
alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x10f532be0>

png

在500000,450000,350000附近有直线,为了不对算法造成不良影响,需要去除

Experimenting with Attribute Combinations

1
2
3
# Engineered ratio attributes: per-household and per-room quantities are more
# informative than the raw per-district totals.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]
1
2
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
median_house_value          1.000000
median_income               0.687160
rooms_per_household         0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64

Prepare the Data for Machine Learning Algorithms

separate the predictors and the labels

1
2
# Separate predictors from the target; drop() returns a copy, so
# strat_train_set itself is left untouched.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

Data Cleaning

The total_bedrooms attribute has some missing values. You have three options:

  • Get rid of the corresponding districts.
  • Get rid of the whole attribute.
  • Set the values to some value (zero, the mean, the median, etc.)
1
2
3
4
5
6
7
8
9
10
11
12
13
# housing.dropna(subset=["total_bedrooms"]) # option 1
# housing.drop("total_bedrooms", axis=1) # option 2
# median = housing["total_bedrooms"].median()
# housing["total_bedrooms"].fillna(median) # option 3

# Scikit-learn provides a handy class to take care of missing values: Imputer
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")  # replace NaNs with the column median
# the median is only defined for numeric data, so drop the text attribute
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)  # learns the median of every column
imputer.statistics_  # the learned medians

# sanity check: identical to computing the medians by hand
housing_num.median().values
array([ -118.51  ,    34.26  ,    29.    ,  2119.5   ,   433.    ,
        1164.    ,   408.    ,     3.5409])
1
2
X = imputer.transform(housing_num)  # plain NumPy array with NaNs filled in
housing_tr = pd.DataFrame(X, columns=housing_num.columns)  # back to a DataFrame

Handling Text and Categorical Attributes

ocean_proximity is a text attribute, let’s convert these text labels to numbers

1
2
3
4
5
6
from sklearn.preprocessing import LabelEncoder

# Map each ocean_proximity category to an integer label (0..4).
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
array([0, 0, 4, ..., 1, 0, 3])
1
print(encoder.classes_)
['<1H OCEAN' 'INLAND' 'ISLAND' 'NEAR BAY' 'NEAR OCEAN']

0对应’<1H OCEAN’, 4对应’NEAR OCEAN’,两个相近属性对应的值距离太远,显然这不是我们想要的。

one-hot encoding

To fix this issue, a common solution is to create one binary attribute per category: one attribute equal to 1 when the category is “<1H OCEAN” (and 0 otherwise), another attribute equal to 1 when the category is “INLAND” (and 0 otherwise), and so on. This is called one-hot encoding, because only one attribute will be equal to 1 (hot), while the others will be 0 (cold).

1
2
3
4
from sklearn.preprocessing import OneHotEncoder
# OneHotEncoder expects a 2-D array, hence the reshape(-1, 1);
# the result is a SciPy sparse matrix, not a dense NumPy array.
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot
<16512x5 sparse matrix of type '<class 'numpy.float64'>'
    with 16512 stored elements in Compressed Sparse Row format>
1
2
# convert it to a dense NumPy array
housing_cat_1hot.toarray()
array([[ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.],
       ..., 
       [ 0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.]])
1
2
3
4
5
# Use LabelBinarizer to replace the LabelEncoder + OneHotEncoder pair above
# (text categories straight to a dense one-hot array in a single step).
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot
array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ..., 
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

Custom Transformers

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
from sklearn.base import BaseEstimator, TransformerMixin

# column positions in the numeric feature matrix
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append engineered ratio columns to a NumPy feature matrix.

    Adds rooms_per_household and population_per_household, plus
    bedrooms_per_room when ``add_bedrooms_per_room`` is True.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # stateless transformer: nothing to learn
        return self

    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if not self.add_bedrooms_per_room:
            return np.c_[X, rooms_per_household, population_per_household]
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

Feature Scaling

  • Min-max scaling
  • Standardization

Transformation Pipelines

A small pipeline for the numerical attributes:

1
2
3
4
5
6
7
8
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Numeric preprocessing: impute missing values with the median, append the
# engineered ratio attributes, then standardize every column.
num_pipeline = Pipeline([
('imputer', Imputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

bug fix:

https://stackoverflow.com/questions/46162855/fit-transform-takes-2-positional-arguments-but-3-were-given-with-labelbinarize

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from sklearn.pipeline import FeatureUnion

# column-name lists consumed by the DataFrameSelector transformers below
num_attribs = list(housing_num)  # names of the numeric columns
cat_attribs = ["ocean_proximity"]

from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a subset of DataFrame columns and return them as a NumPy array.

    Bridges pandas DataFrames and scikit-learn transformers, which expect
    plain arrays.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # stateless: nothing to learn
        return self

    def transform(self, X, y=None):
        return X[self.attribute_names].values

class LableBinarizer_new(BaseEstimator, TransformerMixin):
    """Pipeline-compatible wrapper around LabelBinarizer.

    Pipeline calls fit_transform(X, y), but LabelBinarizer.fit_transform
    accepts only (X); this wrapper swallows the extra argument.
    NOTE(review): class name misspells "Label"; kept for compatibility.
    """

    def fit(self, X, y=0):
        return self

    def transform(self, X, y=0):
        return LabelBinarizer().fit_transform(X)

# Numeric branch: select numeric columns, impute, add ratio attributes, scale.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch. BUG FIX: use the LableBinarizer_new wrapper defined
# above -- a raw LabelBinarizer fails inside a Pipeline because its
# fit_transform takes only (X) while Pipeline passes (X, y).
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', LableBinarizer_new()),
])

# Run both branches and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ..., 
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])
1
housing_prepared.shape
(16512, 16)

Select and Train a Model

Training and Evaluating on the Training Set

Train a Linear Regression model

1
2
3
4
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the prepared training data.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
/Users/tweinyan/.pyenv/versions/3.6.2/envs/venv36/lib/python3.6/site-packages/scipy/linalg/basic.py:1018: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
1
2
3
4
5
# Quick sanity check on a few training instances.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:\t", lin_reg.predict(some_data_prepared))
print("Labels:\t\t", list(some_labels))
Predictions:     [ 210644.60459286  317768.80697211  210956.43331178   59218.98886849
  189747.55849878]
Labels:         [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]
1
2
3
4
5
from sklearn.metrics import mean_squared_error
# RMSE of the linear model measured on the full training set.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
68628.198198489219
1
2
3
4
5
6
7
8
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

# Evaluating on the data the tree was trained on: an RMSE of 0.0 here is a
# symptom of severe overfitting, not of a perfect model.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
0.0

看上去过拟合了,但是我们并不想在测试集上验证,所以需要一个交叉验证集

Better Evaluation Using Cross-Validation

Use Scikit-Learn’s cross-validation. Performs K-fold cross-validation: it randomly splits the training set into 10 distinct subsets called folds, then it trains and evaluates the Decision Tree model 10 times, picking a different fold for evaluation every time and training on the other 9 folds.

Decision Tree
1
2
3
4
5
# the result is an array containing the 10 evaluation scores
from sklearn.model_selection import cross_val_score
# scikit-learn scoring is "greater is better", hence the negated MSE;
# negate again before the square root to recover RMSE values
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
1
2
3
4
5
6
def display_scores(scores):
    """Print a scores array together with its mean and standard deviation."""
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)
Scores: [ 68598.67166286  65608.52457119  70810.0869789   69211.66220381
  72014.87961395  75011.86954035  71534.25262773  70335.79903809
  76849.90225183  70211.15023877]
Mean: 71018.6798727
Standard deviation: 3012.64703648
Linear Regression
1
2
3
4
# 10-fold cross-validated RMSE for the linear model.
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
Scores: [ 66782.73843989  66960.118071    70361.18285107  74742.02956966
  68022.09224176  71193.07033936  64969.63056405  68276.69992785
  71543.69797334  67665.10082067]
Mean: 69051.6360799
Standard deviation: 2732.39242562
RandomForestRegressor
1
2
3
4
5
6
7
8
from sklearn.ensemble import RandomForestRegressor

# Random forest: an ensemble of decision trees trained on random subsets
# of the features; scored with the same 10-fold cross-validation.
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
Scores: [ 50896.82385485  49692.14156589  52932.15059225  54728.28076321
  52398.83293126  55051.19425456  51192.98572404  50816.1459606
  55622.24945824  54250.92937433]
Mean: 52758.1734479
Standard deviation: 1970.1870403

Fine-Tune Your Model

手动调参是乏味且低效的,使用Scikit-Learn的GridSearchCV可以做这件事,只要指定哪些超参,哪些值需要尝试,函数会使用交叉验证 评价所有超参值的组合。

1
2
3
4
5
6
7
8
9
10
11
# best combination of hyperparameter values for the RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Two grids: 3x4 = 12 combinations with bootstrapping, plus 2x3 = 6 with
# bootstrap disabled -- 18 combinations, each cross-validated 5 times.
param_grid = [
{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)
GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)
1
grid_search.best_params_
{'max_features': 8, 'n_estimators': 30}
1
grid_search.best_estimator_
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=8, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
1
2
3
cvres = grid_search.cv_results_
# convert each mean negated-MSE back to an RMSE and show its parameter combo
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
64365.7732082 {'max_features': 2, 'n_estimators': 3}
55624.8696251 {'max_features': 2, 'n_estimators': 10}
52747.3739976 {'max_features': 2, 'n_estimators': 30}
61352.3593931 {'max_features': 4, 'n_estimators': 3}
52884.9953945 {'max_features': 4, 'n_estimators': 10}
50459.6268283 {'max_features': 4, 'n_estimators': 30}
59149.2863452 {'max_features': 6, 'n_estimators': 3}
51761.6995007 {'max_features': 6, 'n_estimators': 10}
50131.7157433 {'max_features': 6, 'n_estimators': 30}
58754.4065217 {'max_features': 8, 'n_estimators': 3}
52138.1112373 {'max_features': 8, 'n_estimators': 10}
49947.2015882 {'max_features': 8, 'n_estimators': 30}
62303.8222151 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54700.9998416 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60095.3114005 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52884.0961431 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
57626.9358603 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
51655.3895683 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}

Analyze the Best Models and Their Errors

1
2
# Relative importance of each feature as estimated by the best forest.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
array([  7.33548881e-02,   6.83026577e-02,   4.30563234e-02,
         1.59958729e-02,   1.44271213e-02,   1.61089781e-02,
         1.37680834e-02,   3.30419243e-01,   4.99376357e-02,
         1.13037983e-01,   8.09265844e-02,   7.21802653e-03,
         1.68103579e-01,   5.94523572e-05,   1.75085991e-03,
         3.53271167e-03])
1
2
3
4
# Pair each importance score with its feature name: numeric columns, then the
# engineered ratio attributes, then the one-hot ocean_proximity categories.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)
[(0.33041924268078282, 'median_income'),
 (0.16810357872016857, 'INLAND'),
 (0.11303798307184716, 'pop_per_hhold'),
 (0.080926584412980271, 'bedrooms_per_room'),
 (0.073354888107514407, 'longitude'),
 (0.068302657661592367, 'latitude'),
 (0.049937635687884216, 'rooms_per_hhold'),
 (0.043056323384962278, 'housing_median_age'),
 (0.016108978092674749, 'population'),
 (0.015995872935832368, 'total_rooms'),
 (0.014427121335475118, 'total_bedrooms'),
 (0.013768083439830915, 'households'),
 (0.0072180265259504878, '<1H OCEAN'),
 (0.0035327116730775876, 'NEAR OCEAN'),
 (0.0017508599122762194, 'NEAR BAY'),
 (5.9452357150607577e-05, 'ISLAND')]

Evaluate Your System on the Test Set

1
2
3
4
5
6
7
8
# Final evaluation: run the tuned model once on the held-out test set.
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
# transform (NOT fit_transform): reuse statistics learned on the training set
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse
47964.376820929021