MERCS 101 - Lecture 02: Regression

This is the second part of the tutorial, focusing on MERCS as a simple regressor.

Preliminaries

External Imports

In [1]:
import numpy as np
import os
import sys
from sklearn.metrics import (mean_absolute_error,
                             mean_squared_error,
                             mean_squared_log_error)
import pandas as pd

MERCS Imports

In [2]:
sys.path.insert(0, '..') # We add the parent dir to the path
from src.mercs.core import MERCS
from src.mercs.utils import *

import src.datasets as datasets

Induction

Importing Data

First, we import the slump dataset.

In [3]:
train, test = datasets.load_slump()

This is a fully numerical dataset.

In [4]:
train.head()
Out[4]:
       Var0      Var1      Var2    Var3      Var4      Var5      Var6      Var7      Var8      Var9
0  0.573840  0.424870  0.403846  0.6250  0.315068  0.573267  0.150727  0.793103  0.724138  0.430576
1  0.109705  0.772021  0.734615  0.2500  0.520548  0.394852  0.403213  0.000000  0.000000  0.579342
2  0.105485  0.766839  0.734615  0.2375  0.794521  0.386078  0.391737  0.034483  0.000000  0.595549
3  0.105485  0.766839  0.730769  0.2375  1.000000  0.380228  0.384086  0.103448  0.025862  0.602080
4  0.071730  0.580311  0.553846  0.7500  0.383562  0.628839  0.066565  0.689655  0.758621  0.232946
In [5]:
test.head()
Out[5]:
       Var0      Var1      Var2    Var3      Var4      Var5      Var6      Var7      Var8      Var9
0  0.586498  0.466321  0.446154  0.2500  0.315068  0.473823  0.487376  0.000000  0.000000  0.650460
1  0.742616  0.580311  0.000000  0.7500  0.383562  0.251536  0.567712  0.793103  0.655172  0.265360
2  0.780591  0.601036  0.000000  0.4500  0.383562  0.321732  0.659526  0.879310  0.810345  0.304306
3  0.662447  0.549223  0.523077  0.5875  0.109589  0.114068  0.525631  0.827586  0.465517  0.582487
4  0.037975  0.549223  0.526923  0.6125  0.109589  0.488447  0.475899  0.827586  0.810345  0.258829
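
All values shown above lie in [0, 1], so the attributes appear to have been rescaled beforehand. A quick check (a sketch, assuming train and test are plain pandas DataFrames):

# Sketch: report the shape of each split and verify all values stay within [0, 1].
for name, df in [('train', train), ('test', test)]:
    print(name, df.shape, float(df.values.min()), float(df.values.max()))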

Training

In [6]:
model = MERCS()
In [7]:
ind_parameters = {'ind_type':           'RF',
                  'ind_n_estimators':   10,
                  'ind_max_depth':      4}

sel_parameters = {'sel_type':           'Base',
                  'sel_its':            4,
                  'sel_param':          1}
In [8]:
model.fit(train, **ind_parameters, **sel_parameters)
is_nominal in this model is: [0 0 0 0 0 0 0 0 0 0]
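
The ind_* keys configure the induction of the component models (here Random Forests of 10 trees with maximum depth 4), while the sel_* keys presumably control how the target attribute sets of those component models are selected. The all-zero is_nominal vector printed by fit indicates that every attribute is treated as numeric, which matches the dataset description. A small sketch to double-check that (assuming train is a pandas DataFrame):

# Sketch: every column should have a numeric dtype, consistent with the
# all-zero is_nominal vector reported by fit above.
print(train.dtypes.unique())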

Inference

Prediction

In [9]:
code = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]

target_boolean = np.array(code) == 1
y_true = test[test.columns.values[target_boolean]].values
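
The query code assigns a role to each attribute: 0 marks a descriptive (input) attribute and 1 marks a target. Here Var8 and Var9 are the targets, so the ground truth is simply those two columns of the test set, exactly as extracted above. A minimal sketch of that mapping:

# Sketch: split the column names according to the query code.
code_arr = np.array(code)
desc_cols = test.columns.values[code_arr == 0]  # descriptive attributes (Var0..Var7)
targ_cols = test.columns.values[code_arr == 1]  # target attributes (Var8, Var9)
print(desc_cols, targ_cols)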
In [10]:
y_true
Out[10]:
array([[0.        , 0.6504596 ],
       [0.65517241, 0.26536043],
       [0.81034483, 0.30430576],
       [0.46551724, 0.5824867 ],
       [0.81034483, 0.25882922],
       [0.6637931 , 0.27842283],
       [0.32758621, 0.37203677],
       [0.75      , 0.4109821 ],
       [0.18965517, 0.32994678],
       [0.56896552, 0.32027092],
       [0.63793103, 0.22327044],
       [0.75862069, 0.21988389],
       [0.94827586, 0.45960329],
       [0.44827586, 0.46274794],
       [0.56034483, 0.40130624],
       [0.68965517, 0.33333333],
       [0.60344828, 0.2394775 ],
       [1.        , 0.51451379],
       [0.12931034, 0.50145138],
       [0.28448276, 0.50798258],
       [0.37068966, 0.44339623],
       [0.12068966, 0.37856797],
       [1.        , 0.66013546],
       [0.        , 0.5890179 ],
       [0.        , 0.59554911],
       [0.        , 0.57619739],
       [0.75862069, 0.53072085],
       [0.98275862, 0.6504596 ],
       [0.        , 0.79293662],
       [0.        , 0.79922593],
       [0.        , 0.80575714],
       [0.81034483, 0.77358491],
       [1.        , 0.76390905]])
In [11]:
pred_parameters = {'pred_type':     'IT',
                   'pred_param':    0.1,
                   'pred_its':      4}
In [12]:
y_pred = model.predict(test,
                       **pred_parameters,
                       qry_code=code)
SETTINGS.PY: I AM READING A SINGLE QUERY CODE, I.E: [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
Predicting q_code: [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
In [13]:
y_pred
Out[13]:
array([[0.0375    , 0.41600145],
       [0.67392241, 0.34265239],
       [0.69965517, 0.36854741],
       [0.77314655, 0.53332124],
       [0.76418103, 0.27531447],
       [0.61724138, 0.2907777 ],
       [0.25689655, 0.42696541],
       [0.64478448, 0.39421263],
       [0.43318966, 0.39377117],
       [0.60905172, 0.38010402],
       [0.68844828, 0.37906386],
       [0.66077586, 0.34044509],
       [0.75969828, 0.42963836],
       [0.50784483, 0.41060716],
       [0.56711207, 0.3916969 ],
       [0.6774569 , 0.35462022],
       [0.73025862, 0.3216074 ],
       [0.80353448, 0.50876875],
       [0.17435345, 0.35800677],
       [0.27607759, 0.36156265],
       [0.23836207, 0.34961297],
       [0.38793103, 0.5147315 ],
       [0.80663793, 0.52347   ],
       [0.04590517, 0.57252056],
       [0.04590517, 0.57252056],
       [0.05237069, 0.49264635],
       [0.77672414, 0.50921021],
       [0.80728448, 0.51967223],
       [0.06853448, 0.61653967],
       [0.06206897, 0.6209059 ],
       [0.07176724, 0.55940977],
       [0.7337069 , 0.53525036],
       [0.81323276, 0.74855467]])
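
Because MERCS is multi-directional, the same fitted model can answer different queries without retraining; only the query code needs to change. As a hedged illustration (not part of the original run), predicting only the last attribute would look like this:

# Sketch: reuse the fitted model for a different query, targeting only Var9.
alt_code = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
y_pred_alt = model.predict(test, **pred_parameters, qry_code=alt_code)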

Evaluation

In [14]:
y_true = test[test.columns.values[np.array(code)==1]].values
In [15]:
obs_1 = mean_absolute_error(y_true, y_pred)
obs_2 = mean_squared_error(y_true, y_pred)
obs_3 = mean_squared_log_error(y_true, y_pred)

obs = [obs_1, obs_2, obs_3]

for o in obs:
    assert isinstance(o, (int, float))
    assert 0 <= o 
In [16]:
obs_3
Out[16]:
0.006169584678365304
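
obs_1, obs_2 and obs_3 are the mean absolute error, mean squared error and mean squared logarithmic error of the joint prediction of Var8 and Var9. A small sketch to report all three by name:

# Sketch: label and print the three error measures computed above.
for name, value in zip(['MAE', 'MSE', 'MSLE'], obs):
    print('{}: {:.4f}'.format(name, value))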