MERCS 101 - Lecture 02: Regression

This is the second part of the tutorial, focusing on MERCS as a simple regressor.

Preliminaries

External Imports

In [1]:
import numpy as np
import os
import sys
from sklearn.metrics import (mean_absolute_error,
                             mean_squared_error,
                             mean_squared_log_error)
import pandas as pd

MERCS Imports

In [2]:
sys.path.insert(0, '..') # We add the parent dir to the path
from src.mercs.core import MERCS
from src.mercs.utils import *

import src.datasets as datasets

Induction

Importing Data

First, we import the slump dataset.

In [3]:
train, test = datasets.load_slump()

This is a fully numerical dataset.

In [4]:
train.head()
Out[4]:
       Var0      Var1      Var2    Var3      Var4      Var5      Var6      Var7      Var8      Var9
0  0.573840  0.424870  0.403846  0.6250  0.315068  0.573267  0.150727  0.793103  0.724138  0.430576
1  0.109705  0.772021  0.734615  0.2500  0.520548  0.394852  0.403213  0.000000  0.000000  0.579342
2  0.105485  0.766839  0.734615  0.2375  0.794521  0.386078  0.391737  0.034483  0.000000  0.595549
3  0.105485  0.766839  0.730769  0.2375  1.000000  0.380228  0.384086  0.103448  0.025862  0.602080
4  0.071730  0.580311  0.553846  0.7500  0.383562  0.628839  0.066565  0.689655  0.758621  0.232946
In [5]:
test.head()
Out[5]:
       Var0      Var1      Var2    Var3      Var4      Var5      Var6      Var7      Var8      Var9
0  0.586498  0.466321  0.446154  0.2500  0.315068  0.473823  0.487376  0.000000  0.000000  0.650460
1  0.742616  0.580311  0.000000  0.7500  0.383562  0.251536  0.567712  0.793103  0.655172  0.265360
2  0.780591  0.601036  0.000000  0.4500  0.383562  0.321732  0.659526  0.879310  0.810345  0.304306
3  0.662447  0.549223  0.523077  0.5875  0.109589  0.114068  0.525631  0.827586  0.465517  0.582487
4  0.037975  0.549223  0.526923  0.6125  0.109589  0.488447  0.475899  0.827586  0.810345  0.258829
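
All values shown above lie in [0, 1], so the attributes appear to have been rescaled beforehand. A quick check (a sketch, assuming train and test are plain pandas DataFrames):

# Sketch: report the shape of each split and verify all values stay within [0, 1].
for name, df in [('train', train), ('test', test)]:
    print(name, df.shape, float(df.values.min()), float(df.values.max()))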

Training

In [6]:
model = MERCS()
In [7]:
ind_parameters = {'ind_type':           'RF',
                  'ind_n_estimators':   10,
                  'ind_max_depth':      4}

sel_parameters = {'sel_type':           'Base',
                  'sel_its':            4,
                  'sel_param':          1}
In [8]:
model.fit(train, **ind_parameters, **sel_parameters)
is_nominal in this model is: [0 0 0 0 0 0 0 0 0 0]
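
The ind_* keys configure the induction of the component models (here Random Forests of 10 trees with maximum depth 4), while the sel_* keys presumably control how the target attribute sets of those component models are selected. The all-zero is_nominal vector printed by fit indicates that every attribute is treated as numeric, which matches the dataset description. A small sketch to double-check that (assuming train is a pandas DataFrame):

# Sketch: every column should have a numeric dtype, consistent with the
# all-zero is_nominal vector reported by fit above.
print(train.dtypes.unique())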

Inference

Prediction

In [9]:
code = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]

target_boolean = np.array(code) == 1
y_true = test[test.columns.values[target_boolean]].values
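
The query code assigns a role to each attribute: 0 marks a descriptive (input) attribute and 1 marks a target. Here Var8 and Var9 are the targets, so the ground truth is simply those two columns of the test set, exactly as extracted above. A minimal sketch of that mapping:

# Sketch: split the column names according to the query code.
code_arr = np.array(code)
desc_cols = test.columns.values[code_arr == 0]  # descriptive attributes (Var0..Var7)
targ_cols = test.columns.values[code_arr == 1]  # target attributes (Var8, Var9)
print(desc_cols, targ_cols)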
In [10]:
y_true
Out[10]:
array([[0.        , 0.6504596 ],
       [0.65517241, 0.26536043],
       [0.81034483, 0.30430576],
       [0.46551724, 0.5824867 ],
       [0.81034483, 0.25882922],
       [0.6637931 , 0.27842283],
       [0.32758621, 0.37203677],
       [0.75      , 0.4109821 ],
       [0.18965517, 0.32994678],
       [0.56896552, 0.32027092],
       [0.63793103, 0.22327044],
       [0.75862069, 0.21988389],
       [0.94827586, 0.45960329],
       [0.44827586, 0.46274794],
       [0.56034483, 0.40130624],
       [0.68965517, 0.33333333],
       [0.60344828, 0.2394775 ],
       [1.        , 0.51451379],
       [0.12931034, 0.50145138],
       [0.28448276, 0.50798258],
       [0.37068966, 0.44339623],
       [0.12068966, 0.37856797],
       [1.        , 0.66013546],
       [0.        , 0.5890179 ],
       [0.        , 0.59554911],
       [0.        , 0.57619739],
       [0.75862069, 0.53072085],
       [0.98275862, 0.6504596 ],
       [0.        , 0.79293662],
       [0.        , 0.79922593],
       [0.        , 0.80575714],
       [0.81034483, 0.77358491],
       [1.        , 0.76390905]])
In [11]:
pred_parameters = {'pred_type':     'IT',
                   'pred_param':    0.1,
                   'pred_its':      4}
In [12]:
y_pred = model.predict(test,
                       **pred_parameters,
                       qry_code=code)
SETTINGS.PY: I AM READING A SINGLE QUERY CODE, I.E: [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
Predicting q_code: [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
In [13]:
y_pred
Out[13]:
array([[0.0375    , 0.41600145],
       [0.67392241, 0.34265239],
       [0.69965517, 0.36854741],
       [0.77314655, 0.53332124],
       [0.76418103, 0.27531447],
       [0.61724138, 0.2907777 ],
       [0.25689655, 0.42696541],
       [0.64478448, 0.39421263],
       [0.43318966, 0.39377117],
       [0.60905172, 0.38010402],
       [0.68844828, 0.37906386],
       [0.66077586, 0.34044509],
       [0.75969828, 0.42963836],
       [0.50784483, 0.41060716],
       [0.56711207, 0.3916969 ],
       [0.6774569 , 0.35462022],
       [0.73025862, 0.3216074 ],
       [0.80353448, 0.50876875],
       [0.17435345, 0.35800677],
       [0.27607759, 0.36156265],
       [0.23836207, 0.34961297],
       [0.38793103, 0.5147315 ],
       [0.80663793, 0.52347   ],
       [0.04590517, 0.57252056],
       [0.04590517, 0.57252056],
       [0.05237069, 0.49264635],
       [0.77672414, 0.50921021],
       [0.80728448, 0.51967223],
       [0.06853448, 0.61653967],
       [0.06206897, 0.6209059 ],
       [0.07176724, 0.55940977],
       [0.7337069 , 0.53525036],
       [0.81323276, 0.74855467]])
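
Because MERCS is multi-directional, the same fitted model can answer different queries without retraining; only the query code needs to change. As a hedged illustration (not part of the original run), predicting only the last attribute would look like this:

# Sketch: reuse the fitted model for a different query, targeting only Var9.
alt_code = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
y_pred_alt = model.predict(test, **pred_parameters, qry_code=alt_code)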

Evaluation

In [14]:
y_true = test[test.columns.values[np.array(code)==1]].values
In [15]:
obs_1 = mean_absolute_error(y_true, y_pred)
obs_2 = mean_squared_error(y_true, y_pred)
obs_3 = mean_squared_log_error(y_true, y_pred)

obs = [obs_1, obs_2, obs_3]

for o in obs:
    assert isinstance(o, (int, float))
    assert 0 <= o 
In [16]:
obs_3
Out[16]:
0.006169584678365304
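
obs_1, obs_2 and obs_3 are the mean absolute error, mean squared error and mean squared logarithmic error of the joint prediction of Var8 and Var9. A small sketch to report all three by name:

# Sketch: label and print the three error measures computed above.
for name, value in zip(['MAE', 'MSE', 'MSLE'], obs):
    print('{}: {:.4f}'.format(name, value))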