MERCS 101 - Lecture 01: Classification

This is the first part of the tutorial, focusing on MERCS as a simple classifier.

Preliminaries

External Imports

In [2]:
import numpy as np
import os
import sys
from sklearn.metrics import f1_score, accuracy_score, classification_report
import pandas as pd

MERCS imports

In [4]:
sys.path.insert(0, '../..') # Make the repo root importable so `src.*` resolves; assumes the notebook lives two levels deep -- TODO confirm
from src.mercs.core import MERCS
# NOTE(review): wildcard import pollutes the namespace; prefer importing the
# specific helpers that are actually used.
from src.mercs.utils import *

import src.datasets as datasets
/home/elia/Software/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d

Induction

Importing Data

First, we load the nursery dataset.

In [3]:
train, test = datasets.load_nursery()

This is a fully nominal dataset.

In [4]:
train.head()
Out[4]:
Var0 Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8
0 2 3 0 0 0 0 0 2 2
1 2 3 0 0 0 0 0 1 1
2 2 3 0 0 0 0 0 0 0
3 2 3 0 0 0 0 2 2 2
4 2 3 0 0 0 0 2 1 1

Training

In [5]:
model = MERCS()
In [6]:
# Induction settings: each component model is a Random Forest of 30 trees.
ind_parameters = dict(
    ind_type='RF',
    ind_n_estimators=30,
)

# Selection settings: the 'Base' strategy, 4 iterations, parameter 1.
sel_parameters = dict(
    sel_type='Base',
    sel_its=4,
    sel_param=1,
)
In [7]:
train.head()
Out[7]:
Var0 Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8
0 2 3 0 0 0 0 0 2 2
1 2 3 0 0 0 0 0 1 1
2 2 3 0 0 0 0 0 0 0
3 2 3 0 0 0 0 2 2 2
4 2 3 0 0 0 0 2 1 1
In [8]:
model.fit(train, **ind_parameters, **sel_parameters)
is_nominal in this model is: [1 1 1 1 1 1 1 1 1]

Inference

Prediction

In [9]:
# Query code: 0 marks a descriptive (input) attribute, 1 marks the target.
# Here the last of the 9 attributes (Var8) is the prediction target.
code = [0] * 8 + [1]
len(code)
Out[9]:
9
In [10]:
# Prediction settings: 'MI' inference with parameter 1.0 and up to 8 iterations
# (exact semantics of 'MI' defined by MERCS -- TODO confirm in its docs).
pred_parameters = dict(
    pred_type='MI',
    pred_param=1.0,
    pred_its=8,
)
In [11]:
# Predict the target attribute selected by `code` for the whole test set.
y_pred = model.predict(test,
                       **pred_parameters,
                       qry_code=code)
SETTINGS.PY: I AM READING A SINGLE QUERY CODE, I.E: [0, 0, 0, 0, 0, 0, 0, 0, 1]
Predicting q_code: [0, 0, 0, 0, 0, 0, 0, 0, 1]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [0, 1, 2, 3, 4, 5, 6, 7] [8]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [0, 1, 2, 3, 4, 5, 6, 7] [8]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [0, 1, 2, 3, 4, 5, 6, 7] [8]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [0, 1, 2, 3, 4, 5, 6, 7] [8]
In [12]:
y_pred
Out[12]:
array([[4.],
       [0.],
       [4.],
       ...,
       [1.],
       [3.],
       [0.]])

Evaluation

In [13]:
y_true = test[test.columns.values[np.array(code)==1]].values
In [14]:
# Macro-averaged F1 over the predicted target attribute.
obs = f1_score(y_true, y_pred, average='macro')
obs
Out[14]:
0.9808938294010889
In [15]:
# Sanity checks: the score is a single number within the valid F1 range.
assert isinstance(obs, (int, float))
assert 0 <= obs <= 1
In [ ]:
 

Missing attributes

In [16]:
# Blank out attribute Var2 (column index 2) for rows 1..7999, then retrain:
# MERCS should cope with missing values at induction time.
# NOTE(review): this mutates `train` in place, so earlier cells that displayed
# the data are stale after a re-run.
train.iloc[1:8000,2] = np.nan
model.fit(train, **ind_parameters, **sel_parameters)
is_nominal in this model is: [1 1 1 1 1 1 1 1 1]

In [17]:
train
Out[17]:
Var0 Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8
0 2 3 0.0 0 0 0 0 2 2
1 2 3 NaN 0 0 0 0 1 1
2 2 3 NaN 0 0 0 0 0 0
3 2 3 NaN 0 0 0 2 2 2
4 2 3 NaN 0 0 0 2 1 1
5 2 3 NaN 0 0 0 2 0 0
6 2 3 NaN 0 0 0 1 2 1
7 2 3 NaN 0 0 0 1 1 1
8 2 3 NaN 0 0 0 1 0 0
9 2 3 NaN 0 0 1 0 2 4
10 2 3 NaN 0 0 1 0 1 1
11 2 3 NaN 0 0 1 0 0 0
12 2 3 NaN 0 0 1 2 1 1
13 2 3 NaN 0 0 1 2 0 0
14 2 3 NaN 0 0 1 1 2 1
15 2 3 NaN 0 0 1 1 1 1
16 2 3 NaN 0 0 1 1 0 0
17 2 3 NaN 0 2 0 0 2 4
18 2 3 NaN 0 2 0 0 1 1
19 2 3 NaN 0 2 0 0 0 0
20 2 3 NaN 0 2 0 2 2 4
21 2 3 NaN 0 2 0 2 1 1
22 2 3 NaN 0 2 0 2 0 0
23 2 3 NaN 0 2 0 1 2 1
24 2 3 NaN 0 2 0 1 1 1
25 2 3 NaN 0 2 0 1 0 0
26 2 3 NaN 0 2 1 0 2 4
27 2 3 NaN 0 2 1 0 1 1
28 2 3 NaN 0 2 1 0 0 0
29 2 3 NaN 0 2 1 2 2 4
... ... ... ... ... ... ... ... ... ...
11650 0 4 2.0 3 2 0 1 2 3
11651 0 4 2.0 3 2 0 1 1 3
11652 0 4 2.0 3 2 0 1 0 0
11653 0 4 2.0 3 2 1 0 2 3
11654 0 4 2.0 3 2 1 0 1 3
11655 0 4 2.0 3 2 1 0 0 0
11656 0 4 2.0 3 2 1 2 2 3
11657 0 4 2.0 3 2 1 2 1 3
11658 0 4 2.0 3 2 1 2 0 0
11659 0 4 2.0 3 2 1 1 2 3
11660 0 4 2.0 3 2 1 1 1 3
11661 0 4 2.0 3 2 1 1 0 0
11662 0 4 2.0 3 1 0 0 2 3
11663 0 4 2.0 3 1 0 0 1 3
11664 0 4 2.0 3 1 0 0 0 0
11665 0 4 2.0 3 1 0 2 2 3
11666 0 4 2.0 3 1 0 2 1 3
11667 0 4 2.0 3 1 0 2 0 0
11668 0 4 2.0 3 1 0 1 2 3
11669 0 4 2.0 3 1 0 1 1 3
11670 0 4 2.0 3 1 0 1 0 0
11671 0 4 2.0 3 1 1 0 2 3
11672 0 4 2.0 3 1 1 0 1 3
11673 0 4 2.0 3 1 1 0 0 0
11674 0 4 2.0 3 1 1 2 2 3
11675 0 4 2.0 3 1 1 2 1 3
11676 0 4 2.0 3 1 1 2 0 0
11677 0 4 2.0 3 1 1 1 2 3
11678 0 4 2.0 3 1 1 1 1 3
11679 0 4 2.0 3 1 1 1 0 0

11680 rows × 9 columns

In [18]:
# Re-run the same query against the model trained with missing Var2 values
# and re-score it against the same ground truth.
y_pred = model.predict(test,
                       **pred_parameters,
                       qry_code=code)

obs = f1_score(y_true, y_pred, average='macro')
SETTINGS.PY: I AM READING A SINGLE QUERY CODE, I.E: [0, 0, 0, 0, 0, 0, 0, 0, 1]
Predicting q_code: [0, 0, 0, 0, 0, 0, 0, 0, 1]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [0, 1, 2, 3, 4, 5, 6, 7] [8]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [0, 1, 2, 3, 4, 5, 6, 7] [8]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [0, 1, 2, 3, 4, 5, 6, 7] [8]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [0, 1, 2, 3, 4, 5, 6, 7] [8]
/home/elia/Software/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
In [19]:
obs
Out[19]:
0.6011381377741974
In [20]:
md = model.s['metadata']
In [21]:
md
Out[21]:
{'FI': array([[5.79202658e-04, 1.01637576e-01, 3.90927923e-02, 4.41132298e-02,
         5.51740771e-02, 2.37647782e-02, 6.18721639e-02, 6.73766180e-01,
         0.00000000e+00],
        [2.56172342e-04, 2.67051193e-01, 2.20448651e-01, 0.00000000e+00,
         1.62871653e-01, 7.40922132e-02, 1.47425482e-01, 7.27570584e-02,
         5.50975781e-02],
        [1.01122574e-03, 2.33393762e-01, 0.00000000e+00, 2.54147861e-01,
         1.72857588e-01, 7.42995190e-02, 1.51176373e-01, 6.58778367e-02,
         4.72358337e-02],
        [7.66635637e-04, 0.00000000e+00, 2.28883222e-01, 2.07564829e-01,
         1.42178871e-01, 7.95316780e-02, 1.42922433e-01, 9.20642731e-02,
         1.06088058e-01],
        [5.84219248e-04, 2.47887939e-01, 2.10378861e-01, 2.06157638e-01,
         0.00000000e+00, 6.46402526e-02, 1.30896966e-01, 7.21702679e-02,
         6.72838571e-02],
        [3.20659190e-04, 2.30420792e-01, 1.99729867e-01, 1.86818742e-01,
         1.41888435e-01, 0.00000000e+00, 1.31738087e-01, 5.30053513e-02,
         5.60780675e-02],
        [3.76580657e-04, 2.53883337e-01, 1.88624496e-01, 2.19784238e-01,
         1.32828774e-01, 5.61731798e-02, 0.00000000e+00, 7.14897545e-02,
         7.68396397e-02],
        [1.33901118e-04, 8.27136640e-02, 4.93662385e-02, 6.16277903e-02,
         4.34860869e-02, 1.67029849e-02, 4.49344778e-02, 0.00000000e+00,
         7.01034857e-01],
        [0.00000000e+00, 2.23452332e-01, 1.25830373e-01, 6.47518277e-03,
         3.61720417e-02, 3.88217017e-02, 7.63240334e-02, 1.56737332e-02,
         1.10583936e-01],
        [2.38894206e-04, 2.28010787e-01, 2.24140822e-01, 2.21739644e-01,
         0.00000000e+00, 6.42047499e-02, 1.23956822e-01, 6.55675483e-02,
         7.21407330e-02],
        [8.19577404e-04, 2.87375464e-01, 0.00000000e+00, 2.52661437e-01,
         1.39925625e-01, 7.85814033e-02, 1.35868924e-01, 6.16143333e-02,
         4.31532348e-02],
        [1.01755716e-04, 7.82056480e-02, 5.30942863e-02, 5.97983977e-02,
         3.97031653e-02, 2.19245780e-02, 4.54602878e-02, 0.00000000e+00,
         7.01711881e-01],
        [7.00647335e-04, 1.02991223e-01, 4.01705563e-02, 4.29010874e-02,
         5.09486037e-02, 2.73899522e-02, 5.69508044e-02, 6.77947126e-01,
         0.00000000e+00],
        [2.68965783e-04, 2.61020292e-01, 1.84504462e-01, 2.33341955e-01,
         1.11988515e-01, 6.12434880e-02, 0.00000000e+00, 7.49734279e-02,
         7.26588935e-02],
        [0.00000000e+00, 2.06965777e-01, 4.35734276e-02, 5.52764215e-02,
         5.15142980e-03, 2.98741041e-02, 2.88839168e-02, 1.72260880e-02,
         1.46382168e-01],
        [4.44913041e-04, 2.72118825e-01, 2.23934372e-01, 0.00000000e+00,
         1.43771154e-01, 8.21382705e-02, 1.62240613e-01, 6.27143671e-02,
         5.26374862e-02],
        [2.76622497e-04, 2.29602840e-01, 1.98935448e-01, 1.83452583e-01,
         1.33073479e-01, 0.00000000e+00, 1.47775435e-01, 6.04460991e-02,
         4.64374928e-02],
        [9.11191934e-04, 0.00000000e+00, 2.28881662e-01, 2.06558060e-01,
         1.39726600e-01, 8.36577482e-02, 1.51914862e-01, 8.56863895e-02,
         1.02663486e-01],
        [0.00000000e+00, 2.20152073e-01, 7.95464806e-02, 9.86277568e-02,
         1.10271672e-02, 5.06006126e-02, 6.06682428e-02, 3.11114289e-02,
         1.48266238e-01],
        [4.94740650e-04, 2.36355771e-01, 2.03861354e-01, 2.11569584e-01,
         1.41490071e-01, 5.72459682e-02, 0.00000000e+00, 8.23801821e-02,
         6.66023287e-02],
        [8.47876328e-04, 1.11806475e-01, 3.98008928e-02, 3.77300406e-02,
         5.13222905e-02, 2.37414150e-02, 6.34289946e-02, 6.71322015e-01,
         0.00000000e+00],
        [4.61365006e-04, 2.91533359e-01, 2.19663310e-01, 0.00000000e+00,
         1.33175264e-01, 7.67140912e-02, 1.48001614e-01, 7.38725470e-02,
         5.65784494e-02],
        [3.27346882e-04, 2.61944288e-01, 0.00000000e+00, 2.54586406e-01,
         1.59656608e-01, 7.42249327e-02, 1.43644025e-01, 6.29538944e-02,
         4.26624990e-02],
        [2.60156399e-04, 2.71071060e-01, 1.99257213e-01, 1.96538640e-01,
         0.00000000e+00, 6.39979145e-02, 1.34915050e-01, 6.58189254e-02,
         6.81410407e-02],
        [7.26512255e-04, 0.00000000e+00, 2.36971463e-01, 2.18887273e-01,
         1.47213182e-01, 6.88883453e-02, 1.41644629e-01, 8.76926723e-02,
         9.79759229e-02],
        [4.40805137e-04, 2.43919133e-01, 2.00405370e-01, 1.88756883e-01,
         1.21751150e-01, 0.00000000e+00, 1.35634078e-01, 5.37320800e-02,
         5.53605009e-02],
        [1.37642256e-04, 7.94595472e-02, 5.08919923e-02, 6.04720906e-02,
         4.23744248e-02, 1.87648411e-02, 4.84336824e-02, 0.00000000e+00,
         6.99465779e-01],
        [1.27812902e-03, 2.56373705e-01, 0.00000000e+00, 2.20597123e-01,
         1.52440194e-01, 8.07495218e-02, 1.68081422e-01, 6.97532498e-02,
         5.07266554e-02],
        [2.87890671e-04, 2.15379535e-01, 1.97682639e-01, 1.93933565e-01,
         1.47167988e-01, 0.00000000e+00, 1.27631026e-01, 6.42350168e-02,
         5.36823409e-02],
        [6.99146552e-04, 2.72170997e-01, 2.02561732e-01, 1.97693270e-01,
         1.30064039e-01, 6.53789660e-02, 0.00000000e+00, 6.56223670e-02,
         6.58094827e-02],
        [4.85135234e-04, 2.89380652e-01, 2.34440478e-01, 0.00000000e+00,
         1.30717534e-01, 6.80737619e-02, 1.41010195e-01, 7.99597708e-02,
         5.59324728e-02],
        [1.96548000e-04, 7.61573979e-02, 5.19876526e-02, 5.60797297e-02,
         4.44749966e-02, 1.96112680e-02, 5.72960425e-02, 0.00000000e+00,
         6.94196365e-01],
        [7.05859368e-04, 0.00000000e+00, 2.05505224e-01, 2.19755622e-01,
         1.59749928e-01, 6.17164349e-02, 1.42701951e-01, 9.36572388e-02,
         1.16207742e-01],
        [0.00000000e+00, 1.68996477e-01, 6.70442099e-02, 7.14455231e-03,
         3.60985155e-02, 4.09022893e-02, 3.35657767e-02, 6.74413488e-03,
         2.72837378e-01],
        [4.13304226e-04, 2.41448806e-01, 2.00153968e-01, 2.04576127e-01,
         0.00000000e+00, 6.55886601e-02, 1.44945040e-01, 6.81336194e-02,
         7.47404760e-02],
        [7.69105563e-04, 1.07613991e-01, 3.97407113e-02, 4.03038227e-02,
         4.93473384e-02, 2.41715808e-02, 6.10170391e-02, 6.77036411e-01,
         0.00000000e+00]]),
 'clf_labels': [array([0., 2.]),
  array([0., 1., 2., 3., 4.]),
  array([0., 1., 2., 3.]),
  array([0., 1., 2., 3.]),
  array([0., 1., 2.]),
  array([0., 1.]),
  array([0., 1., 2.]),
  array([0., 1., 2.]),
  array([0., 1., 2., 3.])],
 'has_nan': array([False, False,  True, False, False, False, False, False, False]),
 'is_nominal': array([1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'nb_atts': 9,
 'nb_tuples': 11680,
 'nb_values': array([3, 5, 4, 4, 3, 2, 3, 3, 5]),
 'types': array([dtype('int64'), dtype('int64'), dtype('float64'), dtype('int64'),
        dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
        dtype('int64')], dtype=object)}
In [22]:
md = {k:v for k,v in md.items() if k is not 'FI'}
In [23]:
# Partition attribute indices by type according to the metadata flags.
attribute_indices = range(md['nb_atts'])
nominal_attributes = [a for a in attribute_indices if md['is_nominal'][a] == 1]
numeric_attributes = [a for a in attribute_indices if md['is_nominal'][a] == 0]
nominal_attributes
Out[23]:
[0, 1, 2, 3, 4, 5, 6, 7, 8]
In [24]:
np.unique(md['is_nominal'])
Out[24]:
array([1])
In [25]:
np.unique(md['types']).shape[0]
Out[25]:
2
In [26]:
# Copies of the model's query codes (one row per learned component model).
# NOTE(review): `a` and `b` are rebound to unrelated arrays a few cells below;
# descriptive names would avoid the shadowing.
a = model.m_codes.copy()
b = model.m_codes.copy()
In [27]:
a.copy()
Out[27]:
array([[0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1]])
In [28]:
# Rebind `a` and `b` to small 2-D arrays and set `c = None`
# to probe np.concatenate's behaviour with a non-array operand.
a = np.arange(1, 4).reshape(1, -1)
b = np.arange(3, 6).reshape(1, -1)
c = None
In [29]:
np.concatenate((a,b,c))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-29-8a143f74917f> in <module>()
----> 1 np.concatenate((a,b,c))

ValueError: all the input arrays must have same number of dimensions
In [ ]:
a.shape
In [ ]:
a = np.array([1,0,0,0])
In [ ]:
u = np.unique(a)
# np.unique already returns its values sorted ascending, so no extra sort is needed.
u
In [ ]:
np.array_equal(np.unique(a),[1,0])