Source code for mlens.ensemble.base

"""ML-ENSEMBLE

:author: Sebastian Flennerhag
:copyright: 2017-2018
:licence: MIT

Base classes for ensemble layer management.
"""
# pylint: disable=protected-access
# pylint: disable=too-many-arguments
# pylint: disable=too-many-instance-attributes


from __future__ import division, print_function, with_statement

from abc import ABCMeta, abstractmethod
import warnings

from .. import config
from ..parallel import Layer, ParallelProcessing, make_group
from ..parallel.base import BaseStacker
from ..externals.sklearn.validation import check_random_state
from ..utils import (check_ensemble_build, print_time,
                     safe_print, IdTrain, format_name)
from ..utils.exceptions import (
    LayerSpecificationWarning, NotFittedError, NotInitializedError)
from ..metrics import Data
from ..externals.sklearn.base import BaseEstimator, clone
try:
    # Try get performance counter
    from time import perf_counter as time
except ImportError:
    # Fall back on wall clock
    from time import time


# Module-level list handed to ``format_name`` each time a ``Sequential`` is
# instantiated.
# NOTE(review): presumably a registry of names already in use -- confirm
# against ``format_name`` in ``mlens.utils``.
GLOBAL_SEQUENTIAL_NAME = []


def check_kwargs(kwargs, forbidden):
    """Pop ensemble-wide arguments from layer kwargs and warn about them.

    Parameters
    ----------
    kwargs : dict
        Layer-specific keyword arguments. Modified in place: every key
        listed in ``forbidden`` is removed.

    forbidden : iterable of str
        Parameter names that may not be set on an individual layer.

    Notes
    -----
    A :class:`LayerSpecificationWarning` is issued for each removed key whose
    value is not ``None``. Keys mapped to ``None`` are removed silently.
    """
    for f in forbidden:
        s = kwargs.pop(f, None)
        if s is not None:
            # Trailing space matters: the two literals are concatenated.
            warnings.warn(
                "Layer-specific parameter '%s' contradicts "
                "ensemble-wide settings. Ignoring." % f,
                LayerSpecificationWarning)


def print_job(lc, start_message):
    """Announce the start of a job and start its timer.

    Parameters
    ----------
    lc : :class:`Sequential`
        Container running the job. Its ``verbose``, ``stack``, ``n_jobs`` and
        ``backend`` attributes are read.

    start_message : str
        Text placed in front of the layer count.

    Returns
    -------
    f : str
        ``"stdout"`` when ``lc.verbose < 10``, otherwise ``"stderr"``.

    t0 : float
        Timer reading taken after any messages have been printed.
    """
    if lc.verbose < 10:
        stream = "stdout"
    else:
        stream = "stderr"

    if lc.verbose:
        header = "\n%s %d layers" % (start_message, len(lc.stack))
        safe_print(header, file=stream, flush=True)

        # Job configuration is only shown from verbosity level 5 and up.
        if lc.verbose >= 5:
            template = ("[INFO] n_jobs = %i\n"
                        "[INFO] backend = %r\n"
                        "[INFO] start_method = %r\n"
                        "[INFO] cache = %r\n")
            details = (lc.n_jobs, lc.backend,
                       config.get_start_method(), config.get_tmpdir())
            safe_print(template % details, file=stream, flush=True)

    started = time()
    return stream, started


###############################################################################
class Sequential(BaseStacker):
    r"""Container class for a stack of sequentially processed estimators.

    The Sequential class stores all layers in order and possesses a
    ``get_params`` method to appear as an estimator in the Scikit-learn API.
    This allows correct cloning and parameter updating.

    Parameters
    ----------
    name : str, optional (default = None)
        name of the instance. Passed through ``format_name`` together with
        the ``'sequential'`` default and the module-level name list.

    verbose : int or bool (default = False)
        level of verbosity.

            - ``verbose = 0`` silent (same as ``verbose = False``)

            - ``verbose = 1`` messages at start and finish
              (same as ``verbose = True``)

            - ``verbose = 2`` messages for each layer

            - etc

        If ``verbose >= 10`` prints to ``sys.stderr``, else ``sys.stdout``.

    stack: list, optional (default = None)
        list of estimators (i.e. layers) to build instance with. A single
        ``Layer`` instance is also accepted and is wrapped in a list.

    n_jobs : int (default = -1)
        Degree of concurrency. Set ``n_jobs = -1`` for maximal parallelism and
        ``n_jobs=1`` for sequential processing.

    backend : str, (default="threading")
        the joblib backend to use (i.e. "multiprocessing" or "threading").

    raise_on_exception : bool (default = False)
        raise error on soft exceptions. Otherwise issue warning.

    Raises
    ------
    ValueError
        If ``stack`` is truthy and is neither a list nor an object whose
        class name is ``Layer`` (case-insensitive).
    """

    def __init__(self, name=None, verbose=False, stack=None, **kwargs):
        if stack and not isinstance(stack, list):
            # Duck-typed check on the class name; a lone layer is wrapped.
            if stack.__class__.__name__.lower() == 'layer':
                stack = [stack]
            else:
                # Wrap in a 1-tuple so a tuple-valued ``stack`` is formatted
                # as a whole instead of being unpacked by ``%``.
                raise ValueError(
                    "Expect stack to be a Layer or a list of Layers. "
                    "Got %r" % (stack,))

        name = format_name(name, 'sequential', GLOBAL_SEQUENTIAL_NAME)

        super(Sequential, self).__init__(
            stack=stack, name=name, verbose=verbose, **kwargs)

    def __iter__(self):
        """Generator for stacked layers"""
        for layer in self.stack:
            yield layer

    def fit(self, X, y=None, **kwargs):
        r"""Fit instance.

        Iteratively fits each layer in the stack on the output of the
        preceding layer. First layer is fitted on input data.

        Parameters
        -----------
        X : array-like of shape = [n_samples, n_features]
            input matrix to be used for fitting and predicting.

        y : array-like of shape = [n_samples, ]
            training labels.

        **kwargs : optional
            optional arguments to processor

        Returns
        -------
        out : instance or processor output
            ``self`` when the processor returns ``None``; otherwise whatever
            the processor returned (``fit_transform`` relies on this by
            passing ``return_preds=True``).

        Raises
        ------
        NotInitializedError
            If the stack is empty.
        """
        if not self.__stack__:
            raise NotInitializedError("No elements in stack to fit.")

        f, t0 = print_job(self, "Fitting")

        with ParallelProcessing(self.backend, self.n_jobs,
                                max(self.verbose - 4, 0)) as manager:
            out = manager.stack(self, 'fit', X, y, **kwargs)

        if self.verbose:
            print_time(t0, "{:<35}".format("Fit complete"), file=f)

        if out is None:
            return self
        return out

    def fit_transform(self, X, y=None, **kwargs):
        r"""Fit instance and return cross-validated predictions.

        Equivalent to ``Sequential().fit(X, y, return_preds=True)``

        Parameters
        -----------
        X : array-like of shape = [n_samples, n_features]
            input matrix to be used for fitting and predicting.

        y : array-like of shape = [n_samples, ]
            training labels.

        **kwargs : optional
            optional arguments to processor
        """
        return self.fit(X, y, return_preds=True, **kwargs)

    def predict(self, X, **kwargs):
        r"""Predict.

        Parameters
        -----------
        X : array-like of shape = [n_samples, n_features]
            input matrix to be used for prediction.

        **kwargs : optional
            optional keyword arguments.

        Returns
        -------
        X_pred : array-like of shape = [n_samples, n_fitted_estimators]
            predictions from final layer.

        Raises
        ------
        NotFittedError
            If the instance has not been fitted.
        """
        if not self.__fitted__:
            raise NotFittedError("Instance not fitted.")

        f, t0 = print_job(self, "Predicting")

        out = self._predict(X, 'predict', **kwargs)

        if self.verbose:
            print_time(t0, "{:<35}".format("Predict complete"),
                       file=f, flush=True)
        return out

    def transform(self, X, **kwargs):
        """Predict using sub-learners as is done during the ``fit`` call.

        Parameters
        -----------
        X : array-like of shape = [n_samples, n_features]
            input matrix to be used for prediction.

        **kwargs : optional
            optional keyword arguments.

        Returns
        -------
        X_pred : array-like of shape = [n_test_samples, n_fitted_estimators]
            predictions from ``fit`` call to final layer.

        Raises
        ------
        NotFittedError
            If the instance has not been fitted.
        """
        if not self.__fitted__:
            raise NotFittedError("Instance not fitted.")

        f, t0 = print_job(self, "Transforming")

        out = self._predict(X, 'transform', **kwargs)

        if self.verbose:
            print_time(t0, "{:<35}".format("Transform complete"),
                       file=f, flush=True)
        return out

    def _predict(self, X, job, **kwargs):
        r"""Generic for processing a predict job through all layers.

        Parameters
        -----------
        X : array-like of shape = [n_samples, n_features]
            input matrix to be used for prediction.

        job : str
            type of prediction. Should be 'predict' or 'transform'.

        **kwargs : optional
            forwarded to the processor. ``return_preds`` is extracted first
            and defaults to ``True``.

        Returns
        -------
        X_pred : array-like
            predictions from final layer. Either predictions from ``fit`` call
            or new predictions on X using base learners fitted on all training
            data. Each returned array is squeezed; a single array is returned
            directly rather than in a list.
        """
        r = kwargs.pop('return_preds', True)
        with ParallelProcessing(self.backend, self.n_jobs,
                                max(self.verbose - 4, 0)) as manager:
            out = manager.stack(self, job, X, return_preds=r, **kwargs)

        # Normalise to a list so every output is squeezed the same way.
        if not isinstance(out, list):
            out = [out]
        out = [p.squeeze() for p in out]
        if len(out) == 1:
            out = out[0]
        return out

    @property
    def data(self):
        """Ensemble data"""
        out = list()
        for layer in self.stack:
            d = layer.raw_data
            if not d:
                continue
            # Prefix every entry with the layer name to keep keys distinct.
            out.extend([('%s/%s' % (layer.name, k), v) for k, v in d])
        return Data(out)
###############################################################################
class BaseEnsemble(BaseEstimator):
    """BaseEnsemble class.

    Core ensemble class methods used to add ensemble layers and manipulate
    parameters.

    Parameters
    ----------
    shuffle: bool (default=False)
        whether to shuffle input data during fit calls

    random_state: obj, optional (default=None)
        random seed. When truthy, it is passed through ``check_random_state``
        to draw a layer-specific integer seed when a layer is built.

    scorer: obj, optional
        scorer function

    verbose: bool or int, optional (default=False)
        verbosity

    layers: list, optional (default=None)
        layers to initialize the ensemble with. Cloned before being pushed
        onto the backend.

    array_check: optional (default=None)
        deprecated. Passing any value other than ``None`` issues a
        ``DeprecationWarning``.

    model_selection: bool (default=False)
        Whether to use the ensemble in model selection mode. If ``True``,
        this will alter the ``transform`` method. When calling ``transform``
        on new data, the ensemble will call ``predict``, while calling
        ``transform`` with the training data reproduces predictions from the
        ``fit`` call. Hence the ensemble can be used as a pure transformer
        in a preprocessing pipeline passed to the :class:`Evaluator`, as
        training folds are faithfully reproduced as during a ``fit`` call and
        test folds are transformed with the ``predict`` method.

    sample_size: int (default=20)
        size of training set sample
        (``[min(sample_size, X.size[0]), min(X.size[1], sample_size)]``)

    **kwargs : optional
        keyword arguments passed to the :class:`Sequential` backend.
    """

    __metaclass__ = ABCMeta

    @abstractmethod
    def __init__(
            self, shuffle=False, random_state=None, scorer=None, verbose=False,
            layers=None, array_check=None, model_selection=False,
            sample_size=20, **kwargs):
        self.shuffle = shuffle
        self.random_state = random_state
        self.scorer = scorer
        self._model_selection = model_selection
        self._verbose = verbose
        self.layers = layers if layers else list()
        self.sample_size = sample_size
        # Goes through the property setter, which reads ``sample_size``.
        self.model_selection = model_selection

        self._backend = Sequential(verbose=verbose, **kwargs)
        self.raise_on_exception = self._backend.raise_on_exception

        if layers:
            layers_ = clone(layers)
            self._backend.push(*layers_)

        if array_check is not None:
            warnings.warn(
                "array checking is deprecated. The array_check argument will be removed in 0.2.4.",
                DeprecationWarning)

    def add(self, estimators, indexer, preprocessing=None, **kwargs):
        """Method for adding a layer.

        Parameters
        -----------
        estimators: dict of lists or list of estimators, or :class:`Layer`.
            Pre-made layer or estimators to construct layer with.
            If ``preprocessing`` is ``None`` or ``list``, ``estimators`` should
            be a ``list``. The list can either contain estimator instances,
            named tuples of estimator instances, or a combination of both. ::

                option_1 = [estimator_1, estimator_2]
                option_2 = [("est-1", estimator_1), ("est-2", estimator_2)]
                option_3 = [estimator_1, ("est-2", estimator_2)]

            If different preprocessing pipelines are desired, a dictionary
            that maps estimators to preprocessing pipelines must be passed.
            The names of the estimator dictionary must correspond to the
            names of the preprocessing dictionary. ::

                preprocessing_cases = {"case-1": [trans_1, trans_2],
                                       "case-2": [alt_trans_1, alt_trans_2]}

                estimators = {"case-1": [est_a, est_b],
                              "case-2": [est_c, est_d]}

            The lists for each dictionary entry can be any of ``option_1``,
            ``option_2`` and ``option_3``.

        indexer : instance
            Indexer instance to use. See :mod:`mlens.base` for details.

        preprocessing: dict of lists or list, optional (default = None)
            preprocessing pipelines for given layer. If
            the same preprocessing applies to all estimators, ``preprocessing``
            should be a list of transformer instances. The list can contain the
            instances directly, named tuples of transformers,
            or a combination of both. ::

                option_1 = [transformer_1, transformer_2]
                option_2 = [("trans-1", transformer_1),
                            ("trans-2", transformer_2)]
                option_3 = [transformer_1, ("trans-2", transformer_2)]

            If different preprocessing pipelines are desired, a dictionary
            that maps preprocessing pipelines must be passed. The names of the
            preprocessing dictionary must correspond to the names of the
            estimator dictionary. ::

                preprocessing_cases = {"case-1": [trans_1, trans_2],
                                       "case-2": [alt_trans_1, alt_trans_2]}

                estimators = {"case-1": [est_a, est_b],
                              "case-2": [est_c, est_d]}

            The lists for each dictionary entry can be any of ``option_1``,
            ``option_2`` and ``option_3``.

        **kwargs : optional
            keyword arguments to be passed onto the layer at instantiation.

        Returns
        ----------
        self : instance
            Modified instance.
        """
        lyr = self._build_layer(estimators, indexer, preprocessing, **kwargs)
        # ``layers`` keeps a clone; the built layer itself goes to the backend
        # and is exposed as an attribute (``layer-1`` -> ``layer_1``).
        self.layers.append(clone(lyr))
        setattr(self, lyr.name.replace('-', '_'), lyr)
        self._backend.push(lyr)
        return self

    def replace(self, idx, estimators, indexer, preprocessing=None, **kwargs):
        """Replace a layer.

        Replace a layer in the stack with a new layer.
        See :func:`add` for full parameter documentation.

        Parameters
        -----------
        idx: int
            Position in stack of layer to replace. Indexing is 0-based.

        estimators: dict of lists or list of estimators, or :class:`Layer`.
            Pre-made layer or estimators to construct layer with.

        indexer : instance
            Indexer instance to use. See :mod:`mlens.base` for details.

        preprocessing: dict of lists or list, optional (default = None)
            preprocessing pipelines for given layer.

        **kwargs : optional
            keyword arguments to be passed onto the layer at instantiation.

        Returns
        ----------
        self : instance
            Modified instance
        """
        lyr = self._build_layer(estimators, indexer, preprocessing, **kwargs)
        self.layers[idx] = clone(lyr)
        setattr(self, lyr.name.replace('-', '_'), lyr)
        self._backend.replace(idx, lyr)
        return self

    def remove(self, idx):
        """Remove a layer from stack

        Remove a layer at a given position from stack.

        Parameters
        ----------
        idx: int
            Position in stack. Indexing is 0-based.

        Returns
        -------
        self: instance
            Modified instance
        """
        name = self.layers[idx].name
        self.layers.pop(idx)
        delattr(self, name.replace('-', '_'))
        self._backend.pop(idx)
        return self

    def fit(self, X, y=None, **kwargs):
        """Fit ensemble.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            input matrix to be used for prediction.

        y : array-like of shape = [n_samples, ] or None (default = None)
            output vector to trained estimators on.

        **kwargs : optional
            forwarded to the backend's ``fit``.

        Returns
        -------
        self : instance
            class instance with fitted estimators. If the backend's ``fit``
            returns something other than the backend itself (as it does for
            ``fit_transform``), that output is returned instead.
        """
        if not check_ensemble_build(self._backend):
            # No layers instantiated, but raise_on_exception is False
            return self

        if self.model_selection:
            # Record the training set so ``transform`` can recognise it later.
            self._id_train.fit(X)

        out = self._backend.fit(X, y, **kwargs)
        if out is not self._backend:
            # fit_transform
            return out
        else:
            return self

    def transform(self, X, y=None, **kwargs):
        """Transform with fitted ensemble.

        Replicates cross-validated prediction process from training.

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_features]
            input matrix to be used for prediction.

        y : array-like, shape[n_samples, ]
            targets. Needs to be passed as input in model selection mode as
            some indexers will reduce the size of the input array (X) and
            y must be adjusted accordingly.

        **kwargs : optional
            forwarded to the backend's ``transform`` (or to ``predict`` in
            model selection mode on data other than the training set).

        Returns
        -------
        pred : array-like or tuple, shape=[n_samples, n_features]
            predictions for provided input array. If in model selection mode,
            return a tuple ``(X_trans, y_trans)`` where ``y_trans`` is either
            ``y``, or a truncated version to match the samples in ``X_trans``.
            ``None`` is returned if no layers have been instantiated.

        Raises
        ------
        TypeError
            In model selection mode when ``y`` is ``None``.
        """
        if not check_ensemble_build(self._backend):
            # No layers instantiated, but raise_on_exception is False
            return

        if self.model_selection:
            if y is None:
                raise TypeError(
                    "In model selection mode, y is a required argument.")

            # Need to modify the transform method to account for blending
            # cutting X in size, so y needs to be cut too
            if not self._id_train.is_train(X):
                return self.predict(X, **kwargs), y

            # Asked to reproduce predictions during fit, here we need to
            # account for that in model selection mode,
            # blend ensemble will cut X in observation size so need to adjust y
            X = self._backend.transform(X, **kwargs)
            if X.shape[0] != y.shape[0]:
                # Keep the trailing rows of y so it lines up with X.
                r = y.shape[0] - X.shape[0]
                y = y[r:]
            return X, y

        return self._backend.transform(X, **kwargs)

    def fit_transform(self, X, y, **kwargs):
        r"""Fit ensemble and return cross-validated predictions.

        Calls ``fit`` with ``return_preds=True``, so predictions are produced
        during the fit itself.

        Parameters
        -----------
        X : array-like of shape = [n_samples, n_features]
            input matrix to be used for fitting and predicting.

        y : array-like of shape = [n_samples, ]
            training labels.

        **kwargs : optional
            optional arguments to processor. A ``return_preds`` entry is
            discarded since it is always set to ``True``.

        Returns
        -------
        pred : array-like
            output of ``fit`` called with ``return_preds=True``.
        """
        # Drop any caller-supplied value to avoid a duplicate keyword, then
        # forward the remaining arguments to ``fit``.
        kwargs.pop('return_preds', None)
        return self.fit(X, y, return_preds=True, **kwargs)

    def predict(self, X, **kwargs):
        """Predict with fitted ensemble.

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_features]
            input matrix to be used for prediction.

        **kwargs : optional
            forwarded to the backend's ``predict``.

        Returns
        -------
        pred : array-like or tuple, shape=[n_samples, n_features]
            predictions for provided input array. ``None`` is returned if no
            layers have been instantiated.
        """
        if not check_ensemble_build(self._backend):
            # No layers instantiated, but raise_on_exception is False
            return
        return self._backend.predict(X, **kwargs)

    def predict_proba(self, X, **kwargs):
        """Predict class probabilities with fitted ensemble.

        Compatibility method for Scikit-learn. Calls the regular ``predict``
        method with ``proba=True``; any ``proba`` entry in ``kwargs`` is
        discarded.

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_features]
            input matrix to be used for prediction.

        Returns
        -------
        pred : array-like or tuple, shape=[n_samples, n_features]
            predictions for provided input array.
        """
        kwargs.pop('proba', None)
        return self.predict(X, proba=True, **kwargs)

    def _build_layer(self, estimators, indexer, preprocessing, **kwargs):
        """Build a layer from estimators and preprocessing pipelines"""
        # --- check args ---
        # Arguments that cannot vary between layers
        check_kwargs(kwargs, ['backend', 'n_jobs'])

        # Pop layer kwargs and override Sequential args
        verbose = kwargs.pop('verbose', max(self._backend.verbose - 1, 0))
        dtype = kwargs.pop('dtype', self._backend.dtype)
        propagate = kwargs.pop('propagate_features', None)
        shuffle = kwargs.pop('shuffle', self.shuffle)
        random_state = kwargs.pop('random_state', self.random_state)
        rs = kwargs.pop('raise_on_exception', self.raise_on_exception)
        if random_state:
            # Draw an integer seed for this layer from the given state.
            random_state = check_random_state(random_state).randint(0, 10000)

        # Set learner kwargs
        kwargs['verbose'] = max(verbose - 1, 0)
        kwargs['scorer'] = kwargs.pop('scorer', self.scorer)

        # Check estimator and preprocessing formatting
        group = make_group(indexer, estimators, preprocessing, kwargs)

        # --- layer ---
        name = "layer-%i" % (len(self._backend.stack) + 1)  # Start count at 1
        lyr = Layer(
            name=name, dtype=dtype, shuffle=shuffle,
            random_state=random_state, verbose=verbose,
            raise_on_exception=rs, propagate_features=propagate)
        lyr.push(group)
        return lyr

    @property
    def model_selection(self):
        """Turn model selection mode"""
        return self._model_selection

    @model_selection.setter
    def model_selection(self, model_selection):
        """Turn model selection on or off"""
        self._model_selection = model_selection
        if self._model_selection:
            self._id_train = IdTrain(self.sample_size)
        else:
            self._id_train = None

    @property
    def data(self):
        """Fit data"""
        return self._backend.data

    @property
    def verbose(self):
        """Level of printed messages"""
        return self._verbose

    @verbose.setter
    def verbose(self, value):
        """Set level of printed messages"""
        self._verbose = value
        self._backend.verbose = value