Source code for mlens.index.base

"""ML-ENSEMBLE

:author: Sebastian Flennerhag
:copyright: 2017-2018
:licence: MIT


Base classes for partitioning training data.
"""
from __future__ import division

from abc import abstractmethod
import numpy as np

from ..externals.sklearn.base import BaseEstimator


def prune_train(start_below, stop_below, start_above, stop_above):
    """Drop an empty training index range on either side of a test fold.

    The training data for a fold is described by two ``(start, stop)``
    ranges: one below the test fold and one above it. When the test fold
    sits at the very beginning of the data the range below is empty
    (``start_below == stop_below``), and when it sits at the very end the
    range above is empty (``start_above == stop_above``). An empty range
    would lead to an empty array, so it is removed and a single training
    range is returned instead.

    Parameters
    ----------
    start_below : int
        index number starting below the test set. Should always be the same
        for all test sets.

    stop_below : int
        the index number at which the test set starts.

    start_above : int
        the index number at which the test set ends.

    stop_above : int
        the end of the data set (n). Should always be the same for all test
        sets.

    Returns
    -------
    tri : tuple
        a tuple holding one or two ``(start, stop)`` tuples. If both ranges
        are empty, the (empty) range above the test set is returned.
    """
    if start_below == stop_below:
        # Nothing below the test fold: only the range above remains.
        tri = ((start_above, stop_above),)

    elif start_above == stop_above:
        # Nothing above the test fold: only the range below remains.
        tri = ((start_below, stop_below),)

    else:
        tri = ((start_below, stop_below), (start_above, stop_above))
    return tri


def partition(n, p):
    """Get partition sizes for a given number of samples and partitions.

    This function will give an array containing the sizes of ``p``
    partitions given a total sample size of ``n``. If there is a remainder
    ``r = n % p`` from the split, the ``r`` first partitions will be
    incremented by 1.

    Parameters
    ----------
    n : int
        number of samples.

    p : int
        number of partitions.

    Returns
    -------
    sizes : array of shape [p]
        integer array with the size of each partition, summing to ``n``.

    Examples
    --------

    Return sample sizes of 2 partitions given a total of 4 samples

    >>> from mlens.index.base import partition
    >>> partition(4, 2)
    array([2, 2])

    Return sample sizes of 3 partitions given a total of 8 samples

    >>> from mlens.index.base import partition
    >>> partition(8, 3)
    array([3, 3, 2])
    """
    # ``np.int`` was only an alias for the builtin ``int`` and has been
    # removed from NumPy; the builtin gives the same dtype.
    sizes = (n // p) * np.ones(p, dtype=int)
    sizes[:n % p] += 1
    return sizes


def make_tuple(arr):
    """Make a list of index tuples from array

    Consecutive entries of ``arr`` that differ by at most 1 are collapsed
    into one ``(start, stop)`` tuple, where ``stop`` is the last entry of
    the run plus one (i.e. an exclusive upper bound).

    Parameters
    ----------
    arr : array
        sequence of index numbers supporting ``len``, indexing and slicing.

    Returns
    -------
    out : list
        list of ``(start, stop)`` tuples, one per run. Empty if ``arr`` is
        empty.

    Examples
    --------
    >>> from mlens.index.base import make_tuple
    >>> make_tuple([0, 1, 2, 5, 6, 8, 9, 10])
    [(0, 3), (5, 7), (8, 11)]
    """
    out = list()

    # No indices means no ranges; avoids an IndexError on ``arr[0]``.
    if len(arr) == 0:
        return out

    # t0 is the start of the current run, t1 its most recent member.
    t1 = t0 = arr[0]
    for i in arr[1:]:
        if i - t1 <= 1:
            # Still inside the current run: extend it.
            t1 = i
            continue

        # Gap found: close the current run and start a new one at i.
        out.append((t0, t1 + 1))
        t1 = t0 = i

    # Close the final run.
    out.append((t0, t1 + 1))
    return out


class BaseIndex(BaseEstimator):

    """Base Index class.

    Specification of indexer-wide methods and attributes that we can always
    expect to find in any indexer. Helps to provide a uniform interface
    during parallel estimation.
    """

    def __init__(self):
        self.folds = None
        self.partitions = 1
        self.n_samples = None
        self.n_test_samples = None

        # Checked by ``generate`` to decide whether ``fit`` must run first;
        # reset to False by ``set_params``.
        self.__fitted__ = False

    @abstractmethod
    def fit(self, X, y=None, job=None):
        """Method for storing array data.

        Parameters
        ----------
        X : array-like of shape [n_samples, optional]
            array to collect dimension data from.

        y : array-like, optional
            label data

        job : str, optional
            optional job type data

        Returns
        -------
        instance :
            indexer with stored sample size data.

        Notes
        -----
        Fitting an indexer stores nothing that points to the array
        or memmap ``X``. Only the ``shape`` attribute of ``X`` is called.
        """

    @abstractmethod
    def _gen_indices(self):
        """Method for constructing the index generator.

        This should be modified by each indexer class to build the desired
        index. Currently, the default is the standard K-Fold as this method
        is returned by Subset-based indexer when number of subsets is ``1``.

        Each test set is a single ``(start, stop)`` tuple; each train set is
        a tuple of one or two ``(start, stop)`` tuples covering the samples
        outside the test set.

        Returns
        -------
        iterable :
            a generator of ``train_index, test_index``.
        """
        n_samples = self.n_samples
        folds = self.folds

        if folds == 1:
            # Return the full index as both training and test set
            yield ((0, n_samples),), (0, n_samples)
        else:
            # Get the length of the test sets. Note that ``partition`` here
            # resolves to the module-level function, not the method below.
            tei_len = partition(n_samples, folds)

            last = 0
            for size in tei_len:

                # Test set
                tei_start, tei_stop = last, last + size
                tei = (tei_start, tei_stop)

                # Train set: everything below and above the test set
                tri_start_below, tri_stop_below = 0, tei_start
                tri_start_above, tri_stop_above = tei_stop, n_samples

                # Drop whichever side is empty (first and last fold)
                tri = prune_train(tri_start_below, tri_stop_below,
                                  tri_start_above, tri_stop_above)

                yield tri, tei
                last = tei_stop

    # pylint: disable=unused-argument, no-self-use
    def partition(self, X=None, as_array=False):
        """Partition generator method.

        Default behavior is to yield ``None``
        for fitting on full data. Overridden in
        :class:`SubsetIndex` and :class:`ClusteredSubsetIndex`
        to produce partition indexes.
        """
        yield None

    def generate(self, X=None, as_array=False):
        r"""Front-end generator method.

        Generator for training and test set indices based on the
        generator specification in ``_gen_indices``.

        Parameters
        ----------
        X : array-like, optional
            If instance has not been fitted, the training set ``X`` must be
            passed to the ``generate`` method, which will call ``fit`` before
            proceeding. If already fitted, ``X`` can be omitted.

        as_array : bool (default = False)
            whether to return train and test indices as a pair of tuple(s)
            or numpy arrays. If the returned tuples are singular they can be
            used on an array X with standard slicing syntax
            (``X[start:stop]``), but if a list of tuples is returned
            slicing ``X`` properly requires first building a list or array
            of index numbers from the list of tuples. This can be achieved
            either by setting ``as_array`` to ``True``, or running ::

                for train_tup, test_tup in indexer.generate():
                    train_idx = \
                        np.hstack([np.arange(t0, t1) for t0, t1 in train_tup])

            when slicing is required.

        Yields
        ------
        tri, tei :
            train and test index for each fold, as produced by
            ``_gen_indices`` and converted to arrays if ``as_array`` is set.

        Raises
        ------
        AttributeError
            if the instance is not fitted and no ``X`` is passed.
        """
        # Check that the instance have some array information to work with
        if not self.__fitted__:
            if X is None:
                raise AttributeError("No array provided to indexer. Either "
                                     "pass an array to the 'generate' method, "
                                     "or call the 'fit' method first or "
                                     "initiate the instance with an array X "
                                     "as argument.")
            # Need to call fit to continue
            self.fit(X)

        for tri, tei in self._gen_indices():

            if as_array:
                tri = self._build_range(tri)
                tei = self._build_range(tei)

            yield tri, tei

    @staticmethod
    def _build_range(idx):
        """Build an array of indexes from a list or tuple of index tuples.

        Given an index object containing tuples of ``(start, stop)`` indexes
        ``_build_range`` will return an array that concatenate all elements
        between each ``start`` and ``stop`` number.

        Examples
        --------
        Single slice (convex slicing)

        >>> from mlens.index.base import BaseIndex
        >>> BaseIndex._build_range((0, 6))
        array([0, 1, 2, 3, 4, 5])

        Several slices (non-convex slicing)

        >>> from mlens.index.base import BaseIndex
        >>> BaseIndex._build_range([(0, 2), (4, 6)])
        array([0, 1, 4, 5])
        """
        # A tuple as first element means ``idx`` is a collection of ranges;
        # otherwise ``idx`` itself is a single (start, stop) pair.
        if isinstance(idx[0], tuple):
            return np.hstack([np.arange(t0, t1) for t0, t1 in idx])
        return np.arange(idx[0], idx[1])

    def set_params(self, **params):
        """Set parameters and mark the indexer as not fitted.

        The fitted flag is cleared before delegating to the parent
        ``set_params``, so the next call to ``generate`` requires ``fit``
        to be run again (or an ``X`` to be passed).

        Parameters
        ----------
        **params :
            parameters forwarded to the parent class ``set_params``.

        Returns
        -------
        The return value of the parent class ``set_params``.
        """
        self.__fitted__ = False
        return super(BaseIndex, self).set_params(**params)


class FullIndex(BaseIndex):

    """Vacuous indexer to be used with final layers.

    FullIndex is a compatibility class to be used with meta layers. It stores
    the sample size to be predicted for use with the
    :class:`ParallelProcessing` job manager, and yields a ``None, None``
    index when `generate` is called.

    Parameters
    ----------
    X : array-like, optional
        if given, the instance is fitted on ``X`` during construction.
    """

    def __init__(self, X=None):
        super(FullIndex, self).__init__()
        if X is not None:
            self.fit(X)

    def fit(self, X, y=None, job=None):
        """Store dimensionality data about X.

        Parameters
        ----------
        X : array-like of shape [n_samples, optional]
            array to collect dimension data from. Only ``X.shape[0]`` is
            read.

        y : array-like, optional
            label data. Not used.

        job : str, optional
            optional job type data. Not used.

        Returns
        -------
        instance :
            the fitted indexer.
        """
        # Every sample is both seen and predicted: no train/test split.
        self.n_samples = X.shape[0]
        self.n_test_samples = X.shape[0]
        self.__fitted__ = True
        return self

    def _gen_indices(self):
        """Vacuous generator to ensure training data is not sliced."""
        yield None, None