# Source code for mlens.index.blend

"""ML-ENSEMBLE

:author: Sebastian Flennerhag
:copyright: 2017-2018
:licence: MIT

Blend indexing.
"""
from __future__ import division

from numbers import Integral
import numpy as np

from ._checks import check_partial_index
from .base import BaseIndex


class BlendIndex(BaseIndex):
    """Indexer that generates two non-overlapping subsets of ``X``.

    Iterator that generates one training fold and one test fold that are
    non-overlapping and that may or may not partition all of X depending on the
    user's specification.

    BlendIndex creates a singleton generator (has one iteration) that
    yields two tuples of ``(start, stop)`` integers that can be used for
    numpy array slicing (i.e. ``X[start:stop]``). If a full array index
    is desired this can easily be achieved with::

        for train_tup, test_tup in self.generate():
            train_idx = numpy.arange(*train_tup)
            test_idx = numpy.arange(*test_tup)

    Parameters
    ----------
    test_size : int or float (default = 0.5)
        Size of the test set. If ``float``, assumed to be proportion of full
        data set.

    train_size : int or float, optional
        Size of the train set. If not specified (i.e. ``train_size = None``),
        train_size is equal to ``n_samples - test_size``. If ``float``, assumed
        to be a proportion of full data set. If ``train_size`` + ``test_size``
        amount to less than the observations in the full data set, a subset
        of specified size will be used.

    X : array-like of shape [n_samples,] , optional
        the training set to partition. The training label array is also
        accepted, as only the first dimension is used. If ``X`` is not
        passed at instantiation, the ``fit`` method must be called before
        ``generate``, or ``X`` must be passed as an argument of
        ``generate``.

    raise_on_exception : bool (default = True)
        whether to warn on suspicious slices or raise an error.
        NOTE(review): the flag is stored on the instance but is not read by
        any method defined in this class -- confirm where it is consumed.

    See Also
    --------
    :class:`FoldIndex`, :class:`SubsetIndex`

    Examples
    --------

    Selecting an absolute test size, with train size as the remainder

    >>> import numpy as np
    >>> from mlens.index import BlendIndex
    >>> X = np.arange(8)
    >>> idx = BlendIndex(3)
    >>> print('Test size: 3')
    Test size: 3
    >>> for tri, tei in idx.generate(X):
    ...     print('TEST (idx | array): (%i, %i) | %r' % (tei[0], tei[1],
    ...                                                  X[tei[0]:tei[1]]))
    ...     print('TRAIN (idx | array): (%i, %i) | %r' % (tri[0], tri[1],
    ...                                                   X[tri[0]:tri[1]]))
    TEST (idx | array): (5, 8) | array([5, 6, 7])
    TRAIN (idx | array): (0, 5) | array([0, 1, 2, 3, 4])

    Selecting a test and train size less than the total

    >>> import numpy as np
    >>> from mlens.index import BlendIndex
    >>> X = np.arange(8)
    >>> idx = BlendIndex(3, 4, X)
    >>> print('Test size: 3')
    Test size: 3
    >>> print('Train size: 4')
    Train size: 4
    >>> for tri, tei in idx.generate(X):
    ...     print('TEST (idx | array): (%i, %i) | %r' % (tei[0], tei[1],
    ...                                                  X[tei[0]:tei[1]]))
    ...     print('TRAIN (idx | array): (%i, %i) | %r' % (tri[0], tri[1],
    ...                                                   X[tri[0]:tri[1]]))
    TEST (idx | array): (4, 7) | array([4, 5, 6])
    TRAIN (idx | array): (0, 4) | array([0, 1, 2, 3])

    Selecting a percentage of observations as test and train set

    >>> import numpy as np
    >>> from mlens.index import BlendIndex
    >>> X = np.arange(8)
    >>> idx = BlendIndex(0.25, 0.45, X)
    >>> print('Test size: 25% * 8 = 2')
    Test size: 25% * 8 = 2
    >>> print('Train size: 45% * 8 < 4 -> 3')
    Train size: 45% * 8 < 4 -> 3
    >>> for tri, tei in idx.generate(X):
    ...     print('TEST (idx | array): (%i, %i) | %r' % (tei[0], tei[1],
    ...                                                  X[tei[0]:tei[1]]))
    ...     print('TRAIN (idx | array): (%i, %i) | %r' % (tri[0], tri[1],
    ...                                                   X[tri[0]:tri[1]]))
    TEST (idx | array): (3, 5) | array([3, 4])
    TRAIN (idx | array): (0, 3) | array([0, 1, 2])

    Rebasing the test set to be 0-indexed

    >>> import numpy as np
    >>> from mlens.index import BlendIndex
    >>> X = np.arange(8)
    >>> idx = BlendIndex(3)
    >>> print('Test size: 3')
    Test size: 3
    >>> for tri, tei in idx.generate(X):
    ...     # the test fold starts where the train fold stops
    ...     start, stop = tei[0] - tri[1], tei[1] - tri[1]
    ...     print('TEST tuple: (%i, %i) | array: %r' % (start, stop,
    ...                                                 np.arange(start,
    ...                                                           stop)))
    TEST tuple: (0, 3) | array: array([0, 1, 2])
    """

    def __init__(self,
                 test_size=0.5,
                 train_size=None,
                 X=None,
                 raise_on_exception=True):
        super(BlendIndex, self).__init__()
        # Absolute fold sizes; resolved from the size specs in ``fit``.
        self.n_train = None
        self.n_test = None
        self.test_size = test_size
        self.train_size = train_size
        self.raise_on_exception = raise_on_exception

        # Fit eagerly when the data is available at construction time.
        if X is not None:
            self.fit(X)

    def fit(self, X, y=None, job=None):
        """Store the sample size of ``X`` and resolve the fold sizes.

        Parameters
        ----------
        X : array-like of shape [n_samples, ...]
            array to collect dimension data from. Only ``X.shape[0]`` is
            read.
        y : None
            for compatibility
        job : None
            for compatibility

        Returns
        -------
        instance :
            the indexer itself, with ``n_samples``, ``n_test``, ``n_train``
            and ``n_test_samples`` set.
        """
        self.n_samples = X.shape[0]

        # Get number of test samples: integers are absolute counts, anything
        # else is treated as a proportion of n_samples (rounded down).
        if isinstance(self.test_size, Integral):
            self.n_test = self.test_size
        else:
            self.n_test = int(np.floor(self.test_size * self.n_samples))

        # Get number of train samples
        if self.train_size is None:
            # Partition X - we coerce a positive value here:
            # if n_test is oversampled will get at final check
            self.n_train = int(abs(self.n_samples - self.n_test))
        elif isinstance(self.train_size, Integral):
            self.n_train = self.train_size
        else:
            self.n_train = int(np.floor(self.train_size * self.n_samples))

        # Final consistency check of the requested and resolved sizes.
        check_partial_index(self.n_samples, self.test_size, self.train_size,
                            self.n_test, self.n_train)

        self.n_test_samples = self.n_test

        self.__fitted__ = True
        return self

    def _gen_indices(self):
        """Return train and test set index generator.

        Yields exactly once: a ``(start, stop)`` tuple for the train fold
        followed by a ``(start, stop)`` tuple for the test fold.
        """
        # Blended train set is from 0 to n, with test set from n to N
        # There is no iteration.
        yield (0, self.n_train), (self.n_train, self.n_train + self.n_test)