# Source code for mlens.index.fold

"""ML-ENSEMBLE

:author: Sebastian Flennerhag
:copyright: 2017-2018
:licence: MIT

Stack indexing.
"""
from __future__ import division

from ._checks import check_full_index
from .base import BaseIndex


class FoldIndex(BaseIndex):

    """Indexer that generates the full size of ``X``.

    K-Fold iterator that generates fold index tuples.

    FoldIndex creates a generator that returns a tuple of start and stop
    positions to be used for numpy array slicing [start:stop]. Note that
    slicing works well for the test set, but for the training set it is
    recommended to concatenate the index for training data that comes before
    the current test set with the index for the training data that comes after.
    This can easily be achieved with::

        for train_tup, test_tup in self.generate():
            train_slice = numpy.hstack([numpy.arange(t0, t1) for t0, t1 in
                                      train_tup])

            xtrain, xtest = X[train_slice], X[test_tup[0]:test_tup[1]]

    Warnings
    --------
    Simple slicing (i.e. ``X[start:stop]``) generally does not work for the
    train set, which often requires concatenating the train index range
    below the current test set, and the train index range above the current
    test set. To build a training index, use ::

        hstack([np.arange(t0, t1) for t0, t1 in train_index_tuples])

    See Also
    --------
    :class:`BlendIndex`, :class:`SubsetIndex`

    Parameters
    ----------
    folds : int (default = 2)
        Number of splits to create in each partition. ``folds`` can
        not be 1 if ``n_partition > 1``. Note that if ``folds = 1``,
        both the train and test set will index the full data.

    X : array-like of shape [n_samples,] , optional
        the training set to partition. The training label array is also
        accepted, as only the first dimension is used. If ``X`` is not
        passed at instantiation, the ``fit`` method must be called before
        ``generate``, or ``X`` must be passed as an argument of
        ``generate``.

    raise_on_exception : bool (default = True)
        whether to warn on suspicious slices or raise an error.

    Examples
    --------

    Creating arrays of folds and checking overlap

    >>> import numpy as np
    >>> from mlens.index import FoldIndex
    >>> X = np.arange(10)
    >>> print("Data set: %r" % X)
    >>> print()
    >>>
    >>> idx = FoldIndex(4, X)
    >>>
    >>> for train, test in idx.generate(as_array=True):
    ...     print('TRAIN IDX: %32r | TEST IDX: %16r' % (train, test))
    >>>
    >>> print()
    >>>
    >>> for train, test in idx.generate(as_array=True):
    ...     print('TRAIN SET: %32r | TEST SET: %16r' % (X[train], X[test]))
    >>>
    >>> for train_idx, test_idx in idx.generate(as_array=True):
    ...     assert not any([i in X[test_idx] for i in X[train_idx]])
    >>>
    >>> print()
    >>>
    >>> print("No overlap between train set and test set.")
    Data set: array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    TRAIN IDX:     array([3, 4, 5, 6, 7, 8, 9]) | TEST IDX: array([0, 1, 2])
    TRAIN IDX:     array([0, 1, 2, 6, 7, 8, 9]) | TEST IDX: array([3, 4, 5])
    TRAIN IDX:  array([0, 1, 2, 3, 4, 5, 8, 9]) | TEST IDX:    array([6, 7])
    TRAIN IDX:  array([0, 1, 2, 3, 4, 5, 6, 7]) | TEST IDX:    array([8, 9])
    TRAIN SET:     array([3, 4, 5, 6, 7, 8, 9]) | TEST SET: array([0, 1, 2])
    TRAIN SET:     array([0, 1, 2, 6, 7, 8, 9]) | TEST SET: array([3, 4, 5])
    TRAIN SET:  array([0, 1, 2, 3, 4, 5, 8, 9]) | TEST SET:    array([6, 7])
    TRAIN SET:  array([0, 1, 2, 3, 4, 5, 6, 7]) | TEST SET:    array([8, 9])
    No overlap between train set and test set.

    Passing ``folds = 1`` without raising exception:

    >>> import numpy as np
    >>> from mlens.index import FoldIndex
    >>> X = np.arange(3)
    >>> print("Data set: %r" % X)
    >>> print()
    >>>
    >>> idx = FoldIndex(1, X, raise_on_exception=False)
    >>>
    >>> for train, test in idx.generate(as_array=True):
    ...     print('TRAIN IDX: %10r | TEST IDX: %10r' % (train, test))
    /../mlens/base/indexer.py:167: UserWarning: 'folds' is 1, will return
    full index as both training set and test set.
    warnings.warn("'folds' is 1, will return full index as "
    Data set: array([0, 1, 2])
    TRAIN IDX: array([0, 1, 2]) | TEST IDX: array([0, 1, 2])
    """

    def __init__(self, folds=2, X=None, raise_on_exception=True):
        """Store the fold configuration and fit on ``X`` if it is given."""
        super(FoldIndex, self).__init__()
        self.folds = folds
        self.raise_on_exception = raise_on_exception

        # Fitting is optional at construction; without ``X`` the caller
        # fits later (see the ``X`` parameter in the class docstring).
        if X is not None:
            self.fit(X)

    def fit(self, X, y=None, job=None):
        """Method for storing array data.

        Only the size of the first dimension of ``X`` is read and stored.

        Parameters
        ----------
        X : array-like of shape [n_samples,]
            array to collect dimension data from. Must expose ``shape``.
        y : None
            for compatibility
        job : None
            for compatibility

        Returns
        -------
        instance :
            indexer with stored sample size data.
        """
        n = X.shape[0]

        # Validate the sample size against ``folds`` before storing anything.
        # NOTE(review): presumably warns or raises depending on
        # ``raise_on_exception`` -- confirm against ``check_full_index``.
        check_full_index(n, self.folds, self.raise_on_exception)

        # The test-sample count is set equal to the full sample count.
        self.n_test_samples = self.n_samples = n
        self.__fitted__ = True
        return self

    def _gen_indices(self):
        """Generate K-Fold iterator.

        Delegates to the parent class implementation.
        """
        return super(FoldIndex, self)._gen_indices()