Source code for mlens.index.base

"""ML-ENSEMBLE

:author: Sebastian Flennerhag
:copyright: 2017-2018
:licence: MIT


Base classes for partitioning training data.
"""
from __future__ import division

from abc import abstractmethod
import numpy as np

from ..externals.sklearn.base import BaseEstimator


def prune_train(start_below, stop_below, start_above, stop_above):
    """Drop an empty training range on either side of a test fold.

    The training data for a fold consists of the samples below the test
    range and the samples above it. When the test fold sits at the very
    start of the data the range below is ``(0, 0)``, and when it sits at
    the very end the range above is ``(n, n)``. Such a range selects
    nothing, so it is removed and a single training range is returned.

    Parameters
    ----------
    start_below : int
        index at which the range below the test set starts. Should always
        be the same for all test sets.

    stop_below : int
        index at which the test set starts.

    start_above : int
        index at which the test set ends.

    stop_above : int
        the end of the data set (n). Should always be the same for all
        test sets.

    Returns
    -------
    tri : tuple
        tuple holding one or two ``(start, stop)`` tuples.
    """
    below = (start_below, stop_below)
    above = (start_above, stop_above)

    # The emptiness of the lower range is checked first: if both ranges
    # are empty, the (empty) upper range is the one returned.
    if start_below == stop_below:
        return (above,)
    if start_above == stop_above:
        return (below,)
    return (below, above)
def partition(n, p):
    """Get partition sizes for a given number of samples and partitions.

    This method will give an array containing the sizes of ``p`` partitions
    given a total sample size of ``n``. If there is a remainder from the
    split, the ``n % p`` first partitions will be incremented by 1.

    Parameters
    ----------
    n : int
        number of samples.

    p : int
        number of partitions.

    Returns
    -------
    sizes : array
        integer array of length ``p`` whose entries sum to ``n``.

    Examples
    --------
    Return sample sizes of 2 partitions given a total of 4 samples

    >>> from mlens.index.base import partition
    >>> partition(4, 2)
    array([2, 2])

    Return sample sizes of 3 partitions given a total of 8 samples

    >>> from mlens.index.base import partition
    >>> partition(8, 3)
    array([3, 3, 2])
    """
    # ``np.int`` was only an alias for the builtin ``int`` and no longer
    # exists in recent NumPy releases; the builtin gives the same dtype.
    sizes = (n // p) * np.ones(p, dtype=int)

    # Spread the remainder over the leading partitions, one sample each.
    sizes[:n % p] += 1
    return sizes
def make_tuple(arr):
    """Make a list of index tuples from array

    Consecutive entries of ``arr`` are collapsed into half-open
    ``(start, stop)`` tuples, so that each tuple can be used as a slice.

    Parameters
    ----------
    arr : array
        one-dimensional array (or sequence) of index numbers.

    Returns
    -------
    out : list
        list of ``(start, stop)`` tuples. Empty if ``arr`` is empty.

    Examples
    --------
    >>> import numpy as np
    >>> from mlens.index.base import make_tuple
    >>> make_tuple(np.array([0, 1, 2, 5, 6, 8, 9, 10]))
    [(0, 3), (5, 7), (8, 11)]
    """
    out = list()

    # Nothing to group; avoids indexing into an empty array below.
    if len(arr) == 0:
        return out

    start = stop = arr[0]
    for idx in arr[1:]:
        if idx - stop <= 1:
            # Still inside the current run: extend it.
            stop = idx
            continue

        # Gap found: close the current run (stop is exclusive) and
        # open a new one at the current index.
        out.append((start, stop + 1))
        start = stop = idx

    # Close the final run.
    out.append((start, stop + 1))
    return out
class BaseIndex(BaseEstimator):
    """Base Index class.

    Specification of indexer-wide methods and attributes that we can always
    expect to find in any indexer. Helps to provide a uniform interface
    during parallel estimation.
    """

    def __init__(self):
        self.folds = None
        self.partitions = 1
        self.n_samples = None
        self.n_test_samples = None
        self.__fitted__ = False

    @abstractmethod
    def fit(self, X, y=None, job=None):
        """Method for storing array data.

        Parameters
        ----------
        X : array-like of shape [n_samples, optional]
            array to collect dimension data from.

        y : array-like, optional
            label data

        job : str, optional
            optional job type data

        Returns
        -------
        instance :
            indexer with stored sample size data.

        Notes
        -----
        Fitting an indexer stores nothing that points to the array or
        memmap ``X``. Only the ``shape`` attribute of ``X`` is called.
        """

    @abstractmethod
    def _gen_indices(self):
        """Method for constructing the index generator.

        This should be modified by each indexer class to build the desired
        index. Currently, the default is the standard K-Fold as this method
        is returned by Subset-based indexer when number of subsets is ``1``.

        Returns
        -------
        iterable :
            a generator of ``train_index, test_index``. ``train_index`` is
            a tuple of ``(start, stop)`` tuples and ``test_index`` is a
            single ``(start, stop)`` tuple.
        """
        n_samples = self.n_samples
        folds = self.folds

        if folds == 1:
            # Return the full index as both training and test set
            yield ((0, n_samples),), (0, n_samples)
        else:
            # Get the length of the test sets
            tei_len = partition(n_samples, folds)

            last = 0
            for size in tei_len:
                # Test set
                tei_start, tei_stop = last, last + size
                tei = (tei_start, tei_stop)

                # Train set: everything below and above the test set,
                # with empty ranges pruned away
                tri_start_below, tri_stop_below = 0, tei_start
                tri_start_above, tri_stop_above = tei_stop, n_samples
                tri = prune_train(tri_start_below, tri_stop_below,
                                  tri_start_above, tri_stop_above)

                yield tri, tei
                last = tei_stop

    # pylint: disable=unused-argument, no-self-use
    def partition(self, X=None, as_array=False):
        """Partition generator method.

        Default behavior is to yield ``None`` for fitting on full data.
        Overridden in :class:`SubsetIndex` and
        :class:`ClusteredSubsetIndex` to produce partition indexes.
        """
        yield None

    def generate(self, X=None, as_array=False):
        r"""Front-end generator method.

        Generator for training and test set indices based on the
        generator specification in ``_gen_indices``.

        Parameters
        ----------
        X : array-like, optional
            If instance has not been fitted, the training set ``X`` must be
            passed to the ``generate`` method, which will call ``fit``
            before proceeding. If already fitted, ``X`` can be omitted.

        as_array : bool (default = False)
            whether to return train and test indices as a pair of tuple(s)
            or numpy arrays. If the returned tuples are singular they can
            be used on an array X with standard slicing syntax
            (``X[start:stop]``), but if a list of tuples is returned
            slicing ``X`` properly requires first building a list or array
            of index numbers from the list of tuples. This can be achieved
            either by setting ``as_array`` to ``True``, or running ::

                for train_tup, test_tup in indexer.generate():
                    train_idx = \
                        np.hstack([np.arange(t0, t1) for t0, t1 in train_tup])

            when slicing is required.

        Yields
        ------
        tri, tei :
            training and test index for each fold, in the form produced by
            ``_gen_indices`` or, if ``as_array`` is ``True``, converted by
            ``_build_range``.

        Raises
        ------
        AttributeError
            if the instance is not fitted and no ``X`` is passed.
        """
        # Check that the instance has some array information to work with
        if not self.__fitted__:
            if X is None:
                raise AttributeError("No array provided to indexer. Either "
                                     "pass an array to the 'generate' method, "
                                     "or call the 'fit' method first or "
                                     "initiate the instance with an array X "
                                     "as argument.")
            # Need to call fit to continue
            self.fit(X)

        for tri, tei in self._gen_indices():
            if as_array:
                tri = self._build_range(tri)
                tei = self._build_range(tei)
            yield tri, tei

    @staticmethod
    def _build_range(idx):
        """Build an array of indexes from a list or tuple of index tuples.

        Given an index object containing tuples of ``(start, stop)`` indexes
        ``_build_range`` will return an array that concatenate all elements
        between each ``start`` and ``stop`` number. A ``None`` index is
        returned unchanged.

        Examples
        --------
        Single slice (convex slicing)

        >>> from mlens.index.base import BaseIndex
        >>> BaseIndex._build_range((0, 6))
        array([0, 1, 2, 3, 4, 5])

        Several slices (non-convex slicing)

        >>> from mlens.index.base import BaseIndex
        >>> BaseIndex._build_range([(0, 2), (4, 6)])
        array([0, 1, 4, 5])
        """
        # Indexers that do not slice the data (FullIndex) yield ``None``;
        # there is no range to build, so pass it through instead of
        # failing on ``idx[0]``.
        if idx is None:
            return None

        if isinstance(idx[0], tuple):
            return np.hstack([np.arange(t0, t1) for t0, t1 in idx])
        return np.arange(idx[0], idx[1])

    def set_params(self, **params):
        """Set parameters and mark the indexer as not fitted.

        The fitted flag is reset before delegating to the parent
        ``set_params``, so ``generate`` will require a new ``fit``.
        """
        self.__fitted__ = False
        return super(BaseIndex, self).set_params(**params)
class FullIndex(BaseIndex):
    """Vacuous indexer to be used with final layers.

    FullIndex is a compatibility class to be used with meta layers. It
    stores the sample size to be predicted for use with the
    :class:`ParallelProcessing` job manager, and yields a ``None, None``
    index when `generate` is called.

    Parameters
    ----------
    X : array-like, optional
        if given, the indexer is fitted on ``X`` at construction.
    """

    def __init__(self, X=None):
        super(FullIndex, self).__init__()

        if X is not None:
            self.fit(X)

    def fit(self, X, y=None, job=None):
        """Store dimensionality data about X.

        Parameters
        ----------
        X : array-like
            array to read the number of samples from. Only ``X.shape[0]``
            is used.

        y : array-like, optional
            not used.

        job : str, optional
            not used.

        Returns
        -------
        instance :
            the fitted indexer.
        """
        n_samples = X.shape[0]

        # No slicing takes place, so every sample is also a test sample.
        self.n_samples = n_samples
        self.n_test_samples = n_samples
        self.__fitted__ = True
        return self

    def _gen_indices(self):
        """Vacuous generator to ensure training data is not sliced."""
        yield None, None