# Source code for mlens.preprocessing.preprocess

"""ML-ENSEMBLE

:author: Sebastian Flennerhag
:copyright: 2017-2018
:licence: MIT
"""

from __future__ import division, print_function

from ..externals.sklearn.base import BaseEstimator, TransformerMixin


class Subset(BaseEstimator, TransformerMixin):

    """Select a subset of features.

    The ``Subset`` class acts as a transformer that reduces the feature set
    to a subset specified by the user.

    Parameters
    ----------
    subset : list, optional (default = None)
        list of column indexes to select subset with. Indexes can
        either be of type ``str`` if data accepts slicing on a list of
        strings, otherwise the list should be of type ``int``. If ``None``,
        ``transform`` passes the input through without selecting columns.

    Attributes
    ----------
    is_df_ : bool
        set by ``fit``. ``True`` if the class name of the ``X`` passed to
        ``fit`` is ``DataFrame`` or ``Series``.

    use_loc_ : bool
        set by ``fit``. ``True`` if ``subset`` is not ``None`` and at least
        one of its entries is a ``str``; label-based ``.loc`` indexing is
        then used on data frames instead of positional ``.iloc`` indexing.
    """

    def __init__(self, subset=None):
        self.subset = subset

    def fit(self, X, y=None):
        """Learn what format the data is stored in.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The data whose container type will be inferred.

        y : object, optional (default = None)
            ignored. Pass-through for Scikit-learn pipeline compatibility.

        Returns
        -------
        self : Subset
            the fitted instance.
        """
        # The check is on the class name, so no pandas import is required.
        self.is_df_ = X.__class__.__name__ in ('DataFrame', 'Series')

        # Always (re)set the flag so that a refit with ``subset=None`` does
        # not leave a stale value from an earlier fit behind.
        self.use_loc_ = (
            self.subset is not None and
            any(isinstance(x, str) for x in self.subset)
        )

        return self

    def transform(self, X, y=None, copy=False):
        """Return specified subset of X.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The data to select columns from. Indexed according to the
            container type inferred during ``fit``.

        y : object, optional (default = None)
            ignored. Pass-through for Scikit-learn pipeline compatibility.

        copy : bool (default = False)
            whether to copy X before transforming.

        Returns
        -------
        Xt : array-like
            the columns of ``X`` listed in ``subset``, or all of ``X``
            (copied if ``copy=True``) when ``subset`` is ``None``.
        """
        # Honour ``copy`` on every path, including the pass-through case.
        Xt = X.copy() if copy else X

        if self.subset is None:
            return Xt

        if self.is_df_:
            # Label-based selection for string indexes, positional otherwise.
            indexer = Xt.loc if self.use_loc_ else Xt.iloc
            return indexer[:, self.subset]

        return Xt[:, self.subset]


class Shift(BaseEstimator, TransformerMixin):

    r"""Lag operator.

    Shift an input array :math:`X` with :math:`s` steps, i.e. for some time
    series :math:`\mathbf{X} = (X_t, X_{t-1}, ..., X_{0})`,

    .. math::

        L^{s} \mathbf{X} = (X_{t-s}, X_{t-1-s}, ..., X_{s - s})

    In practice, ``transform`` drops the last ``s`` entries along the first
    axis of the input, so that the output aligns with ``X[s:]``.

    Parameters
    ----------

    s : int
        number of lags to generate. With ``s = 0`` the input is returned
        in full.


    Examples
    --------
    >>> import numpy as np
    >>> from mlens.preprocessing import Shift
    >>> X = np.arange(10)
    >>> L = Shift(2)
    >>> Z = L.fit_transform(X)
    >>> print("X : {}".format(X[2:]))
    X : [2 3 4 5 6 7 8 9]
    >>> print("Z : {}".format(Z))
    Z : [0 1 2 3 4 5 6 7]
    """

    def __init__(self, s):

        self.s = s

    def fit(self, X, y=None):
        """Pass through for compatibility.

        Parameters
        ----------
        X : array-like
            ignored.

        y : object, optional (default = None)
            ignored.

        Returns
        -------
        self : Shift
            the instance itself.
        """
        return self

    def transform(self, X, y=None):
        """Return lagged dataset.

        Parameters
        ----------
        X : array-like
            data to lag. Must support slicing along its first axis.

        y : object, optional (default = None)
            ignored. Pass-through for Scikit-learn pipeline compatibility.

        Returns
        -------
        Xt : array-like
            ``X`` without its last ``s`` entries along the first axis.
        """
        if self.s == 0:
            # ``X[:-0]`` is ``X[:0]``, i.e. empty; a zero lag must keep
            # every entry, so slice the full range explicitly.
            return X[:]

        return X[:-self.s]