Source code for mlens.parallel.backend

"""ML-Ensemble

:author: Sebastian Flennerhag
:copyright: 2017-2018
:license: MIT

Parallel processing backend classes. Manages memory-mapping of data, estimation
caching and job scheduling.
"""
# pylint: disable=too-few-public-methods
# pylint: disable=too-many-arguments
# pylint: disable=too-many-instance-attributes
# pylint: disable=useless-super-delegation

from __future__ import with_statement, division

import gc
import os
import shutil
import subprocess
import tempfile
import warnings

from abc import ABCMeta, abstractmethod

import numpy as np
from scipy.sparse import issparse, hstack

from .. import config
from ..externals.joblib import Parallel, dump, load
from ..utils import check_initialized
from ..utils.exceptions import (ParallelProcessingError,
                                ParallelProcessingWarning)
from ..externals.sklearn.validation import check_random_state


###############################################################################
def _dtype(a, b=None):
    """Utility for getting a dtype"""
    return getattr(a, 'dtype', getattr(b, 'dtype', None))


[docs]def dump_array(array, name, path): """Dump array for memmapping. Parameters ---------- array : array-like Array to be persisted name : str Name of file path : str Path to cache. Returns ------- f: array-like memory-mapped array. """ # First check if the array is on file if isinstance(array, str): # Load file from disk. Need to dump if not memmaped already if not array.split('.')[-1] in ['mmap', 'npy', 'npz']: # Try loading the file assuming a csv-like format array = _load(array) if isinstance(array, str): # If arr remains a string, it's pointing to an mmap file f = array else: # Dump ndarray on disk f = os.path.join(path, '%s.mmap' % name) if os.path.exists(f): os.unlink(f) dump(array, f) return f
def _load(arr): """Load array from file using default settings.""" if arr.split('.')[-1] in ['npy', 'npz']: return np.load(arr) else: try: return np.genfromtxt(arr) except Exception as e: raise IOError("Could not load X from %s, does not " "appear to be a valid ndarray. " "Details:\n%r" % (arr, e)) def _load_mmap(f): """Load a mmap presumably dumped by joblib, otherwise try numpy.""" try: return load(f, mmap_mode='r') except (IndexError, KeyError): # Joblib's 'load' func fails on npy and npz: use numpy.load return np.load(f, mmap_mode='r') def _set_path(job, path, threading): """Build path as a cache or list depending on whether using threading""" if path: if not isinstance(path, str) and not threading: raise ValueError("Path must be a str with backend=multiprocessing." " Got %r" % path.__class__) elif not isinstance(path, (str, dict)): raise ValueError("Invalid path format. Should be one of " "str, dict. Got %r" % path.__class__) job.dir = path return job if threading: # No need to pickle job.dir = dict() return job # Else, need a directory path = config.get_tmpdir() try: job.tmp = tempfile.TemporaryDirectory( prefix=config.get_prefix(), dir=path) job.dir = job.tmp.name except AttributeError: # Fails on python 2 job.dir = tempfile.mkdtemp(prefix=config.get_prefix(), dir=path) return job ###############################################################################
[docs]class Job(object): """Container class for holding and managing job data. :class:`Job` is intended as a on-the-fly job handler that keeps track of input data, predictions, and manages estimation caches. .. versionchanged:: 0.2.0 See Also -------- :class:`ParallelProcessing`, :class:`ParallelEvaluation` Parameters ---------- job : str Type of job to run. One of ``'fit'``, ``'transform'``, ``'predict'``. stack : bool Whether to stack outputs when calls to :func:`~mlens.parallel.backend.Job.update` are made. This will make the ``predict_out`` array become ``predict_in``. split : bool Whether to create a new sub-cache when the :attr:`~mlens.parallel.backend.Job.args` property is called. dir : str, dict, optional estimation cache. Pass dictionary for use with multiprocessing or a string pointing to the disk directory to create the cache in tmp : obj, optional a Tempfile object for temporary directories targets : array-like of shape [n_in_samples,], optional input targets predict_in : array-like of shape [n_in_samples, n_in_features], optional input data predict_out : array_like of shape [n_out_samples, n_out_features], optional prediction output array """ __slots__ = ['targets', 'predict_in', 'predict_out', 'dir', 'job', 'tmp', '_n_dir', 'kwargs', 'stack', 'split'] def __init__(self, job, stack, split, dir=None, tmp=None, predict_in=None, targets=None, predict_out=None): self.job = job self.stack = stack self.split = split self.targets = targets self.predict_in = predict_in self.predict_out = predict_out self.tmp = tmp self.dir = dir self._n_dir = 0
[docs] def clear(self): """Clear output data for new task""" self.predict_out = None
[docs] def update(self): """Updated output array and shift to input if stacked. If stacking is en force, the output array will replace the input array, and used as input for subsequent jobs. Sparse matrices are force-converted to ``csr`` format. """ if self.predict_out is None: return if (issparse(self.predict_out) and not self.predict_out.__class__.__name__.startswith('csr')): # Enforce csr on spare matrices self.predict_out = self.predict_out.tocsr() if self.stack: self.predict_in = self.predict_out self.rebase()
[docs] def rebase(self): """Rebase output labels to input indexing. Some indexers that only generate predictions for subsets of the training data require the targets to be rebased. Since indexers operate in a strictly sequential manner, rebase simply drop the first ``n`` observations in the target vector until number of observations remaining coincide. .. seealso:: :class:`~mlens.index.blend.BlendIndex` """ if self.targets is not None and ( self.targets.shape[0] > self.predict_in.shape[0]): # This is legal if X is a prediction matrix generated by predicting # only a subset of the original training set. # Since indexing is strictly monotonic, we can simply discard # the first observations in y to get the corresponding labels. rebase = self.targets.shape[0] - self.predict_in.shape[0] self.targets = self.targets[rebase:]
[docs] def shuffle(self, random_state): """Shuffle inputs. Permutes the indexing of ``predict_in`` and ``y`` arrays. Parameters ---------- random_state : int, obj Random seed number or generator to use. """ r = check_random_state(random_state) idx = r.permutation(self.targets.shape[0]) self.predict_in = self.predict_in[idx] self.targets = self.targets[idx]
[docs] def subdir(self): """Return a cache subdirectory If ``split`` is en force, a new sub-cache will be created in the main cache. Otherwise the same sub-cache as used in previous call will be returned. .. versionadded:: 0.2.0 Returns ------- cache : str, list Either a string pointing to a cache persisted to disk, or an in-memory cache in the form of a list. """ path_name = "task_%s" % str(self._n_dir) if self.split: # Increment sub-cache counter self._n_dir += 1 if isinstance(self.dir, str): path = os.path.join(self.dir, path_name) cache_exists = os.path.exists(path) # Persist cache to disk if cache_exists and self.split: raise ParallelProcessingError( "Subdirectory %s exist. Clear cache." % path_name) elif not cache_exists: os.mkdir(path) return path # Keep in memory if path_name in self.dir and self.split: raise ParallelProcessingError( "Subdirectory %s exist. Clear cache." % path_name) elif path_name not in self.dir: self.dir[path_name] = list() return self.dir[path_name]
[docs] def args(self, **kwargs): """Produce args dict .. versionadded:: 0.2.0 Returns the arguments dictionary passed to a task of a parallel processing manager. Output dictionary has the following form:: out = {'auxiliary': {'X': self.predict_in, 'P': self.predict_out}, 'main': {'X': self.predict_in, 'P': self.predict_out}, 'dir': self.subdir(), 'job': self.job } Parameters ---------- **kwargs : optional Optional keyword arguments to pass to the task. Returns ------- args : dict Arguments dictionary """ aux_feed = {'X': self.predict_in, 'P': None} main_feed = {'X': self.predict_in, 'P': self.predict_out} if self.job in ['fit', 'evaluate']: main_feed['y'] = self.targets aux_feed['y'] = self.targets out = dict() if kwargs: out.update(kwargs) out = {'auxiliary': aux_feed, 'main': main_feed, 'dir': self.subdir(), 'job': self.job} return out
###############################################################################
[docs]class BaseProcessor(object): """Parallel processing base class. Base class for parallel processing engines. Parameters ---------- backend: str, optional Type of backend. One of ``'threading'``, ``'multiprocessing'``, ``'sequential'``. n_jobs : int, optional Degree of concurrency. verbose: bool, int, optional Level of verbosity of the :class:`~mlens.externals.joblib.parallel.Parallel` instance. """ __meta_class__ = ABCMeta __slots__ = ['caller', '__initialized__', '__threading__', 'job', 'n_jobs', 'backend', 'verbose'] @abstractmethod def __init__(self, backend=None, n_jobs=None, verbose=None): self.job = None self.__initialized__ = 0 self.backend = config.get_backend() if not backend else backend self.n_jobs = -1 if not n_jobs else n_jobs self.verbose = False if not verbose else verbose self.__threading__ = self.backend == 'threading' def __enter__(self): return self
[docs] def initialize(self, job, X, y, path, warm_start=False, return_preds=False, **kwargs): """Initialize processing engine. Set up the job parameters before an estimation call. Calling :func:`~mlens.parallel.backend.BaseProcessor.clear` undoes initialization. Parameters ---------- job : str type of job to complete with each task. One of ``'fit'``, ``'predict'`` and ``'transform'``. X : array-like of shape [n_samples, n_features] Input data y : array-like of shape [n_samples,], optional. targets. Required for fit, should not be passed to predict or transform jobs. path : str or dict, optional Custom estimation cache. Pass a string to force use of persistent cache on disk. Pass a ``dict`` for in-memory cache (requires ``backend != 'multiprocessing'``. return_preds : bool or list, optional whether to return prediction ouput. If ``True``, final prediction is returned. Alternatively, pass a list of task names for which output should be returned. warm_start : bool, optional whether to re-use previous input data initialization. Useful if repeated jobs are made on the same input arrays. **kwargs : optional optional keyword arguments to pass onto the task's call method. Returns ------- out : dict An output parameter dictionary to pass to pass to an estimation method. Either ``None`` (no output), or ``{'final':True}`` for only final prediction, or ``{'final': False, 'return_names': return_preds}`` if a list of task-specific output was passed. """ if not warm_start: self._initialize(job=job, X=X, y=y, path=path, **kwargs) if return_preds is True: return {'return_final': True} if return_preds is False: return {} return {'return_final': False, 'return_names': return_preds}
def _initialize(self, job, X, y=None, path=None, **kwargs): """Create a job instance for estimation. See :func:`~mlens.parallel.backend.BaseProcess.initialize` for further details. """ job = Job(job, **kwargs) job = _set_path(job, path, self.__threading__) # --- Prepare inputs for name, arr in zip(('X', 'y'), (X, y)): if arr is None: continue # Dump data in cache if self.__threading__: # No need to memmap f = None if isinstance(arr, str): arr = _load(arr) else: f = dump_array(arr, name, job.dir) # Store data for processing if name == 'y' and arr is not None: job.targets = arr if self.__threading__ else _load_mmap(f) elif name == 'X': job.predict_in = arr \ if self.__threading__ else _load_mmap(f) self.job = job self.__initialized__ = 1 gc.collect() return self def __exit__(self, *args): self.clear()
[docs] def clear(self): """Destroy cache and reset instance job parameters.""" # Detach Job instance job = self.job self.job = None self.__initialized__ = 0 if job: path = job.dir path_handle = job.tmp # Release shared memory references del job gc.collect() # Destroy cache try: # If the cache has been persisted to disk, remove it if isinstance(path, str): path_handle.cleanup() except (AttributeError, OSError): # Python 2 has no handler, can also fail on windows # Use explicit shutil process, or fall back on subprocess try: shutil.rmtree(path) except OSError: # Can fail on windows, need to use the shell try: subprocess.Popen( 'rmdir /S /Q %s' % path, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).kill() except OSError: warnings.warn( "Failed to delete cache at %s." "If created with default settings, will be " "removed on reboot. For immediate " "removal, manual removal is required." % path, ParallelProcessingWarning) finally: del path, path_handle gc.collect() if gc.garbage: warnings.warn( "Clearing cache failed, uncollected:\n%r" % gc.garbage, ParallelProcessingWarning)
[docs]class ParallelProcessing(BaseProcessor): """Parallel processing engine. Engine for running computational graph. :class:`ParallelProcessing` is a manager for executing a sequence of tasks in a given caller, where each task is run sequentially, but assumed to be parallelized internally. The main responsibility of :class:`ParallelProcessing` is to handle memory-mapping, estimation cache updates, input and output array updates and output collection. Parameters ---------- caller : obj the caller of the job. Either a Layer or a meta layer class such as Sequential. *args : optional Optional arguments to :class:`~mlens.parallel.backend.BaseProcessor` **kwargs : optional Optional keyword arguments to :class:`~mlens.parallel.backend.BaseProcessor`. """ def __init__(self, *args, **kwargs): super(ParallelProcessing, self).__init__(*args, **kwargs)
[docs] def map(self, caller, job, X, y=None, path=None, return_preds=False, wart_start=False, split=False, **kwargs): """Parallel task mapping. Run independent tasks in caller in parallel. Warning ------- By default, the :~mlens.parallel.backend.ParallelProcessing.map` runs on a shallow cache, where all tasks share the same cache. As such, the user must ensure that each task has a unique name, or cache retrieval will be corrupted. To commit a seperate sub-cache to each task, set ``split=True``. Parameters ---------- caller : iterable Iterable that generates accepted task instances. Caller should be a child of the :class:`~mlens.parallel.base.BaseBackend` class, and tasks need to implement an appropriate call method. job : str type of job to complete with each task. One of ``'fit'``, ``'predict'`` and ``'transform'``. X : array-like of shape [n_samples, n_features] Input data y : array-like of shape [n_samples,], optional. targets. Required for fit, should not be passed to predict or transform jobs. path : str or dict, optional Custom estimation cache. Pass a string to force use of persistent cache on disk. Pass a ``dict`` for in-memory cache (requires ``backend != 'multiprocessing'``. return_preds : bool or list, optional whether to return prediction ouput. If ``True``, final prediction is returned. Alternatively, pass a list of task names for which output should be returned. warm_start : bool, optional whether to re-use previous input data initialization. Useful if repeated jobs are made on the same input arrays. split : bool, default = False whether to commit a separate sub-cache to each task. **kwargs : optional optional keyword arguments to pass onto each task. Returns ------- out: array-like, list, optional Prediction array(s). """ out = self.initialize( job=job, X=X, y=y, path=path, warm_start=wart_start, return_preds=return_preds, split=split, stack=False) return self.process(caller=caller, out=out, **kwargs)
[docs] def stack(self, caller, job, X, y=None, path=None, return_preds=False, warm_start=False, split=True, **kwargs): """Stacked parallel task mapping. Run stacked tasks in caller in parallel. This method runs a stack of tasks as a stack, where the output of each task is the input to the next. Warning ------- By default, the :func:`~mlens.parallel.backend.ParallelProcessing.stack` method runs on a deep cache, where each tasks has a separate cache. As such, the user must ensure that tasks don't depend on data cached by previous tasks. To run all tasks in a single sub-cache, set ``split=False``. Parameters ---------- caller : iterable Iterable that generates accepted task instances. Caller should be a child of the :class:`~mlens.parallel.base.BaseBackend` class, and tasks need to implement an appropriate call method. job : str type of job to complete with each task. One of ``'fit'``, ``'predict'`` and ``'transform'``. X : array-like of shape [n_samples, n_features] Input data y : array-like of shape [n_samples,], optional. targets. Required for fit, should not be passed to predict or transform jobs. path : str or dict, optional Custom estimation cache. Pass a string to force use of persistent cache on disk. Pass a ``dict`` for in-memory cache (requires ``backend != 'multiprocessing'``. return_preds : bool or list, optional whether to return prediction output. If ``True``, final prediction is returned. Alternatively, pass a list of task names for which output should be returned. warm_start : bool, optional whether to re-use previous input data initialization. Useful if repeated jobs are made on the same input arrays. split : bool, default = True whether to commit a separate sub-cache to each task. **kwargs : optional optional keyword arguments to pass onto each task. Returns ------- out: array-like, list, optional Prediction array(s). """ out = self.initialize( job=job, X=X, y=y, path=path, warm_start=warm_start, return_preds=return_preds, split=split, stack=True) return self.process(caller=caller, out=out, **kwargs)
[docs] def process(self, caller, out, **kwargs): """Process job. Main method for processing a caller. Requires the instance to be setup by a prior call to :func:`~mlens.parallel.backend.BaseProcessor.initialize`. .. seealso:: :func:`~mlens.parallel.backend.ParallelProcessing.map`, :func:`~mlens.parallel.backend.ParallelProcessing.stack` Parameters ---------- caller : iterable Iterable that generates accepted task instances. Caller should be a child of the :class:`~mlens.parallel.base.BaseBackend` class, and tasks need to implement an appropriate call method. out : dict A dictionary with output parameters. Pass an empty dict for no output. See :func:`~mlens.parallel.backend.BaseProcessor.initialize` for more details. Returns ------- out: array-like, list, optional Prediction array(s). """ check_initialized(self) return_names = out.pop('return_names', []) return_final = out.pop('return_final', False) out = list() if return_names else None tf = self.job.dir if not isinstance(self.job.dir, list) else None with Parallel(n_jobs=self.n_jobs, temp_folder=tf, max_nbytes=None, mmap_mode='w+', verbose=self.verbose, backend=self.backend) as parallel: for task in caller: self.job.clear() self._partial_process(task, parallel, **kwargs) if task.name in return_names: out.append(self.get_preds(dtype=_dtype(task))) self.job.update() if return_final: out = self.get_preds(dtype=_dtype(task)) return out
def _partial_process(self, task, parallel, **kwargs): """Process given task""" if self.job.job == 'fit' and getattr(task, 'shuffle', False): self.job.shuffle(getattr(task, 'random_state', None)) task.setup(self.job.predict_in, self.job.targets, self.job.job) if not task.__no_output__: self._gen_prediction_array(task, self.job.job, self.__threading__) task(self.job.args(**kwargs), parallel=parallel) if not task.__no_output__ and getattr(task, 'n_feature_prop', 0): self._propagate_features(task) def _propagate_features(self, task): """Propagate features from input array to output array.""" p_out, p_in = self.job.predict_out, self.job.predict_in # Check for loss of obs between layers (i.e. with blendindex) n_in, n_out = p_in.shape[0], p_out.shape[0] r = int(n_in - n_out) if not issparse(p_in): # Simple item setting p_out[:, :task.n_feature_prop] = p_in[r:, task.propagate_features] else: # Need to populate propagated features using scipy sparse hstack self.job.predict_out = hstack( [p_in[r:, task.propagate_features], p_out[:, task.n_feature_prop:]] ).tolil() def _gen_prediction_array(self, task, job, threading): """Generate prediction array either in-memory or persist to disk.""" shape = task.shape(job) if threading: self.job.predict_out = np.zeros(shape, dtype=_dtype(task)) else: f = os.path.join(self.job.dir, '%s_out_array.mmap' % task.name) try: self.job.predict_out = np.memmap( filename=f, dtype=_dtype(task), mode='w+', shape=shape) except Exception as exc: raise OSError( "Cannot create prediction matrix of shape (" "%i, %i), size %i MBs, for %s.\n Details:\n%r" % (shape[0], shape[1], 8 * shape[0] * shape[1] / (1024 ** 2), task.name, exc))
[docs] def get_preds(self, dtype=None, order='C'): """Return prediction matrix. Parameters ---------- dtype : numpy dtype object, optional data type to return order : str (default = 'C') data order. See :class:`numpy.asarray` for details. Returns ------- P: array-like Prediction array """ if not hasattr(self, 'job'): raise ParallelProcessingError( "Processor has been terminated:\ncannot retrieve final " "prediction array from cache.") if dtype is None: dtype = config.get_dtype() if issparse(self.job.predict_out): return self.job.predict_out return np.asarray(self.job.predict_out, dtype=dtype, order=order)
###############################################################################
[docs]class ParallelEvaluation(BaseProcessor): """Parallel cross-validation engine. Minimal parallel processing engine. Similar to :class:`ParallelProcessing`, but offers less features, only fits the *callers* indexer, and excepts no task output. """ def __init__(self, *args, **kwargs): super(ParallelEvaluation, self).__init__(*args, **kwargs)
[docs] def process(self, caller, case, X, y, path=None, **kwargs): """Process caller. Parameters ---------- caller: iterable Iterable for evaluation job.s Expected caller is a :class:`Evaluator` instance. case: str evaluation case to run on the evaluator. One of ``'preprocess'`` and ``'evaluate'``. X: array-like of shape [n_samples, n_features] Input data y: array-like of shape [n_samples,], optional. targets. Required for fit, should not be passed to predict or transform jobs. path: str or dict, optional Custom estimation cache. Pass a string to force use of persistent cache on disk. Pass a ``dict`` for in-memory cache (requires ``backend != 'multiprocessing'``. """ self._initialize( job='fit', X=X, y=y, path=path, split=False, stack=False) check_initialized(self) # Use context manager to ensure same parallel job during entire process tf = self.job.dir if not isinstance(self.job.dir, list) else None with Parallel(n_jobs=self.n_jobs, temp_folder=tf, max_nbytes=None, mmap_mode='w+', verbose=self.verbose, backend=self.backend) as parallel: caller.indexer.fit(self.job.predict_in, self.job.targets, self.job.job) caller(parallel, self.job.args(**kwargs), case)