# Source code for mlens.metrics.utils

"""ML-ENSEMBLE

:author: Sebastian Flennerhag
:copyright: 2017-2018
:licence: MIT

Utility functions for constructing metrics
"""
from __future__ import division

import warnings
import numpy as np

from ..utils.exceptions import MetricWarning
try:
    from collections import OrderedDict as _dict
except ImportError:
    _dict = dict


def _get_string(obj, dec):
    """Stringify object"""
    try:
        return '{0:.{dec}f}'.format(obj, dec=dec)
    except (TypeError, ValueError):
        return obj.__str__()


def _get_partitions(obj):
    """Check if any entry has partitions"""
    for name, _ in obj:
        if int(name.split('.')[-2]) > 0:
            return True
    return False


def _split(f, s, a_p='', a_s='', b_p='', b_s='', reverse=False):
    """Split string on a symbol and return two string, first possible empty"""
    splitted = f.split(s)
    if len(splitted) == 1:
        a, b = '', splitted[0]
        if reverse:
            b, a = a, b
    else:
        a, b = splitted

    if a:
        a = '%s%s%s' % (a_p, a, a_s)
    if b:
        b = '%s%s%s' % (b_p, b, b_s)

    return a, b


class Data(_dict):

    """Wrapper class around dict to get pretty prints

    :class:`Data` is an ordered dictionary that implements a dedicated
    pretty print method for a nested dictionary. Printing a :class:`Data`
    dictionary provides a human-readable table. The input dictionary is
    expected to have two levels: the first level gives the columns and the
    second level the rows. Row names are parsed as
    ``[OUTER]/[MIDDLE].[INNER]--[IDX]``, where IDX has to be an integer. All
    entries are optional.

    .. seealso::
        :func:`assemble_data`, :func:`assemble_table`

    Warning
    -------
    :class:`Data` is an internal class that expects a particular input
    format. This class cannot be used as a general drop-in replacement for
    the standard ``dict`` class.

    Parameters
    ----------
    data : list, dict, optional
        If a list, it is first converted with :func:`assemble_data`. Any
        other non-``None`` value is passed to the dictionary constructor
        unchanged. ``None`` (default) creates an empty instance.

    padding : int (default = 2)
        Number of spaces between columns, forwarded to
        :func:`assemble_table` when the instance is printed.

    decimals : int (default = 2)
        Number of decimals used for numeric entries, forwarded to
        :func:`assemble_table` when the instance is printed.

    Examples
    --------
    >>> from mlens.metrics import Data
    >>> d = [('row-idx-1.row-idx-2.0.0', {'column-1': 0.1, 'column-2': 0.2})]
    >>> data = Data(d)
    >>> print(data)  # doctest: +NORMALIZE_WHITESPACE
                            column-1-m  column-1-s  column-2-m  column-2-s
    row-idx-1  row-idx-2          0.10        0.00        0.20        0.00
    """

    def __init__(self, data=None, padding=2, decimals=2):
        if isinstance(data, list):
            data = assemble_data(data)

        if data is None:
            # The dict constructor rejects None, so build an empty mapping
            super(Data, self).__init__()
        else:
            super(Data, self).__init__(data)

        self.__padding__ = padding
        self.__decimals__ = decimals

    def __repr__(self):
        """Return the contents rendered as a table"""
        return assemble_table(self, self.__padding__, self.__decimals__)


def assemble_table(data, padding=2, decimals=2):
    """Construct data table from input dict

    Given a nested dictionary formed by :func:`assemble_data`,
    :func:`assemble_table` returns a string that prints the contents of
    the input in tabular format. The input dictionary is
    expected to have two levels: the first level gives the columns and the
    second level the rows. Row names are parsed as
    ``[OUTER]/[MIDDLE].[INNER]--[IDX]``. All entries are optional, and
    row-name columns that are empty for every row are left out.

    .. seealso::
        :class:`Data`, :func:`assemble_data`

    Parameters
    ----------
    data : dict
        Nested mapping ``{column: {row_name: value}}``. A value of ``None``
        or an empty container is rendered as a blank cell; ``0`` is
        rendered as data. A column may lack some of the rows.

    padding : int (default = 2)
        Number of spaces separating adjacent columns.

    decimals : int (default = 2)
        Number of decimals used for values that can be formatted as floats.

    Returns
    -------
    table : str
        One header line followed by one line per row, each terminated by a
        newline. Row names are left-aligned, values are right-aligned.

    Examples
    --------
    >>> from mlens.metrics import assemble_data, assemble_table
    >>> d = [('row-idx-1.row-idx-2.0.0', {'column-1': 0.1, 'column-2': 0.1})]
    >>> table = assemble_table(assemble_data(d))
    >>> print(table)  # doctest: +NORMALIZE_WHITESPACE
                            column-1-m  column-1-s  column-2-m  column-2-s
    row-idx-1  row-idx-2          0.10        0.00        0.10        0.00
    """
    row_glossary = ['layer', 'case', 'est', 'part']

    cols = list()
    rows = list()
    row_keys = list()
    seen = set()
    max_col_len = dict()
    max_row_len = {r: 0 for r in row_glossary}

    # First, measure the maximum length of each column in table
    for key, val in data.items():
        cols.append(key)
        max_col_len[key] = len(key)

        # dat_key is the estimators. Number of columns is not fixed so need
        # to assume all exist and purge empty columns
        for dat_key, v in sorted(val.items()):
            if not v and v != 0:
                # No data in this cell. A zero is a legitimate value and
                # must still register its row and its printed width.
                continue

            max_col_len[key] = max(max_col_len[key],
                                   len(_get_string(v, decimals)))

            if dat_key in seen:
                # Already mapped row entry name
                continue

            layer, k = _split(dat_key, '/')
            case, k = _split(k, '.')
            est, part = _split(k, '--', reverse=True)

            dat = dict(zip(row_glossary, [layer, case, est, part]))
            for field, label in dat.items():
                max_row_len[field] = max(max_row_len[field], len(label))

            seen.add(dat_key)
            row_keys.append(dat_key)
            rows.append(dat)

    # Keep only the row name columns that are used (ex partition number)
    name_cols = [r for r in row_glossary if max_row_len[r] > 0]

    # Header: the offset must equal the width the row names occupy below,
    # i.e. each used name column at its maximum width plus padding
    parts = [" " * sum(max_row_len[r] + padding for r in name_cols)]
    for col in cols:
        parts.append(col.rjust(max_col_len[col] + padding))
    parts.append("\n")

    # Entries
    for dat_key, dat in zip(row_keys, rows):
        # Estimator name
        for field in name_cols:
            parts.append(dat[field].ljust(max_row_len[field] + padding))

        # Data
        for col in cols:
            # A column need not contain every row
            item = data[col].get(dat_key)
            if not item and item != 0:
                parts.append(" " * (max_col_len[col] + padding))
                continue
            item_ = _get_string(item, decimals)
            parts.append(item_.rjust(max_col_len[col] + padding))
        parts.append("\n")

    return "".join(parts)


def assemble_data(data_list):
    """Build a data dictionary out of a list of entries and data dicts

    Given a list of ``(name, data_dict)`` tuples, :func:`assemble_data`
    returns a nested ordered dictionary with data keys as outer keys and
    tuple names as inner keys. The returned dictionary can be printed in
    tabular format by :func:`assemble_table`.

    Names are of the form ``[layer/][case.]est.i.j``. Entries that share a
    name once the trailing index fields are stripped are aggregated: for
    every data key ``k`` the output holds a ``k-m`` column with the mean and
    a ``k-s`` column with the standard deviation. If any entry has a
    positive ``i`` index, ``i`` is kept in the name as ``--i`` so that
    these entries are aggregated separately.

    .. seealso::
        :class:`Data`, :func:`assemble_table`

    Parameters
    ----------
    data_list : list
        List of ``(name, data_dict)`` tuples. Entries with an empty
        ``data_dict`` are ignored, as are ``None`` values inside a
        ``data_dict``.

    Returns
    -------
    data : dict
        Ordered mapping ``{'<key>-m': {name: mean}, '<key>-s': {name: std}}``.
        Columns without any data are removed.

    Warns
    -----
    MetricWarning
        If the mean or standard deviation of a collection of values cannot
        be computed. The corresponding cells are left empty.

    Examples
    --------
    >>> from mlens.metrics import assemble_data, assemble_table
    >>> d = [('row-idx-1.row-idx-2.0.0', {'column-1': 0.1, 'column-2': 0.1})]
    >>> table = assemble_table(assemble_data(d))
    >>> print(table)  # doctest: +NORMALIZE_WHITESPACE
                            column-1-m  column-1-s  column-2-m  column-2-s
    row-idx-1  row-idx-2          0.10        0.00        0.10        0.00
    """
    data = _dict()
    tmp = _dict()

    partitions = _get_partitions(data_list)

    # Collect scores per preprocessing case and estimator(s)
    for name, data_dict in data_list:
        if not data_dict:
            continue

        prefix, name = _split(name, '/', a_s='/')

        # Names are either est.i.j or case.est.i.j
        splitted = name.split('.')
        if partitions:
            name = tuple(splitted[:-1])

            if len(name) == 3:
                name = '%s.%s--%s' % name
            else:
                name = '%s--%s' % name
        else:
            name = '.'.join(splitted[:-2])

        name = '%s%s' % (prefix, name)

        # Collect all data dicts belonging to name. Storage is set up per
        # key, not per name, so that an entry may introduce a key that
        # earlier entries with the same name did not have.
        collected = tmp.setdefault(name, _dict())
        for k, v in data_dict.items():
            if k not in collected:
                collected[k] = list()
                if '%s-m' % k not in data:
                    data['%s-m' % k] = _dict()
                    data['%s-s' % k] = _dict()
                # Empty placeholders, replaced below if aggregation succeeds
                data['%s-m' % k][name] = list()
                data['%s-s' % k][name] = list()
            collected[k].append(v)

    # Aggregate to get mean and std
    for name, data_dict in tmp.items():
        for k, v in data_dict.items():
            # Purge None values from the main est due to no predict times
            v = [i for i in v if i is not None]
            if not v:
                continue
            try:
                mean, std = np.mean(v), np.std(v)
            except Exception as exc:
                # Best effort: report and leave the placeholders in place
                warnings.warn(
                    "Aggregating data for %s failed. Raw data:\n%r\n"
                    "Details: %r" % (k, v, exc), MetricWarning)
            else:
                data['%s-m' % k][name] = mean
                data['%s-s' % k][name] = std

    # Drop columns where no row holds data (zero counts as data)
    discard = [key for key, column in data.items()
               if not any(val or val == 0 for val in column.values())]
    for key in discard:
        data.pop(key)
    return data