Source code for mlens.metrics.utils

"""ML-ENSEMBLE

:author: Sebastian Flennerhag
:copyright: 2017-2018
:licence: MIT

Utility functions for constructing metrics
"""
from __future__ import division

import warnings
import numpy as np

from ..utils.exceptions import MetricWarning
try:
    from collections import OrderedDict as _dict
except ImportError:
    _dict = dict


def _get_string(obj, dec):
    """Stringify object"""
    try:
        return '{0:.{dec}f}'.format(obj, dec=dec)
    except (TypeError, ValueError):
        return obj.__str__()


def _get_partitions(obj):
    """Check if any entry has partitions"""
    for name, _ in obj:
        if int(name.split('.')[-2]) > 0:
            return True
    return False


def _split(f, s, a_p='', a_s='', b_p='', b_s='', reverse=False):
    """Split string on a symbol and return two string, first possible empty"""
    splitted = f.split(s)
    if len(splitted) == 1:
        a, b = '', splitted[0]
        if reverse:
            b, a = a, b
    else:
        a, b = splitted

    if a:
        a = '%s%s%s' % (a_p, a, a_s)
    if b:
        b = '%s%s%s' % (b_p, b, b_s)

    return a, b


class Data(_dict):
    """Wrapper class around dict to get pretty prints

    :class:`Data` is an ordered dictionary that implements a dedicated
    pretty print method for a nested dictionary. Printing a :class:`Data`
    dictionary provides a human-readable table. The input dictionary is
    expected to have two levels: the first level gives the columns and the
    second level the rows. Row names are parsed as
    ``[OUTER]/[MIDDLE].[INNER]--[IDX]``, where IDX has to be an integer. All
    entries are optional.

    .. seealso::
        :func:`assemble_data`, :func:`assemble_table`

    Warning
    -------
    :class:`Data` is an internal class that expects a particular input
    format. This class cannot be used as a general drop-in replacement for
    the standard ``dict`` class.

    Parameters
    ----------
    data : list, dict, optional
        A list is passed through :func:`assemble_data` first; any other
        value is handed to the dictionary constructor unchanged. ``None``
        (the default) creates an empty instance.

    padding : int (default = 2)
        Column padding forwarded to :func:`assemble_table` when printing.

    decimals : int (default = 2)
        Number of decimals forwarded to :func:`assemble_table` when printing.

    Examples
    --------
    >>> from mlens.metrics import Data
    >>> d = [('est.0.0', {'score': 0.1, 'time': 0.2})]
    >>> data = Data(d)
    >>> print(data)
           score-m  score-s  time-m  time-s
    est       0.10     0.00    0.20    0.00
    <BLANKLINE>
    """

    def __init__(self, data=None, padding=2, decimals=2):
        if data is None:
            # The dict constructor rejects None, so map the default to an
            # empty iterable to make ``Data()`` a valid empty table.
            data = ()
        elif isinstance(data, list):
            data = assemble_data(data)
        super(Data, self).__init__(data)
        self.__padding__ = padding
        self.__decimals__ = decimals

    def __repr__(self):
        return assemble_table(self, self.__padding__, self.__decimals__)
def assemble_table(data, padding=2, decimals=2):
    """Construct data table from input dict

    Given a nested dictionary formed by :func:`assemble_data`,
    :func:`assemble_table` returns a string that prints the contents of
    the input in tabular format. The input dictionary is expected to have
    two levels: the first level gives the columns and the second level the
    rows. Row names are parsed as ``[OUTER]/[MIDDLE].[INNER]--[IDX]``, where
    IDX must be an integer. All entries are optional.

    .. seealso::
        :class:`Data`, :func:`assemble_data`

    Parameters
    ----------
    data : dict
        Nested mapping ``{column: {row_name: value}}``. A value that is
        falsy without being equal to zero (``None``, an empty list) counts
        as missing and is rendered as a blank cell; so is a row name that
        a column has no entry for.

    padding : int (default = 2)
        Number of spaces added to the width of every column.

    decimals : int (default = 2)
        Number of decimals used when formatting numeric values.

    Returns
    -------
    str
        One header line followed by one line per row, each terminated by a
        newline. Row-name parts are left-aligned, data cells right-aligned.

    Examples
    --------
    >>> from mlens.metrics import assemble_data, assemble_table
    >>> d = [('row-idx-1.row-idx-2.0.0', {'column-1': 0.1, 'column-2': 0.1})]
    >>> print(assemble_table(assemble_data(d)))
                            column-1-m  column-1-s  column-2-m  column-2-s
    row-idx-1  row-idx-2          0.10        0.00        0.10        0.00
    <BLANKLINE>
    """
    row_glossary = ['layer', 'case', 'est', 'part']

    cols = list()
    rows = _dict()  # row key -> parsed name parts, in order of discovery
    max_col_len = dict()
    max_row_len = {r: 0 for r in row_glossary}

    # First, measure the maximum length of each column in table
    for key, val in data.items():
        cols.append(key)
        max_col_len[key] = len(key)

        # dat_key is the estimators. Number of columns is not fixed so need
        # to assume all exist and purge empty columns
        for dat_key, v in sorted(val.items()):
            if not v and v != 0:
                # Safety: no data. Zero is a real value and must be kept,
                # matching the check used when the cells are printed.
                continue

            max_col_len[key] = max(max_col_len[key],
                                   len(_get_string(v, decimals)))

            if dat_key in rows:
                # Already mapped row entry name
                continue

            layer, rest = _split(dat_key, '/')
            case, rest = _split(rest, '.')
            est, part = _split(rest, '--', reverse=True)

            parts = _dict(zip(row_glossary, [layer, case, est, part]))
            for k, name in parts.items():
                max_row_len[k] = max(max_row_len[k], len(name))
            rows[dat_key] = parts

    # Row name columns that are empty for every row are dropped
    # (ex partition number)
    name_cols = [k for k in row_glossary if max_row_len[k] > 0]

    # Header: offset by the exact rendered width of the row name columns so
    # that every heading sits right-aligned above its data column.
    offset = sum(max_row_len[k] + padding for k in name_cols)
    header = [" " * offset]
    header.extend(col.rjust(max_col_len[col] + padding) for col in cols)
    lines = ["".join(header)]

    # Entries
    for dat_key, parts in rows.items():
        # Estimator name
        cells = [parts[k].ljust(max_row_len[k] + padding) for k in name_cols]

        # Data
        for col in cols:
            width = max_col_len[col] + padding
            # .get: a column may lack this row altogether -> blank cell
            item = data[col].get(dat_key)
            if not item and item != 0:
                cells.append(" " * width)
            else:
                cells.append(_get_string(item, decimals).rjust(width))
        lines.append("".join(cells))

    return "\n".join(lines) + "\n"
def assemble_data(data_list):
    """Build a data dictionary out of a list of entries and data dicts

    Given a list of ``(name, dict)`` tuples, :func:`assemble_data`
    returns a nested ordered dictionary with data keys as outer keys and
    tuple names as inner keys. The returned dictionary can be printed in
    tabular format by :func:`assemble_table`.

    .. seealso::
        :class:`Data`, :func:`assemble_table`

    Parameters
    ----------
    data_list : iterable
        ``(name, data_dict)`` pairs. Names are either ``est.i.j`` or
        ``case.est.i.j``, optionally prefixed by ``layer/``, where ``i``
        must be an integer. Entries with an empty ``data_dict`` are
        skipped. The trailing ``j`` is always dropped; ``i`` is kept as a
        ``--i`` suffix only when some entry has ``i > 0``. Entries that
        map to the same resulting name are aggregated together.

    Returns
    -------
    data : dict
        Ordered dictionary with two columns per data key ``k``: ``'k-m'``
        (mean) and ``'k-s'`` (standard deviation), each mapping row names
        to the aggregated value. ``None`` values are ignored in the
        aggregation. A row without values for a column holds an empty
        list; columns without any values are removed.

    Examples
    --------
    >>> from mlens.metrics import assemble_data
    >>> d = [('est.0.0', {'score': 0.1}), ('est.0.1', {'score': 0.3})]
    >>> data = assemble_data(d)
    >>> list(data)
    ['score-m', 'score-s']
    >>> list(data['score-m'])
    ['est']
    """
    # The entries are traversed twice (partition check, then collection);
    # materialize them so a one-shot iterator is not silently exhausted.
    data_list = list(data_list)

    data = _dict()
    tmp = _dict()

    partitions = _get_partitions(data_list)

    # Collect scores per preprocessing case and estimator(s)
    for name, data_dict in data_list:
        if not data_dict:
            continue

        prefix, name = _split(name, '/', a_s='/')

        # Names are either est.i.j or case.est.i.j
        splitted = name.split('.')
        if partitions:
            name = tuple(splitted[:-1])

            if len(name) == 3:
                name = '%s.%s--%s' % name
            else:
                name = '%s--%s' % name
        else:
            name = '.'.join(splitted[:-2])

        name = '%s%s' % (prefix, name)

        if name not in tmp:
            tmp[name] = _dict()
        collected = tmp[name]

        # collect all data dicts belonging to name
        for k, v in data_dict.items():
            if k not in collected:
                # Set up the data struct lazily per (name, key), so a later
                # entry may introduce keys the first entry did not have.
                collected[k] = list()
                for col in ('%s-m' % k, '%s-s' % k):
                    if col not in data:
                        data[col] = _dict()
                    data[col][name] = list()
            collected[k].append(v)

    # Aggregate to get mean and std
    for name, data_dict in tmp.items():
        for k, v in data_dict.items():
            if not v:
                continue
            try:
                # Purge None values from the main est due to no predict times
                v = [i for i in v if i is not None]
                if v:
                    data['%s-m' % k][name] = np.mean(v)
                    data['%s-s' % k][name] = np.std(v)
            except Exception as exc:
                warnings.warn(
                    "Aggregating data for %s failed. Raw data:\n%r\n"
                    "Details: %r" % (k, v, exc), MetricWarning)

    # Give every column an entry for every row name: names that never
    # reported a key get the same empty placeholder as names with no values.
    for column in data.values():
        for name in tmp:
            if name not in column:
                column[name] = list()

    # Check if there are empty columns
    discard = [key for key, column in data.items()
               if not any(val or val == 0 for val in column.values())]
    for key in discard:
        data.pop(key)
    return data