# Source code for alp.appcom.ensembles

"""
Ensembles module
================
"""

import warnings
from time import time

import numpy as np
import pandas as pd
from progressbar import ETA
from progressbar import Bar
from progressbar import DynamicMessage
from progressbar import FormatLabel
from progressbar import Percentage
from progressbar import ProgressBar
from progressbar import SimpleProgress


def get_best(experiments, metric, op, partial=False):
    """Helper function selecting the best experiment of a collection

    For every experiment exposing a `full_res` attribute, `op` is applied to
    the values recorded for `metric` to obtain one score per experiment. `op`
    is then applied to the non-nan scores to determine the winning value.

    In case of equality in the metric, the first experiment (in iteration
    order) reaching the winning value is returned.

    Args:
        experiments(dict): a dictionary mapping experiment names to
            experiments
        metric(str): the name of a metric used in the experiments
        op(function): operation reducing a sequence of metric values to the
            value used for the selection (e.g. `np.min` or `np.max`)
        partial(bool): if True will pass an experiment without result (a
            warning is emitted). Raise an error otherwise.

    Returns:
        a tuple `(experiment, key)` with the selected experiment and its key
        in `experiments` (the key is taken from a numpy array, hence returned
        as a numpy scalar).

    Raises:
        Exception: if an experiment has no result and `partial` is False, if
            no experiment has a result, or if all the scores are nans.
    """
    scores = []
    ready_expes = []
    ready_keys = []
    not_ready = False
    for k, expe in experiments.items():
        if not hasattr(expe, 'full_res'):  # pragma: no cover
            if not partial:
                raise Exception('Results are not ready')
            not_ready = True
        else:
            scores.append(op(expe.full_res['metrics'][metric]))
            ready_expes.append(expe)
            ready_keys.append(k)

    if not_ready:  # pragma: no cover
        warnings.warn('Some results are not ready: Using the best available'
                      ' model.')

    if not ready_expes:
        raise Exception('No result is ready yet')

    ar_expes = np.array(ready_expes)
    ar_keys = np.array(ready_keys)
    perf_array = np.array(scores)
    valid = ~np.isnan(perf_array)
    if not valid.any():
        raise Exception('The selected metric evaluations are all nans')

    # The same nan mask must be applied to the scores, the experiments and
    # the keys: otherwise the boolean selection below would not have the same
    # length as the arrays it indexes as soon as one score is nan.
    valid_perfs = perf_array[valid]
    bool_choice = op(valid_perfs) == valid_perfs
    best = ar_expes[valid][bool_choice]
    best_key = ar_keys[valid][bool_choice]
    return best[0], best_key[0]


# Widgets of the progress bar displayed by `HParamsSearch._fit_cm`. The
# trailing DynamicMessage is named 's' and receives its value through the
# `s=...` keyword passed to `progress.update`; together with the 'job/'
# label it displays the number of jobs processed per second.
widgets = [Percentage(), ' ',
           SimpleProgress(), ' ',
           Bar(marker='=', left='[', right=']'),
           ' ', FormatLabel('in: %(elapsed)s'), ' ',
           ETA(), ' | ', 'job/', DynamicMessage('s')]


class Ensemble(object):

    """Base class to build experiments containers able to execute batch
    sequences of actions. Subclasses must implement the `fit`, `fit_gen`,
    `fit_async` and `fit_gen_async` methods; every method of this base class
    other than the constructor raises `NotImplementedError`.

    Args:
        experiments(dict or list): experiments to be wrapped. If a dictionary
            is passed, it should map experiment names to experiments. If a
            list is passed, the experiments are keyed by their position in
            the list.

    Raises:
        TypeError: if `experiments` is neither a dict nor a list.
    """
    def __init__(self, experiments):
        if isinstance(experiments, list):
            # lists are normalized to a dict keyed by position
            experiments = dict(enumerate(experiments))
        if not isinstance(experiments, dict):  # pragma: no cover
            raise TypeError('You must pass either an experiments dict or list')
        self.experiments = experiments

    def fit(self, data, data_val, *args, **kwargs):
        """Fit the wrapped experiments (to be implemented by subclasses)"""
        raise NotImplementedError

    def fit_gen(self, data, data_val, *args, **kwargs):
        """Fit the wrapped experiments with generators (to be implemented by
        subclasses)"""
        raise NotImplementedError

    def fit_async(self, data, data_val, *args, **kwargs):
        """Fit the wrapped experiments asynchronously (to be implemented by
        subclasses)"""
        raise NotImplementedError

    def fit_gen_async(self, data, data_val, *args, **kwargs):
        """Fit the wrapped experiments asynchronously with generators (to be
        implemented by subclasses)"""
        raise NotImplementedError

    def predict(self, data, data_val, *args, **kwargs):
        """Predict using the wrapped experiments (to be implemented by
        subclasses)"""
        raise NotImplementedError

    def summary(self, metrics, verbose=False):
        """Summarize the results (to be implemented by subclasses)"""
        raise NotImplementedError

    def plt_summary(self):
        """Plot a summary of the results (to be implemented by subclasses)"""
        raise NotImplementedError


class HParamsSearch(Ensemble):
    """Hyper parameters search class

    Train several experiments with different hyperparameters and save results.
    Wraps the training process so that it's possible to access results easily.

    Args:
        experiments(dict or list): experiments to be wrapped. If a dictionary
            is passed, it should map experiment names to experiments
        hyperparams(dict): a dict of hyperparameters
        metric(str): the name of a metric used in the experiments; default
            metric used by `predict`
        op(function): an operator used to select a model; default operator
            used by `predict`

    """
    def __init__(self, experiments, hyperparams=None, metric=None, op=None):
        super(HParamsSearch, self).__init__(experiments=experiments)
        self.hyperparams = hyperparams
        self.metric = metric
        self.op = op
        # maps each experiment key to what its fit method returned
        self.results = dict()

    def fit(self, data, data_val, *args, **kwargs):
        """Apply the fit method to all the experiments

        Args:
            see `alp.core.Experiment.fit`

        Returns:
            a dict mapping experiment keys to results"""
        # `gen` and `use_async` are passed positionally so that extra
        # positional arguments in *args cannot collide with them.
        self._fit_cm(data, data_val, False, False, *args, **kwargs)
        return self.results

    def fit_gen(self, data, data_val, *args, **kwargs):
        """Apply the fit_gen method to all the experiments

        Args:
            see :meth:`alp.appcom.core.Experiment.fit_gen`

        Returns:
            a dict mapping experiment keys to results"""
        self._fit_cm(data, data_val, True, False, *args, **kwargs)
        return self.results

    def fit_gen_async(self, data, data_val, *args, **kwargs):
        """Apply the fit_gen_async method to all the experiments

        Args:
            see :meth:`alp.appcom.core.Experiment.fit_gen_async`

        Returns:
            a dict mapping experiment keys to results"""
        self._fit_cm(data, data_val, True, True, *args, **kwargs)
        return self.results

    def fit_async(self, data, data_val, *args, **kwargs):
        """Apply the fit_async method to all the experiments

        Args:
            see :meth:`alp.appcom.core.Experiment.fit_async`

        Returns:
            a dict mapping experiment keys to results"""
        self._fit_cm(data, data_val, False, True, *args, **kwargs)
        return self.results

    def _fit_cm(self, data, data_val, gen, use_async, *args, **kwargs):
        """Run the requested fit variant on every experiment

        Displays a progress bar while iterating and stores what each
        experiment's fit method returns in `self.results` under the
        experiment's key.

        Args:
            data: forwarded to the experiment's fit method
            data_val: forwarded to the experiment's fit method
            gen(bool): if True use the `fit_gen*` variants
            use_async(bool): if True use the `*_async` variants (named
                `use_async` because `async` is a reserved keyword since
                Python 3.7)

        Returns:
            a dict mapping experiment keys to results"""
        if gen:
            fit_name = 'fit_gen_async' if use_async else 'fit_gen'
        else:
            fit_name = 'fit_async' if use_async else 'fit'

        with ProgressBar(max_value=len(self.experiments),
                         redirect_stdout=True,
                         widgets=widgets, term_width=80) as progress:
            spent = 0.0
            for i, (k, expe) in enumerate(self.experiments.items()):
                b = time()
                res = getattr(expe, fit_name)(data, data_val, *args, **kwargs)
                self.results[k] = res

                # average wall time per job so far, displayed as jobs/s
                spent += time() - b
                mean_time = spent / (i + 1)
                # guard against a null elapsed time (coarse clock or
                # immediate return of an asynchronous call)
                rate = 1 / mean_time if mean_time > 0 else float('inf')
                progress.update(i, s=float(rate))
                if expe.backend_name == 'keras' and use_async:  # pragma: no cover
                    import keras.backend as K
                    if K.backend() == 'tensorflow':
                        K.clear_session()
        return self.results

    def predict(self, data, metric=None, op=None, partial=False,
                *args, **kwargs):
        """Apply the predict method of the best experiment

        The best experiment is selected with `get_best` using `metric` and
        `op`.

        Args:
            see :meth:`alp.appcom.core.Experiment.predict`
            metric(str): the name of the metric to use (defaults to the
                metric passed to the constructor)
            op(function): an operator returning the value to select an
                experiment (defaults to the op passed to the constructor)
            partial(bool): forwarded to `get_best`

        Returns:
            a tuple `(key, predictions)` with the key of the selected
            experiment and the output of its predict method

        Raises:
            Exception: if no metric or no op is available"""
        if not metric:
            metric = self.metric
        if not op:
            op = self.op

        if metric is None or op is None:
            raise Exception('You should provide a metric along with an op')
        best_exp, best_key = get_best(self.experiments, metric, op, partial)
        return best_key, best_exp.predict(data, *args, **kwargs)

    def summary(self, metrics, verbose=False):
        """Build a results table using individual results from models

        Args:
            verbose(bool): if True, print a description of the results
            metrics(dict): a dictionary mapping metric's names to ops.

        Returns:
            a pandas DataFrame of results"""
        # build results table
        res_dict = dict()
        for k, res in self.results.items():
            # each stored result is unpacked as a pair whose second element
            # is joined when not None (presumably the thread of an
            # asynchronous fit -- confirm against Experiment.fit_async)
            _, t = res
            if t is not None:
                t.join()
            for kr, v in self.experiments[k].full_res['metrics'].items():
                # only list-valued metrics requested in `metrics` are kept
                if isinstance(v, list) and kr in metrics:
                    res_dict.setdefault(kr, []).append(metrics[kr](v))
        res_table = pd.DataFrame(res_dict)
        if verbose is True:
            print(res_table.describe())
        return res_table