Source code for autogl.module.ensemble.stacking

"""
Ensemble module.
"""

import numpy as np

import torch
import torch.nn.functional as F
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

from .base import BaseEnsembler
from . import register_ensembler
from ...utils import get_logger

STACKING_LOGGER = get_logger("stacking")


@register_ensembler("stacking")
class Stacking(BaseEnsembler):
    """
    A stacking ensembler. Currently gradient boosting ('gbm') and generalized
    linear models ('glm') are supported as the meta-algorithm.

    Parameters
    ----------
    meta_model : 'gbm' or 'glm' (Optional)
        Type of the stacker:

        'gbm' : Gradient boosting model. This is the default.

        'glm' : Generalized linear model.

    meta_params : a ``dict`` (Optional)
        When ``meta_model`` is specified, you can customize the parameters of
        the stacker. If this argument is not provided, the stacker will be
        configured with default parameters. Default ``None``, which is treated
        as an empty dict.
    """

    def __init__(self, meta_model="gbm", meta_params=None, *args, **kwargs):
        super().__init__()
        self.model_name = meta_model.lower()
        assert self.model_name in [
            "gbm",
            "glm",
        ], "Only gbm and glm are supported as stacking meta-models!"
        # avoid a shared mutable default argument; ``fit`` also copies this
        # dict before injecting model-specific keys
        self.meta_params = meta_params if meta_params is not None else {}
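
    # Illustrative note (not from the original source): ``meta_params`` is
    # forwarded verbatim to the underlying scikit-learn estimator, so e.g.
    #   Stacking(meta_model="gbm", meta_params={"n_estimators": 200})
    # configures GradientBoostingClassifier(n_estimators=200).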

    def fit(
        self, predictions, label, identifiers, feval, n_classes="auto", *args, **kwargs
    ):
        """
        Fit the ensembler to the given data using the stacking method.

        Parameters
        ----------
        predictions : a list of np.ndarray
            Predictions of base learners (corresponding to the elements in identifiers).

        label : a list of int
            Class labels of instances.

        identifiers : a list of str
            The names of base models.

        feval : (a list of) autogl.module.train.evaluate
            Performance evaluation metrics.

        n_classes : int or str (Optional)
            The number of classes. Default ``'auto'``, which infers the number
            of classes as the maximum label plus one.

        Returns
        -------
        (a list of) float
            The validation performance of the final stacker.
        """
        n_classes = n_classes if n_classes != "auto" else max(label) + 1
        assert n_classes > max(label), (
            "The max label passed (%d) is not smaller than the n_classes given (%d)"
            % (max(label), n_classes)
        )
        assert len(identifiers) == len(
            set(identifiers)
        ), "Duplicate name in identifiers {} !".format(identifiers)
        self.fit_identifiers = identifiers

        if not isinstance(feval, list):
            feval = [feval]
        self._re_initialize(identifiers, len(predictions))

        # copy so that model-specific keys do not leak back into self.meta_params
        config = dict(self.meta_params)
        STACKING_LOGGER.debug("meta-model name %s", self.model_name)

        if self.model_name == "gbm":
            # stack base predictions into an (n_instances, n_models * n_classes) matrix
            meta_X = (
                torch.tensor(predictions).transpose(0, 1).flatten(start_dim=1).numpy()
            )
            meta_Y = np.array(label)
            model = GradientBoostingClassifier(**config)
            model.fit(meta_X, meta_Y)
            self.model = model
            ensemble_prediction = model.predict_proba(meta_X)
        elif self.model_name == "glm":
            meta_X = (
                torch.tensor(predictions).transpose(0, 1).flatten(start_dim=1).numpy()
            )
            meta_Y = np.array(label)
            config["multi_class"] = "auto"
            config["solver"] = "lbfgs"
            model = LogisticRegression(**config)
            model.fit(meta_X, meta_Y)
            self.model = model
            ensemble_prediction = model.predict_proba(meta_X)
        elif self.model_name == "nn":
            # note: this branch is currently unreachable, since __init__ only
            # accepts 'gbm' and 'glm'
            meta_X = torch.tensor(predictions).transpose(0, 1).flatten(start_dim=1)
            meta_Y = F.one_hot(
                torch.tensor(label, dtype=torch.int64), n_classes
            ).double()
            n_instance, n_input = meta_X.size()
            n_learners = len(identifiers)
            fc = torch.nn.Linear(n_input, n_input // n_learners).double()
            config["lr"] = 1e-1
            optimizer = torch.optim.SGD(fc.parameters(), **config)
            max_epoch = 100
            for epoch in range(max_epoch):
                optimizer.zero_grad()
                ensemble_prediction = F.normalize(fc(meta_X), dim=0)
                loss = F.mse_loss(ensemble_prediction, meta_Y)
                loss.backward()
                optimizer.step()
            self.model = fc
            ensemble_prediction = (
                F.normalize(fc(meta_X), dim=0).detach().numpy()
            )
        else:
            STACKING_LOGGER.error(
                "Cannot parse stacking ensemble model name %s", self.model_name
            )

        return [fx.evaluate(ensemble_prediction, label) for fx in feval]
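
    # Shape walkthrough (illustrative numbers, not from the original source):
    # with 3 base models, 100 instances and 7 classes, ``predictions`` stacks
    # to a (3, 100, 7) tensor; ``transpose(0, 1)`` yields (100, 3, 7) and
    # ``flatten(start_dim=1)`` yields the (100, 21) meta-feature matrix the
    # meta-model is trained on.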

    def ensemble(self, predictions, identifiers, *args, **kwargs):
        """
        Ensemble the predictions of base models.

        Parameters
        ----------
        predictions : a list of ``np.ndarray``
            Predictions of base learners (corresponding to the elements in identifiers).

        identifiers : a list of ``str``
            The names of base models.

        Returns
        -------
        ``np.ndarray``
            The ensembled predictions.
        """
        assert len(identifiers) == len(
            set(identifiers)
        ), "Duplicate name in identifiers {} !".format(identifiers)
        assert set(self.fit_identifiers) == set(
            identifiers
        ), "Different identifiers passed in fit {} and ensemble {} !".format(
            self.fit_identifiers, identifiers
        )

        # re-order predictions to match the ordering seen during fit
        if self.fit_identifiers != identifiers:
            re_id = [
                identifiers.index(identifier) for identifier in self.fit_identifiers
            ]
            predictions = [predictions[i] for i in re_id]

        if self.model_name in ["gbm", "glm"]:
            pred_packed = (
                torch.tensor(predictions).transpose(0, 1).flatten(start_dim=1).numpy()
            )
            return self.model.predict_proba(pred_packed)
        elif self.model_name in ["nn"]:
            pred_packed = torch.tensor(predictions).transpose(0, 1).flatten(start_dim=1)
            return F.normalize(self.model(pred_packed), dim=0).detach().numpy()
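
    # Reordering example (illustrative, not from the original source): if fit
    # saw ["gcn", "gat"] and ensemble receives ["gat", "gcn"], re_id becomes
    # [1, 0], restoring the fit-time ordering of the prediction list.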

    def _re_initialize(self, identifiers, n_models):
        self.identifiers = identifiers
        self.model = None
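

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). ``_Accuracy``
# is a hypothetical stand-in for an autogl.module.train.evaluate metric; it
# only needs an ``evaluate(prediction, label)`` method for this sketch.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class _Accuracy:
        # hypothetical metric: fraction of instances whose argmax matches the label
        def evaluate(self, prediction, label):
            return float((prediction.argmax(axis=1) == np.array(label)).mean())

    rng = np.random.default_rng(0)
    n_val, n_classes = 50, 3
    labels = rng.integers(0, n_classes, size=n_val).tolist()
    # two base learners, each emitting a (50, 3) class-probability matrix
    base_predictions = [
        rng.dirichlet(np.ones(n_classes), size=n_val) for _ in range(2)
    ]

    stacker = Stacking(meta_model="glm")
    scores = stacker.fit(base_predictions, labels, ["gcn", "gat"], _Accuracy())
    print("stacker validation score:", scores)

    # blend previously unseen predictions from the same two base learners
    test_predictions = [rng.dirichlet(np.ones(n_classes), size=10) for _ in range(2)]
    print("blended test shape:", stacker.ensemble(test_predictions, ["gcn", "gat"]).shape)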