from __future__ import absolute_import

import random
import warnings

# Silence any warnings raised while importing xgboost.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    import xgboost as xgb

from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
class XGBoostClassifier(BaseEstimator, ClassifierMixin):
""" A simple wrapper around XGBoost
More details: https://github.com/dmlc/xgboost/wiki/Parameters
Parameters
----------
base_estimator : string
Can be 'gbtree' or 'gblinear'
- 'gbtree' : classification
- 'gblinear' : regression
gamma : float
minimum loss reduction required to make a partition, higher values mean more conservative boosting
max_depth : int
maximum depth of a tree
min_child_weight : int
larger values mean more conservative partitioning
objective : string
Specify the learning task and the corresponding learning objective or a custom objective function to be used
- 'reg:linear' : linear regression
- 'reg:logistic' : logistic regression
- 'binary:logistic' : binary logistic regression
- 'binary:logitraw' - binary logistic regression before logistic transformation
- 'multi:softmax' : multiclass classification
- 'multi:softprob' : multiclass classification with class probability output
- 'rank:pairwise' : pairwise minimize loss
metric : string
Evaluation metrics:
- 'rmse' - root mean square error
- 'logloss' - negative log likelihood
- 'error' - binary classification error rate
- 'merror' - multiclass error rate
- 'mlogloss' - multiclass logloss
- 'auc' - area under the curve for ranking evaluation
- 'ndcg' - normalized discounted cumulative gain ndcg@n for top n eval
- 'map' - mean average precision map@n for top n eval
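
    Examples
    --------
    A minimal usage sketch (illustrative only; the synthetic data and the
    3-class setup below are hypothetical):

    >>> import numpy as np
    >>> X = np.random.rand(30, 4)
    >>> y = np.random.randint(0, 3, size=30)
    >>> clf = XGBoostClassifier(num_classes=3, n_iter=10, verbose_eval=False)
    >>> clf.fit(X, y)                 # doctest: +SKIP
    >>> probs = clf.predict_proba(X)  # doctest: +SKIP
    >>> labels = clf.predict(X)       # doctest: +SKIP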
"""
    def __init__(self,
                 base_estimator='gbtree',
                 objective='multi:softprob',
                 metric='mlogloss',
                 num_classes=9,
                 learning_rate=0.25,
                 max_depth=10,
                 max_samples=1.0,
                 max_features=1.0,
                 max_delta_step=0,
                 min_child_weight=4,
                 min_loss_reduction=1,
                 l1_weight=0.0,
                 l2_weight=0.0,
                 l2_on_bias=False,
                 gamma=0.02,
                 initial_bias=0.5,
                 random_state=None,
                 watchlist=None,
                 n_jobs=4,
                 n_iter=150,
                 silent=1,
                 verbose_eval=True):
        if random_state is None:
            random_state = random.randint(0, 1000000)
        # Map the sklearn-style constructor arguments onto XGBoost's
        # native parameter names.
        param = {
            'silent': silent,
            'verbose': 0,
            'use_buffer': True,
            'base_score': initial_bias,
            'nthread': n_jobs,
            'booster': base_estimator,
            'eta': learning_rate,
            'gamma': gamma,
            'max_depth': max_depth,
            'max_delta_step': max_delta_step,
            'min_child_weight': min_child_weight,
            'min_loss_reduction': min_loss_reduction,
            'subsample': max_samples,
            'colsample_bytree': max_features,
            'alpha': l1_weight,
            'lambda': l2_weight,
            'lambda_bias': l2_on_bias,
            'objective': objective,
            'eval_metric': metric,
            'seed': random_state,
            'num_class': num_classes,
            'verbose_eval': verbose_eval
        }
        self.param = param
        self.wl = watchlist if watchlist else []
        self.n_iter = n_iter
    def fit(self, X, y=None):
        """Fit the model on training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}
            Training data of shape (n_samples, n_features).
        y : numpy array
            Target values of shape (n_samples,).

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        self.booster_ = None
        X = self._convert(X, y)
        if self.wl:
            # Evaluate on the training set plus every (data, label) pair
            # supplied through the watchlist.
            wl = [(X, 'train')]
            for i, ent in enumerate(self.wl):
                ent, lbl = ent
                wl.append((self._convert(ent, lbl), 'test-' + str(i)))
            self.booster_ = xgb.train(self.param, X, self.n_iter, wl,
                                      verbose_eval=self.param["verbose_eval"])
        else:
            self.booster_ = xgb.train(self.param, X, self.n_iter,
                                      [(X, 'train')],
                                      verbose_eval=self.param["verbose_eval"])
        return self
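
    # Watchlist usage sketch (the validation arrays here are hypothetical):
    # extra evaluation sets are passed as (data, label) tuples at
    # construction time and reported as 'test-0', 'test-1', ... during fit:
    #
    #   clf = XGBoostClassifier(watchlist=[(X_val, y_val)])
    #   clf.fit(X_train, y_train)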
    def predict_proba(self, X):
        """Return per-class probability estimates for X."""
        import scipy.sparse
        # Convert CSR input to CSC, which older XGBoost versions handle
        # more reliably.
        if isinstance(X, scipy.sparse.csr_matrix):
            X = scipy.sparse.csc_matrix(X)
        X = xgb.DMatrix(X)
        return self.booster_.predict(X)
    def _convert(self, X, y=None):
        """Convert data (and optional labels) into an xgb.DMatrix."""
        if y is None:
            if isinstance(X, xgb.DMatrix):
                return X
            if hasattr(X, 'values'):
                return xgb.DMatrix(X.values)
            return xgb.DMatrix(X)
        else:
            # Accept pandas objects and plain arrays for X and y
            # independently.
            if hasattr(X, 'values'):
                X = X.values
            if hasattr(y, 'values'):
                y = y.values
            return xgb.DMatrix(X, y, missing=np.nan)
    def predict(self, X):
        X = self._convert(X)
        probs = self.booster_.predict(X)
        # With 'multi:softprob' the booster returns one probability per
        # class; the predicted label is the most probable class.
        return np.argmax(probs, axis=1)
    def get_params(self, deep=False):
        params = {
            'base_estimator': self.param['booster'],
            'objective': self.param['objective'],
            'metric': self.param['eval_metric'],
            'num_classes': self.param['num_class'],
            'learning_rate': self.param['eta'],
            'max_depth': self.param['max_depth'],
            'max_samples': self.param['subsample'],
            'max_features': self.param['colsample_bytree'],
            'max_delta_step': self.param['max_delta_step'],
            'min_child_weight': self.param['min_child_weight'],
            'min_loss_reduction': self.param['min_loss_reduction'],
            'l1_weight': self.param['alpha'],
            'l2_weight': self.param['lambda'],
            'l2_on_bias': self.param['lambda_bias'],
            'gamma': self.param['gamma'],
            'initial_bias': self.param['base_score'],
            'random_state': self.param['seed'],
            'watchlist': self.wl,
            'n_jobs': self.param['nthread'],
            'n_iter': self.n_iter}
        return params
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
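

# A minimal, self-contained usage sketch (illustrative only: the synthetic
# dataset below is hypothetical and the hyperparameters are not tuned).
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X_train = rng.rand(200, 5)
    y_train = rng.randint(0, 3, size=200)

    clf = XGBoostClassifier(num_classes=3, n_iter=20,
                            learning_rate=0.1, verbose_eval=False)
    clf.fit(X_train, y_train)

    probs = clf.predict_proba(X_train)   # shape: (200, 3)
    preds = clf.predict(X_train)         # class indices in {0, 1, 2}
    print(probs.shape, preds[:10])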