This tutorial explains how to use XGBoost for regression problems.

Training

The following objectives are supported in XGBoost for regression:

  • reg:squarederror: regression with squared loss
  • reg:logistic: logistic regression

reg:squarederror is the default objective in XGBoost and the most widely used one for regression.
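
For example, switching to logistic loss only requires changing the -objective option in the parameter string. Note that reg:logistic expects target values in the [0, 1] range; the sketch below assumes a hypothetical table scaled_train with the same schema as e2006tfidf_train but with targets rescaled to [0, 1].

-- a minimal sketch: only the -objective option changes
-- (scaled_train is a hypothetical table with targets rescaled to [0, 1])
create table xgb_logistic_model as
select
  train_xgboost(features, target, '-objective reg:logistic -num_round 10')
    as (model_id, model)
from (
  select features, target
  from scaled_train
  cluster by rand(43) -- shuffle data to reducers
) shuffled;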

use e2006;

desc e2006tfidf_train;
col_name   data_type
rowid      int
target     float
features   array<string>

-- explicitly use 3 reducers
-- set mapred.reduce.tasks=3

drop table xgb_regr_model;
create table xgb_regr_model as
select 
  train_xgboost(features, target, '-objective reg:squarederror -num_round 10 -num_early_stopping_rounds 3') 
    as (model_id, model)
from (
  select features, target
  from e2006tfidf_train
  cluster by rand(43) -- shuffle data to reducers
) shuffled;
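
Each reducer trains an independent model, so, assuming mapred.reduce.tasks is set to 3 as suggested above, the model table should contain 3 rows. A quick optional sanity check:

-- expect one row (i.e., one model) per reducer
select count(1) as num_models from xgb_regr_model;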

Prediction

drop table xgb_regr_predicted;
create table xgb_regr_predicted as
select
  rowid,
  avg(predicted) as predicted
from (
  select
    xgboost_predict_one(rowid, features, model_id, model) as (rowid, predicted)
  from
    xgb_regr_model l
    -- no join condition: an intentional cross join, so each test row
    -- is scored by every trained model (one per reducer)
    LEFT OUTER JOIN e2006tfidf_test r
) t
group by rowid;
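
Before evaluating, it can help to spot-check a few of the averaged predictions. A simple optional query:

-- inspect a handful of predictions
select rowid, predicted from xgb_regr_predicted limit 5;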

Note

For -objective reg:squarederror, xgboost_predict returns a double array with a single element (e.g., [-3.9760303385555744]), whereas xgboost_predict_one returns a scalar double value (e.g., -3.9760303385555744) as predicted.
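
If you use xgboost_predict instead of xgboost_predict_one, the scalar therefore has to be unpacked from the array before averaging. A sketch of the equivalent prediction query under that assumption:

-- with xgboost_predict, predicted is double[1], so take element 0
select
  rowid,
  avg(predicted[0]) as predicted
from (
  select
    xgboost_predict(rowid, features, model_id, model) as (rowid, predicted)
  from
    xgb_regr_model l
    -- same intentional cross join as above
    LEFT OUTER JOIN e2006tfidf_test r
) t
group by rowid;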

Evaluation

WITH submit as (
  select 
    t.target as actual, 
    p.predicted as predicted
  from 
    e2006tfidf_test t
    JOIN xgb_regr_predicted p 
      on (t.rowid = p.rowid)
)
select 
   rmse(predicted, actual) as RMSE,
   mse(predicted, actual) as MSE, 
   mae(predicted, actual) as MAE,
   r2(predicted, actual) as R2
from 
   submit;
RMSE                MSE                  MAE                  R2
0.3949633797136429  0.15599607131482326  0.25367043577533693  0.4603881976325721
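
As a consistency check, RMSE is the square root of MSE: 0.3949633797136429 squared is approximately 0.15599607, matching the reported values.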
