This tutorial explains how to use XGBoost for regression problems.
## Training
The following objectives are supported in XGBoost for regression:
| objective | description |
|---|---|
| `reg:squarederror` | regression with squared loss |
| `reg:logistic` | logistic regression |
Among them, `reg:squarederror` is the most widely used regression objective.
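Switching objectives is just a matter of changing the options string passed to `train_xgboost`. The following is an illustrative sketch only, assuming a hypothetical table `binary_train` whose `label` column holds values in the [0, 1] range, as `reg:logistic` expects:

```sql
-- Illustrative sketch (not part of this tutorial's dataset):
-- train with the logistic objective, assuming a hypothetical table
-- binary_train(features array<string>, label double) with labels in [0, 1]
create table xgb_logistic_model as
select
  train_xgboost(features, label, '-objective reg:logistic -num_round 10')
    as (model_id, model)
from
  binary_train;
```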
```sql
use e2006;
desc e2006tfidf_train;
```
| col_name | data_type |
|---|---|
| rowid | int |
| target | float |
| features | array&lt;string&gt; |
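Before training, it can help to peek at a row to get a feel for the data. A minimal sketch, assuming the dataset is loaded in the usual `index:weight` feature encoding:

```sql
-- Inspect one training row; each feature string is typically
-- of the form "index:weight" (an assumption about the load format)
select rowid, target, features
from e2006tfidf_train
limit 1;
```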
```sql
-- explicitly use 3 reducers
-- set mapred.reduce.tasks=3;

drop table if exists xgb_regr_model;
create table xgb_regr_model as
select
  train_xgboost(features, target, '-objective reg:squarederror -num_round 10 -num_early_stopping_rounds 3')
    as (model_id, model)
from (
  select features, target
  from e2006tfidf_train
  cluster by rand(43) -- shuffle data to reducers
) shuffled;
```
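Because the input is shuffled to reducers with `cluster by rand(43)`, each reducer trains its own model, so the model table holds one row per reducer. A quick sanity check (assuming the 3-reducer setting above took effect, this should return 3 rows):

```sql
-- Expect one model per reducer (3 rows if mapred.reduce.tasks=3)
select model_id
from xgb_regr_model;
```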
## Prediction
```sql
drop table if exists xgb_regr_predicted;
create table xgb_regr_predicted as
select
  rowid,
  avg(predicted) as predicted
from (
  select
    xgboost_predict_one(rowid, features, model_id, model) as (rowid, predicted)
  from
    xgb_regr_model l
    -- no join condition: pairs every model with every test row,
    -- and the per-model predictions are averaged per rowid below
    LEFT OUTER JOIN e2006tfidf_test r
) t
group by rowid;
```
> **Note:** `xgboost_predict` returns `double[1]` (e.g., `[-3.9760303385555744]`) for `-objective reg:squarederror`. On the other hand, `xgboost_predict_one` returns a scalar double value (e.g., `-3.9760303385555744`).
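For completeness, here is a sketch of the same prediction query using the array-returning `xgboost_predict` instead; it assumes `xgboost_predict` takes the same arguments as `xgboost_predict_one`, with the single prediction extracted via `predicted[0]`:

```sql
-- Sketch: equivalent query with the array-returning variant.
-- Assumes xgboost_predict shares xgboost_predict_one's signature.
select
  rowid,
  avg(predicted[0]) as predicted
from (
  select
    xgboost_predict(rowid, features, model_id, model) as (rowid, predicted)
  from
    xgb_regr_model l
    LEFT OUTER JOIN e2006tfidf_test r
) t
group by rowid;
```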
## Evaluation
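The query below scores the predictions with standard regression metrics. As a refresher, for actuals $y_i$, predictions $\hat{y}_i$, and $n$ test rows, the metrics computed by `rmse`, `mse`, `mae`, and `r2` are conventionally defined as:

$$
\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2,\qquad
\mathrm{RMSE} = \sqrt{\mathrm{MSE}},\qquad
\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\lvert y_i - \hat{y}_i\rvert,\qquad
R^2 = 1 - \frac{\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{n}(y_i - \bar{y})^2}
$$

where $\bar{y}$ is the mean of the actual targets.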
```sql
WITH submit as (
  select
    t.target as actual,
    p.predicted as predicted
  from
    e2006tfidf_test t
    JOIN xgb_regr_predicted p
      on (t.rowid = p.rowid)
)
select
  rmse(predicted, actual) as RMSE,
  mse(predicted, actual) as MSE,
  mae(predicted, actual) as MAE,
  r2(predicted, actual) as R2
from
  submit;
```
| RMSE | MSE | MAE | R2 |
|---|---|---|---|
| 0.3949633797136429 | 0.15599607131482326 | 0.25367043577533693 | 0.4603881976325721 |