Data preparation
create database iris;
use iris;
create external table raw (
sepal_length int,
sepal_width int,
petal_length int,
petak_width int,
class string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE LOCATION '/dataset/iris/raw';
$ sed '/^$/d' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data
drop table label_mapping;
create table label_mapping
as
select
class,
rank - 1 as label -- zero start index
from (
select
distinct class,
dense_rank() over (order by class) as rank
from
raw
) t;
drop table xgb_input;
create table xgb_input
as
select
rowid() as rowid,
array(sepal_length, sepal_width, petal_length, petal_width) as dense_features,
indexed_features(sepal_length, sepal_width, petal_length, petal_width) as sparse_features,
t2.label
from
raw t1
JOIN label_mapping t2 ON (t1.class = t2.class)
;
select * from xgb_input limit 3;
xgb_input.rowid | xgb_input.dense_features | xgb_input.sparse_features | xgb_input.label |
---|---|---|---|
1-1 | [5,3,1,0] | ["1:5","2:3","3:1","4:0"] | 0 |
1-2 | [4,3,1,0] | ["1:4","2:3","3:1","4:0"] | 0 |
1-3 | [4,3,1,0] | ["1:4","2:3","3:1","4:0"] | 0 |
Training
-- explicitly use 3 reducers
-- set mapred.reduce.tasks=3;
drop table xgb_softmax_model;
create table xgb_softmax_model
as
select
train_xgboost(features, label, '-objective multi:softmax -num_class 3 -num_round 10 -num_early_stopping_rounds 3')
as (model_id, model)
from (
select
-- both sparse and dense format is supported
dense_features as features, label
-- sparse_features as features, label
from
xgb_input
cluster by rand(43) -- shuffle
) shuffled;
Caution
-num_class
is required for multiclass objectives.
Note both sparse and dense vector is supported for feature vector format.
Prediction
drop table xgb_softmax_predicted;
create table xgb_softmax_predicted as
select
rowid,
majority_vote(cast(predicted as int)) as label
from (
select
xgboost_predict_one(rowid, dense_features, model_id, model) as (rowid, predicted)
-- xgboost_predict_one(rowid, sparse_features, model_id, model) as (rowid, predicted)
from
xgb_softmax_model l
LEFT OUTER JOIN xgb_input r
) t
group by rowid;
Evaluation
WITH validate as (
select
t.label as actual,
p.label as predicted
from
xgb_input t
JOIN xgb_softmax_predicted p
on (t.rowid = p.rowid)
)
select
sum(if(actual=predicted,1.0,0.0))/count(1)
from
validate;
0.9533333333333333333