Hivemall Random Forest supports libsvm-like sparse inputs. This page shows a classification example on the 20 newsgroups dataset.
Note
Sparse input support in Random Forest is available in Hivemall v0.5.0 or later.
The feature_hashing function is useful for preparing feature vectors for Random Forest.
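As a minimal sketch of that preparation step, the query below hashes libsvm-style "index:value" features into a bounded index space. The source table news20_train and its columns are assumed names, not part of this example; adapt them to however the raw 20 newsgroups data is loaded.

-- Hypothetical preparation step: build the "train" table used below.
-- news20_train(rowid, label, features) is an assumed raw table in libsvm-like form.
create table train
as
select
  rowid,
  label,
  feature_hashing(features) as features -- array<string> of "index:value" entries
from
  news20_train;

The same transformation would be applied to build the test table.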
Training
drop table rf_model;
create table rf_model
as
select
  train_randomforest_classifier(
    features,
    convert_label(label), -- convert -1/1 to 0/1
    '-trees 50 -seed 71' -- hyperparameters
  )
from
  train;
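convert_label is not a Hivemall built-in; one way to define it is as a temporary Hive macro. The definition below is a sketch assuming binary -1/1 labels and may differ from the helper used in your environment.

-- Assumed helper (sketch): map a -1/1 label to 0/1
create temporary macro convert_label(label int)
  if(label = -1, 0, label);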
Caution
The label must be in [0, k), where k is the number of classes.
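Before moving on to prediction, it can be worth sanity-checking the model table; with '-trees 50' it should typically contain one row per tree. Only the columns that the prediction query below also references are selected here; the exact schema depends on the Hivemall version.

-- Sanity check on the trained ensemble (columns also used by the prediction query)
select
  model_id,
  model_weight
from
  rf_model
limit 5;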
Prediction
-- SET hivevar:classification=true;
drop table rf_predicted;
create table rf_predicted
as
SELECT
  rowid,
  rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
  -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
FROM (
  SELECT
    rowid,
    m.model_weight,
    -- v0.5.0 and later
    tree_predict(m.model_id, m.model, t.features, "-classification") as predicted
    -- before v0.5.0
    -- tree_predict(m.model_id, m.model, t.features, ${classification}) as predicted
  FROM
    rf_model m
    LEFT OUTER JOIN -- CROSS JOIN
    test t
) t1
group by
  rowid
;
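The rf_ensemble aggregate returns one struct per rowid; its label field, used in the evaluation below, holds the ensembled class. A quick look at a few rows can be done like this:

-- Peek at a few ensembled predictions; predicted.label is the predicted class
select
  rowid,
  predicted.label
from
  rf_predicted
limit 10;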
Evaluation
WITH submit as (
  select
    convert_label(t.label) as actual,
    p.predicted.label as predicted
  from
    test t
    JOIN rf_predicted p on (t.rowid = p.rowid)
)
select
  sum(if(actual = predicted, 1, 0)) / count(1) as accuracy
from
  submit;
0.8112489991993594