Hivemall Random Forest supports libsvm-like sparse input vectors. This page shows a classification example on the 20 newsgroups dataset.

Note

Sparse input support in Random Forest is available in Hivemall v0.5.0 and later. The feature_hashing function is useful for preparing sparse feature vectors for Random Forest.
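
A hedged sketch of preparing such a training table is shown below. The raw_train table and its columns are assumptions for illustration; the only Hivemall function used is feature_hashing, which maps each "feature" or "feature:value" entry of an array<string> into a hashed "index[:value]" entry.

-- Sketch (assumed source table raw_train with an array<string> features column)
create table train
as
select
  feature_hashing(features) as features,  -- hash feature names into integer indices
  label
from
  raw_train;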

Training

drop table rf_model;
create table rf_model
as
select
  train_randomforest_classifier(
    features,
    convert_label(label),  -- convert -1/1 to 0/1
    '-trees 50 -seed 71'   -- hyperparameters
  )
from
  train;
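
Each row of the rf_model table created above holds one decision tree. A quick sanity check (a sketch relying only on the model_weight column, which the prediction query below also uses):

select
  count(1) as num_trees,           -- should match '-trees 50'
  avg(model_weight) as avg_weight  -- OOB-based weight per tree
from
  rf_model;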

Caution

The label must be an integer in [0, k), where k is the number of classes.
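
convert_label used in the training query is not a Hivemall built-in. A minimal sketch, assuming the -1/1 integer labels of this dataset, defines it as a temporary macro that maps them into [0, 2):

create temporary macro convert_label(label int)
  if(label > 0, 1, 0);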

Prediction

-- Only needed before v0.5.0, where the classification flag is passed via ${classification}:
-- SET hivevar:classification=true;

drop table rf_predicted;
create table rf_predicted
as
SELECT
  rowid,
  rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
  -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- to ignore the OOB-based model_weight
FROM (
  SELECT
    rowid, 
    m.model_weight,
    -- v0.5.0 and later
    tree_predict(m.model_id, m.model, t.features, "-classification") as predicted
    -- before v0.5.0
    -- tree_predict(m.model_id, m.model, t.features, ${classification}) as predicted
  FROM
    rf_model m
    LEFT OUTER JOIN -- CROSS JOIN
    test t
) t1
group by
  rowid
;
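
The predicted column of rf_predicted is a struct. To peek at a few predictions (a sketch: predicted.label is used in the evaluation below, while the probability field name is an assumption about rf_ensemble's output):

select
  rowid,
  predicted.label as label,
  predicted.probability as probability
from
  rf_predicted
limit 5;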

Evaluation

WITH submit as (
  select 
    convert_label(t.label) as actual, 
    p.predicted.label as predicted
  from 
    test t 
    JOIN rf_predicted p on (t.rowid = p.rowid)
)
select
  sum(if(actual = predicted, 1, 0)) / count(1) as accuracy
from
  submit;

0.8112489991993594
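
Beyond the single accuracy number, a per-class breakdown (a sketch reusing the same join as the evaluation query above) shows how the errors are distributed:

WITH submit as (
  select
    convert_label(t.label) as actual,
    p.predicted.label as predicted
  from
    test t
    JOIN rf_predicted p on (t.rowid = p.rowid)
)
select
  actual,
  predicted,
  count(1) as cnt
from
  submit
group by
  actual, predicted;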
