Hivemall generally uses model averaging (i.e., model ensemble) to create a unified prediction model. In this tutorial, we show how to apply bagging (i.e., prediction ensemble) to make predictions instead.

Training

-- set mapred.reduce.tasks=3; -- explicitly use 3 reducers

-- Train the models used for bagging and store their weights.
-- Every (feature, weight) pair emitted by train_classifier is tagged with
-- taskid() as its modelid.
-- NOTE(review): presumably each task trains its own independent model, so
-- modelid distinguishes the ensemble members -- confirm against Hivemall docs.
CREATE TABLE bagging_models AS
WITH trained AS (
  SELECT
    train_classifier(
      add_bias(features),
      label,
      '-loss logistic -opt AdamHD -reg l1 -iters 20'
    ) AS (feature, weight)
  FROM news20b_train_x3
)
SELECT
  taskid() AS modelid,
  feature,
  weight
FROM trained;

Prediction

-- Bagging prediction: score every test row under each trained model, then
-- combine the per-model scores into a single prediction per row.
create table bagging_predict
as
WITH weights as (
  -- Linear score of each test row under each model (one row per rowid/modelid).
  select
    t.rowid,
    m.modelid,
    sum(m.weight * t.value) as total_weight
  from
    news20b_test_exploded t 
    LEFT OUTER JOIN
    bagging_models m ON (t.feature = m.feature)
  group by
    t.rowid, m.modelid
),
bagging as (
  -- Ensemble step: collapse the per-model scores to one value per rowid.
  -- NOTE(review): voted_avg presumably averages the scores on the
  -- majority-vote side -- confirm against the Hivemall ensemble docs.
  select
    rowid,
    voted_avg(total_weight) as total_weight
  from 
    weights
  group by
    rowid 
)
-- `bagging` already yields exactly one row per rowid, so no GROUP BY is needed
-- here (grouping by rowid while selecting the non-aggregated total_weight is
-- rejected by Hive).
select
  rowid,
  total_weight,
  -- Note: total_weight > 0.0 is equivalent to sigmoid(total_weight) > 0.5
  -- https://en.wikipedia.org/wiki/Sigmoid_function
  case when total_weight > 0.0 then 1 else -1 end as label
from
  bagging;

Evaluation

-- Accuracy of the bagged predictions: the fraction of joined test rows whose
-- predicted label equals the actual label.
WITH scored AS (
  -- Pair each test row's actual label with its predicted label by rowid.
  SELECT
    t.label AS actual,
    p.label AS predicted
  FROM news20b_test t
  INNER JOIN bagging_predict p
    ON (t.rowid = p.rowid)
)
SELECT
  SUM(CASE WHEN actual = predicted THEN 1 ELSE 0 END) / COUNT(1) AS accuracy
FROM scored;

0.9641713370696557

results matching ""

    No results matching ""