This page shows an example of applying logistic regression to the a9a binary classification task.

Caution

logress() has been deprecated since the v0.5.0 release. Use the smarter general classifier instead.

UDF preparation

-- Count the training rows. The total_steps literal below is presumably taken
-- from this query's output; re-check it if a9atrain changes.
SELECT COUNT(1) FROM a9atrain;
-- Ideally, total_steps should be "count(1) / #map tasks".
SET hivevar:total_steps=32561;

-- Count the test rows. The num_test_instances literal below is presumably taken
-- from this query's output; re-check it if a9atest changes.
SELECT COUNT(1) FROM a9atest;
SET hivevar:num_test_instances=16281;

training

-- Build the model table: one row per feature with its averaged weight.
-- logress() is applied to every a9atrain row (with a bias term added by
-- add_bias) and emits (feature, weight) pairs; pairs that share a feature are
-- combined with AVG(). Requires the hivevar total_steps to be set beforehand.
CREATE TABLE a9a_model1
AS
WITH trained AS (
  SELECT
    logress(add_bias(features), label, "-total_steps ${total_steps}") AS (feature, weight)
  FROM
    a9atrain
)
SELECT
  CAST(trained.feature AS INT) AS feature,
  AVG(trained.weight) AS weight
FROM
  trained
GROUP BY
  trained.feature;

Note

The -total_steps option is optional for the logress() function. We recommend that you NOT use options (e.g., total_steps and eta0) unless you are familiar with them. When they are omitted, Hivemall uses an autonomic ETA (learning rate) estimator.

prediction

-- Prediction view: for every test row, the predicted probability (prob) and
-- the 0/1 label obtained by thresholding that probability at 0.5.
CREATE OR REPLACE VIEW a9a_predict1
AS
WITH a9atest_exploded AS (
  -- One row per (test row, feature entry): each element of the bias-augmented
  -- feature array is split into its feature key and its value.
  SELECT
    rowid,
    label,
    extract_feature(fv) AS feature,
    extract_weight(fv) AS value
  FROM
    a9atest
    LATERAL VIEW explode(add_bias(features)) exploded AS fv
),
scored AS (
  -- Weighted sum of feature values per test row, squashed by sigmoid().
  -- The left join keeps features that have no row in the model; their NULL
  -- products are skipped by SUM().
  SELECT
    x.rowid,
    sigmoid(SUM(m.weight * x.value)) AS prob
  FROM
    a9atest_exploded x
    LEFT JOIN a9a_model1 m
      ON (x.feature = m.feature)
  GROUP BY
    x.rowid
)
SELECT
  rowid,
  prob,
  CAST((CASE WHEN prob >= 0.5 THEN 1.0 ELSE 0.0 END) AS FLOAT) AS label
FROM
  scored;

evaluation

-- Pair each test row's actual label with the predicted label and probability.
CREATE OR REPLACE VIEW a9a_submit1 AS
SELECT
  truth.label AS actual,
  pred.label AS predicted,
  pred.prob AS probability
FROM
  a9atest truth
  INNER JOIN a9a_predict1 pred
    ON (truth.rowid = pred.rowid);

-- Accuracy: number of correctly predicted rows divided by the test-set size.
-- Requires the hivevar num_test_instances to be set beforehand.
SELECT
  COUNT(1) / ${num_test_instances}
FROM
  a9a_submit1
WHERE
  actual = predicted;

0.8430071862907684

results matching ""

    No results matching ""