This pages shows an example of applying logistic regression for a9a binary classification task.
Caution
logloss()
became deprecated since v0.5.0 release. Use smarter general classifier instead.
UDF preparation
select count(1) from a9atrain;
-- set total_steps ideally be "count(1) / #map tasks"
set hivevar:total_steps=32561;
select count(1) from a9atest;
set hivevar:num_test_instances=16281;
training
create table a9a_model1
as
select
cast(feature as int) as feature,
avg(weight) as weight
from
(select
logress(add_bias(features),label,"-total_steps ${total_steps}") as (feature,weight)
from
a9atrain
) t
group by feature;
Note
-total_steps
option is optional for logress() function. We recommend you NOT to use options (e.g., total_steps
and eta0
) if you are not familiar with those options. Hivemall then uses an autonomic ETA (learning rate) estimator.
prediction
create or replace view a9a_predict1
as
WITH a9atest_exploded as (
select
rowid,
label,
extract_feature(feature) as feature,
extract_weight(feature) as value
from
a9atest LATERAL VIEW explode(add_bias(features)) t AS feature
)
select
t.rowid,
sigmoid(sum(m.weight * t.value)) as prob,
CAST((case when sigmoid(sum(m.weight * t.value)) >= 0.5 then 1.0 else 0.0 end) as FLOAT) as label
from
a9atest_exploded t LEFT OUTER JOIN
a9a_model1 m ON (t.feature = m.feature)
group by
t.rowid;
evaluation
create or replace view a9a_submit1 as
select
t.label as actual,
pd.label as predicted,
pd.prob as probability
from
a9atest t JOIN a9a_predict1 pd
on (t.rowid = pd.rowid);
select count(1) / ${num_test_instances} from a9a_submit1
where actual == predicted;
0.8430071862907684