Preparation
-- Switch to the kdd12track2 database used by the statements that follow.
USE kdd12track2;

-- Optional execution-engine settings, left disabled:
-- MapReduce on YARN
-- SET mapreduce.framework.name=yarn;
-- SET hive.execution.engine=mr;
-- Tez on YARN
-- SET mapreduce.framework.name=yarn-tez;
-- SET hive.execution.engine=tez;

-- [optional] set the explicit number of reducers to make group-by aggregation faster
SET mapred.reduce.tasks=32;
AdaGrad
-- Build adagrad_model: one averaged weight per feature.
-- The inner query runs the adagrad(features, label) table function over
-- training_orcfile and emits (feature, weight) rows. A feature can presumably
-- appear more than once when training runs in several parallel tasks, so the
-- outer query averages the weights per feature. The avg() result is stored
-- as-is, without a cast.
-- IF EXISTS keeps the script re-runnable even when
-- hive.exec.drop.ignorenonexistent is disabled.
DROP TABLE IF EXISTS adagrad_model;

CREATE TABLE adagrad_model
AS
SELECT
    feature,
    avg(weight) AS weight
FROM (
    SELECT
        adagrad(features, label) AS (feature, weight)
    FROM
        training_orcfile
) trained
GROUP BY
    feature;
-- Build adagrad_predict: one (rowid, prob) line per test row, stored as a
-- tab-separated text file so it can be exported from the warehouse directory.
-- prob is sigmoid of the summed model weights of the features of the row.
-- The LEFT OUTER JOIN keeps test features that have no entry in
-- adagrad_model and sum() skips their NULL weights.
-- NOTE(review): a row whose features are all missing from the model yields a
-- NULL sum - confirm how sigmoid() and the scoring step treat that case.
-- The result is sorted by rowid, presumably so the exported lines follow the
-- row order expected by the scoring step - TODO confirm.
-- IF EXISTS keeps the script re-runnable even when
-- hive.exec.drop.ignorenonexistent is disabled.
DROP TABLE IF EXISTS adagrad_predict;

CREATE TABLE adagrad_predict
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY "\t"
        LINES TERMINATED BY "\n"
    STORED AS TEXTFILE
AS
SELECT
    t.rowid,
    sigmoid(sum(m.weight)) AS prob
FROM
    testing_exploded t
    LEFT OUTER JOIN adagrad_model m
        ON (t.feature = m.feature)
GROUP BY
    t.rowid
ORDER BY
    rowid ASC;
Note
The prediction uses sigmoid(sum(m.weight)),
not sigmoid(sum(m.weight * t.value)),
because t.value is always 1.0 for categorical variables.
# Merge the warehouse files of adagrad_predict into one local file.
hadoop fs -getmerge /user/hive/warehouse/kdd12track2.db/adagrad_predict adagrad_predict.tbl
# Keep only the second tab-separated column (prob) as the submission file.
gawk -F "\t" '{ print $2 }' adagrad_predict.tbl > adagrad_predict.submit
# Score the submission against the solution file with scoreKDD.py.
pypy scoreKDD.py KDD_Track2_solution.csv adagrad_predict.submit
Algorithm | AUC |
---|---|
SGD | 0.739351 |
ADAGRAD | 0.743279 |
AdaDelta
-- Build adadelta_model: one averaged weight per feature, stored as float.
-- The inner query runs the adadelta(features, label) table function over
-- training_orcfile and emits (feature, weight) rows. A feature can presumably
-- appear more than once when training runs in several parallel tasks, so the
-- outer query averages the weights per feature and casts the average to float.
-- IF EXISTS keeps the script re-runnable even when
-- hive.exec.drop.ignorenonexistent is disabled.
DROP TABLE IF EXISTS adadelta_model;

CREATE TABLE adadelta_model
AS
SELECT
    feature,
    cast(avg(weight) AS float) AS weight
FROM (
    SELECT
        adadelta(features, label) AS (feature, weight)
    FROM
        training_orcfile
) trained
GROUP BY
    feature;
-- Build adadelta_predict: one (rowid, prob) line per test row, stored as a
-- tab-separated text file so it can be exported from the warehouse directory.
-- prob is sigmoid of the summed model weights of the features of the row.
-- The LEFT OUTER JOIN keeps test features that have no entry in
-- adadelta_model and sum() skips their NULL weights.
-- NOTE(review): a row whose features are all missing from the model yields a
-- NULL sum - confirm how sigmoid() and the scoring step treat that case.
-- The result is sorted by rowid, presumably so the exported lines follow the
-- row order expected by the scoring step - TODO confirm.
-- IF EXISTS keeps the script re-runnable even when
-- hive.exec.drop.ignorenonexistent is disabled.
DROP TABLE IF EXISTS adadelta_predict;

CREATE TABLE adadelta_predict
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY "\t"
        LINES TERMINATED BY "\n"
    STORED AS TEXTFILE
AS
SELECT
    t.rowid,
    sigmoid(sum(m.weight)) AS prob
FROM
    testing_exploded t
    LEFT OUTER JOIN adadelta_model m
        ON (t.feature = m.feature)
GROUP BY
    t.rowid
ORDER BY
    rowid ASC;
# Merge the warehouse files of adadelta_predict into one local file.
hadoop fs -getmerge /user/hive/warehouse/kdd12track2.db/adadelta_predict adadelta_predict.tbl
# Keep only the second tab-separated column (prob) as the submission file.
gawk -F "\t" '{ print $2 }' adadelta_predict.tbl > adadelta_predict.submit
# Score the submission against the solution file with scoreKDD.py.
pypy scoreKDD.py KDD_Track2_solution.csv adadelta_predict.submit
Algorithm | AUC |
---|---|
SGD | 0.739351 |
ADAGRAD | 0.743279 |
AdaDelta | 0.746878 |