Preparation

use kdd12track2;

-- SET mapreduce.framework.name=yarn;
-- SET hive.execution.engine=mr;
-- SET mapreduce.framework.name=yarn-tez;
-- SET hive.execution.engine=tez;
SET mapred.reduce.tasks=32; -- [optional] set the explicit number of reducers to make group-by aggregation faster

AdaGrad

drop table adagrad_model;
create table adagrad_model 
as
select 
 feature,
 avg(weight) as weight
from 
 (select 
     adagrad(features,label) as (feature,weight)
  from 
     training_orcfile
 ) t 
group by feature;

drop table adagrad_predict;
create table adagrad_predict
  ROW FORMAT DELIMITED 
    FIELDS TERMINATED BY "\t"
    LINES TERMINATED BY "\n"
  STORED AS TEXTFILE
as
select
  t.rowid, 
  sigmoid(sum(m.weight)) as prob
from 
  testing_exploded t LEFT OUTER JOIN
  adagrad_model m ON (t.feature = m.feature)
group by 
  t.rowid
order by 
  rowid ASC;

Note

sigmoid(sum(m.weight)) not sigmoid(sum(m.weight * t.value))) because t.value is always 1.0 for categorical variable.

hadoop fs -getmerge /user/hive/warehouse/kdd12track2.db/adagrad_predict adagrad_predict.tbl

gawk -F "\t" '{print $2;}' adagrad_predict.tbl > adagrad_predict.submit

pypy scoreKDD.py KDD_Track2_solution.csv adagrad_predict.submit
Algorithm AUC
SGD 0.739351
ADAGRAD 0.743279

AdaDelta

drop table adadelta_model;
create table adadelta_model 
as
select 
 feature,
 cast(avg(weight) as float) as weight
from 
 (select 
     adadelta(features,label) as (feature,weight)
  from 
     training_orcfile
 ) t 
group by feature;

drop table adadelta_predict;
create table adadelta_predict
  ROW FORMAT DELIMITED 
    FIELDS TERMINATED BY "\t"
    LINES TERMINATED BY "\n"
  STORED AS TEXTFILE
as
select
  t.rowid, 
  sigmoid(sum(m.weight)) as prob
from 
  testing_exploded  t LEFT OUTER JOIN
  adadelta_model m ON (t.feature = m.feature)
group by 
  t.rowid
order by 
  rowid ASC;
hadoop fs -getmerge /user/hive/warehouse/kdd12track2.db/adadelta_predict adadelta_predict.tbl

gawk -F "\t" '{print $2;}' adadelta_predict.tbl > adadelta_predict.submit

pypy scoreKDD.py KDD_Track2_solution.csv adadelta_predict.submit
Algorithm AUC
SGD 0.739351
ADAGRAD 0.743279
AdaDelta 0.746878

results matching ""

    No results matching ""