Get the dataset from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#webspam
Putting data on HDFS
hadoop fs -mkdir -p /dataset/webspam/raw
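# conv.awk is assumed to turn each LIBSVM record "<label> <index>:<value> ..." into a
# tab-separated line "<rowid>\t<label>\t<index>:<value>,<index>:<value>,..." matching
# the webspam_raw table definition below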
awk -f conv.awk webspam_wc_normalized_trigram.svm | \
hadoop fs -put - /dataset/webspam/raw/webspam_wc_normalized_trigram.svm
Table preparation
create database webspam;
use webspam;
create external table webspam_raw (
rowid int,
label int,
features ARRAY<STRING>
) ROW FORMAT
DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE LOCATION '/dataset/webspam/raw';
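A quick sanity check such as the following confirms that rows are parsed into the expected columns:
select rowid, label, size(features) as num_features
from webspam_raw
limit 5;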
set hive.sample.seednumber=43;
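-- TABLESAMPLE(n ROWS) takes the first n rows from each input split; CLUSTER BY rand(43)
-- then shuffles the sampled rows and LIMIT keeps 70,000 of them as the held-out test set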
create table webspam_test
as
select * from webspam_raw TABLESAMPLE(1000 ROWS) s
CLUSTER BY rand(43)
limit 70000;
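Because of the limit clause, webspam_test should contain 70,000 rows, which can be verified with:
select count(1) from webspam_test;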
Make auxiliary tables
create table webspam_train_orcfile (
rowid int,
label int,
features array<string>
) STORED AS orc tblproperties ("orc.compress"="SNAPPY");
-- SET mapred.reduce.tasks=128;
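-- build the training table: skip rows already sampled into webspam_test, append a bias
-- feature to each feature vector with add_bias(), and shuffle the rows across reducers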
INSERT OVERWRITE TABLE webspam_train_orcfile
select
s.rowid,
label,
add_bias(features) as features
from webspam_raw s
where not exists (select rowid from webspam_test t where s.rowid = t.rowid)
CLUSTER BY rand(43);
-- SET mapred.reduce.tasks=-1;
set hivevar:xtimes=3;
set hivevar:shufflebuffersize=100;
set hivemall.amplify.seed=32;
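-- rand_amplify() duplicates each input row ${xtimes} times and performs a map-local
-- shuffle using a buffer of ${shufflebuffersize} rows (see the caution below)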
create or replace view webspam_train_x3
as
select
rand_amplify(${xtimes}, ${shufflebuffersize}, *) as (rowid, label, features)
from
webspam_train_orcfile;
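-- flatten the test set into one row per feature: add_bias() appends the constant bias
-- feature, explode() emits each "index:value" string as a separate row, and split()
-- separates the feature index from its float value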
create table webspam_test_exploded as
select
rowid,
label,
split(feature,":")[0] as feature,
cast(split(feature,":")[1] as float) as value
from
webspam_test LATERAL VIEW explode(add_bias(features)) t AS feature;
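Each row of webspam_test_exploded now holds a single (feature, value) pair of one test example; a few rows can be inspected with:
select rowid, label, feature, value
from webspam_test_exploded
limit 10;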
Caution: Use a small shufflebuffersize for this dataset. (xtimes * shufflebuffersize) training examples are cached in memory, and each training example here carries a large number of features, so a large buffer can easily exhaust memory. With the settings above, 3 * 100 = 300 examples are buffered at a time.