https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#kdd2010 (algebra)
- Number of classes: 2
- Number of instances: 8,407,752 (training) / 510,302 (testing)
- Number of features: 20,216,830 (training, about 2.73 GB) / 20,216,830 (testing)
Define training/testing tables
-- Create a dedicated database for the KDD2010 (algebra) tables and switch to it.
CREATE DATABASE kdd2010;
USE kdd2010;
-- Raw training data, read from the text files stored under /dataset/kdd10a/train.
-- Each line holds rowid <TAB> label <TAB> comma-separated feature strings.
CREATE EXTERNAL TABLE kdd10a_train (
  rowid    INT,
  label    INT,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/kdd10a/train';
-- Raw test data, read from the text files stored under /dataset/kdd10a/test.
-- Same line layout as the training table: rowid <TAB> label <TAB> comma-separated feature strings.
CREATE EXTERNAL TABLE kdd10a_test (
  rowid    INT,
  label    INT,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/kdd10a/test';
Putting data into HDFS
# Run each downloaded file through conv.awk and stream the result into the HDFS
# directory backing the matching external table (the "-" source makes put read stdin).
# NOTE(review): conv.awk is not shown here; presumably it rewrites each row into the
# tab/comma-delimited layout the tables declare - confirm against the script.
awk -f conv.awk kdda |
  hadoop fs -put - /dataset/kdd10a/train/kdda
awk -f conv.awk kdda.t |
  hadoop fs -put - /dataset/kdd10a/test/kdda.t
Make auxiliary tables
-- ORC-backed copy of the training data, compressed with Snappy.
-- rowid is declared BIGINT here, while the external text table declares it INT.
CREATE TABLE kdd10a_train_orcfile (
  rowid    BIGINT,
  label    INT,
  features ARRAY<STRING>
)
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");
-- Copy the raw training rows into the ORC table in random order.
-- CLUSTER BY rand() distributes and sorts the rows by a random key, which shuffles them.
-- The columns are listed explicitly because INSERT maps the SELECT output onto the
-- target columns by position, so a later change to kdd10a_train cannot silently misalign them.
-- Optional: enable the two SET lines to pin the reducer count for this job
-- and then restore the default of -1 afterwards.
-- SET mapred.reduce.tasks=64;
INSERT OVERWRITE TABLE kdd10a_train_orcfile
SELECT
  rowid,
  label,
  features
FROM kdd10a_train
CLUSTER BY rand();
-- SET mapred.reduce.tasks=-1;
-- Test set flattened to one row per (rowid, feature) pair.
-- add_bias(features) is applied first, then the array is exploded and each
-- name:value string is split on the colon into a feature name and a float value.
CREATE TABLE kdd10a_test_exploded AS
SELECT
  rowid,
  label,
  split(fv, ":")[0]                AS feature,
  CAST(split(fv, ":")[1] AS FLOAT) AS value
FROM kdd10a_test
LATERAL VIEW explode(add_bias(features)) exploded_features AS fv;
-- Parameters for the kdd10a_train_x3 view defined next.
-- xtimes is passed as the first argument of rand_amplify and shufflebuffersize as the second.
set hivevar:xtimes=3;
set hivevar:shufflebuffersize=1000;
-- Optional seed setting, left disabled.
-- NOTE(review): presumably makes the amplify shuffle reproducible - confirm in the Hivemall docs.
-- set hivemall.amplify.seed=32;
-- Amplified training view over kdd10a_train_orcfile.
-- rand_amplify receives the xtimes and shufflebuffersize hivevars followed by every
-- column of the source table, and its output is exposed as (rowid, label, features).
-- NOTE(review): Hive substitutes the hivevar placeholders when the statement is submitted,
-- so both variables must be set before this runs - confirm against the Hive docs.
CREATE OR REPLACE VIEW kdd10a_train_x3 AS
SELECT
  rand_amplify(${xtimes}, ${shufflebuffersize}, *) AS (rowid, label, features)
FROM kdd10a_train_orcfile;