https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#kdd2010 (bridge to algebra)
- the number of classes: 2
- the number of examples: 19,264,097 (training) / 748,401 (testing)
- the number of features: 29,890,095 (training) / 29,890,095 (testing)
Define training/testing tables
-- Create the working database used by every statement in this tutorial.
-- IF NOT EXISTS keeps the script re-runnable: without it a second run fails
-- because the database is already present.
CREATE DATABASE IF NOT EXISTS kdd2010;
-- Make kdd2010 the current database for the statements that follow.
USE kdd2010;
-- Training set, declared as an external table over plain text files.
-- Row layout: rowid <TAB> label <TAB> comma-separated list of feature strings.
-- IF NOT EXISTS makes the statement safe to re-run. An existing table with
-- this name is left untouched, so drop it first if the schema must change.
CREATE EXTERNAL TABLE IF NOT EXISTS kdd10b_train (
    rowid    INT,
    label    INT,
    features ARRAY<STRING>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/kdd10b/train';
-- Test set, declared as an external table over plain text files.
-- Same column layout and delimiters as kdd10b_train:
-- rowid <TAB> label <TAB> comma-separated list of feature strings.
-- IF NOT EXISTS makes the statement safe to re-run. An existing table with
-- this name is left untouched, so drop it first if the schema must change.
CREATE EXTERNAL TABLE IF NOT EXISTS kdd10b_test (
    rowid    INT,
    label    INT,
    features ARRAY<STRING>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/kdd10b/test';
Putting data into HDFS
# Run conv.awk over the file kddb and stream the output straight into HDFS.
# The "-" source makes "hadoop fs -put" read from standard input. The target
# path sits under the LOCATION declared for the kdd10b_train table.
# NOTE(review): conv.awk is not shown here - presumably it emits the
# tab-delimited rowid/label/features layout the tables expect. Confirm.
awk -f conv.awk kddb | hadoop fs -put - /dataset/kdd10b/train/kddb
# Same conversion for kddb.t, stored under the LOCATION of kdd10b_test.
awk -f conv.awk kddb.t | hadoop fs -put - /dataset/kdd10b/test/kddb.t
Make auxiliary tables
-- Flatten the test set to one row per (rowid, feature) pair.
-- The array returned by add_bias(features) is exploded into single strings.
-- Each string is split on the colon: part 0 becomes the feature name and
-- part 1 is cast to float as its value.
-- NOTE(review): add_bias is an external UDF - presumably it appends a bias
-- entry to the feature array. Confirm against the Hivemall documentation.
CREATE TABLE kdd10b_test_exploded AS
SELECT
    rowid,
    label,
    split(feature_pair, ":")[0] AS feature,
    CAST(split(feature_pair, ":")[1] AS FLOAT) AS value
FROM kdd10b_test
LATERAL VIEW explode(add_bias(features)) exploded_features AS feature_pair;
-- Tunables for the amplified training view below.
-- Hive substitutes the variables into the statement text before compiling it,
-- so the view is defined with the values in effect when it is created.
-- Changing a variable later requires re-creating the view.
SET hivevar:xtimes=3;
SET hivevar:shufflebuffersize=1000;

-- View over kdd10b_train that passes every column through rand_amplify and
-- re-exposes the result as (rowid, label, features).
-- NOTE(review): rand_amplify is an external UDTF - presumably it emits each
-- input row xtimes times and shuffles rows through a buffer of
-- shufflebuffersize entries. Confirm against the Hivemall documentation.
-- The view name hard-codes x3, so rename it if xtimes is changed.
CREATE OR REPLACE VIEW kdd10b_train_x3 AS
SELECT
    rand_amplify(${xtimes}, ${shufflebuffersize}, *) AS (rowid, label, features)
FROM kdd10b_train;