Prerequisite
The E2006-tfidf regression dataset files (E2006.train and E2006.test, in LIBSVM format) and the conv.awk conversion script used in the data preparation step below.
Data preparation
cd /mnt/archive/datasets/regression/E2006-tfidf

# Convert the LIBSVM-format files into TSV (rowid, target, comma-separated features)
awk -f conv.awk E2006.train > E2006.train.tsv
awk -f conv.awk E2006.test > E2006.test.tsv

# Upload the converted files to HDFS
hadoop fs -mkdir -p /dataset/E2006-tfidf/train
hadoop fs -mkdir -p /dataset/E2006-tfidf/test
hadoop fs -put E2006.train.tsv /dataset/E2006-tfidf/train
hadoop fs -put E2006.test.tsv /dataset/E2006-tfidf/test
create database E2006;
use E2006;
create external table e2006tfidf_train (
  rowid int,
  target float,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE LOCATION '/dataset/E2006-tfidf/train';

create external table e2006tfidf_test (
  rowid int,
  target float,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE LOCATION '/dataset/E2006-tfidf/test';
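As a quick sanity check that the external tables can read the uploaded TSV files, a query along the following lines can be run (an assumed ad-hoc check, not part of the original walkthrough):

select rowid, target, size(features) as num_features
from e2006tfidf_train
limit 3;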
create table e2006tfidf_test_exploded as
select
  rowid,
  target,
  extract_feature(feature) as feature,
  extract_weight(feature) as value
from
  e2006tfidf_test LATERAL VIEW explode(add_bias(features)) t AS feature;
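Each test example is flattened into one row per feature: add_bias appends a constant bias feature to the feature vector, explode emits one row per array element, and extract_feature / extract_weight split each "index:weight" string into its feature name and numeric weight. A minimal way to inspect the result (an assumed ad-hoc query, not part of the original steps):

select rowid, feature, value
from e2006tfidf_test_exploded
limit 5;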
Amplify training examples (global shuffle)
The amplify UDTF emits each training example the requested number of times, and CLUSTER BY rand() then distributes the duplicated rows randomly across reducers, which amounts to a global shuffle of the training data.
set hivevar:seed=31;
set hivevar:xtimes=3;
create or replace view e2006tfidf_train_x3 as
select * from (
  select amplify(${xtimes}, *) as (rowid, target, features)
  from e2006tfidf_train
) t
CLUSTER BY rand(${seed});
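Since the view multiplies each source row ${xtimes} (= 3) times, a simple row count can verify the amplification; note that, because this is a view, the amplify-and-shuffle work is re-executed every time the view is read. A sketch of such a check (assumed, not in the original guide):

select count(1) from e2006tfidf_train_x3;
-- expect ${xtimes} times the row count of e2006tfidf_train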