Get the dataset from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#webspam

Putting data on HDFS

# Create the HDFS directory that the external table webspam_raw reads from.
hadoop fs -mkdir -p /dataset/webspam/raw

# Run the downloaded file through conv.awk and stream the output straight into
# HDFS ("-" tells `hadoop fs -put` to read from stdin).
# NOTE(review): conv.awk is not shown here; presumably it emits tab-separated
# rowid/label/features rows matching the webspam_raw layout. TODO confirm.
awk -f conv.awk webspam_wc_normalized_trigram.svm | hadoop fs -put - /dataset/webspam/raw/

Table preparation

-- Work inside a dedicated database for this tutorial.
CREATE DATABASE webspam;
USE webspam;

-- External table over the text files uploaded to /dataset/webspam/raw.
-- Fields are tab-separated and the entries of the features array are
-- comma-separated.
CREATE EXTERNAL TABLE webspam_raw (
  rowid    INT,
  label    INT,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/webspam/raw';

-- Hold out a sample of webspam_raw as the test set, ordered by a seeded
-- random key and capped at 70000 rows.
-- NOTE(review): in Hive, TABLESAMPLE(n ROWS) is applied per input split, so
-- the number of sampled rows depends on how the input is split. TODO confirm
-- the resulting table size on your cluster.
SET hive.sample.seednumber=43;

CREATE TABLE webspam_test AS
SELECT
  rowid,
  label,
  features
FROM webspam_raw TABLESAMPLE(1000 ROWS) s
CLUSTER BY rand(43)
LIMIT 70000;

Make auxiliary tables

-- ORC table with Snappy compression that will hold the training rows.
-- Same three columns as webspam_raw.
CREATE TABLE webspam_train_orcfile (
  rowid    INT,
  label    INT,
  features ARRAY<STRING>
)
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");

-- Fill the training table with every raw row whose rowid does not appear in
-- webspam_test. Each feature array is passed through add_bias() and the
-- output is clustered by a seeded random key.
-- Optionally uncomment the two SET lines to force 128 reducers for this
-- insert and to reset the reducer count afterwards.
-- SET mapred.reduce.tasks=128;
INSERT OVERWRITE TABLE webspam_train_orcfile
SELECT
  s.rowid,
  s.label,
  add_bias(s.features) AS features
FROM webspam_raw s
WHERE NOT EXISTS (
  SELECT t.rowid
  FROM webspam_test t
  WHERE s.rowid = t.rowid
)
CLUSTER BY rand(43);
-- SET mapred.reduce.tasks=-1;

-- Settings used by the amplified training view below. The two hivevar values
-- are substituted into the rand_amplify() call.
SET hivevar:xtimes=3;
SET hivevar:shufflebuffersize=100;
SET hivemall.amplify.seed=32;

-- View that exposes webspam_train_orcfile through rand_amplify().
-- NOTE(review): presumably each training row is emitted xtimes times and
-- shuffled within a buffer of shufflebuffersize rows. TODO confirm against
-- the Hivemall documentation.
CREATE OR REPLACE VIEW webspam_train_x3 AS
SELECT
  rand_amplify(${xtimes}, ${shufflebuffersize}, *) AS (rowid, label, features)
FROM webspam_train_orcfile;

-- Flatten the test set to one row per exploded feature entry.
-- Each entry returned by add_bias(features) is split on the colon. The part
-- before it becomes the feature and the part after it is cast to a float value.
CREATE TABLE webspam_test_exploded AS
SELECT
  rowid,
  label,
  split(fv, ":")[0] AS feature,
  CAST(split(fv, ":")[1] AS FLOAT) AS value
FROM webspam_test
LATERAL VIEW explode(add_bias(features)) exploded AS fv;

Caution: For this dataset, use a small shufflebuffersize. Each training example has a large number of features, and (xtimes * shufflebuffersize * N) training examples are cached in memory.

results matching ""

    No results matching ""