Dataset preparation
Iris Dataset: https://archive.ics.uci.edu/ml/datasets/Iris
$ wget http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
$ less iris.data
...
5.3,3.7,1.5,0.2,Iris-setosa
5.0,3.3,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
...
Create a table for the raw data in Hive
create database iris;
use iris;
create external table iris_raw (
  rowid int,
  label string,
  features array<float>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '|'
  COLLECTION ITEMS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/dataset/iris/raw';
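If the directory referenced by the LOCATION clause does not exist in HDFS yet, create it before loading the data (the path below simply mirrors the DDL above):
$ hadoop fs -mkdir -p /dataset/iris/raw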
Loading data into HDFS
$ awk -F"," 'NF >0 {OFS="|"; print NR,$5,$1","$2","$3","$4}' iris.data | head -3
1|Iris-setosa|5.1,3.5,1.4,0.2
2|Iris-setosa|4.9,3.0,1.4,0.2
3|Iris-setosa|4.7,3.2,1.3,0.2
$ awk -F"," 'NF >0 {OFS="|"; print NR,$5,$1","$2","$3","$4}' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data
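As an optional sanity check, confirm that the file landed in HDFS before querying it from Hive:
$ hadoop fs -ls /dataset/iris/raw
$ hadoop fs -cat /dataset/iris/raw/iris.data | head -3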
select count(1) from iris_raw;
> 150
Feature scaling
Normalizing feature values is important for getting good predictions in machine learning. Here, min-max scaling is used to map each feature into the [0, 1] range, based on the per-feature minimum and maximum computed below.
select
  min(features[0]), max(features[0]),
  min(features[1]), max(features[1]),
  min(features[2]), max(features[2]),
  min(features[3]), max(features[3])
from
  iris_raw;
> 4.3 7.9 2.0 4.4 1.0 6.9 0.1 2.5
set hivevar:f0_min=4.3;
set hivevar:f0_max=7.9;
set hivevar:f1_min=2.0;
set hivevar:f1_max=4.4;
set hivevar:f2_min=1.0;
set hivevar:f2_max=6.9;
set hivevar:f3_min=0.1;
set hivevar:f3_max=2.5;
create or replace view iris_scaled
as
select
  rowid,
  label,
  add_bias(array(
    concat("1:", rescale(features[0], ${f0_min}, ${f0_max})),
    concat("2:", rescale(features[1], ${f1_min}, ${f1_max})),
    concat("3:", rescale(features[2], ${f2_min}, ${f2_max})),
    concat("4:", rescale(features[3], ${f3_min}, ${f3_max}))
  )) as features
from
  iris_raw;
select * from iris_scaled limit 3;
> 1 Iris-setosa ["1:0.22222215","2:0.625","3:0.0677966","4:0.041666664","0:1.0"]
> 2 Iris-setosa ["1:0.16666664","2:0.41666666","3:0.0677966","4:0.041666664","0:1.0"]
> 3 Iris-setosa ["1:0.11111101","2:0.5","3:0.05084745","4:0.041666664","0:1.0"]
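The scaled values can be checked by hand: rescale performs min-max normalization, i.e. (value - min) / (max - min), so the first feature of rowid 1 (5.1 in the raw data above) maps to (5.1 - 4.3) / (7.9 - 4.3) ≈ 0.2222, matching the "1:0.22222215" entry in the first row, and add_bias appends the constant "0:1.0" bias term. A minimal spot check, reusing the hivevars defined above:
select rescale(5.1, ${f0_min}, ${f0_max});
-- expected: roughly 0.2222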
The LibSVM web page provides a normalized (using z-score) version of the Iris dataset.
Create training/test data
set hivevar:rand_seed=31;
create table iris_shuffled
as
select rand(${rand_seed}) as rnd, * from iris_scaled;
-- 80% for training
create table train80p as
select * from iris_shuffled
order by rnd DESC
limit 120;
-- 20% for testing
create table test20p as
select * from iris_shuffled
order by rnd ASC
limit 30;
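A quick sanity check on the split sizes (the two tables should add up to the 150 rows counted earlier):
select count(1) from train80p;  -- expected: 120
select count(1) from test20p;   -- expected: 30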
create table test20p_exploded
as
select
  rowid,
  label,
  extract_feature(feature) as feature,
  extract_weight(feature) as value
from
  test20p LATERAL VIEW explode(features) t AS feature;
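Each element of the features array becomes its own row here, with extract_feature taking the index part and extract_weight the numeric part of every "index:value" string, which makes the test set easy to join on individual features later. To peek at a few exploded rows:
select * from test20p_exploded limit 5;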
Define an amplified view for the training input
set hivevar:xtimes=10;
set hivevar:shufflebuffersize=1000;
create or replace view training_x10
as
select
  rand_amplify(${xtimes}, ${shufflebuffersize}, rowid, label, features) as (rowid, label, features)
from
  train80p;
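rand_amplify duplicates each input row ${xtimes} times while shuffling rows through a buffer of ${shufflebuffersize} entries, so the amplified view should contain about 120 * 10 = 1200 rows. A quick check:
select count(1) from training_x10;
-- expected: 1200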