quantified_features
is useful for transforming values of non-number columns to indexed numbers.
Note: The feature is supported from Hivemall v0.4 or later.
desc train;
id int
age int
job string
marital string
education string
default string
balance int
housing string
loan string
contact string
day int
month string
duration int
campaign int
pdays int
previous int
poutcome string
y int
select * from train limit 10;
1 39 blue-collar married secondary no 1756 yes no cellular 3 apr 939 1 -1 0 unknown 1
2 51 entrepreneur married primary no 1443 no no cellular 18 feb 172 10 -1 0 unknown 1
3 36 management single tertiary no 436 no no cellular 13 apr 567 1 595 2 failure 1
4 63 retired married secondary no 474 no no cellular 25 jan 423 1 -1 0 unknown 1
5 31 management single tertiary no 354 no no cellular 30 apr 502 1 9 2 success 1
6 29 blue-collar single secondary no 260 yes no unknown 2 jun 707 14 -1 0 unknown 1
7 37 services married secondary no 52 yes no cellular 6 sep 908 1 185 9 success 1
8 32 technician single secondary no 230 yes no cellular 18 may 442 1 266 8 failure 1
9 31 admin. single secondary no 0 yes no cellular 7 may 895 2 295 2 failure 1
10 32 self-employed single tertiary no 1815 no no telephone 10 aug 235 1 102 2 failure 1
set hivevar:output_row=true;
select quantify(${output_row}, *)
from (
select * from train
order by id asc -- force quantify() runs on a single reducer
) t
limit 10;
1 39 0 0 0 0 1756 0 0 0 3 0 939 1 -1 0 0 1
2 51 1 0 1 0 1443 1 0 0 18 1 172 10 -1 0 0 1
3 36 2 1 2 0 436 1 0 0 13 0 567 1 595 2 1 1
4 63 3 0 0 0 474 1 0 0 25 2 423 1 -1 0 0 1
5 31 2 1 2 0 354 1 0 0 30 0 502 1 9 2 2 1
6 29 0 1 0 0 260 0 0 1 2 3 707 14 -1 0 0 1
7 37 4 0 0 0 52 0 0 0 6 4 908 1 185 9 2 1
8 32 5 1 0 0 230 0 0 0 18 5 442 1 266 8 1 1
9 31 6 1 0 0 0 0 0 0 7 5 895 2 295 2 1 1
10 32 7 1 2 0 1815 1 0 2 10 6 235 1 102 2 1 1
select
quantify(
${output_row}, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, cast(pdays as string), previous, poutcome, y
) as (id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome, y)
from (
select * from train
order by id asc
) t
limit 10;
1 39 0 0 0 0 1756 0 0 0 3 0 939 1 0 0 0 1
2 51 1 0 1 0 1443 1 0 0 18 1 172 10 0 0 0 1
3 36 2 1 2 0 436 1 0 0 13 0 567 1 1 2 1 1
4 63 3 0 0 0 474 1 0 0 25 2 423 1 0 0 0 1
5 31 2 1 2 0 354 1 0 0 30 0 502 1 2 2 2 1
6 29 0 1 0 0 260 0 0 1 2 3 707 14 0 0 0 1
7 37 4 0 0 0 52 0 0 0 6 4 908 1 3 9 2 1
8 32 5 1 0 0 230 0 0 0 18 5 442 1 4 8 1 1
9 31 6 1 0 0 0 0 0 0 7 5 895 2 5 2 1 1
10 32 7 1 2 0 1815 1 0 2 10 6 235 1 6 2 1 1
select
quantified_features(
${output_row}, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, cast(pdays as string), previous, poutcome, y
) as features
from (
select * from train
order by id asc
) t
limit 10;
[1.0,39.0,0.0,0.0,0.0,0.0,1756.0,0.0,0.0,0.0,3.0,0.0,939.0,1.0,0.0,0.0,0.0,1.0]
[2.0,51.0,1.0,0.0,1.0,0.0,1443.0,1.0,0.0,0.0,18.0,1.0,172.0,10.0,0.0,0.0,0.0,1.0]
[3.0,36.0,2.0,1.0,2.0,0.0,436.0,1.0,0.0,0.0,13.0,0.0,567.0,1.0,1.0,2.0,1.0,1.0]
[4.0,63.0,3.0,0.0,0.0,0.0,474.0,1.0,0.0,0.0,25.0,2.0,423.0,1.0,0.0,0.0,0.0,1.0]
[5.0,31.0,2.0,1.0,2.0,0.0,354.0,1.0,0.0,0.0,30.0,0.0,502.0,1.0,2.0,2.0,2.0,1.0]
[6.0,29.0,0.0,1.0,0.0,0.0,260.0,0.0,0.0,1.0,2.0,3.0,707.0,14.0,0.0,0.0,0.0,1.0]
[7.0,37.0,4.0,0.0,0.0,0.0,52.0,0.0,0.0,0.0,6.0,4.0,908.0,1.0,3.0,9.0,2.0,1.0]
[8.0,32.0,5.0,1.0,0.0,0.0,230.0,0.0,0.0,0.0,18.0,5.0,442.0,1.0,4.0,8.0,1.0,1.0]
[9.0,31.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,5.0,895.0,2.0,5.0,2.0,1.0,1.0]
[10.0,32.0,7.0,1.0,2.0,0.0,1815.0,1.0,0.0,2.0,10.0,6.0,235.0,1.0,6.0,2.0,1.0,1.0]
Quantify test dataset
select * from test limit 10;
1 30 management single tertiary no 1028 no no cellular 4 feb 1294 2 -1 0 unknown
2 39 self-employed single tertiary no 426 no no unknown 18 jun 1029 1 -1 0 unknown
3 38 technician single tertiary no -572 yes yes unknown 5 jun 26 24 -1 0 unknown
4 34 technician single secondary no -476 yes no unknown 27 may 92 4 -1 0 unknown
5 37 entrepreneur married primary no 62 no no cellular 31 jul 404 2 -1 0 unknown
6 43 services married primary no 574 yes no cellular 8 may 140 1 -1 0 unknown
7 54 technician married secondary no 324 yes no telephone 13 may 51 1 -1 0 unknown
8 41 blue-collar married secondary no 121 yes no cellular 13 may 16 6 176 5 other
9 52 housemaid married primary no 1466 no yes cellular 20 nov 150 1 -1 0 unknown
10 32 management married secondary no 6217 yes yes cellular 18 nov 486 2 181 2 failure
select
id,
array(age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome) as features
from (
select
quantify(
output_row, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, if(pdays==-1,0,pdays), previous, poutcome
) as (id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome)
from (
select * from (
select
1 as train_first, false as output_row, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome
from
train
union all
select
2 as train_first, true as output_row, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome
from
test
) t0
order by train_first, id asc
) t1
) t2
limit 10;
1 [30,2,1,2,0,1028,1,0,0,4,1,1294,2,0,0,0]
2 [39,7,1,2,0,426,1,0,1,18,3,1029,1,0,0,0]
3 [38,5,1,2,0,-572,0,1,1,5,3,26,24,0,0,0]
4 [34,5,1,0,0,-476,0,0,1,27,5,92,4,0,0,0]
5 [37,1,0,1,0,62,1,0,0,31,8,404,2,0,0,0]
6 [43,4,0,1,0,574,0,0,0,8,5,140,1,0,0,0]
7 [54,5,0,0,0,324,0,0,2,13,5,51,1,0,0,0]
8 [41,0,0,0,0,121,0,0,0,13,5,16,6,176,5,3]
9 [52,8,0,1,0,1466,1,1,0,20,9,150,1,0,0,0]
10 [32,2,0,0,0,6217,0,1,0,18,9,486,2,181,2,1]