This example demonstrates basic usage of RandomForest in Hivemall on the Kaggle Titanic dataset. It gives a baseline score without any feature engineering.
# Data preparation
```sql
create database titanic;
use titanic;

drop table train;
create external table train (
  passengerid int, -- unique id
  survived int, -- target label
  pclass int,
  name string,
  sex string,
  age int,
  sibsp int, -- Number of Siblings/Spouses Aboard
  parch int, -- Number of Parents/Children Aboard
  ticket string,
  fare double,
  cabin string,
  embarked string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE LOCATION '/dataset/titanic/train';
```
```sh
# convert the comma-separated train.csv into '|'-separated records
# (the name field contains commas), strip the quotes around the
# name field ($4), and skip the header line
hadoop fs -rm /dataset/titanic/train/train.csv
awk '{ FPAT="([^,]*)|(\"[^\"]+\")";OFS="|"; } NR >1 {$1=$1;$4=substr($4,2,length($4)-2);print $0}' train.csv | hadoop fs -put - /dataset/titanic/train/train.csv
```
```sql
drop table test_raw;
create external table test_raw (
  passengerid int,
  pclass int,
  name string,
  sex string,
  age int,
  sibsp int, -- Number of Siblings/Spouses Aboard
  parch int, -- Number of Parents/Children Aboard
  ticket string,
  fare double,
  cabin string,
  embarked string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE LOCATION '/dataset/titanic/test_raw';
```
```sh
# same conversion for the test set; the name field is now the 3rd
# column ($3) because test.csv has no "survived" column
hadoop fs -rm /dataset/titanic/test_raw/test.csv
awk '{ FPAT="([^,]*)|(\"[^\"]+\")";OFS="|"; } NR >1 {$1=$1;$3=substr($3,2,length($3)-2);print $0}' test.csv | hadoop fs -put - /dataset/titanic/test_raw/test.csv
```
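If the loads went well, the row counts should match the standard Kaggle dataset, i.e., 891 training rows and 418 test rows:

```sql
select count(1) from train;    -- 891 expected
select count(1) from test_raw; -- 418 expected
```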
# Data preparation for RandomForest

`quantify` converts categorical columns into numeric indexes (emitting rows only when its first argument is true), so that all features can be packed into a numeric `features` array.
```sql
set hivevar:output_row=true;

drop table train_rf;
create table train_rf
as
WITH train_quantified as (
  select
    quantify(
      ${output_row}, passengerid, survived, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked
    ) as (passengerid, survived, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked)
  from (
    select * from train
    order by passengerid asc
  ) t
)
select
  rand(31) as rnd,
  passengerid,
  array(pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked) as features,
  survived
from
  train_quantified
;
```
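Categorical columns such as `name`, `sex`, and `embarked` are now numeric indexes packed into the `features` array. A quick peek confirms the transformation (actual values vary):

```sql
select rnd, features, survived from train_rf limit 3;
```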
```sql
drop table test_rf;
create table test_rf
as
WITH test_quantified as (
  select
    quantify(
      output_row, passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked
    ) as (passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked)
  from (
    -- need training data to assign consistent ids to categorical variables
    select * from (
      select
        1 as train_first, false as output_row, passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked
      from
        train
      union all
      select
        2 as train_first, true as output_row, passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked
      from
        test_raw
    ) t0
    order by train_first asc, passengerid asc
  ) t1
)
select
  passengerid,
  array(pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked) as features
from
  test_quantified
;
```
# Training
```sql
select guess_attribute_types(pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked)
from train
limit 1;
```

```
Q,C,C,Q,Q,Q,C,Q,C,C
```
`Q` and `C` represent quantitative and categorical variables, respectively.
> **Caution**
> The output of `guess_attribute_types` is not perfect. Revise it yourself. For example, `pclass` is actually a categorical variable.
```sql
set hivevar:attrs=C,C,C,Q,Q,Q,C,Q,C,C;

drop table model_rf;
create table model_rf
AS
select
  train_randomforest_classifier(features, survived, "-trees 500 -attrs ${attrs}")
from
  train_rf
;
```
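Each output row of `train_randomforest_classifier` holds one decision tree of the forest, so `-trees 500` should yield 500 rows in `model_rf`. A quick check:

```sql
-- one row per tree
select count(1) from model_rf;
```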
```sql
select
  array_sum(var_importance) as var_importance,
  sum(oob_errors) / sum(oob_tests) as oob_err_rate
from
  model_rf;
```

```
[137.00242639169272,1194.2140119834373,328.78017188176966,628.2568660509628,200.31275032394072,160.12876797647078,1083.5987543408116,664.1234312561456,422.89449844090393,130.72019667694784]   0.18742985409652077
```
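The elements of `var_importance` follow the order of the `features` array (`pclass` first, `embarked` last). As a sketch for pairing each importance value with its feature name (assuming Hive's `posexplode`):

```sql
WITH agg as (
  select array_sum(var_importance) as imp
  from model_rf
)
select
  array('pclass','name','sex','age','sibsp','parch','ticket','fare','cabin','embarked')[idx] as feature,
  importance
from
  agg
LATERAL VIEW posexplode(imp) t as idx, importance
order by importance desc;
```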
# Prediction
```sql
-- SET hivevar:classification=true;
set hive.auto.convert.join=true;
SET hive.mapjoin.optimized.hashtable=false;
SET mapred.reduce.tasks=16;

drop table predicted_rf;
create table predicted_rf
as
SELECT
  passengerid,
  predicted.label,
  predicted.probability,
  predicted.probabilities
FROM (
  SELECT
    passengerid,
    rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
    -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
  FROM (
    SELECT
      t.passengerid,
      p.model_weight,
      tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later
    FROM (
      SELECT
        model_id, model_weight, model
      FROM
        model_rf
      DISTRIBUTE BY rand(1)
    ) p
    LEFT OUTER JOIN test_rf t
  ) t1
  group by
    passengerid
) t2
;
```
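The result holds one ensembled prediction per passenger, with the majority-vote label and class probabilities. For a quick look:

```sql
select * from predicted_rf order by passengerid limit 5;
```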
> **Caution**
> `tree_predict_v1` is provided for backward compatibility, i.e., for applying prediction models built before v0.5.0 on v0.5.0 or later.
# Kaggle submission
```sql
drop table predicted_rf_submit;
create table predicted_rf_submit
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
LINES TERMINATED BY "\n"
STORED AS TEXTFILE
as
SELECT passengerid, label as survived
FROM predicted_rf
ORDER BY passengerid ASC;
```

```sh
hadoop fs -getmerge /user/hive/warehouse/titanic.db/predicted_rf_submit predicted_rf_submit.csv
sed -i -e "1i PassengerId,Survived" predicted_rf_submit.csv
```
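A quick look at the first lines of the merged file verifies the header and the `PassengerId,Survived` format that Kaggle expects:

```sh
head -3 predicted_rf_submit.csv
```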
This submission would give an accuracy of 0.76555 on Kaggle.
# Graphviz export
> **Note**
> The `tree_export` feature is supported in Hivemall v0.5.0 or later. It is better to limit the tree depth with the `-depth` option at training time when you plan to plot a decision tree.
Hivemall provides `tree_export` to export a decision tree into Graphviz or human-readable JavaScript format. You can find the usage by issuing the following query:
```
> select tree_export("","-help");

usage: tree_export(string model, const string options, optional
       array<string> featureNames=null, optional array<string>
       classNames=null) - exports a Decision Tree model as javascript/dot
       [-help] [-output_name <arg>] [-r] [-t <arg>]
 -help                            Show function help
 -output_name,--outputName <arg>  output name [default: predicted]
 -r,--regression                  Is regression tree or not
 -t,--type <arg>                  Type of output [default: js,
                                  javascript/js, graphviz/dot]
```
```sql
CREATE TABLE model_exported
  STORED AS ORC tblproperties("orc.compress"="SNAPPY")
AS
select
  model_id,
  tree_export(model, "-type javascript -output_name survived", array('pclass','name','sex','age','sibsp','parch','ticket','fare','cabin','embarked'), array('no','yes')) as js,
  tree_export(model, "-type graphviz -output_name survived", array('pclass','name','sex','age','sibsp','parch','ticket','fare','cabin','embarked'), array('no','yes')) as dot
from
  model_rf
-- limit 1
;
```
Here is an example of plotting a decision tree using Graphviz or Vis.js.
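As a minimal sketch (assuming Graphviz is installed and the `hive` CLI is available), the exported dot script can be rendered into an image as follows:

```sh
# dump the dot script of one tree and render it with Graphviz
hive --silent -e "use titanic; select dot from model_exported limit 1;" > tree.dot
dot -Tpng tree.dot -o tree.png
```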
# Test by dividing training dataset
```sql
drop table train_rf_07;
create table train_rf_07
as
select * from train_rf
where rnd < 0.7;

drop table test_rf_03;
create table test_rf_03
as
select * from train_rf
where rnd >= 0.7;
```
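Since `rnd` was drawn uniformly from `[0, 1)`, this splits the 891 training rows roughly 70/30. A quick check:

```sql
select count(1) from train_rf_07; -- roughly 70% of the rows
select count(1) from test_rf_03;  -- the remaining ~30%
```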
```sql
drop table model_rf_07;
create table model_rf_07
AS
select
  train_randomforest_classifier(features, survived, "-trees 500 -attrs ${attrs}")
from
  train_rf_07;
```

```sql
select
  array_sum(var_importance) as var_importance,
  sum(oob_errors) / sum(oob_tests) as oob_err_rate
from
  model_rf_07;
```

```
[116.12055542977338,960.8569891444097,291.08765260103837,469.74671636586226,163.721292772701,120.784769882858,847.9769298113661,554.4617571355476,346.3500941757221,97.42593940113392]   0.1838351822503962
```
```sql
-- SET hivevar:classification=true;
SET hive.mapjoin.optimized.hashtable=false;
SET mapred.reduce.tasks=16;

drop table predicted_rf_03;
create table predicted_rf_03
as
SELECT
  passengerid,
  predicted.label,
  predicted.probability,
  predicted.probabilities
FROM (
  SELECT
    passengerid,
    rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
    -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
  FROM (
    SELECT
      t.passengerid,
      p.model_weight,
      tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
      -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later
    FROM (
      SELECT
        model_id, model_weight, model
      FROM
        model_rf_07
      DISTRIBUTE BY rand(1)
    ) p
    LEFT OUTER JOIN test_rf_03 t
  ) t1
  group by
    passengerid
) t2;
```
```sql
WITH rf_submit_03 as (
  select
    t.survived as actual,
    p.label as predicted
  from
    test_rf_03 t
    JOIN predicted_rf_03 p on (t.passengerid = p.passengerid)
)
select sum(if(actual=predicted,1,0))/count(1) as accuracy
from rf_submit_03;
```

```
0.8153846153846154
```
# Tracing predictions

Find the important attributes and conditions that lead to a "survived" prediction.
```sql
WITH tmp as (
  SELECT
    t.survived as actual,
    decision_path(m.model_id, m.model, t.features, '-classification -no_verbose', array('pclass','name','sex','age','sibsp','parch','ticket','fare','cabin','embarked')) as path
  FROM
    model_rf_07 m
    LEFT OUTER JOIN -- CROSS JOIN
    test_rf_03 t
)
select
  r.branch,
  count(1) as cnt
from
  tmp l
  LATERAL VIEW explode(array_slice(path, 0, -1)) r as branch
where
  -- actual = 1 and -- actual is survived
  last_element(path) = 1 -- predicted is survived
group by
  r.branch
order by
  cnt desc
limit 100;
```
| r.branch | cnt |
|---|---:|
| sex != 0.0 | 29786 |
| pclass != 3.0 | 18520 |
| pclass = 3.0 | 7444 |
| sex = 0.0 | 6494 |
| embarked != 1.0 | 6175 |
| ticket != 22.0 | 5560 |
| ... | ... |