One-vs-the-rest is a multiclass classification method that trains one binary classifier per class, each independently of the others. See http://en.wikipedia.org/wiki/Multiclass_classification#one_vs_all
Dataset preparation for one-vs-the-rest classifiers
-- Collect the set of distinct label values found in the multiclass training table.
SELECT
  collect_set(label)
FROM
  news20mc_train;
[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16,19,18,20]
-- Comma-separated label list, copied from the collect_set(label) result above.
-- It is referenced as ${possible_labels} in the TRANSFORM call of the
-- news20_onevsrest_train view definition.
SET hivevar:possible_labels="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16,19,18,20";
-- One-vs-the-rest training view.
-- Every row of news20mc_train is streamed to the external gawk script
-- one-vs-rest.awk as (possible_labels, rowid, label, add_bias(features)).
-- Both directions use tab-separated fields, comma-separated collection items
-- and newline-terminated rows.
-- The script output is read back as (rowid, label, target, features).
-- NOTE(review): how the script derives target and how many rows it emits per
-- input row is defined in one-vs-rest.awk, which is not shown here - confirm.
-- NOTE(review): the script presumably has to be registered with ADD FILE before
-- this view is queried - verify against the surrounding setup steps.
CREATE OR REPLACE VIEW news20_onevsrest_train
AS
SELECT
  TRANSFORM(${possible_labels}, rowid, label, add_bias(features))
    ROW FORMAT DELIMITED
      FIELDS TERMINATED BY "\t"
      COLLECTION ITEMS TERMINATED BY ","
      LINES TERMINATED BY "\n"
  USING 'gawk -f one-vs-rest.awk'
  AS (rowid BIGINT, label INT, target INT, features ARRAY<STRING>)
    ROW FORMAT DELIMITED
      FIELDS TERMINATED BY "\t"
      COLLECTION ITEMS TERMINATED BY ","
      LINES TERMINATED BY "\n"
FROM
  news20mc_train;
-- Amplified and shuffled variant of news20_onevsrest_train.
-- The inner query passes every column of the base view through amplify(3, *)
-- and names the generated columns (rowid, label, target, features).
-- The outer query lists those columns explicitly instead of using a wildcard,
-- so the schema of this view is visible here and does not depend on wildcard
-- expansion.
-- CLUSTER BY rand() distributes and sorts the rows by a random key.
-- NOTE(review): amplify is presumably the Hivemall UDTF that emits each input
-- row the given number of times (3 here) - confirm against the Hivemall docs.
CREATE OR REPLACE VIEW news20_onevsrest_train_x3
AS
SELECT
  t.rowid,
  t.label,
  t.target,
  t.features
FROM (
  SELECT
    amplify(3, *) AS (rowid, label, target, features)
  FROM
    news20_onevsrest_train
) t
CLUSTER BY rand();