This page explains how to run matrix factorization on MovieLens 1M dataset.
Calculate the mean rating in the training dataset
use movielens;
select avg(rating) from training;
3.593565
Set variables (hyperparameters) for training
-- mean rating
set hivevar:mu=3.593565;
-- number of factors
set hivevar:factor=10;
-- maximum number of training iterations
set hivevar:iters=50;
Note that there are no need to set an exact value for $mu
. It actually works without setting $mu
but recommended to set one for getting a better prediction.
Due to a bug in Hive, do not issue comments in CLI.
Training
create table sgd_model
as
select
idx,
array_avg(u_rank) as Pu,
array_avg(m_rank) as Qi,
avg(u_bias) as Bu,
avg(m_bias) as Bi
from (
select
train_mf_sgd(userid, movieid, rating, '-factor ${factor} -mu ${mu} -iter ${iters}') as (idx, u_rank, m_rank, u_bias, m_bias)
from
training
) t
group by idx;
Note
Hivemall also provides train_mf_adagrad for training using AdaGrad.
-help
option shows a complete list of hyperparameters.
Predict
select
t2.actual,
mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi, ${mu}) as predicted
from (
select
t1.userid,
t1.movieid,
t1.rating as actual,
p1.Pu,
p1.Bu
from
testing t1 LEFT OUTER JOIN sgd_model p1
ON (t1.userid = p1.idx)
) t2
LEFT OUTER JOIN sgd_model p2
ON (t2.movieid = p2.idx);
Evaluate (computes MAE and RMSE)
select
mae(predicted, actual) as mae,
rmse(predicted, actual) as rmse
from (
select
t2.actual,
mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi, ${mu}) as predicted
from (
select
t1.userid,
t1.movieid,
t1.rating as actual,
p1.Pu,
p1.Bu
from
testing t1 LEFT OUTER JOIN sgd_model p1
ON (t1.userid = p1.idx)
) t2
LEFT OUTER JOIN sgd_model p2
ON (t2.movieid = p2.idx)
) t;
MAE | RMSE |
---|---|
0.6728969407733578 | 0.8584162122694449 |
Item Recommendation
Recommend top-k movies that a user have not ever seen.
set hivevar:userid=1;
set hivevar:topk=5;
select
t1.movieid,
mf_predict(t2.Pu, t1.Qi, t2.Bu, t1.Bi, ${mu}) as predicted
from (
select
idx movieid,
Qi,
Bi
from
sgd_model p
where
p.idx NOT IN
(select movieid from training where userid=${userid})
) t1 CROSS JOIN (
select
Pu,
Bu
from
sgd_model
where
idx = ${userid}
) t2
order by
predicted DESC
limit ${topk};
movieid | predicted |
---|---|
318 | 4.8051853 |
2503 | 4.788541 |
53 | 4.7518783 |
904 | 4.7463417 |
953 | 4.732769 |