This page explains how to run matrix factorization on the MovieLens 1M dataset.

Calculate the mean rating in the training dataset
Set variables (hyperparameters) for training
Training

Calculate the mean rating in the training dataset

-- Work inside the movielens database for the statements that follow
use movielens;

-- Mean of the rating column over the training table
select
  avg(rating)
from
  training;

3.593565

Set variables (hyperparameters) for training

-- mean rating of the training data (the avg(rating) result computed above)
set hivevar:mu=3.593565;
-- number of factors, passed to train_mf_sgd as the -factor option
set hivevar:factor=10;
-- maximum number of training iterations, passed to train_mf_sgd as the -iter option
set hivevar:iters=50;

Note that there is no need to set an exact value for $mu. Training actually works without setting $mu, but setting one is recommended to get better predictions.

Due to a bug in Hive, do not enter the `--` comment lines when issuing these statements in the Hive CLI.

Training

-- Build the sgd_model table from the rows emitted by train_mf_sgd.
-- The UDTF output is aliased as (idx, u_rank, m_rank, u_bias, m_bias), and rows
-- that share an idx are merged by averaging each emitted column.
-- The prediction queries read Pu and Bu via a userid join and Qi and Bi via a
-- movieid join against this table.
create table sgd_model as
select
  emitted.idx,
  array_avg(emitted.u_rank) as Pu,
  array_avg(emitted.m_rank) as Qi,
  avg(emitted.u_bias) as Bu,
  avg(emitted.m_bias) as Bi
from (
  select
    train_mf_sgd(
      userid,
      movieid,
      rating,
      '-factor ${factor} -mu ${mu} -iter ${iters}'
    ) as (idx, u_rank, m_rank, u_bias, m_bias)
  from
    training
) emitted
group by
  emitted.idx;

Note

Hivemall also provides train_mf_adagrad for training using AdaGrad. Passing the -help option shows a complete list of hyperparameters.

Predict

-- Pair each testing rating with a prediction from the trained model.
-- Both joins are outer joins, so a testing row is kept even when its userid
-- or movieid has no matching idx in sgd_model (those model columns are NULL).
select
  obs.rating as actual,
  mf_predict(usr.Pu, mov.Qi, usr.Bu, mov.Bi, ${mu}) as predicted
from
  testing obs
  -- model row for the user: supplies Pu and Bu
  LEFT OUTER JOIN sgd_model usr
    ON (obs.userid = usr.idx)
  -- model row for the movie: supplies Qi and Bi
  LEFT OUTER JOIN sgd_model mov
    ON (obs.movieid = mov.idx);

Evaluate (computes MAE and RMSE)

-- Aggregate the prediction error over the whole testing table.
-- The inner query yields one (actual, predicted) pair per testing row and
-- the outer query reduces those pairs to a single row with MAE and RMSE.
select
  mae(scored.predicted, scored.actual) as mae,
  rmse(scored.predicted, scored.actual) as rmse
from (
  select
    obs.rating as actual,
    mf_predict(usr.Pu, mov.Qi, usr.Bu, mov.Bi, ${mu}) as predicted
  from
    testing obs
    -- outer joins keep testing rows whose userid or movieid has no model row
    LEFT OUTER JOIN sgd_model usr
      ON (obs.userid = usr.idx)
    LEFT OUTER JOIN sgd_model mov
      ON (obs.movieid = mov.idx)
) scored;

MAE	RMSE
0.6728969407733578	0.8584162122694449

Item Recommendation

Recommend the top-k movies that a user has never seen.

-- user to recommend for, and how many rows to return
set hivevar:userid=1;
set hivevar:topk=5;

-- Score candidate model rows for the chosen user and keep the highest
-- predictions. Candidates are the sgd_model rows whose idx does not appear
-- as a movieid among that user training ratings.
select
  candidate.movieid,
  mf_predict(target.Pu, candidate.Qi, target.Bu, candidate.Bi, ${mu}) as predicted
from (
  select
    model_row.idx as movieid,
    model_row.Qi,
    model_row.Bi
  from
    sgd_model model_row
  where
    model_row.idx NOT IN (
      select movieid from training where userid=${userid}
    )
) candidate
-- the user side is the sgd_model row(s) with idx equal to the userid,
-- so the cross join simply attaches Pu and Bu to every candidate
CROSS JOIN (
  select
    Pu,
    Bu
  from
    sgd_model
  where
    idx = ${userid}
) target
order by
  predicted DESC
limit ${topk};

movieid	predicted
318	4.8051853
2503	4.788541
53	4.7518783
904	4.7463417
953	4.732769

results matching ""

No results matching ""