List related (similar) articles for each article.
Extract clusters
use news20;
set hivevar:hashes=100; -- Generate N sets of minhash values for each row (DEFAULT: 5)
set hivevar:keygroups=2; -- Use K minhash value for generating a resulting value (DEFAULT: 2)
create table news20_clusterid_assign
as
select
-- minhash(rowid, features) as (clusterId, rowid)
minhash(rowid, features, "-n ${hashes} -k ${keygroups}") as (clusterId, rowid)
from
news20mc_train;
--set hivevar:min_cluster_size=5;
create or replace view news20_cluster
as
select
clusterId,
collect_set(rowid) as rowids
from
news20_clusterid_assign
group by clusterId
-- having size(rowids) > ${min_cluster_size}
;
Get recommendations
create table news20_similar_articles
as
WITH t1 as (
select
l.rowid,
r.rowid as other_id,
count(1) as cnt
from
news20_clusterid_assign l
LEFT OUTER JOIN
news20_clusterid_assign r
ON (l.clusterid = r.clusterid)
where
l.rowid != r.rowid
group by
l.rowid, r.rowid
having
-- 10/${hashes}=10/100=0.1 (filter by a pseudo Jaccard similarity by Minhash is greater than or equals to 0.1)
cnt >= 10
)
select
rowid,
collect_set(other_id) as related_articles
from
t1
group by
rowid
-- order by rowid asc
;
List all possible clusters w/o using a similarity threshold:
create table news20_similar_articles2
as
select
l.rowid,
collect_set(r.rowid) as related_articles
from
news20_clusterid_assign l
LEFT OUTER JOIN
news20_clusterid_assign r
ON (l.clusterid = r.clusterid)
where
l.rowid != r.rowid
group by
l.rowid
-- order by rowid asc
;
Jaccard similarity computation using k-Minhash
create table news20_jaccard_similarity
as
WITH t1 as (
select
l.rowid,
r.rowid as other_id,
count(1) / ${hashes} as similarity
from
news20_clusterid_assign l
JOIN news20_clusterid_assign r
ON (l.clusterid = r.clusterid)
where
l.rowid != r.rowid
group by
l.rowid, r.rowid
)
select
rowid,
other_id,
similarity,
1.0 - similarity as distance
from
t1
where
similarity >= 0.1
;