List related (similar) articles for each article.

Extract clusters

use news20;

set hivevar:hashes=100; -- Generate N sets of minhash values for each row (DEFAULT: 5)
set hivevar:keygroups=2; -- Use K minhash value for generating a resulting value (DEFAULT: 2)

create table news20_clusterid_assign
as
select 
  -- minhash(rowid, features) as (clusterId, rowid)
  minhash(rowid, features, "-n ${hashes} -k ${keygroups}") as (clusterId, rowid)
from 
  news20mc_train;

--set hivevar:min_cluster_size=5;

create or replace view news20_cluster
as
select
  clusterId, 
  collect_set(rowid) as rowids
from 
  news20_clusterid_assign
group by clusterId
-- having size(rowids) > ${min_cluster_size}
;

Get recommendations

create table news20_similar_articles
as
WITH t1 as (
select
  l.rowid,
  r.rowid as other_id,
  count(1) as cnt
from
  news20_clusterid_assign l 
  LEFT OUTER JOIN
  news20_clusterid_assign r
     ON (l.clusterid = r.clusterid)
where 
  l.rowid != r.rowid
group by
  l.rowid, r.rowid
having 
-- 10/${hashes}=10/100=0.1 (filter by a pseudo Jaccard similarity by Minhash is greater than or equals to 0.1)
  cnt >= 10 
)
select
  rowid,
  collect_set(other_id) as related_articles
from 
  t1
group by
  rowid
-- order by rowid asc
;

List all possible clusters w/o using a similarity threshold:

create table news20_similar_articles2
as
select
  l.rowid,
  collect_set(r.rowid) as related_articles
from
  news20_clusterid_assign l 
  LEFT OUTER JOIN
  news20_clusterid_assign r
     ON (l.clusterid = r.clusterid)
where 
  l.rowid != r.rowid
group by
  l.rowid
-- order by rowid asc
;

Jaccard similarity computation using k-Minhash

create table news20_jaccard_similarity
as
WITH t1 as (
select
  l.rowid,
  r.rowid as other_id,
  count(1) / ${hashes} as similarity
from
  news20_clusterid_assign l 
  JOIN news20_clusterid_assign r
     ON (l.clusterid = r.clusterid)
where 
  l.rowid != r.rowid
group by
  l.rowid, r.rowid
)
select
  rowid,
  other_id,
  similarity,
  1.0 - similarity as distance
from
  t1
where
  similarity >= 0.1
;

results matching ""

    No results matching ""