Extract clusters and assign N cluster IDs to each article

-- View over the training articles: the output of minhash(rowid, features)
-- is exposed as the two columns (clusterId, rowid).
CREATE OR REPLACE VIEW news20_cluster AS
SELECT
  minhash(src.rowid, src.features) AS (clusterId, rowid)
FROM
  news20mc_train src;

-- Materialize every (clusterid, rowid) pair from news20_cluster together
-- with the features of the news20mc_train row that has the same rowid.
CREATE TABLE news20_with_clusterid AS
SELECT
  cl.clusterid,
  cl.rowid,
  art.features
FROM
  news20_cluster cl
  INNER JOIN news20mc_train art
    ON cl.rowid = art.rowid;

Query expression with cluster id

-- Value substituted as the second argument of minhashes() below.
set hivevar:noWeight=false;

-- Build the query-side table: take the test article with rowid = 1,
-- compute its cluster ID collection with minhashes(), and emit one output
-- row per cluster ID, each carrying the article features.
CREATE TABLE extract_target_cluster AS
SELECT
  target.features,
  ex.clusterid
FROM (
  SELECT
    tst.features,
    minhashes(tst.features, ${noWeight}) AS cluster_ids
  FROM
    news20mc_test tst
  WHERE
    tst.rowid = 1
) target
LATERAL VIEW explode(target.cluster_ids) ex AS clusterid;

kNN search using minhashing

-- Number of nearest neighbours to return.
set hivevar:topn=10;

-- Approximate kNN: only training rows that share a cluster ID with the
-- query article (extract_target_cluster) are scored, then the ${topn}
-- highest cosine similarities are returned.
--
-- The two-argument cosine_similarity() is the signature for v0.3.2 or later.
-- On v0.3.1 or before, pass a third argument instead:
--   cosine_similarity(cand.features, q.features, false)
--
-- NOTE(review): a training row that shares more than one cluster ID with
-- the query matches the join once per shared ID, so the same rowid may
-- appear several times in the output -- confirm whether de-duplication is
-- wanted before relying on exactly ${topn} distinct neighbours.
SELECT
  cand.rowid,
  cosine_similarity(cand.features, q.features) AS similarity
FROM
  news20_with_clusterid cand
  INNER JOIN extract_target_cluster q
    ON cand.clusterid = q.clusterid
ORDER BY
  similarity DESC
LIMIT ${topn};

Time taken: 22.161 seconds

rowid	similarity
2182	0.21697778
5622	0.21483186
962	0.13240485
12242	0.12158953
5102	0.11168713
8562	0.107470974
14396	0.09949879
2542	0.09011547
1645	0.08894014
2862	0.08800333

Brute force kNN search (based on cosine similarity)

-- Exact kNN baseline: score every training row against the single test
-- article with rowid = 1 and keep the ${topn} most similar rows.
--
-- The two-argument call below is for v0.3.2 or later. For v0.3.1 or
-- before use: cosine_similarity(tr.features, q.features, false)
SELECT
  tr.rowid,
  cosine_similarity(tr.features, q.features) AS similarity
FROM
  news20mc_train tr
  CROSS JOIN (
    SELECT features
    FROM news20mc_test
    WHERE rowid = 1
  ) q
ORDER BY
  similarity DESC
LIMIT ${topn};

Time taken: 24.335 seconds

rowid	similarity
12902	0.47759432
7922	0.4184913
2382	0.21919869
2182	0.21697778
5622	0.21483186
9562	0.21223815
3222	0.164399
11202	0.16439897
10122	0.1620197
8482	0.15229382

Refer to this page for efficient top-k kNN computation.

results matching ""

No results matching ""