Extract clusters and assign N cluster IDs to each article
create or replace view news20_cluster
as
select
minhash(rowid, features) as (clusterId, rowid)
from
news20mc_train;
create table news20_with_clusterid
as
select
t1.clusterid,
t1.rowid,
o1.features
from
news20_cluster t1
JOIN news20mc_train o1 ON (t1.rowid = o1.rowid);
Query expression with cluster id
set hivevar:noWeight=false;
create table extract_target_cluster
as
select
features,
clusterid
from (
select
features,
minhashes(features,${noWeight}) as clusters
from
news20mc_test
where
rowid = 1
) t1
LATERAL VIEW explode(clusters) t2 AS clusterid;
kNN search using minhashing
set hivevar:topn=10;
select
t1.rowid,
cosine_similarity(t1.features, q1.features, false) as similarity
from
news20_with_clusterid t1
JOIN extract_target_cluster q1 ON (t1.clusterid = q1.clusterid)
order by
similarity DESC
limit ${topn};
Time taken: 22.161 seconds
rowid | similarity |
---|---|
2182 | 0.21697778 |
5622 | 0.21483186 |
962 | 0.13240485 |
12242 | 0.12158953 |
5102 | 0.11168713 |
8562 | 0.107470974 |
14396 | 0.09949879 |
2542 | 0.09011547 |
1645 | 0.08894014 |
2862 | 0.08800333 |
Brute force kNN search (based on cosine similarity)
select
t1.rowid,
cosine_similarity(t1.features, q1.features) as similarity -- hive v0.3.2 or later
-- cosine_similarity(t1.features, q1.features, false) as similarity -- hive v0.3.1 or before
from
news20mc_train t1
CROSS JOIN
(select features from news20mc_test where rowid = 1) q1
ORDER BY
similarity DESC
limit ${topn};
Time taken: 24.335 seconds
rowid | similarity |
---|---|
12902 | 0.47759432 |
7922 | 0.4184913 |
2382 | 0.21919869 |
2182 | 0.21697778 |
5622 | 0.21483186 |
9562 | 0.21223815 |
3222 | 0.164399 |
11202 | 0.16439897 |
10122 | 0.1620197 |
8482 | 0.15229382 |
Refer this page for efficient top-k kNN computation.