Function Signature of bbit_minhash
Text bbit_minhash(array<int|string> features)
Text bbit_minhash(array<int|string> features, int numHashes=128)
Text bbit_minhash(array<int|string> features, boolean discardWeight=false)
Text bbit_minhash(array<int|string> features, int numHashes=128, boolean discardWeight=false)
Create a signature for each article
-- Materialize one b-bit minhash signature per training article.
-- numHashes is passed explicitly (128 is also the documented default) so the
-- stored signatures always use the same number of hashes as the query side,
-- which calls bbit_minhash(features, 128, false) and jaccard_similarity(..., 128).
-- discardWeight is kept at false, exactly as before.
CREATE TABLE new20mc_with_signature
AS
SELECT
  rowid,
  bbit_minhash(features, 128, false) AS signature
FROM
  news20mc_train;
kNN brute-force search using b-Bit minhash
-- Brute-force kNN: pair every stored training signature with the signature of
-- a single test article (rowid = 1), score each pair, and keep the ${topn}
-- highest-scoring rows.
-- NOTE(review): the sample output documented for this query lists a third
-- column, popcnt, which is not selected here; presumably it came from an extra
-- expression such as popcnt(t1.signature, q1.signature) -- confirm before adding.
set hivevar:topn=10;

SELECT
  t1.rowid,
  -- third argument (128) matches the numHashes passed to bbit_minhash below
  jaccard_similarity(t1.signature, q1.signature, 128) AS similarity
FROM new20mc_with_signature t1
CROSS JOIN (
  -- signature of the query article, computed on the fly
  SELECT bbit_minhash(features, 128, false) AS signature
  FROM news20mc_test
  WHERE rowid = 1
) q1
ORDER BY similarity DESC
LIMIT ${topn};
| rowid | similarity | popcnt |
|:-----:|:----------:|:------:|
| 11952 | 0.390625 | 41 |
| 10748 | 0.359375 | 41 |
| 12902 | 0.34375 | 45 |
| 3087 | 0.328125 | 48 |
| 3 | 0.328125 | 37 |
| 11493 | 0.328125 | 38 |
| 3839 | 0.328125 | 41 |
| 12669 | 0.328125 | 37 |
| 13604 | 0.3125 | 41 |
| 6333 | 0.3125 | 39 |