This document explains how to compute TF-IDF with Apache Hive/Hivemall.
What you need to compute TF-IDF is a table/view composing (docid, word) pair, 2 views, and 1 query.
Note
This feature is supported since Hivemall v0.3-beta3 or later. Macro is supported since Hive 0.12 or later.
Define macros used in the TF-IDF computation
create temporary macro max2(x INT, y INT)
if(x>y,x,y);
-- create temporary macro idf(df_t INT, n_docs INT)
-- (log(10, CAST(n_docs as FLOAT)/max2(1,df_t)) + 1.0);
create temporary macro tfidf(tf FLOAT, df_t INT, n_docs INT)
tf * (log(10, CAST(n_docs as FLOAT)/max2(1,df_t)) + 1.0);
Data preparation
To calculate TF-IDF, you need to prepare a relation consists of (docid,word) tuples.
create external table wikipage (
docid int,
page string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;
cd ~/tmp
wget https://gist.githubusercontent.com/myui/190b91a3a792ccfceda0/raw/327acd192da4f96da8276dcdff01b19947a4373c/tfidf_test.tsv
LOAD DATA LOCAL INPATH '/home/myui/tmp/tfidf_test.tsv' INTO TABLE wikipage;
create or replace view wikipage_exploded
as
select
docid,
word
from
wikipage LATERAL VIEW explode(tokenize(page,true)) t as word
where
not is_stopword(word);
You can download the data of the wikipage table from this link.
Define views of TF/DF
create or replace view term_frequency
as
select
docid,
word,
freq
from (
select
docid,
tf(word) as word2freq
from
wikipage_exploded
group by
docid
) t
LATERAL VIEW explode(word2freq) t2 as word, freq;
create or replace view document_frequency
as
select
word,
count(distinct docid) docs
from
wikipage_exploded
group by
word;
TF-IDF calculation for each docid/word pair
-- set the total number of documents
select count(distinct docid) from wikipage;
set hivevar:n_docs=3;
create or replace view tfidf
as
select
tf.docid,
tf.word,
-- tf.freq * (log(10, CAST(${n_docs} as FLOAT)/max2(1,df.docs)) + 1.0) as tfidf
tfidf(tf.freq, df.docs, ${n_docs}) as tfidf
from
term_frequency tf
JOIN document_frequency df ON (tf.word = df.word)
order by
tfidf desc;
The result will be as follows:
docid word tfidf
1 justice 0.1641245850805637
3 knowledge 0.09484606645205085
2 action 0.07033910867777095
1 law 0.06564983513276658
1 found 0.06564983513276658
1 religion 0.06564983513276658
1 discussion 0.06564983513276658
...
...
2 act 0.017584777169442737
2 virtues 0.017584777169442737
2 well 0.017584777169442737
2 willingness 0.017584777169442737
2 find 0.017584777169442737
2 1 0.014001086678120098
2 experience 0.014001086678120098
2 often 0.014001086678120098
The above result is considered to be appropriate as docid 1, 2, and 3 are the Wikipedia entries of Justice, Wisdom, and Knowledge, respectively.
Feature Vector with TF-IDF values
select
docid,
-- collect_list(concat(word, ":", tfidf)) as features -- Hive 0.13 or later
collect_list(feature(word, tfidf)) as features -- Hivemall v0.3.4 & Hive 0.13 or later
-- collect_all(concat(word, ":", tfidf)) as features -- before Hive 0.13
from
tfidf
group by
docid;
1 ["justice:0.1641245850805637","found:0.06564983513276658","discussion:0.06564983513276658","law:0.065
64983513276658","based:0.06564983513276658","religion:0.06564983513276658","viewpoints:0.03282491756638329","
rationality:0.03282491756638329","including:0.03282491756638329","context:0.03282491756638329","concept:0.032
82491756638329","rightness:0.03282491756638329","general:0.03282491756638329","many:0.03282491756638329","dif
fering:0.03282491756638329","fairness:0.03282491756638329","social:0.03282491756638329","broadest:0.032824917
56638329","equity:0.03282491756638329","includes:0.03282491756638329","theology:0.03282491756638329","ethics:
0.03282491756638329","moral:0.03282491756638329","numerous:0.03282491756638329","philosophical:0.032824917566
38329","application:0.03282491756638329","perspectives:0.03282491756638329","procedural:0.03282491756638329",
"realm:0.03282491756638329","divided:0.03282491756638329","concepts:0.03282491756638329","attainment:0.032824
91756638329","fields:0.03282491756638329","often:0.026135361945200226","philosophy:0.026135361945200226","stu
dy:0.026135361945200226"]
2 ["action:0.07033910867777095","wisdom:0.05275433288400458","one:0.05275433288400458","understanding:0
.04200326112968063","judgement:0.035169554338885474","apply:0.035169554338885474","disposition:0.035169554338
885474","given:0.035169554338885474"
...