The task is to predict the click-through rate (CTR) of ads, i.e., the probability that each ad impression results in a click: https://www.kaggle.com/c/kddcup2012-track2


Dataset

File                             Size   Records
KDD_Track2_solution.csv          244MB  20,297,595 (20,297,594 w/o header)
descriptionid_tokensid.txt       268MB  3,171,830
purchasedkeywordid_tokensid.txt  26MB   1,249,785
queryid_tokensid.txt             704MB  26,243,606
test.txt                         1.3GB  20,297,594
titleid_tokensid.txt             171MB  4,051,441
training.txt                     9.9GB  149,639,105
userid_profile.txt               283MB  23,669,283

Tables

The token files are not used in this example; try incorporating them on your own.


create database kdd12track2;
use kdd12track2;

create external table training (
  RowID BIGINT,
  Clicks INT, 
  Impression INT, 
  DisplayURL STRING, 
  AdID INT,
  AdvertiserID INT, 
  Depth SMALLINT, 
  Position SMALLINT, 
  QueryID INT, 
  KeywordID INT,
  TitleID INT, 
  DescriptionID INT, 
  UserID INT
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/training';

create external table testing (
  RowID BIGINT,
  DisplayURL STRING, 
  AdID INT,
  AdvertiserID INT, 
  Depth SMALLINT, 
  Position SMALLINT, 
  QueryID INT, 
  KeywordID INT,
  TitleID INT, 
  DescriptionID INT, 
  UserID INT
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/testing';

create external table user (
  UserID INT, 
  Gender TINYINT,
  Age TINYINT
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/user';

create external table query (
  QueryID INT,
  Tokens STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/query';

create external table keyword (
  KeywordID INT,
  Tokens STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/keyword';

create external table title (
  TitleID INT, 
  Tokens STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/title';

create external table description (
  DescriptionID INT,
  Tokens STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/description';

create external table solution (
  RowID BIGINT,
  Clicks INT,
  Impressions INT,
  Private BOOLEAN
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/solution';
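
Load the raw files into the external table locations. training.txt and test.txt carry no row identifier, so gawk prepends the 1-origin line number (NR) as the RowID column. If the target directories do not exist yet on HDFS, create them first; a minimal sketch from the Hive CLI (plain hadoop fs -mkdir -p works equally well, assuming Hadoop 2 or later):

dfs -mkdir -p /kddcup2012/track2/training;
dfs -mkdir -p /kddcup2012/track2/testing;
dfs -mkdir -p /kddcup2012/track2/user;
dfs -mkdir -p /kddcup2012/track2/query;
dfs -mkdir -p /kddcup2012/track2/keyword;
dfs -mkdir -p /kddcup2012/track2/title;
dfs -mkdir -p /kddcup2012/track2/description;
dfs -mkdir -p /kddcup2012/track2/solution;
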
gawk '{print NR"\t"$0;}' training.txt | \
hadoop fs -put - /kddcup2012/track2/training/training.tsv

gawk '{print NR"\t"$0;}' test.txt | \
hadoop fs -put - /kddcup2012/track2/testing/test.tsv

hadoop fs -put userid_profile.txt /kddcup2012/track2/user/user.tsv

tail -n +2 KDD_Track2_solution.csv | sed -e 's/Public/FALSE/g' -e 's/Private/TRUE/g' | gawk '{print NR","$0;}' | \
hadoop fs -put - /kddcup2012/track2/solution/solution.csv

hadoop fs -put queryid_tokensid.txt /kddcup2012/track2/query/tokensid.tsv
hadoop fs -put purchasedkeywordid_tokensid.txt /kddcup2012/track2/keyword/tokensid.tsv
hadoop fs -put titleid_tokensid.txt /kddcup2012/track2/title/tokensid.tsv
hadoop fs -put descriptionid_tokensid.txt /kddcup2012/track2/description/tokensid.tsv

Converting the feature representation by feature hashing

http://en.wikipedia.org/wiki/Feature_hashing

mhash is a Hivemall UDF that uses MurmurHash3 to convert each feature string into an integer hash value.
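
The numeric prefixes ("1:" through "12:") namespace each column before hashing, so that identical IDs in different columns (e.g., AdID 100 and QueryID 100) map to distinct features. As a quick sanity check (the exact values depend on the Hivemall version and its default hash range, so treat this as a sketch):

-- the two calls should return different hash values despite the same ID
select mhash(concat("2:", 100)), mhash(concat("6:", 100));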

create or replace view training2 as
select
  rowid,
  clicks,
  (impression - clicks) as noclick,
  mhash(concat("1:", displayurl)) as displayurl, 
  mhash(concat("2:", adid)) as adid, 
  mhash(concat("3:", advertiserid)) as advertiserid, 
  mhash(concat("4:", depth)) as depth, 
  mhash(concat("5:", position)) as position, 
  mhash(concat("6:", queryid)) as queryid, 
  mhash(concat("7:", keywordid)) as keywordid, 
  mhash(concat("8:", titleid)) as titleid, 
  mhash(concat("9:", descriptionid)) as descriptionid, 
  mhash(concat("10:", userid)) as userid, 
  mhash(concat("11:", COALESCE(gender,"0"))) as gender, 
  mhash(concat("12:", COALESCE(age,"-1"))) as age, 
  -1 as bias
from (
  select
    t.*,
    u.gender,
    u.age
  from 
    training t 
    LEFT OUTER JOIN user u 
      on t.userid = u.userid
) t;

create or replace view testing2 as
select
  rowid, 
  array(displayurl, adid, advertiserid, depth, position, queryid, keywordid, titleid, descriptionid, userid, gender, age, bias) 
    as features
from (
  select
    rowid,
    mhash(concat("1:", displayurl)) as displayurl, 
    mhash(concat("2:", adid)) as adid, 
    mhash(concat("3:", advertiserid)) as advertiserid, 
    mhash(concat("4:", depth)) as depth, 
    mhash(concat("5:", position)) as position, 
    mhash(concat("6:", queryid)) as queryid, 
    mhash(concat("7:", keywordid)) as keywordid, 
    mhash(concat("8:", titleid)) as titleid, 
    mhash(concat("9:", descriptionid)) as descriptionid, 
    mhash(concat("10:", userid)) as userid, 
    mhash(concat("11:", COALESCE(gender,"0"))) as gender, 
    mhash(concat("12:", COALESCE(age,"-1"))) as age, 
    -1 as bias
  from (
    select
      t.*,
      u.gender,
      u.age
    from 
      testing t 
      LEFT OUTER JOIN user u 
        on t.userid = u.userid
  ) t1
) t2;
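
Each row of testing2 (and, after the conversion below, each training example) is now a dense array of hashed feature indices. The constant -1 serves as a bias (intercept) feature shared by every example, and COALESCE maps users absent from the user table to the sentinel values "0" (gender) and "-1" (age), so missing profiles still hash to a consistent "unknown" feature. A quick look at the result:

select rowid, features from testing2 limit 5;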

Compressing large training tables

create table training_orcfile (
 rowid bigint,
 features array<int>,
 label int
) STORED AS orc tblproperties ("orc.compress"="SNAPPY");

Caution: joining the training table with the user table takes a long time. If your Hadoop cluster is small, consider dropping the gender and age features so that the join can be avoided entirely; a join-free sketch follows.
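
A minimal sketch of such a join-free variant (the view name training2_nojoin is hypothetical; it simply drops the gender and age features from training2 above):

create or replace view training2_nojoin as
select
  rowid,
  clicks,
  (impression - clicks) as noclick,
  mhash(concat("1:", displayurl)) as displayurl,
  mhash(concat("2:", adid)) as adid,
  mhash(concat("3:", advertiserid)) as advertiserid,
  mhash(concat("4:", depth)) as depth,
  mhash(concat("5:", position)) as position,
  mhash(concat("6:", queryid)) as queryid,
  mhash(concat("7:", keywordid)) as keywordid,
  mhash(concat("8:", titleid)) as titleid,
  mhash(concat("9:", descriptionid)) as descriptionid,
  mhash(concat("10:", userid)) as userid,
  -1 as bias
from training;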

-- SET mapred.reduce.tasks=64;
-- SET hive.auto.convert.join=false;

INSERT OVERWRITE TABLE training_orcfile
select * from (
  -- binarize_label emits each row 'clicks' times with label 1 and 'noclick' times with label 0;
  -- the hashed columns of training2 are assembled into a feature array here
  select
    binarize_label(clicks, noclick, rowid,
      array(displayurl, adid, advertiserid, depth, position, queryid, keywordid,
            titleid, descriptionid, userid, gender, age, bias))
      as (rowid, features, label)
  from
    training2
) t
CLUSTER BY rand(); -- shuffle; CLUSTER BY must sit outside the UDTF's query block

-- SET mapred.reduce.tasks=-1;
-- SET hive.auto.convert.join=true;
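
The conversion above turns the aggregated (clicks, impressions) counts into binary examples: a training2 row with clicks=2 and noclick=1 expands into two rows labeled 1 and one row labeled 0. The random shuffle matters because Hivemall's online learners are sensitive to example order. As a sanity check, the two counts below should sum to the total number of impressions in the raw data:

select label, count(1) as cnt from training_orcfile group by label;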

create table testing_exploded as
select 
  rowid,
  feature
from 
  testing2 
  LATERAL VIEW explode(features) t AS feature;
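
testing_exploded flattens each test row into one row per feature so that predictions can be computed by joining against a per-feature weight table. A minimal sketch of that final step, assuming a hypothetical table model(feature int, weight float) produced by one of Hivemall's logistic regression learners (training itself is not covered in this section):

select
  t.rowid,
  sigmoid(sum(m.weight)) as prob -- logistic link over the summed weights of matched features
from
  testing_exploded t
  LEFT OUTER JOIN model m ON (t.feature = m.feature)
group by
  t.rowid;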
