This page describes a list of useful Hivemall generic functions. See also a list of machine-learning-related functions.

Aggregation
Array
Bitset
Compression
Datetime
JSON
Map
MapReduce
Math
Vector/Matrix
Sanity Checks
Text processing
Timeseries
Others

Aggregation

majority_vote(Primitive x) - Returns the most frequent value of x

-- see https://issues.apache.org/jira/browse/HIVE-17406 
WITH data as (
  select
    explode(array('1', '2', '2', '2', '5', '4', '1', '2')) as k
)
select
  majority_vote(k) as k
from 
  data;
2

max_by(x, y) - Returns the value of x associated with the maximum value of y over all input values.

-- see https://issues.apache.org/jira/browse/HIVE-17406 
WITH data as (
  select 'jake' as name, 18 as age
  union all
  select 'tom' as name, 64 as age
  union all
  select 'lisa' as name, 32 as age
)
select
  max_by(name, age) as name
from
  data;
tom

min_by(x, y) - Returns the value of x associated with the minimum value of y over all input values.

-- see https://issues.apache.org/jira/browse/HIVE-17406 
WITH data as (
  select 'jake' as name, 18 as age
  union all
  select 'tom' as name, 64 as age
  union all
  select 'lisa' as name, 32 as age
)
select
  min_by(name, age) as name
from
  data;

jake

Array

arange([int start=0, ] int stop, [int step=1]) - Return evenly spaced values within a given interval

SELECT arange(5), arange(1, 5), arange(1, 5, 1), arange(0, 5, 1);
> [0,1,2,3,4]     [1,2,3,4]       [1,2,3,4]       [0,1,2,3,4]

SELECT arange(1, 6, 2);
> 1, 3, 5

SELECT arange(-1, -6, 2);
-1, -3, -5

argmax(array<T> a) - Returns the first index of the maximum value
```
SELECT argmax(array(5,2,0,1));
0
```
argmin(array<T> a) - Returns the first index of the minimum value
```
SELECT argmin(array(5,2,0,1));
2
```

argrank(array<ANY> a) - Returns the indices that would sort an array.

SELECT argrank(array(5,2,0,1)), argsort(argsort(array(5,2,0,1)));
[3, 2, 0, 1]     [3, 2, 0, 1]

argsort(array<ANY> a) - Returns the indices that would sort an array.

SELECT argsort(array(5,2,0,1));
2, 3, 1, 0

SELECT array_slice(array(5,2,0,1), argsort(array(5,2,0,1)));
0, 1, 2, 5

array_append(array<T> arr, T elem) - Append an element to the end of an array

SELECT array_append(array(1,2),3);
 1,2,3

SELECT array_append(array('a','b'),'c');
 "a","b","c"

array_avg(array<number>) - Returns an array<double> in which each element is the mean of a set of numbers

WITH input as (
  select array(1.0, 2.0, 3.0) as nums
  UNION ALL
  select array(2.0, 3.0, 4.0) as nums
)
select
  array_avg(nums)
from
  input;

["1.5","2.5","3.5"]

array_concat(array<ANY> x1, array<ANY> x2, ..) - Returns a concatenated array
```
SELECT array_concat(array(1),array(2,3));
 [1,2,3]
```

array_flatten(array<array<ANY>>) - Returns an array with the elements flattened.

SELECT array_flatten(array(array(1,2,3),array(4,5),array(6,7,8)));
 [1,2,3,4,5,6,7,8]

array_intersect(array<ANY> x1, array<ANY> x2, ..) - Returns an intersect of given arrays
```
SELECT array_intersect(array(1,3,4),array(2,3,4),array(3,5));
 [3]
```

array_remove(array<PRIMITIVE> values, PRIMITIVE|array<PRIMITIVE> target) - Returns an array that the target elements are removed from the original array

select array_remove(array(2.0,2.1,3.0,4.0,2.0),2), array_remove(array(2.0,3.0,4.0),array(3,2.0));
[2.1,3,4]       [4]

SELECT array_remove(array(1,null,3),null);
[1,3]

SELECT array_remove(array(1,null,3,null,5),null);
[1,3,5]

SELECT array_remove(array(1,null,3),array(null));
[1,3]

SELECT array_remove(array('aaa','bbb'),'bbb');
["aaa"]

SELECT array_remove(array('aaa','bbb','ccc','bbb'), array('bbb','ccc'));
["aaa"]

select array_remove(array(null),null);
[]

select array_remove(array(null,'bbb'),'aaa');
[null,"bbb"]

array_slice(array<ANY> values, int offset [, int length]) - Slices the given array by the given offset and length parameters.

SELECT 
  array_slice(array(1,2,3,4,5,6),2,4),
  array_slice(
   array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"),
   0, -- offset
   2 -- length
  ),
  array_slice(
   array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"),
   6, -- offset
   3 -- length
  ),
  array_slice(
   array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"),
   6, -- offset
   10 -- length
  ),
  array_slice(
   array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"),
   6 -- offset
  ),
  array_slice(
   array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"),
   -3 -- offset
  ),
  array_slice(
   array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"),
   -3, -- offset
   2 -- length
  );

 [3,4]
 ["zero","one"] 
 ["six","seven","eight"]
 ["six","seven","eight","nine","ten"]
 ["six","seven","eight","nine","ten"]
 ["eight","nine","ten"]
 ["eight","nine"]

array_sum(array<number>) - Returns an array<double> in which each element is summed up

WITH input as (
  select array(1.0, 2.0, 3.0) as nums
  UNION ALL
  select array(2.0, 3.0, 4.0) as nums
)
select
  array_sum(nums)
from
  input;

["3.0","5.0","7.0"]

array_to_str(array arr [, string sep=',']) - Convert array to string using a sperator
```
SELECT array_to_str(array(1,2,3),'-');
1-2-3
```

array_union(array1, array2, ...) - Returns the union of a set of arrays

SELECT array_union(array(1,2),array(1,2));
[1,2]

SELECT array_union(array(1,2),array(2,3),array(2,5));
[1,2,3,5]

conditional_emit(array<boolean> conditions, array<primitive> features) - Emit features of a row according to various conditions

WITH input as (
   select array(true, false, true) as conditions, array("one", "two", "three") as features
   UNION ALL
   select array(true, true, false), array("four", "five", "six")
)
SELECT
  conditional_emit(
     conditions, features
  )
FROM 
  input;
 one
 three
 four
 five

element_at(array<T> list, int pos) - Returns an element at the given position

SELECT element_at(array(1,2,3,4),0);
 1

SELECT element_at(array(1,2,3,4),-2);
 3

first_element(x) - Returns the first element in an array

SELECT first_element(array('a','b','c'));
 a

SELECT first_element(array());
 NULL

float_array(nDims) - Returns an array<float> of nDims elements
last_element(x) - Return the last element in an array
```
SELECT last_element(array('a','b','c'));
 c
```
select_k_best(array<number> array, const array<number> importance, const int k) - Returns selected top-k elements as array<double>
sort_and_uniq_array(array<int>) - Takes array<int> and returns a sorted array with duplicate elements eliminated
```
SELECT sort_and_uniq_array(array(3,1,1,-2,10));
 [-2,1,3,10]
```

subarray(array<ANY> values, int fromIndex [, int toIndex])- Returns a slice of the original array between the inclusive fromIndex and the exclusive toIndex.

SELECT 
  subarray(array(0,1,2,3,4,5),4),
  subarray(array(0,1,2,3,4,5),3,4),
  subarray(array(0,1,2,3,4,5),3,3),
  subarray(array(0,1,2,3,4,5),3,2),
  subarray(array(0,1,2,3,4,5),0,2),
  subarray(array(0,1,2,3,4,5),-1,2),
  subarray(array(1,2,3,4,5,6),4),
  subarray(array(1,2,3,4,5,6),4,6),
  subarray(array(1,2,3,4,5,6),2,4),
  subarray(array(1,2,3,4,5,6),0,2),
  subarray(array(1,2,3,4,5,6),4,6),
  subarray(array(1,2,3,4,5,6),4,7);

 [4,5]
 [3]
 []
 []
 [0,1]
 [0,1]
 [5,6]
 [5,6]
 [3,4]
 [1,2]
 [5,6]
 [5,6]

subarray_endwith(array<int|text> original, int|text key) - Returns an array that ends with the specified key
```
SELECT subarray_endwith(array(1,2,3,4), 3);
 [1,2,3]
```
subarray_startwith(array<int|text> original, int|text key) - Returns an array that starts with the specified key
```
SELECT subarray_startwith(array(1,2,3,4), 2);
 [2,3,4]
```

to_string_array(array<ANY>) - Returns an array of strings

select to_string_array(array(1.0,2.0,3.0));

["1.0","2.0","3.0"]

to_ordered_list(PRIMITIVE value [, PRIMITIVE key, const string options]) - Return list of values sorted by value itself or specific key

WITH data as (
    SELECT 5 as key, 'apple' as value
    UNION ALL
    SELECT 3 as key, 'banana' as value
    UNION ALL
    SELECT 4 as key, 'candy' as value
    UNION ALL
    SELECT 1 as key, 'donut' as value
    UNION ALL
    SELECT 2 as key, 'egg' as value 
    UNION ALL
    SELECT 4 as key, 'candy' as value -- both key and value duplicates
)
SELECT                                                  -- expected output
    to_ordered_list(value, key, '-reverse'),            -- [apple, candy, candy, (banana, egg | egg, banana), donut] (reverse order)
    to_ordered_list(value, key, '-k 2'),                -- [apple, candy] (top-k)
    to_ordered_list(value, key, '-k 100'),              -- [apple, candy, candy, (banana, egg | egg, banana), dunut]
    to_ordered_list(value, key, '-k 2 -reverse'),       -- [donut, (banana | egg)] (reverse top-k = tail-k)
    to_ordered_list(value, key),                        -- [donut, (banana, egg | egg, banana), candy, candy, apple] (natural order)
    to_ordered_list(value, key, '-k -2'),               -- [donut, (banana | egg)] (tail-k)
    to_ordered_list(value, key, '-k -100'),             -- [donut, (banana, egg | egg, banana), candy, candy, apple]
    to_ordered_list(value, key, '-k -2 -reverse'),      -- [apple, candy] (reverse tail-k = top-k)
    to_ordered_list(value, '-k 2'),                     -- [egg, donut] (alphabetically)
    to_ordered_list(key, '-k -2 -reverse'),             -- [5, 4] (top-2 keys)
    to_ordered_list(key),                               -- [1, 2, 3, 4, 4, 5] (natural ordered keys)
    to_ordered_list(value, key, '-k 2 -kv_map'),        -- {5:"apple",4:"candy"}
    to_ordered_list(value, key, '-k 2 -vk_map'),        -- {"apple":5,"candy":4}
    to_ordered_list(value, key, '-k -2 -kv_map'),       -- {1:"donut",2:"egg"}
    to_ordered_list(value, key, '-k -2 -vk_map'),       -- {"donut":1,"egg":2}
    to_ordered_list(value, key, '-k 4 -dedup -vk_map'), -- {"apple":5,"candy":4,"banana":3,"egg":2}
    to_ordered_list(value, key, '-k 4 -vk_map'),        -- {"apple":5,"candy":4,"banana":3}
    to_ordered_list(value, key, '-k 4 -dedup'),         -- ["apple","candy","banana","egg"]
    to_ordered_list(value, key, '-k 4')                 -- ["apple","candy","candy","banana"]
FROM
    data

Bitset

bits_collect(int|long x) - Returns a bitset in array<long>
bits_or(array<long> b1, array<long> b2, ..) - Returns a logical OR given bitsets
```
SELECT unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3))));
 [1,2,3,4]
```
to_bits(int[] indexes) - Returns an bitset representation if the given indexes in long[]
```
SELECT to_bits(array(1,2,3,128));
 [14,-9223372036854775808]
```
unbits(long[] bitset) - Returns an long array of the give bitset representation
```
SELECT unbits(to_bits(array(1,4,2,3)));
 [1,2,3,4]
```

Compression

deflate(TEXT data [, const int compressionLevel]) - Returns a compressed BINARY object by using Deflater. The compression level must be in range [-1,9]
```
SELECT base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));
 AA+=kaIM|WTt!+wbGAA
```

inflate(BINARY compressedData) - Returns a decompressed STRING by using Inflater

SELECT inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));
 aaaaaaaaaaaaaaaabbbbccc

Datetime

sessionize(long timeInSec, long thresholdInSec [, String subject])- Returns a UUID string of a session.

SELECT 
  sessionize(time, 3600, ip_addr) as session_id, 
  time, ip_addr
FROM (
  SELECT time, ipaddr 
  FROM weblog 
  DISTRIBUTE BY ip_addr, time SORT BY ip_addr, time DESC
) t1

JSON

from_json(string jsonString, const string returnTypes [, const array<string>|const string columnNames]) - Return Hive object.

SELECT
  from_json(to_json(map('one',1,'two',2)), 'map<string,int>'),
  from_json(
    '{ "person" : { "name" : "makoto" , "age" : 37 } }',
    'struct<name:string,age:int>', 
    array('person')
  ),
  from_json(
    '[0.1,1.1,2.2]',
    'array<double>'
  ),
  from_json(to_json(
    ARRAY(
      NAMED_STRUCT("country", "japan", "city", "tokyo"), 
      NAMED_STRUCT("country", "japan", "city", "osaka")
    )
  ),'array<struct<country:string,city:string>>'),
  from_json(to_json(
    ARRAY(
      NAMED_STRUCT("country", "japan", "city", "tokyo"), 
      NAMED_STRUCT("country", "japan", "city", "osaka")
    ),
    array('city')
  ), 'array<struct<country:string,city:string>>'),
  from_json(to_json(
    ARRAY(
      NAMED_STRUCT("country", "japan", "city", "tokyo"), 
      NAMED_STRUCT("country", "japan", "city", "osaka")
    )
  ),'array<struct<city:string>>');

 {"one":1,"two":2}
 {"name":"makoto","age":37}
 [0.1,1.1,2.2]
 [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}]
 [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}]
 [{"city":"tokyo"},{"city":"osaka"}]

to_json(ANY object [, const array<string>|const string columnNames]) - Returns Json string

SELECT 
  NAMED_STRUCT("Name", "John", "age", 31),
  to_json(
     NAMED_STRUCT("Name", "John", "age", 31)
  ),
  to_json(
     NAMED_STRUCT("Name", "John", "age", 31),
     array('Name', 'age')
  ),
  to_json(
     NAMED_STRUCT("Name", "John", "age", 31),
     array('name', 'age')
  ),
  to_json(
     NAMED_STRUCT("Name", "John", "age", 31),
     array('age')
  ),
  to_json(
     NAMED_STRUCT("Name", "John", "age", 31),
     array()
  ),
  to_json(
     null,
     array()
  ),
  to_json(
    struct("123", "456", 789, array(314,007)),
    array('ti','si','i','bi')
  ),
  to_json(
    struct("123", "456", 789, array(314,007)),
    'ti,si,i,bi'
  ),
  to_json(
    struct("123", "456", 789, array(314,007))
  ),
  to_json(
    NAMED_STRUCT("country", "japan", "city", "tokyo")
  ),
  to_json(
    NAMED_STRUCT("country", "japan", "city", "tokyo"), 
    array('city')
  ),
  to_json(
    ARRAY(
      NAMED_STRUCT("country", "japan", "city", "tokyo"), 
      NAMED_STRUCT("country", "japan", "city", "osaka")
    )
  ),
  to_json(
    ARRAY(
      NAMED_STRUCT("country", "japan", "city", "tokyo"), 
      NAMED_STRUCT("country", "japan", "city", "osaka")
    ),
    array('city')
  );

 {"name":"John","age":31}
 {"name":"John","age":31}
 {"Name":"John","age":31}
 {"name":"John","age":31}
 {"age":31}
 {}
 NULL
 {"ti":"123","si":"456","i":789,"bi":[314,7]}
 {"ti":"123","si":"456","i":789,"bi":[314,7]}
 {"col1":"123","col2":"456","col3":789,"col4":[314,7]}
 {"country":"japan","city":"tokyo"}
 {"city":"tokyo"}
 [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}]
 [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}]

Map

map_exclude_keys(Map<K,V> map, array<K> filteringKeys) - Returns the filtered entries of a map not having specified keys
```
SELECT map_exclude_keys(map(1,'one',2,'two',3,'three'),array(2,3));
{1:"one"}
```

map_get(MAP<K> a, K n) - Returns the value corresponding to the key in the map.

Note this is a workaround for a Hive issue that non-constant expression for map indexes not supported.
See https://issues.apache.org/jira/browse/HIVE-1955

WITH tmp as (
  SELECT "one" as key
  UNION ALL
  SELECT "two" as key
)
SELECT map_get(map("one",1,"two",2),key)
FROM tmp;

> 1
> 2

map_get_sum(map<int,float> src, array<int> keys) - Returns sum of values that are retrieved by keys
map_include_keys(Map<K,V> map, array<K> filteringKeys) - Returns the filtered entries of a map having specified keys
```
SELECT map_include_keys(map(1,'one',2,'two',3,'three'),array(2,3));
{2:"two",3:"three"}
```

map_key_values(MAP<K, V> map) - Returns a array of key-value pairs in array<named_struct<key,value>>

SELECT map_key_values(map("one",1,"two",2));

> [{"key":"one","value":1},{"key":"two","value":2}]

map_roulette(Map<K, number> map [, (const) int/bigint seed]) - Returns a map key based on weighted random sampling of map values. Average of values is used for null values

-- `map_roulette(map<key, number> [, integer seed])` returns key by weighted random selection
SELECT 
  map_roulette(to_map(a, b)) -- 25% Tom, 21% Zhang, 54% Wang
FROM ( -- see https://issues.apache.org/jira/browse/HIVE-17406
  select 'Wang' as a, 54 as b
  union all
  select 'Zhang' as a, 21 as b
  union all
  select 'Tom' as a, 25 as b
) tmp;
> Wang

-- Weight random selection with using filling nulls with the average value
SELECT
  map_roulette(map(1, 0.5, 'Wang', null)), -- 50% Wang, 50% 1
  map_roulette(map(1, 0.5, 'Wang', null, 'Zhang', null)) -- 1/3 Wang, 1/3 1, 1/3 Zhang
;

-- NULL will be returned if every key is null
SELECT 
  map_roulette(map()),
  map_roulette(map(null, null, null, null));
> NULL    NULL

-- Return NULL if all weights are zero
SELECT
  map_roulette(map(1, 0)),
  map_roulette(map(1, 0, '5', 0))
;
> NULL    NULL

-- map_roulette does not support non-numeric weights or negative weights.
SELECT map_roulette(map('Wong', 'A string', 'Zhao', 2));
> HiveException: Error evaluating map_roulette(map('Wong':'A string','Zhao':2))
SELECT map_roulette(map('Wong', 'A string', 'Zhao', 2));
> UDFArgumentException: Map value must be greather than or equals to zero: -2

map_tail_n(map SRC, int N) - Returns the last N elements from a sorted array of SRC
merge_maps(Map x) - Returns a map which contains the union of an aggregation of maps. Note that an existing value of a key can be replaced with the other duplicate key entry.
```
SELECT 
  merge_maps(m) 
FROM (
  SELECT map('A',10,'B',20,'C',30) 
  UNION ALL 
  SELECT map('A',10,'B',20,'C',30)
) t
```

to_map(key, value) - Convert two aggregated columns into a key-value map

WITH input as (
  select 'aaa' as key, 111 as value
  UNION all
  select 'bbb' as key, 222 as value
)
select to_map(key, value)
from input;

> {"bbb":222,"aaa":111}

to_ordered_map(key, value [, const int k|const boolean reverseOrder=false]) - Convert two aggregated columns into an ordered key-value map

with t as (
    select 10 as key, 'apple' as value
    union all
    select 3 as key, 'banana' as value
    union all
    select 4 as key, 'candy' as value
)
select
    to_ordered_map(key, value, true),   -- {10:"apple",4:"candy",3:"banana"} (reverse)
    to_ordered_map(key, value, 1),      -- {10:"apple"} (top-1)
    to_ordered_map(key, value, 2),      -- {10:"apple",4:"candy"} (top-2)
    to_ordered_map(key, value, 3),      -- {10:"apple",4:"candy",3:"banana"} (top-3)
    to_ordered_map(key, value, 100),    -- {10:"apple",4:"candy",3:"banana"} (top-100)
    to_ordered_map(key, value),         -- {3:"banana",4:"candy",10:"apple"} (natural)
    to_ordered_map(key, value, -1),     -- {3:"banana"} (tail-1)
    to_ordered_map(key, value, -2),     -- {3:"banana",4:"candy"} (tail-2)
    to_ordered_map(key, value, -3),     -- {3:"banana",4:"candy",10:"apple"} (tail-3)
    to_ordered_map(key, value, -100)    -- {3:"banana",4:"candy",10:"apple"} (tail-100)
from t

MapReduce

distcache_gets(filepath, key, default_value [, parseKey]) - Returns map<key_type, value_type>|value_type
jobconf_gets() - Returns the value from JobConf
jobid() - Returns the value of mapred.job.id
rowid() - Returns a generated row id of a form {TASK_ID}-{SEQUENCE_NUMBER}
rownum() - Returns a generated row number sprintf(%d%04d,sequence,taskId) in long
```
SELECT rownum() as rownum, xxx from ...
```
taskid() - Returns the value of mapred.task.partition

Math

infinity() - Returns the constant representing positive infinity.

is_finite(x) - Determine if x is finite.

SELECT is_finite(333), is_finite(infinity());
true false

is_infinite(x) - Determine if x is infinite.
is_nan(x) - Determine if x is not-a-number.

l2_norm(double x) - Return a L2 norm of the given input x.

WITH input as (
  select generate_series(1,3) as v
)
select l2_norm(v) as l2norm
from input;
3.7416573867739413 = sqrt(1^2+2^2+3^2))

nan() - Returns the constant representing not-a-number.
```
SELECT nan(), is_nan(nan());
NaN true
```

sigmoid(x) - Returns 1.0 / (1.0 + exp(-x))

WITH input as (
  SELECT 3.0 as x
  UNION ALL
  SELECT -3.0 as x
)
select 
  1.0 / (1.0 + exp(-x)),
  sigmoid(x)
from
  input;
0.04742587317756678   0.04742587357759476
0.9525741268224334    0.9525741338729858

Vector/Matrix

transpose_and_dot(array<number> X, array<number> Y) - Returns dot(X.T, Y) as array<array<double>>, shape = (X.#cols, Y.#cols)

WITH input as (
  select array(1.0, 2.0, 3.0, 4.0) as x, array(1, 2) as y
  UNION ALL
  select array(2.0, 3.0, 4.0, 5.0) as x, array(1, 2) as y
)
select
  transpose_and_dot(x, y) as xy,
  transpose_and_dot(y, x) as yx
from 
  input;

[["3.0","6.0"],["5.0","10.0"],["7.0","14.0"],["9.0","18.0"]]   [["3.0","5.0","7.0","9.0"],["6.0","10.0","14.0","18.0"]]

vector_add(array<NUMBER> x, array<NUMBER> y) - Perform vector ADD operation.
```
SELECT vector_add(array(1.0,2.0,3.0), array(2, 3, 4));
[3.0,5.0,7.0]
```

vector_dot(array<NUMBER> x, array<NUMBER> y) - Performs vector dot product.

SELECT vector_dot(array(1.0,2.0,3.0),array(2.0,3.0,4.0));
20

SELECT vector_dot(array(1.0,2.0,3.0),2);
[2.0,4.0,6.0]

Sanity Checks

assert(boolean condition) or FUNC(boolean condition, string errMsg)- Throws HiveException if condition is not met

SELECT count(1) FROM stock_price WHERE assert(price > 0.0);
SELECT count(1) FROM stock_price WHERE assert(price > 0.0, 'price MUST be more than 0.0')

raise_error() or FUNC(string msg) - Throws an error

SELECT product_id, price, raise_error('Found an invalid record') FROM xxx WHERE price < 0.0

Text processing

base91(BINARY bin) - Convert the argument from binary to a BASE91 string

SELECT base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));
 AA+=kaIM|WTt!+wbGAA

is_stopword(string word) - Returns whether English stopword or not

normalize_unicode(string str [, string form]) - Transforms str with the specified normalization form. The form takes one of NFC (default), NFD, NFKC, or NFKD

SELECT normalize_unicode('ﾊﾝｶｸｶﾅ','NFKC');
 ハンカクカナ

SELECT normalize_unicode('㈱㌧㌦Ⅲ','NFKC');
 (株)トンドルIII

singularize(string word) - Returns singular form of a given English word
```
SELECT singularize(lower("Apples"));

 "apple"
```
split_words(string query [, string regex]) - Returns an array<text> containing splitted strings
tokenize(string englishText [, boolean toLowerCase]) - Returns tokenized words in array<string>

unbase91(string) - Convert a BASE91 string to a binary

SELECT inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));
 aaaaaaaaaaaaaaaabbbbccc

word_ngrams(array<string> words, int minSize, int maxSize]) - Returns list of n-grams for given words, where minSize <= n <= maxSize

SELECT word_ngrams(tokenize('Machine learning is fun!', true), 1, 2);

 ["machine","machine learning","learning","learning is","is","is fun","fun"]

str_contains(string query, array<string> searchTerms [, boolean orQuery=false]) - Returns true if the given query contains search terms

select
  str_contains('There are apple and orange', array('apple')), -- or=false
  str_contains('There are apple and orange', array('apple', 'banana'), true), -- or=true
  str_contains('There are apple and orange', array('apple', 'banana'), false); -- or=false
> true, true, false

Timeseries

moving_avg(NUMBER value, const int windowSize) - Returns moving average of a time series using a given window

SELECT moving_avg(x, 3) FROM (SELECT explode(array(1.0,2.0,3.0,4.0,5.0,6.0,7.0)) as x) series;
 1.0
 1.5
 2.0
 3.0
 4.0
 5.0
 6.0

Others

convert_label(const int|const float) - Convert from -1|1 to 0.0f|1.0f, or from 0.0f|1.0f to -1|1
each_top_k(int K, Object group, double cmpKey, *) - Returns top-K values (or tail-K values when k is less than 0)

generate_series(const int|bigint start, const int|bigint end) - Generate a series of values, from start to end. A similar function to PostgreSQL's generate_serics

SELECT generate_series(2,4);

 2
 3
 4

SELECT generate_series(5,1,-2);

 5
 3
 1

SELECT generate_series(4,3);

 (no return)

SELECT date_add(current_date(),value),value from (SELECT generate_series(1,3)) t;

 2018-04-21      1
 2018-04-22      2
 2018-04-23      3

WITH input as (
 SELECT 1 as c1, 10 as c2, 3 as step
 UNION ALL
 SELECT 10, 2, -3
)
SELECT generate_series(c1, c2, step) as series
FROM input;

 1
 4
 7
 10
 10
 7
 4

try_cast(ANY src, const string typeName) - Explicitly cast a value as a type. Returns null if cast fails.

SELECT try_cast(array(1.0,2.0,3.0), 'array<string>')
SELECT try_cast(map('A',10,'B',20,'C',30), 'map<string,double>')

x_rank(KEY) - Generates a pseudo sequence number starting from 1 for each key

List of Generic Hivemall Functions