Skip to main content



Hive Complex Data Types


  • Array
$ vi arrayfile
1,abc,40000,a$b$c,hyd
2,def,3000,d$f,bang
3,abc,40000,a$b$c,hyd
4,def,3000,d$f,bang
5,abc,40000,a$b$c,hyd
6,def,3000,d$f,bang
7,abc,40000,a$b$c,hyd
8,def,3000,d$f,bang
9,abc,40000,a$b$c,hyd
10,def,3000,d$f$d$e$d$e$e$r$g,bang
hive> create table array_tab (id int, name string, salary bigint, sub array<string>, city string)
    > row format delimited
    > fields terminated by ','
    > collection items terminated by '$';
hive> load data local inpath '/root/arrayfile' into table array_tab;
hive> select * from array_tab;
OK
1       abc     40000   ["a","b","c"]   hyd
2       def     3000    ["d","f"]       bang
3       abc     40000   ["a","b","c"]   hyd
4       def     3000    ["d","f"]       bang
5       abc     40000   ["a","b","c"]   hyd
6       def     3000    ["d","f"]       bang
7       abc     40000   ["a","b","c"]   hyd
8       def     3000    ["d","f"]       bang
9       abc     40000   ["a","b","c"]   hyd
10      def     3000    ["d","f","d","e","d","e","e","r","g"]   bang
hive> describe array_tab;
OK
id                      int
name                    string
salary                  bigint
sub                     array<string>
city                    string
Time taken: 0.79 seconds, Fetched: 5 row(s)
hive> select sub[0] from array_tab where id=1;
….
OK
a



  • Map

$  vi mapfile
1,abc,40000,a$b$c,pf#500$epf#200,hyd
2,def,3000,d$f,pf#500,bang
2,abc,40000,a$b$c,pf#500$epf#200,hyd
3,def,3000,d$f,pf#500,bang
4,abc,40000,a$b$c,pf#500$epf#200,hyd
5,def,3000,d$f,pf#500,bang
6,abc,40000,a$b$c,pf#500$epf#200,hyd
7,def,3000,d$f,pf#500,bang
8,abc,40000,a$b$c,pf#500$epf#200,hyd
hive> create table arr_map_tab (id int, name string, salary bigint, sub array<string>, details map<string, int>, city string)
    > row format delimited
    > fields terminated by ','
    > collection items terminated by '$'
    > map keys terminated by '#';
hive> load data local inpath 'mapfile' into table arr_map_tab;
hive> select * from arr_map_tab;
OK
1       abc     40000   ["a","b","c"]   {"pf":500,"epf":200}    hyd
2       def     3000    ["d","f"]       {"pf":500}      bang
2       abc     40000   ["a","b","c"]   {"pf":500,"epf":200}    hyd
3       def     3000    ["d","f"]       {"pf":500}      bang
4       abc     40000   ["a","b","c"]   {"pf":500,"epf":200}    hyd
5       def     3000    ["d","f"]       {"pf":500}      bang
6       abc     40000   ["a","b","c"]   {"pf":500,"epf":200}    hyd
7       def     3000    ["d","f"]       {"pf":500}      bang
8       abc     40000   ["a","b","c"]   {"pf":500,"epf":200}    hyd
Time taken: 2.04 seconds, Fetched: 9 row(s)
hive> describe arr_map_tab;
OK
id                      int
name                    string
salary                  bigint
sub                     array<string>
details                 map<string,int>
city                    string
Time taken: 0.838 seconds, Fetched: 6 row(s)
hive> select details["pf"] from arr_map_tab limit 1;
OK
500
Time taken: 33.805 seconds, Fetched: 1 row(s)


  • Struct

$  vi structfile
1,abc,40000,a$b$c,pf#500$epf#200,hyd$ap$500001
2,def,3000,d$f,pf#500,bang$kar$600038
4,abc,40000,a$b$c,pf#500$epf#200,bhopal$MP$452013
5,def,3000,d$f,pf#500,Indore$MP$452014

hive> create table arr_map_struct_tab (id int, name string, salary bigint, sub array<string>, details map<string, int>, address struct<city:string, state:string, pin:int>)
> row format delimited                                                                                                                                                  
> fields terminated by ','
> collection items terminated by '$'                                                                                                         > map keys terminated by #';
OK
Time taken: 4.982 seconds
hive> describe arr_map_struct_tab;
OK
id                      int
name                    string
salary                  bigint
sub                     array<string>
details                 map<string,int>
address                 struct<city:string,state:string,pin:int>
Time taken: 1.416 seconds, Fetched: 6 row(s)
hive> load data local inpath 'structfile' into table arr_map_struct_tab;
hive> select * from arr_map_struct_tab;
OK
1       abc     40000   ["a","b","c"]   {"pf":500,"epf":200}    {"city":"hyd","state":"ap","pin":500001}
2       def     3000    ["d","f"]       {"pf":500}      {"city":"bang","state":"kar","pin":600038}
4       abc     40000   ["a","b","c"]   {"pf":500,"epf":200}    {"city":"bhopal","state":"MP","pin":452013}
5       def     3000    ["d","f"]       {"pf":500}      {"city":"Indore","state":"MP","pin":452014}
Time taken: 1.226 seconds, Fetched: 4 row(s)
hive> select address.city from arr_map_struct_tab where details["pf"]="500" and sub[0]="a";
OK
hyd
bhopal
Time taken: 20.286 seconds, Fetched: 2 row(s)



  • Uniontype
hive> CREATE TABLE union_tab(col1 UNIONTYPE<INT, DOUBLE, STRING, ARRAY<string>, STRUCT<a:INT,b:string>>)
    > row format delimited
    > fields terminated by ','
    > COLLECTION ITEMS TERMINATED BY '|'
    > LINES TERMINATED BY '\n';
OK
Time taken: 2.356 seconds
$ vi unionfile
0|1
0|12
1|1.234
1|2.3456
2|dinesh
2|Dinesh Sachdev
hive> load data local inpath 'unionfile' overwrite into table union_tab;
hive> select * from union_tab;
OK
{0:1}
{0:12}
{1:1.234}
{1:2.3456}
{2:"dinesh"}
{2:"Dinesh Sachdev"}
Time taken: 1.211 seconds, Fetched: 6 row(s)
It becomes quiet simple to load data into uniontype for primitives. But what about complex types? For example if we edit ‘unionfile’ and append an array:
$vi unionfile
0|1
0|12
1|1.234
1|2.3456
2|dinesh
2|Dinesh Sachdev
3|din|esh|sach|dev
hive> load data local inpath 'unionfile' overwrite into table union_tab;
hive> select * from union_tab;
OK
{0:1}
{0:12}
{1:1.234}
{1:2.3456}
{2:"dinesh"}
{2:"Dinesh Sachdev"}
{3:["din|esh|sach|dev"]}
Time taken: 1.11 seconds, Fetched: 7 row(s)

There is only a single element in array whereas we expected to have array of 4 strings [“din”,”esh”,”sach”,”dev”]

For this we can use create_union UDF:

hive> insert into table union_tab
    > select create_union(4,1, 1.02,"d", array("d","f"), named_struct('a',1, 'b','dinesh')) from sample_07 limit 1;
...
...
hive> insert into table union_tab
    > select create_union(3,1, 1.02,"d", array("d","f"), named_struct('a',1, 'b','dinesh')) from sample_07 limit 1;
hive> select * from union_tab;
OK
{4:{"a":1,"b":"dinesh"}}
{3:["d","f"]}
{0:1}
{0:12}
{1:1.234}
{1:2.3456}
{2:"dinesh"}
{2:"Dinesh Sachdev"}
{3:["din|esh|sach|dev"]}
Time taken: 0.064 seconds, Fetched: 9 row(s)


Comments

  1. boss, how can I query an union? suppose say I want to say "select from union_tab where "

    ReplyDelete

Post a Comment