Processing OSS Data Files in Different Formats Using Data Lake Analytics

Storage Format and SerDe

CREATE EXTERNAL TABLE nation (
N_NATIONKEY INT,
N_NAME STRING,
N_REGIONKEY INT,
N_COMMENT STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION 'oss://test-bucket-julian-1/tpch_100m/nation';
mysql> show create table nation;
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Result |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| CREATE EXTERNAL TABLE `nation`(
`n_nationkey` int,
`n_name` string,
`n_regionkey` int,
`n_comment` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='|',
'serialization.format'='|')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'oss://test-bucket-julian-1/tpch_100m/nation'
TBLPROPERTIES (
'COLUMN_STATS_ACCURATE'='false',
'numFiles'='1',
'numRows'='-1',
'rawDataSize'='-1',
'totalSize'='2224',
'transient_lastDdlTime'='1528440011')
|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
1 row in set (1.81 sec)

Examples

CSV Files

Beijing,China,010
ShangHai,China,021
Tianjin,China,022
CREATE EXTERNAL TABLE city (
city STRING,
country STRING,
code INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 'oss://bucket-for-testing/oss/text/cities';
  1. You can specify the field separator, field content quote character, and escape character for a row field, for example, WITH SERDEPROPERTIES (“separatorChar” = “,”, “quoteChar” = “`”, “escapeChar” = “” ).
  2. Row separators inside a field are not supported.
  3. All fields are of the STRING type.
  4. You can use a function in the SQL statement to convert data of other types and then process the data.
CREATE EXTERNAL TABLE test_csv_opencsvserde (
id STRING,
name STRING,
location STRING,
create_date STRING,
create_timestamp STRING,
longitude STRING,
latitude STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties(
"separatorChar"=",",
"quoteChar"="\"",
"escapeChar"="\\"
)
STORED AS TEXTFILE LOCATION 'oss://test-bucket-julian-1/test_csv_serde_1';
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
ESCAPED BY '\\'
LINES TERMINATED BY '\n'

TSV Files

Beijing    China    010
ShangHai China 021
Tianjin China 022
CREATE EXTERNAL TABLE city (
city STRING,
country STRING,
code INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION 'oss://bucket-for-testing/oss/text/cities';

Files with Multi-Character Data Field Separators

ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.MultiDelimitSerDe'
with serdeproperties(
"field.delim"="||"
)
CREATE EXTERNAL TABLE test_csv_multidelimit (
id STRING,
name STRING,
location STRING,
create_date STRING,
create_timestamp STRING,
longitude STRING,
latitude STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.MultiDelimitSerDe'
with serdeproperties(
"field.delim"="||"
)
STORED AS TEXTFILE LOCATION 'oss://bucket-for-testing/oss/text/cities/';

JSON Files

{"id": 123, "name": "jack", 
"c3": "2001-02-03 12:34:56"}
{"id": 456, "name": "rose", "c3": "1906-04-18 05:12:00"}
{"id": 789, "name": "tom", "c3": "2001-02-03 12:34:56"}
{"id": 234, "name": "alice", "c3": "1906-04-18 05:12:00"}
{"id": 123, "name": "jack", "c3": "2001-02-03 12:34:56"}
{"id": 456, "name": "rose", "c3": "1906-04-18 05:12:00"}
{"id": 789, "name": "tom", "c3": "2001-02-03 12:34:56"}
{"id": 234, "name": "alice", "c3": "1906-04-18 05:12:00"}
CREATE EXTERNAL TABLE t1 (id int, name string, c3 timestamp)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
LOCATION 'oss://path/to/t1/directory';
{       "DocId": "Alibaba",         "User_1": {             "Id": 1234,             "Username": "bob1234",          "Name": "Bob",          "ShippingAddress": {                    "Address1": "969 Wenyi West St.",                     "Address2": null,                       "City": "Hangzhou",                      "Province": "Zhejiang"           },              "Orders": [{                            "ItemId": 6789,                                 "OrderDate": "11/11/2017"                       },                      {                               "ItemId": 4352,                                 "OrderDate": "12/12/2017"                       }               ]       } }
{
"DocId": "Alibaba",
"User_1": {
"Id": 1234,
"Username": "bob1234",
"Name": "Bob",
"ShippingAddress": {
"Address1": "969 Wenyi West St.",
"Address2": null,
"City": "Hangzhou",
"Province": "Zhejiang"
},
"Orders": [
{
"ItemId": 6789,
"OrderDate": "11/11/2017"
},
{
"ItemId": 4352,
"OrderDate": "12/12/2017"
}
]
}
}
CREATE EXTERNAL TABLE json_table_1 (
docid string,
user_1 struct<
id:INT,
username:string,
name:string,
shippingaddress:struct<
address1:string,
address2:string,
city:string,
province:string
>,
orders:array<
struct<
itemid:INT,
orderdate:string
>
>
>
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
LOCATION 'oss://xxx/test/json/hcatalog_serde/table_1/';
select * from json_table_1;+---------+----------------------------------------------------------------------------------------------------------------+
| docid | user_1 |
+---------+----------------------------------------------------------------------------------------------------------------+
| Alibaba | [1234, bob1234, Bob, [969 Wenyi West St., null, Hangzhou, Zhejiang], [[6789, 11/11/2017], [4352, 12/12/2017]]] |
+---------+----------------------------------------------------------------------------------------------------------------+
select DocId,
User_1.Id,
User_1.ShippingAddress.Address1,
User_1.Orders[1].ItemId
from json_table_1
where User_1.Username = 'bob1234'
and User_1.Orders[2].OrderDate = '12/12/2017';
+---------+------+--------------------+-------+
| DocId | id | address1 | _col3 |
+---------+------+--------------------+-------+
| Alibaba | 1234 | 969 Wenyi West St. | 6789 |
+---------+------+--------------------+-------+
{"data_key":"com.taobao.vipserver.domains.meta.biz.alibaba.com","ts":1524550275112,"value_string":"{\"appName\":\"\",\"apps\":[],\"checksum\":\"50fa0540b430904ee78dff07c7350e1c\",\"clusterMap\":{\"DEFAULT\":{\"defCkport\":80,\"defIPPort\":80,\"healthCheckTask\":null,\"healthChecker\":{\"checkCode\":200,\"curlHost\":\"\",\"curlPath\":\"/status.taobao\",\"type\":\"HTTP\"},\"name\":\"DEFAULT\",\"nodegroup\":\"\",\"sitegroup\":\"\",\"submask\":\"0.0.0.0/0\",\"syncConfig\":{\"appName\":\"trade-ma\",\"nodegroup\":\"tradema\",\"pubLevel\":\"publish\",\"role\":\"\",\"site\":\"\"},\"useIPPort4Check\":true}},\"disabledSites\":[],\"enableArmoryUnit\":false,\"enableClientBeat\":false,\"enableHealthCheck\":true,\"enabled\":true,\"envAndSites\":\"\",\"invalidThreshold\":0.6,\"ipDeleteTimeout\":1800000,\"lastModifiedMillis\":1524550275107,\"localSiteCall\":true,\"localSiteThreshold\":0.8,\"name\":\"biz.alibaba.com\",\"nodegroup\":\"\",\"owners\":[\"junlan.zx\",\"张三\",\"李四\",\"cui.yuanc\"],\"protectThreshold\":0,\"requireSameEnv\":false,\"resetWeight\":false,\"symmetricCallType\":null,\"symmetricType\":\"warehouse\",\"tagName\":\"ipGroup\",\"tenantId\":\"\",\"tenants\":[],\"token\":\"1cf0ec0c771321bb4177182757a67fb0\",\"useSpecifiedURL\":false}"}
{
"data_key": "com.taobao.vipserver.domains.meta.biz.alibaba.com",
"ts": 1524550275112,
"value_string": "{\"appName\":\"\",\"apps\":[],\"checksum\":\"50fa0540b430904ee78dff07c7350e1c\",\"clusterMap\":{\"DEFAULT\":{\"defCkport\":80,\"defIPPort\":80,\"healthCheckTask\":null,\"healthChecker\":{\"checkCode\":200,\"curlHost\":\"\",\"curlPath\":\"/status.taobao\",\"type\":\"HTTP\"},\"name\":\"DEFAULT\",\"nodegroup\":\"\",\"sitegroup\":\"\",\"submask\":\"0.0.0.0/0\",\"syncConfig\":{\"appName\":\"trade-ma\",\"nodegroup\":\"tradema\",\"pubLevel\":\"publish\",\"role\":\"\",\"site\":\"\"},\"useIPPort4Check\":true}},\"disabledSites\":[],\"enableArmoryUnit\":false,\"enableClientBeat\":false,\"enableHealthCheck\":true,\"enabled\":true,\"envAndSites\":\"\",\"invalidThreshold\":0.6,\"ipDeleteTimeout\":1800000,\"lastModifiedMillis\":1524550275107,\"localSiteCall\":true,\"localSiteThreshold\":0.8,\"name\":\"biz.alibaba.com\",\"nodegroup\":\"\",\"owners\":[\"junlan.zx\",\"张三\",\"李四\",\"cui.yuanc\"],\"protectThreshold\":0,\"requireSameEnv\":false,\"resetWeight\":false,\"symmetricCallType\":null,\"symmetricType\":\"warehouse\",\"tagName\":\"ipGroup\",\"tenantId\":\"\",\"tenants\":[],\"token\":\"1cf0ec0c771321bb4177182757a67fb0\",\"useSpecifiedURL\":false}"
}
CREATE external TABLE json_table_2 (
data_key string,
ts bigint,
value_string string
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
LOCATION 'oss://xxx/test/json/hcatalog_serde/table_2/';
select * from json_table_2;+---------------------------------------------------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
+---------------------------------------------------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| com.taobao.vipserver.domains.meta.biz.alibaba.com | 1524550275112 | {"appName":"","apps":[],"checksum":"50fa0540b430904ee78dff07c7350e1c","clusterMap":{"DEFAULT":{"defCkport":80,"defIPPort":80,"healthCheckTask":null,"healthChecker":{"checkCode":200,"curlHost":"","curlPath":"/status.taobao","type":"HTTP"},"name":"DEFAULT","nodegroup":"","sitegroup":"","submask":"0.0.0.0/0","syncConfig":{"appName":"trade-ma","nodegroup":"tradema","pubLevel":"publish","role":"","site":""},"useIPPort4Check":true}},"disabledSites":[],"enableArmoryUnit":false,"enableClientBeat":false,"enableHealthCheck":true,"enabled":true,"envAndSites":"","invalidThreshold":0.6,"ipDeleteTimeout":1800000,"lastModifiedMillis":1524550275107,"localSiteCall":true,"localSiteThreshold":0.8,"name":"biz.alibaba.com","nodegroup":"","owners":["junlan.zx","张三","李四","cui.yuanc"],"protectThreshold":0,"requireSameEnv":false,"resetWeight":false,"symmetricCallType":null,"symmetricType":"warehouse","tagName":"ipGroup","tenantId":"","tenants":[],"token":"1cf0ec0c771321bb4177182757a67fb0","useSpecifiedURL":false} |
+---------------------------------------------------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
mysql> select json_extract_scalar(json_parse(value), '$.owners[1]') from json_table_2;+--------+
| _col0 |
+--------+
| John |
+--------+
mysql> select json_extract_scalar(json_obj.json_col, '$.DEFAULT.submask')
from (
select json_extract(json_parse(value), '$.clusterMap') as json_col from json_table_2
) json_obj
where json_extract_scalar(json_obj.json_col, '$.DEFAULT.healthChecker.curlPath') = '/status.taobao';
+-----------+
| _col0 |
+-----------+
| 0.0.0.0/0 |
+-----------+
mysql> with json_obj as (select json_extract(json_parse(value), '$.clusterMap') as json_col from json_table_2)
select json_extract_scalar(json_obj.json_col, '$.DEFAULT.submask')
from json_obj
where json_extract_scalar(json_obj.json_col, '$.DEFAULT.healthChecker.curlPath') = '/status.taobao';
+-----------+
| _col0 |
+-----------+
| 0.0.0.0/0 |
+-----------+

ORC Files

CREATE EXTERNAL TABLE orders_orc_date (
O_ORDERKEY INT,
O_CUSTKEY INT,
O_ORDERSTATUS STRING,
O_TOTALPRICE DOUBLE,
O_ORDERDATE DATE,
O_ORDERPRIORITY STRING,
O_CLERK STRING,
O_SHIPPRIORITY INT,
O_COMMENT STRING
)
STORED AS ORC
LOCATION 'oss://bucket-for-testing/datasets/tpch/1x/orc_date/orders_orc';

Parquet Files

CREATE EXTERNAL TABLE orders_parquet_date (
O_ORDERKEY INT,
O_CUSTKEY INT,
O_ORDERSTATUS STRING,
O_TOTALPRICE DOUBLE,
O_ORDERDATE DATE,
O_ORDERPRIORITY STRING,
O_CLERK STRING,
O_SHIPPRIORITY INT,
O_COMMENT STRING
)
STORED AS PARQUET
LOCATION 'oss://bucket-for-testing/datasets/tpch/1x/parquet_date/orders_parquet';

RCFile Files

CREATE EXTERNAL TABLE lineitem_rcfile_date (
L_ORDERKEY INT,
L_PARTKEY INT,
L_SUPPKEY INT,
L_LINENUMBER INT,
L_QUANTITY DOUBLE,
L_EXTENDEDPRICE DOUBLE,
L_DISCOUNT DOUBLE,
L_TAX DOUBLE,
L_RETURNFLAG STRING,
L_LINESTATUS STRING,
L_SHIPDATE DATE,
L_COMMITDATE DATE,
L_RECEIPTDATE DATE,
L_SHIPINSTRUCT STRING,
L_SHIPMODE STRING,
L_COMMENT STRING
)
STORED AS RCFILE
LOCATION 'oss://bucke-for-testing/datasets/tpch/1x/rcfile_date/lineitem_rcfile'

Avro Files

java -jar avro-tools-1.8.2.jar getschema /path/to/your/doctors.avro
{
"type" : "record",
"name" : "doctors",
"namespace" : "testing.hive.avro.serde",
"fields" : [ {
"name" : "number",
"type" : "int",
"doc" : "Order of playing the role"
}, {
"name" : "first_name",
"type" : "string",
"doc" : "first name of actor playing role"
}, {
"name" : "last_name",
"type" : "string",
"doc" : "last name of actor playing role"
} ]
}
CREATE EXTERNAL TABLE doctors(
number int,
first_name string,
last_name string)
STORED AS AVRO
LOCATION 'oss://mybucket-for-testing/directory/to/doctors';

Files Matching Regular Expressions

127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
127.0.0.1 - - [26/May/2009:00:00:00 +0000] "GET /someurl/?track=Blabla(Main) HTTP/1.1" 200 5864 - "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.65 Safari/525.19"
([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?
CREATE EXTERNAL TABLE serde_regex(
host STRING,
identity STRING,
userName STRING,
time STRING,
request STRING,
status STRING,
size INT,
referer STRING,
agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"
)
STORED AS TEXTFILE
LOCATION 'oss://bucket-for-testing/datasets/serde/regex';
mysql> select * from serde_regex;
+-----------+----------+-------+------------------------------+---------------------------------------------+--------+------+---------+--------------------------------------------------------------------------------------------------------------------------+
| host | identity | userName | time | request | status | size | referer | agent |
+-----------+----------+-------+------------------------------+---------------------------------------------+--------+------+---------+--------------------------------------------------------------------------------------------------------------------------+
| 127.0.0.1 | - | frank | [10/Oct/2000:13:55:36 -0700] | "GET /apache_pb.gif HTTP/1.0" | 200 | 2326 | NULL | NULL |
| 127.0.0.1 | - | - | [26/May/2009:00:00:00 +0000] | "GET /someurl/?track=Blabla(Main) HTTP/1.1" | 200 | 5864 | - | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.65 Safari/525.19" |
+-----------+----------+-------+------------------------------+---------------------------------------------+--------+------+---------+--------------------------------------------------------------------------------------------------------------------------+

Geographical JSON Data Files of Esri ArcGIS

CREATE EXTERNAL TABLE IF NOT EXISTS california_counties
(
Name string,
BoundaryShape binary
)
ROW FORMAT SERDE 'com.esri.hadoop.hive.serde.JsonSerde'
STORED AS INPUTFORMAT 'com.esri.json.hadoop.EnclosedJsonInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'oss://test_bucket/datasets/geospatial/california-counties/'

Conclusion

--

--

--

Follow me to keep abreast with the latest technology news, industry insights, and developer trends. Alibaba Cloud website:https://www.alibabacloud.com

Love podcasts or audiobooks? Learn on the go with our new app.

Recommended from Medium

Building the custom Edge Impulse Thingy91 Firmware and Connecting to MQTT

Floating-Point Rounding Problem

Three Years of Web Development Career— Takeaways

Upload to Google Drive from CLI | Python3 | Drive API v3

[Git] Collaboration with remote side

The Hidden Synchronized Keyword With a Static Block

Uploading array of images using multipart/form-data in swift.

Some Talk About Flutter

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Alibaba Cloud

Alibaba Cloud

Follow me to keep abreast with the latest technology news, industry insights, and developer trends. Alibaba Cloud website:https://www.alibabacloud.com

More from Medium

FluentD webhook with TLS Mutual Authentication

Apache Airflow: Write your first DAG in Apache Airflow

How to succeed and twice fail for AWS Glue Developer Endpoint and Apache Zeppelin

Gilbane Advisor 4–27–22 — SQL limits, Wasm opps