Exploring Blockchain and Big Data with Alibaba Cloud Data Lake Analytics

Experimental Data Set

Data Acquisition Process

Step 1: Set up and sync an Ethereum full node.

# Download the geth (go-ethereum) v1.8.12 Linux binary archive.
wget  https://gethstore.blob.core.windows.net/builds/geth-linux-amd64-1.8.12-37685930.tar.gz
# NOTE(review): the archive presumably needs to be extracted (tar -xzf) before
# ./geth can be run — confirm the missing step against the original walkthrough.
# Run the node in the background: chain data under ./data/, 1024 MB cache.
nohup ./geth --datadir ./data/ --cache=1024 &
# Attach an interactive JavaScript console to the running node over its IPC socket.
./geth attach ipc:./data/geth.ipc
# Inside the attached console: check the node's block-sync status.
eth.syncing

Step 2: Use the third-party ethereum-etl tool to export the Ethereum chain data into CSV files.

# Fetch the ethereum-etl exporter, which dumps on-chain data to CSV files.
git clone https://github.com/medvedev1088/ethereum-etl
# Enable the EPEL and IUS repositories so Python 3.6 packages are available on CentOS 7.
yum install epel-release -y
yum install https://centos7.iuscommunity.org/ius-release.rpm -y
yum install python36u -y
# Expose the versioned interpreter and pip under the generic python3/pip3 names.
ln -s /bin/python3.6 /bin/python3
yum install python36u-pip -y
ln -s /bin/pip3.6 /bin/pip3
# Install the exporter's Python dependencies.
cd ethereum-etl
pip3 install -r requirements.txt
# Export blocks 0..5775105 in batches of 100000 (-b), reading from the local
# node's IPC socket (-i) and writing CSVs under ./output (-o); run detached.
nohup bash export_all.sh -s 0 -e 5775105 -b 100000 -i data/geth.ipc -o output &

Step 3: Upload the exported CSV files to Alibaba Cloud OSS (Object Storage Service).

Using Data Lake Analytics to Create Tables

-- Create the schema that will hold the Ethereum ETL tables, then switch to it.
CREATE SCHEMA ethereumetl;
USE ethereumetl;
-- External table over the block-level CSVs exported by ethereum-etl.
-- The files live in OSS, laid out by the exporter's block-range directories,
-- which become the (start_block, end_block) partitions.
CREATE EXTERNAL TABLE IF NOT EXISTS blocks (
    block_number            BIGINT,
    block_hash              STRING,
    block_parent_hash       STRING,
    block_nonce             STRING,
    block_sha3_uncles       STRING,
    block_logs_bloom        STRING,
    block_transactions_root STRING,
    block_state_root        STRING,
    block_miner             STRING,
    -- DECIMAL(38,0): difficulty values can exceed the BIGINT range.
    block_difficulty        DECIMAL(38,0),
    block_total_difficulty  DECIMAL(38,0),
    block_size              BIGINT,
    block_extra_data        STRING,
    block_gas_limit         BIGINT,
    block_gas_used          BIGINT,
    block_timestamp         BIGINT,
    block_transaction_count BIGINT
)
PARTITIONED BY (start_block BIGINT, end_block BIGINT)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
    'serialization.format' = ',',
    'field.delim' = ',',
    'escape.delim' = '\\'
)
STORED AS TEXTFILE
LOCATION 'oss://ethblock/export/blocks'
TBLPROPERTIES (
    -- Each exported CSV starts with a header row; skip it when reading.
    'skip.header.line.count' = '1'
);
-- Register the start_block/end_block partition directories already present in OSS.
MSCK REPAIR TABLE blocks;
-- External table over the transaction-level CSVs exported by ethereum-etl,
-- partitioned the same way as the blocks table.
CREATE EXTERNAL TABLE IF NOT EXISTS transactions (
    tx_hash         STRING,
    tx_nonce        BIGINT,
    tx_block_hash   STRING,
    tx_block_number BIGINT,
    tx_index        BIGINT,
    tx_from         STRING,
    tx_to           STRING,
    -- DECIMAL(38,0): transfer values are denominated in wei and can exceed BIGINT.
    tx_value        DECIMAL(38,0),
    tx_gas          BIGINT,
    tx_gas_price    BIGINT,
    tx_input        STRING
)
PARTITIONED BY (start_block BIGINT, end_block BIGINT)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
    'serialization.format' = ',',
    'field.delim' = ',',
    'escape.delim' = '\\'
)
STORED AS TEXTFILE
LOCATION 'oss://ethblock/export/transactions'
TBLPROPERTIES (
    -- Each exported CSV starts with a header row; skip it when reading.
    'skip.header.line.count' = '1'
);
-- Register the partition directories already present in OSS.
MSCK REPAIR TABLE transactions;
-- External table over the ERC20 token-transfer CSVs exported by ethereum-etl,
-- partitioned the same way as the blocks and transactions tables.
CREATE EXTERNAL TABLE IF NOT EXISTS erc20_transfers (
    erc20_token        STRING,
    erc20_from         STRING,
    erc20_to           STRING,
    -- DECIMAL(38,0): raw token amounts in base units can exceed BIGINT.
    erc20_value        DECIMAL(38,0),
    erc20_tx_hash      STRING,
    erc20_log_index    BIGINT,
    erc20_block_number BIGINT
)
PARTITIONED BY (start_block BIGINT, end_block BIGINT)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
    'serialization.format' = ',',
    'field.delim' = ',',
    'escape.delim' = '\\'
)
STORED AS TEXTFILE
LOCATION 'oss://ethblock/export/erc20_transfers'
TBLPROPERTIES (
    -- Each exported CSV starts with a header row; skip it when reading.
    'skip.header.line.count' = '1'
);
-- Register the partition directories already present in OSS.
MSCK REPAIR TABLE erc20_transfers;
select * from blocks limit 10;

Using Data Lake Analytics

Direct query

SELECT count(distinct(erc20_token)) FROM erc20_transfers

Collaboration with BI

-- Total number of transactions per calendar month, for the BI dashboard.
-- The grouping key is derived inline from the block's Unix timestamp, so no
-- intermediate derived table is needed.
SELECT
    SUM(block_transaction_count) AS transaction_count,
    DATE_FORMAT(FROM_UNIXTIME(block_timestamp), '%Y-%m') AS fetch_month
FROM blocks
GROUP BY DATE_FORMAT(FROM_UNIXTIME(block_timestamp), '%Y-%m')
ORDER BY fetch_month

Data mining

-- Draw a random sample of 50,000 (from, to) address pairs from the first
-- 300,000 blocks for graph-mining experiments. ORDER BY rand() shuffles the
-- whole candidate set before LIMIT — costly on large tables, but it yields an
-- unbiased sample.
SELECT tx_from, tx_to
FROM transactions
WHERE tx_block_number < 300000
ORDER BY rand()
LIMIT 50000

Conclusion

References

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store