Data Sampling in PostgreSQL

Image for post
Image for post

Background

The online database is large. Therefore, we usually build a test database using data sampling. In addition to evenly extracting data, data sampling also encrypts data by hiding or encrypting some sensitive fields.

Example

Use functions for data sampling in PostgreSQL versions earlier than 9.5. PostgreSQL v.9.5 and later versions provide the SQL syntax for data sampling.

create or replace function samp_rows(nsp name, rel name, cnt int8, dist float8 default 100.0) returns setof record as $$
declare
pages int8;
tups float8;
begin
if dist<0 or dist>100 then
raise notice 'dist must between 0 and 100';
return;
end if;
select relpages,reltuples into pages,tups from pg_class where oid=(quote_ident(nsp)||'.'||quote_ident(rel))::regclass;
if not found then
return;
end if;
if cnt >= tups then -- 输入的采样记录数大于实际的记录数, 直接返回全表
-- ctid可以反映采样的随机度, 仅用于观察, 实际使用中可以不返回ctid
return query execute 'select ctid, * from '||quote_ident(nsp)||'.'||quote_ident(rel)||' t limit '||cnt;
else
-- 如果采样的记录数小于实际的记录数,则根据随机数的概率来返回采样结果 cnt/tups 即全离散时的采样概率
-- 离散度0-100,越大表示采样的结果越离散,可以理解为随机采样时覆盖的数据面,0表示连续采样,100则表示在所有数据中随机采样。
if dist = 100.0 then
return query execute 'select ctid, * from '||quote_ident(nsp)||'.'||quote_ident(rel)||' where random() <= '||cnt/tups||' limit '||cnt;
elsif (dist/100.0) <= (cnt/tups) then
return query execute 'select ctid, * from '||quote_ident(nsp)||'.'||quote_ident(rel)||' limit '||cnt;
else
return query execute 'select ctid, * from '||quote_ident(nsp)||'.'||quote_ident(rel)||' where random() <= '||(cnt/tups) * (100.0/dist)||' limit '||cnt;
end if;
end if;
end;
$$ language plpgsql strict;
postgres=# create table test(id int, info text, crt_time timestamp);
CREATE TABLE
Time: 2.522 ms
postgres=# insert into test select generate_series(1,10000000), md5(random()::text), now();
INSERT 0 10000000
Time: 46274.872 ms
postgres=# select * from samp_rows('public', 'test', 10) as t(ctid tid, c1 int, c2 text, c3 timestamp);
ctid | c1 | c2 | c3
------------+---------+----------------------------------+----------------------------
(88846,70) | 9506592 | 83f2ed9c48f5c850a80b09219f8ef81d | 2016-09-29 23:59:20.619389
(2223,92) | 237953 | 2bd148a78ba8681150494f7beada073f | 2016-09-29 23:59:20.619389
(4675,95) | 500320 | 9588232acc62ce109ac91a28744ff75a | 2016-09-29 23:59:20.619389
(11396,86) | 1219458 | 38cb4f0b0f8e3c4d2c867c52aa9e9276 | 2016-09-29 23:59:20.619389
(12434,77) | 1330515 | d02e3b81543895f142f706b45589916c | 2016-09-29 23:59:20.619389
(20625,14) | 2206889 | 989f65dba839e3ad0d404b33b3dcbc54 | 2016-09-29 23:59:20.619389
(46954,59) | 5024137 | 71b84624b57271d224b368d141267f40 | 2016-09-29 23:59:20.619389
(50156,4) | 5366696 | fa96056fa5e8a914a999921d21f89c1d | 2016-09-29 23:59:20.619389
(50533,42) | 5407073 | 988dd648caabeb2bbdb9700ab4c39e85 | 2016-09-29 23:59:20.619389
(52526,59) | 5620341 | a02eb2e3499b985e719f61eb4f2d44f3 | 2016-09-29 23:59:20.619389
(10 rows)
Time: 997.933 ms
postgres=# select * from samp_rows('public', 'test', 10, 20.0) as t(ctid tid, c1 int, c2 text, c3 timestamp);
ctid | c1 | c2 | c3
------------+---------+----------------------------------+----------------------------
(53248,74) | 5697610 | c833355249d72dd7c7679d1e857106ec | 2016-09-29 23:59:20.619389
(56404,63) | 6035291 | 0b43555f5e06684a546c61e04bf31ead | 2016-09-29 23:59:20.619389
(56643,61) | 6060862 | 205d8cead5fd828f4b7898c9533f057c | 2016-09-29 23:59:20.619389
(57482,15) | 6150589 | 88a6ea385f972745611fcc9a7ac63620 | 2016-09-29 23:59:20.619389
(63422,49) | 6786203 | f7615ebbecafa6e7f2590980e97dc21e | 2016-09-29 23:59:20.619389
(65723,98) | 7032459 | 1c7fc819d2990470608f24cfcf06f841 | 2016-09-29 23:59:20.619389
(73309,84) | 7844147 | 840dac30c69394c8faf1ce280cc26657 | 2016-09-29 23:59:20.619389
(73606,48) | 7875890 | 0115573ad12642d1a39a1e28a0c138a8 | 2016-09-29 23:59:20.619389
(75288,94) | 8055910 | f4cb320309a5abc29d5413c631643304 | 2016-09-29 23:59:20.619389
(76649,65) | 8201508 | 422364b88440b9ed38e4327080998f8b | 2016-09-29 23:59:20.619389
(10 rows)
Time: 383.163 ms
postgres=# select * from samp_rows('public', 'test', 10, 1.0) as t(ctid tid, c1 int, c2 text, c3 timestamp);
ctid | c1 | c2 | c3
------------+---------+----------------------------------+----------------------------
(76660,85) | 8202705 | ecbbaeb19c9e223b18de807e3a891704 | 2016-09-29 23:59:20.619389
(76684,16) | 8205204 | a1a7c251d0751d0d40005a82af330357 | 2016-09-29 23:59:20.619389
(76703,42) | 8207263 | be9701285b99d2c76efe3362c27b7b4c | 2016-09-29 23:59:20.619389
(77124,49) | 8252317 | b9ddff0fb85b02e6b1eebfce7938a791 | 2016-09-29 23:59:20.619389
(77301,49) | 8271256 | 2b32b83b118fb8560d41d8841b7707ec | 2016-09-29 23:59:20.619389
(77321,44) | 8273391 | 844555ec12e0ff82581abd37302b851c | 2016-09-29 23:59:20.619389
(77390,99) | 8280829 | 4d84ac5f7b957e66f1d9a5c05411b064 | 2016-09-29 23:59:20.619389
(77464,97) | 8288745 | 66080fa16f99f27aa5392615371b8c10 | 2016-09-29 23:59:20.619389
(77534,88) | 8296226 | cfed15f4f05b1af7a4d92ddc759202b0 | 2016-09-29 23:59:20.619389
(77671,95) | 8310892 | 0070304cbe99722705a1b0c471c98132 | 2016-09-29 23:59:20.619389
(10 rows)
Time: 17.056 ms
create or replace function samp_rows(nsp name, rel name, cnt int8, dist float8 default 100.0) returns setof record as $$
declare
pages int8;
tups float8;
begin
if dist<0 or dist>100 then
raise notice 'dist must between 0 and 100';
return;
end if;
select relpages,reltuples into pages,tups from pg_class where oid=(quote_ident(nsp)||'.'||quote_ident(rel))::regclass;
if not found then
return;
end if;
if cnt >= tups then
return query execute 'select * from '||quote_ident(nsp)||'.'||quote_ident(rel)||' t limit '||cnt;
else
-- 如果采样的记录数小于实际的记录数,则根据随机数的概率来返回采样结果 cnt/tups 即全离散时的采样概率
-- 离散度0-100,越大表示采样的结果越离散,可以理解为随机采样时覆盖的数据面,0表示连续采样,100则表示在所有数据中随机采样。
if dist = 100.0 then
return query execute 'select * from '||quote_ident(nsp)||'.'||quote_ident(rel)||' where random() <= '||cnt/tups||' limit '||cnt;
elsif (dist/100.0) <= (cnt/tups) then
return query execute 'select * from '||quote_ident(nsp)||'.'||quote_ident(rel)||' limit '||cnt;
else
return query execute 'select * from '||quote_ident(nsp)||'.'||quote_ident(rel)||' where random() <= '||(cnt/tups) * (100.0/dist)||' limit '||cnt;
end if;
end if;
end;
$$ language plpgsql strict;
postgres=# select * from samp_rows('public', 'test', 10, 1.0) as t(c1 int, c2 text, c3 timestamp);
c1 | c2 | c3
---------+----------------------------------+----------------------------
8510389 | aa866b8fecfa5dc73e25df0eb8eb6e5d | 2016-09-29 23:59:20.619389
8511823 | 3f637e7b4c2dc3ca72dec979d25d8945 | 2016-09-29 23:59:20.619389
8542741 | 921dc72741ddcc5100a9204a9e1f67f2 | 2016-09-29 23:59:20.619389
8560000 | 150138e96c00557a2904293a461a6bb6 | 2016-09-29 23:59:20.619389
8571110 | 510bcdc6b4a3108cc7adc901a30932ed | 2016-09-29 23:59:20.619389
8583953 | 555afdc0ca25da10a1fe0c980d6505a5 | 2016-09-29 23:59:20.619389
8590392 | 03a3faf95d397784fa2c2aeccce296cc | 2016-09-29 23:59:20.619389
8601889 | ecab338c982818673b024f5299717c11 | 2016-09-29 23:59:20.619389
8603505 | c176f2e29c5fa0538054f2651cf9c4e3 | 2016-09-29 23:59:20.619389
8620874 | 75ca3ff98cc8040d6ca3f16b6402a5d0 | 2016-09-29 23:59:20.619389
(10 rows)
Time: 15.660 mspostgres=# select * from samp_rows('public', 'test', 10, 20.0) as t(c1 int, c2 text, c3 timestamp);
c1 | c2 | c3
---------+----------------------------------+----------------------------
9217485 | dbe16993b20d736572a926bf1df05aea | 2016-09-29 23:59:20.619389
9327154 | cf6414a9a49f0fa41e0465fa2d015054 | 2016-09-29 23:59:20.619389
9664326 | 84c1f8ad26228c55b20085f80b2b3292 | 2016-09-29 23:59:20.619389
9667704 | 1da59e4572a09456408d65e7babbe8e4 | 2016-09-29 23:59:20.619389
221144 | 990aa3f26c8db1b8a9a48471a515b65f | 2016-09-29 23:59:20.619389
443124 | 974d539de407b7f4824510992cb71ef3 | 2016-09-29 23:59:20.619389
500254 | f8a70b3b0b2920618b17d63043444fbb | 2016-09-29 23:59:20.619389
952485 | bef93b6c48a2f74c7a7cde5af1153ccb | 2016-09-29 23:59:20.619389
1494854 | 7ed5e5873bcc7e4e12a6a424d39ac755 | 2016-09-29 23:59:20.619389
1557716 | 1c79f1f3d0026b47a7470d700cb0baad | 2016-09-29 23:59:20.619389
(10 rows)

Original Source:

Follow me to keep abreast with the latest technology news, industry insights, and developer trends.

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store