Store Operations Optimization: Search Acceleration over PostgreSQL Arrays, JSON and Internal Tag Data

Background

The PostgreSQL arrays, JSON data, and other data types offer a lot of convenience to sellers for managing store operations. However, the optimization of these data types becomes increasingly complicated. For example, if a seller stores the information of some products in the database and attaches multiple tags in JSON format to each product. Then, the data structure may appear as follows.

create table js(  
gid int, -- 店铺ID
item int, -- 商品ID
prop jsonb[] -- 商品标签, json数组
);
  • Each JSON array contains multiple IDs.
  • Each ID corresponds to a score.
1, 1, ['{"id":10, "score":80}', '{"id":11, "score":70}', '{"id":21, "score":60}', .....]   
1, 2, ['{"id":11, "score":50}', '{"id":13, "score":30}', '{"id":21, "score":80}', .....]
.....
select * from js where gid=? and prop包含id=? 并且score between ? and ?的记录。
create or replace function get_res(  
jsonb[], -- 输入的JSONB数组
int, -- id
int, -- score最小值
int -- score最大值
) returns boolean as $$
declare
v_id int;
v_score int;
v_js jsonb;
begin
foreach v_js in array $1 loop
if (v_js->>'id')::int = $2 and (v_js->>'score')::float4 between $3 and $4 then
return true;
end if;
end loop;
return false;
end;
$$ language plpgsql strict;
select * from js where gid=? and get_res(prop, ?, ?, ?);

Performance Optimization Ideas

The critical question is how to accurately perform index retrieval on data and improve the performance when a known property has a maximum of N JSON elements.

select * from js where   
(gid=? and prop[1]->>'id'=? and prop[1]->>'score' between x and x) or -- 每个JSON元素对应一颗树
(gid=? and prop[2]->>'id'=? and prop[2]->>'score' between x and x) or -- 每个JSON元素对应一颗树
...
select * from js where   
(gid=? and prop[1]->>'id'=? and prop[1]->>'score' between x and x)
union all
select * from js where
(gid=? and prop[2]->>'id'=? and prop[2]->>'score' between x and x)
union all
...
;

Optimization Method

Follow the steps listed below to optimize the performance:

create index idx_js_1 on js (gid, ((prop[1]->>'id')::int), ((prop[1]->>'score')::float4));  
create index idx_js_2 on js (gid, ((prop[2]->>'id')::int), ((prop[2]->>'score')::float4));
create index idx_js_3 on js (gid, ((prop[3]->>'id')::int), ((prop[3]->>'score')::float4));
create index idx_js_4 on js (gid, ((prop[4]->>'id')::int), ((prop[4]->>'score')::float4));
create index idx_js_5 on js (gid, ((prop[5]->>'id')::int), ((prop[5]->>'score')::float4));
create index idx_js_6 on js (gid, ((prop[6]->>'id')::int), ((prop[6]->>'score')::float4));
create index idx_js_7 on js (gid, ((prop[7]->>'id')::int), ((prop[7]->>'score')::float4));
create index idx_js_8 on js (gid, ((prop[8]->>'id')::int), ((prop[8]->>'score')::float4));
create index idx_js_9 on js (gid, ((prop[9]->>'id')::int), ((prop[9]->>'score')::float4));
create index idx_js_10 on js (gid, ((prop[10]->>'id')::int), ((prop[10]->>'score')::float4));
create or replace function get_js(  
int, -- loops, 一个prop里多少个json元素
int, -- gid
int, -- json id
int, -- json score 最小值
int -- json score 最大值
) returns setof js as $$
declare
sql text := 'select * from js where ';
begin
for i in 1..$1 loop
sql := format($_$ %s (gid=%s and (prop[%s]->>'id')::int=%s and (prop[%s]->>'score')::float4 between %s and %s) union all select * from js where $_$, sql, $2, i, $3, i, $4, $5);
end loop;
sql := rtrim(sql, 'union all select * from js where ');
-- raise notice '%', sql;
return query execute sql;
end;
$$ language plpgsql strict;
postgres=# select * from get_js(10,1,1,10,20);  
gid | item | prop
-----+------+------
(0 rows)
postgres=# explain select * from js where  (gid=1 and (prop[1]->>'id')::int=1 and (prop[1]->>'score')::float4 between 10 and 20) union all select * from js where  (gid=1 and (prop[2]->>'id')::int=1 and (prop[2]->>'score')::float4 between 10 and 20) union all select * from js where  (gid=1 and (prop[3]->>'id')::int=1 and (prop[3]->>'score')::float4 between 10 and 20) union all select * from js where  (gid=1 and (prop[4]->>'id')::int=1 and (prop[4]->>'score')::float4 between 10 and 20) union all select * from js where  (gid=1 and (prop[5]->>'id')::int=1 and (prop[5]->>'score')::float4 between 10 and 20) union all select * from js where  (gid=1 and (prop[6]->>'id')::int=1 and (prop[6]->>'score')::float4 between 10 and 20) union all select * from js where  (gid=1 and (prop[7]->>'id')::int=1 and (prop[7]->>'score')::float4 between 10 and 20) union all select * from js where  (gid=1 and (prop[8]->>'id')::int=1 and (prop[8]->>'score')::float4 between 10 and 20) union all select * from js where  (gid=1 and (prop[9]->>'id')::int=1 and (prop[9]->>'score')::float4 between 10 and 20) union all select * from js where  (gid=1 and (prop[10]->>'id')::int=1 and (prop[10]->>'score')::float4 between 10 and 20);  
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------------
Append (cost=0.15..21.87 rows=10 width=40)
-> Index Scan using idx_js_1 on js (cost=0.15..2.18 rows=1 width=40)
Index Cond: ((gid = 1) AND (((prop[1] ->> 'id'::text))::integer = 1) AND (((prop[1] ->> 'score'::text))::real >= '10'::double precision) AND (((prop[1] ->> 'score'::text))::real <= '20'::doubleprecision))
-> Index Scan using idx_js_2 on js js_1 (cost=0.15..2.18 rows=1 width=40)
Index Cond: ((gid = 1) AND (((prop[2] ->> 'id'::text))::integer = 1) AND (((prop[2] ->> 'score'::text))::real >= '10'::double precision) AND (((prop[2] ->> 'score'::text))::real <= '20'::doubleprecision))
-> Index Scan using idx_js_3 on js js_2 (cost=0.15..2.18 rows=1 width=40)
Index Cond: ((gid = 1) AND (((prop[3] ->> 'id'::text))::integer = 1) AND (((prop[3] ->> 'score'::text))::real >= '10'::double precision) AND (((prop[3] ->> 'score'::text))::real <= '20'::doubleprecision))
-> Index Scan using idx_js_4 on js js_3 (cost=0.15..2.18 rows=1 width=40)
Index Cond: ((gid = 1) AND (((prop[4] ->> 'id'::text))::integer = 1) AND (((prop[4] ->> 'score'::text))::real >= '10'::double precision) AND (((prop[4] ->> 'score'::text))::real <= '20'::doubleprecision))
-> Index Scan using idx_js_5 on js js_4 (cost=0.15..2.18 rows=1 width=40)
Index Cond: ((gid = 1) AND (((prop[5] ->> 'id'::text))::integer = 1) AND (((prop[5] ->> 'score'::text))::real >= '10'::double precision) AND (((prop[5] ->> 'score'::text))::real <= '20'::doubleprecision))
-> Index Scan using idx_js_6 on js js_5 (cost=0.15..2.18 rows=1 width=40)
Index Cond: ((gid = 1) AND (((prop[6] ->> 'id'::text))::integer = 1) AND (((prop[6] ->> 'score'::text))::real >= '10'::double precision) AND (((prop[6] ->> 'score'::text))::real <= '20'::doubleprecision))
-> Index Scan using idx_js_7 on js js_6 (cost=0.15..2.18 rows=1 width=40)
Index Cond: ((gid = 1) AND (((prop[7] ->> 'id'::text))::integer = 1) AND (((prop[7] ->> 'score'::text))::real >= '10'::double precision) AND (((prop[7] ->> 'score'::text))::real <= '20'::doubleprecision))
-> Index Scan using idx_js_8 on js js_7 (cost=0.15..2.18 rows=1 width=40)
Index Cond: ((gid = 1) AND (((prop[8] ->> 'id'::text))::integer = 1) AND (((prop[8] ->> 'score'::text))::real >= '10'::double precision) AND (((prop[8] ->> 'score'::text))::real <= '20'::doubleprecision))
-> Index Scan using idx_js_9 on js js_8 (cost=0.15..2.18 rows=1 width=40)
Index Cond: ((gid = 1) AND (((prop[9] ->> 'id'::text))::integer = 1) AND (((prop[9] ->> 'score'::text))::real >= '10'::double precision) AND (((prop[9] ->> 'score'::text))::real <= '20'::doubleprecision))
-> Index Scan using idx_js_10 on js js_9 (cost=0.15..2.18 rows=1 width=40)
Index Cond: ((gid = 1) AND (((prop[10] ->> 'id'::text))::integer = 1) AND (((prop[10] ->> 'score'::text))::real >= '10'::double precision) AND (((prop[10] ->> 'score'::text))::real <= '20'::doubleprecision))
(21 rows)

Additional Knowledge

If you do not want to use a composite index, use a single column expression index instead.

create index idx_js_1 on js ( (gid||'_'||prop[1]->>'id'||'_'||lpad(prop[1]->>'score',4,'0')) );  
.............
select * from js where (gid||'_'||prop[1]->>'id'||'_'||lpad(prop[1]->>'score',4,'0')) between '1_1_0020' and '1_1_0100'
union all
select * from js where (gid||'_'||prop[2]->>'id'||'_'||lpad(prop[2]->>'score',4,'0')) between '1_1_0020' and '1_1_0100'
union all
....;
postgres=# select '90' > '100';  
?column?
----------
t
(1 row)

postgres=# select '090' > '100';
?column?
----------
f
(1 row)

Performance Improvement Test

Implement the following steps to test the performance.

create table js(  
gid int, -- 店铺ID
item int, -- 商品ID
prop jsonb[], -- 商品标签, json数组
primary key (gid,item)
);
create or replace function ins(int, int) returns void as $$  
declare
begin
insert into js values (
$1, $2,
array[
format('{"id":%s, "score":%s}', (random()*1000)::int, (random()*100)::int)::jsonb,
format('{"id":%s, "score":%s}', (random()*1000)::int, (random()*100)::int)::jsonb,
format('{"id":%s, "score":%s}', (random()*1000)::int, (random()*100)::int)::jsonb,
format('{"id":%s, "score":%s}', (random()*1000)::int, (random()*100)::int)::jsonb,
format('{"id":%s, "score":%s}', (random()*1000)::int, (random()*100)::int)::jsonb,
format('{"id":%s, "score":%s}', (random()*1000)::int, (random()*100)::int)::jsonb,
format('{"id":%s, "score":%s}', (random()*1000)::int, (random()*100)::int)::jsonb,
format('{"id":%s, "score":%s}', (random()*1000)::int, (random()*100)::int)::jsonb,
format('{"id":%s, "score":%s}', (random()*1000)::int, (random()*100)::int)::jsonb,
format('{"id":%s, "score":%s}', (random()*1000)::int, (random()*100)::int)::jsonb
]
) on conflict do nothing;
end;
$$ language plpgsql strict;
vi test.sql
\set gid random(1,10000)
\set item random(1,100000)
select ins(:gid, :item);
pgbench -M prepared -n -r -P 1 -f ./test.sql -c 50 -j 50 -t 20000000
psql -c "create index idx_js_1 on js (gid, ((prop[1]->>'id')::int), ((prop[1]->>'score')::float4));" &  
psql -c "create index idx_js_2 on js (gid, ((prop[2]->>'id')::int), ((prop[2]->>'score')::float4));" &
psql -c "create index idx_js_3 on js (gid, ((prop[3]->>'id')::int), ((prop[3]->>'score')::float4));" &
psql -c "create index idx_js_4 on js (gid, ((prop[4]->>'id')::int), ((prop[4]->>'score')::float4));" &
psql -c "create index idx_js_5 on js (gid, ((prop[5]->>'id')::int), ((prop[5]->>'score')::float4));" &
psql -c "create index idx_js_6 on js (gid, ((prop[6]->>'id')::int), ((prop[6]->>'score')::float4));" &
psql -c "create index idx_js_7 on js (gid, ((prop[7]->>'id')::int), ((prop[7]->>'score')::float4));" &
psql -c "create index idx_js_8 on js (gid, ((prop[8]->>'id')::int), ((prop[8]->>'score')::float4));" &
psql -c "create index idx_js_9 on js (gid, ((prop[9]->>'id')::int), ((prop[9]->>'score')::float4));" &
psql -c "create index idx_js_10 on js (gid, ((prop[10]->>'id')::int), ((prop[10]->>'score')::float4));" &
vi test1.sql  
\set gid random(1,10000)
\set id random(0,1000)
\set l random(0,50)
\set u random(51,100)
select * from js where gid=:gid and get_res(prop, :id, :l, :u);
pgbench -M prepared -n -r -P 1 -f ./test1.sql -c 56 -j 56 -T 120
transaction type: ./test1.sql
scaling factor: 1
query mode: prepared
number of clients: 56
number of threads: 56
duration: 120 s
number of transactions actually processed: 11517
latency average = 582.923 ms
latency stddev = 109.062 ms
tps = 95.708298 (including connections establishing)
tps = 95.927842 (excluding connections establishing)
statement latencies in milliseconds:
0.004 \set gid random(1,10000)
0.001 \set id random(0,1000)
0.001 \set l random(0,50)
0.001 \set u random(51,100)
582.917 select * from js where gid=:gid and get_res(prop, :id, :l, :u);
top - 12:37:42 up 28 days, 17:25, 3 users, load average: 47.06, 19.61, 10.53
Tasks: 542 total, 58 running, 484 sleeping, 0 stopped, 0 zombie
%Cpu(s): 95.6 us, 4.3 sy, 0.0 ni, 0.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
KiB Mem : 23094336+total, 85957776 free, 5347196 used, 13963840+buff/cache
KiB Swap: 0 total, 0 free, 0 used. 17652979+avail Mem
vi test2.sql  

\set gid random(1,10000)
\set id random(0,1000)
\set l random(0,50)
\set u random(51,100)
select * from get_js(10,:gid, :id, :l, :u);
pgbench -M prepared -n -r -P 1 -f ./test2.sql -c 56 -j 56 -T 120
transaction type: ./test2.sql
scaling factor: 1
query mode: prepared
number of clients: 56
number of threads: 56
duration: 120 s
number of transactions actually processed: 2042359
latency average = 3.290 ms
latency stddev = 0.300 ms
tps = 16999.278637 (including connections establishing)
tps = 17000.090714 (excluding connections establishing)
statement latencies in milliseconds:
0.002 \set gid random(1,10000)
0.001 \set id random(0,1000)
0.000 \set l random(0,50)
0.001 \set u random(51,100)
3.288 select * from get_js(10,:gid, :id, :l, :u);
top - 12:45:12 up 28 days, 17:32, 2 users, load average: 22.72, 23.75, 17.37
Tasks: 537 total, 58 running, 479 sleeping, 0 stopped, 0 zombie
%Cpu(s): 93.8 us, 6.2 sy, 0.0 ni, 0.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
KiB Mem : 23094336+total, 49231168 free, 8543336 used, 17316886+buff/cache
KiB Swap: 0 total, 0 free, 0 used. 14191040+avail Mem

Conclusion

1. When you use the multi-tree APPEND method described in this document, there is no wastage of CPU resources and the performance improves by N times.

Source:https://www.postgresql.org/docs/10/static/datatype-json.htm

Original Source:

Follow me to keep abreast with the latest technology news, industry insights, and developer trends.

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store