Applying PostgreSQL Graph Database to Social Scenarios

Background

Graph Data Model

Can PostgreSQL Recursive Queries Meet the Needs of Relation Derivation?

postgres=# with recursive s as ( 
select c1,c2 from a where c1=1
union all
select a.c1,a.c2 from a join s on (a.c1=s.c2) where a.c1 not in (with t as (insert into tmp select * from s) select c2 from tmp ) and s.* is not null
)
select * from s;
ERROR: 42P19: recursive reference to query "s" must not appear within a subquery
LINE 4: ... not in (with t as (insert into tmp select * from s) select ...
^
LOCATION: checkWellFormedRecursionWalker, parse_cte.c:773

PostgreSQL UDF to Solve the Issue of Relation Loops

Modeling

postgres=# create table a(c1 int, c2 int, primary key(c1,c2));
CREATE TABLE
postgres=# create index idx_a_1 on a(c1);
CREATE INDEX
postgres=# create index idx_a_2 on a(c2);
CREATE INDEX
postgres=# insert into a select random()*10000000, random()*10000000 from generate_series(1,100000000) ;
create temp table if not exists tmp1(level int, c1 int, c2 int) ON COMMIT delete rows; 
create index if not exists idx_tmp1_1 on tmp1(level);
create index if not exists idx_tmp1_2 on tmp1(c1);
create index if not exists idx_tmp1_3 on tmp1(c2);
create unlogged table u1 (like tmp1);create temp table if not exists tmp2(level int, path int[], c1 int, c2 int) ON COMMIT delete rows;
create index if not exists idx_tmp2_1 on tmp2(level);
create index if not exists idx_tmp2_2 on tmp2(c1);
create index if not exists idx_tmp2_3 on tmp2(c2);
create unlogged table u2 (like tmp2);

Function 1: Output A-Centered and N-Level Relation Data

create or replace function find_rel(v_c1 int, v_level int) returns setof u1 as 
$$
declare
i int := 1;
begin
if v_level <=0 then
raise notice 'level must >=1';
return;
end if;

-- Note: In version 9.6, frequent creation and deletion of inner temp tables may still cause garbage to be generated in catalog.
-- To temporarily store the relations at each level from the start point
create temp table if not exists tmp1(level int, c1 int, c2 int) ON COMMIT delete rows;
create index if not exists idx_tmp1_1 on tmp1(level);
create index if not exists idx_tmp1_2 on tmp1(c1);
create index if not exists idx_tmp1_3 on tmp1(c2);
-- To store the data of the initial level (namely the start point)
return query insert into tmp1 select i, * from a where c1=v_c1 returning *;
loop
i := i+1;
-- All levels of data has been found
if i > v_level then
return;
end if;
-- The next level relation is derived though a join with level=i-1 (Group By excludes duplicate nodes. For example, 3 in 1-2-3-4 and 1-5-3-4 is excluded), and the looping points are excluded by not exists.
return query insert into tmp1 select i, a.c1, a.c2 from a join (select c2 from tmp1 where level=i-1 group by c2) tmp on (a.c1=tmp.c2) where not exists (select 1 from tmp1 where a.c1 = tmp1.c1) returning *;
end loop;
end;
$$
language plpgsql strict;

Function 2: Output A-Centered and N-Level Relation Data

create or replace function find_rel_withpath(v_c1 int, v_level int) returns setof u2 as 
$$
declare
i int := 1;
begin
if v_level <=0 then
raise notice 'level must >=1';
return;
end if;

-- Note: In version 9.6, frequent creation and deletion of inner temp tables may still cause garbage to be generated in catalog.
-- To temporarily store the relations at each level from the start point
create temp table if not exists tmp2(level int, path int[], c1 int, c2 int) ON COMMIT delete rows;
create index if not exists idx_tmp2_1 on tmp2(level);
create index if not exists idx_tmp2_2 on tmp2(c1);
create index if not exists idx_tmp2_3 on tmp2(c2);
-- To store the data of the initial level (namely the start point)
return query insert into tmp2 select i, array[]::int[] || c1 || c2 , * from a where c1=v_c1 returning *;
loop
i := i+1;
-- All levels of data has been found
if i > v_level then
return;
end if;
-- The next level relation is derived though a join with level=i-1 (Group By excludes duplicate nodes. For example, duplicate paths like 1-2-3-4 and 1-2-3-4 should be excluded. In practice, this situation cannot exist after adding the unique constraints of c1 and c2), and the looping points are excluded by not exists.
-- path indicates the current path.
-- If PK constraints for c1 and c2 already exist, you can directly associate them with tmp2 without using Group By.
return query insert into tmp2 select i, tmp.path||a.c2, a.c1, a.c2 from a join (select c2,path from tmp2 where level=i-1 group by c2,path) tmp on (a.c1=tmp.c2) where not exists (select 1 from tmp2 where a.c1 = tmp2.c1) returning *;
end loop;
end;
$$
language plpgsql strict;

Function 3: Find the shortest relation path between two people and two events

create or replace function find_rel_path(v_c1 int, v_c2 int) returns setof int[] as 
$$
declare
i int := 1;
begin
-- Note: In version 9.6, frequent creation and deletion of inner temp tables may still cause garbage to be generated in catalog.
-- To temporarily store the relations at each level from the start point
create temp table if not exists tmp2(level int, path int[], c1 int, c2 int) ON COMMIT delete rows;
create index if not exists idx_tmp2_1 on tmp2(level);
create index if not exists idx_tmp2_2 on tmp2(c1);
create index if not exists idx_tmp2_3 on tmp2(c2);
-- To store the data of the initial level (namely the start point)
insert into tmp2 select i, array[]::int[] || c1 || c2 , * from a where c1=v_c1;
loop
i := i+1;

perform 1 from tmp2 where c2=v_c2 limit 1;
if found then
return query select path from tmp2 where c2=v_c2 and level=i-1;
return;
end if;
insert into tmp2 select i, tmp.path||a.c2, a.c1, a.c2 from a join (select c2,path from tmp2 where level=i-1 group by c2,path) tmp on (a.c1=tmp.c2) where not exists (select 1 from tmp2 where a.c1 = tmp2.c1);
end loop;
end;
$$
language plpgsql strict;

Test

select * from find_rel(1,3);select * from find_rel_withpath(1,3);
postgres=# select * from find_rel(1,3);
NOTICE: relation "tmp1" already exists, skipping
NOTICE: relation "idx_tmp1_1" already exists, skipping
NOTICE: relation "idx_tmp1_2" already exists, skipping
NOTICE: relation "idx_tmp1_3" already exists, skipping
level | c1 | c2
------+---------+---------
1 | 1 | 5157873
1 | 1 | 3102468
1 | 1 | 8399312
1 | 1 | 1442141
1 | 1 | 5094441
1 | 1 | 1521897
1 | 1 | 401079
2 | 401079 | 1147078
2 | 401079 | 9740100
......
3 | 1731998 | 6503196
3 | 1731998 | 5112396
3 | 6525458 | 937613
3 | 6525458 | 8459123
3 | 6525458 | 8419231
3 | 6525458 | 4021987
(828 rows)
Time: 15.000 ms
postgres=# select * from find_rel_withpath(1,3);
NOTICE: relation "tmp2" already exists, skipping
NOTICE: relation "idx_tmp2_1" already exists, skipping
NOTICE: relation "idx_tmp2_2" already exists, skipping
NOTICE: relation "idx_tmp2_3" already exists, skipping
level | path | c1 | c2
-------+-----------------------------+---------+---------
1 | {1,5157873} | 1 | 5157873
1 | {1,3102468} | 1 | 3102468
1 | {1,8399312} | 1 | 8399312
1 | {1,1442141} | 1 | 1442141
1 | {1,5094441} | 1 | 5094441
1 | {1,1521897} | 1 | 1521897
1 | {1,401079} | 1 | 401079
2 | {1,401079,1147078} | 401079 | 1147078
2 | {1,401079,9740100} | 401079 | 9740100
......
3 | {1,1442141,9719411,7811631} | 9719411 | 7811631
3 | {1,401079,9740100,5119416} | 9740100 | 5119416
3 | {1,401079,9740100,350046} | 9740100 | 350046
3 | {1,401079,9740100,8223067} | 9740100 | 8223067
3 | {1,401079,9740100,5608312} | 9740100 | 5608312
3 | {1,401079,9740100,7920319} | 9740100 | 7920319
3 | {1,401079,9740100,6416565} | 9740100 | 6416565
(828 rows)
Time: 19.524 ms
postgres=# select * from find_rel_withpath(1,5) order by level desc limit 10;
NOTICE: relation "tmp2" already exists, skipping
NOTICE: relation "idx_tmp2_1" already exists, skipping
NOTICE: relation "idx_tmp2_2" already exists, skipping
NOTICE: relation "idx_tmp2_3" already exists, skipping
level | path | c1 | c2
-------+----------------------------------------+-----+---------
5 | {1,401079,3806993,3879310,165,8074824} | 165 | 8074824
5 | {1,401079,3806993,3879310,165,5983603} | 165 | 5983603
5 | {1,401079,3806993,3879310,165,9804825} | 165 | 9804825
5 | {1,401079,3806993,3879310,165,3848025} | 165 | 3848025
5 | {1,401079,3806993,3879310,165,2045526} | 165 | 2045526
5 | {1,401079,3806993,3879310,165,9783524} | 165 | 9783524
5 | {1,401079,3806993,3879310,165,386886} | 165 | 386886
5 | {1,401079,3806993,3879310,165,6318408} | 165 | 6318408
5 | {1,401079,3806993,3879310,165,1150578} | 165 | 1150578
5 | {1,401079,3806993,3879310,165,6702827} | 165 | 6702827
(10 rows)
Time: 1473.953 ms
select * from find_rel_path(1,386886);
postgres=# select * from find_rel_path(1,386886);
NOTICE: relation "tmp2" already exists, skipping
NOTICE: relation "idx_tmp2_1" already exists, skipping
NOTICE: relation "idx_tmp2_2" already exists, skipping
NOTICE: relation "idx_tmp2_3" already exists, skipping
find_rel_path
---------------------------------------
{1,401079,3806993,3879310,165,386886}
(1 row)
Time: 1069.781 ms
postgres=# select * from find_rel_path(1,3879310);
NOTICE: relation "tmp2" already exists, skipping
NOTICE: relation "idx_tmp2_1" already exists, skipping
NOTICE: relation "idx_tmp2_2" already exists, skipping
NOTICE: relation "idx_tmp2_3" already exists, skipping
find_rel_path
----------------------------
{1,401079,3806993,3879310}
(1 row)
Time: 17.290 ms

UDF Optimization Idea 1 — Using a Cursor to Receive Streams

create or replace function find_rel_withpath_cur(v_c1 int, v_level int) returns setof refcursor as 
$$
declare
i int := 1;
ref refcursor[];
res refcursor;
begin
if v_level <=0 then
raise notice 'level must >=1';
return;
end if;
for x in 1..v_level loop
ref[x] := 'a'||x;
end loop;
-- Note: In version 9.6, frequent creation and deletion of inner temp tables may still cause garbage to be generated in catalog.
-- To temporarily store the relations at each level from the start point
create temp table if not exists tmp2(level int, path int[], c1 int, c2 int) ON COMMIT delete rows;
create index if not exists idx_tmp2_1 on tmp2(level);
create index if not exists idx_tmp2_2 on tmp2(c1);
create index if not exists idx_tmp2_3 on tmp2(c2);
-- To store the data of the initial level (namely the start point)
res := ref[i];
open res for insert into tmp2 select i, array[]::int[] || c1 || c2 , * from a where c1=v_c1 returning *;
return next res;
loop
i := i+1;
-- All levels of data has been found
if i > v_level then
return;
end if;
-- The next level relation is derived though a join with level=i-1 (Group By excludes duplicate nodes. For example, duplicate paths like 1-2-3-4 and 1-2-3-4 should be excluded. In practice, this situation cannot exist after adding the unique constraints of c1 and c2), and the looping points are excluded by not exists.
-- path indicates the current path.
-- If PK constraints for c1 and c2 already exist, you can directly associate them with tmp2 without using Group By.
res := ref[i];
open res for insert into tmp2 select i, tmp.path||a.c2, a.c1, a.c2 from a join (select c2,path from tmp2 where level=i-1 group by c2,path) tmp on (a.c1=tmp.c2) where not exists (select 1 from tmp2 where a.c1 = tmp2.c1) returning *;
return next res;
end loop;
end;
$$
language plpgsql strict;
postgres=# begin;
BEGIN
Time: 0.390 ms
postgres=# select * from find_rel_withpath_cur(1,5);
NOTICE: relation "tmp2" already exists, skipping
NOTICE: relation "idx_tmp2_1" already exists, skipping
NOTICE: relation "idx_tmp2_2" already exists, skipping
NOTICE: relation "idx_tmp2_3" already exists, skipping
find_rel_withpath_cur
-----------------------
a1
a2
a3
a4
a5
(5 rows)
Time: 4.008 ms
postgres=# fetch all in a1;
level | path | c1 | c2
-------+-------------+----+---------
1 | {1,5157873} | 1 | 5157873
1 | {1,3102468} | 1 | 3102468
1 | {1,8399312} | 1 | 8399312
1 | {1,1442141} | 1 | 1442141
1 | {1,5094441} | 1 | 5094441
1 | {1,1521897} | 1 | 1521897
1 | {1,401079} | 1 | 401079
(7 rows)
Time: 0.958 ms
postgres=# fetch all in a2;
level | path | c1 | c2
-------+---------------------+---------+---------
2 | {1,401079,1147078} | 401079 | 1147078
2 | {1,401079,9740100} | 401079 | 9740100
2 | {1,401079,8171779} | 401079 | 8171779
2 | {1,401079,3806993} | 401079 | 3806993
2 | {1,401079,2491387} | 401079 | 2491387
......
2 | {1,8399312,5886963} | 8399312 | 5886963
2 | {1,8399312,3652462} | 8399312 | 3652462
2 | {1,8399312,8148713} | 8399312 | 8148713
2 | {1,8399312,8282991} | 8399312 | 8282991
(75 rows)
Time: 3.173 ms
postgres=# fetch all in a3;
level | path | c1 | c2
-------+-----------------------------+---------+---------
3 | {1,1521897,47614,2653114} | 47614 | 2653114
3 | {1,1521897,47614,1354306} | 47614 | 1354306
3 | {1,1521897,47614,7452721} | 47614 | 7452721
...
3 | {1,401079,9740100,7920319} | 9740100 | 7920319
3 | {1,401079,9740100,6416565} | 9740100 | 6416565
(746 rows)
Time: 25.455 ms
postgres=# fetch all in a4;
......
4 | {1,401079,8171779,9968575,6388546} | 9968575 | 6388546
4 | {1,401079,8171779,9968575,8281935} | 9968575 | 8281935
4 | {1,401079,8171779,9968575,6076729} | 9968575 | 6076729
4 | {1,401079,8171779,9968575,7087557} | 9968575 | 7087557
(7383 rows)
Time: 14.482 ms
postgres=# begin;
BEGIN
Time: 0.561 ms
postgres=# select * from find_rel_withpath_cur(1,5);
NOTICE: relation "tmp2" already exists, skipping
NOTICE: relation "idx_tmp2_1" already exists, skipping
NOTICE: relation "idx_tmp2_2" already exists, skipping
NOTICE: relation "idx_tmp2_3" already exists, skipping
find_rel_withpath_cur
-----------------------
a1
a2
a3
a4
a5
(5 rows)
Time: 2.161 msIf a cursor is not received in a sequential manner in the first place, cursors after this cursor will no longer have data. postgres=# fetch all in a4;
level | path | c1 | c2
-------+------+----+----
(0 rows)
Time: 0.738 ms
postgres=# fetch all in a5;
level | path | c1 | c2
-------+------+----+----
(0 rows)
Time: 0.727 ms
postgres=# fetch all in a1;
level | path | c1 | c2
-------+-------------+----+---------
1 | {1,5157873} | 1 | 5157873
1 | {1,3102468} | 1 | 3102468
1 | {1,8399312} | 1 | 8399312
1 | {1,1442141} | 1 | 1442141
1 | {1,5094441} | 1 | 5094441
1 | {1,1521897} | 1 | 1521897
1 | {1,401079} | 1 | 401079
(7 rows)
Time: 0.954 ms
When c2 is of type Array, cursors return functions     

create or replace function find_rel_withpath_cur_array(v_c1 int, v_level int) returns setof refcursor as
$$
declare
i int := 1;
ref refcursor[];
res refcursor;
begin
if v_level <=0 then
raise notice 'level must >=1';
return;
end if;
for x in 1..v_level loop
ref[x] := 'a'||x;
end loop;
-- Note: In version 9.6, frequent creation and deletion of inner temp tables may still cause garbage to be generated in catalog.
-- To temporarily store the relations at each level from the start point
create temp table if not exists tmp2(level int, path int[], c1 int, c2 int) ON COMMIT delete rows;
create index if not exists idx_tmp2_1 on tmp2(level);
create index if not exists idx_tmp2_2 on tmp2(c1);
create index if not exists idx_tmp2_3 on tmp2(c2);
-- To store the data of the initial level (namely the start point)
res := ref[i];
open res for insert into tmp2 select i as level, array[]::int[] || c1 || c2 as path , c1, c2 from (select c1,unnest(c2) as c2 from a where c1=v_c1) a returning *;
return next res;
loop
i := i+1;
-- All levels of data has been found
if i > v_level then
return;
end if;
-- The next level relation is derived though a join with level=i-1 (Group By excludes duplicate nodes. For example, duplicate paths like 1-2-3-4 and 1-2-3-4 should be excluded. In practice, this situation cannot exist after adding the unique constraints of c1 and c2), and the looping points are excluded by not exists.
-- path indicates the current path.
-- If PK constraints for c1 and c2 already exist, you can directly associate them with tmp2 without using Group By.
res := ref[i];
open res for insert into tmp2 select i as level, a.path||a.c2 as path, a.c1, a.c2 from (select tmp2.path, a.c1, unnest(a.c2) c2 from a join tmp2 on (a.c1=tmp2.c2 and tmp2.level=i-1 and tmp2.c1<>a.c1)) a returning *;
return next res;
end loop;
end;
$$
language plpgsql strict;

UDF Optimization Idea 2 — Functions and Stream Receiving

Create 100 Million Users and Consider Every 50 Thousand Users as a Group of Associated Users. Each User Is Associated with 1000 Close Users, Forming a Network of 100 Billion Relations.

postgres=# create table a(c1 int, c2 int[], primary key (c1));
CREATE TABLE
vi test.sql
\set id random(1,100000000)
insert into a select :id, (select array_agg(((width_bucket(:id,1,100000000,2000)-1)*50000 + (random()*50000)::int)) from generate_series(1,1000)) on conflict do nothing;
pgbench -M prepared -n -r -P 5 -f ./test.sql -c 64 -j 64 -T 100000

Test Scenario

create or replace function find_rel_withpath_cur(v_c1 int, v_level int) returns setof record as 
$$

declare
i int := 1;
ref1 cursor(var1 int, var2 int) for select var1 as level, array[]::int[] || c1 || c2 as path , c1, c2 from (select c1,unnest(c2) as c2 from a where c1=var2) a ;
ref2 cursor(var1 int) for select var1 as level, a.path||a.c2 as path, a.c1, a.c2 from (select tmp2.path, a.c1, unnest(a.c2) c2 from a join tmp2 on (a.c1=tmp2.c2 and tmp2.level=i-1 and tmp2.c1<>a.c1)) a;
begin
if v_level <=0 then
raise notice 'level must >=1';
return;
end if;
-- Note: In version 9.6, frequent creation and deletion of inner temp tables may still cause garbage to be generated in catalog.
-- To temporarily store the relations at each level from the start point
-- Currently PL/pgSQL doesn't support stream returning, even if return next or return query is used
-- https://www.postgresql.org/docs/9.6/static/plpgsql-control-structures.html
create temp table if not exists tmp2(level int, path int[] unique, c1 int, c2 int) ON COMMIT delete rows;
create index if not exists idx_tmp2_1 on tmp2(level, c2);
-- To store the data of the initial level (namely the start point)
for rec in ref1(i, v_c1) loop
insert into tmp2 values (rec.level, rec.path, rec.c1, rec.c2) on conflict do nothing;
if found then
return next rec;
end if;
end loop;
loop
i := i+1;
-- All levels of data has been found
if i > v_level then
return;
end if;
-- The next level relation is derived though a join with level=i-1 (Group By excludes duplicate nodes. For example, duplicate paths like 1-2-3-4 and 1-2-3-4 should be excluded. In practice, this situation cannot exist after adding the unique constraints of c1 and c2), and the looping points are excluded by not exists.
-- path indicates the current path.
-- If PK constraints for c1 and c2 already exist, you can directly associate them with tmp2 without using group by.
for rec in ref2(i) loop
insert into tmp2 values(rec.level, rec.path, rec.c1, rec.c2) on conflict do nothing;
if found then
return next rec;
end if;
end loop;
end loop;
return;
end;
$$
language plpgsql strict;
Note: The current implementation of RETURN NEXT and RETURN QUERY stores the entire result set before returning from the function, as discussed above. That means that if a PL/pgSQL function produces a very large result set, performance might be poor: 
Data will be written to disk to avoid memory exhaustion, but the function itself will not return until the entire result set has been generated.
A future version of PL/pgSQL might allow users to define set-returning functions that do not have this limitation.
Currently, the point at which data begins being written to disk is controlled by the work_mem configuration variable.
Administrators who have sufficient memory to store larger result sets in memory should consider increasing this parameter.

Test Scenario 2

-- Eliminate duplicate nodes. For example, if N paths pass through the same node, record that node once only.create or replace function find_rel(v_c1 int, v_level int) returns setof u1 as 
$$
declare
i int := 1;
begin
if v_level <=0 then
raise notice 'level must >=1';
return;
end if;
-- Note: In version 9.6, frequent creation and deletion of inner temp tables may still cause garbage to be generated in catalog.
-- To temporarily store the relations at each level from the start point
create temp table if not exists tmp1( level int, c1 int primary key) ON COMMIT delete rows;
create index if not exists idx_tmp1_1 on tmp1(level, c1);
-- To store the data of the initial level (namely the start point)
return query insert into tmp1 select i, c1 from a where c1=v_c1 union select i, unnest(c2) from a where c1=v_c1 returning *;
loop
i := i+1;
-- All levels of data has been found
if i > v_level then
return;
end if;
-- The next level relation is derived though a join with level=i-1 (Group By excludes duplicate nodes. For example, 3 in 1-2-3-4 and 1-5-3-4 is excluded), and the looping points are excluded by not exists. return query
insert into tmp1
select t.i, t.c2 from
( select i, unnest(c2) c2 from a join tmp1 tmp on ( a.c1=tmp.c1 and tmp.level=i-1 and a.c1<>v_c1 )
) t
left join tmp1 on (t.c2=tmp1.c1)
where tmp1.* is null group by 1,2
on conflict do nothing
returning *;

end loop;
end;
$$
language plpgsql strict;
postgres=# select count(*) from find_rel(5000,10);
NOTICE: relation "tmp1" already exists, skipping
NOTICE: relation "idx_tmp1_1" already exists, skipping
count
-------
50001
(1 row)
It takes about 15 seconds because 50 million records are searched for the third level due to the data model.
Another reason for such high time consumption is that CPU paralleling is not enabled.
Study shows that each person maintains a network of around 200 close friends.
For normal business models, certain restrictions are added to each level to converge results and prevent unlimited spreading.
For normal business models, a query can be performed in seconds.
Time: 17131.492 ms
create or replace function find_rel_cur(v_c1 int, v_level int) returns setof refcursor as
$$
declare
i int := 1;
ref refcursor[];
res refcursor;
begin
if v_level <=0 then
raise notice 'level must >=1';
return;
end if;
for x in 1..v_level loop
ref[x] := 'a'||x;
end loop;
-- Note: In version 9.6, frequent creation and deletion of inner temp tables may still cause garbage to be generated in catalog.
-- To temporarily store the relations at each level from the start point
create temp table if not exists tmp1( level int, c1 int primary key) ON COMMIT delete rows;
create index if not exists idx_tmp1_1 on tmp1(level, c1);
-- To store the data of the initial level (namely the start point)
res := ref[i];
open res for insert into tmp1 select i, c1 from a where c1=v_c1 union select i, unnest(c2) from a where c1=v_c1 returning *;
return next res;
loop
i := i+1;
-- All levels of data has been found
if i > v_level then
return;
end if;
-- The next level relation is derived though a join with level=i-1 (Group By excludes duplicate nodes. For example, 3 in 1-2-3-4 and 1-5-3-4 is excluded), and the looping points are excluded by not exists. res := ref[i];
open res for
insert into tmp1
select t.i, t.c2 from
( select i, unnest(c2) c2 from a join tmp1 tmp on ( a.c1=tmp.c1 and tmp.level=i-1 and a.c1<>v_c1 )
) t
left join tmp1 on (t.c2=tmp1.c1)
where tmp1.* is null group by 1,2
on conflict do nothing
returning *;
return next res;
end loop;
end;
$$
language plpgsql strict;
postgres=# select * from find_rel_cur(1,10);
NOTICE: relation "tmp1" already exists, skipping
NOTICE: relation "idx_tmp1_1" already exists, skipping
find_rel_cur
--------------
a1
a2
a3
a4
a5
a6
a7
a8
a9
a10
(10 rows)
Time: 2.098 ms
postgres=# fetch 1 in a1;
level | c1
-------+-------
1 | 37394
(1 row)
Time: 3.138 ms
postgres=# fetch 1 in a2;
level | c1
-------+----
2 | 0
(1 row)
Time: 1345.473 msFor the third level, 50 million records are searched due to the test model
Do not worry about this data volume because normal business models generally do not have to process such a large volume of data.
postgres=# fetch 1 in a3;
level | c1
-------+----
(0 rows)
Time: 15587.686 ms
postgres=# fetch 1 in a4;
level | c1
-------+----
(0 rows)
Time: 0.143 ms
postgres=#

UDF Optimization Idea 3 — Asynchronous Messages

create extension dblink;
CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator;
CREATE SERVER dst FOREIGN DATA WRAPPER postgresql OPTIONS (hostaddr '127.0.0.1', port '1921', dbname 'postgres');
CREATE USER MAPPING FOR postgres SERVER dst OPTIONS (user 'postgres', password 'postgres');
create or replace function find_rel_withpath_notify(v_c1 int, v_level int, v_notify_channel text) returns void as
$$

declare
i int := 1;
query text;
ref1 cursor(var1 int, var2 int) for select var1 as level, array[]::int[] || c1 || c2 as path , c1, c2 from (select c1,unnest(c2) as c2 from a where c1=var2) a ;
ref2 cursor(var1 int) for select var1 as level, a.path||a.c2 as path, a.c1, a.c2 from (select tmp2.path, a.c1, unnest(a.c2) c2 from a join tmp2 on (a.c1=tmp2.c2 and tmp2.level=i-1 and tmp2.c1<>a.c1)) a;
begin
if v_level <=0 then
raise notice 'level must >=1';
return;
end if;
-- Determine whether or not a connection exists. If not, create one.
if array_position(dblink_get_connections(), v_notify_channel) is not null then
else
perform dblink_connect(v_notify_channel, 'dst');
end if;
-- Note: In version 9.6, frequent creation and deletion of inner temp tables may still cause garbage to be generated in catalog.
-- To temporarily store the relations at each level from the start point
-- Currently PL/pgSQL doesn't support stream returning, even if return next or return query is used
-- https://www.postgresql.org/docs/9.6/static/plpgsql-control-structures.html
create temp table if not exists tmp2(level int, path int[] unique, c1 int, c2 int) ON COMMIT delete rows;
create index if not exists idx_tmp2_1 on tmp2(level, c2);
-- To store the data of the initial level (namely the start point)
for rec in ref1(i, v_c1) loop
insert into tmp2 values (rec.level, rec.path, rec.c1, rec.c2) on conflict do nothing;
if found then
query := format($_$select 1 from pg_notify( %L , 'level: %s, path: %s, c1: %s, c2: %s')$_$, v_notify_channel, rec.level, rec.path, rec.c1, rec.c2);
-- Send asynchronous messages
perform * from dblink(v_notify_channel, query, true) as t(id int);
end if;
end loop;
loop
i := i+1;
-- All levels of data has been found
if i > v_level then
return;
end if;
-- The next level relation is derived though a join with level=i-1 (Group By excludes duplicate nodes. For example, duplicate paths like 1-2-3-4 and 1-2-3-4 should be excluded. In practice, this situation cannot exist after adding the unique constraints of c1 and c2), and the looping points are excluded by not exists.
-- path indicates the current path.
-- If PK constraints for c1 and c2 already exist, you can directly associate them with tmp2 without using group by.
for rec in ref2(i) loop
insert into tmp2 values(rec.level, rec.path, rec.c1, rec.c2) on conflict do nothing;
if found then
query := format($_$select 1 from pg_notify( %L , 'level: %s, path: %s, c1: %s, c2: %s')$_$, v_notify_channel, rec.level, rec.path, rec.c1, rec.c2);
-- Send asynchronous messages
perform * from dblink(v_notify_channel, query, true) as t(id int);
end if;
end loop;
end loop;
return;
end;
$$
language plpgsql strict;
postgres=# listen hello;
postgres=# select find_rel_withpath_notify(1,5,'hello');
...
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,15869}, c1: 32847, c2: 15869" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,21312}, c1: 32847, c2: 21312" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,22852}, c1: 32847, c2: 22852" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,8031}, c1: 32847, c2: 8031" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,45248}, c1: 32847, c2: 45248" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,7139}, c1: 32847, c2: 7139" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,28589}, c1: 32847, c2: 28589" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,8615}, c1: 32847, c2: 8615" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,49518}, c1: 32847, c2: 49518" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,35727}, c1: 32847, c2: 35727" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,2679}, c1: 32847, c2: 2679" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,34267}, c1: 32847, c2: 34267" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,13890}, c1: 32847, c2: 13890" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,20092}, c1: 32847, c2: 20092" received from server process with PID 33062.
Asynchronous notification "hello" with payload "level: 2, path: {1,32847,11795}, c1: 32847, c2: 11795" received from server process with PID 33062.
...

Design Optimization Method 1 — Array

Design Optimization Method 2 — Pgrouting

a(c1, int, c2 int, weight numeric).

Other Graph Databases

Summary

Example Features Used in This Article

CREATE EXTENSION rum;
create table rum1(c1 int, c2 tsvector, primary key (c1));
create index idx_rum1_1 on rum1 using rum(c2 rum_tsvector_ops);
vi test.sql
\set id random(1,100000000)
insert into rum1 select :id, (select to_tsvector(string_agg(((width_bucket(:id,1,100000000,2000)-1)*50000 + (random()*50000)::int)::text, ' ')) from generate_series(1,100)) on conflict do nothing;
pgbench -M prepared -n -r -P 5 -f ./test.sql -c 64 -j 64 -T 100000
postgres=# select c1, c2 <=> tsq as dis 
from
rum1,
(select to_tsquery(replace(rtrim(ltrim(array_to_tsvector(tsvector_to_array(c2))::text, ''''), ''''),
$$
' '
$$
, ' | ')) tsq from rum1 where c1=1) as tmp
where
c2 @@ tsq
and
c1<>33233490
order by c2 <=> tsq
limit 10;
c1   |   dis    
-------+----------
1 | 0.164493
42469 | 4.11234
28939 | 5.48311
45740 | 5.48311
15508 | 5.48311
11589 | 5.48311
12377 | 5.48311
34282 | 5.48311
16731 | 5.48311
6474 | 5.48311
(10 rows)
Time: 259.834 ms

Iterative computation

create or replace function find_rel_pagerank_cur(
v_c1 int, -- the ID that needs to be deduced
v_level int, --levels of relations to be deduced
v_rank numeric, -- closeness threshold. Values greater than this threshold is not displayed (the more the value is, the longer the distance is and the smaller the closeness is)
v_limit_perlevel int, -- the maximum number of recorders per level (for example, number of persons)
)
returns setof record as
$$

declare
i int := 1;
i_c1 int;
ref cursor(
var_level int,
var_c1 int
) for
select level,c1,c2,pagerank::numeric from
(select var_level as level, var_c1 as c1, c1 as c2, c2 <=> tsq as pagerank from
rum1,
(select to_tsquery(replace(rtrim(ltrim(array_to_tsvector(tsvector_to_array(c2))::text, ''''), ''''), $_$' '$_$, ' | ')) tsq from rum1 where c1=var_c1) as tmp
where
c2 @@ tsq
and
c1<>var_c1
order by c2 <=> tsq
limit v_limit_perlevel
) t where t.pagerank < v_rank ;
begin
if v_level <=0 then
raise notice 'level must >=1';
return;
end if;
-- Note: In version 9.6, frequent creation and deletion of inner temp tables may still cause garbage to be generated in catalog.
-- To temporarily store the relations at each level from the start point
-- Currently PL/pgSQL doesn't support stream returning, even if return next or return query is used
-- https://www.postgresql.org/docs/9.6/static/plpgsql-control-structures.html
create temp table if not exists tmp2(level int, c1 int, c2 int, pagerank numeric, primary key(c1,c2)) ON COMMIT delete rows;
create index if not exists idx_tmp2_1 on tmp2(level, c2);
-- To store the data of the initial level (namely the start point)
for rec in ref(i,v_c1) loop
insert into tmp2 values (rec.level, rec.c1, rec.c2, rec.pagerank) on conflict do nothing;
if found then
raise notice 'level: %, c1: %, c2:% ,pagerank: %', rec.level, rec.c1, rec.c2, rec.pagerank;
return next rec;
end if;
end loop;
loop
i := i+1;
-- All levels of data has been found
if i > v_level then
return;
end if;
for i_c1 in select t2.c1 from rum1 t2 JOIN tmp2 t1 on (t1.c2=t2.c1 and t1.level=i-1 and t2.c1<>v_c1) where not exists (select 1 from tmp2 where tmp2.c1=t2.c1) group by 1
loop
for rec in ref(i,i_c1) loop
insert into tmp2 values (rec.level, rec.c1, rec.c2, rec.pagerank) on conflict do nothing;
if found then
raise notice 'level: %, c1: %, c2:% ,pagerank: %', rec.level, rec.c1, rec.c2, rec.pagerank;
return next rec;
end if;
end loop;
end loop; end loop;
return;
end;
$$
language plpgsql strict;
postgres=# select * from find_rel_pagerank_cur(96807211,2,10000,10) as t(level int, c1 int, c2 int, pagerank numeric);
NOTICE: relation "tmp2" already exists, skipping
NOTICE: relation "idx_tmp2_1" already exists, skipping
NOTICE: level: 1, c1: 96807211, c2:96810420 ,pagerank: 5.48311
NOTICE: level: 1, c1: 96807211, c2:96849305 ,pagerank: 5.48311
NOTICE: level: 1, c1: 96807211, c2:96810740 ,pagerank: 5.48311
NOTICE: level: 1, c1: 96807211, c2:96839717 ,pagerank: 5.48311
NOTICE: level: 1, c1: 96807211, c2:96849378 ,pagerank: 5.48311
NOTICE: level: 1, c1: 96807211, c2:96800097 ,pagerank: 5.48311
NOTICE: level: 1, c1: 96807211, c2:96832351 ,pagerank: 5.48311
NOTICE: level: 1, c1: 96807211, c2:96839438 ,pagerank: 5.48311
NOTICE: level: 1, c1: 96807211, c2:96816466 ,pagerank: 5.48311
NOTICE: level: 1, c1: 96807211, c2:96836416 ,pagerank: 5.48311
NOTICE: level: 2, c1: 96800097, c2:96812430 ,pagerank: 4.11234
NOTICE: level: 2, c1: 96800097, c2:96802051 ,pagerank: 5.48311
NOTICE: level: 2, c1: 96800097, c2:96824209 ,pagerank: 5.48311
......

Postgresql Improvements to Be Made

Original Source

--

--

Follow me to keep abreast with the latest technology news, industry insights, and developer trends. Alibaba Cloud website:https://www.alibabacloud.com

Love podcasts or audiobooks? Learn on the go with our new app.

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store