How to Retrieve and Deduplicate Similar Text in PostgreSQL

Image for post
Image for post

Background

How to Solve the Problem

Practical Example

git clone https://github.com/jaiminpan/pg_jieba.git  
mv pg_jieba $PGSRC/contrib/
export PATH=/home/digoal/pgsql9.5/bin:$PATH
cd $PGSRC/contrib/pg_jieba
make clean;make;make install

git clone https://github.com/jaiminpan/pg_scws.git
mv pg_jieba $PGSRC/contrib/
export PATH=/home/digoal/pgsql9.5/bin:$PATH
cd $PGSRC/contrib/pg_scws
make clean;make;make install
psql  
# create extension pg_jieba;
# create extension pg_scws;
create table tdup1 (id int primary key, info text);  
create extension pg_trgm;
insert into tdup1 values (1, '银屑病怎么治?');
insert into tdup1 values (2, '银屑病怎么治疗?');
insert into tdup1 values (3, '银屑病怎么治疗好?');
insert into tdup1 values (4, '银屑病怎么能治疗好?');
postgres=# select to_tsvector('jiebacfg', info),* from tdup1 ;  
to_tsvector | id | info
---------------------+----+----------------------
'治':3 '银屑病':1 | 1 | 银屑病怎么治?
'治疗':3 '银屑病':1 | 2 | 银屑病怎么治疗?
'治疗':3 '银屑病':1 | 3 | 银屑病怎么治疗好?
'治疗':4 '银屑病':1 | 4 | 银屑病怎么能治疗好?
(4 rows)

postgres=# select to_tsvector('scwscfg', info),* from tdup1 ;
to_tsvector | id | info
-----------------------------------+----+----------------------
'治':2 '银屑病':1 | 1 | 银屑病怎么治?
'治疗':2 '银屑病':1 | 2 | 银屑病怎么治疗?
'好':3 '治疗':2 '银屑病':1 | 3 | 银屑病怎么治疗好?
'好':4 '治疗':3 '能':2 '银屑病':1 | 4 | 银屑病怎么能治疗好?
(4 rows)
postgres=# create or replace function array_union(text[], text[]) returns text[] as $$  
select array_agg(c1) from (select c1 from unnest($1||$2) t(c1) group by c1) t;
$$ language sql strict;
CREATE FUNCTION
postgres=# create or replace function array_dist(text[]) returns text[] as $$           
select array_agg(c1) from (select c1 from unnest($1) t(c1) group by c1) t;
$$ language sql strict;
CREATE FUNCTION
postgres=# create or replace function array_share(text[], text[]) returns text[] as $$  
select array_agg(unnest) from (select unnest($1) intersect select unnest($2) group by 1) t;
$$ language sql strict;
CREATE FUNCTION
postgres=# with t(c1,c2,c3) as   
(select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1)
select * from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2)
simulate from t t1,t t2) t;
t1c1 | t2c1 | t1c2 | t2c2 | t1c3 | t2c3 | simulate
------+------+----------------------+----------------------+-------------------+-------------------+----------
1 | 1 | 银屑病怎么治? | 银屑病怎么治? | {'银屑病','治'} | {'银屑病','治'} | 1.00
1 | 2 | 银屑病怎么治? | 银屑病怎么治疗? | {'银屑病','治'} | {'银屑病','治疗'} | 0.33
1 | 3 | 银屑病怎么治? | 银屑病怎么治疗好? | {'银屑病','治'} | {'银屑病','治疗'} | 0.33
1 | 4 | 银屑病怎么治? | 银屑病怎么能治疗好? | {'银屑病','治'} | {'银屑病','治疗'} | 0.33
2 | 1 | 银屑病怎么治疗? | 银屑病怎么治? | {'银屑病','治疗'} | {'银屑病','治'} | 0.33
2 | 2 | 银屑病怎么治疗? | 银屑病怎么治疗? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
2 | 3 | 银屑病怎么治疗? | 银屑病怎么治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
2 | 4 | 银屑病怎么治疗? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
3 | 1 | 银屑病怎么治疗好? | 银屑病怎么治? | {'银屑病','治疗'} | {'银屑病','治'} | 0.33
3 | 2 | 银屑病怎么治疗好? | 银屑病怎么治疗? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
3 | 3 | 银屑病怎么治疗好? | 银屑病怎么治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
3 | 4 | 银屑病怎么治疗好? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
4 | 1 | 银屑病怎么能治疗好? | 银屑病怎么治? | {'银屑病','治疗'} | {'银屑病','治'} | 0.33
4 | 2 | 银屑病怎么能治疗好? | 银屑病怎么治疗? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
4 | 3 | 银屑病怎么能治疗好? | 银屑病怎么治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
4 | 4 | 银屑病怎么能治疗好? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
(16 rows)
Image for post
Image for post
postgres=# with t(c1,c2,c3) as   
(select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1)
select * from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2)
simulate from t t1,t t2 where t1.c1<>t2.c1 and t1.c1<t2.c1) t;
t1c1 | t2c1 | t1c2 | t2c2 | t1c3 | t2c3 | simulate
------+------+--------------------+----------------------+-------------------+-------------------+----------
1 | 2 | 银屑病怎么治? | 银屑病怎么治疗? | {'银屑病','治'} | {'银屑病','治疗'} | 0.33
1 | 3 | 银屑病怎么治? | 银屑病怎么治疗好? | {'银屑病','治'} | {'银屑病','治疗'} | 0.33
1 | 4 | 银屑病怎么治? | 银屑病怎么能治疗好? | {'银屑病','治'} | {'银屑病','治疗'} | 0.33
2 | 3 | 银屑病怎么治疗? | 银屑病怎么治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
2 | 4 | 银屑病怎么治疗? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
3 | 4 | 银屑病怎么治疗好? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
(6 rows)
postgres=# with t(c1,c2,c3) as   
(select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1)
select * from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2)
simulate from t t1,t t2 where t1.c1<>t2.c1 and t1.c1<t2.c1) t where simulate>0.5;
t1c1 | t2c1 | t1c2 | t2c2 | t1c3 | t2c3 | simulate
------+------+--------------------+----------------------+-------------------+-------------------+----------
2 | 3 | 银屑病怎么治疗? | 银屑病怎么治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
2 | 4 | 银屑病怎么治疗? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
3 | 4 | 银屑病怎么治疗好? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
(3 rows)
delete from tdup1 where id in (with t(c1,c2,c3) as   
(select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1)
select t2c1 from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2)
simulate from t t1,t t2 where t1.c1<>t2.c1 and t1.c1<t2.c1) t where simulate>0.5);

例如 :
postgres=# insert into tdup1 values (11, '白血病怎么治?');
INSERT 0 1
postgres=# insert into tdup1 values (22, '白血病怎么治疗?');
INSERT 0 1
postgres=# insert into tdup1 values (13, '白血病怎么治疗好?');
INSERT 0 1
postgres=# insert into tdup1 values (24, '白血病怎么能治疗好?');
INSERT 0 1
postgres=#
postgres=# with t(c1,c2,c3) as
(select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1)
select * from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2)
simulate from t t1,t t2 where t1.c1<>t2.c1 and t1.c1<t2.c1) t where simulate>0.5;
t1c1 | t2c1 | t1c2 | t2c2 | t1c3 | t2c3 | simulate
------+------+--------------------+----------------------+-------------------+-------------------+----------
2 | 3 | 银屑病怎么治疗? | 银屑病怎么治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
2 | 4 | 银屑病怎么治疗? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
3 | 4 | 银屑病怎么治疗好? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00
22 | 24 | 白血病怎么治疗? | 白血病怎么能治疗好? | {'治疗','白血病'} | {'治疗','白血病'} | 1.00
13 | 22 | 白血病怎么治疗好? | 白血病怎么治疗? | {'治疗','白血病'} | {'治疗','白血病'} | 1.00
13 | 24 | 白血病怎么治疗好? | 白血病怎么能治疗好? | {'治疗','白血病'} | {'治疗','白血病'} | 1.00
(6 rows)

postgres=# begin;
BEGIN
postgres=# delete from tdup1 where id in (with t(c1,c2,c3) as
postgres(# (select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1)
postgres(# select t2c1 from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2)
postgres(# simulate from t t1,t t2 where t1.c1<>t2.c1 and t1.c1<t2.c1) t where simulate>0.5);
DELETE 4
postgres=# select * from tdup1 ;
id | info
----+--------------------
1 | 银屑病怎么治?
2 | 银屑病怎么治疗?
11 | 白血病怎么治?
13 | 白血病怎么治疗好?
(4 rows)
postgres=# explain delete from tdup1 where id in (with t(c1,c2,c3) as   
(select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1)
select t2c1 from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2)
simulate from t t1,t t2 where t1.c1<>t2.c1 and t1.c1<t2.c1) t where simulate>0.5);
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------
Delete on tdup1 (cost=10005260133.58..10005260215.84 rows=2555 width=34)
-> Hash Join (cost=10005260133.58..10005260215.84 rows=2555 width=34)
Hash Cond: (tdup1.id = "ANY_subquery".t2c1)
-> Seq Scan on tdup1 (cost=0.00..61.10 rows=5110 width=10)
-> Hash (cost=10005260131.08..10005260131.08 rows=200 width=32)
-> HashAggregate (cost=10005260129.08..10005260131.08 rows=200 width=32)
Group Key: "ANY_subquery".t2c1
-> Subquery Scan on "ANY_subquery" (cost=10000002667.20..10005252911.99 rows=2886838 width=32)
-> Subquery Scan on t (cost=10000002667.20..10005224043.61 rows=2886838 width=4)
Filter: (t.simulate > 0.5)
CTE t
-> Seq Scan on tdup1 tdup1_1 (cost=0.00..2667.20 rows=5110 width=36)
-> Nested Loop (cost=10000000000.00..10005113119.99 rows=8660513 width=68)
Join Filter: ((t1.c1 <> t2.c1) AND (t1.c1 < t2.c1))
-> CTE Scan on t t1 (cost=0.00..102.20 rows=5110 width=36)
-> CTE Scan on t t2 (cost=0.00..102.20 rows=5110 width=36)
(16 rows)

Summary

References

Original Source:

Follow me to keep abreast with the latest technology news, industry insights, and developer trends.

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store