Similarity Analysis for PostgreSQL Text Databases

  • sorting by image or facial similarity
  • sorting by overlapping hobbies
  • sorting by age approximation
  • sorting by distance
  • sorting by textual similarity
  • sorting by word-break similarities

Sorting by Image or Facial Similarity

Sorting by Overlapping Hobbies

Sorting by Age Approximation

postgres=# create extension btree_gist;
CREATE EXTENSION
postgres=# create table test12(id int);
CREATE TABLE
postgres=# insert into test12 select trunc(random()*1000) from generate_series(1,100000);
INSERT 0 100000
postgres=# create index idx_test12 on test12 using gist(id);
CREATE INDEX
postgres=# select * from test12 order by id <-> 100 limit 10;
id
-----
100
100
100
100
100
100
100
100
100
100
(10 rows)

Sorting by Distance

postgres=# create table test13(c1 point);
CREATE TABLE
postgres=# insert into test13 select ('('||trunc(random()*1000)||','||trunc(random()*5000)||')')::point from generate_series(1,10000);
INSERT 0 10000
postgres=# create index idx_test13 on test13 using gist(c1);
CREATE INDEX
postgres=# select * from test13 order by c1 <-> point '(1,10000)' limit 10;
c1
------------
(58,4993)
(191,4995)
(48,4991)
(326,4998)
(99,4988)
(205,4991)
(348,4998)
(53,4986)
(174,4988)
(136,4984)
(10 rows)

Sorting by Textual Similarity

postgres=# create extension pg_trgm;
CREATE EXTENSION
postgres=# create table test14(c1 text);
CREATE TABLE
postgres=# insert into test14 values ('Hello Digoal'), ('China'), ('Hello China'), ('Nihao Digoal');
INSERT 0 4
postgres=# select * from test14;
c1
--------------
Hello Digoal
China
Hello China
Nihao Digoal
(4 rows)
postgres=# create index idx_test14 on test14 using gist(c1 gist_trgm_ops);
CREATE INDEX
postgres=# explain select *,c1 <-> 'Digoal' from test14 order by c1 <-> 'Digoal' limit 2;
QUERY PLAN
--------------------------------------------------------------------------------
Limit (cost=0.13..4.17 rows=2 width=36)
-> Index Scan using idx_test14 on test14 (cost=0.13..8.21 rows=4 width=36)
Order By: (c1 <-> 'Digoal'::text)
(3 rows)
postgres=# select *,c1 <-> 'Digoal' from test14 order by c1 <-> 'Digoal' limit 2;
c1 | ?column?
--------------+----------
Hello Digoal | 0.461538
Nihao Digoal | 0.461538
(2 rows)

Sorting by Word-Break Similarity

export PATH=/home/digoal/pgsql9.6/bin:$PATH
git clone https://github.com/postgrespro/rum
cd rum
make USE_PGXS=1
make USE_PGXS=1 install
//
//
git clone https://github.com/jaiminpan/pg_jieba
cd pg_jieba
make USE_PGXS=1
make USE_PGXS=1 install
//
//
postgres=# create extension rum;
CREATE EXTENSION
postgres=# create extension pg_jieba;
CREATE EXTENSION
// Word Example examples:
postgres=# select * from to_tsvector('jiebacfg', 'Xiao Ming graduated with a master's degree from the Institute of Computing Technology of the Chinese Academy of Sciences, and later went on to study at Kyoto University in Japan.');
to_tsvector
----------------------------------------------------------------------------------
'Chinese Academy of Sciences':5 'Xiao Ming':1 'Kyoto University':10 'graduated':3 'study':11 'master's degree':2 'Institute of Computing Technology':6
(1 row)
// With similarities
postgres=# select * from rum_ts_distance(to_tsvector('jiebacfg', 'Xiao Ming graduated with a master's degree from the Institute of Computing Technology of the Chinese Academy of Sciences, and later went on to study at Kyoto University in Japan.') , to_tsquery(''Institute of Computing Technology'));
rum_ts_distance
-----------------
16.4493
(1 row)
// Without similarities
postgres=# select * from rum_ts_distance(to_tsvector('jiebacfg', 'Xiao Ming graduated with a master's degree from the Institute of Computing Technology of the Chinese Academy of Sciences, and later went on to study at Kyoto University in Japan.) , to_tsquery('Computer Science'));
rum_ts_distance
-----------------
Infinity
(1 row)
// One or the other has similarities
postgres=# select * from rum_ts_distance(to_tsvector('jiebacfg', 'Xiao Ming graduated with a master's degree from the Institute of Computing Technology of the Chinese Academy of Sciences, and later went on to study at Kyoto University in Japan.') , to_tsquery('''Institute of Computing Technology | master's degree'));
rum_ts_distance
-----------------
8.22467
(1 row)
// Both have similarities
postgres=# select * from rum_ts_distance(to_tsvector('jiebacfg', 'Xiao Ming graduated with a master's degree from the Institute of Computing Technology of the Chinese Academy of Sciences, and later went on to study at Kyoto University in Japan.') , to_tsquery('''Institute of Computing Technology & master's degree'));
rum_ts_distance
-----------------
32.8987
(1 row)
// Order
postgres=# create table test15(c1 tsvector);
CREATE TABLE
postgres=# insert into test15 values (to_tsvector('jiebacfg', 'hello china, i''m digoal')), (to_tsvector('jiebacfg', 'hello world, i''m postgresql')), (to_tsvector('jiebacfg', 'how are you, i''m digoal'));
INSERT 0 3
postgres=# select * from test15;
c1
-----------------------------------------------------
' ':2,5,9 'china':3 'digoal':10 'hello':1 'm':8
' ':2,5,9 'hello':1 'm':8 'postgresql':10 'world':3
' ':2,4,7,11 'digoal':12 'm':10
(3 rows)
postgres=# create index idx_test15 on test15 using rum(c1 rum_tsvector_ops);
CREATE INDEX
postgres=# select *,c1 <=> to_tsquery('hello') from test15;
c1 | ?column?
-----------------------------------------------------+----------
' ':2,5,9 'china':3 'digoal':10 'hello':1 'm':8 | 16.4493
' ':2,5,9 'hello':1 'm':8 'postgresql':10 'world':3 | 16.4493
' ':2,4,7,11 'digoal':12 'm':10 | Infinity
(3 rows)
postgres=# explain select *,c1 <=> to_tsquery('postgresql') from test15 order by c1 <=> to_tsquery('postgresql');
QUERY PLAN
--------------------------------------------------------------------------------
Index Scan using idx_test15 on test15 (cost=3600.25..3609.06 rows=3 width=36)
Order By: (c1 <=> to_tsquery('postgresql'::text))
(2 rows)

Original Source

--

--

--

Follow me to keep abreast with the latest technology news, industry insights, and developer trends. Alibaba Cloud website:https://www.alibabacloud.com

Love podcasts or audiobooks? Learn on the go with our new app.

Recommended from Medium

First OAuth implementation in sensenet ECM

Productionize ML Models in Seconds

Arduino Final Project

A practical evaluation of spectrum-based fault localization — A retrospective

Progress Euler

Adding Elasticity to the datanode in Hadoop

How to Reduce Software Defects

Business Intelligence in Software Projects

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Alibaba Cloud

Alibaba Cloud

Follow me to keep abreast with the latest technology news, industry insights, and developer trends. Alibaba Cloud website:https://www.alibabacloud.com

More from Medium

Cassandra Monitoring Setup Using Open Source Tools

kafkaVision: An open-source monitoring tool for Apache Kafka

Using Search Template — ElasticSearch

Setting up Kafka on Kubernetes - an easy way