Use Case Analysis — Real-time Precision Marketing and Group Selection

Image for post
Image for post

Use Case

Image for post
Image for post
Image for post
Image for post

Pain Points

Business Features

Business Pain Points

Technical Solution

Solution 1

KEY: 用户ID
标签1:
标签2:
...
标签N:
每个标签字段一个索引
and , or , not 组合
where 标签a and 标签b and ...

Solution 2

KEY:用户ID
VALUES:标签数组
标签数组字段: GIN倒排索引
与、或、非
where VALUES @> array[标签s] -- 与
where VALUES && array[标签s] -- 或
where not VALUES @> array[标签s] -- 非

Solution 3

KEY:标签ID
VALUES: 用户bitmap
标签ID字段: Btree索引
聚合bitmap: 与、或、非
and_agg(bitmaps) where KEY in (标签s) -- 与
or_agg(bitmaps) where KEY in (标签s) -- 或
except(bitmap1,bitmap2) -- 非

Demo

Solution 1 Demo

create table t_tag_dict (
tag int primary key, -- 标签(人群)id
info text, -- 人群描述
crt_time timestamp -- 时间
);
insert into t_tag_dict values (1, '男', now());
insert into t_tag_dict values (2, '女', now());
insert into t_tag_dict values (3, '大于24岁', now());
-- ...
insert into t_tag_dict
select generate_series(4,100000), md5(random()::text), clock_timestamp();
create table t_user_tag (
uid int8, -- 用户id
tag int, -- 用户对应标签(人群)
mod_time timestamp, -- 时间
primary key (tag,uid)
);
create or replace function gen_rand_tag(int,int) returns setof int as
$$
select case when random() > 0.5 then 1::int else 2::int end as tag
union all
select ceil(random()*$1)::int as tag from generate_series(1,$2);
$$ language sql strict volatile;
insert into t_user_tag
select uid, gen_rand_tag(100000,63) as tag, clock_timestamp()
from generate_series(1,20000000) as uid on conflict (uid,tag) do nothing;
-- 或使用如下方法加速导入
create sequence seq;
vi test.sql
insert into t_user_tag
select uid, gen_rand_tag(100000,63) as tag, clock_timestamp()
from nextval('seq'::regclass) as uid
on conflict(tag,uid) do nothing;
pgbench -M prepared -n -r -P 1 -f ./test.sql -c 50 -j 50 -t 400000
1、人群数量
select count(*) from
(
select uid from t_user_tag where tag=1
intersect
select uid from t_user_tag where tag=3
) t;
-- Time: 1494.789 ms (00:01.495)
2、提取人群ID
select uid from t_user_tag where tag=1
intersect
select uid from t_user_tag where tag=3;
-- Time: 3246.184 ms (00:03.246)
1、人群数量
select count(*) from
(
select uid from t_user_tag where tag=1
union
select uid from t_user_tag where tag=3
union
select uid from t_user_tag where tag=10
union
select uid from t_user_tag where tag=200
) t;
-- Time: 3577.714 ms (00:03.578)
2、提取人群ID
select uid from t_user_tag where tag=1
union
select uid from t_user_tag where tag=3
union
select uid from t_user_tag where tag=10
union
select uid from t_user_tag where tag=200;
-- Time: 5682.458 ms (00:05.682)
public | t_user_tag         | table | postgres | 62 GB   | 
public | t_user_tag_pkey | index | postgres | t_user_tag | 61 GB |

Solution 2 Demo

create table t_tag_dict (
tag int primary key, -- 标签(人群)id
info text, -- 人群描述
crt_time timestamp -- 时间
);
insert into t_tag_dict values (1, '男', now());
insert into t_tag_dict values (2, '女', now());
insert into t_tag_dict values (3, '大于24岁', now());
-- ...
insert into t_tag_dict
select generate_series(4,100000), md5(random()::text), clock_timestamp();
create table t_user_tags (
uid int8 primary key, -- 用户id
tags int[], -- 用户标签(人群)数组
mod_time timestamp -- 时间
);
create or replace function gen_rand_tags(int,int) returns int[] as $$
select array_agg(ceil(random()*$1)::int) from generate_series(1,$2);
$$ language sql strict;
select gen_rand_tags(100000, 8);                   gen_rand_tags                   
---------------------------------------------------
{43494,46038,74102,25308,99129,40893,33653,29690}
(1 row)
insert into t_user_tags 
select generate_series(1,10000000),
array_append(gen_rand_tags(100000, 63),1), now();
insert into t_user_tags
select generate_series(10000001,20000000),
array_append(gen_rand_tags(100000, 63),2), now();
create index idx_t_user_tags_1 on t_user_tags using gin (tags);
1、人群数量
select count(uid) from t_user_tags where tags @> array[1,3];
2、提取人群ID
select uid from t_user_tags where tags @> array[1,3];
1、人群数量
select count(uid) from t_user_tags where tags && array[1,3,10,200];
2、提取人群ID
select uid from t_user_tags where tags && array[1,3,10,200];

Solution 3 Demo

安装插件 – create extension roaringbitmap;bitmap输出格式 – set roaringbitmap.output_format='bytea|array';bitmap取值范围 – 40亿(int4) 构造bitmap –  rb_build(int4[]) bitmap转换为数组或多条记录 - rb_to_array(rb) – rb_iterate(rb) bitmap内包含对象个数 – rb_cardinality(rb) 逻辑运算: 与、或、异或、差
SELECT roaringbitmap('{1,2,3}') | roaringbitmap('{3,4,5}');
SELECT roaringbitmap('{1,2,3}') & roaringbitmap('{3,4,5}');
SELECT roaringbitmap('{1,2,3}') # roaringbitmap('{3,4,5}');
SELECT roaringbitmap('{1,2,3}') - roaringbitmap('{3,4,5}');
聚合运算: build rb、与、或、异或
SELECT rb_build_agg(e) FROM generate_series(1,100) e;
SELECT rb_or_agg(bitmap) FROM t1;
SELECT rb_and_agg(bitmap) FROM t1;
SELECT rb_xor_agg(bitmap) FROM t1;
聚合并统计对象数(与、或、异或)
rb_or_cardinality_agg
rb_and_cardinality_agg
rb_xor_cardinality_agg
逻辑判断: 包含、相交、相等、不相等
Opperator Input Output Desc Example Result
@> roaringbitmap,roaringbitmap bool contains roaringbitmap('{1,2,3}') @> roaringbitmap('{3,4,5}') f
@> roaringbitmap,integer bool contains roaringbitmap('{1,2,3,4,5}') @> 3 t
<@ roaringbitmap,roaringbitmap bool is contained by roaringbitmap('{1,2,3}') f
<@ integer,roaringbitmap bool is contained by 3 t
&& roaringbitmap,roaringbitmap bool overlap (have elements in common) roaringbitmap('{1,2,3}') && roaringbitmap('{3,4,5}') t
= roaringbitmap,roaringbitmap bool equal roaringbitmap('{1,2,3}') = roaringbitmap('{3,4,5}') f
<> roaringbitmap,roaringbitmap bool not equal roaringbitmap('{1,2,3}') <> roaringbitmap('{3,4,5}') t
create extension roaringbitmap;
create table t_tag_users (
tagid int primary key, -- 用户标签(人群)id
uid_offset int, -- 由于userid是int8类型,roaringbitmap内部使用int4存储,需要转换一下。
userbits roaringbitmap, -- 用户id聚合的 bitmap
mod_time timestamp -- 时间
);
insert into t_tag_users 
select tagid, uid_offset, rb_build_agg(uid::int) as userbits from
(
select
unnest(tags) as tagid,
(uid / (2^31)::int8) as uid_offset,
mod(uid, (2^31)::int8) as uid
from t_user_tags
) t
group by tagid, uid_offset;
1、人群数量
select sum(ub) from
(
select uid_offset,rb_and_cardinality_agg(userbits) as ub
from t_tag_users
where tagid in (1,3)
group by uid_offset
) t;
2、提取人群ID
select uid_offset,rb_and_agg(userbits) as ub
from t_tag_users
where tagid in (1,3)
group by uid_offset;
1、人群数量
select sum(ub) from
(
select uid_offset,rb_or_cardinality_agg(userbits) as ub
from t_tag_users
where tagid in (1,3,10,200)
group by uid_offset
) t;
2、提取人群ID
select uid_offset,rb_or_agg(userbits) as ub
from t_tag_users
where tagid in (1,3,10,200)
group by uid_offset;

Solution Comparison

Image for post
Image for post
Image for post
Image for post

Value of ApsaraDB RDS for PostgreSQL Solution

Original Source:

Follow me to keep abreast with the latest technology news, industry insights, and developer trends.

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store