Use Case Analysis — Real-time Precision Marketing and Group Selection

Use Case

Pain Points

Business Features

Business Pain Points

Technical Solution

Solution 1

KEY: 用户ID
标签1:
标签2:
...
标签N:
每个标签字段一个索引
and , or , not 组合
where 标签a and 标签b and ...

Solution 2

KEY:用户ID
VALUES:标签数组
标签数组字段: GIN倒排索引
与、或、非
where VALUES @> array[标签s] -- 与
where VALUES && array[标签s] -- 或
where not VALUES @> array[标签s] -- 非

Solution 3

KEY:标签ID
VALUES: 用户bitmap
标签ID字段: Btree索引
聚合bitmap: 与、或、非
and_agg(bitmaps) where KEY in (标签s) -- 与
or_agg(bitmaps) where KEY in (标签s) -- 或
except(bitmap1,bitmap2) -- 非

Demo

Solution 1 Demo

create table t_tag_dict (
tag int primary key, -- 标签(人群)id
info text, -- 人群描述
crt_time timestamp -- 时间
);
insert into t_tag_dict values (1, '男', now());
insert into t_tag_dict values (2, '女', now());
insert into t_tag_dict values (3, '大于24岁', now());
-- ...
insert into t_tag_dict
select generate_series(4,100000), md5(random()::text), clock_timestamp();
create table t_user_tag (
uid int8, -- 用户id
tag int, -- 用户对应标签(人群)
mod_time timestamp, -- 时间
primary key (tag,uid)
);
create or replace function gen_rand_tag(int,int) returns setof int as
$$
select case when random() > 0.5 then 1::int else 2::int end as tag
union all
select ceil(random()*$1)::int as tag from generate_series(1,$2);
$$ language sql strict volatile;
insert into t_user_tag
select uid, gen_rand_tag(100000,63) as tag, clock_timestamp()
from generate_series(1,20000000) as uid on conflict (uid,tag) do nothing;
-- 或使用如下方法加速导入
create sequence seq;
vi test.sql
insert into t_user_tag
select uid, gen_rand_tag(100000,63) as tag, clock_timestamp()
from nextval('seq'::regclass) as uid
on conflict(tag,uid) do nothing;
pgbench -M prepared -n -r -P 1 -f ./test.sql -c 50 -j 50 -t 400000
1、人群数量
select count(*) from
(
select uid from t_user_tag where tag=1
intersect
select uid from t_user_tag where tag=3
) t;
-- Time: 1494.789 ms (00:01.495)
2、提取人群ID
select uid from t_user_tag where tag=1
intersect
select uid from t_user_tag where tag=3;
-- Time: 3246.184 ms (00:03.246)
1、人群数量
select count(*) from
(
select uid from t_user_tag where tag=1
union
select uid from t_user_tag where tag=3
union
select uid from t_user_tag where tag=10
union
select uid from t_user_tag where tag=200
) t;
-- Time: 3577.714 ms (00:03.578)
2、提取人群ID
select uid from t_user_tag where tag=1
union
select uid from t_user_tag where tag=3
union
select uid from t_user_tag where tag=10
union
select uid from t_user_tag where tag=200;
-- Time: 5682.458 ms (00:05.682)
public | t_user_tag         | table | postgres | 62 GB   | 
public | t_user_tag_pkey | index | postgres | t_user_tag | 61 GB |

Solution 2 Demo

create table t_tag_dict (
tag int primary key, -- 标签(人群)id
info text, -- 人群描述
crt_time timestamp -- 时间
);
insert into t_tag_dict values (1, '男', now());
insert into t_tag_dict values (2, '女', now());
insert into t_tag_dict values (3, '大于24岁', now());
-- ...
insert into t_tag_dict
select generate_series(4,100000), md5(random()::text), clock_timestamp();
create table t_user_tags (
uid int8 primary key, -- 用户id
tags int[], -- 用户标签(人群)数组
mod_time timestamp -- 时间
);
create or replace function gen_rand_tags(int,int) returns int[] as $$
select array_agg(ceil(random()*$1)::int) from generate_series(1,$2);
$$ language sql strict;
select gen_rand_tags(100000, 8);                   gen_rand_tags                   
---------------------------------------------------
{43494,46038,74102,25308,99129,40893,33653,29690}
(1 row)
insert into t_user_tags 
select generate_series(1,10000000),
array_append(gen_rand_tags(100000, 63),1), now();
insert into t_user_tags
select generate_series(10000001,20000000),
array_append(gen_rand_tags(100000, 63),2), now();
create index idx_t_user_tags_1 on t_user_tags using gin (tags);
1、人群数量
select count(uid) from t_user_tags where tags @> array[1,3];
2、提取人群ID
select uid from t_user_tags where tags @> array[1,3];
1、人群数量
select count(uid) from t_user_tags where tags && array[1,3,10,200];
2、提取人群ID
select uid from t_user_tags where tags && array[1,3,10,200];

Solution 3 Demo

安装插件 – create extension roaringbitmap;bitmap输出格式 – set roaringbitmap.output_format='bytea|array';bitmap取值范围 – 40亿(int4) 构造bitmap –  rb_build(int4[]) bitmap转换为数组或多条记录 - rb_to_array(rb) – rb_iterate(rb) bitmap内包含对象个数 – rb_cardinality(rb) 逻辑运算: 与、或、异或、差
SELECT roaringbitmap('{1,2,3}') | roaringbitmap('{3,4,5}');
SELECT roaringbitmap('{1,2,3}') & roaringbitmap('{3,4,5}');
SELECT roaringbitmap('{1,2,3}') # roaringbitmap('{3,4,5}');
SELECT roaringbitmap('{1,2,3}') - roaringbitmap('{3,4,5}');
聚合运算: build rb、与、或、异或
SELECT rb_build_agg(e) FROM generate_series(1,100) e;
SELECT rb_or_agg(bitmap) FROM t1;
SELECT rb_and_agg(bitmap) FROM t1;
SELECT rb_xor_agg(bitmap) FROM t1;
聚合并统计对象数(与、或、异或)
rb_or_cardinality_agg
rb_and_cardinality_agg
rb_xor_cardinality_agg
逻辑判断: 包含、相交、相等、不相等
Opperator Input Output Desc Example Result
@> roaringbitmap,roaringbitmap bool contains roaringbitmap('{1,2,3}') @> roaringbitmap('{3,4,5}') f
@> roaringbitmap,integer bool contains roaringbitmap('{1,2,3,4,5}') @> 3 t
<@ roaringbitmap,roaringbitmap bool is contained by roaringbitmap('{1,2,3}') f
<@ integer,roaringbitmap bool is contained by 3 t
&& roaringbitmap,roaringbitmap bool overlap (have elements in common) roaringbitmap('{1,2,3}') && roaringbitmap('{3,4,5}') t
= roaringbitmap,roaringbitmap bool equal roaringbitmap('{1,2,3}') = roaringbitmap('{3,4,5}') f
<> roaringbitmap,roaringbitmap bool not equal roaringbitmap('{1,2,3}') <> roaringbitmap('{3,4,5}') t
create extension roaringbitmap;
create table t_tag_users (
tagid int primary key, -- 用户标签(人群)id
uid_offset int, -- 由于userid是int8类型,roaringbitmap内部使用int4存储,需要转换一下。
userbits roaringbitmap, -- 用户id聚合的 bitmap
mod_time timestamp -- 时间
);
insert into t_tag_users 
select tagid, uid_offset, rb_build_agg(uid::int) as userbits from
(
select
unnest(tags) as tagid,
(uid / (2^31)::int8) as uid_offset,
mod(uid, (2^31)::int8) as uid
from t_user_tags
) t
group by tagid, uid_offset;
1、人群数量
select sum(ub) from
(
select uid_offset,rb_and_cardinality_agg(userbits) as ub
from t_tag_users
where tagid in (1,3)
group by uid_offset
) t;
2、提取人群ID
select uid_offset,rb_and_agg(userbits) as ub
from t_tag_users
where tagid in (1,3)
group by uid_offset;
1、人群数量
select sum(ub) from
(
select uid_offset,rb_or_cardinality_agg(userbits) as ub
from t_tag_users
where tagid in (1,3,10,200)
group by uid_offset
) t;
2、提取人群ID
select uid_offset,rb_or_agg(userbits) as ub
from t_tag_users
where tagid in (1,3,10,200)
group by uid_offset;

Solution Comparison

Value of ApsaraDB RDS for PostgreSQL Solution

Original Source:

Follow me to keep abreast with the latest technology news, industry insights, and developer trends.