PostgreSQL, SQL Server 逻辑增量 (通过逻辑标记update,delete) 同步到 Greenplum, PostgreSQL

create table t_src ( id int primary key, info text not null, is_del boolean default null, -- 删除标记，NULL表示未删除，非空表示已删除 mod_time timestamp not null default clock_timestamp() -- 插入、删除、修改的时间戳 );

2、创建索引，加速同步

create index idx_t_src_1 on t_src(mod_time);

3、创建触发器函数，更新、删除数据时，更新时间戳，同时修改删除标记位

当被删除的记录重新被插入时，把删除标记改成未删除。

create or replace function tg1_t_src() returns trigger as $$ declare begin NEW.mod_time := clock_timestamp(); select case when OLD.is_del is null and NEW.is_del = true then true else null end into NEW.is_del; -- 如果以前这个ID被删除过，则插入，并将is_del重新置为未删除 return NEW; end; $$ language plpgsql strict;

4、创建触发器，更新时触发

create trigger tg1 before update on t_src for each row execute procedure tg1_t_src();

5、创建规则，当删除记录时，使用UPDATE代替DELETE

create rule r1 as on delete to t_src do instead update t_src set is_del=true where t_src.id=OLD.id and t_src.is_del is null; -- 未标记为删除的记录is_del=null，标记为删除.

6、查看插入、更新、删除是否符合预期

```
postgres=# insert into t_src values (1, md5(random()::text)) on conflict (id) do update set info=excluded.info;
INSERT 0 1
postgres=# select * from t_src where id=1;
id | info | is_del | mod_time
----+----------------------------------+--------+----------------------------
1 | 56c21963342997fd8bf80a5b542abde9 | | 2018-05-12 08:54:19.393532
(1 row)

postgres=# insert into t_src values (1, md5(random()::text)) on conflict (id) do update set info=excluded.info;
INSERT 0 1
postgres=# select * from t_src where id=1;
id | info | is_del | mod_time
----+----------------------------------+--------+----------------------------
1 | 5bca407559081d6cfc1154fd0f17b6a9 | | 2018-05-12 08:54:23.465005
(1 row)

7、创建压测脚本

```
vi test.sql

\set id1 random(1,10000000)
\set id2 random(1,20000000)
insert into t_src values (:id1, md5(random()::text)) on conflict (id) do update set info=excluded.info;
delete from t_src where id=:id2;
```

8、压测，高压插入、更新、删除动作，每秒处理13.8万行。

```
pgbench -M prepared -n -r -P 1 -f ./test.sql -c 64 -j 64 -T 120

transaction type: ./test.sql
scaling factor: 1
query mode: prepared
number of clients: 64
number of threads: 64
duration: 120 s
number of transactions actually processed: 16634225
latency average = 0.462 ms
latency stddev = 0.506 ms
tps = 138437.925513 (including connections establishing)
tps = 138445.093708 (excluding connections establishing)
statement latencies in milliseconds:
0.002 \set id1 random(1,10000000)
0.000 \set id2 random(1,20000000)
0.298 insert into t_src values (:id1, md5(random()::text)) on conflict (id) do update set info=excluded.info;
0.163 delete from t_src where id=:id2;
```

目标端

1、创建目标表

create table t_dst ( id int primary key, info text not null, is_del boolean default null, mod_time timestamp not null default clock_timestamp() );

2、创建索引

create index idx_t_dst_1 on t_dst(mod_time);

数据同步

1、源端数据增量同步到目标端

(在同一DB中模拟，后面有例子讲源和目标在不同集群的DEMO)

```
do language plpgsql $$
declare
pos timestamp; -- 位点
pre interval := '10 s'; -- 缓冲10秒，防止空洞，根据业务层设置
begin
-- 已同步位点
select max(mod_time) into pos from t_dst;

-- NULL表示目标端没有数据
if pos is null then
-- 缓冲上限
pos := now()::timestamp;
insert into t_dst select * from t_src where mod_time < (pos - pre) on conflict (id) do update set -- on conflict 合并insert,update
info=excluded.info, is_del=excluded.is_del, mod_time=excluded.mod_time
where t_dst.info is distinct from excluded.info or
t_dst.is_del is distinct from excluded.is_del or
t_dst.mod_time is distinct from excluded.mod_time;
return;
end if;

-- 同步超过位点的数据
insert into t_dst select * from t_src where mod_time > pos and mod_time < (now()::timestamp - pre) on conflict (id) do update set
info=excluded.info, is_del=excluded.is_del, mod_time=excluded.mod_time
where t_dst.info is distinct from excluded.info or
t_dst.is_del is distinct from excluded.is_del or
t_dst.mod_time is distinct from excluded.mod_time;
return;
end;
$$;
```

2、一边压测，一边调用以上过程同步，最后达到一致性状态，检查一致性的SQL如下：

```
select sum(hashtext((t.)::text)),count(),sum(case is_del when true then 0 else 1 end) from t_src t;

  sum       |  count  |   sum
复制

----------------+---------+---------
-2967631712018 | 8299587 | 6199359
(1 row)

select sum(hashtext((t.)::text)),count(),sum(case is_del when true then 0 else 1 end) from t_dst t;

  sum       |  count  |   sum
复制

----------------+---------+---------
-2967631712018 | 8299587 | 6199359
(1 row)
```

二、源PostgreSQL, 目标Greenplum, 增量复制delete,insert,update

第一种方法，DELETE使用逻辑标记，所以实际上数据并没有删除。

还有一种方法，可以把DELETE的记录，MOVE到另一张表。

方法与一差不多，只是把rule改一下，改成INSERT到其他表。

略

三、源PostgreSQL, 目标Greenplum, 增量复制insert,update

上游没有del，或者说不需要捕获DEL操作 (DEL操作，人为在上下游同时执行SQL来删除)

源端

1、创建表

create table tbl_src ( id int primary key, info text not null, mod_time timestamp not null default clock_timestamp() -- update时间戳 );

2、创建索引

create index idx_tbl_src_1 on tbl_src(mod_time);

3、创建触发器函数，更新时，自动更新时间戳字段

create or replace function tg1_tbl_src() returns trigger as $$ declare begin NEW.mod_time := clock_timestamp(); return NEW; end; $$ language plpgsql strict;

4、创建触发器

create trigger tg1 before update on tbl_src for each row execute procedure tg1_tbl_src();

5、压测

```
vi test.sql
\set id1 random(1,10000000)
insert into tbl_src values (:id1, md5(random()::text)) on conflict (id) do update set info=excluded.info;

pgbench -M prepared -n -r -P 1 -f ./test.sql -c 64 -j 64 -T 120
```

目标端

1、创建目标表

create table tbl_dst ( id int primary key, info text not null, mod_time timestamp not null default clock_timestamp() );

2、创建索引

create index idx_tbl_dst_1 on tbl_dst(mod_time);

数据同步

在同一个实例中模拟。

1、调用过程，同步数据。

```
do language plpgsql $$
declare
pos timestamp; -- 位点
pre interval := '10 s'; -- 缓冲10秒，防止空洞，根据业务层设置
begin
select max(mod_time) into pos from tbl_dst;
if pos is null then
pos := now()::timestamp;
insert into tbl_dst select * from tbl_src where mod_time < (pos - pre) on conflict (id) do update set
info=excluded.info, mod_time=excluded.mod_time
where tbl_dst.info is distinct from excluded.info or
tbl_dst.mod_time is distinct from excluded.mod_time;
return;
end if;

insert into tbl_dst select * from tbl_src where mod_time > pos and mod_time < (now()::timestamp - pre) on conflict (id) do update set
info=excluded.info, mod_time=excluded.mod_time
where tbl_dst.info is distinct from excluded.info or
tbl_dst.mod_time is distinct from excluded.mod_time;
return;
end;
$$;
```

2、一边压测，一边调用以上过程同步数据，最后检查一致性

```
select sum(hashtext((t.)::text)),count() from tbl_src t;

  sum      |  count
复制

---------------+---------
2739329060132 | 6725930
(1 row)

select sum(hashtext((t.)::text)),count() from tbl_dst t;

  sum      |  count
复制

---------------+---------
2739329060132 | 6725930
(1 row)
```

四、源PostgreSQL, 目标Greenplum, 增量复制insert,update

同样，不包括DELETE的复制。只复制insert和update。

由于greenplum 没有 insert into on conflict 的功能，所以需要采用临时表，分步实现MERGE。

同步方法

1. 目标端，HDB PG, 获取max(mod_time)

2. 源端，PG, 拉取增量

3. 目标端，增量数据，导入HDB PG 临时表

4. 目标端，HDB PG ，DELETE from 目标表 using 临时表

5. 目标端，HDB PG ，insert into 目标表 select * from 临时表

6. 目标端，清空临时表

目标端

1、创建临时表

create table tbl_dst_tmp ( id int primary key, info text not null, mod_time timestamp not null default clock_timestamp() );

2、创建数据同步的脚本

```
vi imp.sh

!/bin/bash

1. HDB PG, 获取max(mod_time)

MOD_TIME=PGPASSWORD="pwd1234" psql -h 127.0.0.1 -p 4000 -U postgres postgres -q -t -A -c "select coalesce(max(mod_time),'4714-11-24 00:00:00 BC'::timestamp) from tbl_dst"

2. PG, 拉取增量

3. 增量数据，导入HDB PG 临时表

使用 linux 管道同步上下游

PGPASSWORD="pwd" psql -h 10.31.124.69 -p 4000 -U postgres postgres -c "copy (select * from tbl_src where mod_time > '$MOD_TIME'::timestamp and mod_time < (now()-'10 sec'::interval) ) to stdout" | PGPASSWORD="pwd1234" psql -h 127.0.0.1 -p 4000 -U postgres postgres -c "copy tbl_dst_tmp from stdin"

4. HDB PG ，DELETE from 目标表 using 临时表

5. HDB PG ，insert into 目标表 select * from 临时表

6. 清空临时表

放在一个事务中，并且对临时表加锁保护。

PGPASSWORD="pwd1234" psql -h 127.0.0.1 -p 4000 -U postgres postgres <<EOF
begin;
lock table tbl_dst_tmp in ACCESS EXCLUSIVE mode;
delete from tbl_dst using tbl_dst_tmp where tbl_dst.id=tbl_dst_tmp.id;
insert into tbl_dst select * from tbl_dst_tmp;
truncate tbl_dst_tmp;
end;
EOF

chmod 700 imp.sh
```

3、一边压测，一边同步

```
COPY 6725930
BEGIN
LOCK TABLE
DELETE 0
INSERT 0 6725930
TRUNCATE TABLE
COMMIT

COPY 0
BEGIN
LOCK TABLE
DELETE 0
INSERT 0 0
TRUNCATE TABLE
COMMIT

COPY 78423
BEGIN
LOCK TABLE
DELETE 52796
INSERT 0 78423
TRUNCATE TABLE
COMMIT

COPY 486817
BEGIN
LOCK TABLE
DELETE 327603
INSERT 0 486817
TRUNCATE TABLE
COMMIT

COPY 2019059
BEGIN
LOCK TABLE
DELETE 1381561
INSERT 0 2019059
TRUNCATE TABLE
COMMIT
```

```
COPY 864687
Timing is on.
BEGIN
Time: 0.149 ms
LOCK TABLE
Time: 0.629 ms
DELETE 652117
Time: 11029.147 ms (00:11.029)
INSERT 0 864687
Time: 10180.576 ms (00:10.181)
TRUNCATE TABLE
Time: 4.729 ms
COMMIT
Time: 53.555 ms

COPY 2235200
Timing is on.
BEGIN
Time: 0.178 ms
LOCK TABLE
Time: 0.702 ms
DELETE 1719572
Time: 18621.210 ms (00:18.621)
INSERT 0 2235200
Time: 27716.155 ms (00:27.716)
TRUNCATE TABLE
Time: 63.408 ms
COMMIT
Time: 81.915 ms

COPY 5448790 Timing is on. BEGIN Time: 0.141 ms LOCK TABLE Time: 0.552 ms DELETE 4486067 Time: 15297.884 ms (00:15.298) INSERT 0 5448790 Time: 35654.723 ms (00:35.655) TRUNCATE TABLE Time: 1.860 ms COMMIT Time: 125.503 ms ```

4、检查数据一致性

```
postgres=# select sum(hashtext((t.)::text)),count() from tbl_dst t;
sum | count
----------------+---------
-1351270286348 | 7548269
(1 row)

postgres=# select sum(hashtext((t.)::text)),count() from tbl_src t;
sum | count
----------------+---------
-1351270286348 | 7548269
(1 row)
```

五、源SQL Server, 目标Greenplum, 增量复制insert,update

SQL Server有一个时间戳类型timestamp，会自动记录数据插入，更新的时间，所以不需要用触发器来实现标记。

SQL Server的timestamp，每个表只能建一个，并且在表上绝对唯一。8个字节，是一个相对时间，可以与bigint互相转换。

目标端可以使用bigint来存储SQL Server的timestamp类型。

其他的方法与章节四类似，注意标记位的读取、比较时需要转换一下(timestamp, bigint)。 BIGINT 最小值为 (-9223372036854775808)::int8 。

DEMO1，位点从HDB PG读取。

1、HDB PG, 创建临时表

``` create table tbl_dst (
id int primary key,
info text not null,
mod_time int8 not null -- 使用int8代替时间戳，对应ms sql的timestamp );

-- 临时表

create table tbl_dst_tmp (
id int primary key,
info text not null,
mod_time int8 not null -- 使用int8代替时间戳，对应ms sql的timestamp );
```

2、数据同步的流程（MS SQL -> HDB PG，无法封装到LINUX SHELL中，可能需要ETL程序介入。流程大致如下）

```

1. HDB PG, 获取max(mod_time)

MOD_TIME=PGPASSWORD="pwd1234" psql -h 127.0.0.1 -p 4000 -U postgres postgres -q -t -A -c "select coalesce(max(mod_time),(-9223372036854775808)::int8) from tbl_dst"