前言
一个朋友做内核优化想去分析新增等待事件的一些性能影响,抓火焰图等方法无法抓取到具体的等待时间。这个时候注意到PG好像确实没有等待事件的等待时间的记录。
等待事件反应的是SQL进程“当前”所处的状态,除过“锁等待”等这样少部分等待事件的等待时间可能会比较久,其余大部分等待事件的“等待”过程是比较短的。对于这些事件我们不会去在意它的等待时间,并不会成为性能瓶颈。因此数据库并没有记录所有等待事件的时间。不过“锁等待”这种等待时间相对较久的事件已经有开始时间的记录了,pg14版本开始pg_locks系统视图中增加了一列waitstart。
那么假如我们就是想了解到任意一个等待事件什么时候开始的,什么时候结束的。就如同之前讲的场景,我们新增了一些等待事件,想了解这些事件正常等待时间,该如何去做呢?
以下纯属胡折腾,基本没有实际生产使用价值。
方案思考
我们通常可以查询pg_stat_activity来获取SQL执行状态信息,包括wait_event,state等。关于时间的有backend_start 进程启动时间,xact_start 事务开始时间,query_start sql开始执行时间,state_change 状态变更时间这四种。
可以照猫画虎,直接剽窃xact_start的处理逻辑来实现wait_event_start。
xact_start 在内核中描述为PgBackendStatus结构体的成员st_xact_start_timestamp。
设置的过程:
StartTransaction/PrepareTransaction/CommitTransaction/AbortTransaction -->pgstat_report_xact_timestamp: beentry->st_xact_start_timestamp={GetCurrentTimestamp || 0}
启动事务或者提交/回滚事务时,会调用pgstat_report_xact_timestamp函数来修改pgstat_report_xact_timestamp。
获取的过程:
pg_stat_get_activity: TimestampTzGetDatum(beentry->st_xact_start_timestamp)
使用pg_stat_activity查询时,直接从beentry->st_xact_start_timestamp获取。
beentry为当前进程对应的PgBackendStatus。
那么可以给PgBackendStatus添加一个成员st_wait_event_start_timestamp,
在pgstat_report_wait_start设置等待事件时同步设置对应的wait_event_start,在pgstat_report_wait_end等待事件结束时重置为0即可。
代码实现
使用版本为:postgresql-16.1
PgBackendStatus新增成员st_wait_event_start_timestamp
typedef struct PgBackendStatus
{
/* 省略部分代码行 */
/* Times when current backend, transaction, and activity started */
TimestampTz st_proc_start_timestamp;
TimestampTz st_xact_start_timestamp;
TimestampTz st_activity_start_timestamp;
TimestampTz st_state_start_timestamp;
/* Add by NickYoung at 2024-07-13 AM */
TimestampTz st_wait_event_start_timestamp;
/* End at 2024-07-13 AM */
/* 省略部分代码行 */
}
修改pgstat_report_wait_start函数,设置my_wait_event_info同时设置 beentry->st_wait_event_start_timestamp=GetCurrentTimestamp()
static inline void
pgstat_report_wait_start(uint32 wait_event_info)
{
/*
* Since this is a four-byte field which is always read and written as
* four-bytes, updates are atomic.
*/
*(volatile uint32 *) my_wait_event_info = wait_event_info;
/*
* Add by NickYoung at 2023-07-13 AM
* when call report_wait_start , set wait_event_start to CurrentTimestamp
*/
volatile PgBackendStatus *beentry = MyBEEntry;
TimestampTz wait_start_timestamp = 0;
if (!pgstat_track_activities || !beentry)
return;
/*
* Update my status entry, following the protocol of bumping
* st_changecount before and after. We use a volatile pointer here to
* ensure the compiler doesn't try to get cute.
*/
PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
wait_start_timestamp = GetCurrentTimestamp();
beentry->st_wait_event_start_timestamp = wait_start_timestamp;
PGSTAT_END_WRITE_ACTIVITY(beentry);
/* End at 2024-07-13 AM */
}
修改pgstat_report_wait_end函数,在等待事件结束时同步设置
beentry->st_wait_event_start_timestamp=0
static inline void
pgstat_report_wait_end(void)
{
/* see pgstat_report_wait_start() */
*(volatile uint32 *) my_wait_event_info = 0;
/*
* Add by NickYoung at 2023-07-13 AM
* when call report_wait_end , set wait_event_start to 0
*/
volatile PgBackendStatus *beentry = MyBEEntry;
if (!pgstat_track_activities || !beentry)
return;
/*
* Update my status entry, following the protocol of bumping
* st_changecount before and after. We use a volatile pointer here to
* ensure the compiler doesn't try to get cute.
*/
PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
beentry->st_wait_event_start_timestamp = 0;
PGSTAT_END_WRITE_ACTIVITY(beentry);
/* End at 2024-07-13 AM*/
}
修改pg_stat_get_activity函数,当存在wait_event时获取beentry->st_wait_event_start_timestamp。
/*
* Returns activity of PG backends.
*/
Datum
pg_stat_get_activity(PG_FUNCTION_ARGS)
{
/* Modify by NickYoung at 2023-07-13 AM
* Add values[31] for wait_event_start
*/
#define PG_STAT_GET_ACTIVITY_COLS 32
/* 省略部分代码行 */
/*
* if wait_event exists , get the wait_event_start from beentry
* else set nulls[31] is true
*/
if (wait_event)
{
values[7] = CStringGetTextDatum(wait_event);
values[31] = TimestampTzGetDatum(beentry->st_wait_event_start_timestamp);
}
else
{
nulls[7] = true;
nulls[31] = true;
}
/* 省略部分代码行 */
}
总览:

重新编译代码,重启实例。
测试验证
首先创建一个测试数据库testdb0713,在数据库里创建SQL 函数pg_stat_get_activity_test用来调用c函数pg_stat_get_activity。SQL函数里指定返回wait_event_start
CREATE OR REPLACE FUNCTION pg_catalog.pg_stat_get_activity_test(pid integer, OUT datid oid, OUT pid integer, OUT usesysid oid, OUT application_name text, OUT state text, OUT query text, OUT wait_event_type text, OUT wait_event text, OUT xact_start timestamp with time zone, OUT query_start timestamp with time zone, OUT backend_start timestamp with time zone, OUT state_change timestamp with time zone, OUT client_addr inet, OUT client_hostname text, OUT client_port integer, OUT backend_xid xid, OUT backend_xmin xid, OUT backend_type text, OUT ssl boolean, OUT sslversion text, OUT sslcipher text, OUT sslbits integer, OUT ssl_client_dn text, OUT ssl_client_serial numeric, OUT ssl_issuer_dn text, OUT gss_auth boolean, OUT gss_princ text, OUT gss_enc boolean, OUT gss_delegation boolean, OUT leader_pid integer, OUT query_id bigint,OUT wait_event_start timestamp with time zone)
RETURNS SETOF record
LANGUAGE internal
STABLE PARALLEL RESTRICTED ROWS 100
AS $function$pg_stat_get_activity$function$;
仿照pg_stat_activity视图结构创建pg_stat_activity_test视图,调用pg_stat_get_activity_test函数。
create View pg_stat_activity_test as
SELECT s.datid,
d.datname,
s.pid,
s.leader_pid,
s.usesysid,
u.rolname AS usename,
s.application_name,
s.client_addr,
s.client_hostname,
s.client_port,
s.backend_start,
s.xact_start,
s.query_start,
s.state_change,
s.wait_event_type,
s.wait_event,
s.wait_event_start,
s.state,
s.backend_xid,
s.backend_xmin,
s.query_id,
s.query,
s.backend_type
FROM pg_stat_get_activity_test(NULL::integer) s(datid, pid, usesysid, application_name, state, query, wait_event_type, wait_event, xact_start, query_start, backend_start, state_change, client_ad
dr, client_hostname, client_port, backend_xid, backend_xmin, backend_type, ssl, sslversion, sslcipher, sslbits, ssl_client_dn, ssl_client_serial, ssl_issuer_dn, gss_auth, gss_princ, gss_enc, gss_de
legation, leader_pid, query_id,wait_event_start)
LEFT JOIN pg_database d ON s.datid = d.oid
LEFT JOIN pg_authid u ON s.usesysid = u.oid;
session1创建表,执行一个长事务;session2 执行DDL;session3观察长事务提交前后wait_event及wait_event_start的变化。
session1:创建表t_test,并插入数据,长事务未提交。
testdb0713=# create table t_test(id int);
CREATE TABLE
testdb0713=# begin ;
BEGIN
testdb0713=*# insert into t_test select generate_series(1,100);
INSERT 0 100
testdb0713=*#
session2:执行DDL,truncate table t_test,被session1长事务阻塞等锁中。
testdb0713=# begin;
BEGIN
testdb0713=*# truncate table t_test ;
session3:观察到session2的wait_event_type为Lock,wait_event为relation ,wait_event_start为2024-07-13 21:23:53.932597+08
testdb0713=# select datid,datname,pid,query,backend_start,xact_start,state,state_change,query_start,wait_event_type,wait_event, wait_event_start from pg_stat_activity_test where backend_type='client backend' and pid in ('1004226','1005429') order by pid;
-[ RECORD 1 ]----+--------------------------------------------------
datid | 16810
datname | testdb0713
pid | 1004226
query | insert into t_test select generate_series(1,100);
backend_start | 2024-07-13 21:17:10.793283+08
xact_start | 2024-07-13 21:18:34.084355+08
state | idle in transaction
state_change | 2024-07-13 21:19:15.16208+08
query_start | 2024-07-13 21:19:15.16143+08
wait_event_type | Client
wait_event | ClientRead
wait_event_start | 2024-07-13 21:19:15.162094+08
-[ RECORD 2 ]----+--------------------------------------------------
datid | 16810
datname | testdb0713
pid | 1005429
query | truncate table t_test ;
backend_start | 2024-07-13 21:19:37.126686+08
xact_start | 2024-07-13 21:20:01.162827+08
state | active
state_change | 2024-07-13 21:23:52.932436+08
query_start | 2024-07-13 21:23:52.932433+08
wait_event_type | Lock
wait_event | relation
wait_event_start | 2024-07-13 21:23:53.932597+08
session1:执行commit,提交长事务;session2:truncate table t_test成功;session3:
观察到session2的wait_event_type为Client,wait_event为ClientRead ,wait_event_start为2024-07-13 21:25:02.678701+08
testdb0713=# select datid,datname,pid,query,backend_start,xact_start,state,state_change,query_start,wait_event_type,wait_event, wait_event_start from pg_stat_activity_test where backend_type='client backend' and pid in ('1004226','1005429') order by pid;
-[ RECORD 1 ]----+------------------------------
datid | 16810
datname | testdb0713
pid | 1004226
query | commit;
backend_start | 2024-07-13 21:17:10.793283+08
xact_start |
state | idle
state_change | 2024-07-13 21:25:02.678333+08
query_start | 2024-07-13 21:25:02.677822+08
wait_event_type | Client
wait_event | ClientRead
wait_event_start | 2024-07-13 21:25:02.678346+08
-[ RECORD 2 ]----+------------------------------
datid | 16810
datname | testdb0713
pid | 1005429
query | truncate table t_test ;
backend_start | 2024-07-13 21:19:37.126686+08
xact_start | 2024-07-13 21:20:01.162827+08
state | idle in transaction
state_change | 2024-07-13 21:25:02.678687+08
query_start | 2024-07-13 21:23:52.932433+08
wait_event_type | Client
wait_event | ClientRead
wait_event_start | 2024-07-13 21:25:02.678701+08
testdb0713=#
从结果来看,wait_event_start随着wait_event而变化是符合预期的,假如我要查询一个等待事件的等待时间now() - wait_event_start就可以得到结果。
小结
给PgBackendStatus添加一个成员st_wait_event_start_timestamp从而可以实现wait_event_start来记录等待事件开始时间。当然加入这个逻辑,肯定会有些性能损耗,毕竟每次设置等待事件时,多执行了几行代码;也可以新建一个GUC参数来控制,将wait_event_start理解为debug选项,默认关闭,当需要分析定位某个等待事件时再打开。




