关于 PolarDB PostgreSQL 版
PolarDB PostgreSQL 版是一款阿里云自主研发的云原生关系型数据库产品,100% 兼容 PostgreSQL,高度兼容Oracle语法;采用基于 Shared-Storage 的存储计算分离架构,具有极致弹性、毫秒级延迟、HTAP 、Ganos全空间数据处理能力和高可靠、高可用、弹性扩展等企业级数据库特性。同时,PolarDB PostgreSQL 版具有大规模并行计算能力,可以应对 OLTP 与 OLAP 混合负载。
功能介绍
原生的PostgreSQL中autovacuum由两类进程构成:autovacuum launcher和autovacuum worker。autovacuum launcher进程是一个持续运行的进程,由Postmaster在入口函数StartAutoVacLauncher中fork创建,是Postmaster的子进程。autovacuum worker进程一般是在autovacuum launcher进程决策出需要vacuum的DB后,由launcher告知postmaster进程fork出来去处理该DB的。autovacuum中的vacuum主要负责回收死元组占用的空间,以及对老的事务ID做freeze。
原理概述
autovacuum launcher
关键结构体
/*
 * State shared between the autovacuum launcher and its workers.
 * NOTE(review): the name suggests this lives in shared memory -- confirm.
 */
typedef struct
{
    /* One flag per signal kind; the array is sized by AutoVacNumSignals. */
    sig_atomic_t av_signal[AutoVacNumSignals];
    /* Presumably the PID of the autovacuum launcher -- TODO confirm. */
    pid_t av_launcherpid;
    /*
     * Worker slots still available for launching; the initial count is
     * tied to the autovacuum_max_workers GUC.
     */
    dlist_head av_freeWorkers;
    /* Workers that are currently performing autovacuum. */
    dlist_head av_runningWorkers;
    /* The worker that is in the process of starting up. */
    WorkerInfo av_startingWorker;
    /* Presumably a fixed-size table of queued work items -- TODO confirm. */
    AutoVacuumWorkItem av_workItems[NUM_WORKITEMS];
} AutoVacuumShmemStruct;
其中,av_freeWorkers 记录剩余的可以launch autovacuum worker的个数,初始化的个数和guc参数autovacuum_max_workers配置相关;av_runningWorkers 中记录正在做autovacuum的进程;av_startingWorker 记录启动中的autovacuum进程。
/*
 * One entry per database in the launcher's DatabaseList.
 */
typedef struct avl_dbase
{
    /* Database identifier; matched against adw_datid when picking a DB. */
    Oid adl_datid;
    /*
     * Scheduled time for the next worker in this database; compared with
     * the current time and autovacuum_naptime to decide whether to skip it.
     */
    TimestampTz adl_next_worker;
    /* Score initialised when the database list is rebuilt. */
    int adl_score;
    /* Link that chains this entry into DatabaseList. */
    dlist_node adl_node;
} avl_dbase;
关键流程:
设置相应的信号处理函数;
/*
* Set up signal handlers. We operate on databases much like a regular
* backend, so we use the same signal handling. See equivalent code in
* tcop/postgres.c.
*/
pqsignal(SIGHUP, av_sighup_handler);
pqsignal(SIGINT, StatementCancelHandler);
pqsignal(SIGTERM, avl_sigterm_handler);
pqsignal(SIGQUIT, quickdie);
InitializeTimeouts(); /* establishes SIGALRM handler */
pqsignal(SIGPIPE, SIG_IGN);
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
pqsignal(SIGUSR2, avl_sigusr2_handler);
pqsignal(SIGFPE, FloatExceptionHandler);
pqsignal(SIGCHLD, SIG_DFL);
rebuild_database_list 构建数据库列表,每一个数据库对应一个结构体 avl_dbase,初始化每一个数据库的adl_score,同时记录adl_next_worker时间,其用于与autovacuum_naptime比较,判断是否去处理该db做autovacuum。同时维护一个DatabaseList列表,autovacuum处理数据库的顺序是从DatabaseList列表的尾部开始;
/*
* move the elements from the array into the dllist, setting the
* next_worker while walking the array
*/
for (i = 0; i < nelems; i++)
{
avl_dbase *db = &(dbary[i]);
current_time = TimestampTzPlusMilliseconds(current_time,
millis_increment);
db->adl_next_worker = current_time;
/* later elements should go closer to the head of the list */
dlist_push_head(&DatabaseList, &db->adl_node);
}
循环等待,直到超时或者被信号唤醒,然后去判断是否需要对db做autovacuum;
/*
* This loop is a bit different from the normal use of WaitLatch,
* because we'd like to sleep before the first launch of a child
* process. So it's WaitLatch, then ResetLatch, then check for
* wakening conditions.
*/
//...
/*
* Wait until naptime expires or we get some type of signal (all the
* signal handlers will wake us by calling SetLatch).
*/
rc = WaitLatch(MyLatch,
WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
(nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
WAIT_EVENT_AUTOVACUUM_MAIN);
//...
判断:1)av_freeWorkers 中是否还有空闲的worker可以分配去做autovacuum;2)av_startingWorker 是否存在正在启动的autovacuum worker;当有空闲worker且没有worker正在启动时则做 launch_worker,否则继续循环等待。do_start_worker 函数的思路:优先考虑db的frozen信息是否满足触发条件(存在回卷风险的db优先),如果不满足再判断 adl_next_worker 和 last_autovac_time。
//...
/* Check to see if this one is at risk of wraparound */
if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit))
{
if (avdb == NULL ||
TransactionIdPrecedes(tmp->adw_frozenxid,
avdb->adw_frozenxid))
avdb = tmp;
for_xid_wrap = true;
continue;
}
else if (for_xid_wrap)
continue; /* ignore not-at-risk DBs */
else if (MultiXactIdPrecedes(tmp->adw_minmulti, multiForceLimit))
{
if (avdb == NULL ||
MultiXactIdPrecedes(tmp->adw_minmulti, avdb->adw_minmulti))
avdb = tmp;
for_multi_wrap = true;
continue;
}
else if (for_multi_wrap)
continue; /* ignore not-at-risk DBs */
//...
dlist_reverse_foreach(iter, &DatabaseList)
{
avl_dbase *dbp = dlist_container(avl_dbase, adl_node, iter.cur);
if (dbp->adl_datid == tmp->adw_datid)
{
/*
* Skip this database if its next_worker value falls between
* the current time and the current time plus naptime.
*/
if (!TimestampDifferenceExceeds(dbp->adl_next_worker,
current_time, 0) &&
!TimestampDifferenceExceeds(current_time,
dbp->adl_next_worker,
autovacuum_naptime * 1000))
skipit = true;
break;
}
}
//...
/*
* Remember the db with oldest autovac time. (If we are here, both
* tmp->entry and db->entry must be non-null.)
*/
if (avdb == NULL ||
tmp->adw_entry->last_autovac_time < avdb->adw_entry->last_autovac_time)
avdb = tmp;
}
然后通知postmaster进程去启动autovacuum worker进程;
autovacuum worker
关键流程
autovacuum(不包括analyze):
获取指定的数据库中每一张表,并从 pgstat 中获取该表的统计信息;relation_needs_vacanalyze 中根据统计信息和freeze相关参数计算表是否需要做vacuum;
//...
/* Fetch reloptions and the pgstat entry for this table */
relopts = extract_autovac_opts(tuple, pg_class_desc);
tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
shared, dbentry);
/* Check if it needs vacuum or analyze */
relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
effective_multixact_freeze_max_age,
&dovacuum, &doanalyze, &wraparound);
//...
对指定的table做vacuum时会对表加ShareUpdateExclusiveLock锁,该锁与RowShareLock/RowExclusiveLock(正常的读写表操作)不冲突。vacuum_set_xid_limits 计算OldestXmin,以及freeze相关的limit;
/*
* Open the relation and get the appropriate lock on it.
*
* There's a race condition here: the rel may have gone away since the
* last time we saw it. If so, we don't need to vacuum it.
*
* If we've been asked not to wait for the relation lock, acquire it first
* in non-blocking mode, before calling try_relation_open().
*/
if (!(options & VACOPT_NOWAIT))
onerel = try_relation_open(relid, lmode);
else if (ConditionalLockRelationOid(relid, lmode))
onerel = try_relation_open(relid, NoLock);
else
{
onerel = NULL;
rel_lock = false;
}
//...
vacuum_set_xid_limits(onerel,
params->freeze_min_age,
params->freeze_table_age,
params->multixact_freeze_min_age,
params->multixact_freeze_table_age,
&OldestXmin, &FreezeLimit,
&xidFullScanLimit,
&MultiXactCutoff, &mxactFullScanLimit);
lazy_scan_heap 根据vm获取可以skip的block,循环扫描每一个页面,对每一个page做prune,移除死元组并对page碎片空间做重排。遍历page中每一个tuple,如果有必要,冻结旧的元组的事务标识,并移除指向死亡元组的索引元组。最后判断是否需要截断表末尾的页面,当需要截断时会获取AccessExclusiveLock锁;
if ((options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
{
while (next_unskippable_block < nblocks)
{
uint8 vmstatus;
vmstatus = visibilitymap_get_status(onerel, next_unskippable_block,
&vmbuffer);
if (aggressive)
{
if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
break;
}
else
{
if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
break;
}
vacuum_delay_point();
next_unskippable_block++;
}
//...
/*
* Prune all HOT-update chains in this page.
*
* We count tuples removed by the pruning step as removed by
* VACUUM.
*/
tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
&vacrelstats->latestRemovedXid);
//...
/* execute collected freezes */
for (i = 0; i < nfrozen; i++)
{
ItemId itemid;
HeapTupleHeader htup;
itemid = PageGetItemId(page, frozen[i].offset);
htup = (HeapTupleHeader) PageGetItem(page, itemid);
heap_execute_freeze_tuple(htup, &frozen[i]);
}
//...
/*
* Optionally truncate the relation.
*/
if (should_attempt_truncation(vacrelstats))
lazy_truncate_heap(onerel, vacrelstats);
//...
freeze 后更新 datfrozenxid,并计算所有db中最老的datfrozenxid,用来判断可以移除的CLOG文件。
/*
* We leak table_toast_map here (among other things), but since we're
* going away soon, it's not a problem.
*/
/*
* Update pg_database.datfrozenxid, and truncate pg_xact if possible. We
* only need to do this once, not after each table.
*
* Even if we didn't vacuum anything, it may still be important to do
* this, because one indirect effect of vac_update_datfrozenxid() is to
* update ShmemVariableCache->xidVacLimit. That might need to be done
* even if we haven't vacuumed anything, because relations with older
* relfrozenxid values or other databases with older datfrozenxid values
* might have been dropped, allowing xidVacLimit to advance.
*
* However, it's also important not to do this blindly in all cases,
* because when autovacuum=off this will restart the autovacuum launcher.
* If we're not careful, an infinite loop can result, where workers find
* no work to do and restart the launcher, which starts another worker in
* the same database that finds no work to do. To prevent that, we skip
* this if (1) we found no work to do and (2) we skipped at least one
* table due to concurrent autovacuum activity. In that case, the other
* worker has already done it, or will do so when it finishes.
*/
if (did_vacuum || !found_concurrent_worker)
vac_update_datfrozenxid();