暂无图片
暂无图片
暂无图片
暂无图片
暂无图片

PostgreSQL源码分析——备份恢复

原创 chirpyli 2023-11-09
630

在上一篇《PostgreSQL源码分析——基础备份》中,我们分析了PG中基础备份的过程及其源码。备份与恢复密不可分,这里我们继续分析从基础备份中进行恢复的源码。

备份过程

执行备份:

postgres=# select pg_start_backup('bak3');
 pg_start_backup
-----------------
 0/6000060
(1 row)

postgres=# insert into t1 values(5);
INSERT 0 1
postgres=# select pg_stop_backup();
NOTICE: WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup
 pg_stop_backup
----------------
 0/60002D8
(1 row)

查看日志:

postgres@slpc:~/pgsql/pgdata/pg_wal$ pg_waldump -p ../pg_wal 000000010000000000000006
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/06000028, prev 0/05000110, desc: RUNNING_XACTS nextXid 738 latestCompletedXid 737 oldestRunningXid 738
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/06000060, prev 0/06000028, desc: RUNNING_XACTS nextXid 738 latestCompletedXid 737 oldestRunningXid 738
rmgr: XLOG len (rec/tot): 114/ 114, tx: 0, lsn: 0/06000098, prev 0/06000060, desc: CHECKPOINT_ONLINE redo 0/6000060; tli 1; prev tli 1; fpw true; xid 0:738; oid 16387; multi 1; offset 0; oldest xid 726 in DB 1; oldest multi 1 in DB 1; oldest/newest commit timestamp xid: 0/0; oldest running xid 738; online
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/06000110, prev 0/06000098, desc: RUNNING_XACTS nextXid 738 latestCompletedXid 737 oldestRunningXid 738
rmgr: Heap len (rec/tot): 54/ 258, tx: 738, lsn: 0/06000148, prev 0/06000110, desc: INSERT off 5 flags 0x00, blkref #0: rel 1663/13010/16384 blk 0 FPW
rmgr: Transaction len (rec/tot): 34/ 34, tx: 738, lsn: 0/06000250, prev 0/06000148, desc: COMMIT 2023-09-18 14:40:06.694650 CST
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/06000278, prev 0/06000250, desc: RUNNING_XACTS nextXid 739 latestCompletedXid 738 oldestRunningXid 739
rmgr: XLOG len (rec/tot): 34/ 34, tx: 0, lsn: 0/060002B0, prev 0/06000278, desc: BACKUP_END 0/6000060
rmgr: XLOG len (rec/tot): 24/ 24, tx: 0, lsn: 0/060002D8, prev 0/060002B0, desc: SWITCH

查看backup_label文件:

postgres@slpc:~/pgsql/pgbak2$ cat backup_label
START WAL LOCATION: 0/6000060 (file 000000010000000000000006)
CHECKPOINT LOCATION: 0/6000098
BACKUP METHOD: pg_start_backup
BACKUP FROM: primary
START TIME: 2023-09-18 14:39:50 CST
LABEL: bak3
START TIMELINE: 1

恢复源码分析

启动由备份恢复出来的数据库时,如果检测到数据目录中存在backup_label文件,则认为是从一个基础备份中进行恢复:此时回放起点取自backup_label中记录的检查点信息,而不是pg_control中的检查点。

main(int argc, char *argv[])
--> PostmasterMain(argc, argv);
    --> LocalProcessControlFile(false);     // 读pg_control文件
    --> StartupPID = StartupDataBase();     // 启动startup子进程
        --> StartupProcessMain();
            --> StartupXLOG();
                --> ValidateXLOGDirectoryStructure();   // Verify that pg_wal and pg_wal/archive_status exist.
                --> readRecoverySignalFile();           // Check for signal files, and if so set up state for offline recovery
                --> validateRecoveryParameters();
                --> XLogReaderAllocate                  // Allocate and initialize a new XLogReader.
                // 是否存在backup_label文件,如果存在的话,则认为是从一个备份文件进行恢复
                --> read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby)
                --> record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);  // 回放的起点为backup_label中的检查点
                    --> XLogBeginRead(xlogreader, RecPtr);  // Begin reading WAL at 'RecPtr'.
                    --> record = ReadRecord(xlogreader, LOG, true);
                        for (;;)
                        {
                            record = XLogReadRecord(xlogreader, &errormsg);  // Attempt to read an XLOG record.
                        }
                --> StartupCLOG();
                /* REDO */
                if (InRecovery)
                {
                    UpdateControlFile();
                    CheckRecoveryConsistency();
                    if (checkPoint.redo < RecPtr)
                    {
                        /* back up to find the record */
                        XLogBeginRead(xlogreader, checkPoint.redo);
                        record = ReadRecord(xlogreader, PANIC, false);
                    }
                    else
                    {
                        /* just have to read next record after CheckPoint */
                        record = ReadRecord(xlogreader, LOG, false);
                    }
                    if (record != NULL)
                    {
                        /* main redo apply loop */
                        do  // 回放日志
                        {
                            // 判断是否已达到指定恢复位置,PITR用
                            if (recoveryStopsBefore(xlogreader))
                            {
                                reachedRecoveryTarget = true;
                                break;
                            }
                            /* Now apply the WAL record itself */
                            RmgrTable[record->xl_rmid].rm_redo(xlogreader);
                        }
                    }
                }

核心函数StartupXLOG源码分析:

/*
 * StartupXLOG (abridged excerpt; elided parts are marked with "// ...").
 *
 * As shown in this excerpt, the function:
 *   1. allocates the XLOG reader;
 *   2. picks the starting checkpoint -- the one named in backup_label when
 *      read_backup_label() succeeds, otherwise the one stored in pg_control;
 *   3. if InRecovery is set, stores the backup start point in pg_control and
 *      replays WAL records through each resource manager's rm_redo callback
 *      until ReadRecord() returns NULL or a recovery target is reached;
 *   4. re-reads the last record to determine the end of the valid WAL.
 */
void StartupXLOG(void)
{
    // ...

    /* Set up XLOG reader facility */
    MemSet(&private, 0, sizeof(XLogPageReadPrivate));
    xlogreader = XLogReaderAllocate(wal_segment_size, NULL,
                                    XL_ROUTINE(.page_read = &XLogPageRead,
                                               .segment_open = NULL,
                                               .segment_close = wal_segment_close),
                                    &private);

    /* Try to read the backup_label file. */
    if (read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby))
    {
        /*
         * Archive recovery was requested, and thanks to the backup label
         * file, we know how far we need to replay to reach consistency. Enter
         * archive recovery directly.
         */
        InArchiveRecovery = true;

        /*
         * When a backup_label file is present, we want to roll forward from
         * the checkpoint it identifies, rather than using pg_control.
         */
        record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
        if (record != NULL)
        {
            memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));

            /*
             * Make sure that REDO location exists. This may not be the case
             * if there was a crash during an online backup, which left a
             * backup_label around that references a WAL segment that's
             * already been archived.
             */
            if (checkPoint.redo < checkPointLoc)
            {
                XLogBeginRead(xlogreader, checkPoint.redo);
                if (!ReadRecord(xlogreader, LOG, false))
                    ereport(FATAL,(errmsg("could not find redo location referenced by checkpoint record"),
                             errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
                                     "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
                                     "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
                                     DataDir, DataDir, DataDir)));
            }
        }
        else
        {
            ereport(FATAL,
                    (errmsg("could not locate required checkpoint record"),
                     errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
                             "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
                             "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
                             DataDir, DataDir, DataDir)));
            wasShutdown = false;    /* keep compiler quiet */
        }

        /* set flag to delete it later */
        haveBackupLabel = true;
    }
    else
    {
        /*
         * No backup_label file: fall back to the checkpoint recorded in
         * pg_control.
         *
         * Article author's note: when restoring a standby from a backup, if
         * backup_label has been lost and the checkpoint from pg_control is
         * used instead, replay starts from the wrong position, a consistent
         * state cannot be reached, and recovery fails.
         */

        /* Get the last valid checkpoint record. */
        checkPointLoc = ControlFile->checkPoint;
        RedoStartLSN = ControlFile->checkPointCopy.redo;
        record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
        if (record != NULL)
        {
            ereport(DEBUG1,(errmsg_internal("checkpoint record is at %X/%X", LSN_FORMAT_ARGS(checkPointLoc))));
        }
        else
        {
            /*
             * We used to attempt to go back to a secondary checkpoint record
             * here, but only when not in standby mode. We now just fail if we
             * can't read the last checkpoint because this allows us to
             * simplify processing around checkpoints.
             */
            ereport(PANIC,(errmsg("could not locate a valid checkpoint record")));
        }
        memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
    }

    /* REDO */
    if (InRecovery)
    {
        /*
         * Set backupStartPoint if we're starting recovery from a base backup.
         *
         * Also set backupEndPoint and use minRecoveryPoint as the backup end
         * location if we're starting recovery from a base backup which was
         * taken from a standby. In this case, the database system status in
         * pg_control must indicate that the database was already in recovery.
         * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
         * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
         * before reaching this point; e.g. because restore_command or
         * primary_conninfo were faulty.
         *
         * Any other state indicates that the backup somehow became corrupted
         * and we can't sensibly continue with recovery.
         */
        if (haveBackupLabel)
        {
            /* Recovering from a base backup: remember where the backup started. */
            ControlFile->backupStartPoint = checkPoint.redo;
            ControlFile->backupEndRequired = backupEndRequired;

            if (backupFromStandby)
            {
                if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
                    dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
                    ereport(FATAL,
                            (errmsg("backup_label contains data inconsistent with control file"),
                             errhint("This means that the backup is corrupted and you will "
                                     "have to use another backup for recovery.")));
                ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
            }
        }

        /* Update pg_control; the main point here is to persist the backup start location. */
        UpdateControlFile();

        /*
         * We're in recovery, so unlogged relations may be trashed and must be
         * reset. This should be done BEFORE allowing Hot Standby
         * connections, so that read-only backends don't try to read whatever
         * garbage is left over from before.
         */
        ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);

        /* Initialize resource managers */
        for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
        {
            if (RmgrTable[rmid].rm_startup != NULL)
                RmgrTable[rmid].rm_startup();
        }

        /* Checks if recovery has reached a consistent state. */
        CheckRecoveryConsistency();

        /*
         * Find the first record that logically follows the checkpoint --- it
         * might physically precede it, though.
         */
        if (checkPoint.redo < RecPtr)
        {
            /* back up to find the record */
            XLogBeginRead(xlogreader, checkPoint.redo);
            record = ReadRecord(xlogreader, PANIC, false);
        }
        else
        {
            /* just have to read next record after CheckPoint */
            record = ReadRecord(xlogreader, LOG, false);
        }

        if (record != NULL)
        {
            /* The actual WAL replay happens here. */

            /* main redo apply loop */
            do
            {
                bool switchedTLI = false;

                /*
                 * Used for PITR: have we reached our recovery target?
                 */
                if (recoveryStopsBefore(xlogreader))
                {
                    reachedRecoveryTarget = true;
                    break;
                }

                /*
                 * Now apply the WAL record itself. This dispatches to the
                 * redo routine of the record's resource manager (e.g.
                 * standby_redo, xlog_redo, heap_redo, xact_redo).
                 */
                RmgrTable[record->xl_rmid].rm_redo(xlogreader);

                /* Allow read-only connections if we're consistent now */
                CheckRecoveryConsistency();

                /* Exit loop if we reached inclusive recovery target */
                if (recoveryStopsAfter(xlogreader))
                {
                    reachedRecoveryTarget = true;
                    break;
                }

                /* Else, try to fetch the next WAL record */
                record = ReadRecord(xlogreader, LOG, false);
            } while (record != NULL);   /* loop until no more records can be read */
        }
    }

    /*
     * Determine where to start writing WAL next.
     *
     * When recovery ended in an incomplete record, write a WAL record about
     * that and continue after it. In all other cases, re-fetch the last
     * valid or last applied record, so we can identify the exact endpoint of
     * what we consider the valid portion of WAL.
     */
    XLogBeginRead(xlogreader, LastRec);
    record = ReadRecord(xlogreader, PANIC, false);
    EndOfLog = EndRecPtr;

    // ...
}

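日志会一直回放到XLOG_BACKUP_END记录。xlog_redo在处理该记录时,如果记录中保存的备份起点与pg_control中的backupStartPoint一致,就说明已经回放到pg_stop_backup()执行的位置,磁盘上的数据已达到一致状态;此时会清空backupStartPoint,并在必要时推进minRecoveryPoint: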

/*
 * XLOG resource manager's routines
 *
 * Definitions of info values are in include/catalog/pg_control.h, though
 * not all record types are related to control file updates.
 *
 * xlog_redo (abridged excerpt; elided parts are marked with "// ..."):
 * replays one record of the XLOG resource manager, branching on the
 * record's info value.
 *
 * record: reader state positioned on the record to replay.
 */
void
xlog_redo(XLogReaderState *record)
{
    /* Record type, with the XLR_INFO_MASK bits masked off. */
    uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    /* Taken from record->EndRecPtr; used below to advance minRecoveryPoint. */
    XLogRecPtr  lsn = record->EndRecPtr;

    if (info == XLOG_NEXTOID)
    {
        // ...
    }
    else if (info == XLOG_CHECKPOINT_SHUTDOWN)
    {
        CheckPoint  checkPoint;

        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
        // ...
        RecoveryRestartPoint(&checkPoint);
    }
    else if (info == XLOG_CHECKPOINT_ONLINE)
    {
        CheckPoint  checkPoint;

        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
        // ...
        RecoveryRestartPoint(&checkPoint);
    }
    else if (info == XLOG_OVERWRITE_CONTRECORD)
    {
        xl_overwrite_contrecord xlrec;

        memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
        VerifyOverwriteContrecord(&xlrec, record);
    }
    else if (info == XLOG_END_OF_RECOVERY)
    {
        xl_end_of_recovery xlrec;

        memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));

        /*
         * For Hot Standby, we could treat this like a Shutdown Checkpoint,
         * but this case is rarer and harder to test, so the benefit doesn't
         * outweigh the potential extra cost of maintenance.
         */

        /*
         * We should've already switched to the new TLI before replaying this
         * record.
         */
        if (xlrec.ThisTimeLineID != ThisTimeLineID)
            ereport(PANIC,
                    (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
                            xlrec.ThisTimeLineID, ThisTimeLineID)));
    }
    else if (info == XLOG_NOOP)
    {
        /* nothing to do here */
    }
    else if (info == XLOG_SWITCH)
    {
        /* nothing to do here */
    }
    else if (info == XLOG_RESTORE_POINT)
    {
        /* nothing to do here */
    }
    else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
    {
        /*
         * Full-page image (FPI) records contain nothing else but a backup
         * block (or multiple backup blocks). Every block reference must
         * include a full-page image - otherwise there would be no point in
         * this record.
         *
         * No recovery conflicts are generated by these generic records - if a
         * resource manager needs to generate conflicts, it has to define a
         * separate WAL record type and redo routine.
         *
         * XLOG_FPI_FOR_HINT records are generated when a page needs to be
         * WAL-logged because of a hint bit update. They are only generated
         * when checksums are enabled. There is no difference in handling
         * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
         * code just to distinguish them for statistics purposes.
         */
        for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++)
        {
            Buffer      buffer;

            if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
                elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
            UnlockReleaseBuffer(buffer);
        }
    }
    else if (info == XLOG_BACKUP_END)
    {
        /* Replay has reached the end-of-backup record; restoring from the backup finishes here. */
        XLogRecPtr  startpoint;

        memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));

        /* Only act if this record's start point matches the one stored in pg_control. */
        if (ControlFile->backupStartPoint == startpoint)
        {
            /*
             * We have reached the end of base backup, the point where
             * pg_stop_backup() was done. The data on disk is now consistent.
             * Reset backupStartPoint, and update minRecoveryPoint to make
             * sure we don't allow starting up at an earlier point even if
             * recovery is stopped and restarted soon after this.
             */
            elog(DEBUG1, "end of backup reached");

            LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

            if (ControlFile->minRecoveryPoint < lsn)
            {
                ControlFile->minRecoveryPoint = lsn;
                ControlFile->minRecoveryPointTLI = ThisTimeLineID;
            }
            ControlFile->backupStartPoint = InvalidXLogRecPtr;
            ControlFile->backupEndRequired = false;
            UpdateControlFile();

            LWLockRelease(ControlFileLock);
        }
    }
    else if (info == XLOG_PARAMETER_CHANGE)
    {
        // ...
    }
    else if (info == XLOG_FPW_CHANGE)
    {
        // ...
    }
}
「喜欢这篇文章,您的关注和赞赏是给作者最好的鼓励」
关注作者
【版权声明】本文为墨天轮用户原创内容,转载时必须标注文章的来源(墨天轮),文章链接,文章作者等基本信息,否则作者和墨天轮有权追究责任。如果您发现墨天轮中有涉嫌抄袭或者侵权的内容,欢迎发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。

评论