最近对gaussdb进行了相关测试,总的来说感觉还是不错;与Oracle高度兼容。但是看文档介绍提到有个ztrst的内部修复工具;于是就想尝试模拟一下坏块。。。请勿在生产环境模拟测试。如下是简单的模拟过程:
+++跟踪进程
strace -fr -F -o /tmp/gauss_read2.log -p 11941
复制
++查询数据
SQL> select * from roger.test;
A B
---------------------------------------- --------------------
1 www.enmotech.com
1 www.killdb.com
100 www.modb.pro
3 rows fetched.
复制
+++查看读取数据的位置
[root@mysqldb bin]# cat /tmp/gauss_read.log |grep 'read'
[root@mysqldb tmp]# tail -50f /tmp/gauss_read2.log |grep 'read'
12116 0.000061 pread(49, "\252c\0\0\4\0\2\2\0\0\0\0\377\3\1\0`\342\205\0\0\0\0\0\253c\0\0\4\0\0\0"..., 65536, 209010688) = 65536
13162 0.000428 pread(51, <unfinished ...>
13162 0.000417 <... pread resumed> "\363-\0\0\3\0\7\2\362-\0\0\3\0\1\0c\232\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 96362496) = 8192
13162 0.000428 pread(51, "\3233\0\0\3\0\7\2\3223\0\0\3\0\1\0\37\233\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 108683264) = 8192
13162 0.000058 pread(51, "G:\0\0\3\0\7\2F:\0\0\3\0\1\0\355\233\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 122216448) = 8192
13557 0.002348 pread(51, <unfinished ...>
13557 0.000577 <... pread resumed> "\6@\0\0\3\0\7\2\5@\0\0\3\0\1\0\245\234\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 134266880) = 8192
13557 0.000512 pread(51, <unfinished ...>
13557 0.000053 <... pread resumed> "}F\0\0\3\0\7\2|F\0\0\3\0\1\0t\235\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 147824640) = 8192
13557 0.001260 pread(51, "\213L\0\0\3\0\7\2\212L\0\0\3\0\1\0006\236\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 160522240) = 8192
13946 0.002612 pread(51, <unfinished ...>
.....
复制
我们可以看到读取了49/51/52 等几个数据文件。 下面来分别看下这3个文件什么是什么 ?
[root@mysqldb log]# ls -ltr /proc/11941/fd
total 0
lr-x------ 1 roger roger 64 Feb 16 16:40 9 -> anon_inode:inotify
lrwx------ 1 roger roger 64 Feb 16 16:40 8 -> anon_inode:[eventpoll]
lrwx------ 1 roger roger 64 Feb 16 16:40 7 -> anon_inode:[eventpoll]
lrwx------ 1 roger roger 64 Feb 16 16:40 6 -> /opt/gauss/gaussdata/zengine.lck
lrwx------ 1 roger roger 64 Feb 16 16:40 52 -> /opt/gauss/gaussdata/sysaux
lrwx------ 1 roger roger 64 Feb 16 16:40 51 -> /opt/gauss/gaussdata/undo
lrwx------ 1 roger roger 64 Feb 16 16:40 50 -> /opt/gauss/gaussdata/system
lrwx------ 1 roger roger 64 Feb 16 16:40 5 -> /opt/gauss/gaussdata/trc/zengine_smon_00003.trc
lrwx------ 1 roger roger 64 Feb 16 16:40 49 -> /opt/gauss/gaussdata/user1
lrwx------ 1 roger roger 64 Feb 16 16:40 48 -> /opt/gauss/gaussdata/log/audit/zengine.aud
lrwx------ 1 roger roger 64 Feb 16 16:40 47 -> socket:[2039491]
lrwx------ 1 roger roger 64 Feb 16 16:40 46 -> /opt/gauss/gaussdata/sysaux
lrwx------ 1 roger roger 64 Feb 16 16:40 45 -> /opt/gauss/gaussdata/undo
lrwx------ 1 roger roger 64 Feb 16 16:40 44 -> /opt/gauss/gaussdata/system
lrwx------ 1 roger roger 64 Feb 16 16:40 43 -> /opt/gauss/gaussdata/sysaux
lrwx------ 1 roger roger 64 Feb 16 16:40 42 -> /opt/gauss/gaussdata/undo
lrwx------ 1 roger roger 64 Feb 16 16:40 41 -> /opt/gauss/gaussdata/undo
lrwx------ 1 roger roger 64 Feb 16 16:40 40 -> /opt/gauss/gaussdata/system
lrwx------ 1 roger roger 64 Feb 16 16:40 4 -> /opt/gauss/gaussdata/log/run/zengine.rlog
lrwx------ 1 roger roger 64 Feb 16 16:40 39 -> /opt/gauss/gaussdata/sysaux
lrwx------ 1 roger roger 64 Feb 16 16:40 38 -> /opt/gauss/gaussdata/system
lrwx------ 1 roger roger 64 Feb 16 16:40 37 -> /opt/gauss/gaussdata/system
lrwx------ 1 roger roger 64 Feb 16 16:40 36 -> /opt/gauss/gaussdata/sysaux
lrwx------ 1 roger roger 64 Feb 16 16:40 35 -> /opt/gauss/gaussdata/temp2_undo
lrwx------ 1 roger roger 64 Feb 16 16:40 34 -> /opt/gauss/gaussdata/temp2_02
lrwx------ 1 roger roger 64 Feb 16 16:40 33 -> /opt/gauss/gaussdata/temp2_01
lrwx------ 1 roger roger 64 Feb 16 16:40 32 -> /opt/gauss/gaussdata/user5
lrwx------ 1 roger roger 64 Feb 16 16:40 31 -> /opt/gauss/gaussdata/user4
lrwx------ 1 roger roger 64 Feb 16 16:40 30 -> /opt/gauss/gaussdata/user3
lrwx------ 1 roger roger 64 Feb 16 16:40 3 -> /opt/gauss/gaussdata/log/enmotech_alarm.log
lrwx------ 1 roger roger 64 Feb 16 16:40 29 -> /opt/gauss/gaussdata/user2
lrwx------ 1 roger roger 64 Feb 16 16:40 28 -> /opt/gauss/gaussdata/user1
lrwx------ 1 roger roger 64 Feb 16 16:40 27 -> /opt/gauss/gaussdata/undo
lrwx------ 1 roger roger 64 Feb 16 16:40 26 -> /opt/gauss/gaussdata/temp1_02
lrwx------ 1 roger roger 64 Feb 16 16:40 25 -> /opt/gauss/gaussdata/temp1_01
lrwx------ 1 roger roger 64 Feb 16 16:40 24 -> /opt/gauss/gaussdata/data/cntl3
lrwx------ 1 roger roger 64 Feb 16 16:40 23 -> /opt/gauss/gaussdata/data/cntl2
lrwx------ 1 roger roger 64 Feb 16 16:40 22 -> /opt/gauss/gaussdata/data/cntl1
lrwx------ 1 roger roger 64 Feb 16 16:40 21 -> /opt/gauss/gaussdata/system
lrwx------ 1 roger roger 64 Feb 16 16:40 20 -> /opt/gauss/gaussdata/log6
lrwx------ 1 roger roger 64 Feb 16 16:40 2 -> /dev/pts/1
lrwx------ 1 roger roger 64 Feb 16 16:40 19 -> /opt/gauss/gaussdata/log5
lrwx------ 1 roger roger 64 Feb 16 16:40 18 -> /opt/gauss/gaussdata/log4
lrwx------ 1 roger roger 64 Feb 16 16:40 17 -> /opt/gauss/gaussdata/log3
lrwx------ 1 roger roger 64 Feb 16 16:40 16 -> /opt/gauss/gaussdata/log2
lrwx------ 1 roger roger 64 Feb 16 16:40 15 -> /opt/gauss/gaussdata/log1
lrwx------ 1 roger roger 64 Feb 16 16:40 14 -> anon_inode:[eventpoll]
lrwx------ 1 roger roger 64 Feb 16 16:40 13 -> socket:[2039248]
lrwx------ 1 roger roger 64 Feb 16 16:40 12 -> anon_inode:[eventpoll]
lrwx------ 1 roger roger 64 Feb 16 16:40 11 -> socket:[2039245]
lrwx------ 1 roger roger 64 Feb 16 16:40 10 -> anon_inode:[eventpoll]
lrwx------ 1 roger roger 64 Feb 16 16:40 1 -> /dev/pts/1
lrwx------ 1 roger roger 64 Feb 16 16:40 0 -> /dev/pts/1
复制
我们可以看到,只有49号文件是数据文件;因为本身我们的test表是存放在users表空间上的。 我们来看看偏移量:
12116 0.000061 pread(49, "\252c\0\0\4\0\2\2\0\0\0\0\377\3\1\0`\342\205\0\0\0\0\0\253c\0\0\4\0\0\0"..., 65536, 209010688) = 65536
复制
简单计算一下,这里的偏移量:
SQL> select file_id,FILE_NAME,TABLESPACE_name,BYTES/1024/1024,BLOCKS,USER_BYTES/1024/1024 from dba_data_files;
FILE_ID FILE_NAME TABLESPACE BYTES/1024/1024 BLOCKS USER_BYTES/1024/1024
------------ ------------------------------------ ---------- -------------------- -------------------- --------------------
0 /opt/gauss/gaussdata/system SYSTEM 512 65536 510.75
1 /opt/gauss/gaussdata/temp1_01 TEMP 160 20480 159.609375
2 /opt/gauss/gaussdata/temp1_02 TEMP 160 20480 159.609375
3 /opt/gauss/gaussdata/undo UNDO 1024 131072 1021.5
4 /opt/gauss/gaussdata/user1 USERS 1024 131072 1021.5
8 /opt/gauss/gaussdata/user5 USERS 100 12800 99.755859375
7 /opt/gauss/gaussdata/user4 USERS 1024 131072 1021.5
6 /opt/gauss/gaussdata/user3 USERS 1024 131072 1021.5
5 /opt/gauss/gaussdata/user2 USERS 1024 131072 1021.5
9 /opt/gauss/gaussdata/temp2_01 TEMP2 160 20480 159.609375
10 /opt/gauss/gaussdata/temp2_02 TEMP2 160 20480 159.609375
11 /opt/gauss/gaussdata/temp2_undo TEMP2_UNDO 200 25600 199.51171875
12 /opt/gauss/gaussdata/sysaux SYSAUX 160 20480 159.609375
13 rows fetched.
SQL> select 209010688/8192 from dual;
209010688/8192
--------------------
25514
1 rows fetched.
SQL> select 65536/8192 from dual;
65536/8192
--------------------
8
复制
+++ dd 创造几个坏page
[roger@mysqldb gaussdata]$ dd if=/opt/gauss/gaussdata/user1 of=/tmp/dd_user.dd bs=8192 count=8
8+0 records in
8+0 records out
65536 bytes (66 kB) copied, 0.00438461 s, 14.9 MB/s
[roger@mysqldb gaussdata]$
[roger@mysqldb gaussdata]$ dd if=/tmp/dd_user.dd of=/opt/gauss/gaussdata/user1 bs=8192 seek=25514 count=8 conv=notrunc
8+0 records in
8+0 records out
65536 bytes (66 kB) copied, 0.000575396 s, 114 MB/s
复制
+++ 查询看看是否会报错
[roger@mysqldb gaussdata]$ zsql / as sysdba -q
connected.
SQL> alter system flush buffer;
Succeed.
SQL> select * from roger.test;
GS-00880, Page 4-25516 corrupted
SQL>
复制
我们来看看此时的日志:
UTC+8 2020-02-16 16:47:22.970|ZENGINE|00053|94734093659988|ERROR>[BUFFER] read page_id 4-1 doesn't match with expected page_id 4-25515 [knl_buffer_access.c:156]
UTC+8 2020-02-16 16:47:22.973|ZENGINE|00053|12116|INFO>[ABR] failed to create ABR task for file 4 page 25515 due to abr disabled [knl_abr.c:80]
UTC+8 2020-02-16 16:47:22.973|ZENGINE|00053|12116|ERROR>[BUFFER] read page_id 4-2 doesn't match with expected page_id 4-25516 [knl_buffer_access.c:156]
UTC+8 2020-02-16 16:47:22.973|ZENGINE|00053|12116|INFO>[ABR] failed to create ABR task for file 4 page 25516 due to abr disabled [knl_abr.c:80]
UTC+8 2020-02-16 16:47:22.975|ZENGINE|00053|12116|ERROR>GS-00880:[BUFFER] read page_id 4-3 doesn't match with expected page_id 4-25517,Page 4-25516 corrupted [knl_buffer_access.c:156]
UTC+8 2020-02-16 16:47:22.975|ZENGINE|00053|12116|INFO>GS-00880:[ABR] failed to create ABR task for file 4 page 25517 due to abr disabled,Page 4-25516 corrupted [knl_abr.c:80]
UTC+8 2020-02-16 16:47:22.975|ZENGINE|00053|12116|ERROR>GS-00880:[BUFFER] read page_id 4-4 doesn't match with expected page_id 4-25518,Page 4-25516 corrupted [knl_buffer_access.c:156]
UTC+8 2020-02-16 16:47:22.975|ZENGINE|00053|12116|INFO>GS-00880:[ABR] failed to create ABR task for file 4 page 25518 due to abr disabled,Page 4-25516 corrupted [knl_abr.c:80]
UTC+8 2020-02-16 16:47:22.975|ZENGINE|00053|12116|ERROR>GS-00880:[BUFFER] read page_id 4-5 doesn't match with expected page_id 4-25519,Page 4-25516 corrupted [knl_buffer_access.c:156]
UTC+8 2020-02-16 16:47:22.975|ZENGINE|00053|12116|INFO>GS-00880:[ABR] failed to create ABR task for file 4 page 25519 due to abr disabled,Page 4-25516 corrupted [knl_abr.c:80]
UTC+8 2020-02-16 16:47:22.979|ZENGINE|00053|12116|ERROR>GS-00880:[BUFFER] read page_id 4-6 doesn't match with expected page_id 4-25520,Page 4-25516 corrupted [knl_buffer_access.c:156]
UTC+8 2020-02-16 16:47:22.979|ZENGINE|00053|12116|INFO>GS-00880:[ABR] failed to create ABR task for file 4 page 25520 due to abr disabled,Page 4-25516 corrupted [knl_abr.c:80]
UTC+8 2020-02-16 16:47:22.979|ZENGINE|00053|12116|ERROR>GS-00880:[BUFFER] read page_id 4-7 doesn't match with expected page_id 4-25521,Page 4-25516 corrupted [knl_buffer_access.c:156]
UTC+8 2020-02-16 16:47:22.981|ZENGINE|00053|12116|INFO>GS-00880:[ABR] failed to create ABR task for file 4 page 25521 due to abr disabled,Page 4-25516 corrupted [knl_abr.c:80]
复制
测试一下重启看看能不能顺利启动数据库。
SQL> conn / as sysdba connected. SQL> SQL> shutdown abort; Succeed. SQL> exit [1]+ Done zengine open -D /opt/gauss/gaussdata [roger@mysqldb gaussdata]$ zengine open -D /opt/gauss/gaussdata & [1] 23851 [roger@mysqldb gaussdata]$ starting instance(normal) instance started [roger@mysqldb gaussdata]$
复制
可见,可以顺利打开数据库。
[roger@mysqldb gaussdata]$ zsql / as sysdba -q
connected.
SQL> select * from roger.test;
GS-00656, Dictionary cache is corrupted
SQL>
复制
此时log的信息如下:
UTC+8 2020-02-16 16:54:36.864|ZENGINE|00053|24455|INFO>[DC] could not load table ROGER.TEST, segment corrupted. [knl_dc_tbl.c:828]
复制
下面我们来尝试进行修复。
++++ 首先启动数据库到mount
[roger@mysqldb gaussdata]$ zengine mount -D /opt/gauss/gaussdata &
[1] 25369
[roger@mysqldb gaussdata]$ starting instance(mount)
instance started
[roger@mysqldb gaussdata]$
[roger@mysqldb gaussdata]$ ztrst -p Enmotech888:1611 -D /tmp/gauss_tmp -B /tmp/gaussdb_backup/roger_level0_01.bak/data_USERS_4_0.bak -P 4-25515 -S 127.0.0.1:1611
invalid argument: P 4-25515, convert uint16 failed
[roger@mysqldb gaussdata]$
[roger@mysqldb gaussdata]$ ztrst -p Enmotech888:1611 -D /tmp/gauss_tmp -B /tmp/gaussdb_backup/roger_level0_01.bak/data_USERS_4_0.bak -P 25516 -S 127.0.0.1:1611
Warning: ztrst tool is a database repair tool only for emergencies. Data consistency may be damaged after using this tool to recover a single tablespace in a distributed database. It is strongly recommended do not use this tool in non-emergency situations.
Continue anyway? (yes/no):yes
begin query database status
database status is open
end query database status
begin validate page
GS-00601 Sql syntax error: datafile value should be in [0, 1022]
page repair stop, expected error code 880, but get 601
[roger@mysqldb gaussdata]$
复制
暂时还没弄清楚ztrst工具如何使用。
+++ 尝试基于表空间去恢复
[roger@mysqldb gaussdata]$ ztrst -p Enmotech888:1611 -D /tmp/gauss_tmp -B /tmp/gaussdb_backup/roger_level0_01.bak/data_USERS_4_0.bak -T users -U ROGER -S 127.0.0.1:1611
Please enter ROGER password:
********
Warning: ztrst tool is a database repair tool only for emergencies. Data consistency may be damaged after using this tool to recover a single tablespace in a distributed database. It is strongly recommended do not use this tool in non-emergency situations.
Continue anyway? (yes/no):yes
starting instance(nomount)
instance startup failed, please check log /tmp/gauss_tmp/tmp_data/log/run/zengine.rlog
[roger@mysqldb gaussdata]$
UTC+8 2020-02-16 17:46:42.667|ZENGINE|00000|17179933885|INFO>[LOG] file '/tmp/gauss_tmp/tmp_data/log/zenith_alarm.log' is added [srv_param.c:518]
UTC+8 2020-02-16 17:46:42.667|ZENGINE|00000|64701|INFO>[LOG] file '/tmp/gauss_tmp/log/run/zengine.rlog' is added [cm_log.c:641]
UTC+8 2020-02-16 17:46:42.669|ZENGINE|00000|64701|INFO>[PARAM] LSNR_PORT = 1611
UTC+8 2020-02-16 17:46:42.669|ZENGINE|00000|64701|INFO>[PARAM] CONTROL_FILES = (/tmp/gauss_tmp/tmp_data/data/ctrl1)
UTC+8 2020-02-16 17:46:42.669|ZENGINE|00000|64701|INFO>[PARAM] PAGE_CHECKSUM = TYPICAL
UTC+8 2020-02-16 17:46:42.669|ZENGINE|00000|64701|INFO>[PARAM] LOG_HOME = /tmp/gauss_tmp/log
UTC+8 2020-02-16 17:46:42.669|ZENGINE|00000|64701|INFO>[PARAM] CPU_NODE_BIND = 0 0
UTC+8 2020-02-16 17:46:42.669|ZENGINE|00000|206158494909|INFO>starting instance(nomount)
UTC+8 2020-02-16 17:46:43.396|ZENGINE|00000|64856|INFO>lgwr thread started
UTC+8 2020-02-16 17:46:43.396|ZENGINE|00000|64861|INFO>index page recycle thread started
UTC+8 2020-02-16 17:46:43.396|ZENGINE|00000|64862|INFO>rollback thread started
UTC+8 2020-02-16 17:46:43.396|ZENGINE|00000|64857|INFO>dbwr thread started
UTC+8 2020-02-16 17:46:43.396|ZENGINE|00000|64858|INFO>ckpt thread started
UTC+8 2020-02-16 17:46:43.396|ZENGINE|00000|64859|INFO>smon thread started
UTC+8 2020-02-16 17:46:43.396|ZENGINE|00000|64863|INFO>rollback thread started
UTC+8 2020-02-16 17:46:43.396|ZENGINE|00000|64864|INFO>rmon thread started
UTC+8 2020-02-16 17:46:43.396|ZENGINE|00000|64860|INFO>stats thread started
UTC+8 2020-02-16 17:46:43.939|ZENGINE|00000|64701|INFO>local ip: 127.0.0.1
UTC+8 2020-02-16 17:46:43.946|ZENGINE|00000|64701|ERROR>GS-00309 : Tcp port conflict 127.0.0.1:1611 [cs_listener.c:199]
UTC+8 2020-02-16 17:46:43.946|ZENGINE|00000|64701|ERROR>failed to create lsnr sockets for listener type 1
UTC+8 2020-02-16 17:46:43.946|ZENGINE|00000|206158494909|ERROR>failed to start lsnr for LSNR_ADDR
UTC+8 2020-02-16 17:46:43.946|ZENGINE|00000|64865|INFO>reactor thread started
UTC+8 2020-02-16 17:46:44.006|ZENGINE|00000|64862|INFO>rollback thread closed
UTC+8 2020-02-16 17:46:44.207|ZENGINE|00000|64863|INFO>rollback thread closed
UTC+8 2020-02-16 17:46:44.406|ZENGINE|00000|64859|INFO>smon thread closed
UTC+8 2020-02-16 17:46:44.606|ZENGINE|00000|64864|INFO>rmon thread closed
UTC+8 2020-02-16 17:46:44.807|ZENGINE|00000|64860|INFO>stats thread closed
UTC+8 2020-02-16 17:46:45.395|ZENGINE|00000|64861|INFO>index_recycle thread closed
UTC+8 2020-02-16 17:46:45.410|ZENGINE|00000|64858|INFO>ckpt thread closed
UTC+8 2020-02-16 17:46:45.410|ZENGINE|00000|64857|INFO>dbwr thread closed
UTC+8 2020-02-16 17:46:45.609|ZENGINE|00000|64856|INFO>lgwr thread closed
UTC+8 2020-02-16 17:46:45.611|ZENGINE|00000|64865|INFO>reactor thread closed
UTC+8 2020-02-16 17:46:45.611|ZENGINE|00000|93905165024445|INFO>[agent] begin to destroy agent pool
UTC+8 2020-02-16 17:46:45.611|ZENGINE|00000|206158494909|INFO>[agent] all agents thread have been closed
UTC+8 2020-02-16 17:46:45.611|ZENGINE|00000|206158494909|INFO>[agent] destroy agent pool end
UTC+8 2020-02-16 17:46:45.611|ZENGINE|00000|9504726479603301565|ERROR>failed to start lsnr
[roger@mysqldb gauss_tmp]$ ztrst -p Enmotech888:1612 -D /tmp/gauss_tmp -B /tmp/gaussdb_backup/roach/20200215_194313 -T users -U ROGER -S 127.0.0.1:1611
Please enter ROGER password:
********
Warning: ztrst tool is a database repair tool only for emergencies. Data consistency may be damaged after using this tool to recover a single tablespace in a distributed database. It is strongly recommended do not use this tool in non-emergency situations.
Continue anyway? (yes/no):yes
starting instance(nomount)
instance started
UTC+8 2020-02-16 17:56:45.097:begin restore tablespace
UTC+8 2020-02-16 17:56:45.215:GS-00002 Failed to open the file /tmp/gaussdb_backup/roach/20200215_194313/backupset, the error code was 2
[roger@mysqldb gauss_tmp]$
复制
无法打开备份集;这里原因还未知。暂时还未研究明白ztrst的工具的用法;争取后面能够研究清楚,再与大家分享。
最后修改时间:2020-02-17 14:03:16
「喜欢这篇文章,您的关注和赞赏是给作者最好的鼓励」
关注作者
【版权声明】本文为墨天轮用户原创内容,转载时必须标注文章的来源(墨天轮),文章链接,文章作者等基本信息,否则作者和墨天轮有权追究责任。如果您发现墨天轮中有涉嫌抄袭或者侵权的内容,欢迎发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。
评论
TA的专栏
Roger's Database Notes
收录77篇内容