CPU CPU使用/cpu_count 超过 90% #容量#
一、数据库状态检查的步骤/命令(通过本步骤可以确定数据库当前的状态是否OK)
检查连接情况,确保连接数使用率正常
col PROCESS_LIMIT for a20; select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';复制
二、请描述告警出现的可能原因
1、有效率差的语句,sql性能问题
2、语句并发过高
三、预计多久不处理就可以出现UIOC(P1)
CPU高可能导致业务语句执行缓慢甚至数据库不可用,需15分钟内处理正常
四、告警检查的步骤(通过本步骤确定告警是否是误告)
select to_char(round(aa.value/cc.value,2))||'%' from v$sysmetric aa,v$parameter cc where aa.METRIC_NAME ='CPU Usage Per Sec' and aa.group_id=2 and cc.name ='cpu_count';复制
五、需要保存现场的步骤/命令
create table dbmgr.ash_20220000 as select tt.* from gv$active_session_history tt; create table dbmgr.sysstat_20220000 as select tt.* from gv$sysstat tt; create table dbmgr.sysmetric_20220000 as select tt.* from gv$sysmetric tt; create table dbmgr.sysmtc_sum_20220000 as select tt.* from gv$sysmetric_summary tt; create table dbmgr.system_event_20220000 as select tt.* from gv$system_event tt; create table dbmgr.event_hitgam_20220000 as select tt.* from gv$event_histogram tt; create table dbmgr.sessmetric_20220000 as select tt.* from gv$sessmetric tt; create table dbmgr.session_event_20220000 as select tt.* from gv$session_event tt; create table dbmgr.sess_w_hist_20220000 as select tt.* from gv$session_wait_history tt; create table dbmgr.sql_20220000 as select tt.* from gv$sql tt;复制
六、需要反馈上级以及运营的信息
将导致CPU异常的语句反馈给运营,评估决策是否kill会话
七、建议处理的流程,步骤和对应命令
(一)一般情况下异常语句导致CPU冲高会伴随等待事件,常规的定位方法如下:
1、 检查数据库的等待事件情况以及对应的语句
--查看当前数据库的top等待事件情况以及对应的语句
col EVENT for a40 select inst_id,sql_id,event, count(*) as amount from gv$session where event is not null and sql_id is not null and wait_class <> 'Idle' group by inst_id,sql_id,event order by count(*) desc;复制
--查看最近一分钟内,最消耗CPU的语句
select sql_id,count(*), round(count(*)/sum(count(*)) over (),2) pctload from v$active_session_history where sample_time > sysdate -1/(24*60) and session_type <> 'BACKGROUND' and session_state= 'ON CPU' group by sql_id order by count(*) desc;复制
--查看近10分钟的会话状态分布,确定哪些sql处于on cpu模式,看sql有无性能问题
select sql_id, session_state, username, count(*) from v$active_session_history a, dba_users s where sample_time > sysdate - 10 / 24 / 60 and a.USER_ID = s.user_id group by sql_id, session_state, username order by 4 desc;复制
2、查询近1个小时每15分钟的TOP 5 SQL
select * from (select t.* ,row_number() over(partition by time order by cpu_time desc) rn from (select to_char(end_INTERVAL_TIME ,'yyyy/mm/dd hh24:mi') time,s.sql_id,sum(s.executions_delta) execs, sum(s.cpu_time_delta/1000/1000)cpu_time ,sum(s.elapsed_time_delta/1000/1000) elapsed_time from dba_hist_sqlstat s , dba_hist_snapshot b where s.instance_number=b.instance_number and s.snap_id = b.snap_id and b.end_interval_time > sysdate-1/24 group by to_char(end_INTERVAL_TIME ,'yyyy/mm/dd hh24:mi'),sql_id )t) where rn<=5 order by 1 desc,4 desc,6复制
--查询特定快照时间的CPU topsql,可以用来看非实时CPU问题
select * from dba_hist_snapshot order by snap_id desc; --sql ordered by cpu time select * from (select round(nvl((sqt.cput / 1000000), to_number(null)),2) "Cpu Time (s)", round( nvl((sqt.elap / 1000000), to_number(null)),2) "Elap Time (s)", sqt.exec, round(decode(sqt.exec,0,to_number(null),(sqt.cput / sqt.exec / 1000000)),2) "Cpu per Exec (s)", round((100 * (sqt.elap / (select sum(e.value) - sum(b.value) from dba_hist_sys_time_model b, dba_hist_sys_time_model e where b.snap_id = &beg_snap and e.snap_id = &end_snap and e.stat_name = 'DB time' and b.stat_name = 'DB time'))) ,2)norm_val, sqt.sql_id, decode(sqt.module, null, null, 'Module: ' || sqt.module) SqlModule, nvl(to_nchar(SUBSTR(st.sql_text,1,2000)) , (' ** SQL Text Not Available ** ')) SqlText from (select sql_id, max(module) module, sum(cpu_time_delta) cput, sum(elapsed_time_delta) elap, sum(executions_delta) exec from dba_hist_sqlstat where &beg_snap < snap_id and snap_id <= &end_snap group by sql_id) sqt, dba_hist_sqltext st where st.sql_id(+) = sqt.sql_id order by nvl(sqt.cput, -1) desc, sqt.sql_id) where rownum < 65 and (rownum <= 10 or norm_val > 1);复制
3、查看语句的执行计划
--先查看内存中等待事件对应的会话以及语句的详情
set linesize 150 pagesize 999 col EVENT for a30 col USERNAME for a10 col MACHINE for a10 col OSUSER for a10 col PROGRAM for a10 select inst_id,sid,serial#,sql_id,event,blocking_instance,blocking_session,blocking_session_status,final_blocking_instance,final_blocking_session,final_blocking_session_status,username,status,machine,osuser,program,sql_hash_value,to_char(logon_time,'yyyy-mm-dd hh24:mi:ss'),prev_sql_id from gv$session where event = '&event'; set linesize 150 pagesize 999 col PARSING_SCHEMA_NAME for a15 col FIRST_LOAD_TIME for a20 col LAST_LOAD_TIME for a20 select a.inst_id,a.sql_id,a.child_number,a.parsing_schema_name,a.first_load_time,a.last_load_time,a.last_active_time,a.plan_hash_value,a.sql_plan_baseline,a.sql_text from gv$sql a where sql_id='&sql_id' order by last_active_time desc;复制
--查看语句最近30天的执行情况,执行计划是否有改变
with a as (select snap_id,sql_id,instance_number, plan_hash_value, round(buffer_gets_delta/(decode(executions_delta,0,1,executions_delta)),2) get_exec, round(disk_reads_delta/(decode(executions_delta,0,1,executions_delta)),2) read_exec, round(cpu_time_delta/(decode(executions_delta,0,1,executions_delta))/1000,2) cpu_exec_s , round(s.ELAPSED_TIME_DELTA/(decode(executions_delta,0,1,executions_delta))/1000,2) elaps_exec_s , round(s.IOWAIT_DELTA/(decode(executions_delta,0,1,executions_delta))/1000,2) IOWAIT_exec_s , decode(executions_delta,0,-1,executions_delta) executions_delta, parsing_schema_id, parse_calls_delta, round(ROWS_PROCESSED_delta/(decode(executions_delta,0,1,executions_delta)),2) rows_exec from dba_hist_sqlstat s where sql_id in ('&sql_id' ,'' ) ) select to_char(b.BEGIN_INTERVAL_TIME ,'yyyy/mm/dd') time,sql_id,plan_hash_value,avg(rows_exec) "每次返回行数",avg(read_exec) "逻辑读",avg(read_exec) "物理读",avg(IOWAIT_exec_s) "每次IO等待(毫秒)", sum(A.executions_delta) "执行次数" ,avg(a.elaps_exec_s) "每次执行(毫秒)", sum(elaps_exec_s*A.executions_delta) "总时间" from a,dba_hist_snapshot b where a.snap_id=b.snap_id and b.instance_number = a.instance_number and b.begin_interval_time > trunc(sysdate )-30 group by to_char(b.BEGIN_INTERVAL_TIME ,'yyyy/mm/dd'),sql_id,plan_hash_value order by 2,1 desc ;复制
--备用语句
--查看语句历史执行计划
select s.snap_id, to_char(a.end_interval_time, 'YYYY-MM-DD HH24:MI:SS'), s.sql_id, s.plan_hash_value, s.executions_delta, round(s.buffer_gets_delta / executions_delta,3) as buffer_gets_delta_each, round(s.elapsed_time_delta / executions_delta / 1000000,3) as "elapsed_time_delta/s", round(s.cpu_time_delta / executions_delta / 1000000,3) as "cpu_time_delta/s", round(s.disk_reads_delta / executions_delta,3) as disk_reads_delta_each from dba_hist_sqlstat s, dba_hist_snapshot a where s.snap_id = a.snap_id and s.instance_number=a.instance_number and sql_id = '&sql_id' and s.executions_delta > 0 order by s.snap_id desc;复制
4、查看语句的执行计划明细,进一步分析是否是SQL性能问题
set linesize 150 pagesize 999 select * from table(dbms_xplan.display_cursor('&sql_id',&child_no,'ADVANCED')); select * from table(dbms_xplan.display_cursor('&sql_id',null,'ADVANCED')); select * from table(dbms_xplan.display_awr('&sql_id',&plan_hash_value,null,'ADVANCED')); select * from table(dbms_xplan.display_sql_plan_baseline('&sql_handle','&plan_name'));复制
5、固化较优的执行计划,常见下面三种,待补充引用
固化历史较优执行计划
加hint生成新的执行计划并固化
从其他库迁移spm导入正确的执行计划
6、应急情况下,可根据语句或者等待事件KILL会话(需要运营决策可以KILL)
select 'alter system kill session ' || '''' || sid || ',' || serial# || ',@' || inst_id || '''' || ' immediate;' from gv$session where event='&event'//sql_id='&sql_id' and type = 'USER';复制
7、查看语句涉及的对象的统计信息 (需要升级DA以及领导决策)
--查看语句涉及的对象信息
-- TABLE INFO select distinct t.owner,t.table_name,t.num_rows,t.last_analyzed,t.degree,t.row_movement from v$sql_plan s, dba_tables t where s.sql_id = '&1' and s.object_type like '%TABLE%' and s.object_name = t.table_name and s.object_owner = t.owner order by last_analyzed; -- INDEX INFO select distinct t.owner,t.index_name,t.index_type,t.uniqueness,t.distinct_keys,t.num_rows,t.last_analyzed,t.degree from v$sql_plan s, dba_indexes t where s.sql_id = '&1' and s.object_type like '%INDEX%' and s.object_name = t.index_name and s.object_owner = t.owner order by last_analyzed; -- INDEX PARTITION INFO select distinct t.index_owner, t.index_name, t.column_name, t.column_position, t.descend,c.num_distinct,c.low_value,c.high_value,c.num_nulls,c.num_buckets,c.last_analyzed from v$sql_plan s, dba_ind_columns t, dba_tab_cols c where s.sql_id = '&1' and s.object_type like '%INDEX%' and s.object_name = t.index_name and s.object_owner = t.index_owner and t.table_owner = c.owner and t.table_name = c.table_name and t.column_name = c.column_name order by last_analyzed; -- DDL INFO select distinct t.owner, t.object_name,t.status,t.created, t.last_ddl_time, t.timestamp from v$sql_plan s, dba_objects t where s.sql_id = '&1' and s.object_name = t.OBJECT_NAME and s.object_owner = t.owner order by timestamp;复制
--查看对象的统计信息
select owner,table_name,column_name,num_distinct,density,last_analyzed,histogram from dba_tab_col_statistics where table_name='&table_name';复制
--收集统计信息(需要升级DA以及领导决策)
exec dbms_stats.gather_table_stats(ownname => '&owner',tabname => '&table_name',estimate_percent => 10,cascade => true);复制
8、判断是少索引,并且需要紧急创建索引(需要升级DA以及领导决策)
create index xxx.xxx on xxx.xxx(xxx) tablespace xxx initrans xxx online parallel xxx;复制
ACTIVE SESSION 超过300 #性能#
一、数据库状态检查的步骤/命令(通过本步骤可以确定数据库当前的状态是否OK)
检查连接情况,确保能正常登录、有应用连接、连接数使用率正常
col PROCESS_LIMIT for a20; select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';复制
二、请描述告警出现的可能原因
1、异常等待事件导致活跃连接突增
三、预计多久不处理就可以出现UIOC(P1)
等待严重影响业务语句正常执行,需15分钟内处理正常
四、告警检查的步骤(通过本步骤确定告警是否是误告)
select 'session_active:'||count(*) from v$session where status='ACTIVE' and TYPE !='BACKGROUND';复制
五、需要保存现场的步骤/命令
create table dbmgr.ash_20220000 as select tt.* from gv$active_session_history tt; create table dbmgr.sysstat_20220000 as select tt.* from gv$sysstat tt; create table dbmgr.sysmetric_20220000 as select tt.* from gv$sysmetric tt; create table dbmgr.sysmtc_sum_20220000 as select tt.* from gv$sysmetric_summary tt; create table dbmgr.system_event_20220000 as select tt.* from gv$system_event tt; create table dbmgr.event_hitgam_20220000 as select tt.* from gv$event_histogram tt; create table dbmgr.sessmetric_20220000 as select tt.* from gv$sessmetric tt; create table dbmgr.session_event_20220000 as select tt.* from gv$session_event tt; create table dbmgr.sess_w_hist_20220000 as select tt.* from gv$session_wait_history tt; create table dbmgr.sql_20220000 as select tt.* from gv$sql tt;复制
六、需要反馈上级以及运营的信息
将等待的情况反馈运营/DA,同时确认业务的影响情况,根据情况确认是否kill异常等待事件对应的会话
七、建议处理的流程,步骤和对应命令
1、查看数据库后台进程有没有被堵塞
select inst_id,type,count(*) from gv$session where blocking_session<>'' and status='ACTIVE' group by inst_id,type;复制
2、检查数据库的等待事件情况,确定top等待以及topsql
--查看当前的top等待和topsql col EVENT for a40 select inst_id,sql_id,event, count(*) as amount from gv$session where event is not null and wait_class <> 'Idle' group by inst_id,sql_id,event order by count(*) desc; --ASH查看用户的top等待事件 select * from (select event, username, count(*) from v$active_session_history a, dba_users s where sample_time >= to_date('&begin_time', 'YYYY-MM-DD HH24:MI:SS') AND sample_time <= to_date('&end_time', 'YYYY-MM-DD HH24:MI:SS') and a.USER_ID = s.user_id group by event, username order by 3 desc) where rownum < 50; --ASH查看用户的topsql select * from (select sql_id, username, count(*) from v$active_session_history a, dba_users s where sample_time >= to_date('&begin_time', 'YYYY-MM-DD HH24:MI:SS') AND sample_time <= to_date('&end_time', 'YYYY-MM-DD HH24:MI:SS') and a.USER_ID = s.user_id group by sql_id, username order by 3 desc) where rownum < 50;复制
3、详细查看等待事件的具体情况,一般关注top等待对应的sql,用户,对象,blocking session等
--查看event对应会话详情
set linesize 150 pagesize 999 col EVENT for a30 col USERNAME for a10 col MACHINE for a10 col OSUSER for a10 col PROGRAM for a10 select inst_id,sid,serial#,sql_id,event,blocking_instance,blocking_session,blocking_session_status,final_blocking_instance,final_blocking_session,final_blocking_session_status,username,status,machine,osuser,program,sql_hash_value,to_char(logon_time,'yyyy-mm-dd hh24:mi:ss'),prev_sql_id from gv$session where event = '&event';复制
--查看sql_id对应具体语句sql_text
set linesize 150 pagesize 999 col PARSING_SCHEMA_NAME for a15 col FIRST_LOAD_TIME for a20 col LAST_LOAD_TIME for a20 select a.inst_id,a.sql_id,a.child_number,a.parsing_schema_name,a.first_load_time,a.last_load_time,a.last_active_time,a.plan_hash_value,a.sql_plan_baseline,a.sql_text from gv$sql a where sql_id='&sql_id' order by last_active_time desc;复制
--利用ASH查看等待的对象
a.select s.ROW_WAIT_OBJ#, count(*) from gv$session s where event = '&event' group by ROW_WAIT_OBJ# order by 2 desc; b.select * from (select ash.CURRENT_OBJ#,event,count(*) from gv$active_session_history ash where sample_time>=to_timestamp('&begin_time','yyyy-mm-dd hh24:mi:ss') and sample_time<=to_timestamp('&end_time','yyyy-mm-dd hh24:mi:ss') group by ash.CURRENT_OBJ#,event order by 3 desc) where rownum<50; c.select * from (select CURRENT_OBJ#,sql_id,event,count(*) from gv$active_session_history ash where sample_time>=to_timestamp('&begin_time','yyyy-mm-dd hh24:mi:ss') and sample_time<=to_timestamp('&end_time','yyyy-mm-dd hh24:mi:ss') group by ash.CURRENT_OBJ#,sql_id,event order by 4 desc )where rownum<50;复制
--查看对象的状态以及ddl时间
col OWNER for a20 col OBJECT_NAME for a20 select a.owner,a.object_name,a.status,a.last_ddl_time,a.object_type from dba_objects a where object_id='&obj_id';复制
a.如果有blocking session,分析下blocking session的具体情况,是否可以kill
b. 如果没有blocking session,看下sql是否有性能问题,对sql进行优化及固化执行计划
4、查看语句的执行计划明细
set linesize 150 pagesize 999 select * from table(dbms_xplan.display_cursor('&sql_id',&child_no,'ADVANCED')); select * from table(dbms_xplan.display_cursor('&sql_id',null,'ADVANCED')); select * from table(dbms_xplan.display_awr('&sql_id',&plan_hash_value,null,'ADVANCED')); select * from table(dbms_xplan.display_sql_plan_baseline('&sql_handle','&plan_name'));复制
5、根据沟通决策,连接数据库批量kill会话
set head off feedback off set linesize 180 pagesize 0 spool /tmp/kill_sess.sql --输入等待事件名称 select 'alter system kill session '||''''||sid||','||serial#||',@'|| inst_id ||''''|| ' immediate;' from gv$session where event='&event' and type <> 'BACKGROUND'; spool off --执行/tmp/kill_sess.sql脚本,将相关会话杀掉: sqlplus "/ as sysdba" @/tmp/kill_sess.sql复制
6、如果异常等待严重导致数据库hang住、命令执行卡顿、连接数满等极端情况,立即升级分组经理、DA、运营,沟通实施下面应急方案
1)收集HANGANALYZE(升级DA确认是否收集HANGANALYZE)
sqlplus / as sysdba 或者 sqlplus -prelim / as sysdba(后台无法证登陆时)
SQL> oradebug setmypid 或者 oradebug setospid < use an existing process id, for example PMON/SMON> --(sqlplus -prelim / as sysdba时使用)
SQL> oradebug unlimit
SQL> oradebug hanganalyze 3 或者 oradebug -g all hanganalyze 3 (RAC环境)
等待1分钟
SQL> oradebug hanganalyze 3 或者 oradebug -g all hanganalyze 3 (RAC环境)
等待1分钟
SQL> oradebug dump systemstate 10 或者 oradebug -g all dump systemstate 10 (RAC环境) --默认采集10级别的,如需采集高级别的需要领导评估,级别分为256 258 266 267
等待1分钟
SQL> oradebug dump systemstate 10 或者 oradebug -g all dump systemstate 10 (RAC环境) -----默认采集10级别的,如需采集高级别的需要领导评估,级别分为256 258 266 267
等待1分钟
SQL> oradebug tracefile_name --------保留trace文件
2)检查数据库是否有占用undo较大的会话,会影响重启数据库后的恢复时间
--查看单个会话占用最大的undo空间
with bksize as (select /*+materialize */ value from v$parameter where NAME = 'db_block_size') select 'undo_session_size:'||nvl(max(round(tr.USED_UBLK * bksize.value / 1024 / 1024)), 0) || ' M' "undo_session_size(MB)" from v$session se, v$transaction tr, bksize where se.TADDR = tr.ADDR(+);复制
3)临时屏蔽crontab
crontab -l >.cron_file_bak
>cron_null
crontab cron_null
4)停监听 --需要先同运营确认是否可以停监听,注意标黄的需要替换,而且有的库存在多个监听,都要停掉
GI单实例: crsctl stop res $ORACLE_SID.lsnr -f
RAC环境: srvctl stop listener -l $ORACLE_SID
VCS单实例: lsnrctl stop $ORACLE_SID
5)如果连接数已满无法进入数据库,考虑从OS层面kill本实例LOCAL=NO的进程,使用下面语句匹配
ORA-00020: maximum number of processes (1500) exceeded
--替换实例名<ORACLE_SID>
ps -ef|grep -v grep|grep LOCAL=NO|grep -w oracle$ORACLE_SID |awk '{print $2}'|xargs kill -9
6)后台登陆数据库检查等待事件是否已释放,一定要确认好了,正常再启监听
select inst_id,sql_id,event, count(*) as amount from gv$session where event is not null and wait_class <> 'Idle' group by inst_id,sql_id,event order by count(*) desc;复制
7)启动监听
GI单实例: crsctl start res $ORACLE_SID.lsnr
RAC环境: srvctl start listener -l $ORACLE_SID
VCS单实例: lsnrctl start $ORACLE_SID
8)启动crontab
crontab .cron_file_bak
9)检查GG
cd $GGS_HOME
./ggsci
info all
如果进程异常,使用start 进程名启动
八、验证问题已经恢复的步骤和命令
1、检查连接情况,确保能正常登录、有应用连接、连接数使用率正常
col PROCESS_LIMIT for a20; select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';复制
2、检查数据库的等待事件情况
select inst_id,sql_id,event, count(*) as amount from gv$session where event is not null and wait_class <> 'Idle' group by inst_id,sql_id,event order by count(*) desc;复制
PROCESS 使用超过95% #配置#
一、数据库状态检查的步骤/命令(通过本步骤可以确定数据库当前的状态是否OK)
检查连接情况,确保连接数使用率正常
col PROCESS_LIMIT for a20; select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';复制
二、请描述告警出现的可能原因
1、应用服务器连接配置超出数据库限制
2、异常等待事件导致连接堆积,常见场景是发版或者新上业务语句解析异常
三、预计多久不处理就可以出现UIOC(P1)
连接数打满时应用无法新建数据库连接,当达到90%时需要立即介入处理,需15分钟内处理正常
四、告警检查的步骤(通过本步骤确定告警是否是误告)
--查看当前连接数使用情况,process_pct代表使用率
col PROCESS_LIMIT for a20; select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';复制
五、需要保存现场的步骤/命令
create table dbmgr.ash_20220000 as select tt.* from gv$active_session_history tt; create table dbmgr.sessmetric_20220000 as select tt.* from gv$sessmetric tt; create table dbmgr.session_event_20220000 as select tt.* from gv$session_event tt;复制
六、需要反馈上级以及运营的信息
将连接会话明细发给运营确认应用是否有异常,是否可以停部分应用,是否最近有扩容应用服务器的情况,评估紧急kill会话
如果有扩容应用服务器,联系DA评估是否安排变更调整连接数
七、建议处理的流程,步骤和对应命令
1、查看最近连接使用情况,可以初步看出,是从哪个时间点突增的
set line 1000; set pagesize 1000; col PROCESS_VALUE for a20; select end_time, current_process,process_value ,used_pct from (select to_char(trunc(end_time,'mi'), 'YYYY-MM-DD HH24:MI:SS') end_time, round(value*limit_value/100) current_process, limit_value process_value, round(value,2)||'%' used_pct from v$sysmetric_history, (select limit_value from v$resource_limit where resource_name = 'processes') where metric_name = 'Process Limit %') order by end_time desc;复制
2、检查当前连接分布明细
--连接分布
select * from (select username,osuser,status,machine,count(*) from v$session group by username ,osuser,status,machine order by 5 desc) where rownum <= 20;复制
a.如果大部分连接的状态STATUS都为INACTIVE,将连接情况明细发给运营确认是否有扩容应用服务器或者刚重启应用,同时安排 kill会话,接步骤3
b.如果大部分连接的状态STATUS都为ACTIVE,一般则是异常等待事件导致,可进一步分析等待事件,接步骤5
3、连接数据库批量杀掉INACTIVE的会话
--根据条件每次杀一百个非活跃会话,可以替换用户名<username>进行筛选
with abc as (select inst_id,sid,serial#, username, status, logon_time from gv$session s where s.STATUS = 'INACTIVE' -- and username like '&username' -- and username not in ('&username') order by s.LOGON_TIME ) select 'alter system kill session '||''''||sid||','||serial#||',@'|| inst_id ||''''|| ' immediate;' from abc where rownum<=100;复制
4、如果连接数已满无法进入数据库,考虑从OS层面kill本实例LOCAL=NO的进程,使用下面语句匹配
ORA-00020: maximum number of processes (1500) exceeded
--替换实例名<ORACLE_SID>
ps -ef|grep -v grep|grep LOCAL=NO|grep -w oracle$ORACLE_SID |awk '{print $2}'|xargs kill -9复制
八、验证问题已经恢复的步骤和命令
--查看当前连接数使用情况,使用率已降至正常水平
col PROCESS_LIMIT for a20; select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';复制
UNDO 单个SESSION使用UNDO超过10G #应用#
一、数据库状态检查的步骤/命令(通过本步骤可以确定数据库当前的状态是否OK)
检查连接情况,确保能正常登录、有应用连接、连接数使用率正常
col PROCESS_LIMIT for a20; select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';复制
二、请描述告警出现的可能原因
大事务,常见场景是下列原因:
1、dml没有分批提交,建议运营分批提交
2、大表全表delete,建议整改为truncate
3、异常业务逻辑导致的大事务,结合占用量评估kill(谨慎KILL),回滚期间会影响相关表的业务操作
三、预计多久不处理就可以出现UIOC(P1)
一般不会影响整个库的可用性,但是如果事务异常(应用取消或者将事务kill)会导致事务回滚,undo占用越多需要回滚的时间越长,回滚期间会影响相关表的业务操作
四、告警检查的步骤(通过本步骤确定告警是否是误告)
--查看单个会话占用最大的undo空间
with bksize as (select /*+materialize */ value from v$parameter where NAME = 'db_block_size') select 'undo_session_size:'||nvl(max(round(tr.USED_UBLK * bksize.value / 1024 / 1024)), 0) || ' M' "undo_session_size(MB)" from v$session se, v$transaction tr, bksize where se.TADDR = tr.ADDR(+);复制
五、需要保存现场的步骤/命令
create table dbmgr.ash_20220000 as select tt.* from gv$active_session_history tt; create table dbmgr.transaction_20220000 as select tt.* from gv$transaction tt;复制
六、需要反馈上级以及运营的信息
将占用undo较高的会话反馈给运营,确认事务进度以及事务重要性,以此来评估是让事务继续或者中止事务,并提醒运营安排整改
七、建议处理的流程,步骤和对应命令
1、查询占用undo超过50MB的top20会话详情,将占用undo较高的会话反馈给运营
set linesize 180 pagesize 999 select inst_id,sid,serial#,xidusn,undo_size_m,sess_status,status,username,sql_id,prev_sql_id,machine,start_time,logon_time,related_rows,sql_text from (select t.xidusn,s.status sess_status,t.status,s.inst_id,s.sid,s.sql_id,s.prev_sql_id,s.serial#,s.username,s.machine,t.start_time,s.logon_time,t.used_urec related_rows, round(t.used_ublk * block_size / 1024 / 1024, 0) undo_size_m,q.sql_text from gv$session s, gv$transaction t, dba_tablespaces dt, gv$system_parameter p, gv$sql q where q.sql_id(+) = s.sql_id and upper(p.name) = 'UNDO_TABLESPACE' and dt.tablespace_name = p.value and t.inst_id = s.inst_id and p.inst_id = s.inst_id and s.saddr = t.ses_addr and round(t.used_ublk * block_size / 1024 / 1024, 2) > 50 order by t.used_ublk desc) t where rownum < 20;复制
2、如果是异常的业务逻辑,根据运营决策,结合占用量评估kill(谨慎KILL),回滚期间会影响相关表的业务操作
-查看sid对应会话详情
set linesize 150 pagesize 999 col EVENT for a30 col USERNAME for a10 col MACHINE for a10 col OSUSER for a10 col PROGRAM for a10 select inst_id,sid,serial#,sql_id,event,blocking_instance,blocking_session,blocking_session_status,final_blocking_instance,final_blocking_session,final_blocking_session_status,username,status,machine,osuser,program,sql_hash_value,to_char(logon_time,'yyyy-mm-dd hh24:mi:ss'),prev_sql_id from gv$session where sid=&sid;复制
--查看sql_id对应具体语句sql_text
set linesize 150 pagesize 999 col PARSING_SCHEMA_NAME for a15 col FIRST_LOAD_TIME for a20 col LAST_LOAD_TIME for a20 select a.inst_id,a.sql_id,a.child_number,a.parsing_schema_name,a.first_load_time,a.last_load_time,a.last_active_time,a.plan_hash_value,a.sql_plan_baseline,a.sql_text from gv$sql a where sql_id='&sql_id' order by last_active_time desc;复制
--决策后kill sid对应的会话,确认sid、serial#
select 'alter system kill session '||''''||sid||','||serial#||',@'|| inst_id ||''''|| ' immediate;' from gv$session where sid=&sid;复制
3、查询undo表空间使用率,如果使用率较高可先添加数据文件确保语句正常运行,防止由于表空间满undo用尽导致事务回滚
select tb.tablespace_name, nvl(ta.USED_GB, 0) as used_gb, tb.TOTAL_GB, (tb.TOTAL_GB - nvl(ta.USED_GB, 0)) AS FREE_GB, round((nvl(ta.USED_GB, 0) / tb.TOTAL_GB * 100), 2) || ' %' AS pct_used, round((tb.TOTAL_GB - nvl(ta.USED_GB, 0)) * 100 / tb.TOTAL_GB, 2) || ' %' AS PCT_FREE from (select t1.tablespace_name, round(nvl(sum(bytes), 0) / 1024 / 1024 / 1024, 2) as USED_GB from dba_undo_extents t1 where tablespace_name like '%UNDO%' and status = 'ACTIVE' group by t1.tablespace_name) ta, (select t2.tablespace_name, round(sum(t2.bytes) / 1024 / 1024 / 1024, 2) as TOTAL_GB from dba_data_files t2 where tablespace_name like '%UNDO%' group by t2.tablespace_name) tb where ta.tablespace_name(+) = tb.tablespace_name;复制
八、验证问题已经恢复的步骤和命令
--查看单个会话占用最大的undo空间已下降至10G以下
with bksize as (select /*+materialize */ value from v$parameter where NAME = 'db_block_size') select 'undo_session_size:'||nvl(max(round(tr.USED_UBLK * bksize.value / 1024 / 1024)), 0) || ' M' "undo_session_size(MB)" from v$session se, v$transaction tr, bksize where se.TADDR = tr.ADDR(+);复制
TableSpace Usage Percent over 95% on TEMP
一、数据库状态检查的步骤/命令(通过本步骤可以确定数据库当前的状态是否OK)
二、请描述告警出现的可能原因
一般是单个sql消耗大量temp, 特别是HASH JOIN阶段的build input过大造成。
三、预计多久不处理就可以出现UIOC(P1)
四、告警检查的步骤(通过本步骤确定告警是否是误告)
--查询temp表空间使用率
select tablespace_name, mb_total, mb_used, mb_free, round(mb_used / mb_total, 4) * 100 || '%' used_pct from (SELECT A.tablespace_name tablespace_name, D.mb_total, SUM(A.used_blocks * D.block_size) / 1024 / 1024 mb_used, D.mb_total - SUM(A.used_blocks * D.block_size) / 1024 / 1024 mb_free FROM v$sort_segment A, (SELECT B.name, C.block_size, SUM(C.bytes) / 1024 / 1024 mb_total FROM v$tablespace B, v$tempfile C WHERE B.ts# = C.ts# GROUP BY B.name, C.block_size) D WHERE A.tablespace_name = D.name GROUP by A.tablespace_name, D.mb_total);复制
五、需要保存现场的步骤/命令
create table dbmgr.ash_20220000 as select tt.* from gv$active_session_history tt; create table dbmgr.sort_usage_20220000 as select tt.* from gv$sort_usage tt; create table dbmgr.wkarea_20220000 as select tt.* from gv$sql_workarea tt; create table dbmgr.wkarea_ac_20220000 as select tt.* from gv$sql_workarea_active tt; create table dbmgr.v$temp_usg_20220000 as select * from V$TEMPSEG_USAGE tt;复制
六、需要反馈上级以及运营的信息
七、建议处理的流程,步骤和对应命令
1、一般会很快消耗完TEMP并报出ORA-1652后自行恢复。
若持续则查询temp消耗top-n,评估是否KILL或者添加临时表空间
select * from (select distinct se.username, se.sid, se.serial#, su.extents, su.blocks * to_number(rtrim(p.value))/1024/1024 as Space, round(su.blocks * p.value / 1024 / 1024 / 1024, 1)|| ' G' sort_used, tablespace, segtype, sql_text from v$sort_usage su, v$parameter p, v$session se, v$sql s where p.name = 'db_block_size' and su.session_addr = se.saddr and s.hash_value = su.sqlhash and s.address = su.sqladdr order by space desc ) where rownum <= 10;复制
2、添加临时表空间
文件系统:ALTER TABLESPACE TEMP ADD TEMPFILE '/paic/t1luqd0/data/oradata/luqd0/temp_new023.dbf' SIZE 20G autoextend off;
ASM:ALTER TABLESPACE TEMP ADD TEMPFILE SIZE 20G autoextend off;
八、验证问题已经恢复的步骤和命令
TableSpace Usage Percent over 95% on UNDOTBS1
一、数据库状态检查的步骤/命令(通过本步骤可以确定数据库当前的状态是否OK)
检查连接情况,确保能正常登录、有应用连接、连接数使用率正常
col PROCESS_LIMIT for a20; select process_current,value process_limit,round(process_current / value, 2) * 100 || '%' as process_pct from (select count(*) as process_current from v$process) a, v$parameter where name = 'processes';复制
二、请描述告警出现的可能原因
大事务,常见场景是下列原因:
1、dml没有分批提交,建议运营分批提交
2、大表全表delete,建议整改为truncate
3、异常业务逻辑导致的大事务,结合占用量评估kill(谨慎KILL),回滚期间会影响相关表的业务操作
三、预计多久不处理就可以出现UIOC(P1)
UNDO表空间使用率100%会影响应用读写,正在执行的事务可能会因使用率慢而回滚事务,事务undo占用越多需要回滚的时间越长,回滚期间会影响相关表的业务操作,使用率达到90%时需要立即介入处理,需15分钟内处理正常
四、告警检查的步骤(通过本步骤确定告警是否是误告)
--查询undo表空间使用率
select tb.tablespace_name, nvl(ta.USED_GB, 0) as used_gb, tb.TOTAL_GB, (tb.TOTAL_GB - nvl(ta.USED_GB, 0)) AS FREE_GB, round((nvl(ta.USED_GB, 0) / tb.TOTAL_GB * 100), 2) || ' %' AS pct_used, round((tb.TOTAL_GB - nvl(ta.USED_GB, 0)) * 100 / tb.TOTAL_GB, 2) || ' %' AS PCT_FREE from (select t1.tablespace_name, round(nvl(sum(bytes), 0) / 1024 / 1024 / 1024, 2) as USED_GB from dba_undo_extents t1 where tablespace_name like '%UNDO%' and status = 'ACTIVE' group by t1.tablespace_name) ta, (select t2.tablespace_name, round(sum(t2.bytes) / 1024 / 1024 / 1024, 2) as TOTAL_GB from dba_data_files t2 where tablespace_name like '%UNDO%' group by t2.tablespace_name) tb where ta.tablespace_name(+) = tb.tablespace_name;复制
五、需要保存现场的步骤/命令
create table dbmgr.ash_20220000 as select tt.* from gv$active_session_history tt; create table dbmgr.transaction_20220000 as select tt.* from gv$transaction tt;复制
六、需要反馈上级以及运营的信息
将占用undo较高的会话反馈给运营,确认事务进度以及事务重要性,以此来评估是让事务继续或者中止事务,并提醒运营安排整改
七、建议处理的流程,步骤和对应命令
1、查询占用undo超过50MB的top20会话详情,将占用undo较高的会话反馈给运营
select inst_id,sid,serial#,xidusn,undo_size_m,sess_status,status,username,sql_id,prev_sql_id,machine,start_time,logon_time,related_rows,sql_text from (select t.xidusn,s.status sess_status,t.status,s.inst_id,s.sid,s.sql_id,s.prev_sql_id,s.serial#,s.username,s.machine,t.start_time,s.logon_time,t.used_urec related_rows, round(t.used_ublk * block_size / 1024 / 1024, 0) undo_size_m,q.sql_text from gv$session s, gv$transaction t, dba_tablespaces dt, gv$system_parameter p, gv$sql q where q.sql_id(+) = s.sql_id and upper(p.name) = 'UNDO_TABLESPACE' and dt.tablespace_name = p.value and t.inst_id = s.inst_id and p.inst_id = s.inst_id and s.saddr = t.ses_addr and round(t.used_ublk * block_size / 1024 / 1024, 2) > 50 order by t.used_ublk desc) t where rownum < 20;复制
2、如果是异常的业务逻辑,根据运营决策,结合占用量评估kill(谨慎KILL),回滚期间会影响相关表的业务操作
-查看sid对应会话详情
set linesize 150 pagesize 999 col EVENT for a30 col USERNAME for a10 col MACHINE for a10 col OSUSER for a10 col PROGRAM for a10 select inst_id,sid,serial#,sql_id,event,blocking_instance,blocking_session,blocking_session_status,final_blocking_instance,final_blocking_session,final_blocking_session_status,username,status,machine,osuser,program,sql_hash_value,to_char(logon_time,'yyyy-mm-dd hh24:mi:ss'),prev_sql_id from gv$session where sid=&sid;复制
--查看sql_id对应具体语句sql_text
set linesize 150 pagesize 999 col PARSING_SCHEMA_NAME for a15 col FIRST_LOAD_TIME for a20 col LAST_LOAD_TIME for a20 select a.inst_id,a.sql_id,a.child_number,a.parsing_schema_name,a.first_load_time,a.last_load_time,a.last_active_time,a.plan_hash_value,a.sql_plan_baseline,a.sql_text from gv$sql a where sql_id='&sql_id' order by last_active_time desc;复制
--决策后kill sid对应的会话,确认sid、serial#
select 'alter system kill session '||''''||sid||','||serial#||',@'|| inst_id ||''''|| ' immediate;' from gv$session where sid=&sid;复制
3、查询undo表空间使用率,如果使用率较高可先添加数据文件确保语句正常运行,防止由于表空间满undo用尽导致事务回滚
select tb.tablespace_name, nvl(ta.USED_GB, 0) as used_gb, tb.TOTAL_GB, (tb.TOTAL_GB - nvl(ta.USED_GB, 0)) AS FREE_GB, round((nvl(ta.USED_GB, 0) / tb.TOTAL_GB * 100), 2) || ' %' AS pct_used, round((tb.TOTAL_GB - nvl(ta.USED_GB, 0)) * 100 / tb.TOTAL_GB, 2) || ' %' AS PCT_FREE from (select t1.tablespace_name, round(nvl(sum(bytes), 0) / 1024 / 1024 / 1024, 2) as USED_GB from dba_undo_extents t1 where tablespace_name like '%UNDO%' and status = 'ACTIVE' group by t1.tablespace_name) ta, (select t2.tablespace_name, round(sum(t2.bytes) / 1024 / 1024 / 1024, 2) as TOTAL_GB from dba_data_files t2 where tablespace_name like '%UNDO%' group by t2.tablespace_name) tb where ta.tablespace_name(+) = tb.tablespace_name;复制
八、验证问题已经恢复的步骤和命令
--查询undo表空间使用率
select tb.tablespace_name, nvl(ta.USED_GB, 0) as used_gb, tb.TOTAL_GB, (tb.TOTAL_GB - nvl(ta.USED_GB, 0)) AS FREE_GB, round((nvl(ta.USED_GB, 0) / tb.TOTAL_GB * 100), 2) || ' %' AS pct_used, round((tb.TOTAL_GB - nvl(ta.USED_GB, 0)) * 100 / tb.TOTAL_GB, 2) || ' %' AS PCT_FREE from (select t1.tablespace_name, round(nvl(sum(bytes), 0) / 1024 / 1024 / 1024, 2) as USED_GB from dba_undo_extents t1 where tablespace_name like '%UNDO%' and status = 'ACTIVE' group by t1.tablespace_name) ta, (select t2.tablespace_name, round(sum(t2.bytes) / 1024 / 1024 / 1024, 2) as TOTAL_GB from dba_data_files t2 where tablespace_name like '%UNDO%' group by t2.tablespace_name) tb where ta.tablespace_name(+) = tb.tablespace_name;复制
TableSpace Usage Percent over 95% on XXX(非undo、temp表空间)
一、数据库状态检查的步骤/命令(通过本步骤可以确定数据库当前的状态是否OK)
二、请描述告警出现的可能原因
数据持续写入导致表空间使用量持续增加
三、预计多久不处理就可以出现UIOC(P1)
业务表空间满可能导致应用无法写入数据,使用率达到90%时需要立即介入处理,需15分钟内处理正常
四、告警检查的步骤(通过本步骤确定告警是否是误告)
--检查对应表空间使用量,关注maxuse_%指标
set line 1000 col tablespace_name for a20 SELECT a.tablespace_name, b.maxbytes / 1024 / 1024 / 1024 "maxbyes_GB", total / 1024 / 1024 / 1024 "bytes_GB", (maxbytes - total) / 1024 / 1024 / 1024 "free_extend_GB", free / 1024 / 1024 / 1024 "free_current_GB", (total - free) / 1024 / 1024 / 1024 "use_GB", b.files, ROUND((total - free) / total, 4) * 100 "use_%", ROUND((total - free) / b.maxbytes, 4) * 100 "maxuse_%" FROM (SELECT tablespace_name, SUM(bytes) free FROM DBA_FREE_SPACE where tablespace_name = upper('&tablespace_name') GROUP BY tablespace_name) a, (SELECT tablespace_name, sum(case autoextensible when 'YES' then maxbytes else bytes end) maxbytes, SUM(bytes) total, count(*) files FROM DBA_DATA_FILES where tablespace_name = upper('&tablespace_name') GROUP BY tablespace_name) b WHERE a.tablespace_name = b.tablespace_name; 说明:free_current_GB为表空间真实剩余空间,free_extend_GB为可拓展空间,两者之和为该表空间总剩余空间。复制
五、需要保存现场的步骤/命令
create table dbmgr.ash_20220000 as select tt.* from gv$active_session_history tt; create table dbmgr.sql_20220000 as select tt.* from gv$sql tt;复制
六、需要反馈上级以及运营的信息
七、建议处理的流程,步骤和对应命令
1、检查对应表空间使用量
set line 1000 col tablespace_name for a20 SELECT a.tablespace_name, b.maxbytes / 1024 / 1024 / 1024 "maxbyes_GB", total / 1024 / 1024 / 1024 "bytes_GB", (maxbytes - total) / 1024 / 1024 / 1024 "free_extend_GB", free / 1024 / 1024 / 1024 "free_current_GB", (total - free) / 1024 / 1024 / 1024 "use_GB", b.files, ROUND((total - free) / total, 4) * 100 "use_%", ROUND((total - free) / b.maxbytes, 4) * 100 "maxuse_%" FROM (SELECT tablespace_name, SUM(bytes) free FROM DBA_FREE_SPACE where tablespace_name = upper('&tablespace_name') GROUP BY tablespace_name) a, (SELECT tablespace_name, sum(case autoextensible when 'YES' then maxbytes else bytes end) maxbytes, SUM(bytes) total, count(*) files FROM DBA_DATA_FILES where tablespace_name = upper('&tablespace_name') GROUP BY tablespace_name) b WHERE a.tablespace_name = b.tablespace_name;复制
说明:free_current_GB为表空间真实剩余空间,free_extend_GB为可拓展空间,两者之和为该表空间总剩余空间。
2、先检查卷使用量,并实施扩容表空间,分小机和PC两种架构,小机使用san存储,PC用ASMDG
--调用下面脚本,生成添加数据文件的命令,将表空间降至80%以下
ls tbsextend.py && rm tbsextend.py;wget 10.11.100.193/downloads/OpsManage/Oracle/tbsextend.py python tbsextend.py &tbsname复制
3、进一步可分析导致表空间突增的原因,反馈应用排查
--用于查看表空间随时间增长情况
select tablespace_name, currentsize_GB, maxsize_GB, usedsize_GB, maxsize_GB - usedsize_GB freesize_GB, trunc(usedsize_GB / maxsize_GB, 4) * 100 || '%' maxused_pct, time from (select b.name tablespace_name, round(a.tablespace_size * c.block_size / 1024 / 1024 / 1024, 4) currentsize_GB, round(a.tablespace_maxsize * c.block_size / 1024 / 1024 / 1024, 4) maxsize_GB, round(a.tablespace_usedsize * c.block_size / 1024 / 1024 / 1024, 4) usedsize_GB, to_date(rtime, 'mm/dd/yyyy hh24:mi:ss') time from dba_hist_tbspc_space_usage a, v$tablespace b, dba_tablespaces c where a.tablespace_id = b.ts# and b.name = c.tablespace_name and b.name = upper('&tablespace_name') order by to_date(rtime, 'mm/dd/yyyy hh24:mi:ss') desc);复制
--用于查看表空间中哪个对象增长较快,取top5
select * from dba_hist_snapshot order by snap_id desc; select owner, tablespace_name, object_name, subobject_name, object_type, space_allocated_delta_MB, ratio from (select n.owner, n.tablespace_name, n.object_name, case when length(n.subobject_name) < 11 then n.subobject_name else substr(n.subobject_name, length(n.subobject_name) - 9) end subobject_name, n.object_type, r.space_allocated_delta_MB, round(r.ratio * 100, 2) ratio from dba_hist_seg_stat_obj n, (select dataobj#, obj#, dbid, sum(space_allocated_delta) / 1024 / 1024 space_allocated_delta_MB, ratio_to_report(sum(space_allocated_delta)) over() ratio from dba_hist_seg_stat where space_allocated_delta > 0 and &beg_snap < snap_id and snap_id <= &end_snap group by dataobj#, obj#, dbid) r where n.dataobj# = r.dataobj# and n.obj# = r.obj# and n.dbid = r.dbid and tablespace_name = upper('&tablespace_name') --可注释,用于取全库信息 order by r.space_allocated_delta_MB desc, object_name, owner, subobject_name) where rownum <= 5;复制
八、验证问题已经恢复的步骤和命令
--检查对应表空间使用量
set line 1000 col tablespace_name for a20 SELECT a.tablespace_name, b.maxbytes / 1024 / 1024 / 1024 "maxbyes_GB", total / 1024 / 1024 / 1024 "bytes_GB", (maxbytes - total) / 1024 / 1024 / 1024 "free_extend_GB", free / 1024 / 1024 / 1024 "free_current_GB", (total - free) / 1024 / 1024 / 1024 "use_GB", b.files, ROUND((total - free) / total, 4) * 100 "use_%", ROUND((total - free) / b.maxbytes, 4) * 100 "maxuse_%" FROM (SELECT tablespace_name, SUM(bytes) free FROM DBA_FREE_SPACE where tablespace_name = upper('&tablespace_name') GROUP BY tablespace_name) a, (SELECT tablespace_name, sum(case autoextensible when 'YES' then maxbytes else bytes end) maxbytes, SUM(bytes) total, count(*) files FROM DBA_DATA_FILES where tablespace_name = upper('&tablespace_name') GROUP BY tablespace_name) b WHERE a.tablespace_name = b.tablespace_name; 说明:free_current_GB为表空间真实剩余空间,free_extend_GB为可拓展空间,两者之和为该表空间总剩余空间。复制