在北京出差时接到同事电话,说影像系统的WebSphere管理控制台中受管节点状态突然异常,具体表现为nodeagent实际处于启动状态,但在dmgr中显示为未启动,经过重启等一系列常规操作仍不能解决。该问题不会影响应用服务器的运行,但已经无法通过控制台进行管理了。
和朋友吃完饭,在北京的大雾霾中溜达回酒店,想着先处理一下吧,估计不会太难解决。连上VPN,登录系统,做完一系列常规检查后,打开nodeagent日志和dmgr日志分析一下。
dmgr日志部分如下
[11/3/16 19:14:53:050 GMT+08:00] 00000021 ORBRas E com.ibm.ws.security.orbssl.WSSSLClientSocketFactoryImpl createSSLSocket ProcessDiscovery : 0 JSSL0080E: javax.net.ssl.SSLHandshakeException - The client and server could not negotiate the desired level of security. Reason: com.ibm.jsse2.util.h: No trusted certificate found
javax.net.ssl.SSLHandshakeException: com.ibm.jsse2.util.h: No trusted certificate found
at com.ibm.jsse2.n.a(n.java:22)
at com.ibm.jsse2.pc.a(pc.java:519)
at com.ibm.jsse2.eb.a(eb.java:339)
at com.ibm.jsse2.eb.a(eb.java:332)
at com.ibm.jsse2.fb.a(fb.java:64)
at com.ibm.jsse2.fb.a(fb.java:167)
at com.ibm.jsse2.eb.m(eb.java:244)
at com.ibm.jsse2.eb.a(eb.java:133)
at com.ibm.jsse2.pc.a(pc.java:498)
at com.ibm.jsse2.pc.h(pc.java:570)
at com.ibm.jsse2.pc.a(pc.java:392)
at com.ibm.jsse2.pc.startHandshake(pc.java:463)
at com.ibm.ws.security.orbssl.WSSSLClientSocketFactoryImpl.createSSLSocket(WSSSLClientSocketFactoryImpl.java:490)
at com.ibm.ws.orbimpl.transport.WSSSLTransportConnection.createSocket(WSSSLTransportConnection.java:232)
at com.ibm.ws.orbimpl.transport.WSSSLTransportConnection.createSocket(WSSSLTransportConnection.java:311)
at com.ibm.CORBA.transport.TransportConnectionBase.connect(TransportConnectionBase.java:359)
at com.ibm.ws.orbimpl.transport.WSTransport$1.run(WSTransport.java:495)
at com.ibm.ws.security.util.AccessController.doPrivileged(AccessController.java:118)
at com.ibm.ws.orbimpl.transport.WSTransport.getConnection(WSTransport.java:492)
at com.ibm.CORBA.transport.TransportBase.getConnection(TransportBase.java:187)
at com.ibm.rmi.iiop.TransportManager.get(TransportManager.java:96)
at com.ibm.rmi.iiop.GIOPImpl.getConnection(GIOPImpl.java:129)
at com.ibm.rmi.iiop.GIOPImpl.locate(GIOPImpl.java:205)
at com.ibm.rmi.corba.ClientDelegate.locate(ClientDelegate.java:1966)
at com.ibm.rmi.corba.ClientDelegate._createRequest(ClientDelegate.java:1991)
at com.ibm.rmi.corba.ClientDelegate.createRequest(ClientDelegate.java:1169)
at com.ibm.rmi.corba.ClientDelegate.createRequest(ClientDelegate.java:1270)
at com.ibm.CORBA.iiop.ClientDelegate.createRequest(ClientDelegate.java:1327)
at com.ibm.rmi.corba.ClientDelegate.createRequest(ClientDelegate.java:1131)
at com.ibm.CORBA.iiop.ClientDelegate.createRequest(ClientDelegate.java:1293)
at com.ibm.rmi.corba.ClientDelegate.request(ClientDelegate.java:1869)
at com.ibm.CORBA.iiop.ClientDelegate.request(ClientDelegate.java:1249)
at org.omg.CORBA.portable.ObjectImpl._request(ObjectImpl.java:458)
at com.ibm.WsnOptimizedNaming._NamingContextStub.resolve_complete_info(_NamingContextStub.java:488)
at com.ibm.ws.naming.jndicos.CNContextImpl.cosResolve(CNContextImpl.java:4375)
at com.ibm.ws.naming.jndicos.CNContextImpl.doLookup(CNContextImpl.java:1905)
at com.ibm.ws.naming.jndicos.CNContextImpl.doLookup(CNContextImpl.java:1866)
at com.ibm.ws.naming.jndicos.CNContextImpl.lookupExt(CNContextImpl.java:1556)
[11/3/16 22:56:20:724 GMT+08:00] 00000020 ORBRas E com.ibm.ws.security.orbssl.WSSSLClientSocketFactoryImpl createSSLSocket ProcessDiscovery : 0 JSSL0080E: javax.net.ssl.SSLHandshakeException - The client and server could not negotiate the desired level of security. Reason: com.ibm.jsse2.util.h: No trusted certificate found
javax.net.ssl.SSLHandshakeException: com.ibm.jsse2.util.h: No trusted certificate found
at com.ibm.jsse2.n.a(n.java:22)
at com.ibm.jsse2.pc.a(pc.java:519)
at com.ibm.jsse2.eb.a(eb.java:339)
at com.ibm.jsse2.eb.a(eb.java:332)
at com.ibm.jsse2.fb.a(fb.java:64)
at com.ibm.jsse2.fb.a(fb.java:167)
at com.ibm.jsse2.eb.m(eb.java:244)
at com.ibm.jsse2.eb.a(eb.java:133)
at com.ibm.jsse2.pc.a(pc.java:498)
at com.ibm.jsse2.pc.h(pc.java:570)
at com.ibm.jsse2.pc.a(pc.java:392)
at com.ibm.jsse2.pc.startHandshake(pc.java:463)
at com.ibm.ws.security.orbssl.WSSSLClientSocketFactoryImpl.createSSLSocket(WSSSLClientSocketFactoryImpl.java:490)
at com.ibm.ws.orbimpl.transport.WSSSLTransportConnection.createSocket(WSSSLTransportConnection.java:232)
at com.ibm.ws.orbimpl.transport.WSSSLTransportConnection.createSocket(WSSSLTransportConnection.java:311)
at com.ibm.CORBA.transport.TransportConnectionBase.connect(TransportConnectionBase.java:359)
at com.ibm.ws.orbimpl.transport.WSTransport$1.run(WSTransport.java:495)
at com.ibm.ws.security.util.AccessController.doPrivileged(AccessController.java:118)
at com.ibm.ws.orbimpl.transport.WSTransport.getConnection(WSTransport.java:492)
at com.ibm.CORBA.transport.TransportBase.getConnection(TransportBase.java:187)
at com.ibm.rmi.iiop.TransportManager.get(TransportManager.java:96)
at com.ibm.rmi.iiop.GIOPImpl.getConnection(GIOPImpl.java:129)
at com.ibm.rmi.iiop.GIOPImpl.locate(GIOPImpl.java:205)
at com.ibm.rmi.corba.ClientDelegate.locate(ClientDelegate.java:1966)
at com.ibm.rmi.corba.ClientDelegate._createRequest(ClientDelegate.java:1991)
at com.ibm.rmi.corba.ClientDelegate.createRequest(ClientDelegate.java:1155)
at com.ibm.rmi.corba.ClientDelegate.createRequest(ClientDelegate.java:1270)
at com.ibm.CORBA.iiop.ClientDelegate.createRequest(ClientDelegate.java:1327)
nodeagent部分日志如下
[11/3/16 19:16:48:844 GMT+08:00] 0000001a ORBRas E com.ibm.ws.security.orbssl.WSSSLClientSocketFactoryImpl createSSLSocket ProcessDiscovery : 0 JSSL0080E: javax.net.ssl.SSLHandshakeException - The client and server could not negotiate the desired level of security. Reason: com.ibm.jsse2.util.h: No trusted certificate found
javax.net.ssl.SSLHandshakeException: com.ibm.jsse2.util.h: No trusted certificate found
at com.ibm.jsse2.n.a(n.java:22)
at com.ibm.jsse2.pc.a(pc.java:519)
at com.ibm.jsse2.eb.a(eb.java:339)
at com.ibm.jsse2.eb.a(eb.java:332)
at com.ibm.jsse2.fb.a(fb.java:64)
at com.ibm.jsse2.fb.a(fb.java:167)
at com.ibm.jsse2.eb.m(eb.java:244)
at com.ibm.jsse2.eb.a(eb.java:133)
at com.ibm.jsse2.pc.a(pc.java:498)
at com.ibm.jsse2.pc.h(pc.java:570)
at com.ibm.jsse2.pc.a(pc.java:392)
at com.ibm.jsse2.pc.startHandshake(pc.java:463)
at com.ibm.ws.security.orbssl.WSSSLClientSocketFactoryImpl.createSSLSocket(WSSSLClientSocketFactoryImpl.java:490)
at com.ibm.ws.orbimpl.transport.WSSSLTransportConnection.createSocket(WSSSLTransportConnection.java:232)
at com.ibm.ws.orbimpl.transport.WSSSLTransportConnection.createSocket(WSSSLTransportConnection.java:311)
at com.ibm.CORBA.transport.TransportConnectionBase.connect(TransportConnectionBase.java:359)
at com.ibm.ws.orbimpl.transport.WSTransport$1.run(WSTransport.java:495)
at com.ibm.ws.security.util.AccessController.doPrivileged(AccessController.java:118)
at com.ibm.ws.orbimpl.transport.WSTransport.getConnection(WSTransport.java:492)
at com.ibm.CORBA.transport.TransportBase.getConnection(TransportBase.java:187)
at com.ibm.rmi.iiop.TransportManager.get(TransportManager.java:96)
at com.ibm.rmi.iiop.GIOPImpl.getConnection(GIOPImpl.java:129)
at com.ibm.rmi.iiop.GIOPImpl.locate(GIOPImpl.java:205)
at com.ibm.rmi.corba.ClientDelegate.locate(ClientDelegate.java:1966)
at com.ibm.rmi.corba.ClientDelegate._createRequest(ClientDelegate.java:1991)
at com.ibm.rmi.corba.ClientDelegate.createRequest(ClientDelegate.java:1155)
at com.ibm.rmi.corba.ClientDelegate.createRequest(ClientDelegate.java:1270)
at com.ibm.CORBA.iiop.ClientDelegate.createRequest(ClientDelegate.java:1327)
at com.ibm.rmi.corba.ClientDelegate.createRequest(ClientDelegate.java:1131)
at com.ibm.CORBA.iiop.ClientDelegate.createRequest(ClientDelegate.java:1293)
at com.ibm.rmi.corba.ClientDelegate.request(ClientDelegate.java:1869)
at com.ibm.CORBA.iiop.ClientDelegate.request(ClientDelegate.java:1249)
at org.omg.CORBA.portable.ObjectImpl._request(ObjectImpl.java:458)
at com.ibm.WsnOptimizedNaming._NamingContextStub.resolve_complete_info(_NamingContextStub.java:488)
从报错信息看,估计是dmgr与nodeagent之间的SSL证书信任出了问题(双方握手时找不到受信任的证书)。该问题以前从没遇到过啊!OMG!一通胡思乱想之后,估计问题大致如下:
此类问题发生的原因,是由于启用了管理安全性之后,WAS环境中的SSL通讯会涉及到安全证书(certificate),而安全证书是有一定有效期的。安全证书超过有效期之后会失效(expire),从而发生管理的问题。
特别是在集群环境下,部署管理器(dmgr)与各个结点(node)之间需要安全通讯,dmgr和node的配置中都保存有相应的证书。当这些证书由于各种原因(例如证书过期后被自动更新)变得不一致时,也会出现SSLHandshakeException,导致Dmgr无法正常管理或者同步node。我估摸着大概率是这种证书不一致的问题,证书到期本身应该不是问题所在。
具体解决方案如下:
取消Dmgr和各个结点安全证书的差异性,简化SSL配置,此方案仅适用于使用WAS默认证书配置对证书没有特殊要求的WAS用户。
1.检查WAS补丁,最好是6.1.0.9之上(我是6.1.0.45)。备份Dmgr以及各Node的WAS配置。通过backupConfig.sh脚本执行备份。注意一定得备份备份备份!重要的事情说三遍!
2.登录管理控制台,在dmgr下选择管理端点安全配置。
注意:选择各个结点,一定要取消“覆盖继承的值”选项后选择保存
3. 停止dmgr和各个node,删除dmgr和node的profile目录下所有*.p12文件(生产环境删除前必须先备份)
4. 启动dmgr,会发现在config/cells/xxx下面会重新生成key.p12和trust.p12,以这个为蓝本,copy到原来有*.p12的所有地方
5.分别从命令行停止和启动dmgr和各个node agent,如果命令行提示是否接受新的证书,选择接受。然后环境一切正常了。
6.(可选)如果环境中还有Web Server,记得产生新的plugin密钥文件并手工拷贝到Web Server所在机器。
本方案适用于使用WAS默认证书配置对证书没有特殊要求的WAS用户。
最后贴个正常的日志,已经凌晨两点多了,4个小时,憋了这么个case出来。特此记录。