背景
由于用户量和业务量的增加,或者代码编写不合理,线上Java应用经常出现OOM。我们线上的应用均部署在k8s中,随着Pod内容器的重启,dump的堆文件也会被删除,导致我们一直很难定位问题。
如何解决
首先我们需要解决dump文件被删除的问题:这里我们通过Sidecar容器与Java容器共享存放dump文件的目录,使堆文件不会随着Java容器的重启而丢失
在我们的Sidecar容器中,我们会对存放dump文件的目录进行watch,如果发现有变更就会将dump文件上传至OSS,并进行预警,方便研发下载查看
通过Go代码实现OSS文件上传和钉钉告警
package main
import(
"fmt"
"github.com/aliyun/aliyun-oss-go-sdk/oss"
"log"
"net"
"os"
"path/filepath"
"time"
)
// main is the entry point of the heap-dump uploader.
//
// It is controlled by two environment variables:
//   - OPS_APPLICATION_MEM_DUMP_ENABLE: uploading is skipped unless the value is
//     exactly "true" (the default when the variable is unset).
//   - OPS_APPLICATION_MEM_DUMP_PATH: local path of the dump file to upload.
//
// When uploading is enabled but no dump path is configured, the process exits
// with a non-zero status right away instead of failing later inside the upload
// with a less obvious error.
func main() {
	if getEnvDefault("OPS_APPLICATION_MEM_DUMP_ENABLE", "true") != "true" {
		return
	}

	dumpPath := getEnvDefault("OPS_APPLICATION_MEM_DUMP_PATH", "")
	if dumpPath == "" {
		log.Fatal("OPS_APPLICATION_MEM_DUMP_PATH is not set; nothing to upload")
	}

	uploadFile(dumpPath)
}
// uploadFile uploads the local file at the given path to Aliyun OSS and then
// calls sendAlert with the resulting object key.
//
// Credentials, endpoint, bucket and application name are read from the
// environment (each defaulting to the empty string). The object key has the
// form
//
//	k8s/application-mem-dump/<application name>/<timestamp><file base name>
//
// where the timestamp comes from formatDateTime. Any failure while creating
// the client, resolving the bucket or uploading terminates the process via
// log.Fatalf.
func uploadFile(file string) {
	fmt.Printf("start upload ali yun oss, file: %s \n", file)

	var (
		accessKeyID     = getEnvDefault("OPS_ALI_KEY_ID", "")
		accessKeySecret = getEnvDefault("OPS_ALI_KEY_SECRET", "")
		bucketName      = getEnvDefault("OPS_APPLICATION_MEM_DUMP_BUCKET", "")
		endpoint        = getEnvDefault("OPS_APPLICATION_OSS_ENDPOINT", "")
		applicationName = getEnvDefault("OPS_APPLICATION_NAME", "")
	)

	client, err := oss.New(endpoint, accessKeyID, accessKeySecret)
	if err != nil {
		log.Fatalf("create oss link failed: %v", err)
	}

	bucket, err := client.Bucket(bucketName)
	if err != nil {
		log.Fatalf("get oss bucket failed: %v", err)
	}

	// Part size handed to the multipart upload: 5 MiB.
	const partSize int64 = 5 * 1024 * 1024

	baseName := filepath.Base(file)
	objectName := "k8s/application-mem-dump/" + applicationName + "/" + formatDateTime() + baseName

	if err := uploadMultipart(bucket, objectName, file, partSize); err != nil {
		log.Fatalf("Failed to upload multipart: %v", err)
	}

	sendAlert(applicationName, bucketName, objectName)
}
// sendAlert is the hook for notifying developers about an uploaded heap dump.
// uploadFile calls it with the application name, the OSS bucket name and the
// OSS object key of the uploaded dump (received here as memoryDumpPath).
// The body is currently an empty placeholder, so no alert is actually sent.
func sendAlert(applicationName string, bucket string, memoryDumpPath string) {
// TODO: implement the DingTalk alert logic here.
}
// getEnvDefault returns the value of the environment variable named key.
// defaultVal is returned only when the variable is not set at all; a variable
// that is set to the empty string yields "".
func getEnvDefault(key, defaultVal string) string {
	if value, found := os.LookupEnv(key); found {
		return value
	}
	return defaultVal
}
// getHostIp returns the first non-loopback IPv4 address found among the
// host's interface addresses, formatted as a string.
//
// It returns "" when the interface addresses cannot be listed (the error is
// printed to standard output) or when no such address exists.
func getHostIp() string {
	addrs, err := net.InterfaceAddrs()
	if err != nil {
		fmt.Println("get current host ip err: ", err)
		return ""
	}

	for _, addr := range addrs {
		ipNet, isIPNet := addr.(*net.IPNet)
		if !isIPNet || ipNet.IP.IsLoopback() {
			continue
		}
		// Skip addresses that have no IPv4 representation.
		if ipNet.IP.To4() == nil {
			continue
		}
		return ipNet.IP.String()
	}
	return ""
}
// formatDateTime returns the current local time as a compact 14-digit
// timestamp in the form yyyyMMddHHmmss.
func formatDateTime() string {
	const compactLayout = "20060102150405"
	now := time.Now()
	return now.Format(compactLayout)
}
// uploadMultipart uploads localFilename to bucket under the key objectName
// using an OSS multipart upload.
//
// The file is divided by oss.SplitFileByPartSize using partSize, and the
// resulting chunks are uploaded one after another from a single open file
// handle. The completed object is given a private ACL instead of inheriting
// the bucket's ACL.
//
// If uploading a part or completing the upload fails, the multipart upload is
// aborted on a best-effort basis (an abort failure is only logged) and the
// original error is returned wrapped. Errors from splitting or opening the
// file are returned wrapped as well. On success it returns nil.
func uploadMultipart(bucket *oss.Bucket, objectName, localFilename string, partSize int64) error {
	// Split the local file into chunks.
	chunks, err := oss.SplitFileByPartSize(localFilename, partSize)
	if err != nil {
		return fmt.Errorf("failed to split file into chunks: %w", err)
	}

	// Open the local file.
	file, err := os.Open(localFilename)
	if err != nil {
		return fmt.Errorf("failed to open file: %w", err)
	}
	defer file.Close()

	// Step 1: initiate the multipart upload.
	imur, err := bucket.InitiateMultipartUpload(objectName)
	if err != nil {
		return fmt.Errorf("failed to initiate multipart upload: %w", err)
	}

	// abort cancels the multipart upload; a failure to abort is logged only,
	// so the caller still receives the error that triggered the abort.
	abort := func() {
		if abortErr := bucket.AbortMultipartUpload(imur); abortErr != nil {
			log.Printf("Failed to abort multipart upload: %v", abortErr)
		}
	}

	// Step 2: upload the parts.
	// NOTE(review): the chunks are read sequentially from the shared handle
	// without seeking to chunk.Offset; this assumes each UploadPart call
	// consumes exactly chunk.Size bytes — confirm against the SDK.
	var parts []oss.UploadPart
	for _, chunk := range chunks {
		part, err := bucket.UploadPart(imur, file, chunk.Size, chunk.Number)
		if err != nil {
			// A failed part invalidates the whole upload, so cancel it.
			abort()
			return fmt.Errorf("failed to upload part: %w", err)
		}
		parts = append(parts, part)
	}

	// Make the object private; by default it would inherit the bucket's ACL.
	objectAcl := oss.ObjectACL(oss.ACLPrivate)

	// Step 3: complete the multipart upload.
	if _, err = bucket.CompleteMultipartUpload(imur, parts, objectAcl); err != nil {
		// Completion failed, so cancel the upload.
		abort()
		return fmt.Errorf("failed to complete multipart upload: %w", err)
	}

	log.Printf("Multipart upload completed successfully.")
	return nil
}
复制
编写Dockerfile打包成Image方便Sidecar容器拉取
# Image for the heap-dump sidecar: inotify-tools provides inotifywait for
# watching the dump directory, and k8s-memory-dump is the prebuilt Go binary.
FROM arm64v8/golang:1.23.1-alpine

# `apk add --no-cache` fetches the package index on the fly, so a separate
# `apk update` step is not needed.
RUN set -eux \
    && apk add --no-cache inotify-tools

# Install the prebuilt uploader binary and make it executable
# (no need for it to be world-writable).
COPY k8s-memory-dump /usr/local/bin/k8s-memory-dump
RUN chmod 755 /usr/local/bin/k8s-memory-dump

# Alpine-based images ship /bin/sh but not bash.
ENTRYPOINT ["/bin/sh"]
复制
inotify-tools用来对dump文件进行watch
k8s-memory-dump是我们通过go源码编译好的二进制文件
Deployments改进
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: application-prod
  name: jvm-applictaion
  labels:
    app: jvm-applictaion
spec:
  replicas: 12
  selector:
    matchLabels:
      app: jvm-applictaion
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 50%
      maxSurge: 50%
  template:
    metadata:
      labels:
        app: jvm-applictaion
    spec:
      restartPolicy: Always
      volumes:
        # Shared between the Java container and the sidecar.
        - name: memory-dump
          emptyDir: {}
      containers:
        # Sidecar: watches the shared directory and runs the uploader whenever
        # a file in it has been completely written (close_write).
        - name: memory-dump-monitor
          image: xxxx:k8s-memory-dump-v1.0
          command: ["/bin/sh", "-c"]
          args:
            # Watch the absolute mount path of the shared volume; a relative
            # path would be resolved against the image's working directory.
            - |
              inotifywait -m /memory-dumps -e close_write |
              while read path action file; do
                k8s-memory-dump
              done
          envFrom:
            - secretRef:
                name: k8s-memory-dump.cert
          env:
            - name: OPS_APPLICATION_MEM_DUMP_PATH
              value: /memory-dumps/heapdump.hprof
            - name: OPS_APPLICATION_NAME
              value: jvm-applictaion
            - name: OPS_APPLICATION_MEM_DUMP_ENABLE
              value: 'true'
          volumeMounts:
            - name: memory-dump
              mountPath: /memory-dumps
          imagePullPolicy: Always
        # Application container: the same volume is mounted at /tmp/java-dump/.
        - name: jvm-applictaion
          image: xxxx:jvm-applictaion-image
          volumeMounts:
            - name: memory-dump
              mountPath: /tmp/java-dump/
复制
在上面的YAML中我只保留了关键信息,总的原理就是通过memory-dump的Volume实现存储共享,将我们Java容器在/tmp/java-dump/生成的dump文件共享到Sidecar容器的/memory-dumps目录下,Sidecar容器如果检测到文件发生变化,则会调用我们的go程序将dump文件上传到OSS并预警,效果图如下: