暂无图片
暂无图片
暂无图片
暂无图片
暂无图片

如何监控K8S Java应用OOM并告警

116

背景

由于用户量和业务量的增加,或者代码编写不合理,线上Java应用经常出现OOM。我们线上的应用均部署在k8s中,随着Pod内容器的重启,dump的堆文件也会被删除,导致我们一直很难定位问题。

如何解决

  1. 首先我们需要解决dump文件被删除的问题:通过Sidecar容器与Java容器共享存放dump文件的目录,避免堆文件随Java容器重启而丢失

  2. 在我们的Sidecar容器中,我们会对存放dump文件的目录进行watch,如果发现有变更就会将dump文件上传至OSS,并进行预警,方便研发下载查看

通过Go代码实现OSS文件上传和钉钉告警

  1. package main


  2. import(

  3. "fmt"

  4. "github.com/aliyun/aliyun-oss-go-sdk/oss"

  5. "log"

  6. "net"

  7. "os"

  8. "path/filepath"

  9. "time"

  10. )


  11. func main() {

  12. var dumpPath = getEnvDefault("OPS_APPLICATION_MEM_DUMP_PATH", "")

  13. var enable = getEnvDefault("OPS_APPLICATION_MEM_DUMP_ENABLE", "true")

  14. if enable != "true"{

  15. return

  16. }

  17. uploadFile(dumpPath)

  18. }


  19. func uploadFile(file string) {

  20. fmt.Printf("start upload ali yun oss, file: %s \n", file)

  21. var accessKeyId = getEnvDefault("OPS_ALI_KEY_ID", "")

  22. var accessKeySecret = getEnvDefault("OPS_ALI_KEY_SECRET", "")

  23. var bucketName = getEnvDefault("OPS_APPLICATION_MEM_DUMP_BUCKET", "")

  24. var endpoint = getEnvDefault("OPS_APPLICATION_OSS_ENDPOINT", "")

  25. var applicationName = getEnvDefault("OPS_APPLICATION_NAME", "")

  26. client, err := oss.New(endpoint, accessKeyId, accessKeySecret)

  27. if err != nil{

  28. log.Fatalf("create oss link failed: %v", err)

  29. }

  30. bucket, err := client.Bucket(bucketName)

  31. if err != nil{

  32. log.Fatalf("get oss bucket failed: %v", err)

  33. }

  34. var filename = filepath.Base(file) //获取文件名

  35. // 设置存储类型为标准存储。

  36. var objectName = "k8s/application-mem-dump/"+ applicationName + "/"+ formatDateTime() + filename

  37. if err := uploadMultipart(bucket, objectName, file, int64(5*1024*1024)); err != nil{

  38. log.Fatalf("Failed to upload multipart: %v", err)

  39. }

  40. sendAlert(applicationName, bucketName, objectName)

  41. }


  // sendAlert is the hook for notifying the team that a memory dump has been
  // uploaded. It receives the application name, the bucket and the object key
  // of the uploaded dump. It is currently a stub and does nothing.
  func sendAlert(applicationName string, bucket string, memoryDumpPath string) {
  	// Put the DingTalk alert logic here.
  }


  // getEnvDefault returns the value of the environment variable named key.
  // If the variable is not present in the environment, defaultVal is returned.
  // A variable that is present but empty yields the empty string, not the
  // default.
  func getEnvDefault(key, defaultVal string) string {
  	if val, found := os.LookupEnv(key); found {
  		return val
  	}
  	return defaultVal
  }


  // getHostIp returns the first non-loopback IPv4 address reported by
  // net.InterfaceAddrs, formatted as a string. It returns "" when the address
  // list cannot be obtained (the error is printed to stdout) or when no such
  // address exists.
  func getHostIp() string {
  	addrs, err := net.InterfaceAddrs()
  	if err != nil {
  		fmt.Println("get current host ip err: ", err)
  		return ""
  	}

  	for _, addr := range addrs {
  		ipNet, isIPNet := addr.(*net.IPNet)
  		if !isIPNet || ipNet.IP.IsLoopback() {
  			continue
  		}
  		// To4 is nil for addresses that are not IPv4.
  		if ipNet.IP.To4() == nil {
  			continue
  		}
  		return ipNet.IP.String()
  	}
  	return ""
  }


  // formatDateTime returns the current time as a compact 14-digit timestamp of
  // the form YYYYMMDDhhmmss.
  func formatDateTime() string {
  	const layout = "20060102150405"
  	now := time.Now()
  	return now.Format(layout)
  }


  72. // 分片上传函数

  73. func uploadMultipart(bucket *oss.Bucket, objectName, localFilename string, partSize int64) error {

  74. // 将本地文件分片

  75. chunks, err := oss.SplitFileByPartSize(localFilename, partSize)

  76. if err != nil{

  77. return fmt.Errorf("failed to split file into chunks: %w", err)

  78. }


  79. // 打开本地文件。

  80. file, err := os.Open(localFilename)

  81. if err != nil{

  82. return fmt.Errorf("failed to open file: %w", err)

  83. }

  84. defer file.Close()


  85. // 步骤1:初始化一个分片上传事件。

  86. imur, err := bucket.InitiateMultipartUpload(objectName)

  87. if err != nil{

  88. return fmt.Errorf("failed to initiate multipart upload: %w", err)

  89. }


  90. // 步骤2:上传分片。

  91. var parts []oss.UploadPart

  92. for _, chunk := range chunks {

  93. part, err := bucket.UploadPart(imur, file, chunk.Size, chunk.Number)

  94. if err != nil{

  95. // 如果上传某个部分失败,尝试取消整个上传任务。

  96. if abortErr := bucket.AbortMultipartUpload(imur); abortErr != nil{

  97. log.Printf("Failed to abort multipart upload: %v", abortErr)

  98. }

  99. return fmt.Errorf("failed to upload part: %w", err)

  100. }

  101. parts = append(parts, part)

  102. }


  103. // 指定Object的读写权限为私有,默认为继承Bucket的读写权限。

  104. objectAcl := oss.ObjectACL(oss.ACLPrivate)


  105. // 步骤3:完成分片上传。

  106. _, err = bucket.CompleteMultipartUpload(imur, parts, objectAcl)

  107. if err != nil{

  108. // 如果完成上传失败,尝试取消上传。

  109. if abortErr := bucket.AbortMultipartUpload(imur); abortErr != nil{

  110. log.Printf("Failed to abort multipart upload: %v", abortErr)

  111. }

  112. return fmt.Errorf("failed to complete multipart upload: %w", err)

  113. }


  114. log.Printf("Multipart upload completed successfully.")

  115. returnnil

  116. }

复制

编写Dockerfile打包成Image方便Sidecar容器拉取

  # Sidecar image: ships inotify-tools plus the pre-built k8s-memory-dump binary.
  FROM arm64v8/golang:1.23.1-alpine

  # inotify-tools provides inotifywait, which the sidecar uses to watch the
  # dump directory.
  RUN set -eux \
      && apk update \
      && apk add --no-cache inotify-tools

  # k8s-memory-dump is the uploader binary compiled from the Go source.
  COPY k8s-memory-dump /usr/local/bin/k8s-memory-dump
  # The binary only needs to be readable and executable; it does not need to be
  # world-writable.
  RUN chmod 755 /usr/local/bin/k8s-memory-dump

  # Use /bin/sh: this image installs no bash package.
  # NOTE(review): assumes the Alpine base image ships no /bin/bash - confirm.
  ENTRYPOINT ["/bin/sh"]

复制
  1. inotify-tools用来对dump文件进行watch

  2. k8s-memory-dump是我们通过go源码编译好的二进制文件

Deployments改进

  apiVersion: apps/v1
  kind: Deployment
  metadata:
    namespace: application-prod
    name: jvm-applictaion
    labels:
      app: jvm-applictaion
  spec:
    replicas: 12
    selector:
      matchLabels:
        app: jvm-applictaion
    strategy:
      type: RollingUpdate
      rollingUpdate:
        maxUnavailable: 50%
        maxSurge: 50%
    template:
      metadata:
        labels:
          app: jvm-applictaion
      spec:
        restartPolicy: Always
        volumes:
          # Shared scratch volume: both containers mount it, so a dump written
          # by the Java container is visible to the sidecar.
          - name: memory-dump
            emptyDir: {}
        containers:
          # Sidecar: watches the shared directory and runs the uploader whenever
          # a file in it has been written and closed.
          - name: memory-dump-monitor
            image: xxxx:k8s-memory-dump-v1.0
            command: ["/bin/sh", "-c"]
            args:
              # Watch the absolute mount path (/memory-dumps, see volumeMounts
              # below), not a path relative to the working directory.
              - >
                inotifywait -m /memory-dumps -e close_write | while read path
                action file; do
                k8s-memory-dump;
                done;
            envFrom:
              - secretRef:
                  name: k8s-memory-dump.cert
            env:
              # The uploader always uploads this exact path.
              # NOTE(review): the JVM must write its dump as heapdump.hprof in
              # the shared directory for this to match - confirm.
              - name: OPS_APPLICATION_MEM_DUMP_PATH
                value: /memory-dumps/heapdump.hprof
              - name: OPS_APPLICATION_NAME
                value: jvm-applictaion
              - name: OPS_APPLICATION_MEM_DUMP_ENABLE
                value: 'true'
            volumeMounts:
              - name: memory-dump
                mountPath: /memory-dumps
            imagePullPolicy: Always
          # Application container: writes heap dumps into the shared volume.
          - name: jvm-applictaion
            image: xxxx:jvm-applictaion-image
            volumeMounts:
              - name: memory-dump
                mountPath: /tmp/java-dump/

复制

在上面的Yaml中我只保留了关键信息,总的原理就是通过memory-dump的Volume实现存储共享,将我们Java容器在/tmp/java-dump/生成的dump文件共享到Sidecar容器的/memory-dumps目录下,Sidecar容器如果检测到文件发生变化,则会调用我们的go程序将dump文件上传到OSS并预警,效果图如下: 


文章转载自程序员修炼笔记,如果涉嫌侵权,请发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。

评论