
Prometheus in Practice (Part 2): Alerting Rules


1. Add custom alerting rules
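Each of the YAML files below is a PrometheusRule custom resource. The Prometheus Operator watches these objects, renders them into plain Prometheus rule files, and loads them into every Prometheus instance whose spec.ruleSelector matches the objects' labels. In a stock kube-prometheus install the selector typically requires the labels prometheus: k8s and role: alert-rules (this is an assumption about the defaults; check your own Prometheus CR). A minimal sketch for inspecting the selector, assuming the default CR name "k8s" in the "monitoring" namespace:

# Show which labels the Prometheus CR expects on PrometheusRule objects
kubectl -n monitoring get prometheus k8s -o jsonpath='{.spec.ruleSelector}{"\n"}'

If the selector is non-empty, add the matching labels under metadata.labels in the manifests below, or label the objects after creating them.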

Alerting rules for Kubernetes components: k8s-prometheusRule.yaml

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-rules
  namespace: monitoring
spec:
  groups:
  - name: k8s
    rules:
    - alert: K8S的APISERVER潜在危险过高
      annotations:
        description: '{{ $labels.cluster }} 集群 API server 的 {{ $labels.verb }} {{ $labels.resource }} 有异常延迟 {{ $value }} 秒!'
      expr: |
        (
          cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"}
          >
          on (verb) group_left()
          (
            avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
            +
            2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
          )
        ) > on (verb) group_left()
        1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
        and on (verb,resource)
        cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
        >
        1
      for: 5m
      labels:
        severity: warning
    - alert: K8S的APISERVER潜在致命风险
      annotations:
        description: '{{ $labels.cluster }} 集群 API server 的 {{ $labels.verb }} {{ $labels.resource }} 有 99% 的请求的延迟达 {{ $value }} 秒!'
      expr: |
        cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} > 4
      for: 10m
      labels:
        severity: critical
    - alert: K8S的APISERVER存在返回错误过高
      annotations:
        description: '{{ $labels.cluster }} 集群 API server 请求中有 {{ $value | humanizePercentage }} 的返回错误!'
      expr: |
        sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m]))
          /
        sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.03
      for: 10m
      labels:
        severity: critical
    - alert: K8S的APISERVER存在返回错误
      annotations:
        description: '{{ $labels.cluster }} 集群 API server 请求中有 {{ $value | humanizePercentage }} 的返回错误!'
      expr: |
        sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m]))
          /
        sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.01
      for: 10m
      labels:
        severity: warning
    - alert: K8S的APISERVER资源存在返回错误过高
      annotations:
        description: '{{ $labels.cluster }} 集群 API server 的 {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }} 的请求中有 {{ $value | humanizePercentage }} 的返回错误!'
      expr: |
        sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) by (resource,subresource,verb,cluster)
          /
        sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb,cluster) > 0.10
      for: 10m
      labels:
        severity: critical
    - alert: K8S的APISERVER资源存在返回错误
      annotations:
        description: '{{ $labels.cluster }} 集群 API server 的 {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }} 的请求中有 {{ $value | humanizePercentage }} 的返回错误!'
      expr: |
        sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) by (resource,subresource,verb,cluster)
          /
        sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb,cluster) > 0.05
      for: 10m
      labels:
        severity: warning
    - alert: K8S客户端证书将在一个月后过期
      annotations:
        description: '{{ $labels.cluster }} 集群一个 K8S 的客户端证书将在一个月后过期!'
      expr: |
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 * 30
      labels:
        severity: critical
    - alert: APISERVER掉线
      annotations:
        description: '{{ $labels.cluster }} 集群 Prometheus Targets 无法发现 APISERVER!'
      expr: |
        absent(up{job="apiserver"} == 1)
      for: 15m
      labels:
        severity: critical

    - alert: KubeControllerManager掉线
      annotations:
        description: KubeControllerManager 从 Prometheus Targets 的发现中消失!
      expr: |
        absent(up{job="kube-controller-manager-svc"} == 1)
      for: 15m
      labels:
        severity: critical
    
    - alert: K8S节点未就绪
      annotations:
        description: '{{ $labels.cluster }} 集群K8S节点 {{ $labels.node }} 处于未就绪状态已超过15分钟!'
      expr: |
        kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
      for: 15m
      labels:
        severity: warning
    - alert: K8S节点不可达
      annotations:
        description: '{{ $labels.cluster }} 集群K8S节点 {{ $labels.node }} 不可达,一部分工作负载已重新调度!'
      expr: |
        kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} == 1
      labels:
        severity: warning
    - alert: Kubelet掉线
      annotations:
        description: '{{ $labels.cluster }} 集群 Prometheus Targets 无法发现 Kubelet {{ $labels.node }}!'
      expr: |
        absent(up{job="kubelet", metrics_path="/metrics"} == 1)
      for: 15m
      labels:
        severity: critical

    - alert: K8SScheduler掉线
      annotations:
        description: KubeScheduler 从 Prometheus Targets 的发现中消失!
      expr: |
        absent(up{job="kube-scheduler-svc"} == 1)
      for: 15m
      labels:
        severity: critical

    - alert: K8S的PV使用量超过80%
      annotations:
        description: '{{ $labels.cluster }} 集群命名空间 {{ $labels.namespace }} 中被PVC {{ $labels.persistentvolumeclaim }} 声明的PV只剩下 {{ $value | humanizePercentage }} 空闲!'
      expr: |
        kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
        /
        kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
        < 0.2
      for: 5m
      labels:
        severity: critical
    - alert: KubePersistentVolumeFullInFourDays
      annotations:
        description: '{{ $labels.cluster }} 集群通过抽样计算,命名空间 {{ $labels.namespace }} 中被PVC {{ $labels.persistentvolumeclaim }} 声明的PV将在4天内用尽,当前剩余 {{ $value | humanizePercentage }}!'
      expr: |
        (
          kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
          /
          kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
        ) < 0.15
        and
        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
      for: 1h
      labels:
        severity: critical
    - alert: K8S的PV错误
      annotations:
        description: '{{ $labels.cluster }} 集群 PV {{ $labels.persistentvolume }} 的状态为 {{ $labels.phase }}!'
      expr: |
        kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
      for: 5m
      labels:
        severity: critical

Alerting rules for cluster nodes: node-prometheusRule.yaml

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: node-rules
  namespace: monitoring
spec:
  groups: 
  - name: nodes
    rules:
    - alert: 节点文件系统将在24小时内用完
      annotations:
        description: '{{ $labels.cluster }} 集群的 {{ $labels.instance }} 节点的文件系统的 {{ $labels.device }} 设备只剩下 {{ printf "%.2f" $value }}% 可使用空间,速率计算可能在24小时内填满!'
      expr: |
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
        and
          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
        )
      for: 1h
      labels:
        severity: warning
    - alert: 节点文件系统将在4小时内用完
      annotations:
        description: '{{ $labels.cluster }} 集群的 {{ $labels.instance }} 节点的文件系统的 {{ $labels.device }} 设备只剩下 {{ printf "%.2f" $value }}% 可使用空间,速率计算可能在4小时内填满!'
      expr: |
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 20
        and
          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
        )
      for: 1h
      labels:
        severity: warning
    - alert: 节点磁盘使用率超过80%
      annotations:
        description: '{{ $labels.cluster }} 集群的 {{ $labels.instance }} 节点的文件系统的 {{ $labels.device }} 设备只剩下 {{ printf "%.2f" $value }}% 可使用空间!'
      expr: |
        (
          node_filesystem_avail_bytes{job=~"node-exporter|harbor",fstype!=""} / node_filesystem_size_bytes{job=~"node-exporter|harbor",fstype!=""} * 100 < 20
        and
          node_filesystem_readonly{job=~"node-exporter|harbor",fstype!=""} == 0
        )
      for: 5m
      labels:
        severity: critical
    - alert: 节点磁盘使用率超过90%
      annotations:
        description: '{{ $labels.cluster }} 集群的 {{ $labels.instance }} 节点的文件系统的 {{ $labels.device }} 设备只剩下 {{ printf "%.2f" $value }}% 可使用空间!'
      expr: |
        (
          node_filesystem_avail_bytes{job=~"node-exporter|harbor",fstype!=""} / node_filesystem_size_bytes{job=~"node-exporter|harbor",fstype!=""} * 100 < 10
        and
          node_filesystem_readonly{job=~"node-exporter|harbor",fstype!=""} == 0
        )
      for: 5m
      labels:
        severity: critical
    - alert: 节点挂载的文件系统空闲的文件节点个数24小时内用完
      annotations:
        description: '{{ $labels.cluster }} 集群的 {{ $labels.instance }} 节点的文件系统的 {{ $labels.device }} 设备只剩下 {{ printf "%.2f" $value }}% 可使用空间,速率计算可能在24小时内填满!'
      expr: |
        (
          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
        and
          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
        )
      for: 1h
      labels:
        severity: warning
    - alert: 节点挂载的文件系统空闲的文件节点个数4小时内用完
      annotations:
        description: '{{ $labels.cluster }} 集群的 {{ $labels.instance }} 节点的文件系统的 {{ $labels.device }} 设备只剩下 {{ printf "%.2f" $value }}% 可使用空间,速率计算可能在4小时内填满!'
      expr: |
        (
          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
        and
          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
        )
      for: 1h
      labels:
        severity: critical
    - alert: 节点CPU使用大于80%
      expr: 100 - (avg by(cluster,instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
      for: 5m
      labels:
        severity: critical
      annotations:
        description: '{{ $labels.cluster }} 集群 {{ $labels.instance }} 节点 CPU 使用率在 3m 内持续达到 {{ printf "%.0f" $value }}%!'
    - alert: 节点内存使用率超过80%
      annotations:
        description: '{{ $labels.cluster }} 集群 {{ $labels.instance }} 节点侦测到内存使用率在 3m 内持续达到 {{ printf "%.0f" $value }}%!'
      expr: |
        100 - ( node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 ) > 80
      for: 5m
      labels:
        severity: critical
    - alert: 节点侦测到上下文切换次数过高
      expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 30000
      for: 5m
      labels:
        severity: critical
      annotations:
        description: '{{ $labels.cluster }} 集群 {{ $labels.instance }} 节点侦测到上下文切换次数达到 {{ printf "%.0f" $value }} 次/s!'
    - alert: 节点侦测到打开的文件描述符过多
      expr: avg by (instance,cluster) (node_filefd_allocated) > 102400
      for: 5m
      labels:
        severity: critical
      annotations:
        description: '{{ $labels.cluster }} 集群 {{ $labels.instance }} 节点侦测到打开的文件描述符达到 {{ printf "%.0f" $value }}!'
  

    - alert: Node网络网卡抖动
      annotations:
        description: '{{ $labels.cluster }} 集群侦测到 node-exporter {{ $labels.namespace }}/{{ $labels.pod }} 节点上的网卡 {{ $labels.device }} 状态经常改变!'
      expr: |
        changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
      for: 2m
      labels:
        severity: warning
    - alert: 节点侦测到TCP已分配的套接字数量
      expr: sum(avg_over_time(node_sockstat_TCP_alloc[5m])) by (instance,cluster)  > 5000
      for: 5m
      labels:
        severity: critical
      annotations:
        description: '{{ $labels.cluster }} 集群 {{ $labels.instance }} 节点侦测到 TCP 已分配的套接字数量达到 {{ printf "%.0f" $value }}!'
    - alert: 节点侦测到UDP使用中的套接字数量
      expr: sum(avg_over_time(node_sockstat_UDP_inuse[5m])) by (instance,cluster)  > 5000
      for: 5m
      labels:
        severity: critical
      annotations:
        description: '{{ $labels.cluster }} 集群 {{ $labels.instance }} 节点侦测到UDP使用中的套接字数量达到 {{ printf "%.0f" $value }}!'
    - alert: 节点下行网络错误
      annotations:
        description: '{{ $labels.cluster }} 集群 {{ $labels.instance }} 节点的网络设备 {{ $labels.device }} 在过去2分钟内侦测到 {{ printf "%.0f" $value }} 次下行错误!'
      expr: |
        increase(node_network_receive_errs_total[2m]) > 10
      for: 5m
      labels:
        severity: warning
    - alert: 节点上行网络错误
      annotations:
        description: '{{ $labels.cluster }} 集群 {{ $labels.instance }} 节点的网络设备 {{ $labels.device }} 在过去2分钟内侦测到 {{ printf "%.0f" $value }} 次上行错误!'
      expr: |
        increase(node_network_transmit_errs_total[2m]) > 10
      for: 5m
      labels:
        severity: warning
    - alert: 节点下行带宽过高
      annotations:
        description: '{{ $labels.cluster }} 集群 {{ $labels.instance }} 节点的网络设备 {{ $labels.device }} 下载带宽超过 100MB/s!'
      expr: |
        sum by (cluster,instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
      for: 5m
      labels:
        severity: warning
    - alert: 节点上行带宽过高
      annotations:
        description: '{{ $labels.cluster }} 集群 {{ $labels.instance }} 节点的网络设备 {{ $labels.device }} 上传带宽超过 100MB/s!'
      expr: |
        sum by (cluster,instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
      for: 5m
      labels:
        severity: warning
    - alert: 节点下行丢包率过高
      annotations:
        description: '{{ $labels.cluster }} 集群 {{ $labels.instance }} 节点5分钟内下行丢包率达到 {{ printf "%.0f" $value }}%!'
      expr: |
        sum by (instance,cluster) (irate(node_network_receive_drop_total[3m])) / sum by (instance,cluster) (irate(node_network_receive_packets_total[3m])) * 100 > 80
      for: 5m
      labels:
        severity: critical
    - alert: 节点上行丢包率过高
      annotations:
        description: '{{ $labels.cluster }} 集群 {{ $labels.instance }} 节点5分钟内上行丢包率达到 {{ printf "%.0f" $value }}%!'
      expr: |
        sum by (instance,cluster) (irate(node_network_transmit_drop_total[3m])) / sum by (instance,cluster) (irate(node_network_transmit_packets_total[3m])) * 100 > 80
      for: 5m
      labels:
        severity: critical

Alerting rules for application workloads: k8s-apps-prometheusRule.yaml

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-apps-rules
  namespace: monitoring
spec:
  groups:
  - name: apps
    rules:
    - alert: K8S容器组短时间内多次重启
      annotations:
        description: 'K8S集群容器组 {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) 在5分钟内重启了 {{ printf "%.2f" $value }} 次!'
      expr: |
        rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 10 > 1
      for: 5m
      labels:
        severity: critical
    - alert: K8S容器组调度失败
      annotations:
        description: 'K8S集群容器组 {{ $labels.namespace }}/{{ $labels.pod }} 无符合预期工作节点,无法被调度!'
      expr: |
        sum by (cluster, namespace, pod) (kube_pod_status_scheduled{job="kube-state-metrics",condition="false"}) > 0
      for: 5m
      labels:
        severity: critical
    - alert: K8S容器组NotReady
      annotations:
        description: 'K8S集群 {{ $labels.namespace }}/{{ $labels.pod }} 已处于 not-ready 状态超过15分钟!'
      expr: |
        sum by (namespace, pod, cluster) (max by(namespace, pod, cluster) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod, cluster) group_left(owner_kind) max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})) > 0
      for: 15m
      labels:
        severity: critical
    - alert: K8S部署状态异常
      annotations:
        description: 'K8S集群部署的 {{ $labels.namespace }}/{{ $labels.deployment }} 状态异常,部分实例不可用已达15分钟!'
      expr: |
        kube_deployment_status_replicas_unavailable{cluster="prod"} != 0
      for: 15m
      labels:
        severity: warning
    - alert: K8S部署版本号不匹配
      annotations:
        description: 'K8S集群部署的 {{ $labels.namespace }}/{{ $labels.deployment }} 部署版本号不匹配,这表明部署过程失败,并且没有回滚,已达15分钟!'
      expr: |
        kube_deployment_status_observed_generation{job="kube-state-metrics"}
          !=
        kube_deployment_metadata_generation{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: K8S部署实际副本数与预期数不匹配
      annotations:
        description: 'K8S集群部署 {{ $labels.namespace }}/{{ $labels.deployment }} 部署的实际副本数与预期数不匹配超过15分钟!'
      expr: |
        kube_deployment_spec_replicas{job="kube-state-metrics"}
          !=
        kube_deployment_status_replicas_available{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: K8S有状态部署实际副本数与预期数不匹配
      annotations:
        description: 'K8S集群有状态部署 {{ $labels.namespace }}/{{ $labels.statefulset }} 有状态部署的实际副本数与预期数不匹配超过15分钟!'
      expr: |
        kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
          !=
        kube_statefulset_status_replicas{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: K8S容器等待中
      annotations:
        description: 'K8S集群容器组 {{ $labels.namespace }}/{{ $labels.pod }} 中的 {{ $labels.container}} 容器已处于等待状态超过1小时!'
      expr: |
        sum by (cluster, namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
      for: 1h
      labels:
        severity: warning
    - alert: K8S的HPA副本数不匹配
      annotations:
        description: 'K8S集群HPA {{ $labels.namespace }}/{{ $labels.hpa }} 与预期副本数不匹配已经超过15分钟!'
      expr: |
        (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"} != kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}) and changes(kube_horizontalpodautoscaler_status_current_replicas[15m]) == 0
      for: 15m
      labels:
        severity: warning
    - alert: 侦测到K8S的HPA缩容
      annotations:
        description: 'K8S集群 HPA {{ $labels.namespace }}/{{ $labels.hpa }} 已触发缩容,可用副本数达到预期,当前预期 {{ printf "%.0f" $value }} !'
      expr: |
        (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"} == kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}) and delta(kube_horizontalpodautoscaler_status_current_replicas[5m]) < 0
      for: 1m
      labels:
        severity: warning
    - alert: 侦测到K8S的HPA扩容
      annotations:
        description: 'K8S集群 HPA {{ $labels.namespace }}/{{ $labels.hpa }} 已触发扩容,可用副本数达到预期,当前预期 {{ printf "%.0f" $value }} !!'
      expr: |
        (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"} == kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}) and delta(kube_horizontalpodautoscaler_status_current_replicas[5m]) > 0
      for: 1m
      labels:
        severity: warning
    - alert: K8S工作负载的HPA保持满载
      annotations:
        description: 'K8S集群 HPA {{ $labels.namespace }}/{{ $labels.hpa }} 以限制最大副本数满载运行超过了15分钟!'
      expr: |
        kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} == kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: warning

Alerting rules for Prometheus itself: prometheus-prometheusRule.yaml

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-rules
  namespace: monitoring
spec:
  groups:
  - name: prometheus
    rules:

    - alert: Prometheus错误的配置
      annotations:
        description: '{{ $labels.cluster }} 集群 Prometheus {{$labels.namespace}}/{{$labels.pod}} 在重载配置时失败!'
      expr: |
        max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0
      for: 10m
      labels:
        severity: critical
    - alert: Prometheus通知队列已满
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} 的报警通知队列预计将在30分钟内写满!
      expr: |
        (
          predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30)
        >
          min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
      for: 15m
      labels:
        severity: warning
    - alert: Prometheus在推送警报时发生错误
      annotations:
        description: '{{ $labels.cluster }} 集群 {{$labels.namespace}}/{{$labels.pod}} 在推送警报至某些 Alertmanager {{$labels.alertmanager}} 时出现了 {{ printf "%.1f" $value }}% 的错误!'
      expr: |
        (
          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        /
          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
        * 100
        > 1
      for: 15m
      labels:
        severity: warning
    - alert: Prometheus在推送警报时全部错误
      annotations:
        description: '{{ $labels.cluster }} 集群 {{$labels.namespace}}/{{$labels.pod}} 在推送警报至全部 Alertmanager {{$labels.alertmanager}} 时出现了 {{ printf "%.1f" $value }}% 的错误!'
      expr: |
        min without(alertmanager) (
          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        /
          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
        * 100
        > 3
      for: 15m
      labels:
        severity: critical
    - alert: Prometheus未连接Alertmanagers
      annotations:
        description: '{{ $labels.cluster }} 集群 Prometheus {{$labels.namespace}}/{{$labels.pod}} 没有连接到任何 Alertmanagers!'
      expr: |
        max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusTSDB重载失败
      annotations:
        description: '{{ $labels.cluster }} 集群在过去的3小时内 Prometheus {{$labels.namespace}}/{{$labels.pod}} 侦测到 {{$value | humanize}} 个重载错误!'
      expr: |
        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
      for: 4h
      labels:
        severity: warning
    - alert: PrometheusTSDB压缩失败
      annotations:
        description: '{{ $labels.cluster }} 集群在过去的3小时内 Prometheus {{$labels.namespace}}/{{$labels.pod}} 侦测到 {{$value | humanize}} 个压缩错误!'
      expr: |
        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
      for: 4h
      labels:
        severity: warning
    - alert: Prometheus没有采集到数据样本
      annotations:
        description: '{{ $labels.cluster }} 集群 Prometheus {{$labels.namespace}}/{{$labels.pod}} 没有采集到数据样本!'
      expr: |
        rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
      for: 10m
      labels:
        severity: warning
    - alert: Prometheus重复的时间戳
      annotations:
        description: '{{ $labels.cluster }} 集群 Prometheus {{$labels.namespace}}/{{$labels.pod}} 正在丢弃 {{ printf "%.4g" $value  }} 拥有相同时间戳不同数据的数据样本!'
      expr: |
        rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
    - alert: Prometheus时间戳超过限制
      annotations:
        description: '{{ $labels.cluster }} 集群 Prometheus {{$labels.namespace}}/{{$labels.pod}} 正在丢弃 {{ printf "%.4g" $value  }} 超过时间戳限制的数据样本!'
      expr: |
        rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
    - alert: Prometheus远程存储失败
      annotations:
        description: '{{ $labels.cluster }} 集群 Prometheus {{$labels.namespace}}/{{$labels.pod}} 在推送数据至远程存储队列 {{$labels.queue}} 时有 {{ printf "%.1f" $value }}% 的错误!'
      expr: |
        (
          rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        /
          (
            rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
          +
            rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
          )
        )
        * 100
        > 1
      for: 15m
      labels:
        severity: critical
    - alert: Prometheus远程数据写落后
      annotations:
        description: '{{ $labels.cluster }} 集群 Prometheus {{$labels.namespace}}/{{$labels.pod}} 远程写落后于队列 {{$labels.queue}} {{ printf "%.1f" $value }} 秒!'
      expr: |
        (
          max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
        - on(job, instance) group_right
          max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
        > 120
      for: 15m
      labels:
        severity: critical
    - alert: Prometheus远程写预期切片
      annotations:
        description: '{{ $labels.cluster }} 集群 Prometheus {{$labels.namespace}}/{{$labels.pod}} 远程写的预期切片数估计需要 {{ $value }} shards, 大于最大值 {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}!'
      expr: |
        (
          max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m])
        >
          max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
      for: 15m
      labels:
        severity: warning
    - alert: Prometheus规则错误
      annotations:
        description: '{{ $labels.cluster }} 集群在5分钟内 Prometheus {{$labels.namespace}}/{{$labels.pod}} 评估 {{ printf "%.0f" $value }} 条规则失败!'
      expr: |
        increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 15m
      labels:
        severity: critical
    - alert: Prometheus缺少规则评估
      annotations:
        description: '{{ $labels.cluster }} 集群在过去5分钟内 Prometheus {{$labels.namespace}}/{{$labels.pod}} 错过了 {{ printf "%.0f" $value }} 规则组评估!'
      expr: |
        increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 15m
      labels:
        severity: warning

    - alert: Alertmanager配置不一致
      annotations:
        description: '{{ $labels.cluster }} 集群Alertmanager集群的节点之间配置不同步 {{ $labels.service }}!'
      expr: |
        count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (cluster, service) / ON(cluster, service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}) by (cluster, name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
      for: 5m
      labels:
        severity: critical
    - alert: Alertmanager重载失败
      annotations:
        description: '{{ $labels.cluster }} 集群在重载Alertmanager配置时失败 {{ $labels.namespace }}/{{ $labels.pod }}!'
      expr: |
        alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
      for: 10m
      labels:
        severity: warning
    - alert: Alertmanager成员不一致
      annotations:
        description: '{{ $labels.cluster }} 集群Alertmanager未找到群集的所有成员!'
      expr: |
        alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}
          != on (cluster,service) GROUP_LEFT()
        count by (cluster,service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"})
      for: 5m
      labels:
        severity: critical

2. Deploy the custom alerting rules
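Before applying the manifests, you can optionally lint the rule definitions locally. This is only a sketch and assumes promtool (shipped with Prometheus) and yq v4 are installed; it works because the spec section of a PrometheusRule has the same layout as a plain Prometheus rule file:

# Extract spec.groups from each PrometheusRule and check the rule and PromQL syntax
for f in k8s-prometheusRule.yaml node-prometheusRule.yaml k8s-apps-prometheusRule.yaml prometheus-prometheusRule.yaml; do
  yq '.spec' "$f" > "/tmp/${f%.yaml}-rules.yaml"
  promtool check rules "/tmp/${f%.yaml}-rules.yaml"
done

Then apply the four manifests: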

kubectl apply -f k8s-prometheusRule.yaml
kubectl apply -f node-prometheusRule.yaml
kubectl apply -f k8s-apps-prometheusRule.yaml
kubectl apply -f prometheus-prometheusRule.yaml
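After applying, confirm that the four PrometheusRule objects exist in the monitoring namespace. If a group later fails to show up in Prometheus, the most common cause is missing ruleSelector labels; the label values below are the kube-prometheus defaults and are only an assumption, so match them to your own Prometheus CR:

kubectl -n monitoring get prometheusrules

# Only needed if your Prometheus CR's ruleSelector requires these labels:
kubectl -n monitoring label prometheusrule k8s-rules node-rules k8s-apps-rules prometheus-rules \
  prometheus=k8s role=alert-rules --overwrite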

3. Verify the alerting rules in Prometheus

In the Prometheus web UI, under Status -> Rules, you can find the custom rule groups configured above, namely k8s, nodes, apps, and prometheus, together with the alerting rules defined in each group.
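Besides the web UI, you can also verify that the groups were loaded through the Prometheus HTTP API. A sketch assuming the default kube-prometheus service name prometheus-k8s and that jq is installed:

# Forward the Prometheus service locally, then list the loaded rule group names
kubectl -n monitoring port-forward svc/prometheus-k8s 9090:9090 &
curl -s http://localhost:9090/api/v1/rules | jq -r '.data.groups[].name'
# expected output includes: k8s, nodes, apps, prometheus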
