1、环境说明
因为公司业务需要,prometheus需要添加三种告警通道,分别为:slack、微信、电话
slack告警方式:使用webhook
微信告警方式:使用企业微信
电话告警方式:使用python编写接口,调用阿里云语音服务拨打电话
2、配置slack的webhook
alertmanager中slack告警通道需要使用到:slack的通道名称、slack的webhook地址
在slack中添加频道:
名称:alertmanager-alert
在集成中添加应用,选择Incoming WebHook,安装,再点击Add to Slack,就可以生成webhook的url地址:
https://hooks.slack.com/services/xxxx/xxxx/xxxxx
3、配置企业微信
alertmanager中微信告警通道需要使用到:企业ID、企业微信AgentId、企业微信Secret、部门ID
申请一个企业微信账号,企业微信官网:https://work.weixin.qq.com
企业ID:我的企业->企业信息->企业ID
企业微信AgentId:应用管理->应用->创建自建应用->名称:alertmanager-alert->打开创建的应用alertmanager-alert就可以查看到
AgentId和Secret
部门ID:通讯录->创建一个部门->查看部门ID
4、python编写电话接口
编写电话和短信告警接口:call-sms.py
import json
import os
import time

from aliyunsdkcore.auth.credentials import AccessKeyCredential
from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.request import CommonRequest
from aliyunsdkdyvmsapi.request.v20170525.SingleCallByTtsRequest import SingleCallByTtsRequest
from flask import Flask, request, jsonify
# The service listens on port 8080 and exposes POST webhook endpoints such as
# /prometheus-call and /prometheus-sms (see the route handlers in this file).
# Credentials are read from the environment; the Kubernetes Deployment injects
# ACCESS_KEY_ID / ACCESS_KEY_SECRET from the aliyun-aksk Secret. A missing
# variable raises KeyError at startup instead of failing on the first alert.
ACCESS_KEY_ID = os.environ["ACCESS_KEY_ID"]  # Aliyun AccessKey ID
ACCESS_KEY_SECRET = os.environ["ACCESS_KEY_SECRET"]  # Aliyun AccessKey secret
client = AcsClient(region_id='cn-hangzhou', credential=AccessKeyCredential(ACCESS_KEY_ID, ACCESS_KEY_SECRET))
def init_call(mobile, description):
    """Build a text-to-speech voice-call request for a single phone number.

    Args:
        mobile: Phone number to call.
        description: Alert text filled into the TTS template's
            ``description`` variable.

    Returns:
        A populated ``SingleCallByTtsRequest``; the caller sends it through
        the module-level ``client``.
    """
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    call_request = SingleCallByTtsRequest()
    call_request.set_accept_format('json')
    call_request.set_CalledNumber(mobile)
    call_request.set_TtsCode("你的TtsCode")
    # The API expects TtsParam as a JSON string, so serialize the template
    # variables explicitly instead of handing the SDK a Python dict.
    call_request.set_TtsParam(json.dumps({"description": description, "time": now}))
    return call_request
def init_sms(mobile, description):
    """Build a ``SendSms`` request for the Aliyun SMS service.

    Args:
        mobile: Recipient phone number(s), passed through unchanged as the
            ``PhoneNumbers`` parameter.
        description: Alert text filled into the SMS template's ``describe``
            variable.

    Returns:
        A populated ``CommonRequest``; the caller sends it through the
        module-level ``client``.
    """
    sms_request = CommonRequest()
    sms_request.set_accept_format('json')
    sms_request.set_domain('dysmsapi.aliyuncs.com')
    sms_request.set_protocol_type('https')
    sms_request.set_method('POST')
    sms_request.set_version('2017-05-25')
    sms_request.set_action_name('SendSms')
    # Request parameters.
    sms_request.add_query_param('SignName', "你的SignName")
    sms_request.add_query_param('TemplateCode', "你的TemplateCode")
    sms_request.add_query_param('PhoneNumbers', mobile)
    # TemplateParam must be a JSON string, so serialize the template
    # variables explicitly instead of passing a Python dict.
    sms_request.add_query_param('TemplateParam', json.dumps({"describe": description}))
    return sms_request
# Flask application that hosts the alert webhook endpoints defined below.
app = Flask(__name__)
# Store "prod" under the ENV configuration key.
app.config["ENV"] = "prod"
@app.route("/healthy")
def Healthy():
    """Health-check endpoint: always responds with the body "200"."""
    status_body = "200"
    return status_body
@app.route("/grafana-call",methods=["POST"])
def GrafanaSendCall():
    """Place a voice call to every number named in a Grafana alert.

    The request body is JSON with a ``state`` field and a ``message`` field;
    ``message`` is itself a JSON string carrying ``mobile`` (comma-separated
    numbers) and ``description``. Calls are only placed when ``state`` is
    "alerting". Returns a JSON object mapping each number to the decoded
    response of its call (empty when nothing was sent).
    """
    results = {}
    payload = json.loads(request.data)
    print(payload)
    # Guard clause: anything other than an active alert sends nothing.
    if payload['state'] != "alerting":
        return jsonify(results)
    message = json.loads(payload["message"])
    print(message["mobile"], message["description"])
    for number in message["mobile"].split(','):
        raw_response = client.do_action_with_exception(
            init_call(number, message["description"])
        )
        results[number] = json.loads(raw_response)
    print(results)
    return jsonify(results)
@app.route("/grafana-sms", methods=["POST"])
def GrafanaSendSms():
    """Send an SMS for a Grafana alert.

    The request body is JSON with a ``state`` field and a ``message`` field;
    ``message`` is itself a JSON string carrying ``mobile`` and
    ``description``. An SMS is only sent when ``state`` is "alerting".

    Returns:
        The decoded SMS API response as JSON, or an empty JSON object when
        the notification is not in the "alerting" state.
    """
    data = json.loads(request.data)
    if data['state'] != "alerting":
        # Nothing is sent for non-alerting states, so there is no API
        # response to relay; answer with an empty object instead of failing.
        return jsonify({})
    msg = json.loads(data["message"])
    print(msg["mobile"], msg["description"])
    sms_request = init_sms(msg["mobile"], msg["description"])
    res = client.do_action(sms_request)
    print(res)
    return jsonify(json.loads(res))
@app.route("/elastalert-call",methods=["POST"])
def ElastalertSendCall():
    """Place a voice call to every number in an ElastAlert notification.

    The JSON body carries ``PhoneNumber`` (comma-separated numbers) and
    ``Description``. Any error is printed and swallowed; the response is a
    JSON object mapping each number called so far to its decoded response.
    """
    results = {}
    payload = json.loads(request.data)
    print(payload)
    try:
        for number in payload["PhoneNumber"].split(','):
            call_request = init_call(number, payload["Description"])
            raw_response = client.do_action_with_exception(call_request)
            results[number] = json.loads(raw_response)
        print(results)
    except Exception as err:
        # Best effort: report the failure and return whatever succeeded.
        print(err)
    return jsonify(results)
@app.route("/elastalert-sms", methods=["POST"])
def ElastAlertSendSms():
    """Send an SMS for an ElastAlert notification.

    The JSON body carries ``PhoneNumber`` and ``Description``.

    Returns:
        The decoded SMS API response as JSON on success, or a JSON object
        with an ``error`` message and HTTP status 500 when building or
        sending the request fails.
    """
    data = request.get_json()
    try:
        sms_request = init_sms(data["PhoneNumber"], data["Description"])
        res = client.do_action(sms_request)
        print(res)
    except Exception as e:
        print(e)
        # No API response exists on this path; report the failure explicitly
        # rather than reaching the final return with ``res`` unbound.
        return jsonify({"error": str(e)}), 500
    return jsonify(json.loads(res))
@app.route("/prometheus-call",methods=["POST"])
def PrometheusSendCall():
    """Place voice calls for the firing alerts in an Alertmanager webhook.

    For every alert whose ``status`` is "firing", each number in the
    comma-separated ``mobile`` annotation is called with the alert's
    ``description`` annotation. Alerts that are not firing, or that carry no
    ``mobile`` annotation, are skipped.

    Returns:
        The literal string "ok".
    """
    data = json.loads(request.data)
    for alert in data['alerts']:
        if alert.get('status') != "firing":
            continue
        # Tolerate alerts without annotations or without the expected keys.
        annotations = alert.get('annotations') or {}
        mobiles = annotations.get('mobile')
        description = annotations.get('description') or ""
        if not mobiles:
            print("skip alert without mobile annotation:", alert.get('labels'))
            continue
        print(mobiles, description)
        # Spell out punctuation as words (presumably so the TTS voice reads
        # addresses and timestamps aloud instead of dropping the symbols).
        description = description.replace(".", "点").replace(":", "冒号").replace("/", "斜杠")
        for phone in mobiles.split(','):
            phone = phone.strip()
            if not phone:
                # Ignore empty entries such as a trailing comma.
                continue
            call_request = init_call(phone, description)
            res = client.do_action_with_exception(call_request)
            print(res, type(res))
    return "ok"
@app.route("/prometheus-sms", methods=["POST"])
def PrometheusSendSms():
    """Send an SMS for each firing alert in an Alertmanager webhook.

    For every alert whose ``status`` is "firing" and that carries a
    ``mobile`` annotation, one SMS is sent with the alert's ``description``
    annotation. Errors are printed and swallowed.

    Returns:
        The decoded response of the last SMS that was sent, as JSON, or an
        empty JSON object when no SMS was sent.
    """
    data = json.loads(request.data)
    res = None  # response of the most recent successful send, if any
    try:
        for alert in data['alerts']:
            if alert.get('status') != "firing":
                continue
            # Tolerate alerts without annotations or without the expected keys.
            annotations = alert.get('annotations') or {}
            mobiles = annotations.get('mobile')
            if not mobiles:
                continue
            description = annotations.get('description') or ""
            description = description.replace(".", "点").replace(":", "冒号").replace("/", "斜杠")
            sms_request = init_sms(mobiles, description)
            res = client.do_action(sms_request)
            print(res)
    except Exception as e:
        print(e)
    if res is None:
        # Nothing was firing, or sending failed before any response arrived.
        return jsonify({})
    return jsonify(json.loads(res))
if __name__ == "__main__":
    # Bind on all interfaces so the container port is reachable. Debug mode
    # stays off: the Werkzeug debugger allows arbitrary code execution and
    # must not be exposed by a network-facing service.
    app.run(debug=False, host="0.0.0.0", port=8080)
编写Dockerfile
# Base image: slim Python 3.9.7.
FROM python:3.9.7-slim
# Install Flask and the Aliyun SDK packages (core, SMS, voice).
RUN pip install flask aliyun-python-sdk-core aliyun-python-sdk-dysmsapi==2.1.1 aliyun-python-sdk-dyvmsapi
# Copy the service script into the image.
COPY call-sms.py /opt/
# Port the Flask application listens on.
EXPOSE 8080
# Start the service with the image's Python interpreter.
CMD ["/usr/local/bin/python3", "/opt/call-sms.py"]
构建docker镜像
docker build -t call-sms:v1 .
编写k8s资源文件
## aliyun.yml ##
tee aliyun.yml <<EOF
apiVersion: v1
kind: Secret
metadata:
  name: aliyun-aksk
  namespace: monitoring
type: Opaque
# stringData takes plain-text values; the "data" field only accepts base64-encoded values.
stringData:
  ACCESS_KEY_ID: 你的阿里云ak
  ACCESS_KEY_SECRET: 你的阿里云sk
EOF
## deploy.yml ##
tee deploy.yml <<EOF
# Deployment running a single replica of the call-sms webhook service.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: call-sms
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: call-sms
  template:
    metadata:
      labels:
        app: call-sms
    spec:
      volumes:
        - name: log
          hostPath:
            path: /data/logs/call-sms/
            type: ''
      containers:
        - name: call-sms
          image: call-sms:v1
          ports:
            - containerPort: 8080
              protocol: TCP
          env:
            - name: POD_IP
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: status.podIP
            # Aliyun credentials come from the aliyun-aksk Secret.
            - name: ACCESS_KEY_ID
              valueFrom:
                secretKeyRef:
                  name: aliyun-aksk
                  key: ACCESS_KEY_ID
            - name: ACCESS_KEY_SECRET
              valueFrom:
                secretKeyRef:
                  name: aliyun-aksk
                  key: ACCESS_KEY_SECRET
          resources:
            limits:
              cpu: '1'
              memory: 2Gi
            requests:
              cpu: 100m
              memory: 100Mi
          volumeMounts:
            - name: log
              mountPath: /data/logs
          # Both probes hit the /healthy endpoint of the Flask app.
          livenessProbe:
            httpGet:
              path: /healthy
              port: 8080
              scheme: HTTP
            initialDelaySeconds: 60
            timeoutSeconds: 3
            periodSeconds: 5
            successThreshold: 1
            failureThreshold: 10
          readinessProbe:
            httpGet:
              path: /healthy
              port: 8080
              scheme: HTTP
            initialDelaySeconds: 60
            timeoutSeconds: 3
            periodSeconds: 2
            successThreshold: 1
            failureThreshold: 2
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: Always
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      securityContext: {}
      schedulerName: default-scheduler
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 25%
      maxSurge: 25%
  revisionHistoryLimit: 10
  progressDeadlineSeconds: 600
EOF
## svc.yml ##
tee svc.yml <<EOF
# ClusterIP Service exposing the call-sms pods on port 8080 inside the cluster.
apiVersion: v1
kind: Service
metadata:
  name: call-sms
  namespace: monitoring
spec:
  ports:
    - protocol: TCP
      port: 8080
      targetPort: 8080
  selector:
    app: call-sms
  type: ClusterIP
EOF
部署电话接口到k8s集群
kubectl apply -f .
5、微信告警模板
创建微信告警模板:wechat.tmpl
{{ define "wechat.default.message" }}
{{- /* Firing section: render the first firing alert of the group. */ -}}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts.Firing -}}
{{- if eq $index 0 }}
==========异常告警==========
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};{{$alert.Annotations.summary}}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
{{- if gt (len $alert.Labels.namespace) 0 }}
命名空间: {{ $alert.Labels.namespace }}
{{- end }}
{{- if gt (len $alert.Labels.node) 0 }}
节点信息: {{ $alert.Labels.node }}
{{- end }}
{{- if gt (len $alert.Labels.pod) 0 }}
实例名称: {{ $alert.Labels.pod }}
{{- end }}
============END============
{{- end }}
{{- end }}
{{- end }}
{{- /* Resolved section: render the first resolved alert of the group. */ -}}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts.Resolved -}}
{{- if eq $index 0 }}
==========异常恢复==========
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};{{$alert.Annotations.summary}}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
{{- if gt (len $alert.Labels.namespace) 0 }}
命名空间: {{ $alert.Labels.namespace }}
{{- end }}
{{- if gt (len $alert.Labels.node) 0 }}
节点信息: {{ $alert.Labels.node }}
{{- end }}
{{- if gt (len $alert.Labels.pod) 0 }}
实例名称: {{ $alert.Labels.pod }}
{{- end }}
============END============
{{- end }}
{{- end }}
{{- end }}
{{- end }}
部署wechat.tmpl告警模板
kubectl delete secret alertmanager-main -n monitoring
kubectl create secret generic alertmanager-main --from-file=alertmanager.yaml --from-file=wechat.tmpl -n monitoring
6、配置alertmanager告警通道
编写alertmanager的Secret资源文件:alertmanager-secret.yaml
apiVersion: v1
kind: Secret
metadata:
  labels:
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/instance: main
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 0.23.0
  name: alertmanager-main
  namespace: monitoring
stringData:
  alertmanager.yaml: |-
    "global":
      "resolve_timeout": "5m"
    templates:
    - /etc/alertmanager/config/wechat.tmpl
    "receivers":
    - name: warning
      slack_configs:
      - api_url: https://hooks.slack.com/services/xxxx/xxxx/xxxxx
        channel: "alertmanager-alert"
        send_resolved: true
        text: " {{ range .Alerts }} ========start========== \n 告警状态:{{ .Status }}
          \n 告警级别:{{ .Labels.severity }} \n 告警类型:{{ .Labels.alertname }} \n 命名空间:{{ .Labels.namespace }} \n 告警应用:{{ .Annotations.summary }} \n 告警主机:{{ .Labels.instance }} \n 告警POD: {{ .Labels.pod }} \n 告警详情:{{ .Annotations.description }} \n
          触发阀值:{{ .Annotations.value }} \n 告警时间:{{ (.StartsAt.Add 28800e9).Format \"2006-01-02 15:04:05\"
          }} \n ========end========== {{ end }} "
      wechat_configs:
      - agent_id: "AgentID"
        api_secret: 你申请的应用Secret
        corp_id: 你的企业ID
        send_resolved: true
        to_party: "部门ID"
    - name: error
      slack_configs:
      - api_url: https://hooks.slack.com/services/xxxx/xxxx/xxxxx
        channel: "alertmanager-alert"
        send_resolved: true
        text: " {{ range .Alerts }} ========start========== \n 告警状态:{{ .Status }}
          \n 告警级别:{{ .Labels.severity }} \n 告警类型:{{ .Labels.alertname }} \n 命名空间:{{ .Labels.namespace }} \n 告警应用:{{ .Annotations.summary }} \n 告警主机:{{ .Labels.instance }} \n 告警POD: {{ .Labels.pod }} \n 告警详情:{{ .Annotations.description }} \n
          触发阀值:{{ .Annotations.value }} \n 告警时间:{{ (.StartsAt.Add 28800e9).Format \"2006-01-02 15:04:05\"
          }} \n ========end========== {{ end }} "
      wechat_configs:
      - agent_id: "AgentID"
        api_secret: 你申请的应用Secret
        corp_id: 你的企业ID
        send_resolved: true
        to_party: "部门ID"
    # Only the critical receiver also triggers the phone-call webhook.
    - name: critical
      slack_configs:
      - api_url: https://hooks.slack.com/services/xxxx/xxxx/xxxxx
        channel: "alertmanager-alert"
        send_resolved: true
        text: " {{ range .Alerts }} ========start========== \n 告警状态:{{ .Status }}
          \n 告警级别:{{ .Labels.severity }} \n 告警类型:{{ .Labels.alertname }} \n 命名空间:{{ .Labels.namespace }} \n 告警应用:{{ .Annotations.summary }} \n 告警主机:{{ .Labels.instance }} \n 告警POD: {{ .Labels.pod }} \n 告警详情:{{ .Annotations.description }} \n
          触发阀值:{{ .Annotations.value }} \n 告警时间:{{ (.StartsAt.Add 28800e9).Format \"2006-01-02 15:04:05\"
          }} \n ========end========== {{ end }} "
      wechat_configs:
      - agent_id: "AgentID"
        api_secret: 你申请的应用Secret
        corp_id: 你的企业ID
        send_resolved: true
        to_party: "部门ID"
      webhook_configs:
      - send_resolved: true
        http_config:
          follow_redirects: true
        url: http://call-sms:8080/prometheus-call
    # Inhibit rules match on alert labels; the severity label is the one the
    # routes below use, so a critical alert mutes its warning counterpart.
    inhibit_rules:
    - source_match:
        severity: critical
      target_match:
        severity: warning
      equal:
      - alertname
      - job
      - instance
    "route":
      "group_by":
      - "alertname"
      "group_interval": "10s"
      "group_wait": "10s"
      "receiver": "warning"
      "repeat_interval": "4h"
      "routes":
      - "matchers":
        - "severity = error"
        "receiver": "error"
        group_interval: 10s
      - "matchers":
        - "severity = warning"
        "receiver": "warning"
        group_interval: 10s
      - "matchers":
        - "severity = critical"
        "receiver": "critical"
        group_interval: 10s
type: Opaque
重新部署alertmanager-secret.yaml
kubectl apply -f alertmanager-secret.yaml
登录到alertmanager,确认配置的告警通道是否生效
7、prometheus告警规则添加告警电话
前面在alertmanager中只为 severity:critical 的告警添加了电话告警功能
要在prometheus中添加告警电话,只需要符合告警级别为 severity:critical
并且在annotations中添加 mobile(告警电话)和 description(告警描述)两个字段,示例如下:
- alert: 节点CPU使用大于80%
  expr: 100 - (avg by(cluster,instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
  # The alert fires only after the condition has held for 5 minutes.
  for: 5m
  labels:
    severity: critical
  annotations:
    mobile: "你的电话,多个电话通过英文逗号分割"
    description: '{{ $labels.cluster }} 集群 {{ $labels.instance }} 节点 CPU 使用率在 5m 内持续达到 {{ printf "%.0f" $value }}%!'
评论区