Prometheus-安装配置

Prometheus-安装配置

supervisor安装

1
# Install the supervisor process manager (on CentOS/RHEL this package
# typically comes from the EPEL repository — verify it is enabled).
yum install -y supervisor

各组件ini

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Supervisor program definition for Alertmanager (listens on :9093).
# The heredoc delimiter is quoted ('EOF') so the shell performs no
# $-expansion or command substitution on the generated ini content.
cat << 'EOF' > alertmanager.ini
[program:alertmanager]
command=/data/moni/alertmanager/alertmanager --web.listen-address=":9093"
directory=/data/moni/alertmanager ; directory to cwd to before exec (def no cwd)
umask=022 ; umask for process (default None)
priority=999 ; the relative start priority (default 999)
autostart=true ; start at supervisord start (default: true)
autorestart=true ; restart at unexpected quit (default: true)
startsecs=1 ; number of secs prog must stay running (def. 1)
startretries=3 ; max # of serial start failures (default 3)
exitcodes=0,2 ; 'expected' exit codes for process (default 0,2)
stopsignal=QUIT ; signal used to kill process (default TERM)
stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10)
user=root ; setuid to this UNIX account to run the program
redirect_stderr=true ; redirect proc stderr to stdout (default false)
stdout_logfile=/var/log/alertmanager/alertmanager.log ; stdout log path, NONE for none; default AUTO
stdout_logfile_maxbytes=50MB ; max # logfile bytes b4 rotation (default 50MB)
stdout_logfile_backups=10 ; # of stdout logfile backups (default 10)
stdout_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0)
stdout_events_enabled=false ; emit events on stdout writes (default false)
environment=A=1,B=2 ; process environment additions (def no adds)
EOF

# Supervisor program definition for a Consul agent in -dev mode.
# NOTE(review): -dev is non-persistent and single-node; confirm this is
# intended for production service discovery.
# Quoted 'EOF' prevents shell expansion inside the generated ini.
cat << 'EOF' > consul.ini
[program:consul]
command=/data/moni/consul agent -dev -data-dir=/data/moni/consul-data -node=aws-base-01 -bind=10.77.114.153 -config-dir=/etc/consul.d -enable-script-checks=true -datacenter=hykj -ui -rejoin -client=0.0.0.0
directory=/data/moni ; directory to cwd to before exec (def no cwd)
umask=022 ; umask for process (default None)
priority=999 ; the relative start priority (default 999)
autostart=true ; start at supervisord start (default: true)
autorestart=true ; restart at unexpected quit (default: true)
startsecs=1 ; number of secs prog must stay running (def. 1)
startretries=3 ; max # of serial start failures (default 3)
exitcodes=0,2 ; 'expected' exit codes for process (default 0,2)
stopsignal=QUIT ; signal used to kill process (default TERM)
stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10)
user=root ; setuid to this UNIX account to run the program
redirect_stderr=true ; redirect proc stderr to stdout (default false)
stdout_logfile=/var/log/consul/consul.log ; stdout log path, NONE for none; default AUTO
stdout_logfile_maxbytes=50MB ; max # logfile bytes b4 rotation (default 50MB)
stdout_logfile_backups=10 ; # of stdout logfile backups (default 10)
stdout_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0)
stdout_events_enabled=false ; emit events on stdout writes (default false)
environment=A=1,B=2 ; process environment additions (def no adds)
#serverurl=AUTO ; override serverurl computation (childutils)
EOF

# Supervisor program definition for the DingTalk webhook relay (:8860).
# autostart is deliberately false here (started manually) — confirm intended.
# Quoted 'EOF' prevents shell expansion inside the generated ini.
cat << 'EOF' > dingtalk.ini
[program:dingtalk]
command=/data/moni/dingtalk/dingtalk --config.file=config.yml --web.listen-address=":8860"
directory=/data/moni/dingtalk ; directory to cwd to before exec (def no cwd)
umask=022 ; umask for process (default None)
priority=999 ; the relative start priority (default 999)
autostart=false ; start at supervisord start (default: true)
autorestart=true ; restart at unexpected quit (default: true)
startsecs=1 ; number of secs prog must stay running (def. 1)
startretries=3 ; max # of serial start failures (default 3)
exitcodes=0,2 ; 'expected' exit codes for process (default 0,2)
stopsignal=QUIT ; signal used to kill process (default TERM)
stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10)
user=root ; setuid to this UNIX account to run the program
redirect_stderr=true ; redirect proc stderr to stdout (default false)
stdout_logfile=/var/log/dingtalk/dingtalk.log ; stdout log path, NONE for none; default AUTO
stdout_logfile_maxbytes=50MB ; max # logfile bytes b4 rotation (default 50MB)
stdout_logfile_backups=10 ; # of stdout logfile backups (default 10)
stdout_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0)
stdout_events_enabled=false ; emit events on stdout writes (default false)
environment=A=1,B=2 ; process environment additions (def no adds)
EOF

# Supervisor program definition for Grafana.
# Quoted 'EOF' prevents shell expansion inside the generated ini.
# (A stray empty inline comment ' ;' at the end of the command line
# was removed — supervisor would have ignored it anyway.)
cat << 'EOF' > grafana.ini
[program:grafana]
command=/data/moni/grafana/bin/grafana-server -config /data/moni/grafana/conf/grafana.ini
directory=/data/moni/grafana ; directory to cwd to before exec (def no cwd)
umask=022 ; umask for process (default None)
priority=999 ; the relative start priority (default 999)
autostart=true ; start at supervisord start (default: true)
autorestart=true ; restart at unexpected quit (default: true)
startsecs=1 ; number of secs prog must stay running (def. 1)
startretries=3 ; max # of serial start failures (default 3)
exitcodes=0,2 ; 'expected' exit codes for process (default 0,2)
stopsignal=QUIT ; signal used to kill process (default TERM)
stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10)
user=root ; setuid to this UNIX account to run the program
redirect_stderr=true ; redirect proc stderr to stdout (default false)
stdout_logfile=/var/log/grafana/grafana.log ; stdout log path, NONE for none; default AUTO
stdout_logfile_maxbytes=50MB ; max # logfile bytes b4 rotation (default 50MB)
stdout_logfile_backups=10 ; # of stdout logfile backups (default 10)
stdout_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0)
stdout_events_enabled=false ; emit events on stdout writes (default false)
environment=A=1,B=2 ; process environment additions (def no adds)
EOF

# Supervisor program definition for Prometheus (30-day TSDB retention).
# Quoted 'EOF' prevents shell expansion inside the generated ini.
cat << 'EOF' > prometheus.ini
[program:prometheus]
command=/data/moni/prometheus/prometheus --storage.tsdb.retention.time=30d --storage.tsdb.path="/data/moni/prometheus/data"
directory=/data/moni/prometheus ; directory to cwd to before exec (def no cwd)
umask=022 ; umask for process (default None)
priority=999 ; the relative start priority (default 999)
autostart=true ; start at supervisord start (default: true)
autorestart=true ; restart at unexpected quit (default: true)
startsecs=1 ; number of secs prog must stay running (def. 1)
startretries=3 ; max # of serial start failures (default 3)
exitcodes=0,2 ; 'expected' exit codes for process (default 0,2)
stopsignal=QUIT ; signal used to kill process (default TERM)
stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10)
user=root ; setuid to this UNIX account to run the program
redirect_stderr=true ; redirect proc stderr to stdout (default false)
stdout_logfile=/var/log/prometheus/prometheus.log ; stdout log path, NONE for none; default AUTO
stdout_logfile_maxbytes=50MB ; max # logfile bytes b4 rotation (default 50MB)
stdout_logfile_backups=10 ; # of stdout logfile backups (default 10)
stdout_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0)
stdout_events_enabled=false ; emit events on stdout writes (default false)
environment=A=1,B=2 ; process environment additions (def no adds)
EOF

supervisor启动

1
2
3
4
5
6
7
8
9
10
# Create log directories for every managed program, install the ini files
# and main config, then enable + (re)start supervisord.
mkdir -p /var/log/{alertmanager,consul,prometheus,dingtalk,grafana}
mkdir -p /etc/consul.d
mkdir -p /etc/supervisord.d  # normally created by the supervisor package; be safe
cp *.ini /etc/supervisord.d/
cp supervisord.conf /etc

systemctl enable supervisord
systemctl restart supervisord

# Give supervisord a moment to spawn the programs before querying status.
sleep 2
supervisorctl status

prometheus配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# prometheus.yml — the paste had lost all indentation (every key at column 0),
# which is not a valid Prometheus configuration. Nesting restored below.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "localhost:9093"

rule_files:
  - "/data/moni/prometheus/rules/*"

scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

  - job_name: "alertmanager"
    static_configs:
      - targets: ["localhost:9093"]

  - job_name: "doris_job"
    static_configs:
      - targets: ['hy-bigdata-01:18030', 'hy-bigdata-02:18030', 'hy-bigdata-03:18030']
        labels:
          group: fe
      - targets: ['hy-bigdata-01:18040', 'hy-bigdata-02:18040', 'hy-bigdata-03:18040']
        labels:
          group: be

  # - job_name: "pushgateway"
  #   honor_labels: true
  #   static_configs:
  #     - targets: ['10.9.127.245:9091']
  #       labels:
  #         instance: pushgateway

  # - job_name: "java_jvm"
  #   static_configs:
  #     - targets: ['10.9.127.245:8848']

  # - job_name: "java-jmx-monitor"
  #   static_configs:
  #     - targets: ['10.249.2.51:3010']

  # - job_name: "davinci_monitor"
  #   metrics_path: /prometheus
  #   static_configs:
  #     - targets: ['10.249.2.50:14399']

  - job_name: 'consul_sd_node_exporter'
    metrics_path: /metrics
    honor_labels: false
    consul_sd_configs:
      - server: 'localhost:8500'
        scheme: http
        services: ['node_exporter']
    # Add labels as required by the monitoring setup; the matching service
    # metadata must be supplied when registering the service in Consul.
    relabel_configs:
      # NOTE(review): '__meta_consul_bzl' is not a standard Consul SD meta
      # label (the datacenter is '__meta_consul_dc') — verify this source label.
      - source_labels: ['__meta_consul_bzl']
        target_label: 'bgy'
      - source_labels: ['__meta_consul_service_address']
        target_label: 'host'
      - source_labels: ['__meta_consul_service_metadata_hostname']
        target_label: 'hostname'
      - source_labels: ['__meta_consul_service_metadata_business']
        target_label: 'business'

  - job_name: 'sa_process_exporter'
    scrape_interval: 10s
    honor_labels: false
    consul_sd_configs:
      - server: 'localhost:8500'
        scheme: http
        services: ['process_exporter']
    relabel_configs:
      - source_labels: ['__meta_consul_service_address']
        target_label: 'host'
      - source_labels: ['__meta_consul_service_metadata_hostname']
        target_label: 'hostname'

告警配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# Alerting rules — the paste had lost all indentation, and 'groups:' was
# declared three times. A single rule file must contain exactly one top-level
# 'groups:' key (duplicate keys are invalid YAML), so the three groups are
# merged here; alternatively split them into separate files under
# /data/moni/prometheus/rules/.
groups:
  - name: defaultStatsAlert
    rules:
      - alert: cpuUsageAlert
        expr: (100 - (avg by (instance,hostname,hosttype,responsibility)(irate(node_cpu_seconds_total{job="consul_sd_node_exporter",mode="idle"}[3m])) * 100)) > 95
        for: 10m
        labels:
          team: node
          organization: ops
        # labels:
        #   severity: page
        annotations:
          summary: "{{ $labels.host }} CPU usage high"
          description: "{{ $labels.host }} CPU usage above 95% (current value: {{ $value }})"

      - alert: cpuIOwaitAlert
        expr: (avg by (hostname,instance,hosttype,responsibility)(irate(node_cpu_seconds_total{business=~'bzl|aws|hy',job="consul_sd_node_exporter",mode="iowait"}[3m])) * 100) > 45
        for: 10m
        labels:
          team: node
          organization: ops
        annotations:
          summary: "{{ $labels.instance }} CPU iowait high"
          description: "{{ $labels.host }} CPU iowait above 45% (current value: {{ $value }})"

      - alert: sysLoadAlert
        expr: sum by(hostname, instance,business,hosttype,responsibility) (node_load5{job="consul_sd_node_exporter"}) / count by(hostname, instance,business,hosttype,responsibility) (count by(cpu, hostname, instance,business,hosttype,responsibility) (node_cpu_seconds_total{job="consul_sd_node_exporter"})) > 3.85
        for: 10m
        labels:
          team: node
          organization: ops
        annotations:
          # summary: "{{ $labels.instance }} CPU usage high"
          # description: "{{ $labels.host }} CPU load1m is greater than cpu core number for 1min(current value: {{ $value }})"
          description: "{{ $labels.host }} CPU load5m is greater than cpu core number(current value: cpu 核数的{{ $value }}倍 )"

      # Clock drift in either direction beyond 125s.
      - alert: sys-time-Alert
        expr: time() - node_time_seconds{job="consul_sd_node_exporter"} > 125
        for: 10m
        labels:
          team: node
          organization: ops
        annotations:
          description: "{{ $labels.host }} system time is inconsistent (current value: {{ $value }}s)"
      - alert: sys-time-Alert1
        expr: time() - node_time_seconds{job="consul_sd_node_exporter"} < -125
        for: 10m
        labels:
          team: node
          organization: ops
        annotations:
          description: "{{ $labels.host }} system time is inconsistent (current value: {{ $value }}s)"

      - alert: memUsageAlert
        # expr: ((node_memory_MemTotal_bytes{job="consul_sd_node_exporter"} - (node_memory_MemAvailable_bytes{job="consul_sd_node_exporter"}))/node_memory_MemTotal_bytes{job="consul_sd_node_exporter"}) * 100 > 90
        expr: ((node_memory_MemTotal_bytes{job="consul_sd_node_exporter"} - (node_memory_MemFree_bytes{job="consul_sd_node_exporter"}+node_memory_Buffers_bytes{job="consul_sd_node_exporter"}+node_memory_Cached_bytes{job="consul_sd_node_exporter"}+node_memory_SReclaimable_bytes{job="consul_sd_node_exporter"}))/node_memory_MemTotal_bytes{job="consul_sd_node_exporter"}) * 100 > 95
        for: 15m
        labels:
          team: mem
          organization: ops
        # labels:
        #   severity: page
        annotations:
          summary: "{{ $labels.host }} MEM usage high"
          description: "{{ $labels.host }} MEM usage above 95% (current value: {{ $value }})"

      - alert: NodeDown
        expr: up{job="consul_sd_node_exporter"} == 0
        for: 5m
        labels:
          team: down
          organization: ops
        # labels:  # this label can be used for match-based grouping in alertmanager
        #   team: node
        annotations:
          description: '{{ $labels.host}} has been down for 5m'

      - alert: disk_utilization_rate
        expr: 100 - ((node_filesystem_avail_bytes{business=~'bzl|aws|hy',fstype!~"rootfs|selinuxfs|autofs|rpc_pipefs|tmpfs|fuse.lxcfs|iso9660",mountpoint!~"/boot",job="consul_sd_node_exporter"} * 100) / node_filesystem_size_bytes {business=~'bzl|aws|hy',fstype!~"rootfs|selinuxfs|autofs|rpc_pipefs|tmpfs|fuse.lxcfs|iso9660",mountpoint!~"/boot",job="consul_sd_node_exporter"}) > 80
        for: 10m
        labels:
          team: disk
          organization: ops
        annotations:
          # description aligned with the expr threshold (was "90%" while expr fires at > 80)
          description: '{{$labels.host}} {{$labels.mountpoint}} excess 80% --> {{$value}}'

      - alert: network_receive_bytes
        expr: irate(node_network_receive_bytes_total{business=~'bzl|aws|hy',device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*|eth*|eno*',job="consul_sd_node_exporter"}[15m])*8/1024/1024 > 1000
        for: 5m
        labels:
          team: node
          organization: ops
        annotations:
          description: '{{$labels.host}} network receive bytes reach to {{$value}} MB/s'

      - alert: network_transmit_bytes
        expr: irate(node_network_transmit_bytes_total{business=~'bzl|aws|hy',device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*|eth*|eno*',job="consul_sd_node_exporter"}[15m])*8/1024/1024 > 1000
        for: 5m
        labels:
          team: node
          organization: ops
        annotations:
          description: '{{$labels.host}} network transmit bytes reach to {{$value}} MB/s'

      - alert: frame-alert
        expr: node_network_receive_frame_total{device!~"tap.*|veth.*|br.*|docker.*|virbr*|lo*"} - min_over_time(node_network_receive_frame_total{device!~"tap.*|veth.*|br.*|docker.*|virbr*|lo*"}[1h]) >= 5
        for: 10s
        labels:
          team: node
          organization: ops
        annotations:
          description: '{{$labels.host}} network frame growth {{$value}}'

      - alert: time_zone-alert
        expr: time_zone{tz!="CST"} == 0
        for: 10s
        labels:
          team: node
          organization: ops
        annotations:
          description: '{{$labels.host}} time zone is not CST'

      # - alert: doris_node-alert
      #   expr: sum(up{job=~"doris_job|davinci_monitor"}) by (group,job,instance) < 1
      #   for: 10s
      #   labels:
      #     team: node
      #     organization: bigdata
      #   annotations:
      #     description: '{{$labels.job}} {{$labels.group}} 的节点{{$labels.instance}} 发生异常!!!!'

  - name: rocketmq-delay
    rules:
      - alert: rocketmq-delay warning1
        expr: rocketmq_group_diff{topic="machine_work_log",group="machine_work_log_roketmq2doris"} >2000
        for: 20s
        labels:
          group: bigdata
          severity: bigdata
        annotations:
          description: "Topic:{{$labels.topic}} , Group:{{$labels.group}} 出现积压,积压量为:{{$value}}"
      - alert: rocketmq-delay warning2
        expr: rocketmq_group_diff{topic="pro_canal_producer",group="pro_canal_producer_roketmq2doris"} >10
        for: 20s
        labels:
          group: bigdata
          severity: bigdata
        annotations:
          description: "Topic:{{$labels.topic}} , Group:{{$labels.group}} 出现积压,积压量为:{{$value}}"

  - name: doris_instance_down
    rules:
      - alert: Doris Backends Down
        expr: up {group="be", job="doris_job"} == 0
        for: 20s
        labels:
          user: doris
          severity: bigdata
        annotations:
          summary: "doris Instance {{ $labels.instance }} down"
          description: "doris {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 20s."
      - alert: Doris Frontends Down
        expr: up {group="fe", job="doris_job"} == 0
        for: 20s
        labels:
          user: doris
          severity: bigdata
        annotations:
          summary: "doris Instance {{ $labels.instance }} down"
          description: "doris {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 20s."

alertmanager配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# alertmanager.yml — indentation restored (the paste had flattened every key
# to column 0, which is not a valid Alertmanager configuration).
global:
  resolve_timeout: 5m  # how long to wait before marking an alert resolved (default 5m)
  smtp_smarthost: 'smtp.139.com:465'  # SMTP relay host
  smtp_from: 'xxxxxx@139.com'         # sender address
  smtp_auth_username: 'xxxxx@139.com' # SMTP auth user
  smtp_auth_password: 'xxxxxxxxx'     # SMTP password or app-specific auth code
  smtp_require_tls: false

# Notification templates
templates:
  - '/opt/alertmanager/templates/*'

# Routing tree
route:
  receiver: 'ops'           # default receiver; must match a name under receivers
  group_by: ['alertname']   # how alerts are grouped
  group_wait: 1s            # wait before sending the first notification of a group
  group_interval: 10s       # wait before sending notifications about new alerts in a group
  # repeat_interval must not be too low for email, or the SMTP server may
  # reject the flood of messages.
  repeat_interval: 1h
  routes:
    - receiver: 'bigdata'
      group_wait: 10s
      match_re:
        severity: bigdata

# Receivers
receivers:
  - name: 'ops'
    # email_configs:  # email notification settings
    #   - to: 'chenyuhua37@countrygarden.com.cn,luminfeng01@countrygarden.com.cn,liangwenjun04@countrygarden.com.cn,maiqixian@countrygarden.com.cn,15121486557@163.com'
    #     send_resolved: true
    #     headers:
    #       subject: "AWS-Prometheus报警邮件"
    #       from: "Prometheus监控告警"
    #       to: "You,Please pay attention!!!"
    webhook_configs:
      - url: 'http://localhost:8860/dingtalk/webhook1/send'
        send_resolved: true
  - name: 'bigdata'
    webhook_configs:
      # - url: 'http://localhost:18060/dingtalk/webhook_bigdata/send'
      - url: 'http://localhost:8860/dingtalk/webhook1/send'
        send_resolved: true

# An inhibition rule mutes alerts matching the target matchers while an alert
# matching the source matchers is firing, provided the 'equal' labels agree.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

钉钉配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# DingTalk webhook relay configuration — the 'targets' children were
# flattened to column 0 in the paste; nesting restored.
# NOTE(review): the access_token below is a live credential committed in
# plain text — rotate it and load from a secret store instead.
templates:
  - dingding.tmpl

targets:
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=8cd8f7502c148fa63a7ad9e424731f83022841eab80c81d476e8000f36d5143a
    # secret for signature
    secret: SEC000000000000000000000
  webhook2:
    url: https://oapi.dingtalk.com/robot/send?access_token=8cd8f7502c148fa63a7ad9e424731f83022841eab80c81d476e8000f36d5143a
  webhook_legacy:
    url: https://oapi.dingtalk.com/robot/send?access_token=8cd8f7502c148fa63a7ad9e424731f83022841eab80c81d476e8000f36d5143a
  # NOTE(review): likely a typo for 'bigdata_webhook' — verify against callers
  # before renaming (the alertmanager config references 'webhook1' only).
  bigdta_webhook:
    url: https://oapi.dingtalk.com/robot/send?access_token=8cd8f7502c148fa63a7ad9e424731f83022841eab80c81d476e8000f36d5143a