Prometheus-安装配置

2023-08-14 监控 all, prometheus 评论

Prometheus-安装配置

supervisor安装

# Install supervisor. On RHEL/CentOS the package usually comes from EPEL;
# enable that repository first if yum cannot find it.
yum install -y supervisor

各组件ini

# Write the supervisor program definition for Alertmanager.
# The delimiter is quoted so the shell never expands anything inside the body.
cat << 'EOF' > alertmanager.ini
; Alertmanager, serving its web UI/API on port 9093.
[program:alertmanager]
command=/data/moni/alertmanager/alertmanager --web.listen-address=":9093"
directory=/data/moni/alertmanager  ; working directory before exec
umask=022                          ; umask applied to the process
priority=999                       ; relative start order
autostart=true                     ; start together with supervisord
autorestart=true                   ; always restart after the process exits
startsecs=1                        ; must stay up this long to count as started
startretries=3                     ; consecutive failed starts before giving up
exitcodes=0,2                      ; exit codes treated as expected
stopsignal=TERM                    ; was QUIT: a Go binary answers SIGQUIT with a stack dump, not a clean shutdown
stopwaitsecs=10                    ; grace period before SIGKILL
user=root                          ; account the process runs as
redirect_stderr=true               ; merge stderr into the stdout log
stdout_logfile=/var/log/alertmanager/alertmanager.log  ; the directory must exist before start
stdout_logfile_maxbytes=50MB       ; rotate the log at this size
stdout_logfile_backups=10          ; rotated log files to keep
stdout_capture_maxbytes=1MB        ; capture-mode buffer size
stdout_events_enabled=false        ; do not emit events on stdout writes
environment=A=1,B=2                ; extra env vars; A/B look like template placeholders - confirm they are needed
EOF

# Write the supervisor program definition for the Consul agent.
# The delimiter is quoted so the shell never expands anything inside the body.
cat << 'EOF' > consul.ini
; Consul agent used by Prometheus for service discovery (HTTP API on :8500 by default).
; NOTE(review): -dev starts a development-mode agent; confirm that is intended
; together with -data-dir, and that -bind/-node match the host this runs on.
[program:consul]
command=/data/moni/consul agent -dev -data-dir=/data/moni/consul-data -node=aws-base-01 -bind=10.77.114.153 -config-dir=/etc/consul.d -enable-script-checks=true -datacenter=hykj -ui -rejoin -client=0.0.0.0
directory=/data/moni               ; working directory before exec
umask=022                          ; umask applied to the process
priority=999                       ; relative start order
autostart=true                     ; start together with supervisord
autorestart=true                   ; always restart after the process exits
startsecs=1                        ; must stay up this long to count as started
startretries=3                     ; consecutive failed starts before giving up
exitcodes=0,2                      ; exit codes treated as expected
stopsignal=TERM                    ; was QUIT: a Go binary answers SIGQUIT with a stack dump, not a clean shutdown
stopwaitsecs=10                    ; grace period before SIGKILL
user=root                          ; account the process runs as
redirect_stderr=true               ; merge stderr into the stdout log
stdout_logfile=/var/log/consul/consul.log  ; the directory must exist before start
stdout_logfile_maxbytes=50MB       ; rotate the log at this size
stdout_logfile_backups=10          ; rotated log files to keep
stdout_capture_maxbytes=1MB        ; capture-mode buffer size
stdout_events_enabled=false        ; do not emit events on stdout writes
environment=A=1,B=2                ; extra env vars; A/B look like template placeholders - confirm they are needed
#serverurl=AUTO                ; override serverurl computation (childutils)
EOF

# Write the supervisor program definition for the DingTalk webhook bridge.
# The delimiter is quoted so the shell never expands anything inside the body.
cat << 'EOF' > dingtalk.ini
; DingTalk webhook receiver on port 8860; config.yml is resolved relative to `directory`.
[program:dingtalk]
command=/data/moni/dingtalk/dingtalk --config.file=config.yml --web.listen-address=":8860"
directory=/data/moni/dingtalk      ; working directory before exec
umask=022                          ; umask applied to the process
priority=999                       ; relative start order
autostart=false                    ; NOT started with supervisord; run `supervisorctl start dingtalk`
autorestart=true                   ; always restart after the process exits
startsecs=1                        ; must stay up this long to count as started
startretries=3                     ; consecutive failed starts before giving up
exitcodes=0,2                      ; exit codes treated as expected
stopsignal=TERM                    ; was QUIT: assuming this is the Go prometheus-webhook-dingtalk binary, SIGQUIT produces a stack dump
stopwaitsecs=10                    ; grace period before SIGKILL
user=root                          ; account the process runs as
redirect_stderr=true               ; merge stderr into the stdout log
stdout_logfile=/var/log/dingtalk/dingtalk.log  ; the directory must exist before start
stdout_logfile_maxbytes=50MB       ; rotate the log at this size
stdout_logfile_backups=10          ; rotated log files to keep
stdout_capture_maxbytes=1MB        ; capture-mode buffer size
stdout_events_enabled=false        ; do not emit events on stdout writes
environment=A=1,B=2                ; extra env vars; A/B look like template placeholders - confirm they are needed
EOF

# Write the supervisor program definition for Grafana.
# This grafana.ini is the supervisor unit, not Grafana's own conf/grafana.ini.
# The delimiter is quoted so the shell never expands anything inside the body.
cat << 'EOF' > grafana.ini
; Grafana server, reading its settings from conf/grafana.ini.
[program:grafana]
command=/data/moni/grafana/bin/grafana-server -config /data/moni/grafana/conf/grafana.ini
directory=/data/moni/grafana       ; working directory before exec
umask=022                          ; umask applied to the process
priority=999                       ; relative start order
autostart=true                     ; start together with supervisord
autorestart=true                   ; always restart after the process exits
startsecs=1                        ; must stay up this long to count as started
startretries=3                     ; consecutive failed starts before giving up
exitcodes=0,2                      ; exit codes treated as expected
stopsignal=TERM                    ; was QUIT: a Go binary answers SIGQUIT with a stack dump, not a clean shutdown
stopwaitsecs=10                    ; grace period before SIGKILL
user=root                          ; account the process runs as
redirect_stderr=true               ; merge stderr into the stdout log
stdout_logfile=/var/log/grafana/grafana.log  ; the directory must exist before start
stdout_logfile_maxbytes=50MB       ; rotate the log at this size
stdout_logfile_backups=10          ; rotated log files to keep
stdout_capture_maxbytes=1MB        ; capture-mode buffer size
stdout_events_enabled=false        ; do not emit events on stdout writes
environment=A=1,B=2                ; extra env vars; A/B look like template placeholders - confirm they are needed
EOF

# Write the supervisor program definition for Prometheus.
# The delimiter is quoted so the shell never expands anything inside the body.
cat << 'EOF' > prometheus.ini
; Prometheus server: 30 days of retention, TSDB stored under /data/moni/prometheus/data.
; No --config.file is passed, so prometheus.yml is looked up relative to `directory`.
[program:prometheus]
command=/data/moni/prometheus/prometheus --storage.tsdb.retention.time=30d --storage.tsdb.path="/data/moni/prometheus/data"
directory=/data/moni/prometheus    ; working directory before exec
umask=022                          ; umask applied to the process
priority=999                       ; relative start order
autostart=true                     ; start together with supervisord
autorestart=true                   ; always restart after the process exits
startsecs=1                        ; must stay up this long to count as started
startretries=3                     ; consecutive failed starts before giving up
exitcodes=0,2                      ; exit codes treated as expected
stopsignal=TERM                    ; was QUIT: SIGQUIT makes a Go binary dump its stack and skip the graceful TSDB shutdown
stopwaitsecs=10                    ; grace period before SIGKILL
user=root                          ; account the process runs as
redirect_stderr=true               ; merge stderr into the stdout log
stdout_logfile=/var/log/prometheus/prometheus.log  ; the directory must exist before start
stdout_logfile_maxbytes=50MB       ; rotate the log at this size
stdout_logfile_backups=10          ; rotated log files to keep
stdout_capture_maxbytes=1MB        ; capture-mode buffer size
stdout_events_enabled=false        ; do not emit events on stdout writes
environment=A=1,B=2                ; extra env vars; A/B look like template placeholders - confirm they are needed
EOF

supervisor启动

# Create the log directories named by stdout_logfile in each program section,
# the directory Consul reads via -config-dir, and supervisor's include directory
# (cp below fails if it is missing).
mkdir -p /var/log/{alertmanager,consul,prometheus,dingtalk,grafana}
mkdir -p /etc/consul.d /etc/supervisord.d

# Install exactly the five program definitions generated above. A bare `*.ini`
# glob would also copy any unrelated .ini file that happens to be in this directory.
cp alertmanager.ini consul.ini dingtalk.ini grafana.ini prometheus.ini /etc/supervisord.d/
# supervisord.conf is not shown in this article; it has to be present in the
# current directory and must include /etc/supervisord.d/*.ini.
cp supervisord.conf /etc

# Start supervisord at boot, and restart it now so the new program files are loaded.
systemctl enable supervisord
systemctl restart supervisord

# Give the programs a moment to start before listing their state.
sleep 2
supervisorctl status

prometheus配置

# Scrape and rule-evaluation intervals used when a job or group sets none.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Firing alerts are sent to the Alertmanager on this host.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'localhost:9093'

# Every file matching this glob is loaded as a rule file.
rule_files:
  - '/data/moni/prometheus/rules/*'

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - 'localhost:9090'

  # The local Alertmanager.
  - job_name: 'alertmanager'
    static_configs:
      - targets:
          - 'localhost:9093'

  # Doris nodes; each target list carries a `group` label (fe on 18030, be on 18040).
  - job_name: 'doris_job'
    static_configs:
      - targets:
          - 'hy-bigdata-01:18030'
          - 'hy-bigdata-02:18030'
          - 'hy-bigdata-03:18030'
        labels:
          group: fe
      - targets:
          - 'hy-bigdata-01:18040'
          - 'hy-bigdata-02:18040'
          - 'hy-bigdata-03:18040'
        labels:
          group: be

#  - job_name: "pushgateway"
#    honor_labels: true
#    static_configs:
#    - targets: ['10.9.127.245:9091']
#      labels:
#        instance: pushgateway
        
#  - job_name: "java_jvm"
#    static_configs:
#      - targets: ['10.9.127.245:8848']

#  - job_name: "java-jmx-monitor"
#    static_configs:
#      - targets: ['10.249.2.51:3010']
#
#  - job_name: "davinci_monitor"
#    metrics_path: /prometheus 
#    static_configs: 
#      - targets: ['10.249.2.50:14399']

  # node_exporter targets discovered through the local Consul agent.
  - job_name: 'consul_sd_node_exporter'
    metrics_path: /metrics
    honor_labels: false
    consul_sd_configs:
      - server: 'localhost:8500'
        scheme: http
        services: ['node_exporter']
    # Copy Consul discovery metadata onto the scraped series. Add whatever labels
    # monitoring needs; the service_metadata_* values have to be supplied when the
    # service is registered in Consul.
    relabel_configs:
      # Datacenter of the Consul the target is registered in. The previous source,
      # `__meta_consul_bzl`, is not a label Consul SD provides, so `bgy` was never set.
      - source_labels: ['__meta_consul_dc']
        target_label: 'bgy'
      - source_labels: ['__meta_consul_service_address']
        target_label: 'host'
      - source_labels: ['__meta_consul_service_metadata_hostname']
        target_label: 'hostname'
      - source_labels: ['__meta_consul_service_metadata_business']
        target_label: 'business'

  # process_exporter targets discovered through the local Consul agent,
  # scraped every 10s instead of the global interval.
  - job_name: 'sa_process_exporter'
    scrape_interval: 10s
    honor_labels: false
    consul_sd_configs:
      - server: 'localhost:8500'
        scheme: http
        services:
          - 'process_exporter'
    # Expose the Consul service address and registered hostname as series labels.
    relabel_configs:
      - source_labels:
          - '__meta_consul_service_address'
        target_label: 'host'
      - source_labels:
          - '__meta_consul_service_metadata_hostname'
        target_label: 'hostname'

告警配置

# Host-level alert rules for targets of the consul_sd_node_exporter job.
groups:
- name: defaultStatsAlert
  rules:
  # CPU busy percentage (100 - idle%) above 95 for 10 minutes.
  # `host` is part of the grouping because the annotations print {{ $labels.host }};
  # labels not listed in `by (...)` are dropped and would render as empty.
  - alert: cpuUsageAlert
    expr: (100 - (avg by (instance,host,hostname,hosttype,responsibility)(irate(node_cpu_seconds_total{job="consul_sd_node_exporter",mode="idle"}[3m])) * 100)) > 95
    for: 10m
    labels:
      team: node
      organization: ops
#    labels:
#      severity: page
    annotations:
      summary: "{{ $labels.host }} CPU usage high"
      description: "{{ $labels.host }} CPU usage above 95% (current value: {{ $value }})"

  # CPU iowait percentage above 45 for 10 minutes.
  # `host` is part of the grouping because the description prints {{ $labels.host }};
  # labels not listed in `by (...)` are dropped and would render as empty.
  - alert: cpuIOwaitAlert
    expr: (avg by (host,hostname,instance,hosttype,responsibility)(irate(node_cpu_seconds_total{business=~'bzl|aws|hy',job="consul_sd_node_exporter",mode="iowait"}[3m])) * 100) > 45
    for: 10m
    labels:
      team: node
      organization: ops
    annotations:
      summary: "{{ $labels.instance }} CPU iowait high"
      description: "{{ $labels.host }} CPU iowait above 45% (current value: {{ $value }})"

  # 5-minute load average divided by the number of CPU cores, above 3.85 for 10 minutes.
  # The inner `count by (cpu, ...)` collapses the per-mode series to one per core;
  # the outer count then yields the core count.
  # `host` is in every grouping so both sides of the division still match and
  # {{ $labels.host }} in the description is populated.
  - alert: sysLoadAlert
    expr: sum by(host, hostname, instance,business,hosttype,responsibility) (node_load5{job="consul_sd_node_exporter"}) / count by(host, hostname, instance,business,hosttype,responsibility) (count by(cpu, host, hostname, instance,business,hosttype,responsibility) (node_cpu_seconds_total{job="consul_sd_node_exporter"})) > 3.85
    for: 10m
    labels:
      team: node
      organization: ops
    annotations:
    #  summary: "{{ $labels.instance }} CPU usgae high"
      #description: "{{ $labels.host }} CPU load1m is greater than cpu core number for 1min(current value: {{ $value }})"
      description: "{{ $labels.host }} CPU load5m is greater than cpu core number(current value: cpu 核数的{{ $value }}倍 )"

  # Clock-skew pair: Prometheus' own time() is compared with the time reported
  # by the node. This rule covers the node being more than 125s behind.
  - alert: 'sys-time-Alert'
    expr: time() - node_time_seconds{job="consul_sd_node_exporter"} > 125
    for: 10m
    labels:
      organization: ops
      team: node
    annotations:
      description: "{{ $labels.host }} system time is inconsistent (current value: {{ $value }}s)"

  # ...and this one covers the node being more than 125s ahead.
  - alert: 'sys-time-Alert1'
    expr: time() - node_time_seconds{job="consul_sd_node_exporter"} < -125
    for: 10m
    labels:
      organization: ops
      team: node
    annotations:
      description: "{{ $labels.host }} system time is inconsistent (current value: {{ $value }}s)"

  # Memory in use above 95% for 15 minutes, where "in use" is
  # MemTotal - (MemFree + Buffers + Cached + SReclaimable).
  # The commented-out expression is the MemAvailable-based alternative with a 90% threshold.
  - alert: memUsageAlert
    #expr: ((node_memory_MemTotal_bytes{job="consul_sd_node_exporter"} - (node_memory_MemAvailable_bytes{job="consul_sd_node_exporter"}))/node_memory_MemTotal_bytes{job="consul_sd_node_exporter"}) * 100 > 90
    expr: ((node_memory_MemTotal_bytes{job="consul_sd_node_exporter"} - (node_memory_MemFree_bytes{job="consul_sd_node_exporter"}+node_memory_Buffers_bytes{job="consul_sd_node_exporter"}+node_memory_Cached_bytes{job="consul_sd_node_exporter"}+node_memory_SReclaimable_bytes{job="consul_sd_node_exporter"}))/node_memory_MemTotal_bytes{job="consul_sd_node_exporter"}) * 100 > 95
    for: 15m
    labels:
      team: mem
      organization: ops
#    labels:
#      severity: page
    annotations:
      summary: "{{ $labels.host }} MEM usage high"
      description: "{{ $labels.host }} MEM usage above 95% (current value: {{ $value }})"

  # A node_exporter target has failed its scrape (up == 0) for 5 minutes.
  - alert: NodeDown
    expr: up{job="consul_sd_node_exporter"} == 0
    for: 5m
    # These labels can be used in Alertmanager `match` rules to route/group the alert.
    labels:
      organization: ops
      team: down
#    labels:
#      team: node
    annotations:
      description: "{{ $labels.host}} has been down for 5m"

  # Filesystem usage above 80% for 10 minutes, ignoring pseudo/virtual filesystems and /boot.
  # The message previously said 90%, which contradicted the 80 threshold in the expression.
  - alert: disk_utilization_rate
    expr: 100 - ((node_filesystem_avail_bytes{business=~'bzl|aws|hy',fstype!~"rootfs|selinuxfs|autofs|rpc_pipefs|tmpfs|fuse.lxcfs|iso9660",mountpoint!~"/boot",job="consul_sd_node_exporter"} * 100) / node_filesystem_size_bytes {business=~'bzl|aws|hy',fstype!~"rootfs|selinuxfs|autofs|rpc_pipefs|tmpfs|fuse.lxcfs|iso9660",mountpoint!~"/boot",job="consul_sd_node_exporter"}) > 80
    for: 10m
    labels:
      team: disk
      organization: ops
    annotations:
      description: '{{$labels.host}} {{$labels.mountpoint}} exceeds 80% --> {{$value}}'

  # Receive rate above 1000 Mbit/s for 5 minutes. The expression multiplies bytes/s
  # by 8 and divides by 1024*1024, so the value is in megabits, not megabytes.
  # NOTE(review): the device matcher is a fully anchored regex, not a glob:
  # `virbr*`, `lo*`, `eth*`, `eno*` match e.g. `lo` but NOT `virbr0`, `eth0` or `eno1`.
  # Confirm whether `virbr.*|lo|eth.*|eno.*` was intended before changing it, since
  # that would stop alerting on those interfaces.
  - alert: network_receive_bytes
    expr: irate(node_network_receive_bytes_total{business=~'bzl|aws|hy',device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*|eth*|eno*',job="consul_sd_node_exporter"}[15m])*8/1024/1024 > 1000
    for: 5m
    labels:
      team: node
      organization: ops
    annotations:
      description: '{{$labels.host}} network receive bytes reach to {{$value}} Mbit/s'

  # Transmit-side counterpart of the rule above; same unit and same regex caveat.
  - alert: network_transmit_bytes
    expr: irate(node_network_transmit_bytes_total{business=~'bzl|aws|hy',device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*|eth*|eno*',job="consul_sd_node_exporter"}[15m])*8/1024/1024 > 1000
    for: 5m
    labels:
      team: node
      organization: ops
    annotations:
      description: '{{$labels.host}} network transmit bytes reach to {{$value}} Mbit/s'

  # Receive frame-error counter grew by 5 or more compared with its minimum over the last hour.
  # NOTE(review): `virbr*` and `lo*` are anchored regexes, not globs, so they match
  # `lo` but not `virbr0` - confirm whether `virbr.*` was intended.
  - alert: 'frame-alert'
    expr: node_network_receive_frame_total{device!~"tap.*|veth.*|br.*|docker.*|virbr*|lo*"} - min_over_time(node_network_receive_frame_total{device!~"tap.*|veth.*|br.*|docker.*|virbr*|lo*"}[1h]) >= 5
    for: 10s
    labels:
      organization: ops
      team: node
    annotations:
      description: "{{$labels.host}} network frame growth {{$value}}"

  # Fires on `time_zone` series whose tz label is not CST and whose value is 0.
  # NOTE(review): `time_zone` is not defined anywhere in this document - presumably
  # a custom metric; verify its value semantics against whatever exports it.
  - alert: 'time_zone-alert'
    expr: time_zone{tz!="CST"} == 0
    for: 10s
    labels:
      organization: ops
      team: node
    annotations:
      description: "{{$labels.host}} time zone is not CST"

#  - alert: doris_node-alert
#    expr: sum(up{job=~"doris_job|davinci_monitor"}) by (group,job,instance) < 1
#    for: 10s
#    labels:
#      team: node
#      organization: bigdata
#    annotations:
#      description: '{{$labels.job}} {{$labels.group}} 的节点{{$labels.instance}} 发生异常!!!!'


# RocketMQ consumer-lag rules.
# NOTE(review): this is its own `groups:` document. Keep it in a separate file
# under the rules directory, or merge its entries under a single `groups:` key;
# a YAML mapping cannot repeat the same key.
groups:
  - name: rocketmq-delay
    rules:
      # Backlog of more than 2000 messages on machine_work_log for 20s.
      # `severity: bigdata` is the label the Alertmanager route matches on.
      # NOTE(review): the rule label `group` replaces the metric's own `group`
      # label on the alert that is sent out - confirm that is intended.
      - alert: 'rocketmq-delay warning1'
        expr: rocketmq_group_diff{topic="machine_work_log",group="machine_work_log_roketmq2doris"} >2000
        for: 20s
        labels:
          severity: bigdata
          group: bigdata
        annotations:
          description: 'Topic：{{$labels.topic}} , Group：{{$labels.group}} 出现积压，积压量为：{{$value}}'

      # Backlog of more than 10 messages on pro_canal_producer for 20s.
      - alert: 'rocketmq-delay warning2'
        expr: rocketmq_group_diff{topic="pro_canal_producer",group="pro_canal_producer_roketmq2doris"} >10
        for: 20s
        labels:
          severity: bigdata
          group: bigdata
        annotations:
          description: 'Topic：{{$labels.topic}} , Group：{{$labels.group}} 出现积压，积压量为：{{$value}}'


# Doris availability rules, driven by the `group` label set on the doris_job targets.
# NOTE(review): this is its own `groups:` document. Keep it in a separate file
# under the rules directory, or merge its entries under a single `groups:` key;
# a YAML mapping cannot repeat the same key.
groups:
  - name: doris_instance_down
    rules:
      # A backend (group="be") target has been unreachable for 20s.
      - alert: 'Doris Backends Down'
        expr: up {group="be", job="doris_job"} == 0
        for: 20s
        labels:
          severity: bigdata
          user: doris
        annotations:
          summary: 'doris Instance {{ $labels.instance }} down'
          description: 'doris {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 20s.'

      # A frontend (group="fe") target has been unreachable for 20s.
      - alert: 'Doris Frontends Down'
        expr: up {group="fe", job="doris_job"} == 0
        for: 20s
        labels:
          severity: bigdata
          user: doris
        annotations:
          summary: 'doris Instance {{ $labels.instance }} down'
          description: 'doris {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 20s.'

alertmanager配置

global:
  # Resolve timeout (default 5m).
  resolve_timeout: 5m
  # SMTP relay used for e-mail notifications (host:port).
  smtp_smarthost: 'smtp.139.com:465'
  # Sender address.
  smtp_from: 'xxxxxx@139.com'
  # SMTP login name.
  smtp_auth_username: 'xxxxx@139.com'
  # Mailbox password or authorization code.
  smtp_auth_password: 'xxxxxxxxx'
  smtp_require_tls: false

# Notification template files.
# NOTE(review): the supervisor unit runs Alertmanager from /data/moni/alertmanager;
# confirm that /opt/alertmanager/templates is really where the templates live.
templates:
  - "/opt/alertmanager/templates/*"

# Routing tree.
route:
  # Default receiver; must be one of the names defined under `receivers`.
  receiver: 'ops'
  # Labels by which alerts are grouped into one notification.
  group_by:
    - 'alertname'
  # How long to wait before sending the first notification for a new group.
  group_wait: 1s
  # How long to wait before notifying about new alerts added to a group.
  group_interval: 10s
  # How often a still-firing alert is re-sent. With e-mail, do not set this too
  # low: the SMTP server may reject mail that is sent too frequently.
  repeat_interval: 1h
  routes:
    # Alerts labelled severity=bigdata go to the `bigdata` receiver.
    - receiver: 'bigdata'
      group_wait: 10s
      match_re:
        severity: bigdata

 
# Receivers. Both currently post to the same local DingTalk webhook target (`webhook1`).
receivers:
  # Default receiver for ops alerts. The commented-out section is the e-mail
  # alternative (`to` = recipients of the alert mail).
  - name: 'ops'
#    email_configs:
#    - to: 'chenyuhua37@countrygarden.com.cn,luminfeng01@countrygarden.com.cn,liangwenjun04@countrygarden.com.cn,maiqixian@countrygarden.com.cn,15121486557@163.com'
#      send_resolved: true
#      headers:
#        subject: "AWS-Prometheus报警邮件"
#        from : "Prometheus监控告警"
#        to: "You,Please pay attention!!!"
    webhook_configs:
      - send_resolved: true
        url: 'http://localhost:8860/dingtalk/webhook1/send'
  # Receiver for alerts routed by severity=bigdata.
  - name: 'bigdata'
    webhook_configs:
#    - url: 'http://localhost:18060/dingtalk/webhook_bigdata/send'
      - send_resolved: true
        url: 'http://localhost:8860/dingtalk/webhook1/send'

# An inhibition rule mutes alerts matching one set of matchers (target) while an
# alert matching another set (source) is firing. Both alerts must carry the same
# values for the labels listed under `equal`.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal:
      - 'alertname'
      - 'dev'
      - 'instance'

钉钉配置

# Message template file(s) for DingTalk notifications.
templates:
  - dingding.tmpl

# Webhook targets; the key is the <name> in /dingtalk/<name>/send.
# Alertmanager above posts only to `webhook1`.
# NOTE(review): the access tokens below look like real credentials - rotate them
# and keep them out of published material if they are live.
targets:
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=8cd8f7502c148fa63a7ad9e424731f83022841eab80c81d476e8000f36d5143a
    # Secret used to sign requests.
    secret: SEC000000000000000000000
  webhook2:
    url: https://oapi.dingtalk.com/robot/send?access_token=8cd8f7502c148fa63a7ad9e424731f83022841eab80c81d476e8000f36d5143a
  webhook_legacy:
    url: https://oapi.dingtalk.com/robot/send?access_token=8cd8f7502c148fa63a7ad9e424731f83022841eab80c81d476e8000f36d5143a
  # NOTE(review): key is spelled `bigdta_webhook`; the commented-out Alertmanager
  # URL refers to `webhook_bigdata` - confirm the intended name before using it.
  bigdta_webhook:
    url: https://oapi.dingtalk.com/robot/send?access_token=8cd8f7502c148fa63a7ad9e424731f83022841eab80c81d476e8000f36d5143a

本文链接： https://biglovewheat.github.io/2023/08/14/prometheus-安装配置/

版权声明： 本博客所有文章除特别声明外，均采用 CC BY 4.0 CN 许可协议。转载请注明出处！

biglovewheat老年佛系运维

老年佛系运维 | biglovewheat@126.com