Alerts

InstanceDown (1 active)
# Matches every `up` series of job "node" whose current value is 0.
# The condition has to hold for 5 minutes before the alert fires.
alert: InstanceDown
expr: 'up{job="node"} == 0'
for: 5m
labels:
  severity: warn
annotations:
  # Quoted so the Go-template braces are never read as YAML flow syntax.
  summary: 'Project {{ $labels.project }} instance {{ $labels.instance }} is down'
Labels State Active Since Value
alertname="InstanceDown" instance="toolsbeta-sgewebgrid-generic-0901" job="node" project="toolsbeta" severity="warn" firing 2021-10-04 14:45:33.571893423 +0000 UTC 0
PuppetAgentDisabled (1 active)
# Matches every puppet_agent_enabled series whose value differs from 1.
# The condition has to hold for 15 minutes before the alert fires.
alert: PuppetAgentDisabled
expr: puppet_agent_enabled != 1
for: 15m
labels:
  severity: warn
annotations:
  # Folded block scalar: the two lines join with a single space and the
  # trailing newline is stripped, so each template action stays on one line.
  summary: >-
    Puppet agent disabled on instance {{ $labels.instance }}
    in project {{ $labels.project }}
Labels State Active Since Value
alertname="PuppetAgentDisabled" instance="deployment-elastic07" job="node" project="deployment-prep" severity="warn" firing 2021-09-15 16:28:33.571893423 +0000 UTC 0
PuppetAgentFailure (2 active)
# Matches every puppet_agent_failed series whose value is non-zero.
# The condition has to hold for 15 minutes before the alert fires.
alert: PuppetAgentFailure
expr: puppet_agent_failed != 0
for: 15m
labels:
  severity: warn
annotations:
  # Folded block scalar: lines join with a single space, no trailing newline.
  summary: >-
    Puppet agent failure detected on instance {{ $labels.instance }}
    in project {{ $labels.project }}
Labels State Active Since Value
alertname="PuppetAgentFailure" instance="deployment-maps08" job="node" project="deployment-prep" severity="warn" firing 2021-09-30 10:43:33.571893423 +0000 UTC 1
alertname="PuppetAgentFailure" instance="deployment-cache-upload06" job="node" project="deployment-prep" severity="warn" firing 2021-10-14 13:35:33.571893423 +0000 UTC 1
PuppetAgentStaleLastRun (5 active)
# Compares the current evaluation time with puppet_agent_last_run and matches
# when the difference exceeds 86400 (24 * 60 * 60, i.e. the "24 hours" named
# in the summary). The condition has to hold for 15 minutes before firing.
alert: PuppetAgentStaleLastRun
expr: 'time() - puppet_agent_last_run > 86400'
for: 15m
labels:
  severity: warn
annotations:
  # Folded block scalar: lines join with a single space, no trailing newline.
  summary: >-
    Last Puppet run was over 24 hours ago on instance {{ $labels.instance }}
    in project {{ $labels.project }}
Labels State Active Since Value
alertname="PuppetAgentStaleLastRun" instance="deployment-logstash04" job="node" project="deployment-prep" severity="warn" firing 2021-09-15 10:34:33.571893423 +0000 UTC 4.202325571000099e+06
alertname="PuppetAgentStaleLastRun" instance="deployment-logstash06" job="node" project="deployment-prep" severity="warn" firing 2021-09-25 03:15:33.571893423 +0000 UTC 2.135076571000099e+06
alertname="PuppetAgentStaleLastRun" instance="deployment-elastic07" job="node" project="deployment-prep" severity="warn" firing 2021-09-16 16:00:33.571893423 +0000 UTC 2.866756571000099e+06
alertname="PuppetAgentStaleLastRun" instance="deployment-logstash05" job="node" project="deployment-prep" severity="warn" firing 2021-09-02 12:49:33.571893423 +0000 UTC 4.087843571000099e+06
alertname="PuppetAgentStaleLastRun" instance="deployment-kafka-jumbo-1" job="node" project="deployment-prep" severity="warn" firing 2021-10-11 07:28:33.571893423 +0000 UTC 1.9968415710000992e+06
PrometheusReloadFailed (0 active)
# Matches every prometheus_config_last_reload_successful series whose value
# is 0. The condition has to hold for 1 hour before the alert fires.
alert: PrometheusReloadFailed
expr: prometheus_config_last_reload_successful == 0
for: 1h
labels:
  severity: warn
annotations:
  # Quoted so the Go-template braces are never read as YAML flow syntax.
  summary: 'Prometheus {{$labels.instance}} config reload fail'
PuppetSyncFailure (0 active)
# Matches every puppet_sync_upstream_rebase_success series whose value
# differs from 1. The condition has to hold for 5 minutes before firing.
alert: PuppetSyncFailure
expr: puppet_sync_upstream_rebase_success != 1
for: 5m
labels:
  severity: warn
annotations:
  # Folded block scalar: lines join with a single space, no trailing newline.
  summary: >-
    Failed to update Puppet repository {{ $labels.repository }}
    on instance {{ $labels.instance }}
    in project {{ $labels.project }}
ToolsGridQueueProblem (0 active)
# Selects sge_queueproblems series with project="tools" whose `state` label
# contains a lower- or upper-case "e". The condition has to hold for
# 30 minutes before the alert fires.
# NOTE(review): there is no comparison on the sample value, so any matching
# series triggers regardless of its value - confirm that is intended.
alert: ToolsGridQueueProblem
expr: 'sge_queueproblems{project="tools",state=~".*(e|E).*"}'
for: 30m
labels:
  project: tools
  severity: warn
annotations:
  # Folded block scalar keeps every template action intact on one line.
  summary: >-
    Grid queue {{ $labels.queue }}@{{ $labels.host }}
    is in state {{ $labels.state }}
WidespreadInstanceDown (0 active)
# Per project: (number of job "node" targets with up == 0) divided by
# (number of job "node" targets), times 100. Matches when that percentage
# is at least 15. The condition has to hold for 5 minutes before firing.
alert: WidespreadInstanceDown
# Folded block scalar: the three lines join with single spaces into one
# PromQL expression, no trailing newline.
expr: >-
  count by(project) (up{job="node"} == 0)
  / count by(project) (up{job="node"})
  * 100 >= 15
for: 5m
labels:
  severity: crit
annotations:
  summary: 'Widespread instances down in project {{ $labels.project }}'
WidespreadPuppetAgentFailure (0 active)
# Per project: sum of puppet_agent_failed divided by the number of
# puppet_agent_failed series, times 100. Matches when the result is at
# least 15. The condition has to hold for 15 minutes before firing.
# NOTE(review): this reads as a percentage only if puppet_agent_failed is
# always 0 or 1 - confirm against the exporter.
alert: WidespreadPuppetAgentFailure
# Folded block scalar: the three lines join with single spaces into one
# PromQL expression, no trailing newline.
expr: >-
  sum by(project) (puppet_agent_failed)
  / count by(project) (puppet_agent_failed)
  * 100 >= 15
for: 15m
labels:
  severity: crit
annotations:
  summary: 'Widespread puppet agent failures in project {{ $labels.project }}'