Files
thrilltrack-explorer/monitoring/loki-alerts.yml
2025-10-30 15:54:32 +00:00

167 lines
6.1 KiB
YAML

# Grafana Loki Alert Rules for Playwright Tests
# Deploy this to AlertManager or Grafana Cloud
# Top-level list of rule groups; this file defines a single group.
# NOTE(review): nesting indentation appears to have been lost in this copy;
# name / interval / rules are presumably keys of the one group entry, and every
# "- alert:" entry below is an item of that group's "rules" list - confirm.
groups:
# Group identifier for all Playwright test alerts.
- name: playwright_test_alerts
# How often the rules in this group are evaluated.
interval: 1m
# Alerting rules follow, one "- alert:" entry per rule.
rules:
# Critical: tests are finishing, but none of them passed.
#
# A LogQL range aggregation returns no series at all (not a 0 sample) when no
# log line matches its selector, so testing the "passed" series with `== 0`
# can never be true. `unless` states the intent directly: keep the count of
# finished tests only when there is no positive count of passed tests.
# count_over_time is used instead of rate so that $value is the number of
# finished tests in the window, which is what the description reports.
- alert: AllPlaywrightTestsFailing
  expr: |
    sum(count_over_time({job="playwright-tests", event="test_end"}[15m])) > 0
    unless
    sum(count_over_time({job="playwright-tests", status="passed"}[15m])) > 0
  # The condition must hold for 5 consecutive minutes before the alert fires.
  for: 5m
  labels:
    severity: critical
    team: qa
    component: playwright
  annotations:
    summary: "All Playwright tests are failing"
    description: "No passing tests detected in the last 15 minutes. Test count: {{ $value }}"
    runbook_url: "https://wiki.internal/runbooks/playwright-all-tests-failing"
    dashboard_url: "https://grafana.internal/d/playwright-dashboard"
# Warning: more than 20% of finished tests failed.
# Ratio of the rate of status="failed" lines to the rate of event="test_end"
# lines over a 30-minute window; must stay above 0.20 for 10 minutes.
- alert: HighPlaywrightFailureRate
  expr: |
    (
    sum(rate({job="playwright-tests", status="failed"}[30m]))
    /
    sum(rate({job="playwright-tests", event="test_end"}[30m]))
    ) > 0.20
  for: 10m
  labels:
    severity: warning
    team: qa
    component: playwright
  annotations:
    summary: 'High Playwright test failure rate detected'
    # $value is the failed/finished ratio, rendered as a percentage.
    description: '{{ $value | humanizePercentage }} of tests are failing over the last 30 minutes'
    runbook_url: 'https://wiki.internal/runbooks/playwright-high-failure-rate'
# Warning: one browser is failing noticeably more often than expected.
# Same failed/finished ratio as the overall failure-rate rule, but computed
# per value of the "browser" label, with a 30% threshold held for 10 minutes.
- alert: BrowserSpecificFailures
  expr: |
    (
    sum by (browser) (rate({job="playwright-tests", status="failed"}[30m]))
    /
    sum by (browser) (rate({job="playwright-tests", event="test_end"}[30m]))
    ) > 0.30
  for: 10m
  labels:
    severity: warning
    team: qa
    component: playwright
  annotations:
    # $labels.browser comes from the "by (browser)" grouping in the expression.
    summary: 'High failure rate in {{ $labels.browser }}'
    description: '{{ $labels.browser }} browser has {{ $value | humanizePercentage }} failure rate'
# Warning: slow test execution - P95 of individual test durations above 5 minutes.
#
# Notes on the expression:
# - `| __error__=""` drops lines where JSON parsing or the duration_ms
#   conversion failed, so a malformed line cannot break the whole query.
# - `by (job)` collapses the labels extracted by `json`; without it a separate
#   P95 is computed for every distinct combination of extracted fields.
# - `/ 1000` converts milliseconds to seconds, because the humanizeDuration
#   template function used in the description interprets $value as seconds.
#   The threshold is therefore 300 (seconds) instead of 300000 (milliseconds).
- alert: SlowPlaywrightTests
  expr: |
    quantile_over_time(0.95,
      {job="playwright-tests", event="test_end"}
        | json
        | __error__=""
        | unwrap duration_ms
      [30m]
    ) by (job) / 1000 > 300
  for: 15m
  labels:
    severity: warning
    team: qa
    component: playwright
  annotations:
    summary: "Playwright tests are running slowly"
    description: "P95 test duration is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
    runbook_url: "https://wiki.internal/runbooks/playwright-slow-tests"
# Warning: a test suite run took longer than 1 hour.
#
# `unwrap` is only valid inside a range aggregation, and an alerting rule needs
# a metric query, so the longest reported suite duration is taken with
# max_over_time. The 5-minute window spans several evaluations of the group, so
# a single test_suite_end line is not missed between evaluations.
# `| __error__=""` skips lines whose JSON / duration_ms could not be parsed.
# `/ 1000` converts milliseconds to seconds for humanizeDuration; the threshold
# is therefore 3600 (seconds) instead of 3600000 (milliseconds).
# There is intentionally no `for:` clause: one over-long suite fires the alert.
- alert: PlaywrightSuiteTimeout
  expr: |
    max_over_time(
      {job="playwright-tests", event="test_suite_end"}
        | json
        | __error__=""
        | unwrap duration_ms
      [5m]
    ) by (job) / 1000 > 3600
  labels:
    severity: warning
    team: qa
    component: playwright
  annotations:
    summary: "Playwright test suite exceeded 1 hour"
    description: "Test suite took {{ $value | humanizeDuration }} to complete"
# Info: no event="test_start" log line has been seen for 2 hours.
# NOTE(review): the expression has no time-of-day condition, so it is evaluated
# around the clock; if this should only notify during business hours, that
# restriction presumably has to be applied in alert routing - confirm.
- alert: NoPlaywrightTestsRunning
  expr: |
    absent_over_time({job="playwright-tests", event="test_start"}[2h])
  # Must remain absent for 5 more minutes before the alert fires.
  for: 5m
  labels:
    severity: info
    team: qa
    component: playwright
  annotations:
    summary: 'No Playwright tests have run recently'
    description: 'No test executions detected in the last 2 hours. CI/CD pipeline may be broken.'
    runbook_url: 'https://wiki.internal/runbooks/playwright-no-tests'
# Warning: a single test failed on its first retry more than 3 times in an hour.
#
# A vector aggregation such as `count by (...)` cannot be applied directly to a
# log stream selector; the log lines first have to be turned into samples by a
# range aggregation. count_over_time over a 1-hour window yields the number of
# matching lines (the "times ... in the last hour" of the description), and
# `sum by (test_name)` adds them up per test. test_name is taken from the
# fields extracted by `| json`.
- alert: FlakyPlaywrightTest
  expr: |
    sum by (test_name) (
      count_over_time(
        {job="playwright-tests", status="failed", retry="1"} | json [1h]
      )
    ) > 3
  # NOTE(review): together with the 1h window above, this means the count must
  # stay above the threshold for a further hour before the alert fires - confirm
  # that this delay is intended.
  for: 1h
  labels:
    severity: warning
    team: qa
    component: playwright
  annotations:
    summary: "Flaky test detected: {{ $labels.test_name }}"
    description: "Test '{{ $labels.test_name }}' has failed {{ $value }} times on retry in the last hour"
    runbook_url: "https://wiki.internal/runbooks/playwright-flaky-tests"
# Critical: the job is producing log lines, but no test suite has started.
#
# Two problems are avoided here:
# - count_over_time returns no series (not 0) when nothing matches, so an
#   `== 0` comparison on the test_suite_start count can never be true.
# - `and` between un-aggregated results pairs series by their full label set,
#   so streams with different labels never match each other.
# Both sides are therefore summed to a single label-less series and combined
# with `unless`: "some activity in 30m, unless a suite start was seen in 30m".
# $value is the number of log lines the job produced in the window.
- alert: PlaywrightInfrastructureFailure
  expr: |
    sum(count_over_time({job="playwright-tests"}[30m])) > 0
    unless
    sum(count_over_time({job="playwright-tests", event="test_suite_start"}[30m])) > 0
  for: 5m
  labels:
    severity: critical
    team: devops
    component: playwright
  annotations:
    summary: "Playwright test infrastructure may be failing"
    description: "Tests are attempting to run but test suite is not starting properly"
    runbook_url: "https://wiki.internal/runbooks/playwright-infrastructure"
# Warning: more than 15% of finished tests were retries.
#
# The numerator is restricted to event="test_end" so that it is a subset of the
# denominator; otherwise every other event carrying a retry label would be
# counted against finished tests only and inflate the ratio.
# `retry!=""` is added because a bare `retry!="0"` also matches streams that
# have no retry label at all (a missing label compares as the empty string).
- alert: HighPlaywrightRetryRate
  expr: |
    (
      sum(rate({job="playwright-tests", event="test_end", retry!="0", retry!=""}[30m]))
      /
      sum(rate({job="playwright-tests", event="test_end"}[30m]))
    ) > 0.15
  for: 10m
  labels:
    severity: warning
    team: qa
    component: playwright
  annotations:
    summary: "High test retry rate detected"
    description: "{{ $value | humanizePercentage }} of tests are being retried"
# Info: average test duration over the last hour is more than 50% above the
# average of the 24 hours that ended one hour ago.
#
# Notes on the expression:
# - `by (job)` on both sides collapses the labels extracted by `json`, so the
#   division compares one overall average with another instead of trying to
#   pair up series per combination of extracted fields.
# - `| __error__=""` skips lines whose JSON / duration_ms could not be parsed.
# - `- 1` turns the ratio into the fractional increase, so that the description
#   ("increased by ...") shows e.g. 60% rather than 160%. The threshold 0.5 is
#   equivalent to the previous "ratio > 1.5".
- alert: PlaywrightDurationIncreasing
  expr: |
    (
      avg_over_time(
        {job="playwright-tests", event="test_end"}
          | json | __error__="" | unwrap duration_ms [1h]
      ) by (job)
      /
      avg_over_time(
        {job="playwright-tests", event="test_end"}
          | json | __error__="" | unwrap duration_ms [24h] offset 1h
      ) by (job)
    ) - 1 > 0.5
  for: 30m
  labels:
    severity: info
    team: qa
    component: playwright
  annotations:
    summary: "Playwright test duration is increasing"
    description: "Average test duration has increased by {{ $value | humanizePercentage }} compared to previous day"