# thrilltrack-explorer/monitoring/loki-alerts.yml

# Grafana Loki Alert Rules for Playwright Tests
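# Load this into the Loki ruler (self-hosted) or Grafana Cloud's hosted ruler;
# the ruler evaluates the LogQL expressions and forwards firing alerts to Alertmanager.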

# Rule groups in the Prometheus-compatible rule-file layout.
groups:
  # Group holding the Playwright test alert rules listed under `rules`.
  - name: playwright_test_alerts
    # How often the rules in this group are evaluated.
    interval: 1m
    rules:
      # Critical: tests are completing but none of them pass.
      #
      # LogQL returns an empty vector (not 0) when a selector matches no log
      # lines, so "no passing results" cannot be detected with `== 0`.
      # Instead, start from the number of completed tests and discard it with
      # `unless` whenever at least one passing line exists in the same window.
      # $value is therefore the count of event="test_end" lines in 15m.
      - alert: AllPlaywrightTestsFailing
        expr: |
          sum(count_over_time({job="playwright-tests", event="test_end"}[15m])) > 0
          unless
          sum(count_over_time({job="playwright-tests", status="passed"}[15m])) > 0
        for: 5m
        labels:
          severity: critical
          team: qa
          component: playwright
        annotations:
          summary: "All Playwright tests are failing"
          description: "No passing tests detected in the last 15 minutes. Test count: {{ $value }}"
          runbook_url: "https://wiki.internal/runbooks/playwright-all-tests-failing"
          dashboard_url: "https://grafana.internal/d/playwright-dashboard"

      # Warning: overall failure ratio is too high.
      # Divides the 30m rate of status="failed" lines by the 30m rate of
      # event="test_end" lines; fires once the ratio stays above 0.20 for 10m.
      - alert: HighPlaywrightFailureRate
        for: 10m
        expr: |
          (
            sum(rate({job="playwright-tests", status="failed"}[30m]))
            /
            sum(rate({job="playwright-tests", event="test_end"}[30m]))
          ) > 0.20
        annotations:
          summary: 'High Playwright test failure rate detected'
          description: '{{ $value | humanizePercentage }} of tests are failing over the last 30 minutes'
          runbook_url: 'https://wiki.internal/runbooks/playwright-high-failure-rate'
        labels:
          component: playwright
          severity: warning
          team: qa

      # Warning: one browser is failing noticeably more than expected.
      # Same failed / test_end ratio as the overall failure-rate rule, but
      # computed per `browser` label; fires above 0.30 sustained for 10m.
      # One alert instance is produced per browser value.
      - alert: BrowserSpecificFailures
        for: 10m
        expr: |
          (
            sum by (browser) (rate({job="playwright-tests", status="failed"}[30m]))
            /
            sum by (browser) (rate({job="playwright-tests", event="test_end"}[30m]))
          ) > 0.30
        annotations:
          summary: 'High failure rate in {{ $labels.browser }}'
          description: '{{ $labels.browser }} browser has {{ $value | humanizePercentage }} failure rate'
        labels:
          component: playwright
          severity: warning
          team: qa

      # Warning: slow test execution (P95 above 5 minutes over 30m).
      #
      # duration_ms is unwrapped in milliseconds; dividing by 1000 turns
      # $value into seconds, which is the unit humanizeDuration expects, and
      # the threshold becomes 300 s (= 5 min).
      # `by (job)` collapses the labels that `| json` extracts from each line,
      # so the quantile is taken across all test_end lines instead of once per
      # distinct extracted label combination.
      - alert: SlowPlaywrightTests
        expr: |
          (
            quantile_over_time(0.95,
              {job="playwright-tests", event="test_end"} | json | unwrap duration_ms
            [30m]) by (job)
            / 1000
          ) > 300
        for: 15m
        labels:
          severity: warning
          team: qa
          component: playwright
        annotations:
          summary: "Playwright tests are running slowly"
          description: "P95 test duration is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
          runbook_url: "https://wiki.internal/runbooks/playwright-slow-tests"

      # Warning: a test suite took longer than 1 hour.
      #
      # `unwrap` is only valid inside a range aggregation, and the ruler needs
      # a metric query, so the longest suite duration reported in the last 5m
      # is taken with max_over_time. The 5m window is wider than the 1m
      # evaluation interval so a late-ingested line is not missed; the alert
      # stays active for roughly that long after a slow suite finishes.
      # Dividing by 1000 converts milliseconds to seconds for humanizeDuration
      # (threshold 3600 s = 1 h).
      - alert: PlaywrightSuiteTimeout
        expr: |
          (
            max_over_time(
              {job="playwright-tests", event="test_suite_end"} | json | unwrap duration_ms [5m]
            ) by (job)
            / 1000
          ) > 3600
        labels:
          severity: warning
          team: qa
          component: playwright
        annotations:
          summary: "Playwright test suite exceeded 1 hour"
          description: "Test suite took {{ $value | humanizeDuration }} to complete"

      # Info: no event="test_start" line has been logged for 2 hours.
      # NOTE(review): the expression has no time-of-day condition, so this
      # fires at any hour, including nights and weekends - restrict it via
      # Alertmanager routing/mute timings if business hours only is intended.
      - alert: NoPlaywrightTestsRunning
        for: 5m
        expr: |
          absent_over_time({job="playwright-tests", event="test_start"}[2h])
        annotations:
          summary: 'No Playwright tests have run recently'
          description: 'No test executions detected in the last 2 hours. CI/CD pipeline may be broken.'
          runbook_url: 'https://wiki.internal/runbooks/playwright-no-tests'
        labels:
          component: playwright
          severity: info
          team: qa

      # Warning: a test keeps failing on its first retry.
      #
      # A vector aggregation cannot be applied directly to a log pipeline;
      # the lines must first be turned into samples with count_over_time over
      # an explicit range. The 1h range matches the "in the last hour" wording
      # of the description. test_name is extracted from the line by `| json`,
      # and one alert instance is produced per test_name.
      # NOTE(review): with a 1h range, `for: 1h` means the count must stay
      # above 3 for a further hour before firing - confirm that delay is wanted.
      - alert: FlakyPlaywrightTest
        expr: |
          sum by (test_name) (
            count_over_time({job="playwright-tests", status="failed", retry="1"} | json [1h])
          ) > 3
        for: 1h
        labels:
          severity: warning
          team: qa
          component: playwright
        annotations:
          summary: "Flaky test detected: {{ $labels.test_name }}"
          description: "Test '{{ $labels.test_name }}' has failed {{ $value }} times on retry in the last hour"
          runbook_url: "https://wiki.internal/runbooks/playwright-flaky-tests"

      # Critical: the job is logging, but no test suite has started in 30m.
      #
      # count_over_time returns no series (rather than 0) when nothing
      # matches, so `== 0` can never be true. The two operands also need the
      # same label set for set operators to pair them up, hence both sides are
      # reduced with sum(). The rule keeps the "any logs" count and drops it
      # with `unless` when at least one test_suite_start line exists.
      - alert: PlaywrightInfrastructureFailure
        expr: |
          sum(count_over_time({job="playwright-tests"}[30m])) > 0
          unless
          sum(count_over_time({job="playwright-tests", event="test_suite_start"}[30m])) > 0
        for: 5m
        labels:
          severity: critical
          team: devops
          component: playwright
        annotations:
          summary: "Playwright test infrastructure may be failing"
          description: "Tests are attempting to run but test suite is not starting properly"
          runbook_url: "https://wiki.internal/runbooks/playwright-infrastructure"

      # Warning: more than 15% of completed tests were retries.
      #
      # Numerator and denominator are both restricted to event="test_end" so
      # the ratio compares like with like. `retry!=""` is required because a
      # negative matcher such as retry!="0" also matches streams that carry no
      # `retry` label at all, which would count them as retries.
      - alert: HighPlaywrightRetryRate
        expr: |
          (
            sum(rate({job="playwright-tests", event="test_end", retry!="", retry!="0"}[30m]))
            /
            sum(rate({job="playwright-tests", event="test_end"}[30m]))
          ) > 0.15
        for: 10m
        labels:
          severity: warning
          team: qa
          component: playwright
        annotations:
          summary: "High test retry rate detected"
          description: "{{ $value | humanizePercentage }} of tests are being retried"

      # Info: average test duration is more than 50% above its baseline.
      #
      # Compares the last hour's average duration_ms with the average over the
      # 24h window that ended one hour ago. Both sides are grouped `by (job)`
      # so the division pairs a single series on each side instead of
      # requiring identical `| json`-extracted label sets.
      # Subtracting 1 makes $value the relative increase (0.6 -> "60%"),
      # which is what the description reports; the condition is equivalent to
      # the ratio exceeding 1.5.
      - alert: PlaywrightDurationIncreasing
        expr: |
          (
            avg_over_time({job="playwright-tests", event="test_end"} | json | unwrap duration_ms [1h]) by (job)
            /
            avg_over_time({job="playwright-tests", event="test_end"} | json | unwrap duration_ms [24h] offset 1h) by (job)
          ) - 1 > 0.5
        for: 30m
        labels:
          severity: info
          team: qa
          component: playwright
        annotations:
          summary: "Playwright test duration is increasing"
          description: "Average test duration has increased by {{ $value | humanizePercentage }} compared to previous day"