feat: Integrate Grafana Loki

2026-02-05 08:05:15 -05:00 · 2025-10-30 15:54:32 +00:00
parent 8ac61e01e3
commit 72a7cb7f7c
9 changed files with 1261 additions and 2 deletions
--- a/monitoring/grafana-dashboard.json
+++ b/monitoring/grafana-dashboard.json
@@ -0,0 +1,266 @@
+{
+  "dashboard": {
+    "title": "Playwright Test Execution Dashboard",
+    "tags": ["playwright", "testing", "e2e"],
+    "timezone": "browser",
+    "refresh": "30s",
+    "time": {
+      "from": "now-24h",
+      "to": "now"
+    },
+    "panels": [
+      {
+        "id": 1,
+        "title": "Test Execution Overview",
+        "type": "stat",
+        "gridPos": { "x": 0, "y": 0, "w": 6, "h": 4 },
+        "targets": [
+          {
+            "expr": "sum(count_over_time({job=\"playwright-tests\", event=\"test_end\"}[$__range]))",
+            "legendFormat": "Total Tests"
+          }
+        ],
+        "options": {
+          "colorMode": "value",
+          "graphMode": "area",
+          "textMode": "auto"
+        }
+      },
+      {
+        "id": 2,
+        "title": "Pass Rate %",
+        "type": "stat",
+        "gridPos": { "x": 6, "y": 0, "w": 6, "h": 4 },
+        "targets": [
+          {
+            "expr": "(sum(count_over_time({job=\"playwright-tests\", status=\"passed\"}[$__range])) / sum(count_over_time({job=\"playwright-tests\", event=\"test_end\"}[$__range]))) * 100",
+            "legendFormat": "Pass Rate"
+          }
+        ],
+        "options": {
+          "colorMode": "value",
+          "graphMode": "area",
+          "textMode": "auto"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent",
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                { "value": 0, "color": "red" },
+                { "value": 80, "color": "yellow" },
+                { "value": 95, "color": "green" }
+              ]
+            }
+          }
+        }
+      },
+      {
+        "id": 3,
+        "title": "Failure Rate %",
+        "type": "stat",
+        "gridPos": { "x": 12, "y": 0, "w": 6, "h": 4 },
+        "targets": [
+          {
+            "expr": "(sum(count_over_time({job=\"playwright-tests\", status=\"failed\"}[$__range])) / sum(count_over_time({job=\"playwright-tests\", event=\"test_end\"}[$__range]))) * 100",
+            "legendFormat": "Failure Rate"
+          }
+        ],
+        "options": {
+          "colorMode": "value",
+          "graphMode": "area",
+          "textMode": "auto"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent",
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                { "value": 0, "color": "green" },
+                { "value": 5, "color": "yellow" },
+                { "value": 20, "color": "red" }
+              ]
+            }
+          }
+        }
+      },
+      {
+        "id": 4,
+        "title": "Avg Test Duration",
+        "type": "stat",
+        "gridPos": { "x": 18, "y": 0, "w": 6, "h": 4 },
+        "targets": [
+          {
+            "expr": "avg_over_time({job=\"playwright-tests\", event=\"test_end\"} | json | unwrap duration_ms | __error__=\"\" [$__range]) by (job)",
+            "legendFormat": "Avg Duration"
+          }
+        ],
+        "options": {
+          "colorMode": "value",
+          "graphMode": "area",
+          "textMode": "auto"
+        },
+        "fieldConfig": { "defaults": { "unit": "ms" } }
+      },
+      {
+        "id": 5,
+        "type": "timeseries",
+        "title": "Test Status Over Time",
+        "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
+        "fieldConfig": {
+          "overrides": [
+            {
+              "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }],
+              "matcher": { "id": "byName", "options": "passed" }
+            },
+            {
+              "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }],
+              "matcher": { "id": "byName", "options": "failed" }
+            },
+            {
+              "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }],
+              "matcher": { "id": "byName", "options": "skipped" }
+            }
+          ],
+          "defaults": {
+            "custom": {
+              "fillOpacity": 20,
+              "lineInterpolation": "smooth"
+            }
+          }
+        },
+        "targets": [
+          {
+            "legendFormat": "{{status}}",
+            "expr": "sum by (status) (count_over_time({job=\"playwright-tests\", event=\"test_end\"} | json [$__interval]))"
+          }
+        ]
+      },
+      {
+        "id": 6,
+        "type": "bargauge",
+        "title": "Browser Comparison",
+        "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
+        "options": {
+          "displayMode": "gradient",
+          "orientation": "horizontal"
+        },
+        "targets": [
+          {
+            "legendFormat": "{{browser}}",
+            "expr": "sum by (browser) (count_over_time({job=\"playwright-tests\", status=\"passed\"} [$__range]))"
+          }
+        ]
+      },
+      {
+        "id": 7,
+        "title": "Test Duration Distribution",
+        "type": "histogram",
+        "gridPos": { "x": 0, "y": 12, "w": 12, "h": 8 },
+        "targets": [
+          {
+            "expr": "avg_over_time({job=\"playwright-tests\", event=\"test_end\"} | json | unwrap duration_ms | __error__=\"\" [$__interval]) by (test_name)",
+            "legendFormat": "{{test_name}}"
+          }
+        ],
+        "options": {
+          "bucketOffset": 0,
+          "bucketSize": 1000
+        }
+      },
+      {
+        "id": 8,
+        "type": "bargauge",
+        "title": "Top 10 Failing Tests",
+        "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
+        "options": {
+          "showUnfilled": true,
+          "displayMode": "gradient",
+          "orientation": "horizontal"
+        },
+        "targets": [
+          {
+            "legendFormat": "{{test_name}}",
+            "expr": "topk(10, sum by (test_name) (count_over_time({job=\"playwright-tests\", status=\"failed\"} | json [$__range])))"
+          }
+        ]
+      },
+      {
+        "id": 9,
+        "type": "table",
+        "title": "Recent Test Runs",
+        "gridPos": { "h": 8, "w": 24, "x": 0, "y": 20 },
+        "options": {
+          "sortBy": [{ "desc": true, "displayName": "Time" }],
+          "showHeader": true
+        },
+        "targets": [
+          {
+            "legendFormat": "",
+            "expr": "{job=\"playwright-tests\", event=\"test_end\"} | json"
+          }
+        ],
+        "transformations": [
+          {
+            "id": "organize",
+            "options": {
+              "renameByName": {
+                "commit": "Commit",
+                "branch": "Branch",
+                "duration_ms": "Duration (ms)",
+                "status": "Status",
+                "browser": "Browser",
+                "test_file": "File",
+                "test_name": "Test Name"
+              },
+              "indexByName": {
+                "commit": 7,
+                "branch": 6,
+                "duration_ms": 5,
+                "status": 4,
+                "browser": 3,
+                "test_file": 2,
+                "test_name": 1,
+                "Time": 0
+              },
+              "excludeByName": {}
+            }
+          },
+          {
+            "id": "limit",
+            "options": {
+              "limitField": 20
+            }
+          }
+        ]
+      },
+      {
+        "id": 10,
+        "title": "Slowest Tests (P95)",
+        "type": "table",
+        "gridPos": { "x": 0, "y": 28, "w": 12, "h": 6 },
+        "targets": [
+          {
+            "expr": "topk(10, quantile_over_time(0.95, {job=\"playwright-tests\", event=\"test_end\"} | json | unwrap duration_ms | __error__=\"\" [$__range]) by (test_name))",
+            "legendFormat": "{{test_name}}"
+          }
+        ]
+      },
+      {
+        "id": 11,
+        "title": "Flaky Tests Detection",
+        "type": "table",
+        "gridPos": { "x": 12, "y": 28, "w": 12, "h": 6 },
+        "targets": [
+          {
+            "expr": "sum by (test_name) (count_over_time({job=\"playwright-tests\", status=\"failed\"} | json [$__range])) and sum by (test_name) (count_over_time({job=\"playwright-tests\", status=\"passed\"} | json [$__range]))",
+            "legendFormat": "{{test_name}}"
+          }
+        ],
+        "description": "Tests that have both passed and failed runs (potential flaky tests)"
+      }
+    ]
+  }
+}
--- a/monitoring/loki-alerts.yml
+++ b/monitoring/loki-alerts.yml
@@ -0,0 +1,166 @@
+# Grafana Loki alert rules for Playwright tests
+# Load this file into the Loki ruler (or Grafana Cloud alerting); Alertmanager only routes the alerts that fire
+
+groups:
+  - name: playwright_test_alerts
+    interval: 1m
+    rules:
+      # Critical: tests finish but none pass ("unless", because a selector with no matching lines yields no series, never 0)
+      - alert: AllPlaywrightTestsFailing
+        expr: |
+          sum(count_over_time({job="playwright-tests", event="test_end"}[15m])) > 0
+          unless
+          sum(count_over_time({job="playwright-tests", status="passed"}[15m])) > 0
+        for: 5m
+        labels:
+          severity: critical
+          team: qa
+          component: playwright
+        annotations:
+          summary: "All Playwright tests are failing"
+          description: "No passing tests detected in the last 15 minutes. Test count: {{ $value }}"
+          runbook_url: "https://wiki.internal/runbooks/playwright-all-tests-failing"
+          dashboard_url: "https://grafana.internal/d/playwright-dashboard"
+
+      # Warning: ratio of status="failed" lines to test_end events exceeds 0.20 over a 30m window
+      - alert: HighPlaywrightFailureRate
+        expr: |
+          (
+            sum(rate({job="playwright-tests", status="failed"}[30m]))
+            /
+            sum(rate({job="playwright-tests", event="test_end"}[30m]))
+          ) > 0.20
+        for: 10m
+        labels:
+          component: playwright
+          team: qa
+          severity: warning
+        annotations:
+          runbook_url: 'https://wiki.internal/runbooks/playwright-high-failure-rate'
+          summary: 'High Playwright test failure rate detected'
+          description: '{{ $value | humanizePercentage }} of tests are failing over the last 30 minutes'
+
+      # Warning: per-browser ratio of status="failed" lines to test_end events exceeds 0.30 over a 30m window
+      - alert: BrowserSpecificFailures
+        expr: |
+          (
+            sum by (browser) (rate({job="playwright-tests", status="failed"}[30m]))
+            /
+            sum by (browser) (rate({job="playwright-tests", event="test_end"}[30m]))
+          ) > 0.30
+        for: 10m
+        labels:
+          component: playwright
+          team: qa
+          severity: warning
+        annotations:
+          description: '{{ $labels.browser }} browser has {{ $value | humanizePercentage }} failure rate'
+          summary: 'High failure rate in {{ $labels.browser }}'
+
+      # Warning: P95 test duration above 5 minutes (ms divided by 1000 so humanizeDuration receives seconds)
+      - alert: SlowPlaywrightTests
+        expr: |
+          quantile_over_time(0.95,
+            {job="playwright-tests", event="test_end"} | json | unwrap duration_ms | __error__=""
+          [30m]) by (job) / 1000 > 300
+        for: 15m
+        labels:
+          severity: warning
+          team: qa
+          component: playwright
+        annotations:
+          summary: "Playwright tests are running slowly"
+          description: "P95 test duration is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
+          runbook_url: "https://wiki.internal/runbooks/playwright-slow-tests"
+
+      # Warning: a test_suite_end event in the last 5m reported more than 1 hour (ms divided by 1000 so $value is in seconds)
+      - alert: PlaywrightSuiteTimeout
+        expr: |
+          max_over_time({job="playwright-tests", event="test_suite_end"} | json | unwrap duration_ms | __error__="" [5m]) by (job) / 1000 > 3600
+        labels:
+          severity: warning
+          team: qa
+          component: playwright
+        annotations:
+          summary: "Playwright test suite exceeded 1 hour"
+          description: "Test suite took {{ $value | humanizeDuration }} to complete"
+
+      # Info: no test_start event in the last 2 hours (the expr has no time-of-day filter, so this is evaluated around the clock)
+      - alert: NoPlaywrightTestsRunning
+        expr: |
+          absent_over_time({job="playwright-tests", event="test_start"}[2h])
+        for: 5m
+        labels:
+          component: playwright
+          team: qa
+          severity: info
+        annotations:
+          runbook_url: 'https://wiki.internal/runbooks/playwright-no-tests'
+          summary: 'No Playwright tests have run recently'
+          description: 'No test executions detected in the last 2 hours. CI/CD pipeline may be broken.'
+
+      # Warning: a test failed on its first retry (retry="1") more than 3 times within the last hour
+      - alert: FlakyPlaywrightTest
+        expr: |
+          sum by (test_name) (
+            count_over_time({job="playwright-tests", status="failed", retry="1"} | json [1h])
+          ) > 3
+        for: 1h
+        labels:
+          severity: warning
+          team: qa
+          component: playwright
+        annotations:
+          summary: "Flaky test detected: {{ $labels.test_name }}"
+          description: "Test '{{ $labels.test_name }}' has failed {{ $value }} times on retry in the last hour"
+          runbook_url: "https://wiki.internal/runbooks/playwright-flaky-tests"
+
+      # Critical: logs arrive but no test_suite_start event was seen ("unless", because an empty selector yields no series, never 0)
+      - alert: PlaywrightInfrastructureFailure
+        expr: |
+          sum(count_over_time({job="playwright-tests"}[30m])) > 0
+          unless
+          sum(count_over_time({job="playwright-tests", event="test_suite_start"}[30m])) > 0
+        for: 5m
+        labels:
+          severity: critical
+          team: devops
+          component: playwright
+        annotations:
+          summary: "Playwright test infrastructure may be failing"
+          description: "Tests are attempting to run but test suite is not starting properly"
+          runbook_url: "https://wiki.internal/runbooks/playwright-infrastructure"
+
+      # Warning: share of test_end events carrying a non-zero retry label (retry!="0" alone also matches streams without the label)
+      - alert: HighPlaywrightRetryRate
+        expr: |
+          (
+            sum(rate({job="playwright-tests", event="test_end", retry!="", retry!="0"}[30m]))
+            /
+            sum(rate({job="playwright-tests", event="test_end"}[30m]))
+          ) > 0.15
+        for: 10m
+        labels:
+          severity: warning
+          team: qa
+          component: playwright
+        annotations:
+          summary: "High test retry rate detected"
+          description: "{{ $value | humanizePercentage }} of tests are being retried"
+
+      # Info: last-hour average duration is over 50% above the preceding 24h average ($value is the relative increase)
+      - alert: PlaywrightDurationIncreasing
+        expr: |
+          (
+            avg_over_time({job="playwright-tests", event="test_end"} | json | unwrap duration_ms | __error__="" [1h]) by (job)
+            /
+            avg_over_time({job="playwright-tests", event="test_end"} | json | unwrap duration_ms | __error__="" [24h] offset 1h) by (job)
+          ) - 1 > 0.5
+        for: 30m
+        labels:
+          severity: info
+          team: qa
+          component: playwright
+        annotations:
+          summary: "Playwright test duration is increasing"
+          description: "Average test duration has increased by {{ $value | humanizePercentage }} compared to previous day"