feat: Integrate Grafana Loki

2025-12-20 02:51:12 -05:00 · 2025-10-30 15:54:32 +00:00
parent 8ac61e01e3
commit 72a7cb7f7c
9 changed files with 1261 additions and 2 deletions
--- a/.github/workflows/playwright.yml
+++ b/.github/workflows/playwright.yml
@@ -6,8 +6,77 @@ on:
  pull_request:
    branches: [main, develop]
 env:
  GRAFANA_LOKI_URL: ${{ secrets.GRAFANA_LOKI_URL }}
  GRAFANA_LOKI_USERNAME: ${{ secrets.GRAFANA_LOKI_USERNAME }}
  GRAFANA_LOKI_PASSWORD: ${{ secrets.GRAFANA_LOKI_PASSWORD }}
 jobs:
  # Pre-flight validation to ensure environment is ready
  preflight:
    name: Validate Environment
    runs-on: ubuntu-latest
    steps:
      - name: Check Required Secrets
        # Secrets are handed to the script through env instead of being
        # expanded into the script text, so values containing quotes, `$` or
        # backticks cannot break (or inject into) the shell code.
        env:
          SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }}
          TEST_USER_EMAIL: ${{ secrets.TEST_USER_EMAIL }}
        run: |
          echo "🔍 Validating required secrets..."
          # Hard requirement: fail the preflight job when the key is missing.
          if [ -z "$SUPABASE_SERVICE_ROLE_KEY" ]; then
            echo "❌ SUPABASE_SERVICE_ROLE_KEY is not set"
            exit 1
          fi
          # Soft requirement: only warn.
          if [ -z "$TEST_USER_EMAIL" ]; then
            echo "⚠️  TEST_USER_EMAIL is not set"
          fi
          echo "✅ Required secrets validated"
      - name: Check Loki Connection
        # The `secrets` context is not available in step-level `if:`; test the
        # workflow-level env variable that mirrors the secret instead.
        if: ${{ env.GRAFANA_LOKI_URL != '' }}
        run: |
          echo "🔍 Testing Loki connection..."
          # Credentials come from the workflow-level env block. An array keeps
          # "user:password" a single curl argument whatever it contains.
          auth_args=()
          if [ -n "$GRAFANA_LOKI_USERNAME" ]; then
            auth_args=(-u "$GRAFANA_LOKI_USERNAME:$GRAFANA_LOKI_PASSWORD")
          fi
          # `|| true`: a transport error (DNS, refused, timeout) must not abort
          # this advisory check under `bash -e`; curl still prints code 000.
          response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 15 \
            "${auth_args[@]}" \
            "$GRAFANA_LOKI_URL/ready") || true
          if [ "$response" = "200" ]; then
            echo "✅ Loki is ready at $GRAFANA_LOKI_URL"
          else
            echo "⚠️  Loki connection check returned HTTP $response"
            echo "Tests will continue but logs may not be sent to Loki"
          fi
      - name: Send Pre-flight Event to Loki
        # The `secrets` context is not available in step-level `if:`; test the
        # workflow-level env variable that mirrors the secret instead.
        if: ${{ env.GRAFANA_LOKI_URL != '' }}
        run: |
          # Loki's push API takes the timestamp as a nanosecond string.
          timestamp="$(date +%s)000000000"
          # An array keeps "user:password" a single curl argument even when the
          # password contains spaces or shell metacharacters.
          auth_args=()
          if [ -n "$GRAFANA_LOKI_USERNAME" ]; then
            auth_args=(-u "$GRAFANA_LOKI_USERNAME:$GRAFANA_LOKI_PASSWORD")
          fi
          # Build the payload with jq so workflow and branch names are
          # JSON-escaped rather than pasted into a hand-written JSON string.
          payload=$(jq -n \
            --arg ts "$timestamp" \
            --arg workflow "$GITHUB_WORKFLOW" \
            --arg branch "$GITHUB_REF_NAME" \
            --arg commit "$GITHUB_SHA" \
            --arg run_id "$GITHUB_RUN_ID" \
            '{streams: [{
               stream: {job: "playwright-preflight", workflow: $workflow, branch: $branch, commit: $commit, run_id: $run_id, event: "preflight_complete"},
               values: [[$ts, "Pre-flight checks completed successfully"]]
             }]}') || { echo "⚠️  Failed to send pre-flight event to Loki"; exit 0; }
          # --fail turns HTTP 4xx/5xx into a non-zero exit so the fallback
          # message is printed; the push stays best-effort either way.
          curl -sS --fail --max-time 15 -X POST "$GRAFANA_LOKI_URL/loki/api/v1/push" \
            "${auth_args[@]}" \
            -H "Content-Type: application/json" \
            -d "$payload" || echo "⚠️  Failed to send pre-flight event to Loki"
  test:
    needs: preflight
    timeout-minutes: 60
    runs-on: ubuntu-latest
@@ -30,7 +99,35 @@ jobs:
      - name: Install Playwright Browsers
        run: npx playwright install --with-deps ${{ matrix.browser }}
      - name: Send Test Start Event to Loki
        # The `secrets` context is not available in step-level `if:`; test the
        # workflow-level env variable that mirrors the secret instead.
        if: ${{ env.GRAFANA_LOKI_URL != '' }}
        env:
          BROWSER: ${{ matrix.browser }}
        run: |
          # Loki's push API takes the timestamp as a nanosecond string.
          timestamp="$(date +%s)000000000"
          # An array keeps "user:password" a single curl argument even when the
          # password contains spaces or shell metacharacters.
          auth_args=()
          if [ -n "$GRAFANA_LOKI_USERNAME" ]; then
            auth_args=(-u "$GRAFANA_LOKI_USERNAME:$GRAFANA_LOKI_PASSWORD")
          fi
          # Build the payload with jq so every value is JSON-escaped.
          payload=$(jq -n \
            --arg ts "$timestamp" \
            --arg browser "$BROWSER" \
            --arg workflow "$GITHUB_WORKFLOW" \
            --arg branch "$GITHUB_REF_NAME" \
            --arg commit "$GITHUB_SHA" \
            --arg run_id "$GITHUB_RUN_ID" \
            '{streams: [{
               stream: {job: "playwright-tests", browser: $browser, workflow: $workflow, branch: $branch, commit: $commit, run_id: $run_id, event: "test_start"},
               values: [[$ts, ("Starting Playwright tests for " + $browser)]]
             }]}') || { echo "⚠️  Failed to send start event to Loki"; exit 0; }
          # --fail turns HTTP 4xx/5xx into a non-zero exit so the fallback
          # message is printed; the push stays best-effort either way.
          curl -sS --fail --max-time 15 -X POST "$GRAFANA_LOKI_URL/loki/api/v1/push" \
            "${auth_args[@]}" \
            -H "Content-Type: application/json" \
            -d "$payload" || echo "⚠️  Failed to send start event to Loki"
      - name: Run Playwright tests
        id: playwright-run
        env:
          SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }}
          TEST_USER_EMAIL: ${{ secrets.TEST_USER_EMAIL }}
@@ -38,7 +135,69 @@ jobs:
          TEST_MODERATOR_EMAIL: ${{ secrets.TEST_MODERATOR_EMAIL }}
          TEST_MODERATOR_PASSWORD: ${{ secrets.TEST_MODERATOR_PASSWORD }}
          BASE_URL: ${{ secrets.BASE_URL || 'http://localhost:8080' }}
-        run: npx playwright test --project=${{ matrix.browser }}
+          # Enable Loki reporter
          GRAFANA_LOKI_URL: ${{ secrets.GRAFANA_LOKI_URL }}
          GRAFANA_LOKI_USERNAME: ${{ secrets.GRAFANA_LOKI_USERNAME }}
          GRAFANA_LOKI_PASSWORD: ${{ secrets.GRAFANA_LOKI_PASSWORD }}
        # Runs the suite for the matrix browser, mirrors the combined
        # stdout/stderr into test-execution.log, and exports the test runner's
        # own exit status (PIPESTATUS[0], not tee's) as the step output
        # `test_exit_code` before exiting with that status.
        run: |
          echo "🧪 Running Playwright tests for ${{ matrix.browser }}..."
          npx playwright test --project=${{ matrix.browser }} 2>&1 | tee test-execution.log
          TEST_EXIT_CODE=${PIPESTATUS[0]}
          echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT
          exit $TEST_EXIT_CODE
        # Lets the following steps run even when tests fail.
        # NOTE(review): with continue-on-error this step no longer fails the
        # job on its own; confirm a later step re-fails the job based on
        # `test_exit_code`.
        continue-on-error: true
      - name: Parse Test Results
        if: always()
        id: parse-results
        # Summarises test-results.json into the step outputs total, passed,
        # failed, skipped and duration. When the report file is missing no
        # outputs are written.
        run: |
          if [ -f "test-results.json" ]; then
            echo "📊 Parsing test results..."
            # One jq pass over the report:
            #  - all_specs walks nested suites, so specs inside describe()
            #    blocks are counted as well as top-level ones;
            #  - final_status looks at the last attempt only, so a retried
            #    test is counted once instead of once per attempt;
            #  - `add // 0` keeps the duration numeric for an empty report.
            summary=$(jq -r '
              def all_specs: (.specs // [])[], ((.suites // [])[] | all_specs);
              def final_status: (.results // []) | last | .status?;
              [(.suites // [])[] | all_specs] as $specs
              | [$specs[] | (.tests // [])[]] as $tests
              | [
                  ($specs | length),
                  ([$tests[] | select(final_status == "passed")] | length),
                  ([$tests[] | select(final_status == "failed")] | length),
                  ([$tests[] | select(final_status == "skipped")] | length),
                  ([$tests[] | (.results // [])[] | .duration // 0] | add // 0)
                ]
              | map(tostring) | join(" ")
            ' test-results.json) || summary=""
            # An unreadable report leaves the summary empty; every field then
            # falls back to 0 below.
            read -r TOTAL PASSED FAILED SKIPPED DURATION <<< "$summary" || true
            TOTAL=${TOTAL:-0}
            PASSED=${PASSED:-0}
            FAILED=${FAILED:-0}
            SKIPPED=${SKIPPED:-0}
            DURATION=${DURATION:-0}
            {
              echo "total=$TOTAL"
              echo "passed=$PASSED"
              echo "failed=$FAILED"
              echo "skipped=$SKIPPED"
              echo "duration=$DURATION"
            } >> "$GITHUB_OUTPUT"
            echo "✅ Results: $PASSED passed, $FAILED failed, $SKIPPED skipped (${DURATION}ms total)"
          else
            echo "⚠️  test-results.json not found"
          fi
      - name: Send Test Results to Loki
        # The `secrets` context is not available in step-level `if:`; test the
        # workflow-level env variable that mirrors the secret instead.
        if: ${{ always() && env.GRAFANA_LOKI_URL != '' }}
        env:
          BROWSER: ${{ matrix.browser }}
          # 'success' only when the test step exported exit code 0.
          STATUS: ${{ steps.playwright-run.outputs.test_exit_code == '0' && 'success' || 'failure' }}
          # Counters default to 0 when the parse step produced no outputs.
          TOTAL: ${{ steps.parse-results.outputs.total || 0 }}
          PASSED: ${{ steps.parse-results.outputs.passed || 0 }}
          FAILED: ${{ steps.parse-results.outputs.failed || 0 }}
          SKIPPED: ${{ steps.parse-results.outputs.skipped || 0 }}
          DURATION: ${{ steps.parse-results.outputs.duration || 0 }}
        run: |
          # Loki's push API takes the timestamp as a nanosecond string.
          timestamp="$(date +%s)000000000"
          # An array keeps "user:password" a single curl argument even when the
          # password contains spaces or shell metacharacters.
          auth_args=()
          if [ -n "$GRAFANA_LOKI_USERNAME" ]; then
            auth_args=(-u "$GRAFANA_LOKI_USERNAME:$GRAFANA_LOKI_PASSWORD")
          fi
          # Labels are JSON-escaped by jq; the log line is itself a JSON object
          # (total/passed/failed/skipped/duration_ms) serialised with tojson.
          payload=$(jq -n \
            --arg ts "$timestamp" \
            --arg browser "$BROWSER" \
            --arg workflow "$GITHUB_WORKFLOW" \
            --arg branch "$GITHUB_REF_NAME" \
            --arg commit "$GITHUB_SHA" \
            --arg run_id "$GITHUB_RUN_ID" \
            --arg status "$STATUS" \
            --argjson total "$TOTAL" \
            --argjson passed "$PASSED" \
            --argjson failed "$FAILED" \
            --argjson skipped "$SKIPPED" \
            --argjson duration "$DURATION" \
            '{streams: [{
               stream: {job: "playwright-tests", browser: $browser, workflow: $workflow, branch: $branch, commit: $commit, run_id: $run_id, status: $status, event: "test_complete"},
               values: [[$ts, ({total: $total, passed: $passed, failed: $failed, skipped: $skipped, duration_ms: $duration} | tojson)]]
             }]}') || { echo "⚠️  Failed to send results to Loki"; exit 0; }
          # --fail turns HTTP 4xx/5xx into a non-zero exit so the fallback
          # message is printed; the push stays best-effort either way.
          curl -sS --fail --max-time 15 -X POST "$GRAFANA_LOKI_URL/loki/api/v1/push" \
            "${auth_args[@]}" \
            -H "Content-Type: application/json" \
            -d "$payload" || echo "⚠️  Failed to send results to Loki"
      - name: Upload test results
        uses: actions/upload-artifact@v4
--- a/docker-compose.loki.yml
+++ b/docker-compose.loki.yml
@@ -0,0 +1,63 @@
 version: "3.8"
 # Local Grafana Loki + Grafana stack for testing Playwright integration
 # Usage: docker-compose -f docker-compose.loki.yml up -d
 services:
  # Single-node Loki for local testing, configured by ./loki-config.yml.
  loki:
    image: grafana/loki:2.9.0
    container_name: thrillwiki-loki
    ports:
      # loki-config.yml sets auth_enabled: false, so publish the API on the
      # loopback interface only instead of every host interface.
      - "127.0.0.1:3100:3100"
    volumes:
      # Config is mounted read-only; data persists in the named volume.
      - ./loki-config.yml:/etc/loki/local-config.yaml:ro
      - loki-data:/loki
    command: -config.file=/etc/loki/local-config.yaml
    networks:
      - loki-network
    restart: unless-stopped
  # Grafana UI for the local stack, with the Loki data source provisioned
  # from ./grafana-datasources.yml.
  grafana:
    image: grafana/grafana:10.1.0
    container_name: thrillwiki-grafana
    ports:
      # Anonymous visitors are given the Admin role below, so publish the UI
      # on the loopback interface only instead of every host interface.
      - "127.0.0.1:3000:3000"
    environment:
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_ROOT_URL=http://localhost:3000
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
      # NOTE(review): Grafana's dashboard provisioning is driven by provider
      # YAML files in this directory; confirm one exists (and that the JSON is
      # in provisioning format), otherwise this file may not be loaded.
      - ./monitoring/grafana-dashboard.json:/etc/grafana/provisioning/dashboards/playwright-dashboard.json
    networks:
      - loki-network
    depends_on:
      - loki
    restart: unless-stopped
  # Optional: Promtail for collecting logs from files
  # promtail:
  #   image: grafana/promtail:2.9.0
  #   container_name: thrillwiki-promtail
  #   volumes:
  #     - ./promtail-config.yml:/etc/promtail/config.yml
  #     - ./test-results:/var/log/playwright:ro
  #   command: -config.file=/etc/promtail/config.yml
  #   networks:
  #     - loki-network
  #   depends_on:
  #     - loki
  #   restart: unless-stopped
 # Named volumes referenced by the loki (/loki) and grafana
 # (/var/lib/grafana) services above.
 volumes:
  loki-data:
    driver: local
  grafana-data:
    driver: local
 # Bridge network shared by both services; Grafana reaches Loki through it
 # by service name.
 networks:
  loki-network:
    driver: bridge
--- a/grafana-datasources.yml
+++ b/grafana-datasources.yml
@@ -0,0 +1,45 @@
 # Grafana Data Source Provisioning
 # Auto-configures Loki as a data source in Grafana
 apiVersion: 1
 datasources:
  # Default data source: Loki reached through the Grafana backend (proxy)
  # at the compose service address.
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    isDefault: true
    editable: true
    jsonData:
      maxLines: 1000
      derivedFields:
        # Extract trace ID from logs for distributed tracing
        # NOTE(review): this links to a data source with uid "tempo", but the
        # Tempo entry in this file is commented out; confirm the target exists.
        - datasourceUid: tempo
          matcherRegex: "traceId=(\\w+)"
          name: TraceID
          # "$$" presumably escapes "$" for provisioning-time variable
          # expansion, leaving ${__value.raw} for Grafana; confirm.
          url: "$${__value.raw}"
        # Extract request ID for correlation
        # NOTE(review): without a datasourceUid the url is used as an external
        # link, and here it is only the raw captured value; confirm the
        # intended URL.
        - matcherRegex: "requestId=(\\w+)"
          name: RequestID
          url: "$${__value.raw}"
    version: 1
  # Optional: Add Prometheus if you have metrics
  # - name: Prometheus
  #   type: prometheus
  #   access: proxy
  #   url: http://prometheus:9090
  #   isDefault: false
  #   editable: true
  #   jsonData:
  #     timeInterval: 15s
  #   version: 1
  # Optional: Add Tempo for distributed tracing
  # - name: Tempo
  #   type: tempo
  #   access: proxy
  #   url: http://tempo:3200
  #   isDefault: false
  #   editable: true
  #   version: 1
--- a/loki-config.yml
+++ b/loki-config.yml
@@ -0,0 +1,112 @@
 # Grafana Loki Configuration for Local Testing
 # This is a basic configuration suitable for development and testing
 auth_enabled: false
 server:
  http_listen_port: 3100
  grpc_listen_port: 9096
  log_level: info
 # Shared defaults: everything lives under /loki on the local filesystem,
 # with a single replica and an in-memory ring bound to loopback.
 common:
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: inmemory
 # Configure the ingester for receiving logs
 # Single instance: in-memory ring, replication factor 1, no final sleep
 # configured (0s).
 ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  # Chunk timing: idle period 5m, maximum age 1h, retained 30s; chunks are
  # snappy-encoded.
  chunk_idle_period: 5m
  chunk_retain_period: 30s
  max_chunk_age: 1h
  chunk_encoding: snappy
 # Schema configuration (defines how data is stored)
 # One schema period starting 2020-10-24: boltdb-shipper index, filesystem
 # object store, schema v11, index tables prefixed "index_" with a 24h period.
 schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h
 # Storage configuration
 # boltdb-shipper keeps its active index and cache under /loki and uses the
 # filesystem as its shared store; chunks are written to /loki/chunks.
 storage_config:
  boltdb_shipper:
    active_index_directory: /loki/boltdb-shipper-active
    cache_location: /loki/boltdb-shipper-cache
    cache_ttl: 24h
    shared_store: filesystem
  filesystem:
    directory: /loki/chunks
 # Limits configuration
 # Per-tenant ingestion, query and retention limits for the local instance.
 limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h  # 1 week
  # Retention period applied by the compactor (compactor.retention_enabled is
  # true in this file); without it the compactor has no period to enforce.
  # Matches the 1-week retention stated under table_manager.
  retention_period: 168h  # 1 week
  ingestion_rate_mb: 10
  ingestion_burst_size_mb: 20
  max_streams_per_user: 10000
  max_query_length: 721h  # 30 days + 1 hour
  max_query_parallelism: 32
  max_entries_limit_per_query: 5000
  max_cache_freshness_per_query: 10m
 # Chunk store configuration
 chunk_store_config:
  max_look_back_period: 0s
 # Table manager configuration
 # NOTE(review): the compactor section of this file also sets
 # retention_enabled: true; confirm which component is meant to enforce
 # retention with the boltdb-shipper + filesystem store used here.
 table_manager:
  retention_deletes_enabled: true
  retention_period: 168h  # 1 week retention for local testing
 # Query range configuration
 # Aligns queries with their step, allows up to 5 retries, enables
 # parallelisation of shardable queries and result caching.
 query_range:
  align_queries_with_step: true
  max_retries: 5
  parallelise_shardable_queries: true
  cache_results: true
 # Compactor configuration
 # Works in /loki/compactor against the filesystem store at a 10m compaction
 # interval; retention is enabled with a 2h delete delay and 150 delete
 # workers.
 compactor:
  working_directory: /loki/compactor
  shared_store: filesystem
  compaction_interval: 10m
  retention_enabled: true
  retention_delete_delay: 2h
  retention_delete_worker_count: 150
 # Ruler configuration (for alerting)
 # Rules are read from the local directory /loki/rules, with /loki/rules-temp
 # as the rule path; the ruler API is enabled.
 # NOTE(review): docker-compose.loki.yml mounts only loki-config.yml and the
 # loki-data volume, so monitoring/loki-alerts.yml is not placed in
 # /loki/rules by that file; confirm how rules are meant to be loaded.
 ruler:
  storage:
    type: local
    local:
      directory: /loki/rules
  rule_path: /loki/rules-temp
  # NOTE(review): no Alertmanager service is defined in
  # docker-compose.loki.yml; confirm something listens on localhost:9093.
  alertmanager_url: http://localhost:9093
  ring:
    kvstore:
      store: inmemory
  enable_api: true
  enable_alertmanager_v2: true
 # Analytics configuration
 analytics:
  reporting_enabled: false
--- a/monitoring/grafana-dashboard.json
+++ b/monitoring/grafana-dashboard.json
@@ -0,0 +1,266 @@
 {
  "dashboard": {
    "title": "Playwright Test Execution Dashboard",
    "tags": ["playwright", "testing", "e2e"],
    "timezone": "browser",
    "refresh": "30s",
    "time": {
      "from": "now-24h",
      "to": "now"
    },
    "panels": [
      {
        "id": 1,
        "title": "Test Execution Overview",
        "type": "stat",
        "gridPos": { "x": 0, "y": 0, "w": 6, "h": 4 },
        "targets": [
          {
            "expr": "sum(count_over_time({job=\"playwright-tests\", event=\"test_end\"}[$__range]))",
            "legendFormat": "Total Tests"
          }
        ],
        "options": {
          "colorMode": "value",
          "graphMode": "area",
          "textMode": "auto"
        }
      },
      {
        "id": 2,
        "title": "Pass Rate %",
        "type": "stat",
        "gridPos": { "x": 6, "y": 0, "w": 6, "h": 4 },
        "targets": [
          {
            "expr": "(sum(count_over_time({job=\"playwright-tests\", status=\"passed\"}[$__range])) / sum(count_over_time({job=\"playwright-tests\", event=\"test_end\"}[$__range]))) * 100",
            "legendFormat": "Pass Rate"
          }
        ],
        "options": {
          "colorMode": "value",
          "graphMode": "area",
          "textMode": "auto",
          "unit": "percent"
        },
        "fieldConfig": {
          "defaults": {
            "thresholds": {
              "mode": "absolute",
              "steps": [
                { "value": 0, "color": "red" },
                { "value": 80, "color": "yellow" },
                { "value": 95, "color": "green" }
              ]
            }
          }
        }
      },
      {
        "id": 3,
        "title": "Failure Rate %",
        "type": "stat",
        "gridPos": { "x": 12, "y": 0, "w": 6, "h": 4 },
        "targets": [
          {
            "expr": "(sum(count_over_time({job=\"playwright-tests\", status=\"failed\"}[$__range])) / sum(count_over_time({job=\"playwright-tests\", event=\"test_end\"}[$__range]))) * 100",
            "legendFormat": "Failure Rate"
          }
        ],
        "options": {
          "colorMode": "value",
          "graphMode": "area",
          "textMode": "auto",
          "unit": "percent"
        },
        "fieldConfig": {
          "defaults": {
            "thresholds": {
              "mode": "absolute",
              "steps": [
                { "value": 0, "color": "green" },
                { "value": 5, "color": "yellow" },
                { "value": 20, "color": "red" }
              ]
            }
          }
        }
      },
      {
        "id": 4,
        "title": "Avg Test Duration",
        "type": "stat",
        "gridPos": { "x": 18, "y": 0, "w": 6, "h": 4 },
        "targets": [
          {
            "expr": "avg_over_time({job=\"playwright-tests\", event=\"test_end\"} | json | unwrap duration_ms [$__range])",
            "legendFormat": "Avg Duration"
          }
        ],
        "options": {
          "colorMode": "value",
          "graphMode": "area",
          "textMode": "auto",
          "unit": "ms"
        }
      },
      {
        "id": 5,
        "title": "Test Status Over Time",
        "type": "timeseries",
        "gridPos": { "x": 0, "y": 4, "w": 12, "h": 8 },
        "targets": [
          {
            "expr": "sum by (status) (count_over_time({job=\"playwright-tests\", event=\"test_end\"} | json [$__interval]))",
            "legendFormat": "{{status}}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "custom": {
              "lineInterpolation": "smooth",
              "fillOpacity": 20
            }
          },
          "overrides": [
            {
              "matcher": { "id": "byName", "options": "passed" },
              "properties": [{ "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } }]
            },
            {
              "matcher": { "id": "byName", "options": "failed" },
              "properties": [{ "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } }]
            },
            {
              "matcher": { "id": "byName", "options": "skipped" },
              "properties": [{ "id": "color", "value": { "mode": "fixed", "fixedColor": "yellow" } }]
            }
          ]
        }
      },
      {
        "id": 6,
        "title": "Browser Comparison",
        "type": "bargauge",
        "gridPos": { "x": 12, "y": 4, "w": 12, "h": 8 },
        "targets": [
          {
            "expr": "sum by (browser) (count_over_time({job=\"playwright-tests\", status=\"passed\"} [$__range]))",
            "legendFormat": "{{browser}}"
          }
        ],
        "options": {
          "orientation": "horizontal",
          "displayMode": "gradient"
        }
      },
      {
        "id": 7,
        "title": "Test Duration Distribution",
        "type": "histogram",
        "gridPos": { "x": 0, "y": 12, "w": 12, "h": 8 },
        "targets": [
          {
            "expr": "avg_over_time({job=\"playwright-tests\", event=\"test_end\"} | json | unwrap duration_ms [$__interval]) by (test_name)",
            "legendFormat": "Duration"
          }
        ],
        "options": {
          "bucketOffset": 0,
          "bucketSize": 1000
        }
      },
      {
        "id": 8,
        "title": "Top 10 Failing Tests",
        "type": "bargauge",
        "gridPos": { "x": 12, "y": 12, "w": 12, "h": 8 },
        "targets": [
          {
            "expr": "topk(10, sum by (test_name) (count_over_time({job=\"playwright-tests\", status=\"failed\"} | json [$__range])))",
            "legendFormat": "{{test_name}}"
          }
        ],
        "options": {
          "orientation": "horizontal",
          "displayMode": "gradient",
          "showUnfilled": true
        }
      },
      {
        "id": 9,
        "title": "Recent Test Runs",
        "type": "table",
        "gridPos": { "x": 0, "y": 20, "w": 24, "h": 8 },
        "targets": [
          {
            "expr": "{job=\"playwright-tests\", event=\"test_end\"} | json",
            "legendFormat": ""
          }
        ],
        "options": {
          "showHeader": true,
          "sortBy": [{ "displayName": "Time", "desc": true }]
        },
        "transformations": [
          {
            "id": "organize",
            "options": {
              "excludeByName": {},
              "indexByName": {
                "Time": 0,
                "test_name": 1,
                "test_file": 2,
                "browser": 3,
                "status": 4,
                "duration_ms": 5,
                "branch": 6,
                "commit": 7
              },
              "renameByName": {
                "test_name": "Test Name",
                "test_file": "File",
                "browser": "Browser",
                "status": "Status",
                "duration_ms": "Duration (ms)",
                "branch": "Branch",
                "commit": "Commit"
              }
            }
          },
          {
            "id": "limit",
            "options": {
              "limitField": 20
            }
          }
        ]
      },
      {
        "id": 10,
        "title": "Slowest Tests (P95)",
        "type": "table",
        "gridPos": { "x": 0, "y": 28, "w": 12, "h": 6 },
        "targets": [
          {
            "expr": "topk(10, quantile_over_time(0.95, {job=\"playwright-tests\", event=\"test_end\"} | json | unwrap duration_ms [$__range]) by (test_name))",
            "legendFormat": "{{test_name}}"
          }
        ]
      },
      {
        "id": 11,
        "title": "Flaky Tests Detection",
        "type": "table",
        "gridPos": { "x": 12, "y": 28, "w": 12, "h": 6 },
        "targets": [
          {
            "expr": "(sum by (test_name) (count_over_time({job=\"playwright-tests\", status=\"failed\"} | json [$__range])) > 0) and (sum by (test_name) (count_over_time({job=\"playwright-tests\", status=\"passed\"} | json [$__range])) > 0)",
            "legendFormat": "{{test_name}}"
          }
        ],
        "description": "Tests that have both passed and failed runs (potential flaky tests)"
      }
    ]
  }
 }
--- a/monitoring/loki-alerts.yml
+++ b/monitoring/loki-alerts.yml
@@ -0,0 +1,166 @@
 # Grafana Loki Alert Rules for Playwright Tests
 # Load these rules into the Loki ruler (or Grafana Cloud alerting); firing alerts are then routed through Alertmanager
 groups:
  - name: playwright_test_alerts
    interval: 1m
    rules:
      # Critical: All tests are failing
      # Fires when test_end entries exist in the window but no passed entries
      # do. `unless` is used because a selector with no matching entries
      # yields an empty result rather than 0, so `sum(...) == 0` never matches.
      - alert: AllPlaywrightTestsFailing
        expr: |
          (sum(rate({job="playwright-tests", event="test_end"}[15m])) > 0)
          unless
          (sum(rate({job="playwright-tests", status="passed"}[15m])) > 0)
        for: 5m
        labels:
          severity: critical
          team: qa
          component: playwright
        annotations:
          summary: "All Playwright tests are failing"
          # $value is the left-hand side: completed tests per second.
          description: "No passing tests detected in the last 15 minutes. Completed-test rate: {{ $value }} per second"
          runbook_url: "https://wiki.internal/runbooks/playwright-all-tests-failing"
          dashboard_url: "https://grafana.internal/d/playwright-dashboard"
      # Warning: High failure rate
      # Ratio of the status="failed" entry rate to the event="test_end" entry
      # rate over 30 minutes; fires when it stays above 0.20 for 10 minutes.
      # NOTE(review): assumes `status` and `event` are stream labels; confirm
      # against what the reporter pushes.
      - alert: HighPlaywrightFailureRate
        expr: |
          (
            sum(rate({job="playwright-tests", status="failed"}[30m]))
            /
            sum(rate({job="playwright-tests", event="test_end"}[30m]))
          ) > 0.20
        for: 10m
        labels:
          severity: warning
          team: qa
          component: playwright
        annotations:
          summary: "High Playwright test failure rate detected"
          description: "{{ $value | humanizePercentage }} of tests are failing over the last 30 minutes"
          runbook_url: "https://wiki.internal/runbooks/playwright-high-failure-rate"
      # Warning: Specific browser has high failure rate
      # Same failed / test_end ratio as the overall failure-rate rule, but
      # computed per `browser` label with a 0.30 threshold held for 10 minutes.
      - alert: BrowserSpecificFailures
        expr: |
          (
            sum by (browser) (rate({job="playwright-tests", status="failed"}[30m]))
            /
            sum by (browser) (rate({job="playwright-tests", event="test_end"}[30m]))
          ) > 0.30
        for: 10m
        labels:
          severity: warning
          team: qa
          component: playwright
        annotations:
          summary: "High failure rate in {{ $labels.browser }}"
          description: "{{ $labels.browser }} browser has {{ $value | humanizePercentage }} failure rate"
      # Warning: Slow test execution
      # P95 of duration_ms over 30 minutes, converted to seconds so the
      # threshold (300 s = 5 minutes) and humanizeDuration, which formats
      # seconds, use the same unit.
      - alert: SlowPlaywrightTests
        expr: |
          (
            quantile_over_time(0.95,
              {job="playwright-tests", event="test_end"} | json | unwrap duration_ms
            [30m])
            / 1000
          ) > 300
        for: 15m
        labels:
          severity: warning
          team: qa
          component: playwright
        annotations:
          summary: "Playwright tests are running slowly"
          description: "P95 test duration is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
          runbook_url: "https://wiki.internal/runbooks/playwright-slow-tests"
      # Warning: Test suite timeout
      # An unwrapped value must go through a range aggregation before it can
      # be compared; take the largest suite duration seen in the last 5
      # minutes, in seconds (3600 s = 1 hour) to match humanizeDuration.
      - alert: PlaywrightSuiteTimeout
        expr: |
          (
            max_over_time(
              {job="playwright-tests", event="test_suite_end"} | json | unwrap duration_ms
            [5m])
            / 1000
          ) > 3600
        labels:
          severity: warning
          team: qa
          component: playwright
        annotations:
          summary: "Playwright test suite exceeded 1 hour"
          description: "Test suite took {{ $value | humanizeDuration }} to complete"
      # Info: No tests running
      # Fires when no test_start entry has been seen for 2 hours (held for 5
      # minutes). The expression has no time-of-day restriction, so it also
      # evaluates outside business hours.
      - alert: NoPlaywrightTestsRunning
        expr: |
          absent_over_time({job="playwright-tests", event="test_start"}[2h])
        for: 5m
        labels:
          severity: info
          team: qa
          component: playwright
        annotations:
          summary: "No Playwright tests have run recently"
          description: "No test executions detected in the last 2 hours. CI/CD pipeline may be broken."
          runbook_url: "https://wiki.internal/runbooks/playwright-no-tests"
      # Warning: Flaky test detected
      # Counts failed first-retry entries per test over the last hour. A log
      # selector has to be turned into a metric with count_over_time before a
      # vector aggregation such as `sum by` can be applied to it.
      - alert: FlakyPlaywrightTest
        expr: |
          sum by (test_name) (
            count_over_time({job="playwright-tests", status="failed", retry="1"} | json [1h])
          ) > 3
        for: 1h
        labels:
          severity: warning
          team: qa
          component: playwright
        annotations:
          summary: "Flaky test detected: {{ $labels.test_name }}"
          description: "Test '{{ $labels.test_name }}' has failed {{ $value }} times on retry in the last hour"
          runbook_url: "https://wiki.internal/runbooks/playwright-flaky-tests"
      # Critical: Test infrastructure failure
      # Fires when playwright-tests entries exist in the window but none of
      # them is a test_suite_start event. `unless` is used because a selector
      # with no matching entries yields an empty result rather than 0, so
      # `count_over_time(...) == 0` never matches.
      - alert: PlaywrightInfrastructureFailure
        expr: |
          (sum(count_over_time({job="playwright-tests"}[30m])) > 0)
          unless
          (sum(count_over_time({job="playwright-tests", event="test_suite_start"}[30m])) > 0)
        for: 5m
        labels:
          severity: critical
          team: devops
          component: playwright
        annotations:
          summary: "Playwright test infrastructure may be failing"
          description: "Tests are attempting to run but test suite is not starting properly"
          runbook_url: "https://wiki.internal/runbooks/playwright-infrastructure"
      # Warning: High retry rate
      # retry=~".+" requires the label to be present: a bare retry!="0" also
      # matches every stream that has no retry label at all, which would count
      # unrelated entries as retries.
      - alert: HighPlaywrightRetryRate
        expr: |
          (
            sum(rate({job="playwright-tests", retry=~".+", retry!="0"}[30m]))
            /
            sum(rate({job="playwright-tests", event="test_end"}[30m]))
          ) > 0.15
        for: 10m
        labels:
          severity: warning
          team: qa
          component: playwright
        annotations:
          summary: "High test retry rate detected"
          description: "{{ $value | humanizePercentage }} of tests are being retried"
      # Info: Test duration increasing
      # Compares the last hour's average duration with the preceding 24 hours.
      # Both sides are grouped by `job`: without grouping each series keeps
      # its per-run labels, so the two windows never share a label set and the
      # division returns nothing. Subtracting 1 makes $value the relative
      # increase (0.5 = +50%), which is what the description reports.
      - alert: PlaywrightDurationIncreasing
        expr: |
          (
            (
              avg_over_time({job="playwright-tests", event="test_end"} | json | unwrap duration_ms [1h]) by (job)
              /
              avg_over_time({job="playwright-tests", event="test_end"} | json | unwrap duration_ms [24h] offset 1h) by (job)
            ) - 1
          ) > 0.5
        for: 30m
        labels:
          severity: info
          team: qa
          component: playwright
        annotations:
          summary: "Playwright test duration is increasing"
          description: "Average test duration has increased by {{ $value | humanizePercentage }} compared to previous day"
--- a/playwright.config.ts
+++ b/playwright.config.ts
@@ -24,7 +24,13 @@ export default defineConfig({
  reporter: [
    ['html'],
    ['list'],
-    ['json', { outputFile: 'test-results.json' }]
+    ['json', { outputFile: 'test-results.json' }],
    // Grafana Loki reporter for centralized logging
    ['./tests/helpers/loki-reporter.ts', {
      lokiUrl: process.env.GRAFANA_LOKI_URL,
      username: process.env.GRAFANA_LOKI_USERNAME,
      password: process.env.GRAFANA_LOKI_PASSWORD,
    }]
  ],
  /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */
--- a/scripts/test-loki-integration.sh
+++ b/scripts/test-loki-integration.sh
@@ -0,0 +1,175 @@
 #!/bin/bash
 set -e
 # Colors for output
 RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 BLUE='\033[0;34m'
 NC='\033[0m' # No Color
 echo -e "${BLUE}🚀 Playwright + Grafana Loki Integration Test${NC}"
 echo "=============================================="
 # Bail out early when the Docker daemon cannot be reached
 docker info > /dev/null 2>&1 || {
  echo -e "${RED}❌ Docker is not running. Please start Docker first.${NC}"
  exit 1
 }
 echo -e "\n${BLUE}📦 Starting local Loki stack...${NC}"
 if [ -f "docker-compose.loki.yml" ]; then
  docker-compose -f docker-compose.loki.yml up -d
 else
  echo -e "${YELLOW}⚠️  docker-compose.loki.yml not found. Creating basic Loki setup...${NC}"
  # Create temporary Loki config
  cat > /tmp/loki-config.yml << 'EOF'
 auth_enabled: false
 server:
  http_listen_port: 3100
 ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
  chunk_idle_period: 3m
  chunk_retain_period: 1m
 schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 168h
 storage_config:
  boltdb:
    directory: /tmp/loki/index
  filesystem:
    directory: /tmp/loki/chunks
 limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h
 chunk_store_config:
  max_look_back_period: 0s
 table_manager:
  retention_deletes_enabled: false
  retention_period: 0s
 EOF
  # Remove containers left over from a previous run: `docker run --name`
  # fails on a name clash, and with `set -e` that would abort the script.
  docker rm -f loki-test grafana-test > /dev/null 2>&1 || true
  # Start Loki container
  docker run -d \
    --name loki-test \
    -p 3100:3100 \
    -v /tmp/loki-config.yml:/etc/loki/local-config.yaml \
    grafana/loki:2.9.0 \
    -config.file=/etc/loki/local-config.yaml
  # Start Grafana container
  docker run -d \
    --name grafana-test \
    -p 3000:3000 \
    -e "GF_AUTH_ANONYMOUS_ENABLED=true" \
    -e "GF_AUTH_ANONYMOUS_ORG_ROLE=Admin" \
    grafana/grafana:10.1.0
 fi
 # Wait for Loki to be ready
 # Polls /ready every 2 seconds, up to max_attempts times. `curl -f` drops
 # the body of non-2xx responses, so a "not ready" message returned while
 # Loki is still starting can no longer satisfy the grep. Success is tracked
 # in its own flag so becoming ready on the final attempt is not reported as
 # a timeout.
 echo -e "\n${YELLOW}⏳ Waiting for Loki to start...${NC}"
 max_attempts=30
 loki_ready=false
 for ((attempt = 1; attempt <= max_attempts; attempt++)); do
  if curl -sf --max-time 5 http://localhost:3100/ready | grep -q "ready"; then
    loki_ready=true
    break
  fi
  sleep 2
  echo -n "."
 done
 echo ""
 if [ "$loki_ready" != "true" ]; then
  echo -e "${RED}❌ Loki failed to start within 60 seconds${NC}"
  exit 1
 fi
 echo -e "${GREEN}✅ Loki is ready${NC}"
 # Export environment variables
 # Point the reporter at the local Loki; empty username/password are
 # exported, so no credentials are supplied through the environment.
 export GRAFANA_LOKI_URL="http://localhost:3100"
 export GRAFANA_LOKI_USERNAME=""
 export GRAFANA_LOKI_PASSWORD=""
 echo -e "\n${BLUE}🧪 Running a test Playwright test...${NC}"
 # Check if tests directory exists
 # Only the directory is checked; the spec file itself is not. `|| true`
 # keeps `set -e` from aborting when the run exits non-zero.
 if [ -d "tests/e2e" ]; then
  npx playwright test tests/e2e/auth/login.spec.ts --project=chromium --reporter=./tests/helpers/loki-reporter.ts 2>&1 || true
 else
  echo -e "${YELLOW}⚠️  No test files found. Skipping test execution.${NC}"
 fi
 # Wait a moment for logs to be ingested
 sleep 3
 echo -e "\n${BLUE}🔍 Querying Loki for test logs...${NC}"
 # Query window: the last 5 minutes, as nanosecond timestamps. Plain shell
 # arithmetic is used instead of GNU-only `date -d '5 minutes ago'`, so the
 # script also runs with BSD/macOS date.
 now_s=$(date +%s)
 start_time="$((now_s - 300))000000000"
 end_time="${now_s}000000000"
 # A failed request must not abort the script under `set -e`; it is reported
 # as "no logs found" below.
 response=$(curl -s -G "http://localhost:3100/loki/api/v1/query_range" \
  --data-urlencode 'query={job="playwright-tests"}' \
  --data-urlencode "start=$start_time" \
  --data-urlencode "end=$end_time") || response=""
 # Check if we got results
 # A non-JSON or empty response counts as zero streams.
 result_count=$(echo "$response" | jq -r '.data.result | length' 2>/dev/null) || result_count=0
 case "$result_count" in
  '' | *[!0-9]*) result_count=0 ;;
 esac
 if [ "$result_count" -gt 0 ]; then
  echo -e "${GREEN}✅ Found $result_count log streams in Loki${NC}"
  echo -e "\n${BLUE}Sample logs:${NC}"
  echo "$response" | jq -r '.data.result[0].values[0:3][] | .[1]' 2>/dev/null || echo "No log content available"
 else
  echo -e "${YELLOW}⚠️  No logs found in Loki. This might be expected if no tests ran.${NC}"
 fi
 # Display useful queries
 echo -e "\n${BLUE}📊 Useful LogQL Queries:${NC}"
 echo "------------------------------------"
 echo "All test logs:"
 echo '  {job="playwright-tests"}'
 echo ""
 echo "Failed tests only:"
 echo '  {job="playwright-tests", status="failed"}'
 echo ""
 echo "Tests by browser:"
 echo '  {job="playwright-tests", browser="chromium"}'
 echo ""
 echo "Test duration stats:"
 echo '  quantile_over_time(0.95, {job="playwright-tests"} | json | unwrap duration_ms [1h])'
 echo ""
 # Open Grafana
 echo -e "\n${GREEN}🌐 Grafana is available at: http://localhost:3000${NC}"
 echo -e "${BLUE}   Default credentials: admin / admin${NC}"
 echo ""
 # The follow-up instructions depend on how the stack was started above:
 # the compose file names its containers thrillwiki-* and provisions the Loki
 # data source, while the fallback path starts loki-test / grafana-test with
 # no provisioning.
 if [ -f "docker-compose.loki.yml" ]; then
  echo -e "${YELLOW}📖 Loki is already provisioned as a Grafana data source (grafana-datasources.yml)${NC}"
  echo "   Import the dashboard from: monitoring/grafana-dashboard.json"
  echo ""
  echo -e "${GREEN}✅ Test complete!${NC}"
  echo ""
  echo -e "${BLUE}To stop the containers:${NC}"
  echo "  docker-compose -f docker-compose.loki.yml down"
  echo ""
  echo -e "${BLUE}To view logs in real-time:${NC}"
  echo "  docker logs -f thrillwiki-loki"
 else
  echo -e "${YELLOW}📖 To add Loki as a data source in Grafana:${NC}"
  echo "   1. Go to Configuration > Data Sources"
  echo "   2. Add Loki with URL: http://localhost:3100"
  echo "   3. Import the dashboard from: monitoring/grafana-dashboard.json"
  echo ""
  echo -e "${GREEN}✅ Test complete!${NC}"
  echo ""
  echo -e "${BLUE}To stop the containers:${NC}"
  echo "  docker stop loki-test grafana-test"
  echo "  docker rm loki-test grafana-test"
  echo ""
  echo -e "${BLUE}To view logs in real-time:${NC}"
  echo "  docker logs -f loki-test"
 fi
--- a/tests/helpers/loki-reporter.ts
+++ b/tests/helpers/loki-reporter.ts
@@ -0,0 +1,267 @@
 /**
 * Custom Playwright Reporter for Grafana Loki
 * 
 * Streams test events and results to Loki in real-time for centralized logging and monitoring.
 */
 import {
  FullConfig,
  FullResult,
  Reporter,
  Suite,
  TestCase,
  TestResult,
  TestStep,
 } from '@playwright/test/reporter';
 /**
 * One stream entry of a Loki push payload: a label set plus its log lines.
 */
 interface LokiStream {
  /**
   * Log entries as `[timestamp, line]` pairs. The reporter writes the
   * timestamp as a nanosecond epoch string and the line as a JSON document.
   */
  values: [string, string][];
  /** Label name/value pairs attached to every entry in `values`. */
  stream: { [label: string]: string };
 }
 /**
 * JSON body the reporter POSTs to `/loki/api/v1/push`.
 */
 interface LokiPushRequest {
  /** Every stream entry contained in this batch. */
  streams: Array<LokiStream>;
 }
 /**
 * Options accepted by the LokiReporter constructor. All fields are optional.
 */
 interface LokiReporterOptions {
  /**
   * Base URL of the Loki instance. Falls back to the GRAFANA_LOKI_URL
   * environment variable, then to `http://localhost:3100`.
   */
  lokiUrl?: string;
  /** Basic-auth user name; falls back to GRAFANA_LOKI_USERNAME. */
  username?: string;
  /** Basic-auth password; falls back to GRAFANA_LOKI_PASSWORD. */
  password?: string;
  /** Extra labels merged over the reporter's base labels. */
  labels?: { [name: string]: string };
  /** Number of buffered entries that triggers an immediate flush (default 10). */
  batchSize?: number;
  /** Period of the background flush in milliseconds (default 5000). */
  flushInterval?: number;
 }
 /**
 * Custom Playwright reporter that sends logs to Grafana Loki.
 *
 * Events are buffered in memory and pushed to `<lokiUrl>/loki/api/v1/push`
 * when `batchSize` entries have accumulated, every `flushInterval`
 * milliseconds, and once more when the run ends. Delivery is best-effort:
 * failures to reach Loki are reported with `console.error` and the affected
 * entries are kept (up to a fixed cap) for the next attempt.
 */
 export default class LokiReporter implements Reporter {
  /**
   * Maximum number of entries kept for retry while Loki is unreachable.
   * The oldest entries are dropped first so memory use stays bounded.
   */
  private static readonly MAX_BUFFERED_ENTRIES = 1000;

  private lokiUrl: string;
  private basicAuth?: string;
  private batchSize: number;
  private flushInterval: number;
  private buffer: LokiStream[] = [];
  private flushTimer?: NodeJS.Timeout;
  private labels: Record<string, string>;
  private testStartTime?: number;
  // Tail of the chain of sends; each flush is appended to it so that sends
  // never overlap and awaiting the latest flush awaits all earlier ones.
  private flushChain: Promise<void> = Promise.resolve();

  /**
   * @param options Connection, batching and label settings. Missing
   *   connection values fall back to the GRAFANA_LOKI_* environment variables.
   */
  constructor(options: LokiReporterOptions = {}) {
    const configuredUrl =
      options.lokiUrl || process.env.GRAFANA_LOKI_URL || 'http://localhost:3100';
    // Strip trailing slashes so the push endpoint never contains "//".
    this.lokiUrl = configuredUrl.replace(/\/+$/, '');
    this.batchSize = options.batchSize || 10;
    this.flushInterval = options.flushInterval || 5000;
    // Setup basic auth if credentials provided
    const username = options.username || process.env.GRAFANA_LOKI_USERNAME;
    const password = options.password || process.env.GRAFANA_LOKI_PASSWORD;
    if (username && password) {
      this.basicAuth = Buffer.from(`${username}:${password}`).toString('base64');
    }
    // Base labels for all logs
    this.labels = {
      job: 'playwright-tests',
      workflow: process.env.GITHUB_WORKFLOW || 'local',
      branch: process.env.GITHUB_REF_NAME || 'local',
      commit: process.env.GITHUB_SHA || 'local',
      run_id: process.env.GITHUB_RUN_ID || 'local',
      ...options.labels,
    };
    // Setup periodic flush
    const timer = setInterval(() => {
      void this.flush();
    }, this.flushInterval);
    // The timer must not keep the Node.js process alive on its own, e.g.
    // when the run aborts before onEnd gets a chance to clear it.
    timer.unref();
    this.flushTimer = timer;
  }

  /**
   * Called once before running tests
   */
  async onBegin(config: FullConfig, suite: Suite) {
    this.testStartTime = Date.now();
    const testCount = suite.allTests().length;
    await this.log({
      event: 'test_suite_start',
      message: `Starting Playwright test suite with ${testCount} tests`,
      total_tests: testCount,
      workers: config.workers,
    });
  }

  /**
   * Called after a test has been started
   */
  async onTestBegin(test: TestCase) {
    const project = this.getProjectName(test);
    const testFile = this.getRelativePath(test.location.file);
    await this.log({
      event: 'test_start',
      test_name: test.title,
      test_file: testFile,
      project,
      message: `Test started: ${test.title}`,
    }, {
      browser: project,
      test_file: testFile,
    });
  }

  /**
   * Called after a test has been finished
   */
  async onTestEnd(test: TestCase, result: TestResult) {
    const status = result.status;
    const duration = result.duration;
    const browser = this.getProjectName(test);
    const testFile = this.getRelativePath(test.location.file);
    // Timed-out tests are treated like failed ones everywhere below, so the
    // structured error fields and the step log agree with the message.
    const isFailure = status === 'failed' || status === 'timedOut';
    // Determine log message based on status
    let message = `Test ${status}: ${test.title}`;
    if (isFailure) {
      message = `${message} - ${result.error?.message || 'Unknown error'}`;
    }
    // NOTE(review): test_name is sent as a label, which creates one Loki
    // stream per test title; confirm this cardinality is acceptable.
    await this.log({
      event: 'test_end',
      test_name: test.title,
      test_file: testFile,
      status,
      duration_ms: duration,
      retry: result.retry,
      message,
      error: isFailure ? result.error?.message : undefined,
      error_stack: isFailure ? result.error?.stack : undefined,
    }, {
      browser,
      test_file: testFile,
      test_name: test.title,
      status,
    });
    // Log individual test steps for failed tests
    if (isFailure) {
      for (const step of result.steps) {
        await this.logStep(test, step, browser, testFile);
      }
    }
  }

  /**
   * Log test step details
   */
  private async logStep(test: TestCase, step: TestStep, browser: string, testFile: string) {
    await this.log({
      event: 'test_step',
      test_name: test.title,
      step_title: step.title,
      step_category: step.category,
      duration_ms: step.duration,
      error: step.error?.message,
      message: `Step: ${step.title}`,
    }, {
      browser,
      test_file: testFile,
      step_category: step.category,
    });
  }

  /**
   * Called after all tests have been finished
   */
  async onEnd(result: FullResult) {
    const duration = this.testStartTime ? Date.now() - this.testStartTime : 0;
    // Stop the periodic flush first so it cannot race with the final one.
    if (this.flushTimer) {
      clearInterval(this.flushTimer);
      this.flushTimer = undefined;
    }
    await this.log({
      event: 'test_suite_end',
      status: result.status,
      duration_ms: duration,
      message: `Test suite ${result.status} in ${(duration / 1000).toFixed(2)}s`,
    });
    // Flush remaining logs; this also waits for sends that are still queued.
    await this.flush();
    if (this.buffer.length > 0) {
      console.error(`Dropping ${this.buffer.length} log entries that could not be sent to Loki`);
    }
  }

  /**
   * Log a message to Loki
   *
   * @param data Event payload; serialised to JSON as the log line. Its
   *   `event` field (default `'log'`) is also used as a label.
   * @param extraLabels Labels merged over the reporter's base labels.
   */
  private async log(data: Record<string, any>, extraLabels: Record<string, string> = {}) {
    // Nanosecond epoch as a string. Built by concatenation because the
    // numeric value would exceed Number.MAX_SAFE_INTEGER.
    const timestamp = `${Date.now()}000000`;
    const stream: LokiStream = {
      stream: {
        ...this.labels,
        ...extraLabels,
        event: data.event || 'log',
      },
      values: [[timestamp, JSON.stringify(data)]],
    };
    this.buffer.push(stream);
    // Flush if buffer is full
    if (this.buffer.length >= this.batchSize) {
      await this.flush();
    }
  }

  /**
   * Flush buffered logs to Loki
   *
   * Sends are chained one after another, so the returned promise resolves
   * only after every previously requested send has finished. It never rejects.
   */
  private flush(): Promise<void> {
    this.flushChain = this.flushChain.then(() => this.sendBuffered());
    return this.flushChain;
  }

  /**
   * Send the current buffer contents as one push request.
   *
   * Network errors re-queue the batch for a later attempt. Batches answered
   * with a non-2xx status are logged and dropped, as resending a payload the
   * server has already rejected would repeat the failure.
   */
  private async sendBuffered(): Promise<void> {
    if (this.buffer.length === 0) {
      return;
    }
    const batch = this.buffer;
    this.buffer = [];
    const payload: LokiPushRequest = { streams: batch };
    try {
      const headers: Record<string, string> = {
        'Content-Type': 'application/json',
      };
      if (this.basicAuth) {
        headers['Authorization'] = `Basic ${this.basicAuth}`;
      }
      const response = await fetch(`${this.lokiUrl}/loki/api/v1/push`, {
        method: 'POST',
        headers,
        body: JSON.stringify(payload),
      });
      if (!response.ok) {
        console.error(`Failed to send logs to Loki: ${response.status} ${response.statusText}`);
        // Reading the body must not turn a rejected batch into a retry.
        const errorText = await response.text().catch(() => '');
        console.error(`Response: ${errorText}`);
      }
    } catch (error) {
      console.error('Error sending logs to Loki:', error);
      // Re-add to buffer to retry: ahead of entries logged in the meantime
      // to keep chronological order, and capped to the newest entries.
      this.buffer = [...batch, ...this.buffer].slice(-LokiReporter.MAX_BUFFERED_ENTRIES);
    }
  }

  /**
   * Name of the Playwright project the test belongs to, or 'unknown'.
   */
  private getProjectName(test: TestCase): string {
    return test.parent.project()?.name || 'unknown';
  }

  /**
   * Get relative path from project root
   *
   * @param filePath Path of a test file.
   * @returns The path below `process.cwd()` when the file lives inside it,
   *   otherwise `filePath` unchanged.
   */
  private getRelativePath(filePath: string): string {
    const cwd = process.cwd();
    if (!filePath.startsWith(cwd)) {
      return filePath;
    }
    // Require a path separator right after the prefix so that a sibling
    // directory such as "<cwd>-old" is not mistaken for a child of cwd.
    const boundary = filePath.charAt(cwd.length);
    if (boundary !== '/' && boundary !== '\\') {
      return filePath;
    }
    return filePath.substring(cwd.length + 1);
  }

  /**
   * Tells Playwright whether this reporter uses stdio for its report.
   * Always returns false: results are sent to Loki rather than printed.
   */
  printsToStdio() {
    return false;
  }
 }