"""Module for queue health checks"""
|
|
|
|
import logging
|
|
import psutil
|
|
import time
|
|
from enum import Enum
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict, Optional, Tuple, List, Any, Set
|
|
from datetime import datetime, timedelta
|
|
|
|
logger = logging.getLogger("QueueHealthChecker")
|
|
|
|
class HealthStatus(Enum):
    """Possible health status values."""

    HEALTHY = "healthy"
    WARNING = "warning"
    CRITICAL = "critical"
    UNKNOWN = "unknown"


class HealthCategory(Enum):
    """Health check categories."""

    MEMORY = "memory"
    PERFORMANCE = "performance"
    ACTIVITY = "activity"
    ERRORS = "errors"
    DEADLOCKS = "deadlocks"
    SYSTEM = "system"


@dataclass
class HealthThresholds:
    """Defines thresholds for health checks."""

    memory_warning_mb: int = 384
    memory_critical_mb: int = 512
    deadlock_warning_sec: int = 30
    deadlock_critical_sec: int = 60
    error_rate_warning: float = 0.1  # 10% errors
    error_rate_critical: float = 0.2  # 20% errors
    inactivity_warning_sec: int = 30
    inactivity_critical_sec: int = 60
    cpu_warning_percent: float = 80.0
    cpu_critical_percent: float = 90.0


@dataclass
class HealthCheckResult:
    """Result of a single health check."""

    category: HealthCategory
    status: HealthStatus
    message: str
    value: Optional[float] = None
    timestamp: datetime = field(default_factory=datetime.utcnow)
    details: Dict[str, Any] = field(default_factory=dict)


class HealthHistory:
    """Tracks health check history."""

    def __init__(self, max_history: int = 1000):
        self.max_history = max_history
        self.history: List[HealthCheckResult] = []
        self.status_changes: List[Dict[str, Any]] = []
        self.critical_events: List[Dict[str, Any]] = []
        # Last known status per category, so that status changes are detected
        # within a category rather than between unrelated consecutive checks.
        self._last_status: Dict[HealthCategory, HealthStatus] = {}

    def add_result(self, result: HealthCheckResult) -> None:
        """Add a health check result."""
        self.history.append(result)
        if len(self.history) > self.max_history:
            self.history.pop(0)

        # Track status changes per category (consecutive entries in the shared
        # history list usually belong to different categories)
        previous = self._last_status.get(result.category)
        if previous is not None and previous != result.status:
            self.status_changes.append({
                "timestamp": result.timestamp,
                "category": result.category.value,
                "from_status": previous.value,
                "to_status": result.status.value,
                "message": result.message,
            })
        self._last_status[result.category] = result.status

        # Track critical events
        if result.status == HealthStatus.CRITICAL:
            self.critical_events.append({
                "timestamp": result.timestamp,
                "category": result.category.value,
                "message": result.message,
                "details": result.details,
            })

    def get_status_summary(self) -> Dict[str, Any]:
        """Get a summary of health status history."""
        return {
            "total_checks": len(self.history),
            "status_changes": len(self.status_changes),
            "critical_events": len(self.critical_events),
            "recent_status_changes": self.status_changes[-5:],
            "recent_critical_events": self.critical_events[-5:],
        }


class SystemHealthMonitor:
    """Monitors system health metrics."""

    def __init__(self):
        self.process = psutil.Process()

    async def check_system_health(self) -> Dict[str, Any]:
        """Check system health metrics."""
        try:
            cpu_percent = self.process.cpu_percent()
            memory_info = self.process.memory_info()
            # io_counters() is not implemented on every platform (e.g. macOS);
            # the except clause below degrades to an empty result in that case.
            io_counters = self.process.io_counters()

            return {
                "cpu_percent": cpu_percent,
                "memory_rss": memory_info.rss / 1024 / 1024,  # MB
                "memory_vms": memory_info.vms / 1024 / 1024,  # MB
                "io_read_mb": io_counters.read_bytes / 1024 / 1024,
                "io_write_mb": io_counters.write_bytes / 1024 / 1024,
                "thread_count": self.process.num_threads(),
                "open_files": len(self.process.open_files()),
                # connections() is deprecated in newer psutil releases in
                # favour of net_connections(); kept for compatibility.
                "connections": len(self.process.connections()),
            }
        except Exception as e:
            logger.error(f"Error checking system health: {e}")
            return {}


class HealthChecker:
    """Handles health checks for the queue system."""

    def __init__(
        self,
        thresholds: Optional[HealthThresholds] = None,
        history_size: int = 1000,
    ):
        self.thresholds = thresholds or HealthThresholds()
        self.history = HealthHistory(history_size)
        self.system_monitor = SystemHealthMonitor()
        self._last_gc_time: Optional[datetime] = None

    async def check_health(
        self,
        metrics: Dict[str, Any],
        queue_info: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Perform a comprehensive health check."""
        results = []

        # Check memory health
        memory_result = await self._check_memory_health()
        results.append(memory_result)

        # Check performance health
        perf_result = self._check_performance_health(metrics)
        results.append(perf_result)

        # Check activity health
        activity_result = self._check_activity_health(
            queue_info["last_activity"],
            queue_info["processing_count"] > 0,
        )
        results.append(activity_result)

        # Check error health
        error_result = self._check_error_health(metrics)
        results.append(error_result)

        # Check for deadlocks
        deadlock_result = self._check_deadlocks(queue_info)
        results.append(deadlock_result)

        # Check system health
        system_result = await self._check_system_health()
        results.append(system_result)

        # Record results
        for result in results:
            self.history.add_result(result)

        # Determine overall health
        overall_status = self._determine_overall_status(results)

        return {
            "timestamp": datetime.utcnow().isoformat(),
            "overall_status": overall_status.value,
            "checks": [
                {
                    "category": r.category.value,
                    "status": r.status.value,
                    "message": r.message,
                    "value": r.value,
                    "details": r.details,
                }
                for r in results
            ],
            "history": self.history.get_status_summary(),
        }

    async def _check_memory_health(self) -> HealthCheckResult:
        """Check memory health."""
        try:
            memory_usage = psutil.Process().memory_info().rss / 1024 / 1024  # MB

            if memory_usage > self.thresholds.memory_critical_mb:
                # Try to reclaim memory before reporting, but force a garbage
                # collection at most once every five minutes
                if (
                    not self._last_gc_time
                    or datetime.utcnow() - self._last_gc_time > timedelta(minutes=5)
                ):
                    import gc

                    gc.collect()
                    self._last_gc_time = datetime.utcnow()
                    memory_usage = psutil.Process().memory_info().rss / 1024 / 1024

                status = HealthStatus.CRITICAL
                message = f"Critical memory usage: {memory_usage:.1f}MB"
            elif memory_usage > self.thresholds.memory_warning_mb:
                status = HealthStatus.WARNING
                message = f"High memory usage: {memory_usage:.1f}MB"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal memory usage: {memory_usage:.1f}MB"

            return HealthCheckResult(
                category=HealthCategory.MEMORY,
                status=status,
                message=message,
                value=memory_usage,
            )

        except Exception as e:
            logger.error(f"Error checking memory health: {e}")
            return HealthCheckResult(
                category=HealthCategory.MEMORY,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking memory: {e}",
            )

    def _check_performance_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
        """Check performance health."""
        try:
            avg_time = metrics.get("avg_processing_time", 0)
            success_rate = metrics.get("success_rate", 1.0)

            if success_rate < 0.5:  # Less than 50% success
                status = HealthStatus.CRITICAL
                message = f"Critical performance: {success_rate:.1%} success rate"
            elif success_rate < 0.8:  # Less than 80% success
                status = HealthStatus.WARNING
                message = f"Degraded performance: {success_rate:.1%} success rate"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal performance: {success_rate:.1%} success rate"

            return HealthCheckResult(
                category=HealthCategory.PERFORMANCE,
                status=status,
                message=message,
                value=success_rate,
                details={"avg_processing_time": avg_time},
            )

        except Exception as e:
            logger.error(f"Error checking performance health: {e}")
            return HealthCheckResult(
                category=HealthCategory.PERFORMANCE,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking performance: {e}",
            )

    def _check_activity_health(
        self,
        last_activity_time: float,
        has_processing_items: bool,
    ) -> HealthCheckResult:
        """Check activity health."""
        if not has_processing_items:
            return HealthCheckResult(
                category=HealthCategory.ACTIVITY,
                status=HealthStatus.HEALTHY,
                message="No items being processed",
            )

        inactive_time = time.time() - last_activity_time

        if inactive_time > self.thresholds.inactivity_critical_sec:
            status = HealthStatus.CRITICAL
            message = f"No activity for {inactive_time:.1f}s"
        elif inactive_time > self.thresholds.inactivity_warning_sec:
            status = HealthStatus.WARNING
            message = f"Limited activity for {inactive_time:.1f}s"
        else:
            status = HealthStatus.HEALTHY
            message = "Normal activity levels"

        return HealthCheckResult(
            category=HealthCategory.ACTIVITY,
            status=status,
            message=message,
            value=inactive_time,
        )

    def _check_error_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
        """Check error health."""
        try:
            error_rate = metrics.get("error_rate", 0.0)
            error_count = metrics.get("total_errors", 0)

            if error_rate > self.thresholds.error_rate_critical:
                status = HealthStatus.CRITICAL
                message = f"Critical error rate: {error_rate:.1%}"
            elif error_rate > self.thresholds.error_rate_warning:
                status = HealthStatus.WARNING
                message = f"High error rate: {error_rate:.1%}"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal error rate: {error_rate:.1%}"

            return HealthCheckResult(
                category=HealthCategory.ERRORS,
                status=status,
                message=message,
                value=error_rate,
                details={"error_count": error_count},
            )

        except Exception as e:
            logger.error(f"Error checking error health: {e}")
            return HealthCheckResult(
                category=HealthCategory.ERRORS,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking errors: {e}",
            )

    def _check_deadlocks(self, queue_info: Dict[str, Any]) -> HealthCheckResult:
        """Check for potential deadlocks."""
        try:
            stuck_items = queue_info.get("stuck_items", [])
            if not stuck_items:
                return HealthCheckResult(
                    category=HealthCategory.DEADLOCKS,
                    status=HealthStatus.HEALTHY,
                    message="No stuck items detected",
                )

            # Age of the item that has been in processing the longest
            longest_stuck = max(
                time.time() - item["start_time"] for item in stuck_items
            )

            if longest_stuck > self.thresholds.deadlock_critical_sec:
                status = HealthStatus.CRITICAL
                message = f"Potential deadlock: {len(stuck_items)} items stuck"
            elif longest_stuck > self.thresholds.deadlock_warning_sec:
                status = HealthStatus.WARNING
                message = f"Slow processing: {len(stuck_items)} items delayed"
            else:
                status = HealthStatus.HEALTHY
                message = "Normal processing time"

            return HealthCheckResult(
                category=HealthCategory.DEADLOCKS,
                status=status,
                message=message,
                value=longest_stuck,
                details={"stuck_items": len(stuck_items)},
            )

        except Exception as e:
            logger.error(f"Error checking deadlocks: {e}")
            return HealthCheckResult(
                category=HealthCategory.DEADLOCKS,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking deadlocks: {e}",
            )

    async def _check_system_health(self) -> HealthCheckResult:
        """Check system health."""
        try:
            metrics = await self.system_monitor.check_system_health()

            if not metrics:
                return HealthCheckResult(
                    category=HealthCategory.SYSTEM,
                    status=HealthStatus.UNKNOWN,
                    message="Unable to get system metrics",
                )

            cpu_percent = metrics["cpu_percent"]
            if cpu_percent > self.thresholds.cpu_critical_percent:
                status = HealthStatus.CRITICAL
                message = f"Critical CPU usage: {cpu_percent:.1f}%"
            elif cpu_percent > self.thresholds.cpu_warning_percent:
                status = HealthStatus.WARNING
                message = f"High CPU usage: {cpu_percent:.1f}%"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal CPU usage: {cpu_percent:.1f}%"

            return HealthCheckResult(
                category=HealthCategory.SYSTEM,
                status=status,
                message=message,
                value=cpu_percent,
                details=metrics,
            )

        except Exception as e:
            logger.error(f"Error checking system health: {e}")
            return HealthCheckResult(
                category=HealthCategory.SYSTEM,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking system: {e}",
            )

    def _determine_overall_status(
        self,
        results: List[HealthCheckResult],
    ) -> HealthStatus:
        """Determine overall health status (worst individual status wins)."""
        if any(r.status == HealthStatus.CRITICAL for r in results):
            return HealthStatus.CRITICAL
        if any(r.status == HealthStatus.WARNING for r in results):
            return HealthStatus.WARNING
        if any(r.status == HealthStatus.UNKNOWN for r in results):
            return HealthStatus.UNKNOWN
        return HealthStatus.HEALTHY

    def format_health_report(
        self,
        results: List[HealthCheckResult],
    ) -> str:
        """Format a detailed health report."""
        lines = ["Queue Health Report:"]

        for result in results:
            lines.append(
                f"\n{result.category.value.title()}:"
                f"\n- Status: {result.status.value}"
                f"\n- {result.message}"
            )
            if result.details:
                for key, value in result.details.items():
                    lines.append(f"  - {key}: {value}")

        return "\n".join(lines)
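

# ---------------------------------------------------------------------------
# Example usage: a minimal sketch, not part of the original module. The
# `metrics` and `queue_info` keys below are assumptions inferred from the
# lookups the check methods perform; a real queue system would supply its
# own values.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        # Thresholds can be tightened or relaxed per deployment.
        checker = HealthChecker(
            thresholds=HealthThresholds(memory_warning_mb=256, cpu_warning_percent=70.0)
        )
        report = await checker.check_health(
            metrics={
                "avg_processing_time": 1.2,  # seconds (hypothetical)
                "success_rate": 0.95,
                "error_rate": 0.05,
                "total_errors": 3,
            },
            queue_info={
                "last_activity": time.time(),
                "processing_count": 2,
                "stuck_items": [],  # e.g. [{"start_time": time.time() - 90}]
            },
        )
        print(report["overall_status"], "-", len(report["checks"]), "checks run")

        # format_health_report() consumes raw HealthCheckResult objects.
        sample = HealthCheckResult(
            category=HealthCategory.MEMORY,
            status=HealthStatus.WARNING,
            message="High memory usage: 400.0MB",
            value=400.0,
        )
        print(checker.format_health_report([sample]))

    asyncio.run(_demo())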