Pac-cogs/videoarchiver/queue/health_checker.py

"""Module for queue health checks"""

import logging
import psutil # type: ignore
import time
from enum import Enum
from dataclasses import dataclass, field
from typing import Dict, Optional, Tuple, List, Any, Set
from datetime import datetime, timedelta

logger = logging.getLogger("QueueHealthChecker")

class HealthStatus(Enum):
    """Possible health status values"""
    HEALTHY = "healthy"
    WARNING = "warning"
    CRITICAL = "critical"
    UNKNOWN = "unknown"

class HealthCategory(Enum):
    """Health check categories"""
    MEMORY = "memory"
    PERFORMANCE = "performance"
    ACTIVITY = "activity"
    ERRORS = "errors"
    DEADLOCKS = "deadlocks"
    SYSTEM = "system"

@dataclass
class HealthThresholds:
    """Defines thresholds for health checks"""
    memory_warning_mb: int = 384    # 384MB
    memory_critical_mb: int = 512   # 512MB
    deadlock_warning_sec: int = 30  # 30 seconds
    deadlock_critical_sec: int = 60 # 1 minute
    error_rate_warning: float = 0.1 # 10% errors
    error_rate_critical: float = 0.2 # 20% errors
    inactivity_warning_sec: int = 30
    inactivity_critical_sec: int = 60
    cpu_warning_percent: float = 80.0
    cpu_critical_percent: float = 90.0

@dataclass
class HealthCheckResult:
    """Result of a health check"""
    category: HealthCategory
    status: HealthStatus
    message: str
    value: Optional[float] = None
    timestamp: datetime = field(default_factory=datetime.utcnow)
    details: Dict[str, Any] = field(default_factory=dict)

class HealthHistory:
    """Tracks health check history"""

    def __init__(self, max_history: int = 1000):
        self.max_history = max_history
        self.history: List[HealthCheckResult] = []
        self.status_changes: List[Dict[str, Any]] = []
        self.critical_events: List[Dict[str, Any]] = []

    def add_result(self, result: HealthCheckResult) -> None:
        """Add a health check result"""
        self.history.append(result)
        if len(self.history) > self.max_history:
            self.history.pop(0)

        # Track status changes
        if self.history[-2:-1] and self.history[-1].status != self.history[-2].status:
            self.status_changes.append({
                "timestamp": result.timestamp,
                "category": result.category.value,
                "from_status": self.history[-2].status.value,
                "to_status": result.status.value,
                "message": result.message
            })

        # Track critical events
        if result.status == HealthStatus.CRITICAL:
            self.critical_events.append({
                "timestamp": result.timestamp,
                "category": result.category.value,
                "message": result.message,
                "details": result.details
            })

    def get_status_summary(self) -> Dict[str, Any]:
        """Get summary of health status history"""
        return {
            "total_checks": len(self.history),
            "status_changes": len(self.status_changes),
            "critical_events": len(self.critical_events),
            "recent_status_changes": self.status_changes[-5:],
            "recent_critical_events": self.critical_events[-5:]
        }

class SystemHealthMonitor:
    """Monitors system health metrics"""

    def __init__(self):
        self.process = psutil.Process()

    async def check_system_health(self) -> Dict[str, Any]:
        """Check system health metrics"""
        try:
            cpu_percent = self.process.cpu_percent()
            memory_info = self.process.memory_info()
            io_counters = self.process.io_counters()

            return {
                "cpu_percent": cpu_percent,
                "memory_rss": memory_info.rss / 1024 / 1024,  # MB
                "memory_vms": memory_info.vms / 1024 / 1024,  # MB
                "io_read_mb": io_counters.read_bytes / 1024 / 1024,
                "io_write_mb": io_counters.write_bytes / 1024 / 1024,
                "thread_count": self.process.num_threads(),
                "open_files": len(self.process.open_files()),
                "connections": len(self.process.connections())
            }
        except Exception as e:
            logger.error(f"Error checking system health: {e}")
            return {}

class HealthChecker:
    """Handles health checks for the queue system"""

    def __init__(
        self,
        thresholds: Optional[HealthThresholds] = None,
        history_size: int = 1000
    ):
        self.thresholds = thresholds or HealthThresholds()
        self.history = HealthHistory(history_size)
        self.system_monitor = SystemHealthMonitor()
        self._last_gc_time: Optional[datetime] = None

    async def check_health(
        self,
        metrics: Dict[str, Any],
        queue_info: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Perform comprehensive health check"""
        results = []

        # Check memory health
        memory_result = await self._check_memory_health()
        results.append(memory_result)

        # Check performance health
        perf_result = self._check_performance_health(metrics)
        results.append(perf_result)

        # Check activity health
        activity_result = self._check_activity_health(
            queue_info["last_activity"],
            queue_info["processing_count"] > 0
        )
        results.append(activity_result)

        # Check error health
        error_result = self._check_error_health(metrics)
        results.append(error_result)

        # Check for deadlocks
        deadlock_result = self._check_deadlocks(queue_info)
        results.append(deadlock_result)

        # Check system health
        system_result = await self._check_system_health()
        results.append(system_result)

        # Record results
        for result in results:
            self.history.add_result(result)

        # Determine overall health
        overall_status = self._determine_overall_status(results)

        return {
            "timestamp": datetime.utcnow().isoformat(),
            "overall_status": overall_status.value,
            "checks": [
                {
                    "category": r.category.value,
                    "status": r.status.value,
                    "message": r.message,
                    "value": r.value,
                    "details": r.details
                }
                for r in results
            ],
            "history": self.history.get_status_summary()
        }

    async def _check_memory_health(self) -> HealthCheckResult:
        """Check memory health"""
        try:
            memory_usage = psutil.Process().memory_info().rss / 1024 / 1024  # MB

            if memory_usage > self.thresholds.memory_critical_mb:
                if (
                    not self._last_gc_time or
                    datetime.utcnow() - self._last_gc_time > timedelta(minutes=5)
                ):
                    import gc
                    gc.collect()
                    self._last_gc_time = datetime.utcnow()
                    memory_usage = psutil.Process().memory_info().rss / 1024 / 1024

                status = HealthStatus.CRITICAL
                message = f"Critical memory usage: {memory_usage:.1f}MB"
            elif memory_usage > self.thresholds.memory_warning_mb:
                status = HealthStatus.WARNING
                message = f"High memory usage: {memory_usage:.1f}MB"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal memory usage: {memory_usage:.1f}MB"

            return HealthCheckResult(
                category=HealthCategory.MEMORY,
                status=status,
                message=message,
                value=memory_usage
            )

        except Exception as e:
            logger.error(f"Error checking memory health: {e}")
            return HealthCheckResult(
                category=HealthCategory.MEMORY,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking memory: {str(e)}"
            )

    def _check_performance_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
        """Check performance health"""
        try:
            avg_time = metrics.get("avg_processing_time", 0)
            success_rate = metrics.get("success_rate", 1.0)

            if success_rate < 0.5:  # Less than 50% success
                status = HealthStatus.CRITICAL
                message = f"Critical performance: {success_rate:.1%} success rate"
            elif success_rate < 0.8:  # Less than 80% success
                status = HealthStatus.WARNING
                message = f"Degraded performance: {success_rate:.1%} success rate"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal performance: {success_rate:.1%} success rate"

            return HealthCheckResult(
                category=HealthCategory.PERFORMANCE,
                status=status,
                message=message,
                value=success_rate,
                details={"avg_processing_time": avg_time}
            )

        except Exception as e:
            logger.error(f"Error checking performance health: {e}")
            return HealthCheckResult(
                category=HealthCategory.PERFORMANCE,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking performance: {str(e)}"
            )

    def _check_activity_health(
        self,
        last_activity_time: float,
        has_processing_items: bool
    ) -> HealthCheckResult:
        """Check activity health"""
        if not has_processing_items:
            return HealthCheckResult(
                category=HealthCategory.ACTIVITY,
                status=HealthStatus.HEALTHY,
                message="No items being processed"
            )

        inactive_time = time.time() - last_activity_time

        if inactive_time > self.thresholds.inactivity_critical_sec:
            status = HealthStatus.CRITICAL
            message = f"No activity for {inactive_time:.1f}s"
        elif inactive_time > self.thresholds.inactivity_warning_sec:
            status = HealthStatus.WARNING
            message = f"Limited activity for {inactive_time:.1f}s"
        else:
            status = HealthStatus.HEALTHY
            message = "Normal activity levels"

        return HealthCheckResult(
            category=HealthCategory.ACTIVITY,
            status=status,
            message=message,
            value=inactive_time
        )

    def _check_error_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
        """Check error health"""
        try:
            error_rate = metrics.get("error_rate", 0.0)
            error_count = metrics.get("total_errors", 0)

            if error_rate > self.thresholds.error_rate_critical:
                status = HealthStatus.CRITICAL
                message = f"Critical error rate: {error_rate:.1%}"
            elif error_rate > self.thresholds.error_rate_warning:
                status = HealthStatus.WARNING
                message = f"High error rate: {error_rate:.1%}"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal error rate: {error_rate:.1%}"

            return HealthCheckResult(
                category=HealthCategory.ERRORS,
                status=status,
                message=message,
                value=error_rate,
                details={"error_count": error_count}
            )

        except Exception as e:
            logger.error(f"Error checking error health: {e}")
            return HealthCheckResult(
                category=HealthCategory.ERRORS,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking errors: {str(e)}"
            )

    def _check_deadlocks(self, queue_info: Dict[str, Any]) -> HealthCheckResult:
        """Check for potential deadlocks"""
        try:
            stuck_items = queue_info.get("stuck_items", [])
            if not stuck_items:
                return HealthCheckResult(
                    category=HealthCategory.DEADLOCKS,
                    status=HealthStatus.HEALTHY,
                    message="No stuck items detected"
                )

            longest_stuck = max(
                time.time() - item["start_time"]
                for item in stuck_items
            )

            if longest_stuck > self.thresholds.deadlock_critical_sec:
                status = HealthStatus.CRITICAL
                message = f"Potential deadlock: {len(stuck_items)} items stuck"
            elif longest_stuck > self.thresholds.deadlock_warning_sec:
                status = HealthStatus.WARNING
                message = f"Slow processing: {len(stuck_items)} items delayed"
            else:
                status = HealthStatus.HEALTHY
                message = "Normal processing time"

            return HealthCheckResult(
                category=HealthCategory.DEADLOCKS,
                status=status,
                message=message,
                value=longest_stuck,
                details={"stuck_items": len(stuck_items)}
            )

        except Exception as e:
            logger.error(f"Error checking deadlocks: {e}")
            return HealthCheckResult(
                category=HealthCategory.DEADLOCKS,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking deadlocks: {str(e)}"
            )

    async def _check_system_health(self) -> HealthCheckResult:
        """Check system health"""
        try:
            metrics = await self.system_monitor.check_system_health()

            if not metrics:
                return HealthCheckResult(
                    category=HealthCategory.SYSTEM,
                    status=HealthStatus.UNKNOWN,
                    message="Unable to get system metrics"
                )

            cpu_percent = metrics["cpu_percent"]
            if cpu_percent > self.thresholds.cpu_critical_percent:
                status = HealthStatus.CRITICAL
                message = f"Critical CPU usage: {cpu_percent:.1f}%"
            elif cpu_percent > self.thresholds.cpu_warning_percent:
                status = HealthStatus.WARNING
                message = f"High CPU usage: {cpu_percent:.1f}%"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal CPU usage: {cpu_percent:.1f}%"

            return HealthCheckResult(
                category=HealthCategory.SYSTEM,
                status=status,
                message=message,
                value=cpu_percent,
                details=metrics
            )

        except Exception as e:
            logger.error(f"Error checking system health: {e}")
            return HealthCheckResult(
                category=HealthCategory.SYSTEM,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking system: {str(e)}"
            )

    def _determine_overall_status(
        self,
        results: List[HealthCheckResult]
    ) -> HealthStatus:
        """Determine overall health status"""
        if any(r.status == HealthStatus.CRITICAL for r in results):
            return HealthStatus.CRITICAL
        if any(r.status == HealthStatus.WARNING for r in results):
            return HealthStatus.WARNING
        if any(r.status == HealthStatus.UNKNOWN for r in results):
            return HealthStatus.UNKNOWN
        return HealthStatus.HEALTHY

    def format_health_report(
        self,
        results: List[HealthCheckResult]
    ) -> str:
        """Format a detailed health report"""
        lines = ["Queue Health Report:"]

        for result in results:
            lines.append(
                f"\n{result.category.value.title()}:"
                f"\n- Status: {result.status.value}"
                f"\n- {result.message}"
            )
            if result.details:
                for key, value in result.details.items():
                    lines.append(f"  - {key}: {value}")

        return "\n".join(lines)