Files
Pac-cogs/videoarchiver/queue/health_checker.py
pacnpal a4ca6e8ea6 Core Systems:
Component-based architecture with lifecycle management
Enhanced error handling and recovery mechanisms
Comprehensive state management and tracking
Event-driven architecture with monitoring
Queue Management:

Multiple processing strategies for different scenarios
Advanced state management with recovery
Comprehensive metrics and health monitoring
Sophisticated cleanup system with multiple strategies
Processing Pipeline:

Enhanced message handling with validation
Improved URL extraction and processing
Better queue management and monitoring
Advanced cleanup mechanisms
Overall Benefits:

Better code organization and maintainability
Improved error handling and recovery
Enhanced monitoring and reporting
More robust and reliable system
2024-11-16 05:01:29 +00:00

442 lines
16 KiB
Python

"""Module for queue health checks"""
import logging
import psutil
import time
from enum import Enum
from dataclasses import dataclass, field
from typing import Dict, Optional, Tuple, List, Any, Set
from datetime import datetime, timedelta
logger = logging.getLogger("QueueHealthChecker")
class HealthStatus(Enum):
    """Outcome levels that a single health check (or the overall roll-up) can report.

    The member values are the lowercase strings emitted when a status is
    serialised via ``.value``.
    """

    # Check ran and found nothing wrong.
    HEALTHY = "healthy"
    # Check ran and a warning threshold was crossed.
    WARNING = "warning"
    # Check ran and a critical threshold was crossed.
    CRITICAL = "critical"
    # Check could not produce a verdict.
    UNKNOWN = "unknown"
class HealthCategory(Enum):
    """Names the area of the system that a health check result belongs to.

    The member values are the lowercase strings emitted when a category is
    serialised via ``.value``.
    """

    MEMORY = "memory"
    PERFORMANCE = "performance"
    ACTIVITY = "activity"
    ERRORS = "errors"
    DEADLOCKS = "deadlocks"
    SYSTEM = "system"
@dataclass
class HealthThresholds:
    """Tunable limits at which a check escalates to WARNING or CRITICAL.

    Every pair is compared with a strict ``>``: a reading exactly equal to a
    limit does not trip it.
    """

    # Process memory, in megabytes.
    memory_warning_mb: int = 384
    memory_critical_mb: int = 512

    # Age of the longest stuck item, in seconds.
    deadlock_warning_sec: int = 30
    deadlock_critical_sec: int = 60

    # Error rate as a fraction (0.1 == 10%).
    error_rate_warning: float = 0.1
    error_rate_critical: float = 0.2

    # Seconds since the last recorded activity.
    inactivity_warning_sec: int = 30
    inactivity_critical_sec: int = 60

    # CPU utilisation, in percent.
    cpu_warning_percent: float = 80.0
    cpu_critical_percent: float = 90.0
@dataclass
class HealthCheckResult:
    """Outcome of one health check run."""

    # Which area of the system was inspected.
    category: HealthCategory
    # Verdict reached by the check.
    status: HealthStatus
    # Human-readable description of the verdict.
    message: str
    # Measured number behind the verdict; None when no reading was supplied.
    value: Optional[float] = None
    # Creation time of the result (naive datetime from ``datetime.utcnow``).
    timestamp: datetime = field(default_factory=datetime.utcnow)
    # Extra key/value context; a fresh dict per instance.
    details: Dict[str, Any] = field(default_factory=dict)
class HealthHistory:
    """Tracks health check history.

    Maintains three rolling logs, each capped at ``max_history`` entries:

    * ``history`` - every recorded ``HealthCheckResult``;
    * ``status_changes`` - one dict per status transition within a category;
    * ``critical_events`` - one dict per result whose status is CRITICAL.
    """

    def __init__(self, max_history: int = 1000):
        """Create an empty history.

        Args:
            max_history: Maximum number of entries kept in each log.
        """
        self.max_history = max_history
        self.history: List[HealthCheckResult] = []
        self.status_changes: List[Dict[str, Any]] = []
        self.critical_events: List[Dict[str, Any]] = []
        # Most recent status seen for each category. Results of different
        # categories are recorded back to back, so comparing a result with
        # the previous list entry would compare unrelated checks.
        self._last_status: Dict[HealthCategory, HealthStatus] = {}

    def _trim(self, entries: List[Any]) -> None:
        """Drop the oldest items so ``entries`` holds at most ``max_history``."""
        excess = len(entries) - self.max_history
        if excess > 0:
            del entries[:excess]

    def add_result(self, result: HealthCheckResult) -> None:
        """Record a health check result and update the derived logs.

        A status change is logged when ``result.status`` differs from the
        last status recorded for the same category. The first result seen
        for a category never counts as a change.

        Args:
            result: The check result to record.
        """
        self.history.append(result)
        self._trim(self.history)

        # Track status changes per category
        previous_status = self._last_status.get(result.category)
        self._last_status[result.category] = result.status
        if previous_status is not None and previous_status != result.status:
            self.status_changes.append({
                "timestamp": result.timestamp,
                "category": result.category.value,
                "from_status": previous_status.value,
                "to_status": result.status.value,
                "message": result.message
            })
            # Bounded so a flapping check cannot grow this list forever.
            self._trim(self.status_changes)

        # Track critical events
        if result.status == HealthStatus.CRITICAL:
            self.critical_events.append({
                "timestamp": result.timestamp,
                "category": result.category.value,
                "message": result.message,
                "details": result.details
            })
            # Bounded so a persistently critical check cannot leak memory.
            self._trim(self.critical_events)

    def get_status_summary(self) -> Dict[str, Any]:
        """Get summary of health status history.

        Returns:
            A dict with the number of retained checks, status changes and
            critical events, plus the five most recent status changes and
            critical events.
        """
        return {
            "total_checks": len(self.history),
            "status_changes": len(self.status_changes),
            "critical_events": len(self.critical_events),
            "recent_status_changes": self.status_changes[-5:],
            "recent_critical_events": self.critical_events[-5:]
        }
class SystemHealthMonitor:
    """Monitors system health metrics of the current process via psutil."""

    def __init__(self):
        """Bind the monitor to the running process."""
        self.process = psutil.Process()

    @staticmethod
    def _read_optional(label: str, reader) -> Any:
        """Return ``reader()``, or None when that metric cannot be read.

        Some psutil calls are missing on certain platforms or may be denied
        by the OS. Those failures must not discard the metrics that were
        read successfully.
        """
        try:
            return reader()
        except Exception as e:
            logger.debug(f"System metric {label} unavailable: {e}")
            return None

    def _count_connections(self) -> int:
        """Count the process's network connections.

        Uses ``net_connections`` when the installed psutil provides it and
        falls back to the older ``connections`` name otherwise.
        """
        reader = getattr(self.process, "net_connections", None)
        if reader is None:
            reader = self.process.connections
        return len(reader())

    async def check_system_health(self) -> Dict[str, Any]:
        """Check system health metrics.

        Returns:
            A dict with the keys ``cpu_percent``, ``memory_rss``,
            ``memory_vms`` (both in MB), ``io_read_mb``, ``io_write_mb``,
            ``thread_count``, ``open_files`` and ``connections``. Metrics
            other than CPU and memory are None when they cannot be read.
            An empty dict is returned when CPU or memory cannot be read.
        """
        # CPU and memory are the essential readings: without them there is
        # nothing to report.
        try:
            cpu_percent = self.process.cpu_percent()
            memory_info = self.process.memory_info()
        except Exception as e:
            logger.error(f"Error checking system health: {e}")
            return {}

        # Lambdas defer the attribute lookup so a missing method is caught too.
        io_counters = self._read_optional(
            "io_counters", lambda: self.process.io_counters()
        )
        thread_count = self._read_optional(
            "thread_count", lambda: self.process.num_threads()
        )
        open_files = self._read_optional(
            "open_files", lambda: len(self.process.open_files())
        )
        connections = self._read_optional(
            "connections", self._count_connections
        )

        if io_counters is None:
            io_read_mb = None
            io_write_mb = None
        else:
            io_read_mb = io_counters.read_bytes / 1024 / 1024
            io_write_mb = io_counters.write_bytes / 1024 / 1024

        return {
            "cpu_percent": cpu_percent,
            "memory_rss": memory_info.rss / 1024 / 1024,  # MB
            "memory_vms": memory_info.vms / 1024 / 1024,  # MB
            "io_read_mb": io_read_mb,
            "io_write_mb": io_write_mb,
            "thread_count": thread_count,
            "open_files": open_files,
            "connections": connections
        }
class HealthChecker:
    """Handles health checks for the queue system.

    Runs six checks (memory, performance, activity, errors, deadlocks,
    system), records every result in ``self.history`` and reduces them to
    one overall status. A check that fails internally reports UNKNOWN
    instead of raising.
    """

    # Minimum spacing between forced garbage collections.
    _GC_COOLDOWN = timedelta(minutes=5)

    def __init__(
        self,
        thresholds: Optional[HealthThresholds] = None,
        history_size: int = 1000
    ):
        """Create a checker.

        Args:
            thresholds: Limits used by the checks; defaults to
                ``HealthThresholds()``.
            history_size: Maximum number of results kept in the history.
        """
        self.thresholds = thresholds or HealthThresholds()
        self.history = HealthHistory(history_size)
        self.system_monitor = SystemHealthMonitor()
        # Time of the last forced garbage collection; None until one happens.
        self._last_gc_time: Optional[datetime] = None

    async def check_health(
        self,
        metrics: Dict[str, Any],
        queue_info: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Perform comprehensive health check.

        Args:
            metrics: Queue metrics. The keys ``avg_processing_time``,
                ``success_rate``, ``error_rate`` and ``total_errors`` are
                read, each with a default when absent.
            queue_info: Queue state. ``last_activity`` (compared against
                ``time.time()``), ``processing_count`` and the optional
                ``stuck_items`` list are read.

        Returns:
            A dict with ``timestamp`` (ISO string), ``overall_status``,
            ``checks`` (one dict per check) and ``history`` (summary).
        """
        results = []

        # Check memory health
        results.append(await self._check_memory_health())

        # Check performance health
        results.append(self._check_performance_health(metrics))

        # Check activity health. Missing or malformed queue info degrades
        # this single check to UNKNOWN rather than aborting the whole run,
        # matching how every other check reports its own failures.
        try:
            last_activity = queue_info["last_activity"]
            has_processing_items = queue_info["processing_count"] > 0
        except (KeyError, TypeError) as e:
            logger.error(f"Error checking activity health: {e}")
            activity_result = HealthCheckResult(
                category=HealthCategory.ACTIVITY,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking activity: {str(e)}"
            )
        else:
            activity_result = self._check_activity_health(
                last_activity,
                has_processing_items
            )
        results.append(activity_result)

        # Check error health
        results.append(self._check_error_health(metrics))

        # Check for deadlocks
        results.append(self._check_deadlocks(queue_info))

        # Check system health
        results.append(await self._check_system_health())

        # Record results
        for result in results:
            self.history.add_result(result)

        # Determine overall health
        overall_status = self._determine_overall_status(results)

        return {
            "timestamp": datetime.utcnow().isoformat(),
            "overall_status": overall_status.value,
            "checks": [
                {
                    "category": r.category.value,
                    "status": r.status.value,
                    "message": r.message,
                    "value": r.value,
                    "details": r.details
                }
                for r in results
            ],
            "history": self.history.get_status_summary()
        }

    @staticmethod
    def _process_memory_mb() -> float:
        """Return the resident set size of the current process in MB."""
        return psutil.Process().memory_info().rss / 1024 / 1024

    async def _check_memory_health(self) -> HealthCheckResult:
        """Check memory health.

        When usage exceeds the critical threshold and no forced collection
        happened within the cooldown, a garbage collection is run and
        memory is measured again. The status is derived from the final
        measurement, so status, message and value always agree.

        Returns:
            A MEMORY result whose value is the usage in MB, or an UNKNOWN
            result when the measurement fails.
        """
        try:
            memory_usage = self._process_memory_mb()

            if memory_usage > self.thresholds.memory_critical_mb:
                now = datetime.utcnow()
                if (
                    not self._last_gc_time or
                    now - self._last_gc_time > self._GC_COOLDOWN
                ):
                    import gc
                    gc.collect()
                    self._last_gc_time = now
                    # Re-measure so the verdict reflects post-collection usage.
                    memory_usage = self._process_memory_mb()

            if memory_usage > self.thresholds.memory_critical_mb:
                status = HealthStatus.CRITICAL
                message = f"Critical memory usage: {memory_usage:.1f}MB"
            elif memory_usage > self.thresholds.memory_warning_mb:
                status = HealthStatus.WARNING
                message = f"High memory usage: {memory_usage:.1f}MB"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal memory usage: {memory_usage:.1f}MB"

            return HealthCheckResult(
                category=HealthCategory.MEMORY,
                status=status,
                message=message,
                value=memory_usage
            )
        except Exception as e:
            logger.error(f"Error checking memory health: {e}")
            return HealthCheckResult(
                category=HealthCategory.MEMORY,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking memory: {str(e)}"
            )

    def _check_performance_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
        """Check performance health.

        Args:
            metrics: Reads ``success_rate`` (default 1.0) and
                ``avg_processing_time`` (default 0).

        Returns:
            A PERFORMANCE result: CRITICAL below 50% success, WARNING below
            80%, HEALTHY otherwise; UNKNOWN when evaluation fails.
        """
        try:
            avg_time = metrics.get("avg_processing_time", 0)
            success_rate = metrics.get("success_rate", 1.0)

            if success_rate < 0.5:  # Less than 50% success
                status = HealthStatus.CRITICAL
                message = f"Critical performance: {success_rate:.1%} success rate"
            elif success_rate < 0.8:  # Less than 80% success
                status = HealthStatus.WARNING
                message = f"Degraded performance: {success_rate:.1%} success rate"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal performance: {success_rate:.1%} success rate"

            return HealthCheckResult(
                category=HealthCategory.PERFORMANCE,
                status=status,
                message=message,
                value=success_rate,
                details={"avg_processing_time": avg_time}
            )
        except Exception as e:
            logger.error(f"Error checking performance health: {e}")
            return HealthCheckResult(
                category=HealthCategory.PERFORMANCE,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking performance: {str(e)}"
            )

    def _check_activity_health(
        self,
        last_activity_time: float,
        has_processing_items: bool
    ) -> HealthCheckResult:
        """Check activity health.

        Args:
            last_activity_time: Time of the last activity, on the same
                clock as ``time.time()``.
            has_processing_items: Whether anything is currently being
                processed; when False the check is HEALTHY by definition.

        Returns:
            An ACTIVITY result whose value is the inactive time in seconds
            (None when nothing is processing); UNKNOWN when evaluation
            fails.
        """
        try:
            if not has_processing_items:
                return HealthCheckResult(
                    category=HealthCategory.ACTIVITY,
                    status=HealthStatus.HEALTHY,
                    message="No items being processed"
                )

            inactive_time = time.time() - last_activity_time

            if inactive_time > self.thresholds.inactivity_critical_sec:
                status = HealthStatus.CRITICAL
                message = f"No activity for {inactive_time:.1f}s"
            elif inactive_time > self.thresholds.inactivity_warning_sec:
                status = HealthStatus.WARNING
                message = f"Limited activity for {inactive_time:.1f}s"
            else:
                status = HealthStatus.HEALTHY
                message = "Normal activity levels"

            return HealthCheckResult(
                category=HealthCategory.ACTIVITY,
                status=status,
                message=message,
                value=inactive_time
            )
        except Exception as e:
            logger.error(f"Error checking activity health: {e}")
            return HealthCheckResult(
                category=HealthCategory.ACTIVITY,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking activity: {str(e)}"
            )

    def _check_error_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
        """Check error health.

        Args:
            metrics: Reads ``error_rate`` (default 0.0) and
                ``total_errors`` (default 0).

        Returns:
            An ERRORS result graded against the error-rate thresholds;
            UNKNOWN when evaluation fails.
        """
        try:
            error_rate = metrics.get("error_rate", 0.0)
            error_count = metrics.get("total_errors", 0)

            if error_rate > self.thresholds.error_rate_critical:
                status = HealthStatus.CRITICAL
                message = f"Critical error rate: {error_rate:.1%}"
            elif error_rate > self.thresholds.error_rate_warning:
                status = HealthStatus.WARNING
                message = f"High error rate: {error_rate:.1%}"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal error rate: {error_rate:.1%}"

            return HealthCheckResult(
                category=HealthCategory.ERRORS,
                status=status,
                message=message,
                value=error_rate,
                details={"error_count": error_count}
            )
        except Exception as e:
            logger.error(f"Error checking error health: {e}")
            return HealthCheckResult(
                category=HealthCategory.ERRORS,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking errors: {str(e)}"
            )

    def _check_deadlocks(self, queue_info: Dict[str, Any]) -> HealthCheckResult:
        """Check for potential deadlocks.

        Args:
            queue_info: Reads the optional ``stuck_items`` list; each item
                must provide ``start_time`` on the ``time.time()`` clock.

        Returns:
            A DEADLOCKS result graded by the age of the longest-stuck item;
            UNKNOWN when evaluation fails.
        """
        try:
            stuck_items = queue_info.get("stuck_items", [])
            if not stuck_items:
                return HealthCheckResult(
                    category=HealthCategory.DEADLOCKS,
                    status=HealthStatus.HEALTHY,
                    message="No stuck items detected"
                )

            longest_stuck = max(
                time.time() - item["start_time"]
                for item in stuck_items
            )

            if longest_stuck > self.thresholds.deadlock_critical_sec:
                status = HealthStatus.CRITICAL
                message = f"Potential deadlock: {len(stuck_items)} items stuck"
            elif longest_stuck > self.thresholds.deadlock_warning_sec:
                status = HealthStatus.WARNING
                message = f"Slow processing: {len(stuck_items)} items delayed"
            else:
                status = HealthStatus.HEALTHY
                message = "Normal processing time"

            return HealthCheckResult(
                category=HealthCategory.DEADLOCKS,
                status=status,
                message=message,
                value=longest_stuck,
                details={"stuck_items": len(stuck_items)}
            )
        except Exception as e:
            logger.error(f"Error checking deadlocks: {e}")
            return HealthCheckResult(
                category=HealthCategory.DEADLOCKS,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking deadlocks: {str(e)}"
            )

    async def _check_system_health(self) -> HealthCheckResult:
        """Check system health.

        Returns:
            A SYSTEM result graded by CPU usage, with the full metrics dict
            from the system monitor as details; UNKNOWN when no metrics are
            available or evaluation fails.
        """
        try:
            metrics = await self.system_monitor.check_system_health()

            if not metrics:
                return HealthCheckResult(
                    category=HealthCategory.SYSTEM,
                    status=HealthStatus.UNKNOWN,
                    message="Unable to get system metrics"
                )

            cpu_percent = metrics["cpu_percent"]
            if cpu_percent > self.thresholds.cpu_critical_percent:
                status = HealthStatus.CRITICAL
                message = f"Critical CPU usage: {cpu_percent:.1f}%"
            elif cpu_percent > self.thresholds.cpu_warning_percent:
                status = HealthStatus.WARNING
                message = f"High CPU usage: {cpu_percent:.1f}%"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal CPU usage: {cpu_percent:.1f}%"

            return HealthCheckResult(
                category=HealthCategory.SYSTEM,
                status=status,
                message=message,
                value=cpu_percent,
                details=metrics
            )
        except Exception as e:
            logger.error(f"Error checking system health: {e}")
            return HealthCheckResult(
                category=HealthCategory.SYSTEM,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking system: {str(e)}"
            )

    def _determine_overall_status(
        self,
        results: List[HealthCheckResult]
    ) -> HealthStatus:
        """Determine overall health status.

        The worst status wins, in the order CRITICAL, WARNING, UNKNOWN,
        HEALTHY. An empty list yields HEALTHY.
        """
        if any(r.status == HealthStatus.CRITICAL for r in results):
            return HealthStatus.CRITICAL
        if any(r.status == HealthStatus.WARNING for r in results):
            return HealthStatus.WARNING
        if any(r.status == HealthStatus.UNKNOWN for r in results):
            return HealthStatus.UNKNOWN
        return HealthStatus.HEALTHY

    def format_health_report(
        self,
        results: List[HealthCheckResult]
    ) -> str:
        """Format a detailed health report.

        Args:
            results: Check results to render, in display order.

        Returns:
            A multi-line string: a title, then for each result its
            category, status and message, followed by one indented line per
            detail entry.
        """
        lines = ["Queue Health Report:"]

        for result in results:
            lines.append(
                f"\n{result.category.value.title()}:"
                f"\n- Status: {result.status.value}"
                f"\n- {result.message}"
            )

            if result.details:
                for key, value in result.details.items():
                    lines.append(f"  - {key}: {value}")

        return "\n".join(lines)