Mirror of https://github.com/pacnpal/Pac-cogs.git (synced 2025-12-20 02:41:06 -05:00)
Core Systems:
- Component-based architecture with lifecycle management
- Enhanced error handling and recovery mechanisms
- Comprehensive state management and tracking
- Event-driven architecture with monitoring

Queue Management:
- Multiple processing strategies for different scenarios
- Advanced state management with recovery
- Comprehensive metrics and health monitoring
- Sophisticated cleanup system with multiple strategies

Processing Pipeline:
- Enhanced message handling with validation
- Improved URL extraction and processing
- Better queue management and monitoring
- Advanced cleanup mechanisms

Overall Benefits:
- Better code organization and maintainability
- Improved error handling and recovery
- Enhanced monitoring and reporting
- More robust and reliable system
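The health-monitoring piece lands in the new videoarchiver/queue/health_checker.py shown below. As a rough sketch of how it could be driven, the snippet that follows builds the two dictionaries from the keys check_health() actually reads; the concrete values, the asyncio entry point, and the import path are illustrative assumptions rather than code from this commit.

# Hypothetical driver code, not part of this commit: it only mirrors the keys
# that HealthChecker.check_health() reads from its two dictionaries.
import asyncio
import time

from videoarchiver.queue.health_checker import HealthChecker, HealthThresholds


async def report_queue_health() -> None:
    # Thresholds are optional; a custom warning level is shown for illustration.
    checker = HealthChecker(thresholds=HealthThresholds(memory_warning_mb=256))

    # Fed to the performance and error checks (all keys are read with .get()).
    metrics = {
        "avg_processing_time": 4.2,
        "success_rate": 0.95,
        "error_rate": 0.02,
        "total_errors": 3,
    }
    # Fed to the activity and deadlock checks; last_activity and
    # processing_count are accessed directly, so they must be present.
    queue_info = {
        "last_activity": time.time() - 5,
        "processing_count": 2,
        "stuck_items": [],
    }

    report = await checker.check_health(metrics, queue_info)
    print(report["overall_status"])
    for check in report["checks"]:
        print(f"{check['category']}: {check['status']} - {check['message']}")


asyncio.run(report_queue_health())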
videoarchiver/queue/health_checker.py · 441 lines (new file)
@@ -0,0 +1,441 @@
"""Module for queue health checks"""

import logging
import psutil
import time
from enum import Enum
from dataclasses import dataclass, field
from typing import Dict, Optional, Tuple, List, Any, Set
from datetime import datetime, timedelta

logger = logging.getLogger("QueueHealthChecker")

class HealthStatus(Enum):
    """Possible health status values"""
    HEALTHY = "healthy"
    WARNING = "warning"
    CRITICAL = "critical"
    UNKNOWN = "unknown"

class HealthCategory(Enum):
    """Health check categories"""
    MEMORY = "memory"
    PERFORMANCE = "performance"
    ACTIVITY = "activity"
    ERRORS = "errors"
    DEADLOCKS = "deadlocks"
    SYSTEM = "system"

@dataclass
class HealthThresholds:
    """Defines thresholds for health checks"""
    memory_warning_mb: int = 384  # 384MB
    memory_critical_mb: int = 512  # 512MB
    deadlock_warning_sec: int = 30  # 30 seconds
    deadlock_critical_sec: int = 60  # 1 minute
    error_rate_warning: float = 0.1  # 10% errors
    error_rate_critical: float = 0.2  # 20% errors
    inactivity_warning_sec: int = 30
    inactivity_critical_sec: int = 60
    cpu_warning_percent: float = 80.0
    cpu_critical_percent: float = 90.0

@dataclass
class HealthCheckResult:
    """Result of a health check"""
    category: HealthCategory
    status: HealthStatus
    message: str
    value: Optional[float] = None
    timestamp: datetime = field(default_factory=datetime.utcnow)
    details: Dict[str, Any] = field(default_factory=dict)

class HealthHistory:
    """Tracks health check history"""

    def __init__(self, max_history: int = 1000):
        self.max_history = max_history
        self.history: List[HealthCheckResult] = []
        self.status_changes: List[Dict[str, Any]] = []
        self.critical_events: List[Dict[str, Any]] = []

    def add_result(self, result: HealthCheckResult) -> None:
        """Add a health check result"""
        self.history.append(result)
        if len(self.history) > self.max_history:
            self.history.pop(0)

        # Track status changes
        if self.history[-2:-1] and self.history[-1].status != self.history[-2].status:
            self.status_changes.append({
                "timestamp": result.timestamp,
                "category": result.category.value,
                "from_status": self.history[-2].status.value,
                "to_status": result.status.value,
                "message": result.message
            })

        # Track critical events
        if result.status == HealthStatus.CRITICAL:
            self.critical_events.append({
                "timestamp": result.timestamp,
                "category": result.category.value,
                "message": result.message,
                "details": result.details
            })

    def get_status_summary(self) -> Dict[str, Any]:
        """Get summary of health status history"""
        return {
            "total_checks": len(self.history),
            "status_changes": len(self.status_changes),
            "critical_events": len(self.critical_events),
            "recent_status_changes": self.status_changes[-5:],
            "recent_critical_events": self.critical_events[-5:]
        }

class SystemHealthMonitor:
    """Monitors system health metrics"""

    def __init__(self):
        self.process = psutil.Process()

    async def check_system_health(self) -> Dict[str, Any]:
        """Check system health metrics"""
        try:
            cpu_percent = self.process.cpu_percent()
            memory_info = self.process.memory_info()
            io_counters = self.process.io_counters()

            return {
                "cpu_percent": cpu_percent,
                "memory_rss": memory_info.rss / 1024 / 1024,  # MB
                "memory_vms": memory_info.vms / 1024 / 1024,  # MB
                "io_read_mb": io_counters.read_bytes / 1024 / 1024,
                "io_write_mb": io_counters.write_bytes / 1024 / 1024,
                "thread_count": self.process.num_threads(),
                "open_files": len(self.process.open_files()),
                "connections": len(self.process.connections())
            }
        except Exception as e:
            logger.error(f"Error checking system health: {e}")
            return {}

class HealthChecker:
    """Handles health checks for the queue system"""

    def __init__(
        self,
        thresholds: Optional[HealthThresholds] = None,
        history_size: int = 1000
    ):
        self.thresholds = thresholds or HealthThresholds()
        self.history = HealthHistory(history_size)
        self.system_monitor = SystemHealthMonitor()
        self._last_gc_time: Optional[datetime] = None

    async def check_health(
        self,
        metrics: Dict[str, Any],
        queue_info: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Perform comprehensive health check"""
        results = []

        # Check memory health
        memory_result = await self._check_memory_health()
        results.append(memory_result)

        # Check performance health
        perf_result = self._check_performance_health(metrics)
        results.append(perf_result)

        # Check activity health
        activity_result = self._check_activity_health(
            queue_info["last_activity"],
            queue_info["processing_count"] > 0
        )
        results.append(activity_result)

        # Check error health
        error_result = self._check_error_health(metrics)
        results.append(error_result)

        # Check for deadlocks
        deadlock_result = self._check_deadlocks(queue_info)
        results.append(deadlock_result)

        # Check system health
        system_result = await self._check_system_health()
        results.append(system_result)

        # Record results
        for result in results:
            self.history.add_result(result)

        # Determine overall health
        overall_status = self._determine_overall_status(results)

        return {
            "timestamp": datetime.utcnow().isoformat(),
            "overall_status": overall_status.value,
            "checks": [
                {
                    "category": r.category.value,
                    "status": r.status.value,
                    "message": r.message,
                    "value": r.value,
                    "details": r.details
                }
                for r in results
            ],
            "history": self.history.get_status_summary()
        }

    async def _check_memory_health(self) -> HealthCheckResult:
        """Check memory health"""
        try:
            memory_usage = psutil.Process().memory_info().rss / 1024 / 1024  # MB

            if memory_usage > self.thresholds.memory_critical_mb:
                if (
                    not self._last_gc_time or
                    datetime.utcnow() - self._last_gc_time > timedelta(minutes=5)
                ):
                    import gc
                    gc.collect()
                    self._last_gc_time = datetime.utcnow()
                    memory_usage = psutil.Process().memory_info().rss / 1024 / 1024

                status = HealthStatus.CRITICAL
                message = f"Critical memory usage: {memory_usage:.1f}MB"
            elif memory_usage > self.thresholds.memory_warning_mb:
                status = HealthStatus.WARNING
                message = f"High memory usage: {memory_usage:.1f}MB"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal memory usage: {memory_usage:.1f}MB"

            return HealthCheckResult(
                category=HealthCategory.MEMORY,
                status=status,
                message=message,
                value=memory_usage
            )

        except Exception as e:
            logger.error(f"Error checking memory health: {e}")
            return HealthCheckResult(
                category=HealthCategory.MEMORY,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking memory: {str(e)}"
            )

    def _check_performance_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
        """Check performance health"""
        try:
            avg_time = metrics.get("avg_processing_time", 0)
            success_rate = metrics.get("success_rate", 1.0)

            if success_rate < 0.5:  # Less than 50% success
                status = HealthStatus.CRITICAL
                message = f"Critical performance: {success_rate:.1%} success rate"
            elif success_rate < 0.8:  # Less than 80% success
                status = HealthStatus.WARNING
                message = f"Degraded performance: {success_rate:.1%} success rate"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal performance: {success_rate:.1%} success rate"

            return HealthCheckResult(
                category=HealthCategory.PERFORMANCE,
                status=status,
                message=message,
                value=success_rate,
                details={"avg_processing_time": avg_time}
            )

        except Exception as e:
            logger.error(f"Error checking performance health: {e}")
            return HealthCheckResult(
                category=HealthCategory.PERFORMANCE,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking performance: {str(e)}"
            )

    def _check_activity_health(
        self,
        last_activity_time: float,
        has_processing_items: bool
    ) -> HealthCheckResult:
        """Check activity health"""
        if not has_processing_items:
            return HealthCheckResult(
                category=HealthCategory.ACTIVITY,
                status=HealthStatus.HEALTHY,
                message="No items being processed"
            )

        inactive_time = time.time() - last_activity_time

        if inactive_time > self.thresholds.inactivity_critical_sec:
            status = HealthStatus.CRITICAL
            message = f"No activity for {inactive_time:.1f}s"
        elif inactive_time > self.thresholds.inactivity_warning_sec:
            status = HealthStatus.WARNING
            message = f"Limited activity for {inactive_time:.1f}s"
        else:
            status = HealthStatus.HEALTHY
            message = "Normal activity levels"

        return HealthCheckResult(
            category=HealthCategory.ACTIVITY,
            status=status,
            message=message,
            value=inactive_time
        )

    def _check_error_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
        """Check error health"""
        try:
            error_rate = metrics.get("error_rate", 0.0)
            error_count = metrics.get("total_errors", 0)

            if error_rate > self.thresholds.error_rate_critical:
                status = HealthStatus.CRITICAL
                message = f"Critical error rate: {error_rate:.1%}"
            elif error_rate > self.thresholds.error_rate_warning:
                status = HealthStatus.WARNING
                message = f"High error rate: {error_rate:.1%}"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal error rate: {error_rate:.1%}"

            return HealthCheckResult(
                category=HealthCategory.ERRORS,
                status=status,
                message=message,
                value=error_rate,
                details={"error_count": error_count}
            )

        except Exception as e:
            logger.error(f"Error checking error health: {e}")
            return HealthCheckResult(
                category=HealthCategory.ERRORS,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking errors: {str(e)}"
            )

    def _check_deadlocks(self, queue_info: Dict[str, Any]) -> HealthCheckResult:
        """Check for potential deadlocks"""
        try:
            stuck_items = queue_info.get("stuck_items", [])
            if not stuck_items:
                return HealthCheckResult(
                    category=HealthCategory.DEADLOCKS,
                    status=HealthStatus.HEALTHY,
                    message="No stuck items detected"
                )

            longest_stuck = max(
                time.time() - item["start_time"]
                for item in stuck_items
            )

            if longest_stuck > self.thresholds.deadlock_critical_sec:
                status = HealthStatus.CRITICAL
                message = f"Potential deadlock: {len(stuck_items)} items stuck"
            elif longest_stuck > self.thresholds.deadlock_warning_sec:
                status = HealthStatus.WARNING
                message = f"Slow processing: {len(stuck_items)} items delayed"
            else:
                status = HealthStatus.HEALTHY
                message = "Normal processing time"

            return HealthCheckResult(
                category=HealthCategory.DEADLOCKS,
                status=status,
                message=message,
                value=longest_stuck,
                details={"stuck_items": len(stuck_items)}
            )

        except Exception as e:
            logger.error(f"Error checking deadlocks: {e}")
            return HealthCheckResult(
                category=HealthCategory.DEADLOCKS,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking deadlocks: {str(e)}"
            )

    async def _check_system_health(self) -> HealthCheckResult:
        """Check system health"""
        try:
            metrics = await self.system_monitor.check_system_health()

            if not metrics:
                return HealthCheckResult(
                    category=HealthCategory.SYSTEM,
                    status=HealthStatus.UNKNOWN,
                    message="Unable to get system metrics"
                )

            cpu_percent = metrics["cpu_percent"]
            if cpu_percent > self.thresholds.cpu_critical_percent:
                status = HealthStatus.CRITICAL
                message = f"Critical CPU usage: {cpu_percent:.1f}%"
            elif cpu_percent > self.thresholds.cpu_warning_percent:
                status = HealthStatus.WARNING
                message = f"High CPU usage: {cpu_percent:.1f}%"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal CPU usage: {cpu_percent:.1f}%"

            return HealthCheckResult(
                category=HealthCategory.SYSTEM,
                status=status,
                message=message,
                value=cpu_percent,
                details=metrics
            )

        except Exception as e:
            logger.error(f"Error checking system health: {e}")
            return HealthCheckResult(
                category=HealthCategory.SYSTEM,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking system: {str(e)}"
            )

    def _determine_overall_status(
        self,
        results: List[HealthCheckResult]
    ) -> HealthStatus:
        """Determine overall health status"""
        if any(r.status == HealthStatus.CRITICAL for r in results):
            return HealthStatus.CRITICAL
        if any(r.status == HealthStatus.WARNING for r in results):
            return HealthStatus.WARNING
        if any(r.status == HealthStatus.UNKNOWN for r in results):
            return HealthStatus.UNKNOWN
        return HealthStatus.HEALTHY

    def format_health_report(
        self,
        results: List[HealthCheckResult]
    ) -> str:
        """Format a detailed health report"""
        lines = ["Queue Health Report:"]

        for result in results:
            lines.append(
                f"\n{result.category.value.title()}:"
                f"\n- Status: {result.status.value}"
                f"\n- {result.message}"
            )
            if result.details:
                for key, value in result.details.items():
                    lines.append(f"  - {key}: {value}")

        return "\n".join(lines)
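Note that format_health_report() is not wired into check_health(), so a caller that wants the plain-text report has to keep hold of HealthCheckResult objects itself. A minimal sketch, with hand-built results standing in for what the private _check_* methods would return during a real run:

# Illustration only: the two results below are fabricated stand-ins for what
# the individual health checks produce; the import path is assumed.
from videoarchiver.queue.health_checker import (
    HealthCategory,
    HealthChecker,
    HealthCheckResult,
    HealthStatus,
)

checker = HealthChecker()
results = [
    HealthCheckResult(
        category=HealthCategory.MEMORY,
        status=HealthStatus.WARNING,
        message="High memory usage: 412.3MB",
        value=412.3,
    ),
    HealthCheckResult(
        category=HealthCategory.ERRORS,
        status=HealthStatus.HEALTHY,
        message="Normal error rate: 1.0%",
        value=0.01,
        details={"error_count": 2},
    ),
]

print(checker.format_health_report(results))
# Queue Health Report:
#
# Memory:
# - Status: warning
# - High memory usage: 412.3MB
#
# Errors:
# - Status: healthy
# - Normal error rate: 1.0%
#   - error_count: 2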