"""Module for queue health checks"""
|
|
|
|
import logging
|
|
import psutil
|
|
import time
|
|
from enum import Enum
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict, Optional, Tuple, List, Any, Set
|
|
from datetime import datetime, timedelta
|
|
|
|
logger = logging.getLogger("QueueHealthChecker")
|
|
|
|
class HealthStatus(Enum):
    """Possible health status values."""

    HEALTHY = "healthy"
    WARNING = "warning"
    CRITICAL = "critical"
    UNKNOWN = "unknown"


class HealthCategory(Enum):
    """Health check categories."""

    MEMORY = "memory"
    PERFORMANCE = "performance"
    ACTIVITY = "activity"
    ERRORS = "errors"
    DEADLOCKS = "deadlocks"
    SYSTEM = "system"


@dataclass
class HealthThresholds:
    """Defines thresholds for health checks."""

    memory_warning_mb: int = 384
    memory_critical_mb: int = 512
    deadlock_warning_sec: int = 30
    deadlock_critical_sec: int = 60
    error_rate_warning: float = 0.1  # 10% errors
    error_rate_critical: float = 0.2  # 20% errors
    inactivity_warning_sec: int = 30
    inactivity_critical_sec: int = 60
    cpu_warning_percent: float = 80.0
    cpu_critical_percent: float = 90.0


@dataclass
class HealthCheckResult:
    """Result of a single health check."""

    category: HealthCategory
    status: HealthStatus
    message: str
    value: Optional[float] = None
    timestamp: datetime = field(default_factory=datetime.utcnow)
    details: Dict[str, Any] = field(default_factory=dict)


class HealthHistory:
    """Tracks health check history."""

    def __init__(self, max_history: int = 1000):
        self.max_history = max_history
        self.history: List[HealthCheckResult] = []
        self.status_changes: List[Dict[str, Any]] = []
        self.critical_events: List[Dict[str, Any]] = []
        # Last known status per category, so that status changes are detected
        # within a category rather than between unrelated consecutive checks.
        self._last_status: Dict[HealthCategory, HealthStatus] = {}

    def add_result(self, result: HealthCheckResult) -> None:
        """Add a health check result."""
        self.history.append(result)
        if len(self.history) > self.max_history:
            self.history.pop(0)

        # Track status changes per category (consecutive entries in the shared
        # history list usually belong to different categories)
        previous = self._last_status.get(result.category)
        if previous is not None and previous != result.status:
            self.status_changes.append({
                "timestamp": result.timestamp,
                "category": result.category.value,
                "from_status": previous.value,
                "to_status": result.status.value,
                "message": result.message,
            })
        self._last_status[result.category] = result.status

        # Track critical events
        if result.status == HealthStatus.CRITICAL:
            self.critical_events.append({
                "timestamp": result.timestamp,
                "category": result.category.value,
                "message": result.message,
                "details": result.details,
            })

    def get_status_summary(self) -> Dict[str, Any]:
        """Get a summary of health status history."""
        return {
            "total_checks": len(self.history),
            "status_changes": len(self.status_changes),
            "critical_events": len(self.critical_events),
            "recent_status_changes": self.status_changes[-5:],
            "recent_critical_events": self.critical_events[-5:],
        }


class SystemHealthMonitor:
    """Monitors system health metrics."""

    def __init__(self):
        self.process = psutil.Process()

    async def check_system_health(self) -> Dict[str, Any]:
        """Check system health metrics."""
        try:
            cpu_percent = self.process.cpu_percent()
            memory_info = self.process.memory_info()
            # io_counters() is not implemented on every platform (e.g. macOS);
            # the except clause below degrades to an empty result in that case.
            io_counters = self.process.io_counters()

            return {
                "cpu_percent": cpu_percent,
                "memory_rss": memory_info.rss / 1024 / 1024,  # MB
                "memory_vms": memory_info.vms / 1024 / 1024,  # MB
                "io_read_mb": io_counters.read_bytes / 1024 / 1024,
                "io_write_mb": io_counters.write_bytes / 1024 / 1024,
                "thread_count": self.process.num_threads(),
                "open_files": len(self.process.open_files()),
                # connections() is deprecated in newer psutil releases in
                # favour of net_connections(); kept for compatibility.
                "connections": len(self.process.connections()),
            }
        except Exception as e:
            logger.error(f"Error checking system health: {e}")
            return {}


class HealthChecker:
    """Handles health checks for the queue system."""

    def __init__(
        self,
        thresholds: Optional[HealthThresholds] = None,
        history_size: int = 1000,
    ):
        self.thresholds = thresholds or HealthThresholds()
        self.history = HealthHistory(history_size)
        self.system_monitor = SystemHealthMonitor()
        self._last_gc_time: Optional[datetime] = None

    async def check_health(
        self,
        metrics: Dict[str, Any],
        queue_info: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Perform a comprehensive health check."""
        results = []

        # Check memory health
        memory_result = await self._check_memory_health()
        results.append(memory_result)

        # Check performance health
        perf_result = self._check_performance_health(metrics)
        results.append(perf_result)

        # Check activity health
        activity_result = self._check_activity_health(
            queue_info["last_activity"],
            queue_info["processing_count"] > 0,
        )
        results.append(activity_result)

        # Check error health
        error_result = self._check_error_health(metrics)
        results.append(error_result)

        # Check for deadlocks
        deadlock_result = self._check_deadlocks(queue_info)
        results.append(deadlock_result)

        # Check system health
        system_result = await self._check_system_health()
        results.append(system_result)

        # Record results
        for result in results:
            self.history.add_result(result)

        # Determine overall health
        overall_status = self._determine_overall_status(results)

        return {
            "timestamp": datetime.utcnow().isoformat(),
            "overall_status": overall_status.value,
            "checks": [
                {
                    "category": r.category.value,
                    "status": r.status.value,
                    "message": r.message,
                    "value": r.value,
                    "details": r.details,
                }
                for r in results
            ],
            "history": self.history.get_status_summary(),
        }

    async def _check_memory_health(self) -> HealthCheckResult:
        """Check memory health."""
        try:
            memory_usage = psutil.Process().memory_info().rss / 1024 / 1024  # MB

            if memory_usage > self.thresholds.memory_critical_mb:
                # Try to reclaim memory before reporting, but force a garbage
                # collection at most once every five minutes
                if (
                    not self._last_gc_time
                    or datetime.utcnow() - self._last_gc_time > timedelta(minutes=5)
                ):
                    import gc

                    gc.collect()
                    self._last_gc_time = datetime.utcnow()
                    memory_usage = psutil.Process().memory_info().rss / 1024 / 1024

                status = HealthStatus.CRITICAL
                message = f"Critical memory usage: {memory_usage:.1f}MB"
            elif memory_usage > self.thresholds.memory_warning_mb:
                status = HealthStatus.WARNING
                message = f"High memory usage: {memory_usage:.1f}MB"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal memory usage: {memory_usage:.1f}MB"

            return HealthCheckResult(
                category=HealthCategory.MEMORY,
                status=status,
                message=message,
                value=memory_usage,
            )

        except Exception as e:
            logger.error(f"Error checking memory health: {e}")
            return HealthCheckResult(
                category=HealthCategory.MEMORY,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking memory: {e}",
            )

    def _check_performance_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
        """Check performance health."""
        try:
            avg_time = metrics.get("avg_processing_time", 0)
            success_rate = metrics.get("success_rate", 1.0)

            if success_rate < 0.5:  # Less than 50% success
                status = HealthStatus.CRITICAL
                message = f"Critical performance: {success_rate:.1%} success rate"
            elif success_rate < 0.8:  # Less than 80% success
                status = HealthStatus.WARNING
                message = f"Degraded performance: {success_rate:.1%} success rate"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal performance: {success_rate:.1%} success rate"

            return HealthCheckResult(
                category=HealthCategory.PERFORMANCE,
                status=status,
                message=message,
                value=success_rate,
                details={"avg_processing_time": avg_time},
            )

        except Exception as e:
            logger.error(f"Error checking performance health: {e}")
            return HealthCheckResult(
                category=HealthCategory.PERFORMANCE,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking performance: {e}",
            )

    def _check_activity_health(
        self,
        last_activity_time: float,
        has_processing_items: bool,
    ) -> HealthCheckResult:
        """Check activity health."""
        if not has_processing_items:
            return HealthCheckResult(
                category=HealthCategory.ACTIVITY,
                status=HealthStatus.HEALTHY,
                message="No items being processed",
            )

        inactive_time = time.time() - last_activity_time

        if inactive_time > self.thresholds.inactivity_critical_sec:
            status = HealthStatus.CRITICAL
            message = f"No activity for {inactive_time:.1f}s"
        elif inactive_time > self.thresholds.inactivity_warning_sec:
            status = HealthStatus.WARNING
            message = f"Limited activity for {inactive_time:.1f}s"
        else:
            status = HealthStatus.HEALTHY
            message = "Normal activity levels"

        return HealthCheckResult(
            category=HealthCategory.ACTIVITY,
            status=status,
            message=message,
            value=inactive_time,
        )

    def _check_error_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
        """Check error health."""
        try:
            error_rate = metrics.get("error_rate", 0.0)
            error_count = metrics.get("total_errors", 0)

            if error_rate > self.thresholds.error_rate_critical:
                status = HealthStatus.CRITICAL
                message = f"Critical error rate: {error_rate:.1%}"
            elif error_rate > self.thresholds.error_rate_warning:
                status = HealthStatus.WARNING
                message = f"High error rate: {error_rate:.1%}"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal error rate: {error_rate:.1%}"

            return HealthCheckResult(
                category=HealthCategory.ERRORS,
                status=status,
                message=message,
                value=error_rate,
                details={"error_count": error_count},
            )

        except Exception as e:
            logger.error(f"Error checking error health: {e}")
            return HealthCheckResult(
                category=HealthCategory.ERRORS,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking errors: {e}",
            )

    def _check_deadlocks(self, queue_info: Dict[str, Any]) -> HealthCheckResult:
        """Check for potential deadlocks."""
        try:
            stuck_items = queue_info.get("stuck_items", [])
            if not stuck_items:
                return HealthCheckResult(
                    category=HealthCategory.DEADLOCKS,
                    status=HealthStatus.HEALTHY,
                    message="No stuck items detected",
                )

            # Age of the item that has been in processing the longest
            longest_stuck = max(
                time.time() - item["start_time"] for item in stuck_items
            )

            if longest_stuck > self.thresholds.deadlock_critical_sec:
                status = HealthStatus.CRITICAL
                message = f"Potential deadlock: {len(stuck_items)} items stuck"
            elif longest_stuck > self.thresholds.deadlock_warning_sec:
                status = HealthStatus.WARNING
                message = f"Slow processing: {len(stuck_items)} items delayed"
            else:
                status = HealthStatus.HEALTHY
                message = "Normal processing time"

            return HealthCheckResult(
                category=HealthCategory.DEADLOCKS,
                status=status,
                message=message,
                value=longest_stuck,
                details={"stuck_items": len(stuck_items)},
            )

        except Exception as e:
            logger.error(f"Error checking deadlocks: {e}")
            return HealthCheckResult(
                category=HealthCategory.DEADLOCKS,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking deadlocks: {e}",
            )

    async def _check_system_health(self) -> HealthCheckResult:
        """Check system health."""
        try:
            metrics = await self.system_monitor.check_system_health()

            if not metrics:
                return HealthCheckResult(
                    category=HealthCategory.SYSTEM,
                    status=HealthStatus.UNKNOWN,
                    message="Unable to get system metrics",
                )

            cpu_percent = metrics["cpu_percent"]
            if cpu_percent > self.thresholds.cpu_critical_percent:
                status = HealthStatus.CRITICAL
                message = f"Critical CPU usage: {cpu_percent:.1f}%"
            elif cpu_percent > self.thresholds.cpu_warning_percent:
                status = HealthStatus.WARNING
                message = f"High CPU usage: {cpu_percent:.1f}%"
            else:
                status = HealthStatus.HEALTHY
                message = f"Normal CPU usage: {cpu_percent:.1f}%"

            return HealthCheckResult(
                category=HealthCategory.SYSTEM,
                status=status,
                message=message,
                value=cpu_percent,
                details=metrics,
            )

        except Exception as e:
            logger.error(f"Error checking system health: {e}")
            return HealthCheckResult(
                category=HealthCategory.SYSTEM,
                status=HealthStatus.UNKNOWN,
                message=f"Error checking system: {e}",
            )

    def _determine_overall_status(
        self,
        results: List[HealthCheckResult],
    ) -> HealthStatus:
        """Determine overall health status (worst individual status wins)."""
        if any(r.status == HealthStatus.CRITICAL for r in results):
            return HealthStatus.CRITICAL
        if any(r.status == HealthStatus.WARNING for r in results):
            return HealthStatus.WARNING
        if any(r.status == HealthStatus.UNKNOWN for r in results):
            return HealthStatus.UNKNOWN
        return HealthStatus.HEALTHY

    def format_health_report(
        self,
        results: List[HealthCheckResult],
    ) -> str:
        """Format a detailed health report."""
        lines = ["Queue Health Report:"]

        for result in results:
            lines.append(
                f"\n{result.category.value.title()}:"
                f"\n- Status: {result.status.value}"
                f"\n- {result.message}"
            )
            if result.details:
                for key, value in result.details.items():
                    lines.append(f"  - {key}: {value}")

        return "\n".join(lines)
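

# ---------------------------------------------------------------------------
# Example usage: a minimal sketch, not part of the original module. The
# `metrics` and `queue_info` keys below are assumptions inferred from the
# lookups the check methods perform; a real queue system would supply its
# own values.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        # Thresholds can be tightened or relaxed per deployment.
        checker = HealthChecker(
            thresholds=HealthThresholds(memory_warning_mb=256, cpu_warning_percent=70.0)
        )
        report = await checker.check_health(
            metrics={
                "avg_processing_time": 1.2,  # seconds (hypothetical)
                "success_rate": 0.95,
                "error_rate": 0.05,
                "total_errors": 3,
            },
            queue_info={
                "last_activity": time.time(),
                "processing_count": 2,
                "stuck_items": [],  # e.g. [{"start_time": time.time() - 90}]
            },
        )
        print(report["overall_status"], "-", len(report["checks"]), "checks run")

        # format_health_report() consumes raw HealthCheckResult objects.
        sample = HealthCheckResult(
            category=HealthCategory.MEMORY,
            status=HealthStatus.WARNING,
            message="High memory usage: 400.0MB",
            value=400.0,
        )
        print(checker.format_health_report([sample]))

    asyncio.run(_demo())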