Pac-cogs/videoarchiver/queue/monitoring.py
pacnpal 32c63deeff (2024-11-16 00:24:28 +00:00)
Removed the problematic singleton pattern from queue manager
Added proper activity tracking in the monitoring system
Reduced timeouts and deadlock thresholds
Implemented more aggressive cleanup procedures
Added system-wide FFmpeg process cleanup
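
The last item above, system-wide FFmpeg process cleanup, is implemented elsewhere in the commit rather than in this file. Below is a minimal sketch of what such a cleanup could look like with psutil; kill_stray_ffmpeg is a hypothetical helper for illustration, not the cog's actual implementation.

import psutil


def kill_stray_ffmpeg(timeout: float = 5.0) -> int:
    """Terminate any leftover FFmpeg processes system-wide.

    Hypothetical helper sketching the idea behind the commit message;
    the real cleanup in the cog may differ.
    """
    killed = 0
    for proc in psutil.process_iter(["name"]):
        try:
            name = (proc.info.get("name") or "").lower()
            if "ffmpeg" in name:
                proc.terminate()  # ask politely first (SIGTERM)
                try:
                    proc.wait(timeout=timeout)
                except psutil.TimeoutExpired:
                    proc.kill()  # force-kill if it does not exit in time
                killed += 1
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    return killed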


"""Queue monitoring and health checks"""
import asyncio
import logging
import psutil
import time
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Set
from .models import QueueItem, QueueMetrics
# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("QueueMonitoring")
class QueueMonitor:
    """Monitors queue health and performance"""

    def __init__(
        self,
        deadlock_threshold: int = 120,  # Reduced to 2 minutes
        memory_threshold: int = 512,  # 512MB
        max_retries: int = 3,
        check_interval: int = 30,  # Reduced to 30 seconds
    ):
        self.deadlock_threshold = deadlock_threshold
        self.memory_threshold = memory_threshold
        self.max_retries = max_retries
        self.check_interval = check_interval
        self._shutdown = False
        self._last_active_time = time.time()
        self._monitoring_task: Optional[asyncio.Task] = None

    async def start_monitoring(
        self,
        queue: List[QueueItem],
        processing: Dict[str, QueueItem],
        metrics: QueueMetrics,
        processing_lock: asyncio.Lock,
    ) -> None:
        """Start monitoring queue health

        Args:
            queue: Reference to the queue list
            processing: Reference to processing dict
            metrics: Reference to queue metrics
            processing_lock: Lock for processing dict
        """
        if self._monitoring_task is not None:
            logger.warning("Monitoring task already running")
            return

        logger.info("Starting queue monitoring...")
        self._monitoring_task = asyncio.create_task(
            self._monitor_loop(queue, processing, metrics, processing_lock)
        )

    async def _monitor_loop(
        self,
        queue: List[QueueItem],
        processing: Dict[str, QueueItem],
        metrics: QueueMetrics,
        processing_lock: asyncio.Lock,
    ) -> None:
        """Main monitoring loop"""
        while not self._shutdown:
            try:
                await self._check_health(queue, processing, metrics, processing_lock)
                await asyncio.sleep(self.check_interval)
            except asyncio.CancelledError:
                logger.info("Queue monitoring cancelled")
                break
            except Exception as e:
                logger.error(f"Error in health monitor: {str(e)}")
                await asyncio.sleep(5)  # Short sleep on error

    def stop_monitoring(self) -> None:
        """Stop the monitoring process"""
        logger.info("Stopping queue monitoring...")
        self._shutdown = True
        if self._monitoring_task:
            self._monitoring_task.cancel()
            self._monitoring_task = None

    def update_activity(self) -> None:
        """Update the last active time"""
        self._last_active_time = time.time()

    async def _check_health(
        self,
        queue: List[QueueItem],
        processing: Dict[str, QueueItem],
        metrics: QueueMetrics,
        processing_lock: asyncio.Lock,
    ) -> None:
        """Check queue health and performance

        Args:
            queue: Reference to the queue list
            processing: Reference to processing dict
            metrics: Reference to queue metrics
            processing_lock: Lock for processing dict
        """
        try:
            current_time = time.time()

            # Check memory usage
            process = psutil.Process()
            memory_usage = process.memory_info().rss / 1024 / 1024  # MB
            if memory_usage > self.memory_threshold:
                logger.warning(f"High memory usage detected: {memory_usage:.2f}MB")
                # Force garbage collection
                import gc
                gc.collect()
                memory_after = process.memory_info().rss / 1024 / 1024
                logger.info(f"Memory after GC: {memory_after:.2f}MB")

            # Check for potential deadlocks
            processing_times = []
            stuck_items = []
            async with processing_lock:
                for url, item in processing.items():
                    if hasattr(item, 'start_time') and item.start_time:
                        processing_time = current_time - item.start_time
                        processing_times.append(processing_time)
                        if processing_time > self.deadlock_threshold:
                            stuck_items.append((url, item))
                            logger.warning(f"Item stuck in processing: {url} for {processing_time:.1f}s")

            # Recover outside the lock: _recover_stuck_items acquires
            # processing_lock itself, and asyncio.Lock is not reentrant
            if stuck_items:
                logger.warning(
                    f"Potential deadlock detected: {len(stuck_items)} items stuck"
                )
                await self._recover_stuck_items(
                    stuck_items, queue, processing, processing_lock
                )

            # Check overall queue activity
            if processing and current_time - self._last_active_time > self.deadlock_threshold:
                logger.warning("Queue appears to be hung - no activity detected")
                # Force recovery of all processing items
                async with processing_lock:
                    all_items = list(processing.items())
                await self._recover_stuck_items(
                    all_items, queue, processing, processing_lock
                )
                self._last_active_time = current_time

            # Update metrics
            metrics.last_activity_time = self._last_active_time
            metrics.peak_memory_usage = max(metrics.peak_memory_usage, memory_usage)

            # Log detailed metrics
            logger.info(
                f"Queue Health Metrics:\n"
                f"- Success Rate: {metrics.success_rate:.2%}\n"
                f"- Avg Processing Time: {metrics.avg_processing_time:.2f}s\n"
                f"- Memory Usage: {memory_usage:.2f}MB\n"
                f"- Peak Memory: {metrics.peak_memory_usage:.2f}MB\n"
                f"- Error Distribution: {metrics.errors_by_type}\n"
                f"- Queue Size: {len(queue)}\n"
                f"- Processing Items: {len(processing)}\n"
                f"- Last Activity: {(current_time - self._last_active_time):.1f}s ago"
            )
        except Exception as e:
            logger.error(f"Error checking queue health: {str(e)}")
            raise

    async def _recover_stuck_items(
        self,
        stuck_items: List[tuple[str, QueueItem]],
        queue: List[QueueItem],
        processing: Dict[str, QueueItem],
        processing_lock: asyncio.Lock,
    ) -> None:
        """Attempt to recover stuck items

        Args:
            stuck_items: List of (url, item) tuples for stuck items
            queue: Reference to the queue list
            processing: Reference to processing dict
            processing_lock: Lock for processing dict
        """
        try:
            recovered = 0
            failed = 0
            async with processing_lock:
                for url, item in stuck_items:
                    try:
                        # Move to failed if max retries reached
                        if item.retry_count >= self.max_retries:
                            logger.warning(f"Moving stuck item to failed: {url}")
                            item.status = "failed"
                            item.error = "Exceeded maximum retries after being stuck"
                            item.last_error = item.error
                            item.last_error_time = datetime.utcnow()
                            processing.pop(url)
                            failed += 1
                        else:
                            # Reset for retry
                            logger.info(f"Recovering stuck item for retry: {url}")
                            item.retry_count += 1
                            item.start_time = None
                            item.processing_time = 0
                            item.last_retry = datetime.utcnow()
                            item.status = "pending"
                            item.priority = max(0, item.priority - 2)  # Lower priority
                            queue.append(item)
                            processing.pop(url)
                            recovered += 1
                    except Exception as e:
                        logger.error(f"Error recovering item {url}: {str(e)}")

            # Update activity timestamp after recovery
            self.update_activity()
            logger.info(f"Recovery complete - Recovered: {recovered}, Failed: {failed}")
        except Exception as e:
            logger.error(f"Error recovering stuck items: {str(e)}")
            raise


class MonitoringError(Exception):
    """Base exception for monitoring-related errors"""

    pass
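
Usage note: a minimal sketch of how a queue manager might wire up QueueMonitor. The import paths and the QueueItem/QueueMetrics constructors are assumptions inferred from this file and its directory layout, not taken from the rest of the repository.

import asyncio

from videoarchiver.queue.models import QueueItem, QueueMetrics  # assumed path
from videoarchiver.queue.monitoring import QueueMonitor  # assumed path


async def run_queue_manager() -> None:
    # Shared state that the monitor inspects; the real manager owns these.
    queue: list[QueueItem] = []             # pending items
    processing: dict[str, QueueItem] = {}   # url -> item currently in flight
    metrics = QueueMetrics()                # assumed default constructor
    processing_lock = asyncio.Lock()

    monitor = QueueMonitor(deadlock_threshold=120, check_interval=30)
    await monitor.start_monitoring(queue, processing, metrics, processing_lock)

    try:
        while True:
            # ... dequeue and process items here ...
            # Report progress so the monitor does not flag the queue as hung.
            monitor.update_activity()
            await asyncio.sleep(1)
    finally:
        monitor.stop_monitoring()


# asyncio.run(run_queue_manager())

Note that stop_monitoring only cancels the task and drops its reference without awaiting it, so a caller that needs a fully clean shutdown may want to keep its own handle on the task and await the cancellation.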