Removed the problematic singleton pattern from queue manager

Added proper activity tracking in the monitoring system
Reduced timeouts and deadlock thresholds
Implemented more aggressive cleanup procedures
Added system-wide FFmpeg process cleanup
pacnpal
2024-11-16 00:24:28 +00:00
parent 39061cbf3e
commit 32c63deeff
4 changed files with 208 additions and 119 deletions
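
The last item in the commit message, system-wide FFmpeg process cleanup, lives in one of the changed files that is not shown in the diffs below. A minimal sketch of what such a cleanup pass can look like, assuming psutil is available; the function name and signature here are illustrative, not the project's actual API:

import psutil

def kill_stale_ffmpeg_processes(timeout: float = 3.0) -> int:
    """Terminate every ffmpeg process on the system, escalating to kill()."""
    procs = []
    for proc in psutil.process_iter(["name"]):
        try:
            if (proc.info["name"] or "").lower().startswith("ffmpeg"):
                proc.terminate()  # polite SIGTERM first
                procs.append(proc)
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    # Give processes a moment to exit, then force-kill any stragglers
    _, alive = psutil.wait_procs(procs, timeout=timeout)
    for proc in alive:
        proc.kill()
    return len(procs)

Escalating from terminate() to kill() keeps the cleanup aggressive without denying FFmpeg the chance to exit cleanly first.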


@@ -20,17 +20,6 @@ logger = logging.getLogger("QueueManager")
 class EnhancedVideoQueueManager:
     """Enhanced queue manager with improved memory management and performance"""
-    # Class-level initialization lock to prevent multiple instances
-    _instance_lock = asyncio.Lock()
-    _instance = None
-    _initialized = False
-    def __new__(cls, *args, **kwargs):
-        """Ensure singleton instance"""
-        if not cls._instance:
-            cls._instance = super().__new__(cls)
-        return cls._instance
     def __init__(
         self,
         max_retries: int = 3,
@@ -43,10 +32,7 @@ class EnhancedVideoQueueManager:
         deadlock_threshold: int = 300, # 5 minutes
         check_interval: int = 60, # 1 minute
     ):
-        """Initialize only once"""
-        if self._initialized:
-            return
+        """Initialize queue manager"""
         # Configuration
         self.max_retries = max_retries
         self.retry_delay = retry_delay
@@ -70,7 +56,8 @@ class EnhancedVideoQueueManager:
         # State
         self._shutdown = False
-        self._init_event = asyncio.Event() # Single event for initialization state
+        self._initialized = False
+        self._init_event = asyncio.Event()
         self.metrics = QueueMetrics()
         # Components
@@ -85,64 +72,59 @@ class EnhancedVideoQueueManager:
             max_history_age=max_history_age
         )
-        # Mark instance as initialized
-        self._initialized = True
     async def initialize(self) -> None:
         """Initialize the queue manager components sequentially"""
-        # Use class-level lock to prevent multiple initializations
-        async with self._instance_lock:
-            # Check if already initialized
-            if self._init_event.is_set():
-                logger.info("Queue manager already initialized")
-                return
+        if self._initialized:
+            logger.info("Queue manager already initialized")
+            return
-            try:
-                logger.info("Starting queue manager initialization...")
-                # Load persisted state first if available
-                if self.persistence:
-                    await self._load_persisted_state()
-                # Start monitoring task
-                monitor_task = asyncio.create_task(
-                    self.monitor.start_monitoring(
-                        self._queue,
-                        self._processing,
-                        self.metrics,
-                        self._processing_lock
-                    )
+        try:
+            logger.info("Starting queue manager initialization...")
+            # Load persisted state first if available
+            if self.persistence:
+                await self._load_persisted_state()
+            # Start monitoring task
+            monitor_task = asyncio.create_task(
+                self.monitor.start_monitoring(
+                    self._queue,
+                    self._processing,
+                    self.metrics,
+                    self._processing_lock
                 )
-                self._active_tasks.add(monitor_task)
-                logger.info("Queue monitoring started")
+            )
+            self._active_tasks.add(monitor_task)
+            logger.info("Queue monitoring started")
-                # Brief pause to allow monitor to initialize
-                await asyncio.sleep(0.1)
-                # Start cleanup task
-                cleanup_task = asyncio.create_task(
-                    self.cleaner.start_cleanup(
-                        self._queue,
-                        self._completed,
-                        self._failed,
-                        self._guild_queues,
-                        self._channel_queues,
-                        self._processing,
-                        self.metrics,
-                        self._queue_lock
-                    )
+            # Brief pause to allow monitor to initialize
+            await asyncio.sleep(0.1)
+            # Start cleanup task
+            cleanup_task = asyncio.create_task(
+                self.cleaner.start_cleanup(
+                    self._queue,
+                    self._completed,
+                    self._failed,
+                    self._guild_queues,
+                    self._channel_queues,
+                    self._processing,
+                    self.metrics,
+                    self._queue_lock
                 )
-                self._active_tasks.add(cleanup_task)
-                logger.info("Queue cleanup started")
+            )
+            self._active_tasks.add(cleanup_task)
+            logger.info("Queue cleanup started")
-                # Signal initialization complete
-                self._init_event.set()
-                logger.info("Queue manager initialization completed")
+            # Signal initialization complete
+            self._initialized = True
+            self._init_event.set()
+            logger.info("Queue manager initialization completed")
-            except Exception as e:
-                logger.error(f"Failed to initialize queue manager: {e}")
-                self._shutdown = True
-                raise
+        except Exception as e:
+            logger.error(f"Failed to initialize queue manager: {e}")
+            self._shutdown = True
+            raise
     async def _load_persisted_state(self) -> None:
         """Load persisted queue state"""
@@ -199,6 +181,8 @@ class EnhancedVideoQueueManager:
                 async with self._processing_lock:
                     for item in items:
                         self._processing[item.url] = item
+                # Update activity timestamp
+                self.monitor.update_activity()
                 if not items:
                     await asyncio.sleep(0.1)
@@ -234,6 +218,7 @@ class EnhancedVideoQueueManager:
                 logger.info(f"Processing queue item: {item.url}")
                 item.start_processing()
                 self.metrics.last_activity_time = time.time()
+                self.monitor.update_activity() # Update activity timestamp
                 success, error = await processor(item)
@@ -338,6 +323,7 @@ class EnhancedVideoQueueManager:
             self._queue.sort(key=lambda x: (-x.priority, x.added_at))
             self.metrics.last_activity_time = time.time()
+            self.monitor.update_activity() # Update activity timestamp
             if self.persistence:
                 await self._persist_state()
@@ -439,6 +425,9 @@ class EnhancedVideoQueueManager:
             self._channel_queues.clear()
             self._active_tasks.clear()
+            # Reset initialization state
+            self._initialized = False
+            self._init_event.clear()
             logger.info("Queue manager cleanup completed")
         except Exception as e:
@@ -470,5 +459,8 @@ class EnhancedVideoQueueManager:
         self._processing.clear()
         self._active_tasks.clear()
+        # Reset initialization state
+        self._initialized = False
+        self._init_event.clear()
         logger.info("Queue manager force stopped")
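
With the singleton __new__ and the class-level instance lock removed, the queue manager above is an ordinary class: whatever cog or bot owns it now creates and initializes exactly one instance itself. A brief call-site sketch under that assumption; the constructor arguments are the defaults visible in the diff, and the surrounding code is not part of this commit:

from queue_manager import EnhancedVideoQueueManager  # module path assumed

async def start_queue_manager() -> EnhancedVideoQueueManager:
    manager = EnhancedVideoQueueManager(
        max_retries=3,
        deadlock_threshold=300,  # queue-manager default; the monitor's own default drops to 120 below
        check_interval=60,
    )
    # initialize() is now idempotent through the _initialized flag, so a
    # second call returns early instead of relying on a singleton lock.
    await manager.initialize()
    return manager

The two blocks added at the end of the file reset _initialized and clear _init_event during cleanup and force stop, so a torn-down manager can be initialized again later.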


@@ -19,10 +19,10 @@ class QueueMonitor:
     def __init__(
         self,
-        deadlock_threshold: int = 300, # 5 minutes
+        deadlock_threshold: int = 120, # Reduced to 2 minutes
         memory_threshold: int = 512, # 512MB
         max_retries: int = 3,
-        check_interval: int = 60 # Check every minute
+        check_interval: int = 30 # Reduced to 30 seconds
     ):
         self.deadlock_threshold = deadlock_threshold
         self.memory_threshold = memory_threshold
@@ -30,6 +30,7 @@ class QueueMonitor:
         self.check_interval = check_interval
         self._shutdown = False
         self._last_active_time = time.time()
+        self._monitoring_task = None
     async def start_monitoring(
         self,
@@ -46,23 +47,41 @@ class QueueMonitor:
             metrics: Reference to queue metrics
             processing_lock: Lock for processing dict
         """
+        if self._monitoring_task is not None:
+            logger.warning("Monitoring task already running")
+            return
+        logger.info("Starting queue monitoring...")
+        self._monitoring_task = asyncio.create_task(
+            self._monitor_loop(queue, processing, metrics, processing_lock)
+        )
+    async def _monitor_loop(
+        self,
+        queue: List[QueueItem],
+        processing: Dict[str, QueueItem],
+        metrics: QueueMetrics,
+        processing_lock: asyncio.Lock
+    ) -> None:
+        """Main monitoring loop"""
         while not self._shutdown:
             try:
                 await self._check_health(queue, processing, metrics, processing_lock)
                 await asyncio.sleep(self.check_interval)
             except asyncio.CancelledError:
                 logger.info("Queue monitoring cancelled")
                 break
             except Exception as e:
                 logger.error(f"Error in health monitor: {str(e)}")
-                await asyncio.sleep(30) # Shorter sleep on error
+                await asyncio.sleep(5) # Short sleep on error
     def stop_monitoring(self) -> None:
         """Stop the monitoring process"""
         logger.info("Stopping queue monitoring...")
         self._shutdown = True
+        if self._monitoring_task:
+            self._monitoring_task.cancel()
+            self._monitoring_task = None
     def update_activity(self) -> None:
         """Update the last active time"""
@@ -104,7 +123,6 @@ class QueueMonitor:
             async with processing_lock:
                 for url, item in processing.items():
-                    # Check if item has started processing
                     if hasattr(item, 'start_time') and item.start_time:
                         processing_time = current_time - item.start_time
                         processing_times.append(processing_time)
@@ -131,22 +149,18 @@ class QueueMonitor:
                 )
                 self._last_active_time = current_time
-            # Calculate and log metrics
-            success_rate = metrics.success_rate
-            error_distribution = metrics.errors_by_type
-            avg_processing_time = metrics.avg_processing_time
-            # Update peak memory usage
+            # Update metrics
+            metrics.last_activity_time = self._last_active_time
             metrics.peak_memory_usage = max(metrics.peak_memory_usage, memory_usage)
             # Log detailed metrics
             logger.info(
                 f"Queue Health Metrics:\n"
-                f"- Success Rate: {success_rate:.2%}\n"
-                f"- Avg Processing Time: {avg_processing_time:.2f}s\n"
+                f"- Success Rate: {metrics.success_rate:.2%}\n"
+                f"- Avg Processing Time: {metrics.avg_processing_time:.2f}s\n"
                 f"- Memory Usage: {memory_usage:.2f}MB\n"
                 f"- Peak Memory: {metrics.peak_memory_usage:.2f}MB\n"
-                f"- Error Distribution: {error_distribution}\n"
+                f"- Error Distribution: {metrics.errors_by_type}\n"
                 f"- Queue Size: {len(queue)}\n"
                 f"- Processing Items: {len(processing)}\n"
                 f"- Last Activity: {(current_time - self._last_active_time):.1f}s ago"
@@ -202,6 +216,8 @@ class QueueMonitor:
                 except Exception as e:
                     logger.error(f"Error recovering item {url}: {str(e)}")
+            # Update activity timestamp after recovery
+            self.update_activity()
             logger.info(f"Recovery complete - Recovered: {recovered}, Failed: {failed}")
         except Exception as e:
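
Taken together, the update_activity() calls added in both files feed one idle-time check: the monitor flags a potential deadlock only when the queue has stopped making progress for longer than deadlock_threshold while work is still marked as in flight. A condensed, stand-alone sketch of that idea; the class name and helper below are illustrative rather than the actual QueueMonitor code:

import time

class ActivityWatchdog:
    """Minimal illustration of activity-timestamp deadlock detection."""

    def __init__(self, deadlock_threshold: int = 120) -> None:
        self.deadlock_threshold = deadlock_threshold
        self._last_active_time = time.time()

    def update_activity(self) -> None:
        # Called whenever the queue makes progress: an item is queued,
        # picked up for processing, or recovered after getting stuck.
        self._last_active_time = time.time()

    def looks_deadlocked(self, processing_count: int) -> bool:
        # Report a deadlock only when items are supposedly in flight but the
        # activity timestamp has not moved for longer than the threshold.
        idle_for = time.time() - self._last_active_time
        return processing_count > 0 and idle_for > self.deadlock_threshold

With the threshold reduced to 120 seconds and the check interval to 30 seconds, a stalled queue is noticed within roughly two and a half minutes instead of up to six.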