Fixing race conditions and deadlocks in the queue system by (see the sketch after this list):

- Using a single lock instead of multiple locks
- Properly handling task cancellation
- Adding timeouts and retries
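
A minimal sketch of the single-lock pattern with bounded waits and clean cancellation. The names here (`QueueProcessor`, `pop_next`, `worker`) are illustrative assumptions, not the repository's actual API:

```python
import asyncio
from typing import List, Optional


class QueueProcessor:
    """Illustrative only: one lock guards all shared queue state,
    removing the lock-ordering deadlocks possible with several locks."""

    def __init__(self) -> None:
        self._lock = asyncio.Lock()  # the single lock for all shared state
        self._queue: List[str] = []

    async def pop_next(self, timeout: float = 5.0) -> Optional[str]:
        try:
            # Bound the wait so a stuck holder cannot block callers forever
            await asyncio.wait_for(self._lock.acquire(), timeout)
        except asyncio.TimeoutError:
            return None  # lock busy; the caller may retry later
        try:
            return self._queue.pop(0) if self._queue else None
        finally:
            self._lock.release()

    async def worker(self) -> None:
        """Poll loop that exits cleanly when its task is cancelled."""
        while True:
            try:
                item = await self.pop_next()
                if item is not None:
                    print(f"processing {item}")
                await asyncio.sleep(1.0)
            except asyncio.CancelledError:
                # Re-raise so the awaiting caller observes the cancellation
                raise
```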
Improving error handling and recovery across all components (a persistence sketch follows this list):

- Queue manager now properly handles initialization failures
- Monitoring system has shorter timeouts and better activity tracking
- Cleanup system has proper task tracking and error recovery
- Persistence system has file locking and backup mechanisms
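
A sketch of the backup-and-recovery idea behind the persistence changes, assuming a JSON state file; `save_state`, `load_state`, and the `.tmp`/`.bak` suffixes are assumptions for illustration, not the module's real layout:

```python
import json
import os
import shutil
from pathlib import Path


def save_state(path: Path, state: dict) -> None:
    """Persist state so a crash mid-write cannot corrupt the file."""
    tmp = path.with_suffix(".tmp")
    backup = path.with_suffix(".bak")
    with open(tmp, "w") as f:
        json.dump(state, f)
        f.flush()
        os.fsync(f.fileno())  # ensure bytes reach disk before the swap
    if path.exists():
        shutil.copy2(path, backup)  # keep the last good file for recovery
    os.replace(tmp, path)  # atomic rename: readers never see a partial file


def load_state(path: Path) -> dict:
    """Fall back to the backup if the primary file is unreadable."""
    for candidate in (path, path.with_suffix(".bak")):
        try:
            with open(candidate) as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            continue
    return {}
```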
Removing deprecated pkg_resources usage and improving the update checker (example below):

- Using importlib.metadata for version checking
- Adding proper shutdown handling
- Improving error handling and retries
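
For the pkg_resources removal, the stdlib replacement is `importlib.metadata` (available since Python 3.8). A minimal sketch; the helper name is illustrative:

```python
from importlib.metadata import PackageNotFoundError, version
from typing import Optional


def installed_version(package: str) -> Optional[str]:
    """Return the installed version of a distribution, or None if absent."""
    try:
        return version(package)  # stdlib; no pkg_resources needed
    except PackageNotFoundError:
        return None
```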
Author: pacnpal
Date: 2024-11-16 00:36:46 +00:00
Commit: 3520111cec (parent: 51a4e8f48c)
5 changed files with 438 additions and 277 deletions


@@ -3,7 +3,7 @@
 import asyncio
 import logging
 from datetime import datetime, timedelta
-from typing import Dict, List, Set
+from typing import Dict, List, Set, Optional
 from .models import QueueItem, QueueMetrics

 # Configure logging
@@ -17,12 +17,14 @@ class QueueCleaner:
     def __init__(
         self,
-        cleanup_interval: int = 3600,  # 1 hour
-        max_history_age: int = 86400,  # 24 hours
+        cleanup_interval: int = 1800,  # 30 minutes
+        max_history_age: int = 43200,  # 12 hours
     ):
         self.cleanup_interval = cleanup_interval
         self.max_history_age = max_history_age
         self._shutdown = False
+        self._cleanup_task: Optional[asyncio.Task] = None
+        self._last_cleanup_time = datetime.utcnow()

     async def start_cleanup(
         self,
@@ -47,6 +49,36 @@ class QueueCleaner:
             metrics: Reference to queue metrics
             queue_lock: Lock for queue operations
         """
+        if self._cleanup_task is not None:
+            logger.warning("Cleanup task already running")
+            return
+
+        logger.info("Starting queue cleanup task...")
+        self._cleanup_task = asyncio.create_task(
+            self._cleanup_loop(
+                queue,
+                completed,
+                failed,
+                guild_queues,
+                channel_queues,
+                processing,
+                metrics,
+                queue_lock
+            )
+        )
+
+    async def _cleanup_loop(
+        self,
+        queue: List[QueueItem],
+        completed: Dict[str, QueueItem],
+        failed: Dict[str, QueueItem],
+        guild_queues: Dict[int, Set[str]],
+        channel_queues: Dict[int, Set[str]],
+        processing: Dict[str, QueueItem],
+        metrics: QueueMetrics,
+        queue_lock: asyncio.Lock
+    ) -> None:
+        """Main cleanup loop"""
         while not self._shutdown:
             try:
                 await self._perform_cleanup(
@@ -59,17 +91,24 @@ class QueueCleaner:
                     metrics,
                     queue_lock
                 )
+                self._last_cleanup_time = datetime.utcnow()
                 await asyncio.sleep(self.cleanup_interval)
             except asyncio.CancelledError:
                 logger.info("Queue cleanup cancelled")
                 break
             except Exception as e:
-                logger.error(f"Error in periodic cleanup: {str(e)}")
-                await asyncio.sleep(60)
+                logger.error(f"Error in cleanup loop: {str(e)}")
+                # Shorter sleep on error to retry sooner
+                await asyncio.sleep(30)

     def stop_cleanup(self) -> None:
         """Stop the cleanup process"""
         logger.info("Stopping queue cleanup...")
         self._shutdown = True
+        if self._cleanup_task and not self._cleanup_task.done():
+            self._cleanup_task.cancel()
+        self._cleanup_task = None

     async def _perform_cleanup(
         self,
@@ -97,13 +136,14 @@ class QueueCleaner:
         try:
             current_time = datetime.utcnow()
             cleanup_cutoff = current_time - timedelta(seconds=self.max_history_age)
+            items_cleaned = 0

            async with queue_lock:
                 # Clean up completed items
+                completed_count = len(completed)
                 for url in list(completed.keys()):
                     try:
                         item = completed[url]
                         # Ensure added_at is a datetime object
                         if not isinstance(item.added_at, datetime):
                             try:
                                 if isinstance(item.added_at, str):
@@ -115,15 +155,17 @@ class QueueCleaner:
                         if item.added_at < cleanup_cutoff:
                             completed.pop(url)
+                            items_cleaned += 1
                     except Exception as e:
-                        logger.error(f"Error processing completed item {url}: {e}")
+                        logger.error(f"Error cleaning completed item {url}: {e}")
                         completed.pop(url)
+                        items_cleaned += 1

                 # Clean up failed items
+                failed_count = len(failed)
                 for url in list(failed.keys()):
                     try:
                         item = failed[url]
                         # Ensure added_at is a datetime object
                         if not isinstance(item.added_at, datetime):
                             try:
                                 if isinstance(item.added_at, str):
@@ -135,34 +177,53 @@
                         if item.added_at < cleanup_cutoff:
                             failed.pop(url)
+                            items_cleaned += 1
                     except Exception as e:
-                        logger.error(f"Error processing failed item {url}: {e}")
+                        logger.error(f"Error cleaning failed item {url}: {e}")
                         failed.pop(url)
+                        items_cleaned += 1

                 # Clean up guild tracking
+                guild_count = len(guild_queues)
                 for guild_id in list(guild_queues.keys()):
+                    original_size = len(guild_queues[guild_id])
                     guild_queues[guild_id] = {
                         url for url in guild_queues[guild_id]
                         if url in queue or url in processing
                     }
+                    items_cleaned += original_size - len(guild_queues[guild_id])
                     if not guild_queues[guild_id]:
                         guild_queues.pop(guild_id)

                 # Clean up channel tracking
+                channel_count = len(channel_queues)
                 for channel_id in list(channel_queues.keys()):
+                    original_size = len(channel_queues[channel_id])
                     channel_queues[channel_id] = {
                         url for url in channel_queues[channel_id]
                         if url in queue or url in processing
                     }
+                    items_cleaned += original_size - len(channel_queues[channel_id])
                     if not channel_queues[channel_id]:
                         channel_queues.pop(channel_id)

-            metrics.last_cleanup = current_time
-            logger.info("Completed periodic queue cleanup")
+            # Update metrics
+            metrics.last_cleanup = current_time
+
+            logger.info(
+                f"Queue cleanup completed:\n"
+                f"- Items cleaned: {items_cleaned}\n"
+                f"- Completed items: {completed_count} -> {len(completed)}\n"
+                f"- Failed items: {failed_count} -> {len(failed)}\n"
+                f"- Guild queues: {guild_count} -> {len(guild_queues)}\n"
+                f"- Channel queues: {channel_count} -> {len(channel_queues)}\n"
+                f"- Current queue size: {len(queue)}\n"
+                f"- Processing items: {len(processing)}"
+            )
         except Exception as e:
             logger.error(f"Error during cleanup: {str(e)}")
-            raise
+            # Don't re-raise to keep cleanup running

     async def clear_guild_queue(
         self,
@@ -195,6 +256,12 @@ class QueueCleaner:
             async with queue_lock:
                 # Get URLs for this guild
                 guild_urls = guild_queues.get(guild_id, set())
+                initial_counts = {
+                    'queue': len([item for item in queue if item.guild_id == guild_id]),
+                    'processing': len([item for item in processing.values() if item.guild_id == guild_id]),
+                    'completed': len([item for item in completed.values() if item.guild_id == guild_id]),
+                    'failed': len([item for item in failed.values() if item.guild_id == guild_id])
+                }

                 # Clear from pending queue
                 queue[:] = [item for item in queue if item.guild_id != guild_id]
@@ -231,12 +298,19 @@ class QueueCleaner:
                     if not channel_queues[channel_id]:
                         channel_queues.pop(channel_id)

-                logger.info(f"Cleared {cleared_count} items from guild {guild_id} queue")
-                return cleared_count
+                logger.info(
+                    f"Cleared guild {guild_id} queue:\n"
+                    f"- Queue: {initial_counts['queue']} items\n"
+                    f"- Processing: {initial_counts['processing']} items\n"
+                    f"- Completed: {initial_counts['completed']} items\n"
+                    f"- Failed: {initial_counts['failed']} items\n"
+                    f"Total cleared: {cleared_count} items"
+                )
+                return cleared_count
         except Exception as e:
             logger.error(f"Error clearing guild queue: {str(e)}")
-            raise
+            raise CleanupError(f"Failed to clear guild queue: {str(e)}")
+
+
+class CleanupError(Exception):
+    """Base exception for cleanup-related errors"""