mirror of
https://github.com/pacnpal/Pac-cogs.git
synced 2025-12-20 10:51:05 -05:00
Fixing race conditions and deadlocks in the queue system by:
Using a single lock instead of multiple locks Properly handling task cancellation Adding timeouts and retries Improving error handling and recovery across all components: Queue manager now properly handles initialization failures Monitoring system has shorter timeouts and better activity tracking Cleanup system has proper task tracking and error recovery Persistence system has file locking and backup mechanisms Removing deprecated pkg_resources usage and improving the update checker: Using importlib.metadata for version checking Adding proper shutdown handling Improving error handling and retries
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Set
|
||||
from typing import Dict, List, Set, Optional
|
||||
from .models import QueueItem, QueueMetrics
|
||||
|
||||
# Configure logging
|
||||
@@ -17,12 +17,14 @@ class QueueCleaner:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
cleanup_interval: int = 3600, # 1 hour
|
||||
max_history_age: int = 86400, # 24 hours
|
||||
cleanup_interval: int = 1800, # 30 minutes
|
||||
max_history_age: int = 43200, # 12 hours
|
||||
):
|
||||
self.cleanup_interval = cleanup_interval
|
||||
self.max_history_age = max_history_age
|
||||
self._shutdown = False
|
||||
self._cleanup_task: Optional[asyncio.Task] = None
|
||||
self._last_cleanup_time = datetime.utcnow()
|
||||
|
||||
async def start_cleanup(
|
||||
self,
|
||||
@@ -47,6 +49,36 @@ class QueueCleaner:
|
||||
metrics: Reference to queue metrics
|
||||
queue_lock: Lock for queue operations
|
||||
"""
|
||||
if self._cleanup_task is not None:
|
||||
logger.warning("Cleanup task already running")
|
||||
return
|
||||
|
||||
logger.info("Starting queue cleanup task...")
|
||||
self._cleanup_task = asyncio.create_task(
|
||||
self._cleanup_loop(
|
||||
queue,
|
||||
completed,
|
||||
failed,
|
||||
guild_queues,
|
||||
channel_queues,
|
||||
processing,
|
||||
metrics,
|
||||
queue_lock
|
||||
)
|
||||
)
|
||||
|
||||
async def _cleanup_loop(
|
||||
self,
|
||||
queue: List[QueueItem],
|
||||
completed: Dict[str, QueueItem],
|
||||
failed: Dict[str, QueueItem],
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
processing: Dict[str, QueueItem],
|
||||
metrics: QueueMetrics,
|
||||
queue_lock: asyncio.Lock
|
||||
) -> None:
|
||||
"""Main cleanup loop"""
|
||||
while not self._shutdown:
|
||||
try:
|
||||
await self._perform_cleanup(
|
||||
@@ -59,17 +91,24 @@ class QueueCleaner:
|
||||
metrics,
|
||||
queue_lock
|
||||
)
|
||||
self._last_cleanup_time = datetime.utcnow()
|
||||
await asyncio.sleep(self.cleanup_interval)
|
||||
|
||||
except asyncio.CancelledError:
|
||||
logger.info("Queue cleanup cancelled")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Error in periodic cleanup: {str(e)}")
|
||||
await asyncio.sleep(60)
|
||||
logger.error(f"Error in cleanup loop: {str(e)}")
|
||||
# Shorter sleep on error to retry sooner
|
||||
await asyncio.sleep(30)
|
||||
|
||||
def stop_cleanup(self) -> None:
|
||||
"""Stop the cleanup process"""
|
||||
logger.info("Stopping queue cleanup...")
|
||||
self._shutdown = True
|
||||
if self._cleanup_task and not self._cleanup_task.done():
|
||||
self._cleanup_task.cancel()
|
||||
self._cleanup_task = None
|
||||
|
||||
async def _perform_cleanup(
|
||||
self,
|
||||
@@ -97,13 +136,14 @@ class QueueCleaner:
|
||||
try:
|
||||
current_time = datetime.utcnow()
|
||||
cleanup_cutoff = current_time - timedelta(seconds=self.max_history_age)
|
||||
items_cleaned = 0
|
||||
|
||||
async with queue_lock:
|
||||
# Clean up completed items
|
||||
completed_count = len(completed)
|
||||
for url in list(completed.keys()):
|
||||
try:
|
||||
item = completed[url]
|
||||
# Ensure added_at is a datetime object
|
||||
if not isinstance(item.added_at, datetime):
|
||||
try:
|
||||
if isinstance(item.added_at, str):
|
||||
@@ -115,15 +155,17 @@ class QueueCleaner:
|
||||
|
||||
if item.added_at < cleanup_cutoff:
|
||||
completed.pop(url)
|
||||
items_cleaned += 1
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing completed item {url}: {e}")
|
||||
logger.error(f"Error cleaning completed item {url}: {e}")
|
||||
completed.pop(url)
|
||||
items_cleaned += 1
|
||||
|
||||
# Clean up failed items
|
||||
failed_count = len(failed)
|
||||
for url in list(failed.keys()):
|
||||
try:
|
||||
item = failed[url]
|
||||
# Ensure added_at is a datetime object
|
||||
if not isinstance(item.added_at, datetime):
|
||||
try:
|
||||
if isinstance(item.added_at, str):
|
||||
@@ -135,34 +177,53 @@ class QueueCleaner:
|
||||
|
||||
if item.added_at < cleanup_cutoff:
|
||||
failed.pop(url)
|
||||
items_cleaned += 1
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing failed item {url}: {e}")
|
||||
logger.error(f"Error cleaning failed item {url}: {e}")
|
||||
failed.pop(url)
|
||||
items_cleaned += 1
|
||||
|
||||
# Clean up guild tracking
|
||||
guild_count = len(guild_queues)
|
||||
for guild_id in list(guild_queues.keys()):
|
||||
original_size = len(guild_queues[guild_id])
|
||||
guild_queues[guild_id] = {
|
||||
url for url in guild_queues[guild_id]
|
||||
if url in queue or url in processing
|
||||
}
|
||||
items_cleaned += original_size - len(guild_queues[guild_id])
|
||||
if not guild_queues[guild_id]:
|
||||
guild_queues.pop(guild_id)
|
||||
|
||||
# Clean up channel tracking
|
||||
channel_count = len(channel_queues)
|
||||
for channel_id in list(channel_queues.keys()):
|
||||
original_size = len(channel_queues[channel_id])
|
||||
channel_queues[channel_id] = {
|
||||
url for url in channel_queues[channel_id]
|
||||
if url in queue or url in processing
|
||||
}
|
||||
items_cleaned += original_size - len(channel_queues[channel_id])
|
||||
if not channel_queues[channel_id]:
|
||||
channel_queues.pop(channel_id)
|
||||
|
||||
metrics.last_cleanup = current_time
|
||||
logger.info("Completed periodic queue cleanup")
|
||||
# Update metrics
|
||||
metrics.last_cleanup = current_time
|
||||
|
||||
logger.info(
|
||||
f"Queue cleanup completed:\n"
|
||||
f"- Items cleaned: {items_cleaned}\n"
|
||||
f"- Completed items: {completed_count} -> {len(completed)}\n"
|
||||
f"- Failed items: {failed_count} -> {len(failed)}\n"
|
||||
f"- Guild queues: {guild_count} -> {len(guild_queues)}\n"
|
||||
f"- Channel queues: {channel_count} -> {len(channel_queues)}\n"
|
||||
f"- Current queue size: {len(queue)}\n"
|
||||
f"- Processing items: {len(processing)}"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during cleanup: {str(e)}")
|
||||
raise
|
||||
# Don't re-raise to keep cleanup running
|
||||
|
||||
async def clear_guild_queue(
|
||||
self,
|
||||
@@ -195,6 +256,12 @@ class QueueCleaner:
|
||||
async with queue_lock:
|
||||
# Get URLs for this guild
|
||||
guild_urls = guild_queues.get(guild_id, set())
|
||||
initial_counts = {
|
||||
'queue': len([item for item in queue if item.guild_id == guild_id]),
|
||||
'processing': len([item for item in processing.values() if item.guild_id == guild_id]),
|
||||
'completed': len([item for item in completed.values() if item.guild_id == guild_id]),
|
||||
'failed': len([item for item in failed.values() if item.guild_id == guild_id])
|
||||
}
|
||||
|
||||
# Clear from pending queue
|
||||
queue[:] = [item for item in queue if item.guild_id != guild_id]
|
||||
@@ -231,12 +298,19 @@ class QueueCleaner:
|
||||
if not channel_queues[channel_id]:
|
||||
channel_queues.pop(channel_id)
|
||||
|
||||
logger.info(f"Cleared {cleared_count} items from guild {guild_id} queue")
|
||||
return cleared_count
|
||||
logger.info(
|
||||
f"Cleared guild {guild_id} queue:\n"
|
||||
f"- Queue: {initial_counts['queue']} items\n"
|
||||
f"- Processing: {initial_counts['processing']} items\n"
|
||||
f"- Completed: {initial_counts['completed']} items\n"
|
||||
f"- Failed: {initial_counts['failed']} items\n"
|
||||
f"Total cleared: {cleared_count} items"
|
||||
)
|
||||
return cleared_count
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error clearing guild queue: {str(e)}")
|
||||
raise
|
||||
raise CleanupError(f"Failed to clear guild queue: {str(e)}")
|
||||
|
||||
class CleanupError(Exception):
|
||||
"""Base exception for cleanup-related errors"""
|
||||
|
||||
Reference in New Issue
Block a user