Fixing race conditions and deadlocks in the queue system by:

- Using a single lock instead of multiple locks
- Properly handling task cancellation
- Adding timeouts and retries (see the sketch after this list)
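
The single-lock change is the heart of the race/deadlock fix: the workers, the monitor, and the recovery path all guard the pending queue and the in-flight processing dict with the same asyncio.Lock, so there is no lock ordering to get wrong and no helper that re-acquires a lock its caller already holds. A minimal sketch of that shape, combined with per-item timeouts and cancellation handling (the MiniQueue class and all of its names are illustrative, not the cog's real API):

    import asyncio
    from typing import Dict, List

    # Illustrative sketch only (not the cog's real classes): one asyncio.Lock
    # guards both the pending queue and the in-flight "processing" dict, each
    # item is bounded by a timeout, and cancellation requeues the item.

    class MiniQueue:
        def __init__(self, item_timeout: float = 30.0) -> None:
            self.queue: List[str] = []               # pending URLs
            self.processing: Dict[str, float] = {}   # url -> start time
            self.lock = asyncio.Lock()               # the single shared lock
            self.item_timeout = item_timeout

        async def worker(self) -> None:
            while True:
                async with self.lock:
                    url = self.queue.pop(0) if self.queue else None
                    if url is not None:
                        self.processing[url] = asyncio.get_running_loop().time()
                if url is None:
                    await asyncio.sleep(0.1)
                    continue
                try:
                    # A hung item cannot stall the worker forever.
                    await asyncio.wait_for(self._process(url), timeout=self.item_timeout)
                except asyncio.CancelledError:
                    # Requeue before propagating cancellation so nothing is lost.
                    async with self.lock:
                        self.processing.pop(url, None)
                        self.queue.append(url)
                    raise
                except Exception:
                    # Covers timeouts too; a simple retry just requeues the item.
                    async with self.lock:
                        self.processing.pop(url, None)
                        self.queue.append(url)
                else:
                    async with self.lock:
                        self.processing.pop(url, None)

        async def _process(self, url: str) -> None:
            await asyncio.sleep(0.5)                 # stand-in for real work

The real queue manager tracks much richer QueueItem state, priorities, and retry counters, but the locking discipline is the same: every read or write of the shared structures happens under the one lock, and the long-running work happens outside it.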
Improving error handling and recovery across all components:

- Queue manager now properly handles initialization failures
- Monitoring system has shorter timeouts and better activity tracking
- Cleanup system has proper task tracking and error recovery
- Persistence system has file locking and backup mechanisms (see the sketch after this list)
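
On the persistence point, the usual way to make saved queue state survive crashes is to write to a temporary file, fsync it, atomically replace the old file, and keep the previous copy as a backup to fall back on. A hedged sketch of that backup-plus-atomic-write half of the mechanism (file names and the JSON layout are placeholders, and the cog's actual file-locking layer is not shown):

    import json
    import os
    import shutil
    import tempfile

    # Hedged sketch of the "backup + atomic write" idea for queue persistence;
    # the real persistence module may differ (paths and fields are made up).

    def save_queue_state(state: dict, path: str = "queue_state.json") -> None:
        # Keep the previous good copy as a .bak file before overwriting.
        if os.path.exists(path):
            shutil.copy2(path, path + ".bak")
        # Write to a temp file in the same directory, then atomically replace,
        # so a crash mid-write never leaves a truncated state file.
        fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or ".", suffix=".tmp")
        try:
            with os.fdopen(fd, "w") as f:
                json.dump(state, f)
                f.flush()
                os.fsync(f.fileno())
            os.replace(tmp_path, path)
        except Exception:
            os.unlink(tmp_path)
            raise

    def load_queue_state(path: str = "queue_state.json") -> dict:
        # Fall back to the backup if the primary file is missing or corrupt.
        for candidate in (path, path + ".bak"):
            try:
                with open(candidate) as f:
                    return json.load(f)
            except (FileNotFoundError, json.JSONDecodeError):
                continue
        return {"queue": [], "processing": {}}

os.replace is atomic on the same filesystem, so a crash mid-save leaves either the old state or the new state on disk, never a truncated file.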
Removing deprecated pkg_resources usage and improving the update checker:

- Using importlib.metadata for version checking
- Adding proper shutdown handling
- Improving error handling and retries (see the sketch after this list)
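
importlib.metadata is the standard-library replacement for the deprecated pkg_resources distribution lookup, and the shutdown and retry behaviour can be layered on top with an asyncio.Event. A rough sketch, assuming a placeholder package name, check interval, and retry count (the cog's real checker will differ in details):

    import asyncio
    from importlib.metadata import PackageNotFoundError, version
    from typing import Optional

    def installed_version(package: str = "example-package") -> Optional[str]:
        # Replaces pkg_resources.get_distribution(package).version
        try:
            return version(package)
        except PackageNotFoundError:
            return None

    async def update_check_loop(
        stop_event: asyncio.Event,
        interval: float = 3600.0,
        max_retries: int = 3,
    ) -> None:
        while not stop_event.is_set():
            for attempt in range(1, max_retries + 1):
                try:
                    current = installed_version()
                    # ... compare `current` with the latest release here ...
                    break
                except Exception:
                    await asyncio.sleep(2 ** attempt)   # simple backoff between retries
            # Sleep until the next check, but wake immediately on shutdown.
            try:
                await asyncio.wait_for(stop_event.wait(), timeout=interval)
            except asyncio.TimeoutError:
                pass

Shutdown is then just stop_event.set(): the loop wakes from wait_for immediately instead of sleeping out the remainder of the interval.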
pacnpal
2024-11-16 00:36:46 +00:00
parent 51a4e8f48c
commit 3520111cec
5 changed files with 438 additions and 277 deletions


@@ -19,10 +19,10 @@ class QueueMonitor:
     def __init__(
         self,
-        deadlock_threshold: int = 120, # Reduced to 2 minutes
+        deadlock_threshold: int = 60, # Reduced to 1 minute
         memory_threshold: int = 512, # 512MB
         max_retries: int = 3,
-        check_interval: int = 30 # Reduced to 30 seconds
+        check_interval: int = 15 # Reduced to 15 seconds
     ):
         self.deadlock_threshold = deadlock_threshold
         self.memory_threshold = memory_threshold
@@ -37,7 +37,7 @@ class QueueMonitor:
         queue: List[QueueItem],
         processing: Dict[str, QueueItem],
         metrics: QueueMetrics,
-        processing_lock: asyncio.Lock
+        queue_lock: asyncio.Lock
     ) -> None:
         """Start monitoring queue health
@@ -45,7 +45,7 @@ class QueueMonitor:
             queue: Reference to the queue list
             processing: Reference to processing dict
             metrics: Reference to queue metrics
-            processing_lock: Lock for processing dict
+            queue_lock: Lock for queue operations
         """
         if self._monitoring_task is not None:
             logger.warning("Monitoring task already running")
@@ -53,7 +53,7 @@ class QueueMonitor:
         logger.info("Starting queue monitoring...")
         self._monitoring_task = asyncio.create_task(
-            self._monitor_loop(queue, processing, metrics, processing_lock)
+            self._monitor_loop(queue, processing, metrics, queue_lock)
         )
 
     async def _monitor_loop(
@@ -61,27 +61,27 @@ class QueueMonitor:
         queue: List[QueueItem],
         processing: Dict[str, QueueItem],
         metrics: QueueMetrics,
-        processing_lock: asyncio.Lock
+        queue_lock: asyncio.Lock
     ) -> None:
         """Main monitoring loop"""
         while not self._shutdown:
             try:
-                await self._check_health(queue, processing, metrics, processing_lock)
+                await self._check_health(queue, processing, metrics, queue_lock)
                 await asyncio.sleep(self.check_interval)
             except asyncio.CancelledError:
                 logger.info("Queue monitoring cancelled")
                 break
             except Exception as e:
                 logger.error(f"Error in health monitor: {str(e)}")
-                await asyncio.sleep(5) # Short sleep on error
+                await asyncio.sleep(1) # Reduced sleep on error
 
     def stop_monitoring(self) -> None:
         """Stop the monitoring process"""
         logger.info("Stopping queue monitoring...")
         self._shutdown = True
-        if self._monitoring_task:
+        if self._monitoring_task and not self._monitoring_task.done():
             self._monitoring_task.cancel()
-            self._monitoring_task = None
+        self._monitoring_task = None
 
     def update_activity(self) -> None:
         """Update the last active time"""
@@ -92,7 +92,7 @@ class QueueMonitor:
         queue: List[QueueItem],
         processing: Dict[str, QueueItem],
         metrics: QueueMetrics,
-        processing_lock: asyncio.Lock
+        queue_lock: asyncio.Lock
     ) -> None:
         """Check queue health and performance
@@ -100,7 +100,7 @@ class QueueMonitor:
             queue: Reference to the queue list
             processing: Reference to processing dict
             metrics: Reference to queue metrics
-            processing_lock: Lock for processing dict
+            queue_lock: Lock for queue operations
         """
         try:
             current_time = time.time()
@@ -118,40 +118,37 @@ class QueueMonitor:
             logger.info(f"Memory after GC: {memory_after:.2f}MB")
 
             # Check for potential deadlocks
             processing_times = []
             stuck_items = []
-            async with processing_lock:
+            async with queue_lock:
                 # Check processing items
                 for url, item in processing.items():
                     if hasattr(item, 'start_time') and item.start_time:
                         processing_time = current_time - item.start_time
                         processing_times.append(processing_time)
                         if processing_time > self.deadlock_threshold:
                             stuck_items.append((url, item))
                             logger.warning(f"Item stuck in processing: {url} for {processing_time:.1f}s")
 
-            if stuck_items:
-                logger.warning(
-                    f"Potential deadlock detected: {len(stuck_items)} items stuck"
-                )
-                await self._recover_stuck_items(
-                    stuck_items, queue, processing, processing_lock
-                )
+                # Handle stuck items if found
+                if stuck_items:
+                    logger.warning(f"Potential deadlock detected: {len(stuck_items)} items stuck")
+                    await self._recover_stuck_items(stuck_items, queue, processing)
 
-            # Check overall queue activity
-            if processing and current_time - self._last_active_time > self.deadlock_threshold:
-                logger.warning("Queue appears to be hung - no activity detected")
-                # Force recovery of all processing items
-                async with processing_lock:
+                # Check overall queue activity
+                if processing and current_time - self._last_active_time > self.deadlock_threshold:
+                    logger.warning("Queue appears to be hung - no activity detected")
+                    # Force recovery of all processing items
                     all_items = list(processing.items())
-                await self._recover_stuck_items(
-                    all_items, queue, processing, processing_lock
-                )
-                self._last_active_time = current_time
+                    await self._recover_stuck_items(all_items, queue, processing)
+                    self._last_active_time = current_time
 
-            # Update metrics
-            metrics.last_activity_time = self._last_active_time
-            metrics.peak_memory_usage = max(metrics.peak_memory_usage, memory_usage)
+                # Update metrics
+                metrics.last_activity_time = self._last_active_time
+                metrics.peak_memory_usage = max(metrics.peak_memory_usage, memory_usage)
+
+                # Calculate current metrics
+                queue_size = len(queue)
+                processing_count = len(processing)
 
             # Log detailed metrics
             logger.info(
@@ -161,21 +158,20 @@ class QueueMonitor:
f"- Memory Usage: {memory_usage:.2f}MB\n"
f"- Peak Memory: {metrics.peak_memory_usage:.2f}MB\n"
f"- Error Distribution: {metrics.errors_by_type}\n"
f"- Queue Size: {len(queue)}\n"
f"- Processing Items: {len(processing)}\n"
f"- Queue Size: {queue_size}\n"
f"- Processing Items: {processing_count}\n"
f"- Last Activity: {(current_time - self._last_active_time):.1f}s ago"
)
except Exception as e:
logger.error(f"Error checking queue health: {str(e)}")
raise
# Don't re-raise to keep monitoring alive
async def _recover_stuck_items(
self,
stuck_items: List[tuple[str, QueueItem]],
queue: List[QueueItem],
processing: Dict[str, QueueItem],
processing_lock: asyncio.Lock
processing: Dict[str, QueueItem]
) -> None:
"""Attempt to recover stuck items
@@ -183,38 +179,36 @@ class QueueMonitor:
             stuck_items: List of (url, item) tuples for stuck items
             queue: Reference to the queue list
             processing: Reference to processing dict
-            processing_lock: Lock for processing dict
         """
         try:
             recovered = 0
             failed = 0
 
-            async with processing_lock:
-                for url, item in stuck_items:
-                    try:
-                        # Move to failed if max retries reached
-                        if item.retry_count >= self.max_retries:
-                            logger.warning(f"Moving stuck item to failed: {url}")
-                            item.status = "failed"
-                            item.error = "Exceeded maximum retries after being stuck"
-                            item.last_error = item.error
-                            item.last_error_time = datetime.utcnow()
-                            processing.pop(url)
-                            failed += 1
-                        else:
-                            # Reset for retry
-                            logger.info(f"Recovering stuck item for retry: {url}")
-                            item.retry_count += 1
-                            item.start_time = None
-                            item.processing_time = 0
-                            item.last_retry = datetime.utcnow()
-                            item.status = "pending"
-                            item.priority = max(0, item.priority - 2) # Lower priority
-                            queue.append(item)
-                            processing.pop(url)
-                            recovered += 1
-                    except Exception as e:
-                        logger.error(f"Error recovering item {url}: {str(e)}")
+            for url, item in stuck_items:
+                try:
+                    # Move to failed if max retries reached
+                    if item.retry_count >= self.max_retries:
+                        logger.warning(f"Moving stuck item to failed: {url}")
+                        item.status = "failed"
+                        item.error = "Exceeded maximum retries after being stuck"
+                        item.last_error = item.error
+                        item.last_error_time = datetime.utcnow()
+                        processing.pop(url)
+                        failed += 1
+                    else:
+                        # Reset for retry
+                        logger.info(f"Recovering stuck item for retry: {url}")
+                        item.retry_count += 1
+                        item.start_time = None
+                        item.processing_time = 0
+                        item.last_retry = datetime.utcnow()
+                        item.status = "pending"
+                        item.priority = max(0, item.priority - 2) # Lower priority
+                        queue.append(item)
+                        processing.pop(url)
+                        recovered += 1
+                except Exception as e:
+                    logger.error(f"Error recovering item {url}: {str(e)}")
 
             # Update activity timestamp after recovery
             self.update_activity()
@@ -222,7 +216,7 @@ class QueueMonitor:
         except Exception as e:
             logger.error(f"Error recovering stuck items: {str(e)}")
-            raise
+            # Don't re-raise to keep monitoring alive
 
 
 class MonitoringError(Exception):
     """Base exception for monitoring-related errors"""