Eliminating duplicate queue processing that was causing race conditions

- Adding per-item processing state tracking and timing (item.start_time)
- Implementing more aggressive monitoring (1-minute check intervals instead of 5)
- Adding activity tracking to detect and recover from hung states (sketched below, before the diff)
- Improving error handling and logging throughout the system
- Reducing timeouts and deadlock thresholds for faster recovery
pacnpal
2024-11-15 22:38:36 +00:00
parent 73364e7438
commit 512dd1ff88
6 changed files with 220 additions and 138 deletions
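
The "activity tracking" above amounts to a heartbeat: workers report progress, and the monitor treats a long silence while items are still marked as processing as a hung queue. A minimal sketch of that pattern, independent of the QueueMonitor class changed below (the HeartbeatMonitor name and the bare dict of processing items are illustrative only, not part of this codebase):

import asyncio
import time


class HeartbeatMonitor:
    """Toy illustration of hung-state detection via a last-activity timestamp."""

    def __init__(self, deadlock_threshold: float = 300, check_interval: float = 60):
        self.deadlock_threshold = deadlock_threshold  # seconds of silence before recovery
        self.check_interval = check_interval          # how often to look
        self._last_active_time = time.time()
        self._shutdown = False

    def update_activity(self) -> None:
        # Workers call this after every unit of progress.
        self._last_active_time = time.time()

    async def watch(self, processing: dict) -> None:
        while not self._shutdown:
            idle = time.time() - self._last_active_time
            if processing and idle > self.deadlock_threshold:
                # Items claim to be in progress, but nothing has reported
                # progress for too long: assume a hang and force recovery.
                print(f"Hung for {idle:.0f}s, recovering {len(processing)} items")
                processing.clear()  # real code would requeue or fail each item
                self._last_active_time = time.time()
            await asyncio.sleep(self.check_interval)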


@@ -19,14 +19,17 @@ class QueueMonitor:
     def __init__(
         self,
-        deadlock_threshold: int = 900, # 15 minutes
-        memory_threshold: int = 1024, # 1GB
-        max_retries: int = 3
+        deadlock_threshold: int = 300, # 5 minutes
+        memory_threshold: int = 512, # 512MB
+        max_retries: int = 3,
+        check_interval: int = 60 # Check every minute
     ):
         self.deadlock_threshold = deadlock_threshold
         self.memory_threshold = memory_threshold
         self.max_retries = max_retries
+        self.check_interval = check_interval
         self._shutdown = False
+        self._last_active_time = time.time()

     async def start_monitoring(
         self,
@@ -43,21 +46,28 @@ class QueueMonitor:
             metrics: Reference to queue metrics
             processing_lock: Lock for processing dict
         """
+        logger.info("Starting queue monitoring...")
         while not self._shutdown:
             try:
                 await self._check_health(queue, processing, metrics, processing_lock)
-                await asyncio.sleep(300) # Check every 5 minutes
+                await asyncio.sleep(self.check_interval)
             except asyncio.CancelledError:
+                logger.info("Queue monitoring cancelled")
                 break
             except Exception as e:
                 logger.error(f"Error in health monitor: {str(e)}")
-                await asyncio.sleep(60)
+                await asyncio.sleep(30) # Shorter sleep on error

     def stop_monitoring(self) -> None:
         """Stop the monitoring process"""
+        logger.info("Stopping queue monitoring...")
         self._shutdown = True

+    def update_activity(self) -> None:
+        """Update the last active time"""
+        self._last_active_time = time.time()

     async def _check_health(
         self,
         queue: List[QueueItem],
@@ -74,6 +84,8 @@ class QueueMonitor:
             processing_lock: Lock for processing dict
         """
         try:
+            current_time = time.time()
+
             # Check memory usage
             process = psutil.Process()
             memory_usage = process.memory_info().rss / 1024 / 1024 # MB
@@ -83,18 +95,22 @@ class QueueMonitor:
                 # Force garbage collection
                 import gc
                 gc.collect()
+                memory_after = process.memory_info().rss / 1024 / 1024
+                logger.info(f"Memory after GC: {memory_after:.2f}MB")

             # Check for potential deadlocks
-            current_time = time.time()
             processing_times = []
             stuck_items = []

-            for url, item in processing.items():
-                if isinstance(item.processing_time, (int, float)) and item.processing_time > 0:
-                    processing_time = current_time - item.processing_time
-                    processing_times.append(processing_time)
-                    if processing_time > self.deadlock_threshold:
-                        stuck_items.append((url, item))
+            async with processing_lock:
+                for url, item in processing.items():
+                    # Check if item has started processing
+                    if hasattr(item, 'start_time') and item.start_time:
+                        processing_time = current_time - item.start_time
+                        processing_times.append(processing_time)
+                        if processing_time > self.deadlock_threshold:
+                            stuck_items.append((url, item))
+                            logger.warning(f"Item stuck in processing: {url} for {processing_time:.1f}s")

             if stuck_items:
                 logger.warning(
@@ -104,6 +120,17 @@ class QueueMonitor:
                     stuck_items, queue, processing, processing_lock
                 )

+            # Check overall queue activity
+            if processing and current_time - self._last_active_time > self.deadlock_threshold:
+                logger.warning("Queue appears to be hung - no activity detected")
+                # Force recovery of all processing items
+                async with processing_lock:
+                    all_items = list(processing.items())
+                await self._recover_stuck_items(
+                    all_items, queue, processing, processing_lock
+                )
+                self._last_active_time = current_time

             # Calculate and log metrics
             success_rate = metrics.success_rate
             error_distribution = metrics.errors_by_type
@@ -112,14 +139,17 @@ class QueueMonitor:
             # Update peak memory usage
             metrics.peak_memory_usage = max(metrics.peak_memory_usage, memory_usage)

             # Log detailed metrics
             logger.info(
                 f"Queue Health Metrics:\n"
                 f"- Success Rate: {success_rate:.2%}\n"
                 f"- Avg Processing Time: {avg_processing_time:.2f}s\n"
                 f"- Memory Usage: {memory_usage:.2f}MB\n"
                 f"- Peak Memory: {metrics.peak_memory_usage:.2f}MB\n"
                 f"- Error Distribution: {error_distribution}\n"
                 f"- Queue Size: {len(queue)}\n"
-                f"- Processing Items: {len(processing)}"
+                f"- Processing Items: {len(processing)}\n"
+                f"- Last Activity: {(current_time - self._last_active_time):.1f}s ago"
             )

         except Exception as e:
@@ -142,26 +172,37 @@ class QueueMonitor:
             processing_lock: Lock for processing dict
         """
         try:
+            recovered = 0
+            failed = 0
+
             async with processing_lock:
                 for url, item in stuck_items:
-                    # Move to failed if max retries reached
-                    if item.retry_count >= self.max_retries:
-                        logger.warning(f"Moving stuck item to failed: {url}")
-                        item.status = "failed"
-                        item.error = "Exceeded maximum retries after being stuck"
-                        item.last_error = item.error
-                        item.last_error_time = datetime.utcnow()
-                        processing.pop(url)
-                    else:
-                        # Reset for retry
-                        logger.info(f"Recovering stuck item for retry: {url}")
-                        item.retry_count += 1
-                        item.processing_time = 0
-                        item.last_retry = datetime.utcnow()
-                        item.status = "pending"
-                        item.priority = max(0, item.priority - 2) # Lower priority
-                        queue.append(item)
-                        processing.pop(url)
+                    try:
+                        # Move to failed if max retries reached
+                        if item.retry_count >= self.max_retries:
+                            logger.warning(f"Moving stuck item to failed: {url}")
+                            item.status = "failed"
+                            item.error = "Exceeded maximum retries after being stuck"
+                            item.last_error = item.error
+                            item.last_error_time = datetime.utcnow()
+                            processing.pop(url)
+                            failed += 1
+                        else:
+                            # Reset for retry
+                            logger.info(f"Recovering stuck item for retry: {url}")
+                            item.retry_count += 1
+                            item.start_time = None
+                            item.processing_time = 0
+                            item.last_retry = datetime.utcnow()
+                            item.status = "pending"
+                            item.priority = max(0, item.priority - 2) # Lower priority
+                            queue.append(item)
+                            processing.pop(url)
+                            recovered += 1
+                    except Exception as e:
+                        logger.error(f"Error recovering item {url}: {str(e)}")

+            logger.info(f"Recovery complete - Recovered: {recovered}, Failed: {failed}")

         except Exception as e:
             logger.error(f"Error recovering stuck items: {str(e)}")