Mirror of https://github.com/pacnpal/Pac-cogs.git (synced 2025-12-23 12:21:05 -05:00)
Eliminating duplicate queue processing that was causing race conditions

- Adding proper processing state tracking and timing
- Implementing more aggressive monitoring (1-minute intervals)
- Adding activity tracking to detect and recover from hung states
- Improving error handling and logging throughout the system
- Reducing timeouts and deadlock thresholds for faster recovery
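As a standalone illustration of the activity tracking and hung-state recovery described above (not code from this commit), the sketch below shows the watchdog pattern: record when each item starts processing, and requeue anything that has been in flight longer than the deadlock threshold. The dictionary and list are illustrative stand-ins for the cog's processing map and pending queue; only the 300-second threshold and 60-second check interval are taken from the diff that follows.

# Hedged sketch of the watchdog pattern only; the names and data structures
# here are illustrative stand-ins, not the cog's actual classes.
import asyncio
import time

DEADLOCK_THRESHOLD = 300  # seconds, matching the new 5-minute default in this commit
CHECK_INTERVAL = 60       # seconds, matching the new 1-minute monitor interval

async def watchdog(processing: dict, pending: list) -> None:
    """Requeue items whose processing started too long ago."""
    while True:
        now = time.time()
        for url, started_at in list(processing.items()):
            if now - started_at > DEADLOCK_THRESHOLD:
                processing.pop(url)   # treat the item as stuck
                pending.append(url)   # hand it back for retry
        await asyncio.sleep(CHECK_INTERVAL)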
@@ -2,6 +2,7 @@
import asyncio
import logging
import time
from typing import Dict, Optional, Set, Tuple, Callable, Any, List
from datetime import datetime
@@ -28,7 +29,8 @@ class EnhancedVideoQueueManager:
        max_history_age: int = 86400,  # 24 hours
        persistence_path: Optional[str] = None,
        backup_interval: int = 300,  # 5 minutes
        deadlock_threshold: int = 900,  # 15 minutes
        deadlock_threshold: int = 300,  # 5 minutes
        check_interval: int = 60,  # 1 minute
    ):
        # Configuration
        self.max_retries = max_retries
@@ -58,7 +60,8 @@ class EnhancedVideoQueueManager:
        self.persistence = QueuePersistenceManager(persistence_path) if persistence_path else None
        self.monitor = QueueMonitor(
            deadlock_threshold=deadlock_threshold,
            max_retries=max_retries
            max_retries=max_retries,
            check_interval=check_interval
        )
        self.cleaner = QueueCleaner(
            cleanup_interval=cleanup_interval,
@@ -80,6 +83,7 @@ class EnhancedVideoQueueManager:
            )
        )
        self._active_tasks.add(monitor_task)
        logger.info("Queue monitoring started")

        # Start cleanup
        cleanup_task = asyncio.create_task(
@@ -95,6 +99,7 @@ class EnhancedVideoQueueManager:
            )
        )
        self._active_tasks.add(cleanup_task)
        logger.info("Queue cleanup started")

        # Load persisted state if available
        if self.persistence:
@@ -120,6 +125,7 @@ class EnhancedVideoQueueManager:
                self.metrics.compression_failures = metrics_data.get("compression_failures", 0)
                self.metrics.hardware_accel_failures = metrics_data.get("hardware_accel_failures", 0)

                logger.info("Loaded persisted queue state")
            except Exception as e:
                logger.error(f"Failed to load persisted state: {e}")
@@ -141,8 +147,6 @@ class EnhancedVideoQueueManager:
                if self._queue:
                    item = self._queue.pop(0)
                    self._processing[item.url] = item
                    item.status = "processing"
                    item.processing_time = 0.0

                if not item:
                    await asyncio.sleep(1)
@@ -151,20 +155,19 @@ class EnhancedVideoQueueManager:
            try:
                # Process the item
                logger.info(f"Processing queue item: {item.url}")
                item.start_processing()  # Start processing tracking
                self.metrics.last_activity_time = time.time()  # Update activity time

                success, error = await processor(item)

                # Update metrics and status
                async with self._processing_lock:
                    item.finish_processing(success, error)  # Update item status

                    if success:
                        item.status = "completed"
                        self._completed[item.url] = item
                        logger.info(f"Successfully processed: {item.url}")
                    else:
                        item.status = "failed"
                        item.error = error
                        item.last_error = error
                        item.last_error_time = datetime.utcnow()

                        if item.retry_count < self.max_retries:
                            item.retry_count += 1
                            item.status = "pending"
@@ -177,16 +180,25 @@ class EnhancedVideoQueueManager:
                            logger.error(f"Failed after {self.max_retries} attempts: {item.url}")

                    self._processing.pop(item.url, None)

                    # Update metrics
                    self.metrics.update(
                        processing_time=item.processing_time,
                        success=success,
                        error=error
                    )

            except Exception as e:
                logger.error(f"Error processing {item.url}: {e}")
                async with self._processing_lock:
                    item.status = "failed"
                    item.error = str(e)
                    item.last_error = str(e)
                    item.last_error_time = datetime.utcnow()
                    item.finish_processing(False, str(e))
                    self._failed[item.url] = item
                    self._processing.pop(item.url, None)
                    self.metrics.update(
                        processing_time=item.processing_time,
                        success=False,
                        error=str(e)
                    )

            # Persist state if enabled
            if self.persistence:
@@ -215,22 +227,7 @@ class EnhancedVideoQueueManager:
        author_id: int,
        priority: int = 0,
    ) -> bool:
        """Add a video to the processing queue

        Args:
            url: Video URL
            message_id: Discord message ID
            channel_id: Discord channel ID
            guild_id: Discord guild ID
            author_id: Discord author ID
            priority: Queue priority (higher = higher priority)

        Returns:
            True if added successfully

        Raises:
            QueueError: If queue is full or shutting down
        """
        """Add a video to the processing queue"""
        if self._shutdown:
            raise QueueError("Queue manager is shutting down")
@@ -262,6 +259,9 @@ class EnhancedVideoQueueManager:
                self._queue.append(item)
                self._queue.sort(key=lambda x: (-x.priority, x.added_at))

                # Update activity time
                self.metrics.last_activity_time = time.time()

            if self.persistence:
                await self.persistence.persist_queue_state(
                    self._queue,
@@ -279,14 +279,7 @@ class EnhancedVideoQueueManager:
            raise QueueError(f"Failed to add to queue: {str(e)}")

    def get_queue_status(self, guild_id: int) -> dict:
        """Get current queue status for a guild

        Args:
            guild_id: Discord guild ID

        Returns:
            Dict containing queue status and metrics
        """
        """Get current queue status for a guild"""
        try:
            pending = len([item for item in self._queue if item.guild_id == guild_id])
            processing = len([item for item in self._processing.values() if item.guild_id == guild_id])
@@ -308,6 +301,7 @@ class EnhancedVideoQueueManager:
                    "errors_by_type": self.metrics.errors_by_type,
                    "compression_failures": self.metrics.compression_failures,
                    "hardware_accel_failures": self.metrics.hardware_accel_failures,
                    "last_activity": time.time() - self.metrics.last_activity_time,
                },
            }
@@ -328,21 +322,12 @@ class EnhancedVideoQueueManager:
                    "errors_by_type": {},
                    "compression_failures": 0,
                    "hardware_accel_failures": 0,
                    "last_activity": 0,
                },
            }

    async def clear_guild_queue(self, guild_id: int) -> int:
        """Clear all queue items for a guild

        Args:
            guild_id: Discord guild ID

        Returns:
            Number of items cleared

        Raises:
            QueueError: If queue is shutting down
        """
        """Clear all queue items for a guild"""
        if self._shutdown:
            raise QueueError("Queue manager is shutting down")
@@ -377,6 +362,7 @@ class EnhancedVideoQueueManager:
        """Clean up resources and stop queue processing"""
        try:
            self._shutdown = True
            logger.info("Starting queue manager cleanup...")

            # Stop monitoring and cleanup tasks
            self.monitor.stop_monitoring()
@@ -428,6 +414,7 @@ class EnhancedVideoQueueManager:
    def force_stop(self) -> None:
        """Force stop all queue operations immediately"""
        self._shutdown = True
        logger.info("Force stopping queue manager...")

        # Stop monitoring and cleanup
        self.monitor.stop_monitoring()
@@ -1,6 +1,7 @@
"""Data models for the queue system"""

import logging
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Dict, Optional, List, Any
@@ -22,14 +23,16 @@ class QueueItem:
    guild_id: int  # Discord ID
    added_at: datetime = field(default_factory=datetime.utcnow)
    status: str = "pending"
    retry_count: int = 0  # Changed from retries to retry_count
    priority: int = 0  # Added priority field with default value 0
    retry_count: int = 0
    priority: int = 0
    last_retry: Optional[datetime] = None
    last_error: Optional[str] = None
    last_error_time: Optional[datetime] = None
    start_time: Optional[float] = None  # Added start_time for processing tracking
    processing_time: float = 0.0
    output_path: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)
    error: Optional[str] = None  # Added error field for current error

    def __post_init__(self):
        """Convert string dates to datetime objects after initialization"""
@@ -57,6 +60,29 @@ class QueueItem:
        elif not isinstance(self.last_error_time, datetime):
            self.last_error_time = None

    def start_processing(self) -> None:
        """Mark item as started processing"""
        self.status = "processing"
        self.start_time = time.time()
        self.processing_time = 0.0
        self.error = None

    def finish_processing(self, success: bool, error: Optional[str] = None) -> None:
        """Mark item as finished processing"""
        end_time = time.time()
        if self.start_time:
            self.processing_time = end_time - self.start_time

        if success:
            self.status = "completed"
        else:
            self.status = "failed"
            self.error = error
            self.last_error = error
            self.last_error_time = datetime.utcnow()

        self.start_time = None

    def to_dict(self) -> dict:
        """Convert to dictionary with datetime handling"""
        data = asdict(self)
@@ -91,6 +117,7 @@ class QueueMetrics:
    processing_times: List[float] = field(default_factory=list)
    compression_failures: int = 0
    hardware_accel_failures: int = 0
    last_activity_time: float = field(default_factory=time.time)  # Added activity tracking

    def __post_init__(self):
        """Convert string dates to datetime objects after initialization"""
@@ -115,6 +142,8 @@ class QueueMetrics:
    def update(self, processing_time: float, success: bool, error: str = None):
        """Update metrics with new processing information"""
        self.total_processed += 1
        self.last_activity_time = time.time()  # Update activity timestamp

        if not success:
            self.total_failed += 1
            if error:
@@ -19,14 +19,17 @@ class QueueMonitor:
    def __init__(
        self,
        deadlock_threshold: int = 900,  # 15 minutes
        memory_threshold: int = 1024,  # 1GB
        max_retries: int = 3
        deadlock_threshold: int = 300,  # 5 minutes
        memory_threshold: int = 512,  # 512MB
        max_retries: int = 3,
        check_interval: int = 60  # Check every minute
    ):
        self.deadlock_threshold = deadlock_threshold
        self.memory_threshold = memory_threshold
        self.max_retries = max_retries
        self.check_interval = check_interval
        self._shutdown = False
        self._last_active_time = time.time()

    async def start_monitoring(
        self,
@@ -43,21 +46,28 @@ class QueueMonitor:
            metrics: Reference to queue metrics
            processing_lock: Lock for processing dict
        """
        logger.info("Starting queue monitoring...")
        while not self._shutdown:
            try:
                await self._check_health(queue, processing, metrics, processing_lock)
                await asyncio.sleep(300)  # Check every 5 minutes
                await asyncio.sleep(self.check_interval)

            except asyncio.CancelledError:
                logger.info("Queue monitoring cancelled")
                break
            except Exception as e:
                logger.error(f"Error in health monitor: {str(e)}")
                await asyncio.sleep(60)
                await asyncio.sleep(30)  # Shorter sleep on error

    def stop_monitoring(self) -> None:
        """Stop the monitoring process"""
        logger.info("Stopping queue monitoring...")
        self._shutdown = True

    def update_activity(self) -> None:
        """Update the last active time"""
        self._last_active_time = time.time()

    async def _check_health(
        self,
        queue: List[QueueItem],
@@ -74,6 +84,8 @@ class QueueMonitor:
            processing_lock: Lock for processing dict
        """
        try:
            current_time = time.time()

            # Check memory usage
            process = psutil.Process()
            memory_usage = process.memory_info().rss / 1024 / 1024  # MB
@@ -83,18 +95,22 @@ class QueueMonitor:
                # Force garbage collection
                import gc
                gc.collect()
                memory_after = process.memory_info().rss / 1024 / 1024
                logger.info(f"Memory after GC: {memory_after:.2f}MB")

            # Check for potential deadlocks
            current_time = time.time()
            processing_times = []
            stuck_items = []

            for url, item in processing.items():
                if isinstance(item.processing_time, (int, float)) and item.processing_time > 0:
                    processing_time = current_time - item.processing_time
                    processing_times.append(processing_time)
                    if processing_time > self.deadlock_threshold:
                        stuck_items.append((url, item))
            async with processing_lock:
                for url, item in processing.items():
                    # Check if item has started processing
                    if hasattr(item, 'start_time') and item.start_time:
                        processing_time = current_time - item.start_time
                        processing_times.append(processing_time)
                        if processing_time > self.deadlock_threshold:
                            stuck_items.append((url, item))
                            logger.warning(f"Item stuck in processing: {url} for {processing_time:.1f}s")

            if stuck_items:
                logger.warning(
|
||||
stuck_items, queue, processing, processing_lock
|
||||
)
|
||||
|
||||
# Check overall queue activity
|
||||
if processing and current_time - self._last_active_time > self.deadlock_threshold:
|
||||
logger.warning("Queue appears to be hung - no activity detected")
|
||||
# Force recovery of all processing items
|
||||
async with processing_lock:
|
||||
all_items = list(processing.items())
|
||||
await self._recover_stuck_items(
|
||||
all_items, queue, processing, processing_lock
|
||||
)
|
||||
self._last_active_time = current_time
|
||||
|
||||
# Calculate and log metrics
|
||||
success_rate = metrics.success_rate
|
||||
error_distribution = metrics.errors_by_type
|
||||
@@ -112,14 +139,17 @@ class QueueMonitor:
|
||||
# Update peak memory usage
|
||||
metrics.peak_memory_usage = max(metrics.peak_memory_usage, memory_usage)
|
||||
|
||||
# Log detailed metrics
|
||||
logger.info(
|
||||
f"Queue Health Metrics:\n"
|
||||
f"- Success Rate: {success_rate:.2%}\n"
|
||||
f"- Avg Processing Time: {avg_processing_time:.2f}s\n"
|
||||
f"- Memory Usage: {memory_usage:.2f}MB\n"
|
||||
f"- Peak Memory: {metrics.peak_memory_usage:.2f}MB\n"
|
||||
f"- Error Distribution: {error_distribution}\n"
|
||||
f"- Queue Size: {len(queue)}\n"
|
||||
f"- Processing Items: {len(processing)}"
|
||||
f"- Processing Items: {len(processing)}\n"
|
||||
f"- Last Activity: {(current_time - self._last_active_time):.1f}s ago"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@@ -142,26 +172,37 @@ class QueueMonitor:
            processing_lock: Lock for processing dict
        """
        try:
            recovered = 0
            failed = 0

            async with processing_lock:
                for url, item in stuck_items:
                    # Move to failed if max retries reached
                    if item.retry_count >= self.max_retries:
                        logger.warning(f"Moving stuck item to failed: {url}")
                        item.status = "failed"
                        item.error = "Exceeded maximum retries after being stuck"
                        item.last_error = item.error
                        item.last_error_time = datetime.utcnow()
                        processing.pop(url)
                    else:
                        # Reset for retry
                        logger.info(f"Recovering stuck item for retry: {url}")
                        item.retry_count += 1
                        item.processing_time = 0
                        item.last_retry = datetime.utcnow()
                        item.status = "pending"
                        item.priority = max(0, item.priority - 2)  # Lower priority
                        queue.append(item)
                        processing.pop(url)
                    try:
                        # Move to failed if max retries reached
                        if item.retry_count >= self.max_retries:
                            logger.warning(f"Moving stuck item to failed: {url}")
                            item.status = "failed"
                            item.error = "Exceeded maximum retries after being stuck"
                            item.last_error = item.error
                            item.last_error_time = datetime.utcnow()
                            processing.pop(url)
                            failed += 1
                        else:
                            # Reset for retry
                            logger.info(f"Recovering stuck item for retry: {url}")
                            item.retry_count += 1
                            item.start_time = None
                            item.processing_time = 0
                            item.last_retry = datetime.utcnow()
                            item.status = "pending"
                            item.priority = max(0, item.priority - 2)  # Lower priority
                            queue.append(item)
                            processing.pop(url)
                            recovered += 1
                    except Exception as e:
                        logger.error(f"Error recovering item {url}: {str(e)}")

            logger.info(f"Recovery complete - Recovered: {recovered}, Failed: {failed}")

        except Exception as e:
            logger.error(f"Error recovering stuck items: {str(e)}")