Core Systems:

- Component-based architecture with lifecycle management
- Enhanced error handling and recovery mechanisms
- Comprehensive state management and tracking
- Event-driven architecture with monitoring

Queue Management:

- Multiple processing strategies for different scenarios
- Advanced state management with recovery
- Comprehensive metrics and health monitoring
- Sophisticated cleanup system with multiple strategies

Processing Pipeline:

- Enhanced message handling with validation
- Improved URL extraction and processing
- Better queue management and monitoring
- Advanced cleanup mechanisms

Overall Benefits:

- Better code organization and maintainability
- Improved error handling and recovery
- Enhanced monitoring and reporting
- More robust and reliable system
Author: pacnpal
Date: 2024-11-16 05:01:29 +00:00
parent 537a325807
commit a4ca6e8ea6
47 changed files with 11085 additions and 2110 deletions


@@ -0,0 +1,500 @@
"""Module for cleaning guild-specific queue items"""
import logging
import asyncio
from enum import Enum
from dataclasses import dataclass, field
from typing import Dict, List, Set, Tuple, Any, Optional
from datetime import datetime
from ..models import QueueItem
logger = logging.getLogger("GuildCleaner")
class GuildCleanupStrategy(Enum):
"""Guild cleanup strategies"""
FULL = "full" # Clear all guild items
SELECTIVE = "selective" # Clear only specific categories
GRACEFUL = "graceful" # Clear with grace period
class CleanupCategory(Enum):
"""Categories for cleanup"""
QUEUE = "queue"
PROCESSING = "processing"
COMPLETED = "completed"
FAILED = "failed"
TRACKING = "tracking"
@dataclass
class GuildCleanupConfig:
"""Configuration for guild cleanup"""
categories: Set[CleanupCategory] = field(default_factory=lambda: set(CleanupCategory))
grace_period: int = 300 # 5 minutes
preserve_completed: bool = False
preserve_failed: bool = False
batch_size: int = 100
@dataclass
class GuildCleanupResult:
"""Result of a guild cleanup operation"""
guild_id: int
timestamp: datetime
strategy: GuildCleanupStrategy
items_cleared: int
categories_cleared: Set[CleanupCategory]
initial_counts: Dict[str, int]
final_counts: Dict[str, int]
duration: float
error: Optional[str] = None
class GuildCleanupTracker:
"""Tracks guild cleanup operations"""
def __init__(self, max_history: int = 1000):
self.max_history = max_history
self.history: List[GuildCleanupResult] = []
self.cleanup_counts: Dict[int, int] = {} # guild_id -> count
self.total_items_cleared = 0
self.last_cleanup: Optional[datetime] = None
def record_cleanup(self, result: GuildCleanupResult) -> None:
"""Record a cleanup operation"""
self.history.append(result)
if len(self.history) > self.max_history:
self.history.pop(0)
self.cleanup_counts[result.guild_id] = (
self.cleanup_counts.get(result.guild_id, 0) + 1
)
self.total_items_cleared += result.items_cleared
self.last_cleanup = result.timestamp
def get_stats(self) -> Dict[str, Any]:
"""Get cleanup statistics"""
return {
"total_cleanups": len(self.history),
"total_items_cleared": self.total_items_cleared,
"guilds_cleaned": len(self.cleanup_counts),
"last_cleanup": (
self.last_cleanup.isoformat()
if self.last_cleanup
else None
),
"recent_cleanups": [
{
"guild_id": r.guild_id,
"timestamp": r.timestamp.isoformat(),
"strategy": r.strategy.value,
"items_cleared": r.items_cleared,
"categories": [c.value for c in r.categories_cleared]
}
for r in self.history[-5:] # Last 5 cleanups
]
}
class GuildCleaner:
"""Handles cleanup of guild-specific queue items"""
def __init__(
self,
strategy: GuildCleanupStrategy = GuildCleanupStrategy.GRACEFUL,
config: Optional[GuildCleanupConfig] = None
):
self.strategy = strategy
self.config = config or GuildCleanupConfig()
self.tracker = GuildCleanupTracker()
async def clear_guild_items(
self,
guild_id: int,
queue: List[QueueItem],
processing: Dict[str, QueueItem],
completed: Dict[str, QueueItem],
failed: Dict[str, QueueItem],
guild_queues: Dict[int, Set[str]],
channel_queues: Dict[int, Set[str]]
) -> Tuple[int, Dict[str, int]]:
"""Clear all queue items for a specific guild"""
start_time = datetime.utcnow()
cleared_categories = set()
try:
# Get initial counts
initial_counts = self._get_item_counts(
guild_id,
queue,
processing,
completed,
failed
)
# Clear items based on strategy
cleared_count = 0
if self.strategy == GuildCleanupStrategy.FULL:
cleared_count = await self._full_cleanup(
guild_id,
queue,
processing,
completed,
failed,
guild_queues,
channel_queues,
cleared_categories
)
elif self.strategy == GuildCleanupStrategy.SELECTIVE:
cleared_count = await self._selective_cleanup(
guild_id,
queue,
processing,
completed,
failed,
guild_queues,
channel_queues,
cleared_categories
)
else: # GRACEFUL
cleared_count = await self._graceful_cleanup(
guild_id,
queue,
processing,
completed,
failed,
guild_queues,
channel_queues,
cleared_categories
)
# Get final counts
final_counts = self._get_item_counts(
guild_id,
queue,
processing,
completed,
failed
)
# Record cleanup result
duration = (datetime.utcnow() - start_time).total_seconds()
result = GuildCleanupResult(
guild_id=guild_id,
timestamp=datetime.utcnow(),
strategy=self.strategy,
items_cleared=cleared_count,
categories_cleared=cleared_categories,
initial_counts=initial_counts,
final_counts=final_counts,
duration=duration
)
self.tracker.record_cleanup(result)
logger.info(self.format_guild_cleanup_report(
guild_id,
initial_counts,
final_counts,
duration
))
return cleared_count, initial_counts
except Exception as e:
logger.error(f"Error clearing guild {guild_id} queue: {e}")
self.tracker.record_cleanup(GuildCleanupResult(
guild_id=guild_id,
timestamp=datetime.utcnow(),
strategy=self.strategy,
items_cleared=0,
categories_cleared=set(),
initial_counts={},
final_counts={},
duration=0,
error=str(e)
))
raise
async def _full_cleanup(
self,
guild_id: int,
queue: List[QueueItem],
processing: Dict[str, QueueItem],
completed: Dict[str, QueueItem],
failed: Dict[str, QueueItem],
guild_queues: Dict[int, Set[str]],
channel_queues: Dict[int, Set[str]],
cleared_categories: Set[CleanupCategory]
) -> int:
"""Perform full cleanup"""
cleared_count = 0
            # Clear from pending queue (count removed items, not survivors)
            original_queue_size = len(queue)
            queue[:] = [item for item in queue if item.guild_id != guild_id]
            cleared_count += original_queue_size - len(queue)
cleared_categories.add(CleanupCategory.QUEUE)
# Clear from processing
cleared = await self._clear_from_dict(
processing, guild_id, 'processing'
)
cleared_count += cleared
cleared_categories.add(CleanupCategory.PROCESSING)
# Clear from completed
cleared = await self._clear_from_dict(
completed, guild_id, 'completed'
)
cleared_count += cleared
cleared_categories.add(CleanupCategory.COMPLETED)
# Clear from failed
cleared = await self._clear_from_dict(
failed, guild_id, 'failed'
)
cleared_count += cleared
cleared_categories.add(CleanupCategory.FAILED)
# Clear tracking
cleared = await self._clear_tracking(
guild_id,
guild_queues,
channel_queues
)
cleared_count += cleared
cleared_categories.add(CleanupCategory.TRACKING)
return cleared_count
async def _selective_cleanup(
self,
guild_id: int,
queue: List[QueueItem],
processing: Dict[str, QueueItem],
completed: Dict[str, QueueItem],
failed: Dict[str, QueueItem],
guild_queues: Dict[int, Set[str]],
channel_queues: Dict[int, Set[str]],
cleared_categories: Set[CleanupCategory]
) -> int:
"""Perform selective cleanup"""
cleared_count = 0
# Clear only configured categories
if CleanupCategory.QUEUE in self.config.categories:
            original_queue_size = len(queue)
            queue[:] = [item for item in queue if item.guild_id != guild_id]
            cleared_count += original_queue_size - len(queue)
cleared_categories.add(CleanupCategory.QUEUE)
if CleanupCategory.PROCESSING in self.config.categories:
cleared = await self._clear_from_dict(
processing, guild_id, 'processing'
)
cleared_count += cleared
cleared_categories.add(CleanupCategory.PROCESSING)
if (
CleanupCategory.COMPLETED in self.config.categories and
not self.config.preserve_completed
):
cleared = await self._clear_from_dict(
completed, guild_id, 'completed'
)
cleared_count += cleared
cleared_categories.add(CleanupCategory.COMPLETED)
if (
CleanupCategory.FAILED in self.config.categories and
not self.config.preserve_failed
):
cleared = await self._clear_from_dict(
failed, guild_id, 'failed'
)
cleared_count += cleared
cleared_categories.add(CleanupCategory.FAILED)
if CleanupCategory.TRACKING in self.config.categories:
cleared = await self._clear_tracking(
guild_id,
guild_queues,
channel_queues
)
cleared_count += cleared
cleared_categories.add(CleanupCategory.TRACKING)
return cleared_count
async def _graceful_cleanup(
self,
guild_id: int,
queue: List[QueueItem],
processing: Dict[str, QueueItem],
completed: Dict[str, QueueItem],
failed: Dict[str, QueueItem],
guild_queues: Dict[int, Set[str]],
channel_queues: Dict[int, Set[str]],
cleared_categories: Set[CleanupCategory]
) -> int:
"""Perform graceful cleanup"""
cleared_count = 0
cutoff_time = datetime.utcnow().timestamp() - self.config.grace_period
# Clear queue items beyond grace period
        original_queue_size = len(queue)
        queue[:] = [
            item for item in queue
            if not (
                item.guild_id == guild_id and
                item.added_at.timestamp() < cutoff_time
            )
        ]
        cleared_count += original_queue_size - len(queue)
cleared_categories.add(CleanupCategory.QUEUE)
# Clear processing items beyond grace period
for url in list(processing.keys()):
item = processing[url]
if (
item.guild_id == guild_id and
item.added_at.timestamp() < cutoff_time
):
processing.pop(url)
cleared_count += 1
cleared_categories.add(CleanupCategory.PROCESSING)
# Clear completed and failed based on config
if not self.config.preserve_completed:
cleared = await self._clear_from_dict(
completed, guild_id, 'completed'
)
cleared_count += cleared
cleared_categories.add(CleanupCategory.COMPLETED)
if not self.config.preserve_failed:
cleared = await self._clear_from_dict(
failed, guild_id, 'failed'
)
cleared_count += cleared
cleared_categories.add(CleanupCategory.FAILED)
# Clear tracking
cleared = await self._clear_tracking(
guild_id,
guild_queues,
channel_queues
)
cleared_count += cleared
cleared_categories.add(CleanupCategory.TRACKING)
return cleared_count
async def _clear_from_dict(
self,
items_dict: Dict[str, QueueItem],
guild_id: int,
category: str
) -> int:
"""Clear guild items from a dictionary"""
cleared = 0
batch_count = 0
for url in list(items_dict.keys()):
if items_dict[url].guild_id == guild_id:
items_dict.pop(url)
cleared += 1
batch_count += 1
# Process in batches
if batch_count >= self.config.batch_size:
await asyncio.sleep(0) # Yield to event loop
batch_count = 0
logger.debug(f"Cleared {cleared} {category} items for guild {guild_id}")
return cleared
async def _clear_tracking(
self,
guild_id: int,
guild_queues: Dict[int, Set[str]],
channel_queues: Dict[int, Set[str]]
) -> int:
"""Clear guild tracking data"""
cleared = 0
guild_urls = guild_queues.get(guild_id, set())
# Clear guild tracking
if guild_id in guild_queues:
cleared += len(guild_queues[guild_id])
guild_queues.pop(guild_id)
# Clear channel tracking
await self._clear_channel_tracking(channel_queues, guild_urls)
return cleared
async def _clear_channel_tracking(
self,
channel_queues: Dict[int, Set[str]],
guild_urls: Set[str]
) -> None:
"""Clear channel tracking for guild URLs"""
batch_count = 0
for channel_id in list(channel_queues.keys()):
channel_queues[channel_id] = {
url for url in channel_queues[channel_id]
if url not in guild_urls
}
if not channel_queues[channel_id]:
channel_queues.pop(channel_id)
batch_count += 1
if batch_count >= self.config.batch_size:
await asyncio.sleep(0) # Yield to event loop
batch_count = 0
def _get_item_counts(
self,
guild_id: int,
queue: List[QueueItem],
processing: Dict[str, QueueItem],
completed: Dict[str, QueueItem],
failed: Dict[str, QueueItem]
) -> Dict[str, int]:
"""Get item counts for a guild"""
return {
'queue': len([item for item in queue if item.guild_id == guild_id]),
'processing': len([item for item in processing.values() if item.guild_id == guild_id]),
'completed': len([item for item in completed.values() if item.guild_id == guild_id]),
'failed': len([item for item in failed.values() if item.guild_id == guild_id])
}
def format_guild_cleanup_report(
self,
guild_id: int,
initial_counts: Dict[str, int],
final_counts: Dict[str, int],
duration: float
) -> str:
"""Format a guild cleanup report"""
return (
f"Guild {guild_id} Cleanup Results:\n"
f"Strategy: {self.strategy.value}\n"
f"Duration: {duration:.2f}s\n"
f"Items:\n"
f"- Queue: {initial_counts['queue']} -> {final_counts['queue']}\n"
f"- Processing: {initial_counts['processing']} -> {final_counts['processing']}\n"
f"- Completed: {initial_counts['completed']} -> {final_counts['completed']}\n"
f"- Failed: {initial_counts['failed']} -> {final_counts['failed']}\n"
f"Total cleared: {sum(initial_counts.values()) - sum(final_counts.values())} items"
)
def get_cleaner_stats(self) -> Dict[str, Any]:
"""Get comprehensive cleaner statistics"""
return {
"strategy": self.strategy.value,
"config": {
"categories": [c.value for c in self.config.categories],
"grace_period": self.config.grace_period,
"preserve_completed": self.config.preserve_completed,
"preserve_failed": self.config.preserve_failed,
"batch_size": self.config.batch_size
},
"tracker": self.tracker.get_stats()
}
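A minimal usage sketch for the guild cleaner above (not part of the diff): the guild ID and the empty in-memory structures are stand-ins matching the `clear_guild_items()` signature, assuming the module's classes are in scope.

# Illustrative only: empty stand-in structures matching clear_guild_items().
import asyncio

async def demo_guild_cleaner() -> None:
    cleaner = GuildCleaner(strategy=GuildCleanupStrategy.FULL)
    queue, processing, completed, failed = [], {}, {}, {}
    guild_queues, channel_queues = {}, {}
    cleared, initial_counts = await cleaner.clear_guild_items(
        guild_id=1234,  # hypothetical guild ID
        queue=queue,
        processing=processing,
        completed=completed,
        failed=failed,
        guild_queues=guild_queues,
        channel_queues=channel_queues,
    )
    print(f"cleared {cleared} items, starting from {initial_counts}")

asyncio.run(demo_guild_cleaner())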


@@ -0,0 +1,336 @@
"""Module for cleaning historical queue items"""
import logging
from enum import Enum
from dataclasses import dataclass, field
from typing import Dict, Optional, List, Any, Set
from datetime import datetime, timedelta
from ..models import QueueItem
logger = logging.getLogger("HistoryCleaner")
class CleanupStrategy(Enum):
"""Cleanup strategies"""
AGGRESSIVE = "aggressive" # Remove more aggressively
CONSERVATIVE = "conservative" # Remove conservatively
BALANCED = "balanced" # Balance between retention and cleanup
class CleanupPolicy(Enum):
"""Cleanup policies"""
AGE = "age" # Clean based on age
SIZE = "size" # Clean based on size
HYBRID = "hybrid" # Consider both age and size
@dataclass
class CleanupThresholds:
"""Thresholds for cleanup operations"""
max_history_age: int = 43200 # 12 hours
max_completed_items: int = 10000
max_failed_items: int = 5000
min_retention_time: int = 3600 # 1 hour
size_threshold: int = 100 * 1024 * 1024 # 100MB
@dataclass
class CleanupResult:
"""Result of a cleanup operation"""
timestamp: datetime
items_cleaned: int
space_freed: int
duration: float
strategy: CleanupStrategy
policy: CleanupPolicy
details: Dict[str, Any] = field(default_factory=dict)
class CleanupTracker:
"""Tracks cleanup operations"""
def __init__(self, max_history: int = 1000):
self.max_history = max_history
self.history: List[CleanupResult] = []
self.total_items_cleaned = 0
self.total_space_freed = 0
self.last_cleanup: Optional[datetime] = None
def record_cleanup(self, result: CleanupResult) -> None:
"""Record a cleanup operation"""
self.history.append(result)
if len(self.history) > self.max_history:
self.history.pop(0)
self.total_items_cleaned += result.items_cleaned
self.total_space_freed += result.space_freed
self.last_cleanup = result.timestamp
def get_stats(self) -> Dict[str, Any]:
"""Get cleanup statistics"""
return {
"total_cleanups": len(self.history),
"total_items_cleaned": self.total_items_cleaned,
"total_space_freed": self.total_space_freed,
"last_cleanup": (
self.last_cleanup.isoformat()
if self.last_cleanup
else None
),
"recent_cleanups": [
{
"timestamp": r.timestamp.isoformat(),
"items_cleaned": r.items_cleaned,
"space_freed": r.space_freed,
"strategy": r.strategy.value,
"policy": r.policy.value
}
for r in self.history[-5:] # Last 5 cleanups
]
}
class HistoryCleaner:
"""Handles cleanup of historical queue items"""
def __init__(
self,
strategy: CleanupStrategy = CleanupStrategy.BALANCED,
policy: CleanupPolicy = CleanupPolicy.HYBRID,
thresholds: Optional[CleanupThresholds] = None
):
self.strategy = strategy
self.policy = policy
self.thresholds = thresholds or CleanupThresholds()
self.tracker = CleanupTracker()
    def _normalize_datetime(self, dt_value: Any) -> datetime:
"""Normalize a datetime value"""
current_time = datetime.utcnow()
if not isinstance(dt_value, datetime):
try:
if isinstance(dt_value, str):
return datetime.fromisoformat(dt_value)
else:
return current_time
except (ValueError, TypeError):
return current_time
return dt_value
async def cleanup_completed(
self,
completed: Dict[str, QueueItem],
cleanup_cutoff: datetime
) -> int:
"""Clean up completed items"""
start_time = datetime.utcnow()
items_cleaned = 0
space_freed = 0
try:
# Determine cleanup approach based on strategy and policy
if self.policy == CleanupPolicy.SIZE:
items_to_clean = self._get_items_by_size(completed)
elif self.policy == CleanupPolicy.HYBRID:
items_to_clean = self._get_items_hybrid(completed, cleanup_cutoff)
else: # AGE policy
items_to_clean = self._get_items_by_age(completed, cleanup_cutoff)
# Clean items
for url in items_to_clean:
try:
item = completed[url]
space_freed += self._estimate_item_size(item)
completed.pop(url)
items_cleaned += 1
                except Exception as e:
                    logger.error(f"Error cleaning completed item {url}: {e}")
                    completed.pop(url, None)
                    items_cleaned += 1
# Record cleanup
self._record_cleanup_result(
items_cleaned,
space_freed,
start_time,
"completed"
)
logger.debug(f"Cleaned {items_cleaned} completed items")
return items_cleaned
except Exception as e:
logger.error(f"Error during completed items cleanup: {e}")
return 0
async def cleanup_failed(
self,
failed: Dict[str, QueueItem],
cleanup_cutoff: datetime
) -> int:
"""Clean up failed items"""
start_time = datetime.utcnow()
items_cleaned = 0
space_freed = 0
try:
# Determine cleanup approach
if self.policy == CleanupPolicy.SIZE:
items_to_clean = self._get_items_by_size(failed)
elif self.policy == CleanupPolicy.HYBRID:
items_to_clean = self._get_items_hybrid(failed, cleanup_cutoff)
else: # AGE policy
items_to_clean = self._get_items_by_age(failed, cleanup_cutoff)
# Clean items
for url in items_to_clean:
try:
item = failed[url]
space_freed += self._estimate_item_size(item)
failed.pop(url)
items_cleaned += 1
                except Exception as e:
                    logger.error(f"Error cleaning failed item {url}: {e}")
                    failed.pop(url, None)
                    items_cleaned += 1
# Record cleanup
self._record_cleanup_result(
items_cleaned,
space_freed,
start_time,
"failed"
)
logger.debug(f"Cleaned {items_cleaned} failed items")
return items_cleaned
except Exception as e:
logger.error(f"Error during failed items cleanup: {e}")
return 0
def _get_items_by_age(
self,
items: Dict[str, QueueItem],
cutoff: datetime
) -> Set[str]:
"""Get items to clean based on age"""
to_clean = set()
for url, item in items.items():
item.added_at = self._normalize_datetime(item.added_at)
if item.added_at < cutoff:
to_clean.add(url)
return to_clean
def _get_items_by_size(self, items: Dict[str, QueueItem]) -> Set[str]:
"""Get items to clean based on size"""
to_clean = set()
total_size = 0
        # Sort by estimated size, largest first; once the running total
        # exceeds size_threshold, the remaining (smaller) items are marked
        # for cleanup, so the largest items within the budget are retained
sorted_items = sorted(
items.items(),
key=lambda x: self._estimate_item_size(x[1]),
reverse=True
)
for url, item in sorted_items:
total_size += self._estimate_item_size(item)
if total_size > self.thresholds.size_threshold:
to_clean.add(url)
return to_clean
def _get_items_hybrid(
self,
items: Dict[str, QueueItem],
cutoff: datetime
) -> Set[str]:
"""Get items to clean using hybrid approach"""
by_age = self._get_items_by_age(items, cutoff)
by_size = self._get_items_by_size(items)
if self.strategy == CleanupStrategy.AGGRESSIVE:
return by_age.union(by_size)
elif self.strategy == CleanupStrategy.CONSERVATIVE:
return by_age.intersection(by_size)
else: # BALANCED
return by_age
def _estimate_item_size(self, item: QueueItem) -> int:
"""Estimate size of an item in bytes"""
# This could be enhanced with actual file size tracking
base_size = 1024 # 1KB base size
return base_size * (item.retry_count + 1)
def _record_cleanup_result(
self,
items_cleaned: int,
space_freed: int,
start_time: datetime,
cleanup_type: str
) -> None:
"""Record cleanup result"""
duration = (datetime.utcnow() - start_time).total_seconds()
result = CleanupResult(
timestamp=datetime.utcnow(),
items_cleaned=items_cleaned,
space_freed=space_freed,
duration=duration,
strategy=self.strategy,
policy=self.policy,
details={"type": cleanup_type}
)
self.tracker.record_cleanup(result)
def get_cleanup_cutoff(self) -> datetime:
"""Get the cutoff time for cleanup"""
if self.strategy == CleanupStrategy.AGGRESSIVE:
age = self.thresholds.max_history_age // 2
elif self.strategy == CleanupStrategy.CONSERVATIVE:
age = self.thresholds.max_history_age * 2
else: # BALANCED
age = self.thresholds.max_history_age
return datetime.utcnow() - timedelta(seconds=max(
age,
self.thresholds.min_retention_time
))
def format_cleanup_report(
self,
initial_completed: int,
final_completed: int,
initial_failed: int,
final_failed: int
) -> str:
"""Format a cleanup report"""
stats = self.tracker.get_stats()
return (
f"History Cleanup Results:\n"
f"- Completed items: {initial_completed} -> {final_completed}\n"
f"- Failed items: {initial_failed} -> {final_failed}\n"
f"- Total items cleaned: {(initial_completed - final_completed) + (initial_failed - final_failed)}\n"
f"- Space freed: {stats['total_space_freed']} bytes\n"
f"- Strategy: {self.strategy.value}\n"
f"- Policy: {self.policy.value}\n"
f"- Total cleanups: {stats['total_cleanups']}"
)
def get_cleaner_stats(self) -> Dict[str, Any]:
"""Get comprehensive cleaner statistics"""
return {
"strategy": self.strategy.value,
"policy": self.policy.value,
"thresholds": {
"max_history_age": self.thresholds.max_history_age,
"max_completed_items": self.thresholds.max_completed_items,
"max_failed_items": self.thresholds.max_failed_items,
"min_retention_time": self.thresholds.min_retention_time,
"size_threshold": self.thresholds.size_threshold
},
"tracker": self.tracker.get_stats()
}
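A hedged usage sketch for the history cleaner above (not part of the diff). With the default thresholds, the AGGRESSIVE strategy halves max_history_age, so the cutoff age is max(43200 // 2, 3600) = 21600 seconds (6 hours); the empty dicts are stand-ins for the real history maps.

# Illustrative only: empty history dicts as stand-ins.
import asyncio

async def demo_history_cleaner() -> None:
    cleaner = HistoryCleaner(
        strategy=CleanupStrategy.AGGRESSIVE,
        policy=CleanupPolicy.AGE,
    )
    cutoff = cleaner.get_cleanup_cutoff()  # now - 21600s with defaults
    completed = {}  # Dict[str, QueueItem]
    failed = {}     # Dict[str, QueueItem]
    await cleaner.cleanup_completed(completed, cutoff)
    await cleaner.cleanup_failed(failed, cutoff)
    print(cleaner.get_cleaner_stats())

asyncio.run(demo_history_cleaner())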


@@ -0,0 +1,452 @@
"""Module for cleaning queue tracking data"""
import logging
import asyncio
from enum import Enum
from dataclasses import dataclass
from typing import Dict, List, Set, Tuple, Any, Optional
from datetime import datetime
from ..models import QueueItem
logger = logging.getLogger("TrackingCleaner")
class TrackingCleanupStrategy(Enum):
"""Tracking cleanup strategies"""
AGGRESSIVE = "aggressive" # Remove all invalid entries
CONSERVATIVE = "conservative" # Keep recent invalid entries
BALANCED = "balanced" # Balance between cleanup and retention
class TrackingType(Enum):
"""Types of tracking data"""
GUILD = "guild"
CHANNEL = "channel"
URL = "url"
@dataclass
class TrackingCleanupConfig:
"""Configuration for tracking cleanup"""
batch_size: int = 100
retention_period: int = 3600 # 1 hour
validate_urls: bool = True
cleanup_empty: bool = True
max_invalid_ratio: float = 0.5 # 50% invalid threshold
@dataclass
class TrackingCleanupResult:
"""Result of a tracking cleanup operation"""
timestamp: datetime
strategy: TrackingCleanupStrategy
items_cleaned: int
guilds_cleaned: int
channels_cleaned: int
duration: float
initial_counts: Dict[str, int]
final_counts: Dict[str, int]
error: Optional[str] = None
class TrackingValidator:
"""Validates tracking data"""
@staticmethod
def validate_url(url: str) -> bool:
"""Validate URL format"""
try:
return bool(url and isinstance(url, str) and "://" in url)
except Exception:
return False
@staticmethod
def validate_id(id_value: int) -> bool:
"""Validate ID format"""
try:
return bool(isinstance(id_value, int) and id_value > 0)
except Exception:
return False
class TrackingCleanupTracker:
"""Tracks cleanup operations"""
def __init__(self, max_history: int = 1000):
self.max_history = max_history
self.history: List[TrackingCleanupResult] = []
self.total_items_cleaned = 0
self.total_guilds_cleaned = 0
self.total_channels_cleaned = 0
self.last_cleanup: Optional[datetime] = None
def record_cleanup(self, result: TrackingCleanupResult) -> None:
"""Record a cleanup operation"""
self.history.append(result)
if len(self.history) > self.max_history:
self.history.pop(0)
self.total_items_cleaned += result.items_cleaned
self.total_guilds_cleaned += result.guilds_cleaned
self.total_channels_cleaned += result.channels_cleaned
self.last_cleanup = result.timestamp
def get_stats(self) -> Dict[str, Any]:
"""Get cleanup statistics"""
return {
"total_cleanups": len(self.history),
"total_items_cleaned": self.total_items_cleaned,
"total_guilds_cleaned": self.total_guilds_cleaned,
"total_channels_cleaned": self.total_channels_cleaned,
"last_cleanup": (
self.last_cleanup.isoformat()
if self.last_cleanup
else None
),
"recent_cleanups": [
{
"timestamp": r.timestamp.isoformat(),
"strategy": r.strategy.value,
"items_cleaned": r.items_cleaned,
"guilds_cleaned": r.guilds_cleaned,
"channels_cleaned": r.channels_cleaned,
"duration": r.duration
}
for r in self.history[-5:] # Last 5 cleanups
]
}
class TrackingCleaner:
"""Handles cleanup of queue tracking data"""
def __init__(
self,
strategy: TrackingCleanupStrategy = TrackingCleanupStrategy.BALANCED,
config: Optional[TrackingCleanupConfig] = None
):
self.strategy = strategy
self.config = config or TrackingCleanupConfig()
self.tracker = TrackingCleanupTracker()
self.validator = TrackingValidator()
async def cleanup_tracking(
self,
guild_queues: Dict[int, Set[str]],
channel_queues: Dict[int, Set[str]],
queue: List[QueueItem],
processing: Dict[str, QueueItem]
) -> Tuple[int, Dict[str, int]]:
"""Clean up tracking data"""
start_time = datetime.utcnow()
try:
# Get initial counts
initial_counts = self._get_tracking_counts(
guild_queues,
channel_queues
)
# Get valid URLs
valid_urls = self._get_valid_urls(queue, processing)
# Clean tracking data based on strategy
items_cleaned = 0
guilds_cleaned = 0
channels_cleaned = 0
if self.strategy == TrackingCleanupStrategy.AGGRESSIVE:
cleaned = await self._aggressive_cleanup(
guild_queues,
channel_queues,
valid_urls
)
elif self.strategy == TrackingCleanupStrategy.CONSERVATIVE:
cleaned = await self._conservative_cleanup(
guild_queues,
channel_queues,
valid_urls
)
else: # BALANCED
cleaned = await self._balanced_cleanup(
guild_queues,
channel_queues,
valid_urls
)
items_cleaned = cleaned[0]
guilds_cleaned = cleaned[1]
channels_cleaned = cleaned[2]
# Get final counts
final_counts = self._get_tracking_counts(
guild_queues,
channel_queues
)
# Record cleanup result
duration = (datetime.utcnow() - start_time).total_seconds()
result = TrackingCleanupResult(
timestamp=datetime.utcnow(),
strategy=self.strategy,
items_cleaned=items_cleaned,
guilds_cleaned=guilds_cleaned,
channels_cleaned=channels_cleaned,
duration=duration,
initial_counts=initial_counts,
final_counts=final_counts
)
self.tracker.record_cleanup(result)
logger.info(self.format_tracking_cleanup_report(
initial_counts,
final_counts,
duration
))
return items_cleaned, initial_counts
except Exception as e:
logger.error(f"Error cleaning tracking data: {e}")
self.tracker.record_cleanup(TrackingCleanupResult(
timestamp=datetime.utcnow(),
strategy=self.strategy,
items_cleaned=0,
guilds_cleaned=0,
channels_cleaned=0,
duration=0,
initial_counts={},
final_counts={},
error=str(e)
))
raise
async def _aggressive_cleanup(
self,
guild_queues: Dict[int, Set[str]],
channel_queues: Dict[int, Set[str]],
valid_urls: Set[str]
) -> Tuple[int, int, int]:
"""Perform aggressive cleanup"""
items_cleaned = 0
guilds_cleaned = 0
channels_cleaned = 0
# Clean guild tracking
guild_cleaned = await self._cleanup_guild_tracking(
guild_queues,
valid_urls,
validate_all=True
)
items_cleaned += guild_cleaned[0]
guilds_cleaned += guild_cleaned[1]
# Clean channel tracking
channel_cleaned = await self._cleanup_channel_tracking(
channel_queues,
valid_urls,
validate_all=True
)
items_cleaned += channel_cleaned[0]
channels_cleaned += channel_cleaned[1]
return items_cleaned, guilds_cleaned, channels_cleaned
async def _conservative_cleanup(
self,
guild_queues: Dict[int, Set[str]],
channel_queues: Dict[int, Set[str]],
valid_urls: Set[str]
) -> Tuple[int, int, int]:
"""Perform conservative cleanup"""
items_cleaned = 0
guilds_cleaned = 0
channels_cleaned = 0
        # Only clean when the invalid ratio exceeds the threshold. Work on a
        # single-entry dict and write the result back, so the changes reach
        # the caller's tracking structures instead of a throwaway copy.
        for guild_id, urls in list(guild_queues.items()):
            invalid_ratio = len(urls - valid_urls) / len(urls) if urls else 0
            if invalid_ratio > self.config.max_invalid_ratio:
                subset = {guild_id: urls}
                cleaned = await self._cleanup_guild_tracking(
                    subset,
                    valid_urls,
                    validate_all=False
                )
                if guild_id in subset:
                    guild_queues[guild_id] = subset[guild_id]
                else:
                    guild_queues.pop(guild_id, None)
                items_cleaned += cleaned[0]
                guilds_cleaned += cleaned[1]
        for channel_id, urls in list(channel_queues.items()):
            invalid_ratio = len(urls - valid_urls) / len(urls) if urls else 0
            if invalid_ratio > self.config.max_invalid_ratio:
                subset = {channel_id: urls}
                cleaned = await self._cleanup_channel_tracking(
                    subset,
                    valid_urls,
                    validate_all=False
                )
                if channel_id in subset:
                    channel_queues[channel_id] = subset[channel_id]
                else:
                    channel_queues.pop(channel_id, None)
                items_cleaned += cleaned[0]
                channels_cleaned += cleaned[1]
return items_cleaned, guilds_cleaned, channels_cleaned
async def _balanced_cleanup(
self,
guild_queues: Dict[int, Set[str]],
channel_queues: Dict[int, Set[str]],
valid_urls: Set[str]
) -> Tuple[int, int, int]:
"""Perform balanced cleanup"""
items_cleaned = 0
guilds_cleaned = 0
channels_cleaned = 0
# Clean guild tracking with validation
guild_cleaned = await self._cleanup_guild_tracking(
guild_queues,
valid_urls,
validate_all=self.config.validate_urls
)
items_cleaned += guild_cleaned[0]
guilds_cleaned += guild_cleaned[1]
# Clean channel tracking with validation
channel_cleaned = await self._cleanup_channel_tracking(
channel_queues,
valid_urls,
validate_all=self.config.validate_urls
)
items_cleaned += channel_cleaned[0]
channels_cleaned += channel_cleaned[1]
return items_cleaned, guilds_cleaned, channels_cleaned
async def _cleanup_guild_tracking(
self,
guild_queues: Dict[int, Set[str]],
valid_urls: Set[str],
validate_all: bool
) -> Tuple[int, int]:
"""Clean up guild tracking data"""
items_cleaned = 0
guilds_cleaned = 0
batch_count = 0
for guild_id in list(guild_queues.keys()):
if not self.validator.validate_id(guild_id):
guild_queues.pop(guild_id)
guilds_cleaned += 1
continue
original_size = len(guild_queues[guild_id])
guild_queues[guild_id] = {
url for url in guild_queues[guild_id]
if (
(not validate_all or self.validator.validate_url(url)) and
url in valid_urls
)
}
items_cleaned += original_size - len(guild_queues[guild_id])
if self.config.cleanup_empty and not guild_queues[guild_id]:
guild_queues.pop(guild_id)
guilds_cleaned += 1
batch_count += 1
if batch_count >= self.config.batch_size:
await asyncio.sleep(0) # Yield to event loop
batch_count = 0
logger.debug(f"Cleaned {items_cleaned} guild tracking items")
return items_cleaned, guilds_cleaned
async def _cleanup_channel_tracking(
self,
channel_queues: Dict[int, Set[str]],
valid_urls: Set[str],
validate_all: bool
) -> Tuple[int, int]:
"""Clean up channel tracking data"""
items_cleaned = 0
channels_cleaned = 0
batch_count = 0
for channel_id in list(channel_queues.keys()):
if not self.validator.validate_id(channel_id):
channel_queues.pop(channel_id)
channels_cleaned += 1
continue
original_size = len(channel_queues[channel_id])
channel_queues[channel_id] = {
url for url in channel_queues[channel_id]
if (
(not validate_all or self.validator.validate_url(url)) and
url in valid_urls
)
}
items_cleaned += original_size - len(channel_queues[channel_id])
if self.config.cleanup_empty and not channel_queues[channel_id]:
channel_queues.pop(channel_id)
channels_cleaned += 1
batch_count += 1
if batch_count >= self.config.batch_size:
await asyncio.sleep(0) # Yield to event loop
batch_count = 0
logger.debug(f"Cleaned {items_cleaned} channel tracking items")
return items_cleaned, channels_cleaned
def _get_valid_urls(
self,
queue: List[QueueItem],
processing: Dict[str, QueueItem]
) -> Set[str]:
"""Get set of valid URLs"""
valid_urls = {item.url for item in queue}
valid_urls.update(processing.keys())
return valid_urls
def _get_tracking_counts(
self,
guild_queues: Dict[int, Set[str]],
channel_queues: Dict[int, Set[str]]
) -> Dict[str, int]:
"""Get tracking data counts"""
return {
'guilds': len(guild_queues),
'channels': len(channel_queues),
'guild_urls': sum(len(urls) for urls in guild_queues.values()),
'channel_urls': sum(len(urls) for urls in channel_queues.values())
}
def format_tracking_cleanup_report(
self,
initial_counts: Dict[str, int],
final_counts: Dict[str, int],
duration: float
) -> str:
"""Format a tracking cleanup report"""
total_cleaned = (
(initial_counts['guild_urls'] - final_counts['guild_urls']) +
(initial_counts['channel_urls'] - final_counts['channel_urls'])
)
return (
f"Tracking Cleanup Results:\n"
f"Strategy: {self.strategy.value}\n"
f"Duration: {duration:.2f}s\n"
f"Items:\n"
f"- Guild Queues: {initial_counts['guilds']} -> {final_counts['guilds']}\n"
f"- Channel Queues: {initial_counts['channels']} -> {final_counts['channels']}\n"
f"- Guild URLs: {initial_counts['guild_urls']} -> {final_counts['guild_urls']}\n"
f"- Channel URLs: {initial_counts['channel_urls']} -> {final_counts['channel_urls']}\n"
f"Total items cleaned: {total_cleaned}"
)
def get_cleaner_stats(self) -> Dict[str, Any]:
"""Get comprehensive cleaner statistics"""
return {
"strategy": self.strategy.value,
"config": {
"batch_size": self.config.batch_size,
"retention_period": self.config.retention_period,
"validate_urls": self.config.validate_urls,
"cleanup_empty": self.config.cleanup_empty,
"max_invalid_ratio": self.config.max_invalid_ratio
},
"tracker": self.tracker.get_stats()
}
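A hedged sketch of how the tracking cleaner above reconciles guild/channel URL sets against the live queue (not part of the diff). The URLs and IDs are made up, and the `None` processing value is a shortcut: only the keys of `processing` matter to `_get_valid_urls()`, while the real values are QueueItem instances.

# Illustrative only: a URL is "live" if it is in the queue or in processing.
import asyncio

async def demo_tracking_cleaner() -> None:
    cleaner = TrackingCleaner(strategy=TrackingCleanupStrategy.AGGRESSIVE)
    guild_queues = {1: {"https://a.example/1", "https://a.example/stale"}}
    channel_queues = {10: {"https://a.example/1"}}
    queue = []  # List[QueueItem]
    processing = {"https://a.example/1": None}  # URL kept alive
    cleaned, initial = await cleaner.cleanup_tracking(
        guild_queues, channel_queues, queue, processing
    )
    # The stale URL is dropped; empty sets are removed when cleanup_empty=True.
    print(cleaned, initial, guild_queues, channel_queues)

asyncio.run(demo_tracking_cleaner())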


@@ -2,316 +2,459 @@
import asyncio
import logging
from enum import Enum
from dataclasses import dataclass, field
from typing import Dict, List, Set, Optional, Any, Tuple
from datetime import datetime, timedelta
from .models import QueueItem, QueueMetrics
from .cleaners.history_cleaner import (
HistoryCleaner,
CleanupStrategy as HistoryStrategy
)
from .cleaners.guild_cleaner import (
GuildCleaner,
GuildCleanupStrategy
)
from .cleaners.tracking_cleaner import (
TrackingCleaner,
TrackingCleanupStrategy
)
logger = logging.getLogger("QueueCleanup")
class CleanupMode(Enum):
"""Cleanup operation modes"""
NORMAL = "normal" # Regular cleanup
AGGRESSIVE = "aggressive" # More aggressive cleanup
MAINTENANCE = "maintenance" # Maintenance mode cleanup
EMERGENCY = "emergency" # Emergency cleanup
class CleanupPhase(Enum):
"""Cleanup operation phases"""
HISTORY = "history"
TRACKING = "tracking"
GUILD = "guild"
VERIFICATION = "verification"
@dataclass
class CleanupConfig:
"""Configuration for cleanup operations"""
cleanup_interval: int = 1800 # 30 minutes
max_history_age: int = 43200 # 12 hours
batch_size: int = 100
max_concurrent_cleanups: int = 3
verification_interval: int = 300 # 5 minutes
emergency_threshold: int = 10000 # Items threshold for emergency
@dataclass
class CleanupResult:
"""Result of a cleanup operation"""
timestamp: datetime
mode: CleanupMode
duration: float
items_cleaned: Dict[CleanupPhase, int]
error: Optional[str] = None
class CleanupScheduler:
"""Schedules cleanup operations"""
def __init__(self, config: CleanupConfig):
self.config = config
self.next_cleanup: Optional[datetime] = None
self.next_verification: Optional[datetime] = None
self._last_emergency: Optional[datetime] = None
def should_cleanup(self, queue_size: int) -> Tuple[bool, CleanupMode]:
"""Determine if cleanup should run"""
now = datetime.utcnow()
# Check for emergency cleanup
if (
queue_size > self.config.emergency_threshold and
(
not self._last_emergency or
now - self._last_emergency > timedelta(minutes=5)
)
):
self._last_emergency = now
return True, CleanupMode.EMERGENCY
# Check scheduled cleanup
if not self.next_cleanup or now >= self.next_cleanup:
self.next_cleanup = now + timedelta(
seconds=self.config.cleanup_interval
)
return True, CleanupMode.NORMAL
# Check verification
if not self.next_verification or now >= self.next_verification:
self.next_verification = now + timedelta(
seconds=self.config.verification_interval
)
return True, CleanupMode.MAINTENANCE
return False, CleanupMode.NORMAL
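    # Added note (comment-only sketch, not in the original diff): with the
    # default CleanupConfig, should_cleanup() returns (True, EMERGENCY)
    # whenever the queue exceeds emergency_threshold (10000 items) and the
    # last emergency pass is more than 5 minutes old; otherwise a NORMAL
    # pass runs every cleanup_interval (1800s) and a MAINTENANCE pass every
    # verification_interval (300s).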
class CleanupCoordinator:
"""Coordinates cleanup operations"""
def __init__(self):
self.active_cleanups: Set[CleanupPhase] = set()
self._cleanup_lock = asyncio.Lock()
self._phase_locks: Dict[CleanupPhase, asyncio.Lock] = {
phase: asyncio.Lock() for phase in CleanupPhase
}
async def start_cleanup(self, phase: CleanupPhase) -> bool:
"""Start a cleanup phase"""
async with self._cleanup_lock:
if phase in self.active_cleanups:
return False
self.active_cleanups.add(phase)
return True
async def end_cleanup(self, phase: CleanupPhase) -> None:
"""End a cleanup phase"""
async with self._cleanup_lock:
self.active_cleanups.discard(phase)
async def acquire_phase(self, phase: CleanupPhase) -> bool:
"""Acquire lock for a cleanup phase"""
return await self._phase_locks[phase].acquire()
def release_phase(self, phase: CleanupPhase) -> None:
"""Release lock for a cleanup phase"""
self._phase_locks[phase].release()
class CleanupTracker:
"""Tracks cleanup operations"""
def __init__(self, max_history: int = 1000):
self.max_history = max_history
self.history: List[CleanupResult] = []
self.total_items_cleaned = 0
self.last_cleanup: Optional[datetime] = None
self.cleanup_counts: Dict[CleanupMode, int] = {
mode: 0 for mode in CleanupMode
}
def record_cleanup(self, result: CleanupResult) -> None:
"""Record a cleanup operation"""
self.history.append(result)
if len(self.history) > self.max_history:
self.history.pop(0)
self.total_items_cleaned += sum(result.items_cleaned.values())
self.last_cleanup = result.timestamp
self.cleanup_counts[result.mode] += 1
def get_stats(self) -> Dict[str, Any]:
"""Get cleanup statistics"""
return {
"total_cleanups": len(self.history),
"total_items_cleaned": self.total_items_cleaned,
"last_cleanup": (
self.last_cleanup.isoformat()
if self.last_cleanup
else None
),
"cleanup_counts": {
mode.value: count
for mode, count in self.cleanup_counts.items()
},
"recent_cleanups": [
{
"timestamp": r.timestamp.isoformat(),
"mode": r.mode.value,
"duration": r.duration,
"items_cleaned": {
phase.value: count
for phase, count in r.items_cleaned.items()
}
}
for r in self.history[-5:] # Last 5 cleanups
]
}
class QueueCleaner:
"""Handles cleanup of queue items and tracking data"""
def __init__(self, config: Optional[CleanupConfig] = None):
self.config = config or CleanupConfig()
self.scheduler = CleanupScheduler(self.config)
self.coordinator = CleanupCoordinator()
self.tracker = CleanupTracker()
# Initialize cleaners
self.history_cleaner = HistoryCleaner()
self.guild_cleaner = GuildCleaner()
self.tracking_cleaner = TrackingCleaner()
        self._shutdown = False
        self._cleanup_task: Optional[asyncio.Task] = None
    async def start(
self,
state_manager,
metrics_manager
) -> None:
"""Start periodic cleanup process
Args:
queue: Reference to the queue list
completed: Reference to completed items dict
failed: Reference to failed items dict
guild_queues: Reference to guild tracking dict
channel_queues: Reference to channel tracking dict
processing: Reference to processing dict
metrics: Reference to queue metrics
queue_lock: Lock for queue operations
"""
"""Start periodic cleanup process"""
if self._cleanup_task is not None:
logger.warning("Cleanup task already running")
return
logger.info("Starting queue cleanup task...")
self._cleanup_task = asyncio.create_task(
self._cleanup_loop(
queue,
completed,
failed,
guild_queues,
channel_queues,
processing,
metrics,
queue_lock
)
self._cleanup_loop(state_manager, metrics_manager)
)
async def _cleanup_loop(
self,
state_manager,
metrics_manager
) -> None:
"""Main cleanup loop"""
while not self._shutdown:
try:
# Check if cleanup should run
queue_size = len(await state_manager.get_queue())
should_run, mode = self.scheduler.should_cleanup(queue_size)
if should_run:
await self._perform_cleanup(
state_manager,
metrics_manager,
mode
)
await asyncio.sleep(1) # Short sleep to prevent CPU hogging
except asyncio.CancelledError:
logger.info("Queue cleanup cancelled")
break
except Exception as e:
logger.error(f"Error in cleanup loop: {str(e)}")
await asyncio.sleep(30) # Longer sleep on error
    async def stop(self) -> None:
"""Stop the cleanup process"""
logger.info("Stopping queue cleanup...")
self._shutdown = True
if self._cleanup_task and not self._cleanup_task.done():
self._cleanup_task.cancel()
try:
await self._cleanup_task
except asyncio.CancelledError:
pass
self._cleanup_task = None
async def _perform_cleanup(
self,
state_manager,
metrics_manager,
mode: CleanupMode
) -> None:
"""Perform cleanup operations
Args:
queue: Reference to the queue list
completed: Reference to completed items dict
failed: Reference to failed items dict
guild_queues: Reference to guild tracking dict
channel_queues: Reference to channel tracking dict
processing: Reference to processing dict
metrics: Reference to queue metrics
queue_lock: Lock for queue operations
"""
"""Perform cleanup operations"""
start_time = datetime.utcnow()
items_cleaned: Dict[CleanupPhase, int] = {
phase: 0 for phase in CleanupPhase
}
try:
# Get current state
queue = await state_manager.get_queue()
processing = await state_manager.get_processing()
completed = await state_manager.get_completed()
failed = await state_manager.get_failed()
guild_queues = await state_manager.get_guild_queues()
channel_queues = await state_manager.get_channel_queues()
# Clean historical items
if await self.coordinator.start_cleanup(CleanupPhase.HISTORY):
try:
await self.coordinator.acquire_phase(CleanupPhase.HISTORY)
cleanup_cutoff = self.history_cleaner.get_cleanup_cutoff()
# Adjust strategy based on mode
if mode == CleanupMode.AGGRESSIVE:
self.history_cleaner.strategy = HistoryStrategy.AGGRESSIVE
elif mode == CleanupMode.MAINTENANCE:
self.history_cleaner.strategy = HistoryStrategy.CONSERVATIVE
completed_cleaned = await self.history_cleaner.cleanup_completed(
completed,
cleanup_cutoff
)
failed_cleaned = await self.history_cleaner.cleanup_failed(
failed,
cleanup_cutoff
)
items_cleaned[CleanupPhase.HISTORY] = (
completed_cleaned + failed_cleaned
)
finally:
self.coordinator.release_phase(CleanupPhase.HISTORY)
await self.coordinator.end_cleanup(CleanupPhase.HISTORY)
# Clean tracking data
if await self.coordinator.start_cleanup(CleanupPhase.TRACKING):
try:
await self.coordinator.acquire_phase(CleanupPhase.TRACKING)
# Adjust strategy based on mode
if mode == CleanupMode.AGGRESSIVE:
self.tracking_cleaner.strategy = TrackingCleanupStrategy.AGGRESSIVE
elif mode == CleanupMode.MAINTENANCE:
self.tracking_cleaner.strategy = TrackingCleanupStrategy.CONSERVATIVE
tracking_cleaned, _ = await self.tracking_cleaner.cleanup_tracking(
guild_queues,
channel_queues,
queue,
processing
)
items_cleaned[CleanupPhase.TRACKING] = tracking_cleaned
finally:
self.coordinator.release_phase(CleanupPhase.TRACKING)
await self.coordinator.end_cleanup(CleanupPhase.TRACKING)
# Update state
await state_manager.update_state(
completed=completed,
failed=failed,
guild_queues=guild_queues,
channel_queues=channel_queues
)
# Record cleanup result
duration = (datetime.utcnow() - start_time).total_seconds()
result = CleanupResult(
timestamp=datetime.utcnow(),
mode=mode,
duration=duration,
items_cleaned=items_cleaned
)
self.tracker.record_cleanup(result)
# Update metrics
metrics_manager.update_cleanup_time()
logger.info(
f"Cleanup completed ({mode.value}):\n" +
"\n".join(
f"- {phase.value}: {count} items"
for phase, count in items_cleaned.items()
if count > 0
) +
f"\nTotal duration: {duration:.2f}s"
)
except Exception as e:
logger.error(f"Error during cleanup: {str(e)}")
duration = (datetime.utcnow() - start_time).total_seconds()
self.tracker.record_cleanup(CleanupResult(
timestamp=datetime.utcnow(),
mode=mode,
duration=duration,
items_cleaned=items_cleaned,
error=str(e)
))
raise CleanupError(f"Cleanup failed: {str(e)}")
async def clear_guild_queue(
self,
guild_id: int,
state_manager
) -> int:
"""Clear all queue items for a specific guild
Args:
guild_id: ID of the guild to clear
queue: Reference to the queue list
processing: Reference to processing dict
completed: Reference to completed items dict
failed: Reference to failed items dict
guild_queues: Reference to guild tracking dict
channel_queues: Reference to channel tracking dict
queue_lock: Lock for queue operations
Returns:
Number of items cleared
"""
"""Clear all queue items for a specific guild"""
try:
if not await self.coordinator.start_cleanup(CleanupPhase.GUILD):
raise CleanupError("Guild cleanup already in progress")
try:
await self.coordinator.acquire_phase(CleanupPhase.GUILD)
# Get current state
queue = await state_manager.get_queue()
processing = await state_manager.get_processing()
completed = await state_manager.get_completed()
failed = await state_manager.get_failed()
guild_queues = await state_manager.get_guild_queues()
channel_queues = await state_manager.get_channel_queues()
# Clear guild items
cleared_count, counts = await self.guild_cleaner.clear_guild_items(
guild_id,
queue,
processing,
completed,
failed,
guild_queues,
channel_queues
)
# Update state
await state_manager.update_state(
queue=queue,
processing=processing,
completed=completed,
failed=failed,
guild_queues=guild_queues,
channel_queues=channel_queues
)
return cleared_count
finally:
self.coordinator.release_phase(CleanupPhase.GUILD)
await self.coordinator.end_cleanup(CleanupPhase.GUILD)
except Exception as e:
logger.error(f"Error clearing guild queue: {str(e)}")
raise CleanupError(f"Failed to clear guild queue: {str(e)}")
def get_cleaner_stats(self) -> Dict[str, Any]:
"""Get comprehensive cleaner statistics"""
return {
"config": {
"cleanup_interval": self.config.cleanup_interval,
"max_history_age": self.config.max_history_age,
"batch_size": self.config.batch_size,
"max_concurrent_cleanups": self.config.max_concurrent_cleanups,
"verification_interval": self.config.verification_interval,
"emergency_threshold": self.config.emergency_threshold
},
"scheduler": {
"next_cleanup": (
self.scheduler.next_cleanup.isoformat()
if self.scheduler.next_cleanup
else None
),
"next_verification": (
self.scheduler.next_verification.isoformat()
if self.scheduler.next_verification
else None
),
"last_emergency": (
self.scheduler._last_emergency.isoformat()
if self.scheduler._last_emergency
else None
)
},
"coordinator": {
"active_cleanups": [
phase.value for phase in self.coordinator.active_cleanups
]
},
"tracker": self.tracker.get_stats(),
"cleaners": {
"history": self.history_cleaner.get_cleaner_stats(),
"guild": self.guild_cleaner.get_cleaner_stats(),
"tracking": self.tracking_cleaner.get_cleaner_stats()
}
}
class CleanupError(Exception):
"""Base exception for cleanup-related errors"""
pass
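A hedged wiring sketch for the reworked QueueCleaner (not part of the diff). The two stand-in classes are hypothetical: they only expose the methods this file actually calls on `state_manager` and `metrics_manager`; the real implementations live elsewhere in the commit.

# Illustrative only: minimal stand-ins for the state and metrics managers.
import asyncio

class InMemoryState:
    """Hypothetical stand-in exposing the accessors QueueCleaner calls."""
    def __init__(self):
        self._queue, self._processing = [], {}
        self._completed, self._failed = {}, {}
        self._guild_queues, self._channel_queues = {}, {}
    async def get_queue(self): return self._queue
    async def get_processing(self): return self._processing
    async def get_completed(self): return self._completed
    async def get_failed(self): return self._failed
    async def get_guild_queues(self): return self._guild_queues
    async def get_channel_queues(self): return self._channel_queues
    async def update_state(self, **kwargs):
        for name, value in kwargs.items():
            setattr(self, f"_{name}", value)

class NoopMetrics:
    """Hypothetical stand-in for the metrics manager."""
    def update_cleanup_time(self): pass

async def demo_queue_cleaner() -> None:
    cleaner = QueueCleaner(CleanupConfig(cleanup_interval=60))
    await cleaner.start(InMemoryState(), NoopMetrics())
    await asyncio.sleep(2)  # let at least one scheduled pass run
    await cleaner.stop()
    print(cleaner.get_cleaner_stats())

asyncio.run(demo_queue_cleaner())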


@@ -0,0 +1,441 @@
"""Module for queue health checks"""
import logging
import psutil
import time
from enum import Enum
from dataclasses import dataclass, field
from typing import Dict, Optional, Tuple, List, Any, Set
from datetime import datetime, timedelta
logger = logging.getLogger("QueueHealthChecker")
class HealthStatus(Enum):
"""Possible health status values"""
HEALTHY = "healthy"
WARNING = "warning"
CRITICAL = "critical"
UNKNOWN = "unknown"
class HealthCategory(Enum):
"""Health check categories"""
MEMORY = "memory"
PERFORMANCE = "performance"
ACTIVITY = "activity"
ERRORS = "errors"
DEADLOCKS = "deadlocks"
SYSTEM = "system"
@dataclass
class HealthThresholds:
"""Defines thresholds for health checks"""
memory_warning_mb: int = 384 # 384MB
memory_critical_mb: int = 512 # 512MB
deadlock_warning_sec: int = 30 # 30 seconds
deadlock_critical_sec: int = 60 # 1 minute
error_rate_warning: float = 0.1 # 10% errors
error_rate_critical: float = 0.2 # 20% errors
inactivity_warning_sec: int = 30
inactivity_critical_sec: int = 60
cpu_warning_percent: float = 80.0
cpu_critical_percent: float = 90.0
@dataclass
class HealthCheckResult:
"""Result of a health check"""
category: HealthCategory
status: HealthStatus
message: str
value: Optional[float] = None
timestamp: datetime = field(default_factory=datetime.utcnow)
details: Dict[str, Any] = field(default_factory=dict)
class HealthHistory:
"""Tracks health check history"""
def __init__(self, max_history: int = 1000):
self.max_history = max_history
self.history: List[HealthCheckResult] = []
self.status_changes: List[Dict[str, Any]] = []
self.critical_events: List[Dict[str, Any]] = []
def add_result(self, result: HealthCheckResult) -> None:
"""Add a health check result"""
self.history.append(result)
if len(self.history) > self.max_history:
self.history.pop(0)
        # Track status changes per category (compare with the previous
        # result for the same category, not merely the previous check)
        previous = next(
            (r for r in reversed(self.history[:-1])
             if r.category == result.category),
            None
        )
        if previous and previous.status != result.status:
            self.status_changes.append({
                "timestamp": result.timestamp,
                "category": result.category.value,
                "from_status": previous.status.value,
                "to_status": result.status.value,
                "message": result.message
            })
# Track critical events
if result.status == HealthStatus.CRITICAL:
self.critical_events.append({
"timestamp": result.timestamp,
"category": result.category.value,
"message": result.message,
"details": result.details
})
def get_status_summary(self) -> Dict[str, Any]:
"""Get summary of health status history"""
return {
"total_checks": len(self.history),
"status_changes": len(self.status_changes),
"critical_events": len(self.critical_events),
"recent_status_changes": self.status_changes[-5:],
"recent_critical_events": self.critical_events[-5:]
}
class SystemHealthMonitor:
"""Monitors system health metrics"""
def __init__(self):
self.process = psutil.Process()
async def check_system_health(self) -> Dict[str, Any]:
"""Check system health metrics"""
try:
cpu_percent = self.process.cpu_percent()
memory_info = self.process.memory_info()
io_counters = self.process.io_counters()
return {
"cpu_percent": cpu_percent,
"memory_rss": memory_info.rss / 1024 / 1024, # MB
"memory_vms": memory_info.vms / 1024 / 1024, # MB
"io_read_mb": io_counters.read_bytes / 1024 / 1024,
"io_write_mb": io_counters.write_bytes / 1024 / 1024,
"thread_count": self.process.num_threads(),
"open_files": len(self.process.open_files()),
"connections": len(self.process.connections())
}
except Exception as e:
logger.error(f"Error checking system health: {e}")
return {}
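# Added note (comment-only, not in the original diff): check_health() below
# expects two plain dicts. `metrics` carries keys such as
# "avg_processing_time", "success_rate", "error_rate", and "total_errors";
# `queue_info` carries "last_activity" (epoch seconds) and
# "processing_count". Key names are taken from the accessors used in this
# file.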
class HealthChecker:
"""Handles health checks for the queue system"""
def __init__(
self,
thresholds: Optional[HealthThresholds] = None,
history_size: int = 1000
):
self.thresholds = thresholds or HealthThresholds()
self.history = HealthHistory(history_size)
self.system_monitor = SystemHealthMonitor()
self._last_gc_time: Optional[datetime] = None
async def check_health(
self,
metrics: Dict[str, Any],
queue_info: Dict[str, Any]
) -> Dict[str, Any]:
"""Perform comprehensive health check"""
results = []
# Check memory health
memory_result = await self._check_memory_health()
results.append(memory_result)
# Check performance health
perf_result = self._check_performance_health(metrics)
results.append(perf_result)
# Check activity health
activity_result = self._check_activity_health(
queue_info["last_activity"],
queue_info["processing_count"] > 0
)
results.append(activity_result)
# Check error health
error_result = self._check_error_health(metrics)
results.append(error_result)
# Check for deadlocks
deadlock_result = self._check_deadlocks(queue_info)
results.append(deadlock_result)
# Check system health
system_result = await self._check_system_health()
results.append(system_result)
# Record results
for result in results:
self.history.add_result(result)
# Determine overall health
overall_status = self._determine_overall_status(results)
return {
"timestamp": datetime.utcnow().isoformat(),
"overall_status": overall_status.value,
"checks": [
{
"category": r.category.value,
"status": r.status.value,
"message": r.message,
"value": r.value,
"details": r.details
}
for r in results
],
"history": self.history.get_status_summary()
}
async def _check_memory_health(self) -> HealthCheckResult:
"""Check memory health"""
try:
memory_usage = psutil.Process().memory_info().rss / 1024 / 1024 # MB
if memory_usage > self.thresholds.memory_critical_mb:
if (
not self._last_gc_time or
datetime.utcnow() - self._last_gc_time > timedelta(minutes=5)
):
import gc
gc.collect()
self._last_gc_time = datetime.utcnow()
memory_usage = psutil.Process().memory_info().rss / 1024 / 1024
status = HealthStatus.CRITICAL
message = f"Critical memory usage: {memory_usage:.1f}MB"
elif memory_usage > self.thresholds.memory_warning_mb:
status = HealthStatus.WARNING
message = f"High memory usage: {memory_usage:.1f}MB"
else:
status = HealthStatus.HEALTHY
message = f"Normal memory usage: {memory_usage:.1f}MB"
return HealthCheckResult(
category=HealthCategory.MEMORY,
status=status,
message=message,
value=memory_usage
)
except Exception as e:
logger.error(f"Error checking memory health: {e}")
return HealthCheckResult(
category=HealthCategory.MEMORY,
status=HealthStatus.UNKNOWN,
message=f"Error checking memory: {str(e)}"
)
def _check_performance_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
"""Check performance health"""
try:
avg_time = metrics.get("avg_processing_time", 0)
success_rate = metrics.get("success_rate", 1.0)
if success_rate < 0.5: # Less than 50% success
status = HealthStatus.CRITICAL
message = f"Critical performance: {success_rate:.1%} success rate"
elif success_rate < 0.8: # Less than 80% success
status = HealthStatus.WARNING
message = f"Degraded performance: {success_rate:.1%} success rate"
else:
status = HealthStatus.HEALTHY
message = f"Normal performance: {success_rate:.1%} success rate"
return HealthCheckResult(
category=HealthCategory.PERFORMANCE,
status=status,
message=message,
value=success_rate,
details={"avg_processing_time": avg_time}
)
except Exception as e:
logger.error(f"Error checking performance health: {e}")
return HealthCheckResult(
category=HealthCategory.PERFORMANCE,
status=HealthStatus.UNKNOWN,
message=f"Error checking performance: {str(e)}"
)
def _check_activity_health(
self,
last_activity_time: float,
has_processing_items: bool
) -> HealthCheckResult:
"""Check activity health"""
if not has_processing_items:
return HealthCheckResult(
category=HealthCategory.ACTIVITY,
status=HealthStatus.HEALTHY,
message="No items being processed"
)
inactive_time = time.time() - last_activity_time
if inactive_time > self.thresholds.inactivity_critical_sec:
status = HealthStatus.CRITICAL
message = f"No activity for {inactive_time:.1f}s"
elif inactive_time > self.thresholds.inactivity_warning_sec:
status = HealthStatus.WARNING
message = f"Limited activity for {inactive_time:.1f}s"
else:
status = HealthStatus.HEALTHY
message = "Normal activity levels"
return HealthCheckResult(
category=HealthCategory.ACTIVITY,
status=status,
message=message,
value=inactive_time
)
def _check_error_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
"""Check error health"""
try:
error_rate = metrics.get("error_rate", 0.0)
error_count = metrics.get("total_errors", 0)
if error_rate > self.thresholds.error_rate_critical:
status = HealthStatus.CRITICAL
message = f"Critical error rate: {error_rate:.1%}"
elif error_rate > self.thresholds.error_rate_warning:
status = HealthStatus.WARNING
message = f"High error rate: {error_rate:.1%}"
else:
status = HealthStatus.HEALTHY
message = f"Normal error rate: {error_rate:.1%}"
return HealthCheckResult(
category=HealthCategory.ERRORS,
status=status,
message=message,
value=error_rate,
details={"error_count": error_count}
)
except Exception as e:
logger.error(f"Error checking error health: {e}")
return HealthCheckResult(
category=HealthCategory.ERRORS,
status=HealthStatus.UNKNOWN,
message=f"Error checking errors: {str(e)}"
)
def _check_deadlocks(self, queue_info: Dict[str, Any]) -> HealthCheckResult:
"""Check for potential deadlocks"""
try:
stuck_items = queue_info.get("stuck_items", [])
if not stuck_items:
return HealthCheckResult(
category=HealthCategory.DEADLOCKS,
status=HealthStatus.HEALTHY,
message="No stuck items detected"
)
longest_stuck = max(
time.time() - item["start_time"]
for item in stuck_items
)
if longest_stuck > self.thresholds.deadlock_critical_sec:
status = HealthStatus.CRITICAL
message = f"Potential deadlock: {len(stuck_items)} items stuck"
elif longest_stuck > self.thresholds.deadlock_warning_sec:
status = HealthStatus.WARNING
message = f"Slow processing: {len(stuck_items)} items delayed"
else:
status = HealthStatus.HEALTHY
message = "Normal processing time"
return HealthCheckResult(
category=HealthCategory.DEADLOCKS,
status=status,
message=message,
value=longest_stuck,
details={"stuck_items": len(stuck_items)}
)
except Exception as e:
logger.error(f"Error checking deadlocks: {e}")
return HealthCheckResult(
category=HealthCategory.DEADLOCKS,
status=HealthStatus.UNKNOWN,
message=f"Error checking deadlocks: {str(e)}"
)
async def _check_system_health(self) -> HealthCheckResult:
"""Check system health"""
try:
metrics = await self.system_monitor.check_system_health()
if not metrics:
return HealthCheckResult(
category=HealthCategory.SYSTEM,
status=HealthStatus.UNKNOWN,
message="Unable to get system metrics"
)
cpu_percent = metrics["cpu_percent"]
if cpu_percent > self.thresholds.cpu_critical_percent:
status = HealthStatus.CRITICAL
message = f"Critical CPU usage: {cpu_percent:.1f}%"
elif cpu_percent > self.thresholds.cpu_warning_percent:
status = HealthStatus.WARNING
message = f"High CPU usage: {cpu_percent:.1f}%"
else:
status = HealthStatus.HEALTHY
message = f"Normal CPU usage: {cpu_percent:.1f}%"
return HealthCheckResult(
category=HealthCategory.SYSTEM,
status=status,
message=message,
value=cpu_percent,
details=metrics
)
except Exception as e:
logger.error(f"Error checking system health: {e}")
return HealthCheckResult(
category=HealthCategory.SYSTEM,
status=HealthStatus.UNKNOWN,
message=f"Error checking system: {str(e)}"
)
def _determine_overall_status(
self,
results: List[HealthCheckResult]
) -> HealthStatus:
"""Determine overall health status"""
if any(r.status == HealthStatus.CRITICAL for r in results):
return HealthStatus.CRITICAL
if any(r.status == HealthStatus.WARNING for r in results):
return HealthStatus.WARNING
if any(r.status == HealthStatus.UNKNOWN for r in results):
return HealthStatus.UNKNOWN
return HealthStatus.HEALTHY
def format_health_report(
self,
results: List[HealthCheckResult]
) -> str:
"""Format a detailed health report"""
lines = ["Queue Health Report:"]
for result in results:
lines.append(
f"\n{result.category.value.title()}:"
f"\n- Status: {result.status.value}"
f"\n- {result.message}"
)
if result.details:
for key, value in result.details.items():
lines.append(f" - {key}: {value}")
return "\n".join(lines)

View File

@@ -2,274 +2,292 @@
import asyncio
import logging
import time
from enum import Enum
from dataclasses import dataclass, field
from typing import Optional, Tuple, Dict, Any, List, Set
from datetime import datetime, timedelta
from .state_manager import QueueStateManager
from .processor import QueueProcessor
from .metrics_manager import QueueMetricsManager
from .persistence import QueuePersistenceManager
from .monitoring import QueueMonitor, MonitoringLevel
from .cleanup import QueueCleaner
from .models import QueueItem, QueueError, CleanupError
# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("QueueManager")
class QueueState(Enum):
"""Queue operational states"""
UNINITIALIZED = "uninitialized"
INITIALIZING = "initializing"
RUNNING = "running"
PAUSED = "paused"
STOPPING = "stopping"
STOPPED = "stopped"
ERROR = "error"
class QueueMode(Enum):
"""Queue processing modes"""
NORMAL = "normal" # Standard processing
BATCH = "batch" # Batch processing
PRIORITY = "priority" # Priority-based processing
MAINTENANCE = "maintenance" # Maintenance mode
@dataclass
class QueueConfig:
"""Queue configuration settings"""
max_retries: int = 3
retry_delay: int = 5
max_queue_size: int = 1000
cleanup_interval: int = 3600 # 1 hour
max_history_age: int = 86400 # 24 hours
deadlock_threshold: int = 300 # 5 minutes
check_interval: int = 60 # 1 minute
batch_size: int = 10
max_concurrent: int = 3
persistence_enabled: bool = True
monitoring_level: MonitoringLevel = MonitoringLevel.NORMAL
@dataclass
class QueueStats:
"""Queue statistics"""
start_time: datetime = field(default_factory=datetime.utcnow)
total_processed: int = 0
total_failed: int = 0
uptime: timedelta = field(default_factory=lambda: timedelta())
peak_queue_size: int = 0
peak_memory_usage: float = 0.0
state_changes: List[Dict[str, Any]] = field(default_factory=list)
class QueueCoordinator:
"""Coordinates queue operations"""
def __init__(self):
self.state = QueueState.UNINITIALIZED
self.mode = QueueMode.NORMAL
self._state_lock = asyncio.Lock()
self._mode_lock = asyncio.Lock()
self._paused = asyncio.Event()
self._paused.set()
async def set_state(self, state: QueueState) -> None:
"""Set queue state"""
async with self._state_lock:
self.state = state
async def set_mode(self, mode: QueueMode) -> None:
"""Set queue mode"""
async with self._mode_lock:
self.mode = mode
async def pause(self) -> None:
"""Pause queue processing"""
self._paused.clear()
await self.set_state(QueueState.PAUSED)
async def resume(self) -> None:
"""Resume queue processing"""
self._paused.set()
await self.set_state(QueueState.RUNNING)
async def wait_if_paused(self) -> None:
"""Wait if queue is paused"""
await self._paused.wait()
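A quick sketch of the coordinator's pause gate in isolation (illustrative only; uses nothing beyond the class above):

import asyncio

async def worker(coordinator: QueueCoordinator) -> None:
    await coordinator.wait_if_paused()  # parks here while paused
    print(f"working in state={coordinator.state.value}")

async def demo() -> None:
    coordinator = QueueCoordinator()
    await coordinator.pause()
    task = asyncio.create_task(worker(coordinator))
    await asyncio.sleep(0.1)  # worker is blocked on the pause event
    await coordinator.resume()  # sets the event and the state back to RUNNING
    await task

asyncio.run(demo())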
class EnhancedVideoQueueManager:
"""Enhanced queue manager with improved memory management and performance"""
"""Enhanced queue manager with improved organization and maintainability"""
def __init__(
self,
max_retries: int = 3,
retry_delay: int = 5,
max_queue_size: int = 1000,
cleanup_interval: int = 3600, # 1 hour
max_history_age: int = 86400, # 24 hours
persistence_path: Optional[str] = None,
backup_interval: int = 300, # 5 minutes
deadlock_threshold: int = 300, # 5 minutes
check_interval: int = 60, # 1 minute
):
"""Initialize queue manager"""
# Configuration
self.max_retries = max_retries
self.retry_delay = retry_delay
self.max_queue_size = max_queue_size
# Queue storage
self._queue: List[QueueItem] = []
self._processing: Dict[str, QueueItem] = {}
self._completed: Dict[str, QueueItem] = {}
self._failed: Dict[str, QueueItem] = {}
def __init__(self, config: Optional[QueueConfig] = None):
"""Initialize queue manager components"""
self.config = config or QueueConfig()
self.coordinator = QueueCoordinator()
self.stats = QueueStats()
# Tracking
self._guild_queues: Dict[int, Set[str]] = {}
self._channel_queues: Dict[int, Set[str]] = {}
self._active_tasks: Set[asyncio.Task] = set()
# Single lock for all operations to prevent deadlocks
self._lock = asyncio.Lock()
# State
self._shutdown = False
self._initialized = False
self._init_event = asyncio.Event()
self.metrics = QueueMetrics()
# Components
self.persistence = QueuePersistenceManager(persistence_path) if persistence_path else None
# Initialize managers
self.state_manager = QueueStateManager(self.config.max_queue_size)
self.metrics_manager = QueueMetricsManager()
self.monitor = QueueMonitor(
deadlock_threshold=deadlock_threshold,
max_retries=max_retries,
check_interval=check_interval
deadlock_threshold=self.config.deadlock_threshold,
max_retries=self.config.max_retries,
check_interval=self.config.check_interval
)
self.cleaner = QueueCleaner(
cleanup_interval=cleanup_interval,
max_history_age=max_history_age
cleanup_interval=self.config.cleanup_interval,
max_history_age=self.config.max_history_age
)
# Initialize persistence if enabled
self.persistence = (
QueuePersistenceManager()
if self.config.persistence_enabled
else None
)
# Initialize processor
self.processor = QueueProcessor(
state_manager=self.state_manager,
monitor=self.monitor,
max_retries=self.config.max_retries,
retry_delay=self.config.retry_delay,
batch_size=self.config.batch_size,
max_concurrent=self.config.max_concurrent
)
# Background tasks
self._maintenance_task: Optional[asyncio.Task] = None
self._stats_task: Optional[asyncio.Task] = None
async def initialize(self) -> None:
"""Initialize the queue manager components sequentially"""
if self._initialized:
"""Initialize the queue manager components"""
if self.coordinator.state != QueueState.UNINITIALIZED:
logger.info("Queue manager already initialized")
return
try:
await self.coordinator.set_state(QueueState.INITIALIZING)
logger.info("Starting queue manager initialization...")
async with self._lock:
# Load persisted state first if available
if self.persistence:
await self._load_persisted_state()
# Start monitoring task
monitor_task = asyncio.create_task(
self.monitor.start_monitoring(
self._queue,
self._processing,
self.metrics,
self._lock
)
)
self._active_tasks.add(monitor_task)
logger.info("Queue monitoring started")
# Start cleanup task
cleanup_task = asyncio.create_task(
self.cleaner.start_cleanup(
self._queue,
self._completed,
self._failed,
self._guild_queues,
self._channel_queues,
self._processing,
self.metrics,
self._lock
)
)
self._active_tasks.add(cleanup_task)
logger.info("Queue cleanup started")
# Load persisted state if available
if self.persistence:
await self._load_persisted_state()
# Start monitoring with configured level
self.monitor.strategy.level = self.config.monitoring_level
await self.monitor.start(
self.state_manager,
self.metrics_manager
)
# Start cleanup task
await self.cleaner.start(
state_manager=self.state_manager,
metrics_manager=self.metrics_manager
)
# Signal initialization complete
self._initialized = True
self._init_event.set()
logger.info("Queue manager initialization completed")
# Start background tasks
self._start_background_tasks()
await self.coordinator.set_state(QueueState.RUNNING)
logger.info("Queue manager initialization completed")
except Exception as e:
await self.coordinator.set_state(QueueState.ERROR)
logger.error(f"Failed to initialize queue manager: {e}")
self._shutdown = True
raise
async def _load_persisted_state(self) -> None:
"""Load persisted queue state"""
try:
state = await self.persistence.load_queue_state()
if state:
await self.state_manager.restore_state(state)
self.metrics_manager.restore_metrics(state.get("metrics", {}))
logger.info("Loaded persisted queue state")
except Exception as e:
logger.error(f"Failed to load persisted state: {e}")
def _start_background_tasks(self) -> None:
"""Start background maintenance tasks"""
self._maintenance_task = asyncio.create_task(
self._maintenance_loop()
)
self._stats_task = asyncio.create_task(
self._stats_loop()
)
async def _maintenance_loop(self) -> None:
"""Background maintenance loop"""
while self.coordinator.state not in (QueueState.STOPPED, QueueState.ERROR):
try:
await asyncio.sleep(300)  # Every 5 minutes
if self.coordinator.mode == QueueMode.MAINTENANCE:
continue
# Perform maintenance tasks
await self._perform_maintenance()
except asyncio.CancelledError:
break
except Exception as e:
logger.error(f"Error in maintenance loop: {e}")
await asyncio.sleep(0)
async def _stats_loop(self) -> None:
"""Background statistics loop"""
while self.coordinator.state not in (QueueState.STOPPED, QueueState.ERROR):
try:
await asyncio.sleep(60)  # Every minute
await self._update_stats()
except asyncio.CancelledError:
break
except Exception as e:
logger.error(f"Error in stats loop: {e}")
async def _perform_maintenance(self) -> None:
"""Perform maintenance tasks"""
try:
# Switch to maintenance mode
previous_mode = self.coordinator.mode
await self.coordinator.set_mode(QueueMode.MAINTENANCE)
# Perform maintenance tasks
await self._cleanup_old_data()
await self._optimize_queue()
await self._persist_state()
# Restore previous mode
await self.coordinator.set_mode(previous_mode)
except Exception as e:
logger.error(f"Error during maintenance: {e}")
async def _cleanup_old_data(self) -> None:
"""Clean up old data"""
try:
await self.cleaner.cleanup_old_data(
self.state_manager,
self.metrics_manager
)
except Exception as e:
logger.error(f"Error cleaning up old data: {e}")
async def _optimize_queue(self) -> None:
"""Optimize queue performance"""
try:
# Reorder queue based on priorities
await self.state_manager.optimize_queue()
# Update monitoring level based on queue size
queue_size = len(await self.state_manager.get_all_items())
if queue_size > self.config.max_queue_size * 0.8:
self.monitor.strategy.level = MonitoringLevel.INTENSIVE
elif queue_size < self.config.max_queue_size * 0.2:
self.monitor.strategy.level = self.config.monitoring_level
except Exception as e:
logger.error(f"Error optimizing queue: {e}")
async def _update_stats(self) -> None:
"""Update queue statistics"""
try:
self.stats.uptime = datetime.utcnow() - self.stats.start_time
# Update peak values
queue_size = len(await self.state_manager.get_all_items())
self.stats.peak_queue_size = max(
self.stats.peak_queue_size,
queue_size
)
memory_usage = self.metrics_manager.peak_memory_usage
self.stats.peak_memory_usage = max(
self.stats.peak_memory_usage,
memory_usage
)
except Exception as e:
logger.error(f"Error updating stats: {e}")
async def add_to_queue(
self,
@@ -281,176 +299,169 @@ class EnhancedVideoQueueManager:
priority: int = 0,
) -> bool:
"""Add a video to the processing queue"""
if self.coordinator.state in (QueueState.STOPPED, QueueState.ERROR):
raise QueueError("Queue manager is not running")
# Wait if queue is paused
await self.coordinator.wait_if_paused()
try:
item = QueueItem(
url=url,
message_id=message_id,
channel_id=channel_id,
guild_id=guild_id,
author_id=author_id,
added_at=datetime.utcnow(),
priority=priority,
)
success = await self.state_manager.add_item(item)
if success and self.persistence:
await self._persist_state()
return success
except Exception as e:
logger.error(f"Error adding to queue: {e}")
raise QueueError(f"Failed to add to queue: {str(e)}")
def get_queue_status(self, guild_id: int) -> Dict[str, Any]:
"""Get current queue status for a guild"""
try:
status = self.state_manager.get_guild_status(guild_id)
metrics = self.metrics_manager.get_metrics()
monitor_stats = self.monitor.get_monitoring_stats()
return {
**status,
"metrics": metrics,
"monitoring": monitor_stats,
"state": self.coordinator.state.value,
"mode": self.coordinator.mode.value,
"stats": {
"uptime": self.stats.uptime.total_seconds(),
"peak_queue_size": self.stats.peak_queue_size,
"peak_memory_usage": self.stats.peak_memory_usage,
"total_processed": self.stats.total_processed,
"total_failed": self.stats.total_failed
}
}
except Exception as e:
logger.error(f"Error getting queue status: {e}")
return self._get_default_status()
async def pause(self) -> None:
"""Pause queue processing"""
await self.coordinator.pause()
logger.info("Queue processing paused")
async def resume(self) -> None:
"""Resume queue processing"""
await self.coordinator.resume()
logger.info("Queue processing resumed")
async def cleanup(self) -> None:
"""Clean up resources and stop queue processing"""
try:
await self.coordinator.set_state(QueueState.STOPPING)
logger.info("Starting queue manager cleanup...")
# Cancel background tasks
if self._maintenance_task:
self._maintenance_task.cancel()
if self._stats_task:
self._stats_task.cancel()
# Stop processor
await self.processor.stop_processing()
# Stop monitoring and cleanup
await self.monitor.stop()
await self.cleaner.stop()
# Final state persistence
if self.persistence:
await self._persist_state()
# Clear state
await self.state_manager.clear_state()
await self.coordinator.set_state(QueueState.STOPPED)
logger.info("Queue manager cleanup completed")
except Exception as e:
await self.coordinator.set_state(QueueState.ERROR)
logger.error(f"Error during cleanup: {e}")
raise CleanupError(f"Failed to clean up queue manager: {str(e)}")
async def force_stop(self) -> None:
"""Force stop all queue operations immediately"""
await self.coordinator.set_state(QueueState.STOPPING)
logger.info("Force stopping queue manager...")
# Cancel background tasks
if self._maintenance_task:
self._maintenance_task.cancel()
if self._stats_task:
self._stats_task.cancel()
# Force stop all components
await self.processor.stop_processing()
await self.monitor.stop()
await self.cleaner.stop()
# Clear state
await self.state_manager.clear_state()
await self.coordinator.set_state(QueueState.STOPPED)
logger.info("Queue manager force stopped")
async def _persist_state(self) -> None:
"""Persist current state to storage"""
if not self.persistence:
return
try:
state = await self.state_manager.get_state_for_persistence()
state["metrics"] = self.metrics_manager.get_metrics()
state["stats"] = {
"uptime": self.stats.uptime.total_seconds(),
"peak_queue_size": self.stats.peak_queue_size,
"peak_memory_usage": self.stats.peak_memory_usage,
"total_processed": self.stats.total_processed,
"total_failed": self.stats.total_failed
}
await self.persistence.persist_queue_state(state)
except Exception as e:
logger.error(f"Failed to persist state: {e}")
def _get_default_status(self) -> Dict[str, Any]:
"""Get default status when error occurs"""
return {
"pending": 0,
"processing": 0,
"completed": 0,
"failed": 0,
"metrics": {
"total_processed": 0,
"total_failed": 0,
"success_rate": 0.0,
"avg_processing_time": 0.0,
"peak_memory_usage": 0.0,
"last_cleanup": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
"errors_by_type": {},
"compression_failures": 0,
"hardware_accel_failures": 0,
"last_activity": 0,
},
"state": QueueState.ERROR.value,
"mode": QueueMode.NORMAL.value,
"stats": {
"uptime": 0,
"peak_queue_size": 0,
"peak_memory_usage": 0,
"total_processed": 0,
"total_failed": 0
}
}
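
Putting the manager's lifecycle together, a minimal sketch (persistence is disabled to keep it self-contained; the add_to_queue parameter names are inferred from the QueueItem construction above, and the IDs are placeholders):

import asyncio

async def demo() -> None:
    config = QueueConfig(persistence_enabled=False, max_concurrent=2)
    manager = EnhancedVideoQueueManager(config)
    await manager.initialize()
    try:
        # parameter names inferred from the QueueItem fields above
        await manager.add_to_queue(
            url="https://example.com/video.mp4",
            message_id=1,
            channel_id=2,
            guild_id=3,
            author_id=4,
            priority=5,
        )
        status = manager.get_queue_status(guild_id=3)
        print(status["state"], status["mode"])
    finally:
        await manager.cleanup()

asyncio.run(demo())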

View File

@@ -0,0 +1,366 @@
"""Module for managing queue metrics"""
import time
import logging
from enum import Enum
from dataclasses import dataclass, field
from typing import Dict, Optional, List, Any, Set
from datetime import datetime, timedelta
import json
logger = logging.getLogger("QueueMetricsManager")
class MetricCategory(Enum):
"""Categories of metrics"""
PROCESSING = "processing"
PERFORMANCE = "performance"
ERRORS = "errors"
HARDWARE = "hardware"
MEMORY = "memory"
ACTIVITY = "activity"
class ErrorCategory(Enum):
"""Categories of errors"""
NETWORK = "network"
TIMEOUT = "timeout"
PERMISSION = "permission"
MEMORY = "memory"
HARDWARE = "hardware"
COMPRESSION = "compression"
STORAGE = "storage"
OTHER = "other"
@dataclass
class ProcessingMetrics:
"""Processing-related metrics"""
total_processed: int = 0
total_failed: int = 0
success_rate: float = 0.0
avg_processing_time: float = 0.0
_total_processing_time: float = 0.0
_processing_count: int = 0
def update(self, processing_time: float, success: bool) -> None:
"""Update processing metrics"""
self.total_processed += 1
if not success:
self.total_failed += 1
self._total_processing_time += processing_time
self._processing_count += 1
self.success_rate = (
(self.total_processed - self.total_failed)
/ self.total_processed
if self.total_processed > 0
else 0.0
)
self.avg_processing_time = (
self._total_processing_time / self._processing_count
if self._processing_count > 0
else 0.0
)
@dataclass
class ErrorMetrics:
"""Error-related metrics"""
errors_by_type: Dict[str, int] = field(default_factory=dict)
errors_by_category: Dict[ErrorCategory, int] = field(default_factory=dict)
recent_errors: List[Dict[str, Any]] = field(default_factory=list)
error_patterns: Dict[str, int] = field(default_factory=dict)
max_recent_errors: int = 100
def record_error(self, error: str, category: Optional[ErrorCategory] = None) -> None:
"""Record an error occurrence"""
# Track by exact error
self.errors_by_type[error] = self.errors_by_type.get(error, 0) + 1
# Track by category
if category is None:
category = self._categorize_error(error)
self.errors_by_category[category] = self.errors_by_category.get(category, 0) + 1
# Track recent errors
self.recent_errors.append({
"error": error,
"category": category.value,
"timestamp": datetime.utcnow().isoformat()
})
if len(self.recent_errors) > self.max_recent_errors:
self.recent_errors.pop(0)
# Update error patterns
pattern = self._extract_error_pattern(error)
self.error_patterns[pattern] = self.error_patterns.get(pattern, 0) + 1
def _categorize_error(self, error: str) -> ErrorCategory:
"""Categorize an error message"""
error_lower = error.lower()
if any(word in error_lower for word in ["network", "connection", "dns"]):
return ErrorCategory.NETWORK
elif "timeout" in error_lower:
return ErrorCategory.TIMEOUT
elif any(word in error_lower for word in ["permission", "access", "denied"]):
return ErrorCategory.PERMISSION
elif "memory" in error_lower:
return ErrorCategory.MEMORY
elif "hardware" in error_lower:
return ErrorCategory.HARDWARE
elif "compression" in error_lower:
return ErrorCategory.COMPRESSION
elif any(word in error_lower for word in ["disk", "storage", "space"]):
return ErrorCategory.STORAGE
return ErrorCategory.OTHER
def _extract_error_pattern(self, error: str) -> str:
"""Extract general pattern from error message"""
# This could be enhanced with regex or more sophisticated pattern matching
words = error.split()
if len(words) > 5:
return " ".join(words[:5]) + "..."
return error
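For example, the keyword-based categorizer maps messages like these (a sketch using only the dataclass above):

errors = ErrorMetrics()
errors.record_error("Connection reset by peer during download")
errors.record_error("Processing timeout")
print(errors.errors_by_category[ErrorCategory.NETWORK])  # 1 ("connection" matched)
print(errors.errors_by_category[ErrorCategory.TIMEOUT])  # 1 ("timeout" matched)
print(len(errors.recent_errors))  # 2, each entry carries category and timestamp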
@dataclass
class PerformanceMetrics:
"""Performance-related metrics"""
peak_memory_usage: float = 0.0
compression_failures: int = 0
hardware_accel_failures: int = 0
peak_queue_size: int = 0
peak_processing_time: float = 0.0
avg_queue_wait_time: float = 0.0
_total_wait_time: float = 0.0
_wait_count: int = 0
def update_memory(self, memory_usage: float) -> None:
"""Update memory usage metrics"""
self.peak_memory_usage = max(self.peak_memory_usage, memory_usage)
def record_wait_time(self, wait_time: float) -> None:
"""Record queue wait time"""
self._total_wait_time += wait_time
self._wait_count += 1
self.avg_queue_wait_time = (
self._total_wait_time / self._wait_count
if self._wait_count > 0
else 0.0
)
class MetricAggregator:
"""Aggregates metrics over time periods"""
def __init__(self, max_history: int = 1000):
self.max_history = max_history
self.hourly_metrics: List[Dict[str, Any]] = []
self.daily_metrics: List[Dict[str, Any]] = []
self.last_aggregation = datetime.utcnow()
def aggregate_metrics(self, current_metrics: Dict[str, Any]) -> None:
"""Aggregate current metrics"""
now = datetime.utcnow()
# Hourly aggregation
if now - self.last_aggregation >= timedelta(hours=1):
self.hourly_metrics.append({
"timestamp": now.isoformat(),
"metrics": current_metrics
})
if len(self.hourly_metrics) > self.max_history:
self.hourly_metrics.pop(0)
# Daily aggregation
if now.date() > self.last_aggregation.date():
daily_avg = self._calculate_daily_average(
self.hourly_metrics,
self.last_aggregation.date()
)
self.daily_metrics.append(daily_avg)
if len(self.daily_metrics) > 30: # Keep last 30 days
self.daily_metrics.pop(0)
self.last_aggregation = now
def _calculate_daily_average(
self,
metrics: List[Dict[str, Any]],
date: datetime.date
) -> Dict[str, Any]:
"""Calculate average metrics for a day"""
day_metrics = [
m for m in metrics
if datetime.fromisoformat(m["timestamp"]).date() == date
]
if not day_metrics:
return {
"date": date.isoformat(),
"metrics": {}
}
# Calculate averages for numeric values
avg_metrics = {}
for key in day_metrics[0]["metrics"].keys():
if isinstance(day_metrics[0]["metrics"][key], (int, float)):
avg_metrics[key] = sum(
m["metrics"][key] for m in day_metrics
) / len(day_metrics)
else:
avg_metrics[key] = day_metrics[-1]["metrics"][key]
return {
"date": date.isoformat(),
"metrics": avg_metrics
}
class QueueMetricsManager:
"""Manages metrics collection and reporting for the queue system"""
def __init__(self):
self.processing = ProcessingMetrics()
self.errors = ErrorMetrics()
self.performance = PerformanceMetrics()
self.aggregator = MetricAggregator()
self.last_activity = time.time()
self.last_cleanup = datetime.utcnow()
def update(
self,
processing_time: float,
success: bool,
error: Optional[str] = None
) -> None:
"""Update metrics with new processing information"""
try:
# Update processing metrics
self.processing.update(processing_time, success)
# Update error tracking
if error:
self.errors.record_error(error)
# Track specific failures
if "hardware acceleration" in error.lower():
self.performance.hardware_accel_failures += 1
elif "compression" in error.lower():
self.performance.compression_failures += 1
# Update activity timestamp
self.last_activity = time.time()
# Aggregate metrics
self.aggregator.aggregate_metrics(self.get_metrics())
except Exception as e:
logger.error(f"Error updating metrics: {e}")
def get_metrics(self) -> Dict[str, Any]:
"""Get current metrics"""
return {
MetricCategory.PROCESSING.value: {
"total_processed": self.processing.total_processed,
"total_failed": self.processing.total_failed,
"success_rate": self.processing.success_rate,
"avg_processing_time": self.processing.avg_processing_time
},
MetricCategory.ERRORS.value: {
"errors_by_type": self.errors.errors_by_type,
"errors_by_category": {
cat.value: count
for cat, count in self.errors.errors_by_category.items()
},
"error_patterns": self.errors.error_patterns,
"recent_errors": self.errors.recent_errors
},
MetricCategory.PERFORMANCE.value: {
"peak_memory_usage": self.performance.peak_memory_usage,
"compression_failures": self.performance.compression_failures,
"hardware_accel_failures": self.performance.hardware_accel_failures,
"peak_queue_size": self.performance.peak_queue_size,
"avg_queue_wait_time": self.performance.avg_queue_wait_time
},
MetricCategory.ACTIVITY.value: {
"last_activity": time.time() - self.last_activity,
"last_cleanup": self.last_cleanup.isoformat()
},
"history": {
"hourly": self.aggregator.hourly_metrics,
"daily": self.aggregator.daily_metrics
}
}
def update_memory_usage(self, memory_usage: float) -> None:
"""Update peak memory usage"""
self.performance.update_memory(memory_usage)
def update_cleanup_time(self) -> None:
"""Update last cleanup timestamp"""
self.last_cleanup = datetime.utcnow()
def reset_metrics(self) -> None:
"""Reset all metrics to initial state"""
self.processing = ProcessingMetrics()
self.errors = ErrorMetrics()
self.performance = PerformanceMetrics()
self.last_activity = time.time()
self.last_cleanup = datetime.utcnow()
def save_metrics(self, file_path: str) -> None:
"""Save metrics to file"""
try:
metrics = self.get_metrics()
with open(file_path, 'w') as f:
json.dump(metrics, f, indent=2)
except Exception as e:
logger.error(f"Error saving metrics: {e}")
def load_metrics(self, file_path: str) -> None:
"""Load metrics from file"""
try:
with open(file_path, 'r') as f:
metrics = json.load(f)
self.restore_metrics(metrics)
except Exception as e:
logger.error(f"Error loading metrics: {e}")
def restore_metrics(self, metrics_data: Dict[str, Any]) -> None:
"""Restore metrics from saved data"""
try:
# Restore processing metrics
proc_data = metrics_data.get(MetricCategory.PROCESSING.value, {})
self.processing = ProcessingMetrics(
total_processed=proc_data.get("total_processed", 0),
total_failed=proc_data.get("total_failed", 0),
success_rate=proc_data.get("success_rate", 0.0),
avg_processing_time=proc_data.get("avg_processing_time", 0.0)
)
# Restore error metrics
error_data = metrics_data.get(MetricCategory.ERRORS.value, {})
self.errors = ErrorMetrics(
errors_by_type=error_data.get("errors_by_type", {}),
errors_by_category={
ErrorCategory[k.upper()]: v
for k, v in error_data.get("errors_by_category", {}).items()
},
error_patterns=error_data.get("error_patterns", {}),
recent_errors=error_data.get("recent_errors", [])
)
# Restore performance metrics
perf_data = metrics_data.get(MetricCategory.PERFORMANCE.value, {})
self.performance = PerformanceMetrics(
peak_memory_usage=perf_data.get("peak_memory_usage", 0.0),
compression_failures=perf_data.get("compression_failures", 0),
hardware_accel_failures=perf_data.get("hardware_accel_failures", 0),
peak_queue_size=perf_data.get("peak_queue_size", 0),
avg_queue_wait_time=perf_data.get("avg_queue_wait_time", 0.0)
)
# Restore history
history = metrics_data.get("history", {})
self.aggregator.hourly_metrics = history.get("hourly", [])
self.aggregator.daily_metrics = history.get("daily", [])
except Exception as e:
logger.error(f"Error restoring metrics: {e}")

View File

@@ -2,221 +2,365 @@
import asyncio
import logging
import psutil
import time
from enum import Enum
from dataclasses import dataclass, field
from typing import Optional, Dict, Any, List, Set
from datetime import datetime, timedelta
from .health_checker import HealthChecker, HealthStatus, HealthCategory
from .recovery_manager import RecoveryManager, RecoveryStrategy
logger = logging.getLogger("QueueMonitoring")
class MonitoringLevel(Enum):
"""Monitoring intensity levels"""
LIGHT = "light" # Basic monitoring
NORMAL = "normal" # Standard monitoring
INTENSIVE = "intensive" # Detailed monitoring
DEBUG = "debug" # Debug-level monitoring
class AlertSeverity(Enum):
"""Alert severity levels"""
INFO = "info"
WARNING = "warning"
ERROR = "error"
CRITICAL = "critical"
@dataclass
class MonitoringEvent:
"""Represents a monitoring event"""
timestamp: datetime
category: HealthCategory
severity: AlertSeverity
message: str
details: Dict[str, Any] = field(default_factory=dict)
resolved: bool = False
resolution_time: Optional[datetime] = None
@dataclass
class MonitoringThresholds:
"""Monitoring thresholds configuration"""
check_interval: int = 15 # 15 seconds
deadlock_threshold: int = 60 # 1 minute
memory_threshold: int = 512 # 512MB
max_retries: int = 3
alert_threshold: int = 5 # Max alerts before escalation
recovery_timeout: int = 300 # 5 minutes
intensive_threshold: int = 0.8 # 80% resource usage triggers intensive
class AlertManager:
"""Manages monitoring alerts"""
def __init__(self, max_history: int = 1000):
self.max_history = max_history
self.active_alerts: Dict[str, MonitoringEvent] = {}
self.alert_history: List[MonitoringEvent] = []
self.alert_counts: Dict[AlertSeverity, int] = {
severity: 0 for severity in AlertSeverity
}
def create_alert(
self,
category: HealthCategory,
severity: AlertSeverity,
message: str,
details: Dict[str, Any] = None
) -> MonitoringEvent:
"""Create a new alert"""
event = MonitoringEvent(
timestamp=datetime.utcnow(),
category=category,
severity=severity,
message=message,
details=details or {}
)
alert_id = f"{category.value}_{event.timestamp.timestamp()}"
self.active_alerts[alert_id] = event
self.alert_counts[severity] += 1
self.alert_history.append(event)
if len(self.alert_history) > self.max_history:
self.alert_history.pop(0)
return event
def resolve_alert(self, alert_id: str) -> None:
"""Mark an alert as resolved"""
if alert_id in self.active_alerts:
event = self.active_alerts[alert_id]
event.resolved = True
event.resolution_time = datetime.utcnow()
self.active_alerts.pop(alert_id)
def get_active_alerts(self) -> List[MonitoringEvent]:
"""Get currently active alerts"""
return list(self.active_alerts.values())
def get_alert_stats(self) -> Dict[str, Any]:
"""Get alert statistics"""
return {
"active_alerts": len(self.active_alerts),
"total_alerts": len(self.alert_history),
"alert_counts": {
severity.value: count
for severity, count in self.alert_counts.items()
},
"recent_alerts": [
{
"timestamp": event.timestamp.isoformat(),
"category": event.category.value,
"severity": event.severity.value,
"message": event.message,
"resolved": event.resolved
}
for event in self.alert_history[-10:] # Last 10 alerts
]
}
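For instance, creating and resolving an alert (the alert id mirrors the format built inside create_alert above):

alerts = AlertManager()
event = alerts.create_alert(
    category=HealthCategory.MEMORY,
    severity=AlertSeverity.WARNING,
    message="High memory usage: 600.0MB",
    details={"memory_usage": 600.0},
)
alert_id = f"{event.category.value}_{event.timestamp.timestamp()}"
print(len(alerts.get_active_alerts()))  # 1
alerts.resolve_alert(alert_id)
print(alerts.get_alert_stats()["active_alerts"])  # 0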
class MonitoringStrategy:
"""Determines monitoring behavior"""
def __init__(
self,
level: MonitoringLevel = MonitoringLevel.NORMAL,
thresholds: Optional[MonitoringThresholds] = None
):
self.level = level
self.thresholds = thresholds or MonitoringThresholds()
self._last_intensive_check = datetime.utcnow()
def should_check_health(self, metrics: Dict[str, Any]) -> bool:
"""Determine if health check should be performed"""
if self.level == MonitoringLevel.INTENSIVE:
return True
elif self.level == MonitoringLevel.LIGHT:
return metrics.get("queue_size", 0) > 0
else: # NORMAL or DEBUG
return True
def get_check_interval(self) -> float:
"""Get the current check interval"""
if self.level == MonitoringLevel.INTENSIVE:
return self.thresholds.check_interval / 2
elif self.level == MonitoringLevel.LIGHT:
return self.thresholds.check_interval * 2
else: # NORMAL or DEBUG
return self.thresholds.check_interval
def should_escalate(self, alert_count: int) -> bool:
"""Determine if monitoring should be escalated"""
return (
self.level != MonitoringLevel.INTENSIVE and
alert_count >= self.thresholds.alert_threshold
)
def should_deescalate(self, alert_count: int) -> bool:
"""Determine if monitoring can be deescalated"""
return (
self.level == MonitoringLevel.INTENSIVE and
alert_count == 0 and
(datetime.utcnow() - self._last_intensive_check).total_seconds() > 300
)
class QueueMonitor:
"""Monitors queue health and performance"""
def __init__(
self,
strategy: Optional[MonitoringStrategy] = None,
thresholds: Optional[MonitoringThresholds] = None
):
self.strategy = strategy or MonitoringStrategy()
self.thresholds = thresholds or MonitoringThresholds()
# Initialize components
self.health_checker = HealthChecker(
memory_threshold=self.thresholds.memory_threshold,
deadlock_threshold=self.thresholds.deadlock_threshold
)
self.recovery_manager = RecoveryManager(max_retries=self.thresholds.max_retries)
self.alert_manager = AlertManager()
self._shutdown = False
self._last_active_time = time.time()
self._monitoring_task: Optional[asyncio.Task] = None
async def start(self, state_manager, metrics_manager) -> None:
"""Start monitoring queue health"""
if self._monitoring_task is not None:
logger.warning("Monitoring task already running")
return
logger.info(f"Starting queue monitoring with level: {self.strategy.level.value}")
self._monitoring_task = asyncio.create_task(
self._monitor_loop(state_manager, metrics_manager)
)
async def _monitor_loop(self, state_manager, metrics_manager) -> None:
"""Main monitoring loop"""
while not self._shutdown:
try:
# Get current metrics
metrics = metrics_manager.get_metrics()
# Check if health check should be performed
if self.strategy.should_check_health(metrics):
await self._perform_health_check(
state_manager,
metrics_manager,
metrics
)
# Check for strategy adjustment
self._adjust_monitoring_strategy(metrics)
# Wait for next check
await asyncio.sleep(self.strategy.get_check_interval())
except asyncio.CancelledError:
logger.info("Queue monitoring cancelled")
break
except Exception as e:
logger.error(f"Error in monitoring loop: {str(e)}")
await asyncio.sleep(1)
async def stop(self) -> None:
"""Stop the monitoring process"""
logger.info("Stopping queue monitoring...")
self._shutdown = True
if self._monitoring_task and not self._monitoring_task.done():
self._monitoring_task.cancel()
try:
await self._monitoring_task
except asyncio.CancelledError:
pass
self._monitoring_task = None
def update_activity(self) -> None:
"""Update the last active time"""
self._last_active_time = time.time()
async def _perform_health_check(
self,
state_manager,
metrics_manager,
current_metrics: Dict[str, Any]
) -> None:
"""Perform health check and recovery if needed"""
try:
# Check memory usage
memory_usage, is_critical = await self.health_checker.check_memory_usage()
metrics_manager.update_memory_usage(memory_usage)
if is_critical:
self.alert_manager.create_alert(
category=HealthCategory.MEMORY,
severity=AlertSeverity.CRITICAL,
message=f"Critical memory usage: {memory_usage:.1f}MB",
details={"memory_usage": memory_usage}
)
# Get current queue state
queue_stats = await state_manager.get_queue_stats()
processing_items = await state_manager.get_all_processing_items()
# Check for stuck items
stuck_items = []
for item in processing_items:
if self.recovery_manager.should_recover_item(item):
stuck_items.append((item.url, item))
# Handle stuck items if found
if stuck_items:
self.alert_manager.create_alert(
category=HealthCategory.DEADLOCKS,
severity=AlertSeverity.WARNING,
message=f"Potential deadlock: {len(stuck_items)} items stuck",
details={"stuck_items": [item[0] for item in stuck_items]}
)
await self.recovery_manager.recover_stuck_items(
stuck_items,
state_manager,
metrics_manager
)
# Check overall queue activity
if processing_items and self.health_checker.check_queue_activity(
self._last_active_time,
bool(processing_items)
):
self.alert_manager.create_alert(
category=HealthCategory.ACTIVITY,
severity=AlertSeverity.ERROR,
message="Queue appears to be hung",
details={"last_active": self._last_active_time}
)
await self.recovery_manager.perform_emergency_recovery(
state_manager,
metrics_manager
)
self.update_activity()
# Check error rates
error_rate = current_metrics.get("error_rate", 0)
if error_rate > 0.2:  # 20% error rate
self.alert_manager.create_alert(
category=HealthCategory.ERRORS,
severity=AlertSeverity.ERROR,
message=f"High error rate: {error_rate:.1%}",
details={"error_rate": error_rate}
)
# Log health report
if self.strategy.level in (MonitoringLevel.INTENSIVE, MonitoringLevel.DEBUG):
health_report = self.health_checker.format_health_report(
memory_usage=memory_usage,
queue_size=queue_stats["queue_size"],
processing_count=queue_stats["processing_count"],
success_rate=metrics_manager.success_rate,
avg_processing_time=metrics_manager.avg_processing_time,
peak_memory=metrics_manager.peak_memory_usage,
error_distribution=metrics_manager.errors_by_type,
last_activity_delta=time.time() - self._last_active_time
)
logger.info(health_report)
except Exception as e:
logger.error(f"Error performing health check: {str(e)}")
self.alert_manager.create_alert(
category=HealthCategory.SYSTEM,
severity=AlertSeverity.ERROR,
message=f"Health check error: {str(e)}"
)
def _adjust_monitoring_strategy(self, metrics: Dict[str, Any]) -> None:
"""Adjust monitoring strategy based on current state"""
active_alerts = self.alert_manager.get_active_alerts()
# Check for escalation
if self.strategy.should_escalate(len(active_alerts)):
logger.warning("Escalating to intensive monitoring")
self.strategy.level = MonitoringLevel.INTENSIVE
self.strategy._last_intensive_check = datetime.utcnow()
# Check for de-escalation
elif self.strategy.should_deescalate(len(active_alerts)):
logger.info("De-escalating to normal monitoring")
self.strategy.level = MonitoringLevel.NORMAL
def get_monitoring_stats(self) -> Dict[str, Any]:
"""Get comprehensive monitoring statistics"""
return {
"monitoring_level": self.strategy.level.value,
"last_active": self._last_active_time,
"alerts": self.alert_manager.get_alert_stats(),
"recovery": self.recovery_manager.get_recovery_stats(),
"health": self.health_checker.get_health_stats()
}
class MonitoringError(Exception):
"""Base exception for monitoring-related errors"""

View File

@@ -0,0 +1,351 @@
"""Module for processing queue items"""
import asyncio
import logging
import time
from enum import Enum
from dataclasses import dataclass
from typing import Callable, Optional, Tuple, List, Set, Dict, Any
from datetime import datetime, timedelta
from .models import QueueItem
from .state_manager import QueueStateManager, ItemState
from .monitoring import QueueMonitor
logger = logging.getLogger("QueueProcessor")
class ProcessingStrategy(Enum):
"""Processing strategies"""
SEQUENTIAL = "sequential" # Process items one at a time
CONCURRENT = "concurrent" # Process multiple items concurrently
BATCHED = "batched" # Process items in batches
PRIORITY = "priority" # Process based on priority
@dataclass
class ProcessingMetrics:
"""Metrics for processing operations"""
total_processed: int = 0
successful: int = 0
failed: int = 0
retried: int = 0
avg_processing_time: float = 0.0
peak_concurrent_tasks: int = 0
last_processed: Optional[datetime] = None
    error_counts: Dict[str, int] = field(default_factory=dict)
def record_success(self, processing_time: float) -> None:
"""Record successful processing"""
self.total_processed += 1
self.successful += 1
self._update_avg_time(processing_time)
self.last_processed = datetime.utcnow()
def record_failure(self, error: str) -> None:
"""Record processing failure"""
self.total_processed += 1
self.failed += 1
self.error_counts[error] = self.error_counts.get(error, 0) + 1
self.last_processed = datetime.utcnow()
def record_retry(self) -> None:
"""Record processing retry"""
self.retried += 1
def _update_avg_time(self, new_time: float) -> None:
"""Update average processing time"""
if self.total_processed == 1:
self.avg_processing_time = new_time
else:
self.avg_processing_time = (
(self.avg_processing_time * (self.total_processed - 1) + new_time)
/ self.total_processed
)
def get_stats(self) -> Dict[str, Any]:
"""Get processing statistics"""
return {
"total_processed": self.total_processed,
"successful": self.successful,
"failed": self.failed,
"retried": self.retried,
"success_rate": (
self.successful / self.total_processed
if self.total_processed > 0
else 0
),
"avg_processing_time": self.avg_processing_time,
"peak_concurrent_tasks": self.peak_concurrent_tasks,
"last_processed": (
self.last_processed.isoformat()
if self.last_processed
else None
),
"error_distribution": self.error_counts
}
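# A minimal sketch of the metrics lifecycle; the timings are illustrative.
def _example_metrics_usage() -> None:
    metrics = ProcessingMetrics()
    metrics.record_success(2.0)        # running average becomes 2.0
    metrics.record_success(4.0)        # (2.0 * 1 + 4.0) / 2 == 3.0
    metrics.record_failure("timeout")  # success_rate drops to 2/3
    assert abs(metrics.get_stats()["success_rate"] - 2 / 3) < 1e-9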
class BatchManager:
"""Manages processing batches"""
def __init__(
self,
batch_size: int,
max_concurrent: int,
timeout: float = 30.0
):
self.batch_size = batch_size
self.max_concurrent = max_concurrent
self.timeout = timeout
self.current_batch: List[QueueItem] = []
self.processing_start: Optional[datetime] = None
async def process_batch(
self,
items: List[QueueItem],
        processor: Callable[[QueueItem], Awaitable[Tuple[bool, Optional[str]]]]
) -> List[Tuple[QueueItem, bool, Optional[str]]]:
"""Process a batch of items"""
self.current_batch = items
self.processing_start = datetime.utcnow()
tasks = [
asyncio.create_task(self._process_item(processor, item))
for item in items
]
try:
results = await asyncio.gather(*tasks, return_exceptions=True)
return [
(item, *self._handle_result(result))
for item, result in zip(items, results)
]
finally:
self.current_batch = []
self.processing_start = None
async def _process_item(
self,
        processor: Callable[[QueueItem], Awaitable[Tuple[bool, Optional[str]]]],
item: QueueItem
) -> Tuple[bool, Optional[str]]:
"""Process a single item with timeout"""
try:
return await asyncio.wait_for(
processor(item),
timeout=self.timeout
)
except asyncio.TimeoutError:
return False, "Processing timeout"
except Exception as e:
return False, str(e)
def _handle_result(
self,
result: Any
) -> Tuple[bool, Optional[str]]:
"""Handle processing result"""
if isinstance(result, tuple) and len(result) == 2:
return result
if isinstance(result, Exception):
return False, str(result)
return False, "Unknown error"
def get_batch_status(self) -> Dict[str, Any]:
"""Get current batch status"""
return {
"batch_size": len(self.current_batch),
"processing_time": (
(datetime.utcnow() - self.processing_start).total_seconds()
if self.processing_start
else 0
),
"items": [item.url for item in self.current_batch]
}
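# A sketch of driving one batch directly, assuming `items` were already
# dequeued elsewhere; the processor coroutine is a stand-in that matches
# the (success, error) contract used above.
async def _example_batch_run(items: List[QueueItem]) -> None:
    async def noop_processor(item: QueueItem) -> Tuple[bool, Optional[str]]:
        return True, None  # hypothetical: succeed without doing any work

    manager = BatchManager(batch_size=5, max_concurrent=3, timeout=10.0)
    for item, success, error in await manager.process_batch(items, noop_processor):
        logger.info(f"{item.url}: success={success} error={error}")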
class QueueProcessor:
"""Handles the processing of queue items"""
def __init__(
self,
state_manager: QueueStateManager,
monitor: QueueMonitor,
strategy: ProcessingStrategy = ProcessingStrategy.CONCURRENT,
max_retries: int = 3,
retry_delay: int = 5,
batch_size: int = 5,
max_concurrent: int = 3
):
self.state_manager = state_manager
self.monitor = monitor
self.strategy = strategy
self.max_retries = max_retries
self.retry_delay = retry_delay
self.batch_manager = BatchManager(batch_size, max_concurrent)
self.metrics = ProcessingMetrics()
self._shutdown = False
self._active_tasks: Set[asyncio.Task] = set()
self._processing_lock = asyncio.Lock()
async def start_processing(
self,
        processor: Callable[[QueueItem], Awaitable[Tuple[bool, Optional[str]]]]
) -> None:
"""Start processing items in the queue"""
logger.info(f"Queue processor started with strategy: {self.strategy.value}")
while not self._shutdown:
try:
if self.strategy == ProcessingStrategy.BATCHED:
await self._process_batch(processor)
elif self.strategy == ProcessingStrategy.CONCURRENT:
await self._process_concurrent(processor)
else: # SEQUENTIAL or PRIORITY
await self._process_sequential(processor)
except asyncio.CancelledError:
logger.info("Queue processing cancelled")
break
except Exception as e:
logger.error(f"Critical error in queue processor: {e}")
await asyncio.sleep(1) # Delay before retry
await asyncio.sleep(0)
async def _process_batch(
self,
        processor: Callable[[QueueItem], Awaitable[Tuple[bool, Optional[str]]]]
) -> None:
"""Process items in batches"""
items = await self.state_manager.get_next_items(self.batch_manager.batch_size)
if not items:
await asyncio.sleep(0.1)
return
start_time = time.time()
results = await self.batch_manager.process_batch(items, processor)
for item, success, error in results:
await self._handle_result(
item,
success,
error,
time.time() - start_time
)
async def _process_concurrent(
self,
        processor: Callable[[QueueItem], Awaitable[Tuple[bool, Optional[str]]]]
) -> None:
"""Process items concurrently"""
if len(self._active_tasks) >= self.batch_manager.max_concurrent:
await asyncio.sleep(0.1)
return
        items = await self.state_manager.get_next_items(
            self.batch_manager.max_concurrent - len(self._active_tasks)
        )
        if not items:
            await asyncio.sleep(0.1)
            return
for item in items:
task = asyncio.create_task(self._process_item(processor, item))
self._active_tasks.add(task)
task.add_done_callback(self._active_tasks.discard)
self.metrics.peak_concurrent_tasks = max(
self.metrics.peak_concurrent_tasks,
len(self._active_tasks)
)
async def _process_sequential(
self,
        processor: Callable[[QueueItem], Awaitable[Tuple[bool, Optional[str]]]]
) -> None:
"""Process items sequentially"""
items = await self.state_manager.get_next_items(1)
if not items:
await asyncio.sleep(0.1)
return
await self._process_item(processor, items[0])
async def _process_item(
self,
        processor: Callable[[QueueItem], Awaitable[Tuple[bool, Optional[str]]]],
item: QueueItem
) -> None:
"""Process a single queue item"""
try:
logger.info(f"Processing queue item: {item.url}")
start_time = time.time()
async with self._processing_lock:
item.start_processing()
self.monitor.update_activity()
success, error = await processor(item)
processing_time = time.time() - start_time
await self._handle_result(item, success, error, processing_time)
except Exception as e:
logger.error(f"Error processing {item.url}: {e}")
await self._handle_result(item, False, str(e), 0)
async def _handle_result(
self,
item: QueueItem,
success: bool,
error: Optional[str],
processing_time: float
) -> None:
"""Handle processing result"""
item.finish_processing(success, error)
if success:
await self.state_manager.mark_completed(item, True)
self.metrics.record_success(processing_time)
logger.info(f"Successfully processed: {item.url}")
else:
if item.retry_count < self.max_retries:
item.retry_count += 1
await self.state_manager.retry_item(item)
self.metrics.record_retry()
logger.warning(f"Retrying: {item.url} (attempt {item.retry_count})")
await asyncio.sleep(self.retry_delay)
else:
await self.state_manager.mark_completed(item, False, error)
self.metrics.record_failure(error or "Unknown error")
logger.error(f"Failed after {self.max_retries} attempts: {item.url}")
async def stop_processing(self) -> None:
"""Stop processing queue items"""
self._shutdown = True
# Cancel all active tasks
for task in self._active_tasks:
if not task.done():
task.cancel()
# Wait for tasks to complete
if self._active_tasks:
await asyncio.gather(*self._active_tasks, return_exceptions=True)
self._active_tasks.clear()
logger.info("Queue processor stopped")
def is_processing(self) -> bool:
"""Check if the processor is currently processing items"""
return bool(self._active_tasks)
def get_processor_stats(self) -> Dict[str, Any]:
"""Get processor statistics"""
return {
"strategy": self.strategy.value,
"active_tasks": len(self._active_tasks),
"metrics": self.metrics.get_stats(),
"batch_status": self.batch_manager.get_batch_status(),
"is_processing": self.is_processing()
}
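# A wiring sketch, assuming a QueueStateManager and QueueMonitor have been
# constructed as in the sibling modules; the download coroutine is a
# hypothetical stand-in for the real work.
async def _example_processor_run(
    state_manager: QueueStateManager,
    monitor: QueueMonitor
) -> None:
    async def download(item: QueueItem) -> Tuple[bool, Optional[str]]:
        return True, None  # placeholder result

    processor = QueueProcessor(state_manager, monitor, strategy=ProcessingStrategy.BATCHED)
    task = asyncio.create_task(processor.start_processing(download))
    await asyncio.sleep(5)  # let the loop drain for a while
    await processor.stop_processing()
    await task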

View File

@@ -0,0 +1,359 @@
"""Module for handling queue item recovery operations"""
import asyncio
import logging
from enum import Enum
from dataclasses import dataclass, field
from typing import List, Tuple, Dict, Optional, Any, Set
from datetime import datetime, timedelta
from .models import QueueItem
logger = logging.getLogger("QueueRecoveryManager")
class RecoveryStrategy(Enum):
"""Recovery strategies"""
RETRY = "retry" # Retry the item
FAIL = "fail" # Mark as failed
REQUEUE = "requeue" # Add back to queue
EMERGENCY = "emergency" # Emergency recovery
class RecoveryPolicy(Enum):
"""Recovery policies"""
AGGRESSIVE = "aggressive" # Recover quickly, more retries
CONSERVATIVE = "conservative" # Recover slowly, fewer retries
BALANCED = "balanced" # Balance between speed and reliability
@dataclass
class RecoveryThresholds:
"""Thresholds for recovery operations"""
max_retries: int = 3
deadlock_threshold: int = 300 # 5 minutes
emergency_threshold: int = 600 # 10 minutes
backoff_base: int = 5 # Base delay for exponential backoff
max_concurrent_recoveries: int = 5
@dataclass
class RecoveryResult:
"""Result of a recovery operation"""
item_url: str
strategy: RecoveryStrategy
success: bool
error: Optional[str] = None
retry_count: int = 0
timestamp: datetime = field(default_factory=datetime.utcnow)
class RecoveryTracker:
"""Tracks recovery operations"""
def __init__(self, max_history: int = 1000):
self.max_history = max_history
self.history: List[RecoveryResult] = []
self.active_recoveries: Set[str] = set()
self.recovery_counts: Dict[str, int] = {}
self.success_counts: Dict[str, int] = {}
self.error_counts: Dict[str, int] = {}
def record_recovery(self, result: RecoveryResult) -> None:
"""Record a recovery operation"""
self.history.append(result)
if len(self.history) > self.max_history:
self.history.pop(0)
self.recovery_counts[result.item_url] = (
self.recovery_counts.get(result.item_url, 0) + 1
)
if result.success:
self.success_counts[result.item_url] = (
self.success_counts.get(result.item_url, 0) + 1
)
else:
self.error_counts[result.item_url] = (
self.error_counts.get(result.item_url, 0) + 1
)
def start_recovery(self, url: str) -> None:
"""Start tracking a recovery operation"""
self.active_recoveries.add(url)
def end_recovery(self, url: str) -> None:
"""End tracking a recovery operation"""
self.active_recoveries.discard(url)
def get_stats(self) -> Dict[str, Any]:
"""Get recovery statistics"""
return {
"total_recoveries": len(self.history),
"active_recoveries": len(self.active_recoveries),
"success_rate": (
sum(self.success_counts.values()) /
len(self.history) if self.history else 0
),
"recovery_counts": self.recovery_counts.copy(),
"error_counts": self.error_counts.copy(),
"recent_recoveries": [
{
"url": r.item_url,
"strategy": r.strategy.value,
"success": r.success,
"error": r.error,
"timestamp": r.timestamp.isoformat()
}
for r in self.history[-10:] # Last 10 recoveries
]
}
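# A small bookkeeping sketch; the URL is hypothetical.
def _example_tracker_usage() -> None:
    tracker = RecoveryTracker()
    url = "https://example.com/video.mp4"
    tracker.start_recovery(url)
    tracker.record_recovery(RecoveryResult(
        item_url=url,
        strategy=RecoveryStrategy.RETRY,
        success=True
    ))
    tracker.end_recovery(url)
    assert tracker.get_stats()["total_recoveries"] == 1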
class RecoveryManager:
"""Handles recovery of stuck or failed queue items"""
def __init__(
self,
thresholds: Optional[RecoveryThresholds] = None,
policy: RecoveryPolicy = RecoveryPolicy.BALANCED
):
self.thresholds = thresholds or RecoveryThresholds()
self.policy = policy
self.tracker = RecoveryTracker()
self._recovery_lock = asyncio.Lock()
async def recover_stuck_items(
self,
stuck_items: List[Tuple[str, QueueItem]],
state_manager,
metrics_manager
) -> Tuple[int, int]:
"""Recover stuck items"""
recovered = 0
failed = 0
try:
async with self._recovery_lock:
for url, item in stuck_items:
                    while len(self.tracker.active_recoveries) >= self.thresholds.max_concurrent_recoveries:
                        # Wait for a slot instead of silently skipping the item
                        logger.warning("Max concurrent recoveries reached, waiting...")
                        await asyncio.sleep(1)
try:
self.tracker.start_recovery(url)
strategy = self._determine_strategy(item)
success = await self._execute_recovery(
url,
item,
strategy,
state_manager,
metrics_manager
)
if success:
recovered += 1
else:
failed += 1
except Exception as e:
logger.error(f"Error recovering item {url}: {str(e)}")
failed += 1
finally:
self.tracker.end_recovery(url)
logger.info(f"Recovery complete - Recovered: {recovered}, Failed: {failed}")
return recovered, failed
except Exception as e:
logger.error(f"Error in recovery process: {str(e)}")
return 0, len(stuck_items)
def _determine_strategy(self, item: QueueItem) -> RecoveryStrategy:
"""Determine recovery strategy based on item state"""
if item.retry_count >= self.thresholds.max_retries:
return RecoveryStrategy.FAIL
processing_time = (
datetime.utcnow().timestamp() - item.start_time
if item.start_time
else 0
)
if processing_time > self.thresholds.emergency_threshold:
return RecoveryStrategy.EMERGENCY
elif self.policy == RecoveryPolicy.AGGRESSIVE:
return RecoveryStrategy.RETRY
elif self.policy == RecoveryPolicy.CONSERVATIVE:
return RecoveryStrategy.REQUEUE
else: # BALANCED
return (
RecoveryStrategy.RETRY
if item.retry_count < self.thresholds.max_retries // 2
else RecoveryStrategy.REQUEUE
)
async def _execute_recovery(
self,
url: str,
item: QueueItem,
strategy: RecoveryStrategy,
state_manager,
metrics_manager
) -> bool:
"""Execute recovery strategy"""
try:
if strategy == RecoveryStrategy.FAIL:
await self._handle_failed_item(url, item, state_manager, metrics_manager)
success = False
elif strategy == RecoveryStrategy.RETRY:
await self._handle_retry_item(url, item, state_manager)
success = True
elif strategy == RecoveryStrategy.REQUEUE:
await self._handle_requeue_item(url, item, state_manager)
success = True
else: # EMERGENCY
await self._handle_emergency_recovery(url, item, state_manager, metrics_manager)
success = True
self.tracker.record_recovery(RecoveryResult(
item_url=url,
strategy=strategy,
success=success,
retry_count=item.retry_count
))
return success
except Exception as e:
self.tracker.record_recovery(RecoveryResult(
item_url=url,
strategy=strategy,
success=False,
error=str(e),
retry_count=item.retry_count
))
raise
async def _handle_failed_item(
self,
url: str,
item: QueueItem,
state_manager,
metrics_manager
) -> None:
"""Handle an item that has exceeded retry attempts"""
logger.warning(f"Moving stuck item to failed: {url}")
item.status = "failed"
item.error = "Exceeded maximum retries after being stuck"
item.last_error = item.error
item.last_error_time = datetime.utcnow()
await state_manager.mark_completed(item, False, item.error)
metrics_manager.update(
processing_time=item.processing_time or 0,
success=False,
error=item.error
)
async def _handle_retry_item(
self,
url: str,
item: QueueItem,
state_manager
) -> None:
"""Handle an item that will be retried"""
logger.info(f"Recovering stuck item for retry: {url}")
item.retry_count += 1
item.start_time = None
item.processing_time = 0
item.last_retry = datetime.utcnow()
item.status = "pending"
item.priority = max(0, item.priority - 2)
await state_manager.retry_item(item)
async def _handle_requeue_item(
self,
url: str,
item: QueueItem,
state_manager
) -> None:
"""Handle an item that will be requeued"""
logger.info(f"Requeuing stuck item: {url}")
item.retry_count += 1
item.start_time = None
item.processing_time = 0
item.last_retry = datetime.utcnow()
item.status = "pending"
item.priority = 0 # Reset priority
# Calculate backoff delay
backoff = self.thresholds.backoff_base * (2 ** (item.retry_count - 1))
await asyncio.sleep(min(backoff, 60)) # Cap at 60 seconds
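        # With the default backoff_base of 5 this waits 5s, 10s, 20s, 40s,
        # then holds at the 60-second cap for every later retry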
await state_manager.retry_item(item)
async def _handle_emergency_recovery(
self,
url: str,
item: QueueItem,
state_manager,
metrics_manager
) -> None:
"""Handle emergency recovery of an item"""
logger.warning(f"Emergency recovery for item: {url}")
# Force item cleanup
await state_manager.force_cleanup_item(item)
# Reset item state
item.retry_count = 0
item.start_time = None
item.processing_time = 0
item.status = "pending"
item.priority = 10 # High priority
# Add back to queue
await state_manager.retry_item(item)
async def perform_emergency_recovery(
self,
state_manager,
metrics_manager
) -> None:
"""Perform emergency recovery of all processing items"""
try:
logger.warning("Performing emergency recovery of all processing items")
processing_items = await state_manager.get_all_processing_items()
recovered, failed = await self.recover_stuck_items(
[(item.url, item) for item in processing_items],
state_manager,
metrics_manager
)
logger.info(f"Emergency recovery complete - Recovered: {recovered}, Failed: {failed}")
except Exception as e:
logger.error(f"Error during emergency recovery: {str(e)}")
def should_recover_item(self, item: QueueItem) -> bool:
"""Check if an item should be recovered"""
if not hasattr(item, 'start_time') or not item.start_time:
return False
processing_time = datetime.utcnow().timestamp() - item.start_time
return processing_time > self.thresholds.deadlock_threshold
def get_recovery_stats(self) -> Dict[str, Any]:
"""Get recovery statistics"""
return {
"policy": self.policy.value,
"thresholds": {
"max_retries": self.thresholds.max_retries,
"deadlock_threshold": self.thresholds.deadlock_threshold,
"emergency_threshold": self.thresholds.emergency_threshold,
"max_concurrent": self.thresholds.max_concurrent_recoveries
},
"tracker": self.tracker.get_stats()
}
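# A hedged driver sketch: `processing` is assumed to be the url -> QueueItem
# mapping of in-flight items; state_manager and metrics_manager are the
# collaborators the methods above already expect.
async def _example_recovery_pass(
    processing: Dict[str, QueueItem],
    state_manager,
    metrics_manager
) -> None:
    manager = RecoveryManager(policy=RecoveryPolicy.CONSERVATIVE)
    stuck = [
        (url, item) for url, item in processing.items()
        if manager.should_recover_item(item)
    ]
    recovered, failed = await manager.recover_stuck_items(
        stuck, state_manager, metrics_manager
    )
    logger.info(f"recovered={recovered} failed={failed}")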

View File

@@ -0,0 +1,366 @@
"""Module for managing queue state"""
import logging
import asyncio
from enum import Enum
from dataclasses import dataclass
from typing import Dict, Set, List, Optional, Any
from datetime import datetime
from .models import QueueItem, QueueMetrics
logger = logging.getLogger("QueueStateManager")
class ItemState(Enum):
"""Possible states for queue items"""
PENDING = "pending"
PROCESSING = "processing"
COMPLETED = "completed"
FAILED = "failed"
RETRYING = "retrying"
@dataclass
class StateTransition:
"""Records a state transition"""
item_url: str
from_state: ItemState
to_state: ItemState
timestamp: datetime
reason: Optional[str] = None
class StateSnapshot:
"""Represents a point-in-time snapshot of queue state"""
def __init__(self):
self.timestamp = datetime.utcnow()
self.queue: List[QueueItem] = []
self.processing: Dict[str, QueueItem] = {}
self.completed: Dict[str, QueueItem] = {}
self.failed: Dict[str, QueueItem] = {}
self.guild_queues: Dict[int, Set[str]] = {}
self.channel_queues: Dict[int, Set[str]] = {}
def to_dict(self) -> Dict[str, Any]:
"""Convert snapshot to dictionary"""
return {
"timestamp": self.timestamp.isoformat(),
"queue": [item.__dict__ for item in self.queue],
"processing": {url: item.__dict__ for url, item in self.processing.items()},
"completed": {url: item.__dict__ for url, item in self.completed.items()},
"failed": {url: item.__dict__ for url, item in self.failed.items()},
"guild_queues": {gid: list(urls) for gid, urls in self.guild_queues.items()},
"channel_queues": {cid: list(urls) for cid, urls in self.channel_queues.items()}
}
class StateValidator:
"""Validates queue state"""
@staticmethod
def validate_item(item: QueueItem) -> bool:
"""Validate a queue item"""
return all([
isinstance(item.url, str) and item.url,
isinstance(item.guild_id, int) and item.guild_id > 0,
isinstance(item.channel_id, int) and item.channel_id > 0,
isinstance(item.priority, int) and 0 <= item.priority <= 10,
isinstance(item.added_at, datetime),
isinstance(item.status, str)
])
@staticmethod
def validate_transition(
item: QueueItem,
from_state: ItemState,
to_state: ItemState
) -> bool:
"""Validate a state transition"""
valid_transitions = {
ItemState.PENDING: {ItemState.PROCESSING, ItemState.FAILED},
ItemState.PROCESSING: {ItemState.COMPLETED, ItemState.FAILED, ItemState.RETRYING},
ItemState.FAILED: {ItemState.RETRYING},
ItemState.RETRYING: {ItemState.PENDING},
ItemState.COMPLETED: set() # No transitions from completed
}
return to_state in valid_transitions.get(from_state, set())
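    # For example, PROCESSING -> RETRYING is legal while COMPLETED is terminal:
    #     StateValidator.validate_transition(item, ItemState.PROCESSING, ItemState.RETRYING)  # True
    #     StateValidator.validate_transition(item, ItemState.COMPLETED, ItemState.FAILED)     # False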
class StateTracker:
"""Tracks state changes and transitions"""
def __init__(self, max_history: int = 1000):
self.max_history = max_history
self.transitions: List[StateTransition] = []
self.snapshots: List[StateSnapshot] = []
self.state_counts: Dict[ItemState, int] = {state: 0 for state in ItemState}
def record_transition(
self,
transition: StateTransition
) -> None:
"""Record a state transition"""
self.transitions.append(transition)
if len(self.transitions) > self.max_history:
self.transitions.pop(0)
        # A self-transition (e.g. the initial add) only increments; real
        # transitions move the count from one state to the other
        if transition.from_state != transition.to_state:
            self.state_counts[transition.from_state] -= 1
        self.state_counts[transition.to_state] += 1
def take_snapshot(self, state_manager: 'QueueStateManager') -> None:
"""Take a snapshot of current state"""
snapshot = StateSnapshot()
snapshot.queue = state_manager._queue.copy()
snapshot.processing = state_manager._processing.copy()
snapshot.completed = state_manager._completed.copy()
snapshot.failed = state_manager._failed.copy()
snapshot.guild_queues = {
gid: urls.copy() for gid, urls in state_manager._guild_queues.items()
}
snapshot.channel_queues = {
cid: urls.copy() for cid, urls in state_manager._channel_queues.items()
}
self.snapshots.append(snapshot)
if len(self.snapshots) > self.max_history:
self.snapshots.pop(0)
def get_state_history(self) -> Dict[str, Any]:
"""Get state history statistics"""
return {
"transitions": len(self.transitions),
"snapshots": len(self.snapshots),
"state_counts": {
state.value: count
for state, count in self.state_counts.items()
},
"latest_snapshot": (
self.snapshots[-1].to_dict()
if self.snapshots
else None
)
}
class QueueStateManager:
"""Manages the state of the queue system"""
def __init__(self, max_queue_size: int = 1000):
self.max_queue_size = max_queue_size
# Queue storage
self._queue: List[QueueItem] = []
self._processing: Dict[str, QueueItem] = {}
self._completed: Dict[str, QueueItem] = {}
self._failed: Dict[str, QueueItem] = {}
# Tracking
self._guild_queues: Dict[int, Set[str]] = {}
self._channel_queues: Dict[int, Set[str]] = {}
# State management
self._lock = asyncio.Lock()
self.validator = StateValidator()
self.tracker = StateTracker()
async def add_item(self, item: QueueItem) -> bool:
"""Add an item to the queue"""
if not self.validator.validate_item(item):
logger.error(f"Invalid queue item: {item}")
return False
async with self._lock:
if len(self._queue) >= self.max_queue_size:
return False
# Record transition
self.tracker.record_transition(StateTransition(
item_url=item.url,
from_state=ItemState.PENDING,
to_state=ItemState.PENDING,
timestamp=datetime.utcnow(),
reason="Initial add"
))
# Add to main queue
self._queue.append(item)
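            # Highest priority first, then FIFO by added_at within a tier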
self._queue.sort(key=lambda x: (-x.priority, x.added_at))
# Update tracking
if item.guild_id not in self._guild_queues:
self._guild_queues[item.guild_id] = set()
self._guild_queues[item.guild_id].add(item.url)
if item.channel_id not in self._channel_queues:
self._channel_queues[item.channel_id] = set()
self._channel_queues[item.channel_id].add(item.url)
# Take snapshot periodically
if len(self._queue) % 100 == 0:
self.tracker.take_snapshot(self)
return True
async def get_next_items(self, count: int = 5) -> List[QueueItem]:
"""Get the next batch of items to process"""
items = []
async with self._lock:
while len(items) < count and self._queue:
item = self._queue.pop(0)
items.append(item)
self._processing[item.url] = item
# Record transition
self.tracker.record_transition(StateTransition(
item_url=item.url,
from_state=ItemState.PENDING,
to_state=ItemState.PROCESSING,
timestamp=datetime.utcnow()
))
return items
async def mark_completed(
self,
item: QueueItem,
success: bool,
error: Optional[str] = None
) -> None:
"""Mark an item as completed or failed"""
async with self._lock:
self._processing.pop(item.url, None)
to_state = ItemState.COMPLETED if success else ItemState.FAILED
self.tracker.record_transition(StateTransition(
item_url=item.url,
from_state=ItemState.PROCESSING,
to_state=to_state,
timestamp=datetime.utcnow(),
reason=error if error else None
))
if success:
self._completed[item.url] = item
else:
self._failed[item.url] = item
async def retry_item(self, item: QueueItem) -> None:
"""Add an item back to the queue for retry"""
if not self.validator.validate_transition(
item,
ItemState.FAILED,
ItemState.RETRYING
):
logger.error(f"Invalid retry transition for item: {item}")
return
async with self._lock:
self._processing.pop(item.url, None)
item.status = ItemState.PENDING.value
item.last_retry = datetime.utcnow()
item.priority = max(0, item.priority - 1)
# Record transitions
self.tracker.record_transition(StateTransition(
item_url=item.url,
from_state=ItemState.FAILED,
to_state=ItemState.RETRYING,
timestamp=datetime.utcnow()
))
self.tracker.record_transition(StateTransition(
item_url=item.url,
from_state=ItemState.RETRYING,
to_state=ItemState.PENDING,
timestamp=datetime.utcnow()
))
self._queue.append(item)
self._queue.sort(key=lambda x: (-x.priority, x.added_at))
async def get_guild_status(self, guild_id: int) -> Dict[str, int]:
"""Get queue status for a specific guild"""
async with self._lock:
return {
"pending": len([
item for item in self._queue
if item.guild_id == guild_id
]),
"processing": len([
item for item in self._processing.values()
if item.guild_id == guild_id
]),
"completed": len([
item for item in self._completed.values()
if item.guild_id == guild_id
]),
"failed": len([
item for item in self._failed.values()
if item.guild_id == guild_id
])
}
async def clear_state(self) -> None:
"""Clear all state data"""
async with self._lock:
self._queue.clear()
self._processing.clear()
self._completed.clear()
self._failed.clear()
self._guild_queues.clear()
self._channel_queues.clear()
# Take final snapshot before clearing
self.tracker.take_snapshot(self)
async def get_state_for_persistence(self) -> Dict[str, Any]:
"""Get current state for persistence"""
async with self._lock:
# Take snapshot before persistence
self.tracker.take_snapshot(self)
return {
"queue": self._queue,
"processing": self._processing,
"completed": self._completed,
"failed": self._failed,
"history": self.tracker.get_state_history()
}
async def restore_state(self, state: Dict[str, Any]) -> None:
"""Restore state from persisted data"""
async with self._lock:
self._queue = state.get("queue", [])
self._processing = state.get("processing", {})
self._completed = state.get("completed", {})
self._failed = state.get("failed", {})
            # Validate restored items without mutating the list mid-iteration
            valid_items: List[QueueItem] = []
            for item in self._queue:
                if self.validator.validate_item(item):
                    valid_items.append(item)
                else:
                    logger.warning(f"Removing invalid restored item: {item}")
            self._queue = valid_items
# Rebuild tracking
self._rebuild_tracking()
def _rebuild_tracking(self) -> None:
"""Rebuild guild and channel tracking from queue data"""
self._guild_queues.clear()
self._channel_queues.clear()
for item in self._queue:
if item.guild_id not in self._guild_queues:
self._guild_queues[item.guild_id] = set()
self._guild_queues[item.guild_id].add(item.url)
if item.channel_id not in self._channel_queues:
self._channel_queues[item.channel_id] = set()
self._channel_queues[item.channel_id].add(item.url)
def get_state_stats(self) -> Dict[str, Any]:
"""Get comprehensive state statistics"""
return {
"queue_size": len(self._queue),
"processing_count": len(self._processing),
"completed_count": len(self._completed),
"failed_count": len(self._failed),
"guild_count": len(self._guild_queues),
"channel_count": len(self._channel_queues),
"history": self.tracker.get_state_history()
}
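# A minimal end-to-end sketch, assuming QueueItem accepts these fields as
# keyword arguments (its definition lives in models.py, which this hunk
# does not show).
async def _example_state_roundtrip() -> None:
    state = QueueStateManager(max_queue_size=100)
    item = QueueItem(
        url="https://example.com/video.mp4",  # hypothetical values throughout
        guild_id=1,
        channel_id=2,
        priority=5,
        added_at=datetime.utcnow(),
        status="pending"
    )
    if await state.add_item(item):
        batch = await state.get_next_items(1)
        await state.mark_completed(batch[0], success=True)
    logger.info(state.get_state_stats())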