mirror of
https://github.com/pacnpal/Pac-cogs.git
synced 2025-12-20 02:41:06 -05:00
Core Systems:
- Component-based architecture with lifecycle management
- Enhanced error handling and recovery mechanisms
- Comprehensive state management and tracking
- Event-driven architecture with monitoring

Queue Management:
- Multiple processing strategies for different scenarios
- Advanced state management with recovery
- Comprehensive metrics and health monitoring
- Sophisticated cleanup system with multiple strategies

Processing Pipeline:
- Enhanced message handling with validation
- Improved URL extraction and processing
- Better queue management and monitoring
- Advanced cleanup mechanisms

Overall Benefits:
- Better code organization and maintainability
- Improved error handling and recovery
- Enhanced monitoring and reporting
- More robust and reliable system
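As a rough usage sketch of how the reworked cleanup layer is driven (hedged: the module path videoarchiver.queue.cleanup and the shape of the state_manager/metrics_manager objects are assumptions inferred from the diff below, not confirmed by the commit):

import asyncio

# Assumed module path; the diff below does not name the file that holds the
# refactored QueueCleaner, so this import is illustrative only.
from videoarchiver.queue.cleanup import QueueCleaner, CleanupConfig


async def run_queue_cleanup(state_manager, metrics_manager):
    # CleanupConfig defaults: 30-minute cleanup interval, 12-hour history age.
    cleaner = QueueCleaner(CleanupConfig(cleanup_interval=1800))
    await cleaner.start(state_manager, metrics_manager)
    try:
        await asyncio.sleep(3600)  # let the periodic cleanup loop run
    finally:
        await cleaner.stop()  # cancels and awaits the background task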
500
videoarchiver/queue/cleaners/guild_cleaner.py
Normal file
@@ -0,0 +1,500 @@
|
||||
"""Module for cleaning guild-specific queue items"""
|
||||
|
||||
import logging
import asyncio  # required for asyncio.sleep() used when yielding between cleanup batches
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Set, Tuple, Any, Optional
|
||||
from datetime import datetime
|
||||
|
||||
from ..models import QueueItem
|
||||
|
||||
logger = logging.getLogger("GuildCleaner")
|
||||
|
||||
class GuildCleanupStrategy(Enum):
|
||||
"""Guild cleanup strategies"""
|
||||
FULL = "full" # Clear all guild items
|
||||
SELECTIVE = "selective" # Clear only specific categories
|
||||
GRACEFUL = "graceful" # Clear with grace period
|
||||
|
||||
class CleanupCategory(Enum):
|
||||
"""Categories for cleanup"""
|
||||
QUEUE = "queue"
|
||||
PROCESSING = "processing"
|
||||
COMPLETED = "completed"
|
||||
FAILED = "failed"
|
||||
TRACKING = "tracking"
|
||||
|
||||
@dataclass
|
||||
class GuildCleanupConfig:
|
||||
"""Configuration for guild cleanup"""
|
||||
categories: Set[CleanupCategory] = field(default_factory=lambda: set(CleanupCategory))
|
||||
grace_period: int = 300 # 5 minutes
|
||||
preserve_completed: bool = False
|
||||
preserve_failed: bool = False
|
||||
batch_size: int = 100
|
||||
|
||||
@dataclass
|
||||
class GuildCleanupResult:
|
||||
"""Result of a guild cleanup operation"""
|
||||
guild_id: int
|
||||
timestamp: datetime
|
||||
strategy: GuildCleanupStrategy
|
||||
items_cleared: int
|
||||
categories_cleared: Set[CleanupCategory]
|
||||
initial_counts: Dict[str, int]
|
||||
final_counts: Dict[str, int]
|
||||
duration: float
|
||||
error: Optional[str] = None
|
||||
|
||||
class GuildCleanupTracker:
|
||||
"""Tracks guild cleanup operations"""
|
||||
|
||||
def __init__(self, max_history: int = 1000):
|
||||
self.max_history = max_history
|
||||
self.history: List[GuildCleanupResult] = []
|
||||
self.cleanup_counts: Dict[int, int] = {} # guild_id -> count
|
||||
self.total_items_cleared = 0
|
||||
self.last_cleanup: Optional[datetime] = None
|
||||
|
||||
def record_cleanup(self, result: GuildCleanupResult) -> None:
|
||||
"""Record a cleanup operation"""
|
||||
self.history.append(result)
|
||||
if len(self.history) > self.max_history:
|
||||
self.history.pop(0)
|
||||
|
||||
self.cleanup_counts[result.guild_id] = (
|
||||
self.cleanup_counts.get(result.guild_id, 0) + 1
|
||||
)
|
||||
self.total_items_cleared += result.items_cleared
|
||||
self.last_cleanup = result.timestamp
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
|
||||
"""Get cleanup statistics"""
|
||||
return {
|
||||
"total_cleanups": len(self.history),
|
||||
"total_items_cleared": self.total_items_cleared,
|
||||
"guilds_cleaned": len(self.cleanup_counts),
|
||||
"last_cleanup": (
|
||||
self.last_cleanup.isoformat()
|
||||
if self.last_cleanup
|
||||
else None
|
||||
),
|
||||
"recent_cleanups": [
|
||||
{
|
||||
"guild_id": r.guild_id,
|
||||
"timestamp": r.timestamp.isoformat(),
|
||||
"strategy": r.strategy.value,
|
||||
"items_cleared": r.items_cleared,
|
||||
"categories": [c.value for c in r.categories_cleared]
|
||||
}
|
||||
for r in self.history[-5:] # Last 5 cleanups
|
||||
]
|
||||
}
|
||||
|
||||
class GuildCleaner:
|
||||
"""Handles cleanup of guild-specific queue items"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
strategy: GuildCleanupStrategy = GuildCleanupStrategy.GRACEFUL,
|
||||
config: Optional[GuildCleanupConfig] = None
|
||||
):
|
||||
self.strategy = strategy
|
||||
self.config = config or GuildCleanupConfig()
|
||||
self.tracker = GuildCleanupTracker()
|
||||
|
||||
async def clear_guild_items(
|
||||
self,
|
||||
guild_id: int,
|
||||
queue: List[QueueItem],
|
||||
processing: Dict[str, QueueItem],
|
||||
completed: Dict[str, QueueItem],
|
||||
failed: Dict[str, QueueItem],
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]]
|
||||
) -> Tuple[int, Dict[str, int]]:
|
||||
"""Clear all queue items for a specific guild"""
|
||||
start_time = datetime.utcnow()
|
||||
cleared_categories = set()
|
||||
|
||||
try:
|
||||
# Get initial counts
|
||||
initial_counts = self._get_item_counts(
|
||||
guild_id,
|
||||
queue,
|
||||
processing,
|
||||
completed,
|
||||
failed
|
||||
)
|
||||
|
||||
# Get URLs for this guild
|
||||
guild_urls = guild_queues.get(guild_id, set())
|
||||
|
||||
# Clear items based on strategy
|
||||
cleared_count = 0
|
||||
if self.strategy == GuildCleanupStrategy.FULL:
|
||||
cleared_count = await self._full_cleanup(
|
||||
guild_id,
|
||||
queue,
|
||||
processing,
|
||||
completed,
|
||||
failed,
|
||||
guild_queues,
|
||||
channel_queues,
|
||||
cleared_categories
|
||||
)
|
||||
elif self.strategy == GuildCleanupStrategy.SELECTIVE:
|
||||
cleared_count = await self._selective_cleanup(
|
||||
guild_id,
|
||||
queue,
|
||||
processing,
|
||||
completed,
|
||||
failed,
|
||||
guild_queues,
|
||||
channel_queues,
|
||||
cleared_categories
|
||||
)
|
||||
else: # GRACEFUL
|
||||
cleared_count = await self._graceful_cleanup(
|
||||
guild_id,
|
||||
queue,
|
||||
processing,
|
||||
completed,
|
||||
failed,
|
||||
guild_queues,
|
||||
channel_queues,
|
||||
cleared_categories
|
||||
)
|
||||
|
||||
# Get final counts
|
||||
final_counts = self._get_item_counts(
|
||||
guild_id,
|
||||
queue,
|
||||
processing,
|
||||
completed,
|
||||
failed
|
||||
)
|
||||
|
||||
# Record cleanup result
|
||||
duration = (datetime.utcnow() - start_time).total_seconds()
|
||||
result = GuildCleanupResult(
|
||||
guild_id=guild_id,
|
||||
timestamp=datetime.utcnow(),
|
||||
strategy=self.strategy,
|
||||
items_cleared=cleared_count,
|
||||
categories_cleared=cleared_categories,
|
||||
initial_counts=initial_counts,
|
||||
final_counts=final_counts,
|
||||
duration=duration
|
||||
)
|
||||
self.tracker.record_cleanup(result)
|
||||
|
||||
logger.info(self.format_guild_cleanup_report(
|
||||
guild_id,
|
||||
initial_counts,
|
||||
final_counts,
|
||||
duration
|
||||
))
|
||||
return cleared_count, initial_counts
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error clearing guild {guild_id} queue: {e}")
|
||||
self.tracker.record_cleanup(GuildCleanupResult(
|
||||
guild_id=guild_id,
|
||||
timestamp=datetime.utcnow(),
|
||||
strategy=self.strategy,
|
||||
items_cleared=0,
|
||||
categories_cleared=set(),
|
||||
initial_counts={},
|
||||
final_counts={},
|
||||
duration=0,
|
||||
error=str(e)
|
||||
))
|
||||
raise
|
||||
|
||||
async def _full_cleanup(
|
||||
self,
|
||||
guild_id: int,
|
||||
queue: List[QueueItem],
|
||||
processing: Dict[str, QueueItem],
|
||||
completed: Dict[str, QueueItem],
|
||||
failed: Dict[str, QueueItem],
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
cleared_categories: Set[CleanupCategory]
|
||||
) -> int:
|
||||
"""Perform full cleanup"""
|
||||
cleared_count = 0
|
||||
|
||||
# Clear from pending queue
|
||||
initial_queue_size = len(queue)
queue[:] = [item for item in queue if item.guild_id != guild_id]
cleared_count += initial_queue_size - len(queue)  # count removed items, not the remainder
|
||||
cleared_categories.add(CleanupCategory.QUEUE)
|
||||
|
||||
# Clear from processing
|
||||
cleared = await self._clear_from_dict(
|
||||
processing, guild_id, 'processing'
|
||||
)
|
||||
cleared_count += cleared
|
||||
cleared_categories.add(CleanupCategory.PROCESSING)
|
||||
|
||||
# Clear from completed
|
||||
cleared = await self._clear_from_dict(
|
||||
completed, guild_id, 'completed'
|
||||
)
|
||||
cleared_count += cleared
|
||||
cleared_categories.add(CleanupCategory.COMPLETED)
|
||||
|
||||
# Clear from failed
|
||||
cleared = await self._clear_from_dict(
|
||||
failed, guild_id, 'failed'
|
||||
)
|
||||
cleared_count += cleared
|
||||
cleared_categories.add(CleanupCategory.FAILED)
|
||||
|
||||
# Clear tracking
|
||||
cleared = await self._clear_tracking(
|
||||
guild_id,
|
||||
guild_queues,
|
||||
channel_queues
|
||||
)
|
||||
cleared_count += cleared
|
||||
cleared_categories.add(CleanupCategory.TRACKING)
|
||||
|
||||
return cleared_count
|
||||
|
||||
async def _selective_cleanup(
|
||||
self,
|
||||
guild_id: int,
|
||||
queue: List[QueueItem],
|
||||
processing: Dict[str, QueueItem],
|
||||
completed: Dict[str, QueueItem],
|
||||
failed: Dict[str, QueueItem],
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
cleared_categories: Set[CleanupCategory]
|
||||
) -> int:
|
||||
"""Perform selective cleanup"""
|
||||
cleared_count = 0
|
||||
|
||||
# Clear only configured categories
|
||||
if CleanupCategory.QUEUE in self.config.categories:
|
||||
initial_queue_size = len(queue)
queue[:] = [item for item in queue if item.guild_id != guild_id]
cleared_count += initial_queue_size - len(queue)  # count removed items, not the remainder
|
||||
cleared_categories.add(CleanupCategory.QUEUE)
|
||||
|
||||
if CleanupCategory.PROCESSING in self.config.categories:
|
||||
cleared = await self._clear_from_dict(
|
||||
processing, guild_id, 'processing'
|
||||
)
|
||||
cleared_count += cleared
|
||||
cleared_categories.add(CleanupCategory.PROCESSING)
|
||||
|
||||
if (
|
||||
CleanupCategory.COMPLETED in self.config.categories and
|
||||
not self.config.preserve_completed
|
||||
):
|
||||
cleared = await self._clear_from_dict(
|
||||
completed, guild_id, 'completed'
|
||||
)
|
||||
cleared_count += cleared
|
||||
cleared_categories.add(CleanupCategory.COMPLETED)
|
||||
|
||||
if (
|
||||
CleanupCategory.FAILED in self.config.categories and
|
||||
not self.config.preserve_failed
|
||||
):
|
||||
cleared = await self._clear_from_dict(
|
||||
failed, guild_id, 'failed'
|
||||
)
|
||||
cleared_count += cleared
|
||||
cleared_categories.add(CleanupCategory.FAILED)
|
||||
|
||||
if CleanupCategory.TRACKING in self.config.categories:
|
||||
cleared = await self._clear_tracking(
|
||||
guild_id,
|
||||
guild_queues,
|
||||
channel_queues
|
||||
)
|
||||
cleared_count += cleared
|
||||
cleared_categories.add(CleanupCategory.TRACKING)
|
||||
|
||||
return cleared_count
|
||||
|
||||
async def _graceful_cleanup(
|
||||
self,
|
||||
guild_id: int,
|
||||
queue: List[QueueItem],
|
||||
processing: Dict[str, QueueItem],
|
||||
completed: Dict[str, QueueItem],
|
||||
failed: Dict[str, QueueItem],
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
cleared_categories: Set[CleanupCategory]
|
||||
) -> int:
|
||||
"""Perform graceful cleanup"""
|
||||
cleared_count = 0
|
||||
cutoff_time = datetime.utcnow().timestamp() - self.config.grace_period
|
||||
|
||||
# Clear queue items beyond grace period
|
||||
initial_queue_size = len(queue)
queue[:] = [
item for item in queue
if not (
item.guild_id == guild_id and
item.added_at.timestamp() < cutoff_time
)
]
cleared_count += initial_queue_size - len(queue)  # count removed items, not the remainder
|
||||
cleared_categories.add(CleanupCategory.QUEUE)
|
||||
|
||||
# Clear processing items beyond grace period
|
||||
for url in list(processing.keys()):
|
||||
item = processing[url]
|
||||
if (
|
||||
item.guild_id == guild_id and
|
||||
item.added_at.timestamp() < cutoff_time
|
||||
):
|
||||
processing.pop(url)
|
||||
cleared_count += 1
|
||||
cleared_categories.add(CleanupCategory.PROCESSING)
|
||||
|
||||
# Clear completed and failed based on config
|
||||
if not self.config.preserve_completed:
|
||||
cleared = await self._clear_from_dict(
|
||||
completed, guild_id, 'completed'
|
||||
)
|
||||
cleared_count += cleared
|
||||
cleared_categories.add(CleanupCategory.COMPLETED)
|
||||
|
||||
if not self.config.preserve_failed:
|
||||
cleared = await self._clear_from_dict(
|
||||
failed, guild_id, 'failed'
|
||||
)
|
||||
cleared_count += cleared
|
||||
cleared_categories.add(CleanupCategory.FAILED)
|
||||
|
||||
# Clear tracking
|
||||
cleared = await self._clear_tracking(
|
||||
guild_id,
|
||||
guild_queues,
|
||||
channel_queues
|
||||
)
|
||||
cleared_count += cleared
|
||||
cleared_categories.add(CleanupCategory.TRACKING)
|
||||
|
||||
return cleared_count
|
||||
|
||||
async def _clear_from_dict(
|
||||
self,
|
||||
items_dict: Dict[str, QueueItem],
|
||||
guild_id: int,
|
||||
category: str
|
||||
) -> int:
|
||||
"""Clear guild items from a dictionary"""
|
||||
cleared = 0
|
||||
batch_count = 0
|
||||
|
||||
for url in list(items_dict.keys()):
|
||||
if items_dict[url].guild_id == guild_id:
|
||||
items_dict.pop(url)
|
||||
cleared += 1
|
||||
batch_count += 1
|
||||
|
||||
# Process in batches
|
||||
if batch_count >= self.config.batch_size:
|
||||
await asyncio.sleep(0) # Yield to event loop
|
||||
batch_count = 0
|
||||
|
||||
logger.debug(f"Cleared {cleared} {category} items for guild {guild_id}")
|
||||
return cleared
|
||||
|
||||
async def _clear_tracking(
|
||||
self,
|
||||
guild_id: int,
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]]
|
||||
) -> int:
|
||||
"""Clear guild tracking data"""
|
||||
cleared = 0
|
||||
guild_urls = guild_queues.get(guild_id, set())
|
||||
|
||||
# Clear guild tracking
|
||||
if guild_id in guild_queues:
|
||||
cleared += len(guild_queues[guild_id])
|
||||
guild_queues.pop(guild_id)
|
||||
|
||||
# Clear channel tracking
|
||||
await self._clear_channel_tracking(channel_queues, guild_urls)
|
||||
|
||||
return cleared
|
||||
|
||||
async def _clear_channel_tracking(
|
||||
self,
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
guild_urls: Set[str]
|
||||
) -> None:
|
||||
"""Clear channel tracking for guild URLs"""
|
||||
batch_count = 0
|
||||
|
||||
for channel_id in list(channel_queues.keys()):
|
||||
channel_queues[channel_id] = {
|
||||
url for url in channel_queues[channel_id]
|
||||
if url not in guild_urls
|
||||
}
|
||||
if not channel_queues[channel_id]:
|
||||
channel_queues.pop(channel_id)
|
||||
|
||||
batch_count += 1
|
||||
if batch_count >= self.config.batch_size:
|
||||
await asyncio.sleep(0) # Yield to event loop
|
||||
batch_count = 0
|
||||
|
||||
def _get_item_counts(
|
||||
self,
|
||||
guild_id: int,
|
||||
queue: List[QueueItem],
|
||||
processing: Dict[str, QueueItem],
|
||||
completed: Dict[str, QueueItem],
|
||||
failed: Dict[str, QueueItem]
|
||||
) -> Dict[str, int]:
|
||||
"""Get item counts for a guild"""
|
||||
return {
|
||||
'queue': len([item for item in queue if item.guild_id == guild_id]),
|
||||
'processing': len([item for item in processing.values() if item.guild_id == guild_id]),
|
||||
'completed': len([item for item in completed.values() if item.guild_id == guild_id]),
|
||||
'failed': len([item for item in failed.values() if item.guild_id == guild_id])
|
||||
}
|
||||
|
||||
def format_guild_cleanup_report(
|
||||
self,
|
||||
guild_id: int,
|
||||
initial_counts: Dict[str, int],
|
||||
final_counts: Dict[str, int],
|
||||
duration: float
|
||||
) -> str:
|
||||
"""Format a guild cleanup report"""
|
||||
return (
|
||||
f"Guild {guild_id} Cleanup Results:\n"
|
||||
f"Strategy: {self.strategy.value}\n"
|
||||
f"Duration: {duration:.2f}s\n"
|
||||
f"Items:\n"
|
||||
f"- Queue: {initial_counts['queue']} -> {final_counts['queue']}\n"
|
||||
f"- Processing: {initial_counts['processing']} -> {final_counts['processing']}\n"
|
||||
f"- Completed: {initial_counts['completed']} -> {final_counts['completed']}\n"
|
||||
f"- Failed: {initial_counts['failed']} -> {final_counts['failed']}\n"
|
||||
f"Total cleared: {sum(initial_counts.values()) - sum(final_counts.values())} items"
|
||||
)
|
||||
|
||||
def get_cleaner_stats(self) -> Dict[str, Any]:
|
||||
"""Get comprehensive cleaner statistics"""
|
||||
return {
|
||||
"strategy": self.strategy.value,
|
||||
"config": {
|
||||
"categories": [c.value for c in self.config.categories],
|
||||
"grace_period": self.config.grace_period,
|
||||
"preserve_completed": self.config.preserve_completed,
|
||||
"preserve_failed": self.config.preserve_failed,
|
||||
"batch_size": self.config.batch_size
|
||||
},
|
||||
"tracker": self.tracker.get_stats()
|
||||
}
|
||||
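A minimal, hedged usage sketch for the GuildCleaner above. The empty containers stand in for the live structures normally supplied by the queue's state manager, and the import path simply mirrors the file path added in this commit:

import asyncio

from videoarchiver.queue.cleaners.guild_cleaner import (
    CleanupCategory,
    GuildCleaner,
    GuildCleanupConfig,
    GuildCleanupStrategy,
)


async def demo_guild_cleanup():
    cleaner = GuildCleaner(
        strategy=GuildCleanupStrategy.SELECTIVE,
        config=GuildCleanupConfig(
            categories={CleanupCategory.QUEUE, CleanupCategory.TRACKING},
            preserve_completed=True,
        ),
    )
    # Empty containers used purely for illustration.
    queue, processing, completed, failed = [], {}, {}, {}
    guild_queues, channel_queues = {}, {}
    cleared, counts_before = await cleaner.clear_guild_items(
        guild_id=1234,
        queue=queue,
        processing=processing,
        completed=completed,
        failed=failed,
        guild_queues=guild_queues,
        channel_queues=channel_queues,
    )
    print(f"Cleared {cleared} items; counts before cleanup: {counts_before}")


asyncio.run(demo_guild_cleanup())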
336
videoarchiver/queue/cleaners/history_cleaner.py
Normal file
@@ -0,0 +1,336 @@
|
||||
"""Module for cleaning historical queue items"""
|
||||
|
||||
import logging
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, Optional, List, Any, Set
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from ..models import QueueItem
|
||||
|
||||
logger = logging.getLogger("HistoryCleaner")
|
||||
|
||||
class CleanupStrategy(Enum):
|
||||
"""Cleanup strategies"""
|
||||
AGGRESSIVE = "aggressive" # Remove more aggressively
|
||||
CONSERVATIVE = "conservative" # Remove conservatively
|
||||
BALANCED = "balanced" # Balance between retention and cleanup
|
||||
|
||||
class CleanupPolicy(Enum):
|
||||
"""Cleanup policies"""
|
||||
AGE = "age" # Clean based on age
|
||||
SIZE = "size" # Clean based on size
|
||||
HYBRID = "hybrid" # Consider both age and size
|
||||
|
||||
@dataclass
|
||||
class CleanupThresholds:
|
||||
"""Thresholds for cleanup operations"""
|
||||
max_history_age: int = 43200 # 12 hours
|
||||
max_completed_items: int = 10000
|
||||
max_failed_items: int = 5000
|
||||
min_retention_time: int = 3600 # 1 hour
|
||||
size_threshold: int = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@dataclass
|
||||
class CleanupResult:
|
||||
"""Result of a cleanup operation"""
|
||||
timestamp: datetime
|
||||
items_cleaned: int
|
||||
space_freed: int
|
||||
duration: float
|
||||
strategy: CleanupStrategy
|
||||
policy: CleanupPolicy
|
||||
details: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
class CleanupTracker:
|
||||
"""Tracks cleanup operations"""
|
||||
|
||||
def __init__(self, max_history: int = 1000):
|
||||
self.max_history = max_history
|
||||
self.history: List[CleanupResult] = []
|
||||
self.total_items_cleaned = 0
|
||||
self.total_space_freed = 0
|
||||
self.last_cleanup: Optional[datetime] = None
|
||||
|
||||
def record_cleanup(self, result: CleanupResult) -> None:
|
||||
"""Record a cleanup operation"""
|
||||
self.history.append(result)
|
||||
if len(self.history) > self.max_history:
|
||||
self.history.pop(0)
|
||||
|
||||
self.total_items_cleaned += result.items_cleaned
|
||||
self.total_space_freed += result.space_freed
|
||||
self.last_cleanup = result.timestamp
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
|
||||
"""Get cleanup statistics"""
|
||||
return {
|
||||
"total_cleanups": len(self.history),
|
||||
"total_items_cleaned": self.total_items_cleaned,
|
||||
"total_space_freed": self.total_space_freed,
|
||||
"last_cleanup": (
|
||||
self.last_cleanup.isoformat()
|
||||
if self.last_cleanup
|
||||
else None
|
||||
),
|
||||
"recent_cleanups": [
|
||||
{
|
||||
"timestamp": r.timestamp.isoformat(),
|
||||
"items_cleaned": r.items_cleaned,
|
||||
"space_freed": r.space_freed,
|
||||
"strategy": r.strategy.value,
|
||||
"policy": r.policy.value
|
||||
}
|
||||
for r in self.history[-5:] # Last 5 cleanups
|
||||
]
|
||||
}
|
||||
|
||||
class HistoryCleaner:
|
||||
"""Handles cleanup of historical queue items"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
strategy: CleanupStrategy = CleanupStrategy.BALANCED,
|
||||
policy: CleanupPolicy = CleanupPolicy.HYBRID,
|
||||
thresholds: Optional[CleanupThresholds] = None
|
||||
):
|
||||
self.strategy = strategy
|
||||
self.policy = policy
|
||||
self.thresholds = thresholds or CleanupThresholds()
|
||||
self.tracker = CleanupTracker()
|
||||
|
||||
def _normalize_datetime(self, dt_value: Any) -> datetime:
|
||||
"""Normalize a datetime value"""
|
||||
current_time = datetime.utcnow()
|
||||
|
||||
if not isinstance(dt_value, datetime):
|
||||
try:
|
||||
if isinstance(dt_value, str):
|
||||
return datetime.fromisoformat(dt_value)
|
||||
else:
|
||||
return current_time
|
||||
except (ValueError, TypeError):
|
||||
return current_time
|
||||
return dt_value
|
||||
|
||||
async def cleanup_completed(
|
||||
self,
|
||||
completed: Dict[str, QueueItem],
|
||||
cleanup_cutoff: datetime
|
||||
) -> int:
|
||||
"""Clean up completed items"""
|
||||
start_time = datetime.utcnow()
|
||||
items_cleaned = 0
|
||||
space_freed = 0
|
||||
completed_count = len(completed)
|
||||
|
||||
try:
|
||||
# Determine cleanup approach based on strategy and policy
|
||||
if self.policy == CleanupPolicy.SIZE:
|
||||
items_to_clean = self._get_items_by_size(completed)
|
||||
elif self.policy == CleanupPolicy.HYBRID:
|
||||
items_to_clean = self._get_items_hybrid(completed, cleanup_cutoff)
|
||||
else: # AGE policy
|
||||
items_to_clean = self._get_items_by_age(completed, cleanup_cutoff)
|
||||
|
||||
# Clean items
|
||||
for url in items_to_clean:
|
||||
try:
|
||||
item = completed[url]
|
||||
space_freed += self._estimate_item_size(item)
|
||||
completed.pop(url)
|
||||
items_cleaned += 1
|
||||
except Exception as e:
|
||||
logger.error(f"Error cleaning completed item {url}: {e}")
|
||||
completed.pop(url)
|
||||
items_cleaned += 1
|
||||
|
||||
# Record cleanup
|
||||
self._record_cleanup_result(
|
||||
items_cleaned,
|
||||
space_freed,
|
||||
start_time,
|
||||
"completed"
|
||||
)
|
||||
|
||||
logger.debug(f"Cleaned {items_cleaned} completed items")
|
||||
return items_cleaned
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during completed items cleanup: {e}")
|
||||
return 0
|
||||
|
||||
async def cleanup_failed(
|
||||
self,
|
||||
failed: Dict[str, QueueItem],
|
||||
cleanup_cutoff: datetime
|
||||
) -> int:
|
||||
"""Clean up failed items"""
|
||||
start_time = datetime.utcnow()
|
||||
items_cleaned = 0
|
||||
space_freed = 0
|
||||
failed_count = len(failed)
|
||||
|
||||
try:
|
||||
# Determine cleanup approach
|
||||
if self.policy == CleanupPolicy.SIZE:
|
||||
items_to_clean = self._get_items_by_size(failed)
|
||||
elif self.policy == CleanupPolicy.HYBRID:
|
||||
items_to_clean = self._get_items_hybrid(failed, cleanup_cutoff)
|
||||
else: # AGE policy
|
||||
items_to_clean = self._get_items_by_age(failed, cleanup_cutoff)
|
||||
|
||||
# Clean items
|
||||
for url in items_to_clean:
|
||||
try:
|
||||
item = failed[url]
|
||||
space_freed += self._estimate_item_size(item)
|
||||
failed.pop(url)
|
||||
items_cleaned += 1
|
||||
except Exception as e:
|
||||
logger.error(f"Error cleaning failed item {url}: {e}")
|
||||
failed.pop(url)
|
||||
items_cleaned += 1
|
||||
|
||||
# Record cleanup
|
||||
self._record_cleanup_result(
|
||||
items_cleaned,
|
||||
space_freed,
|
||||
start_time,
|
||||
"failed"
|
||||
)
|
||||
|
||||
logger.debug(f"Cleaned {items_cleaned} failed items")
|
||||
return items_cleaned
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during failed items cleanup: {e}")
|
||||
return 0
|
||||
|
||||
def _get_items_by_age(
|
||||
self,
|
||||
items: Dict[str, QueueItem],
|
||||
cutoff: datetime
|
||||
) -> Set[str]:
|
||||
"""Get items to clean based on age"""
|
||||
to_clean = set()
|
||||
|
||||
for url, item in items.items():
|
||||
item.added_at = self._normalize_datetime(item.added_at)
|
||||
if item.added_at < cutoff:
|
||||
to_clean.add(url)
|
||||
|
||||
return to_clean
|
||||
|
||||
def _get_items_by_size(self, items: Dict[str, QueueItem]) -> Set[str]:
|
||||
"""Get items to clean based on size"""
|
||||
to_clean = set()
|
||||
total_size = 0
|
||||
|
||||
# Rank items largest-first; once the running size estimate exceeds size_threshold, the remaining (smaller) items are flagged for cleanup
|
||||
sorted_items = sorted(
|
||||
items.items(),
|
||||
key=lambda x: self._estimate_item_size(x[1]),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
for url, item in sorted_items:
|
||||
total_size += self._estimate_item_size(item)
|
||||
if total_size > self.thresholds.size_threshold:
|
||||
to_clean.add(url)
|
||||
|
||||
return to_clean
|
||||
|
||||
def _get_items_hybrid(
|
||||
self,
|
||||
items: Dict[str, QueueItem],
|
||||
cutoff: datetime
|
||||
) -> Set[str]:
|
||||
"""Get items to clean using hybrid approach"""
|
||||
by_age = self._get_items_by_age(items, cutoff)
|
||||
by_size = self._get_items_by_size(items)
|
||||
|
||||
if self.strategy == CleanupStrategy.AGGRESSIVE:
|
||||
return by_age.union(by_size)
|
||||
elif self.strategy == CleanupStrategy.CONSERVATIVE:
|
||||
return by_age.intersection(by_size)
|
||||
else: # BALANCED
|
||||
return by_age
|
||||
|
||||
def _estimate_item_size(self, item: QueueItem) -> int:
|
||||
"""Estimate size of an item in bytes"""
|
||||
# This could be enhanced with actual file size tracking
|
||||
base_size = 1024 # 1KB base size
|
||||
return base_size * (item.retry_count + 1)
|
||||
|
||||
def _record_cleanup_result(
|
||||
self,
|
||||
items_cleaned: int,
|
||||
space_freed: int,
|
||||
start_time: datetime,
|
||||
cleanup_type: str
|
||||
) -> None:
|
||||
"""Record cleanup result"""
|
||||
duration = (datetime.utcnow() - start_time).total_seconds()
|
||||
|
||||
result = CleanupResult(
|
||||
timestamp=datetime.utcnow(),
|
||||
items_cleaned=items_cleaned,
|
||||
space_freed=space_freed,
|
||||
duration=duration,
|
||||
strategy=self.strategy,
|
||||
policy=self.policy,
|
||||
details={"type": cleanup_type}
|
||||
)
|
||||
|
||||
self.tracker.record_cleanup(result)
|
||||
|
||||
def get_cleanup_cutoff(self) -> datetime:
|
||||
"""Get the cutoff time for cleanup"""
|
||||
if self.strategy == CleanupStrategy.AGGRESSIVE:
|
||||
age = self.thresholds.max_history_age // 2
|
||||
elif self.strategy == CleanupStrategy.CONSERVATIVE:
|
||||
age = self.thresholds.max_history_age * 2
|
||||
else: # BALANCED
|
||||
age = self.thresholds.max_history_age
|
||||
|
||||
return datetime.utcnow() - timedelta(seconds=max(
|
||||
age,
|
||||
self.thresholds.min_retention_time
|
||||
))
|
||||
|
||||
def format_cleanup_report(
|
||||
self,
|
||||
initial_completed: int,
|
||||
final_completed: int,
|
||||
initial_failed: int,
|
||||
final_failed: int
|
||||
) -> str:
|
||||
"""Format a cleanup report"""
|
||||
stats = self.tracker.get_stats()
|
||||
|
||||
return (
|
||||
f"History Cleanup Results:\n"
|
||||
f"- Completed items: {initial_completed} -> {final_completed}\n"
|
||||
f"- Failed items: {initial_failed} -> {final_failed}\n"
|
||||
f"- Total items cleaned: {(initial_completed - final_completed) + (initial_failed - final_failed)}\n"
|
||||
f"- Space freed: {stats['total_space_freed']} bytes\n"
|
||||
f"- Strategy: {self.strategy.value}\n"
|
||||
f"- Policy: {self.policy.value}\n"
|
||||
f"- Total cleanups: {stats['total_cleanups']}"
|
||||
)
|
||||
|
||||
def get_cleaner_stats(self) -> Dict[str, Any]:
|
||||
"""Get comprehensive cleaner statistics"""
|
||||
return {
|
||||
"strategy": self.strategy.value,
|
||||
"policy": self.policy.value,
|
||||
"thresholds": {
|
||||
"max_history_age": self.thresholds.max_history_age,
|
||||
"max_completed_items": self.thresholds.max_completed_items,
|
||||
"max_failed_items": self.thresholds.max_failed_items,
|
||||
"min_retention_time": self.thresholds.min_retention_time,
|
||||
"size_threshold": self.thresholds.size_threshold
|
||||
},
|
||||
"tracker": self.tracker.get_stats()
|
||||
}
|
||||
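A hedged sketch of how the HistoryCleaner above could be invoked; `completed` and `failed` are the URL -> QueueItem mappings the queue manager maintains, and the import path mirrors the file path added in this commit:

from videoarchiver.queue.cleaners.history_cleaner import (
    CleanupPolicy,
    CleanupStrategy,
    CleanupThresholds,
    HistoryCleaner,
)


async def prune_history(completed, failed):
    cleaner = HistoryCleaner(
        strategy=CleanupStrategy.AGGRESSIVE,  # get_cleanup_cutoff() halves max_history_age
        policy=CleanupPolicy.HYBRID,
        thresholds=CleanupThresholds(max_history_age=21600),  # 6 hours instead of the 12-hour default
    )
    cutoff = cleaner.get_cleanup_cutoff()
    completed_cleaned = await cleaner.cleanup_completed(completed, cutoff)
    failed_cleaned = await cleaner.cleanup_failed(failed, cutoff)
    return completed_cleaned, failed_cleaned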
452
videoarchiver/queue/cleaners/tracking_cleaner.py
Normal file
@@ -0,0 +1,452 @@
|
||||
"""Module for cleaning queue tracking data"""
|
||||
|
||||
import logging
|
||||
import asyncio
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Set, Tuple, Any, Optional
|
||||
from datetime import datetime
|
||||
|
||||
from ..models import QueueItem
|
||||
|
||||
logger = logging.getLogger("TrackingCleaner")
|
||||
|
||||
class TrackingCleanupStrategy(Enum):
|
||||
"""Tracking cleanup strategies"""
|
||||
AGGRESSIVE = "aggressive" # Remove all invalid entries
|
||||
CONSERVATIVE = "conservative" # Keep recent invalid entries
|
||||
BALANCED = "balanced" # Balance between cleanup and retention
|
||||
|
||||
class TrackingType(Enum):
|
||||
"""Types of tracking data"""
|
||||
GUILD = "guild"
|
||||
CHANNEL = "channel"
|
||||
URL = "url"
|
||||
|
||||
@dataclass
|
||||
class TrackingCleanupConfig:
|
||||
"""Configuration for tracking cleanup"""
|
||||
batch_size: int = 100
|
||||
retention_period: int = 3600 # 1 hour
|
||||
validate_urls: bool = True
|
||||
cleanup_empty: bool = True
|
||||
max_invalid_ratio: float = 0.5 # 50% invalid threshold
|
||||
|
||||
@dataclass
|
||||
class TrackingCleanupResult:
|
||||
"""Result of a tracking cleanup operation"""
|
||||
timestamp: datetime
|
||||
strategy: TrackingCleanupStrategy
|
||||
items_cleaned: int
|
||||
guilds_cleaned: int
|
||||
channels_cleaned: int
|
||||
duration: float
|
||||
initial_counts: Dict[str, int]
|
||||
final_counts: Dict[str, int]
|
||||
error: Optional[str] = None
|
||||
|
||||
class TrackingValidator:
|
||||
"""Validates tracking data"""
|
||||
|
||||
@staticmethod
|
||||
def validate_url(url: str) -> bool:
|
||||
"""Validate URL format"""
|
||||
try:
|
||||
return bool(url and isinstance(url, str) and "://" in url)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def validate_id(id_value: int) -> bool:
|
||||
"""Validate ID format"""
|
||||
try:
|
||||
return bool(isinstance(id_value, int) and id_value > 0)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
class TrackingCleanupTracker:
|
||||
"""Tracks cleanup operations"""
|
||||
|
||||
def __init__(self, max_history: int = 1000):
|
||||
self.max_history = max_history
|
||||
self.history: List[TrackingCleanupResult] = []
|
||||
self.total_items_cleaned = 0
|
||||
self.total_guilds_cleaned = 0
|
||||
self.total_channels_cleaned = 0
|
||||
self.last_cleanup: Optional[datetime] = None
|
||||
|
||||
def record_cleanup(self, result: TrackingCleanupResult) -> None:
|
||||
"""Record a cleanup operation"""
|
||||
self.history.append(result)
|
||||
if len(self.history) > self.max_history:
|
||||
self.history.pop(0)
|
||||
|
||||
self.total_items_cleaned += result.items_cleaned
|
||||
self.total_guilds_cleaned += result.guilds_cleaned
|
||||
self.total_channels_cleaned += result.channels_cleaned
|
||||
self.last_cleanup = result.timestamp
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
|
||||
"""Get cleanup statistics"""
|
||||
return {
|
||||
"total_cleanups": len(self.history),
|
||||
"total_items_cleaned": self.total_items_cleaned,
|
||||
"total_guilds_cleaned": self.total_guilds_cleaned,
|
||||
"total_channels_cleaned": self.total_channels_cleaned,
|
||||
"last_cleanup": (
|
||||
self.last_cleanup.isoformat()
|
||||
if self.last_cleanup
|
||||
else None
|
||||
),
|
||||
"recent_cleanups": [
|
||||
{
|
||||
"timestamp": r.timestamp.isoformat(),
|
||||
"strategy": r.strategy.value,
|
||||
"items_cleaned": r.items_cleaned,
|
||||
"guilds_cleaned": r.guilds_cleaned,
|
||||
"channels_cleaned": r.channels_cleaned,
|
||||
"duration": r.duration
|
||||
}
|
||||
for r in self.history[-5:] # Last 5 cleanups
|
||||
]
|
||||
}
|
||||
|
||||
class TrackingCleaner:
|
||||
"""Handles cleanup of queue tracking data"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
strategy: TrackingCleanupStrategy = TrackingCleanupStrategy.BALANCED,
|
||||
config: Optional[TrackingCleanupConfig] = None
|
||||
):
|
||||
self.strategy = strategy
|
||||
self.config = config or TrackingCleanupConfig()
|
||||
self.tracker = TrackingCleanupTracker()
|
||||
self.validator = TrackingValidator()
|
||||
|
||||
async def cleanup_tracking(
|
||||
self,
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
queue: List[QueueItem],
|
||||
processing: Dict[str, QueueItem]
|
||||
) -> Tuple[int, Dict[str, int]]:
|
||||
"""Clean up tracking data"""
|
||||
start_time = datetime.utcnow()
|
||||
|
||||
try:
|
||||
# Get initial counts
|
||||
initial_counts = self._get_tracking_counts(
|
||||
guild_queues,
|
||||
channel_queues
|
||||
)
|
||||
|
||||
# Get valid URLs
|
||||
valid_urls = self._get_valid_urls(queue, processing)
|
||||
|
||||
# Clean tracking data based on strategy
|
||||
items_cleaned = 0
|
||||
guilds_cleaned = 0
|
||||
channels_cleaned = 0
|
||||
|
||||
if self.strategy == TrackingCleanupStrategy.AGGRESSIVE:
|
||||
cleaned = await self._aggressive_cleanup(
|
||||
guild_queues,
|
||||
channel_queues,
|
||||
valid_urls
|
||||
)
|
||||
elif self.strategy == TrackingCleanupStrategy.CONSERVATIVE:
|
||||
cleaned = await self._conservative_cleanup(
|
||||
guild_queues,
|
||||
channel_queues,
|
||||
valid_urls
|
||||
)
|
||||
else: # BALANCED
|
||||
cleaned = await self._balanced_cleanup(
|
||||
guild_queues,
|
||||
channel_queues,
|
||||
valid_urls
|
||||
)
|
||||
|
||||
items_cleaned = cleaned[0]
|
||||
guilds_cleaned = cleaned[1]
|
||||
channels_cleaned = cleaned[2]
|
||||
|
||||
# Get final counts
|
||||
final_counts = self._get_tracking_counts(
|
||||
guild_queues,
|
||||
channel_queues
|
||||
)
|
||||
|
||||
# Record cleanup result
|
||||
duration = (datetime.utcnow() - start_time).total_seconds()
|
||||
result = TrackingCleanupResult(
|
||||
timestamp=datetime.utcnow(),
|
||||
strategy=self.strategy,
|
||||
items_cleaned=items_cleaned,
|
||||
guilds_cleaned=guilds_cleaned,
|
||||
channels_cleaned=channels_cleaned,
|
||||
duration=duration,
|
||||
initial_counts=initial_counts,
|
||||
final_counts=final_counts
|
||||
)
|
||||
self.tracker.record_cleanup(result)
|
||||
|
||||
logger.info(self.format_tracking_cleanup_report(
|
||||
initial_counts,
|
||||
final_counts,
|
||||
duration
|
||||
))
|
||||
return items_cleaned, initial_counts
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error cleaning tracking data: {e}")
|
||||
self.tracker.record_cleanup(TrackingCleanupResult(
|
||||
timestamp=datetime.utcnow(),
|
||||
strategy=self.strategy,
|
||||
items_cleaned=0,
|
||||
guilds_cleaned=0,
|
||||
channels_cleaned=0,
|
||||
duration=0,
|
||||
initial_counts={},
|
||||
final_counts={},
|
||||
error=str(e)
|
||||
))
|
||||
raise
|
||||
|
||||
async def _aggressive_cleanup(
|
||||
self,
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
valid_urls: Set[str]
|
||||
) -> Tuple[int, int, int]:
|
||||
"""Perform aggressive cleanup"""
|
||||
items_cleaned = 0
|
||||
guilds_cleaned = 0
|
||||
channels_cleaned = 0
|
||||
|
||||
# Clean guild tracking
|
||||
guild_cleaned = await self._cleanup_guild_tracking(
|
||||
guild_queues,
|
||||
valid_urls,
|
||||
validate_all=True
|
||||
)
|
||||
items_cleaned += guild_cleaned[0]
|
||||
guilds_cleaned += guild_cleaned[1]
|
||||
|
||||
# Clean channel tracking
|
||||
channel_cleaned = await self._cleanup_channel_tracking(
|
||||
channel_queues,
|
||||
valid_urls,
|
||||
validate_all=True
|
||||
)
|
||||
items_cleaned += channel_cleaned[0]
|
||||
channels_cleaned += channel_cleaned[1]
|
||||
|
||||
return items_cleaned, guilds_cleaned, channels_cleaned
|
||||
|
||||
async def _conservative_cleanup(
|
||||
self,
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
valid_urls: Set[str]
|
||||
) -> Tuple[int, int, int]:
|
||||
"""Perform conservative cleanup"""
|
||||
items_cleaned = 0
|
||||
guilds_cleaned = 0
|
||||
channels_cleaned = 0
|
||||
|
||||
# Only clean if invalid ratio exceeds threshold
|
||||
for guild_id, urls in list(guild_queues.items()):
|
||||
invalid_ratio = len(urls - valid_urls) / len(urls) if urls else 0
|
||||
if invalid_ratio > self.config.max_invalid_ratio:
|
||||
cleaned = await self._cleanup_guild_tracking(
|
||||
{guild_id: urls},
|
||||
valid_urls,
|
||||
validate_all=False
|
||||
)
|
||||
items_cleaned += cleaned[0]
|
||||
guilds_cleaned += cleaned[1]
|
||||
|
||||
for channel_id, urls in list(channel_queues.items()):
|
||||
invalid_ratio = len(urls - valid_urls) / len(urls) if urls else 0
|
||||
if invalid_ratio > self.config.max_invalid_ratio:
|
||||
cleaned = await self._cleanup_channel_tracking(
|
||||
{channel_id: urls},
|
||||
valid_urls,
|
||||
validate_all=False
|
||||
)
|
||||
items_cleaned += cleaned[0]
|
||||
channels_cleaned += cleaned[1]
|
||||
|
||||
return items_cleaned, guilds_cleaned, channels_cleaned
|
||||
|
||||
async def _balanced_cleanup(
|
||||
self,
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
valid_urls: Set[str]
|
||||
) -> Tuple[int, int, int]:
|
||||
"""Perform balanced cleanup"""
|
||||
items_cleaned = 0
|
||||
guilds_cleaned = 0
|
||||
channels_cleaned = 0
|
||||
|
||||
# Clean guild tracking with validation
|
||||
guild_cleaned = await self._cleanup_guild_tracking(
|
||||
guild_queues,
|
||||
valid_urls,
|
||||
validate_all=self.config.validate_urls
|
||||
)
|
||||
items_cleaned += guild_cleaned[0]
|
||||
guilds_cleaned += guild_cleaned[1]
|
||||
|
||||
# Clean channel tracking with validation
|
||||
channel_cleaned = await self._cleanup_channel_tracking(
|
||||
channel_queues,
|
||||
valid_urls,
|
||||
validate_all=self.config.validate_urls
|
||||
)
|
||||
items_cleaned += channel_cleaned[0]
|
||||
channels_cleaned += channel_cleaned[1]
|
||||
|
||||
return items_cleaned, guilds_cleaned, channels_cleaned
|
||||
|
||||
async def _cleanup_guild_tracking(
|
||||
self,
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
valid_urls: Set[str],
|
||||
validate_all: bool
|
||||
) -> Tuple[int, int]:
|
||||
"""Clean up guild tracking data"""
|
||||
items_cleaned = 0
|
||||
guilds_cleaned = 0
|
||||
batch_count = 0
|
||||
|
||||
for guild_id in list(guild_queues.keys()):
|
||||
if not self.validator.validate_id(guild_id):
|
||||
guild_queues.pop(guild_id)
|
||||
guilds_cleaned += 1
|
||||
continue
|
||||
|
||||
original_size = len(guild_queues[guild_id])
|
||||
guild_queues[guild_id] = {
|
||||
url for url in guild_queues[guild_id]
|
||||
if (
|
||||
(not validate_all or self.validator.validate_url(url)) and
|
||||
url in valid_urls
|
||||
)
|
||||
}
|
||||
items_cleaned += original_size - len(guild_queues[guild_id])
|
||||
|
||||
if self.config.cleanup_empty and not guild_queues[guild_id]:
|
||||
guild_queues.pop(guild_id)
|
||||
guilds_cleaned += 1
|
||||
|
||||
batch_count += 1
|
||||
if batch_count >= self.config.batch_size:
|
||||
await asyncio.sleep(0) # Yield to event loop
|
||||
batch_count = 0
|
||||
|
||||
logger.debug(f"Cleaned {items_cleaned} guild tracking items")
|
||||
return items_cleaned, guilds_cleaned
|
||||
|
||||
async def _cleanup_channel_tracking(
|
||||
self,
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
valid_urls: Set[str],
|
||||
validate_all: bool
|
||||
) -> Tuple[int, int]:
|
||||
"""Clean up channel tracking data"""
|
||||
items_cleaned = 0
|
||||
channels_cleaned = 0
|
||||
batch_count = 0
|
||||
|
||||
for channel_id in list(channel_queues.keys()):
|
||||
if not self.validator.validate_id(channel_id):
|
||||
channel_queues.pop(channel_id)
|
||||
channels_cleaned += 1
|
||||
continue
|
||||
|
||||
original_size = len(channel_queues[channel_id])
|
||||
channel_queues[channel_id] = {
|
||||
url for url in channel_queues[channel_id]
|
||||
if (
|
||||
(not validate_all or self.validator.validate_url(url)) and
|
||||
url in valid_urls
|
||||
)
|
||||
}
|
||||
items_cleaned += original_size - len(channel_queues[channel_id])
|
||||
|
||||
if self.config.cleanup_empty and not channel_queues[channel_id]:
|
||||
channel_queues.pop(channel_id)
|
||||
channels_cleaned += 1
|
||||
|
||||
batch_count += 1
|
||||
if batch_count >= self.config.batch_size:
|
||||
await asyncio.sleep(0) # Yield to event loop
|
||||
batch_count = 0
|
||||
|
||||
logger.debug(f"Cleaned {items_cleaned} channel tracking items")
|
||||
return items_cleaned, channels_cleaned
|
||||
|
||||
def _get_valid_urls(
|
||||
self,
|
||||
queue: List[QueueItem],
|
||||
processing: Dict[str, QueueItem]
|
||||
) -> Set[str]:
|
||||
"""Get set of valid URLs"""
|
||||
valid_urls = {item.url for item in queue}
|
||||
valid_urls.update(processing.keys())
|
||||
return valid_urls
|
||||
|
||||
def _get_tracking_counts(
|
||||
self,
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]]
|
||||
) -> Dict[str, int]:
|
||||
"""Get tracking data counts"""
|
||||
return {
|
||||
'guilds': len(guild_queues),
|
||||
'channels': len(channel_queues),
|
||||
'guild_urls': sum(len(urls) for urls in guild_queues.values()),
|
||||
'channel_urls': sum(len(urls) for urls in channel_queues.values())
|
||||
}
|
||||
|
||||
def format_tracking_cleanup_report(
|
||||
self,
|
||||
initial_counts: Dict[str, int],
|
||||
final_counts: Dict[str, int],
|
||||
duration: float
|
||||
) -> str:
|
||||
"""Format a tracking cleanup report"""
|
||||
total_cleaned = (
|
||||
(initial_counts['guild_urls'] - final_counts['guild_urls']) +
|
||||
(initial_counts['channel_urls'] - final_counts['channel_urls'])
|
||||
)
|
||||
|
||||
return (
|
||||
f"Tracking Cleanup Results:\n"
|
||||
f"Strategy: {self.strategy.value}\n"
|
||||
f"Duration: {duration:.2f}s\n"
|
||||
f"Items:\n"
|
||||
f"- Guild Queues: {initial_counts['guilds']} -> {final_counts['guilds']}\n"
|
||||
f"- Channel Queues: {initial_counts['channels']} -> {final_counts['channels']}\n"
|
||||
f"- Guild URLs: {initial_counts['guild_urls']} -> {final_counts['guild_urls']}\n"
|
||||
f"- Channel URLs: {initial_counts['channel_urls']} -> {final_counts['channel_urls']}\n"
|
||||
f"Total items cleaned: {total_cleaned}"
|
||||
)
|
||||
|
||||
def get_cleaner_stats(self) -> Dict[str, Any]:
|
||||
"""Get comprehensive cleaner statistics"""
|
||||
return {
|
||||
"strategy": self.strategy.value,
|
||||
"config": {
|
||||
"batch_size": self.config.batch_size,
|
||||
"retention_period": self.config.retention_period,
|
||||
"validate_urls": self.config.validate_urls,
|
||||
"cleanup_empty": self.config.cleanup_empty,
|
||||
"max_invalid_ratio": self.config.max_invalid_ratio
|
||||
},
|
||||
"tracker": self.tracker.get_stats()
|
||||
}
|
||||
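A hedged sketch for the TrackingCleaner above; the four arguments are the live queue structures held by the queue manager, and the import path mirrors the file path added in this commit:

from videoarchiver.queue.cleaners.tracking_cleaner import (
    TrackingCleaner,
    TrackingCleanupConfig,
    TrackingCleanupStrategy,
)


async def prune_tracking(queue, processing, guild_queues, channel_queues):
    cleaner = TrackingCleaner(
        strategy=TrackingCleanupStrategy.BALANCED,
        config=TrackingCleanupConfig(validate_urls=True, cleanup_empty=True),
    )
    # cleanup_tracking mutates guild_queues/channel_queues in place and
    # returns (items_cleaned, counts_before_cleanup).
    items_cleaned, counts_before = await cleaner.cleanup_tracking(
        guild_queues,
        channel_queues,
        queue,
        processing,
    )
    return items_cleaned, counts_before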
@@ -2,316 +2,459 @@
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Set, Optional, Any, Tuple
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Set, Optional
|
||||
from .models import QueueItem, QueueMetrics
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
from .models import QueueItem, QueueMetrics
|
||||
from .cleaners.history_cleaner import (
|
||||
HistoryCleaner,
|
||||
CleanupStrategy as HistoryStrategy
|
||||
)
|
||||
from .cleaners.guild_cleaner import (
|
||||
GuildCleaner,
|
||||
GuildCleanupStrategy
|
||||
)
|
||||
from .cleaners.tracking_cleaner import (
|
||||
TrackingCleaner,
|
||||
TrackingCleanupStrategy
|
||||
)
|
||||
|
||||
logger = logging.getLogger("QueueCleanup")
|
||||
|
||||
class QueueCleaner:
|
||||
"""Handles cleanup of old queue items and tracking data"""
|
||||
class CleanupMode(Enum):
|
||||
"""Cleanup operation modes"""
|
||||
NORMAL = "normal" # Regular cleanup
|
||||
AGGRESSIVE = "aggressive" # More aggressive cleanup
|
||||
MAINTENANCE = "maintenance" # Maintenance mode cleanup
|
||||
EMERGENCY = "emergency" # Emergency cleanup
|
||||
|
||||
class CleanupPhase(Enum):
|
||||
"""Cleanup operation phases"""
|
||||
HISTORY = "history"
|
||||
TRACKING = "tracking"
|
||||
GUILD = "guild"
|
||||
VERIFICATION = "verification"
|
||||
|
||||
@dataclass
|
||||
class CleanupConfig:
|
||||
"""Configuration for cleanup operations"""
|
||||
cleanup_interval: int = 1800 # 30 minutes
|
||||
max_history_age: int = 43200 # 12 hours
|
||||
batch_size: int = 100
|
||||
max_concurrent_cleanups: int = 3
|
||||
verification_interval: int = 300 # 5 minutes
|
||||
emergency_threshold: int = 10000 # Items threshold for emergency
|
||||
|
||||
@dataclass
|
||||
class CleanupResult:
|
||||
"""Result of a cleanup operation"""
|
||||
timestamp: datetime
|
||||
mode: CleanupMode
|
||||
duration: float
|
||||
items_cleaned: Dict[CleanupPhase, int]
|
||||
error: Optional[str] = None
|
||||
|
||||
class CleanupScheduler:
|
||||
"""Schedules cleanup operations"""
|
||||
|
||||
def __init__(self, config: CleanupConfig):
|
||||
self.config = config
|
||||
self.next_cleanup: Optional[datetime] = None
|
||||
self.next_verification: Optional[datetime] = None
|
||||
self._last_emergency: Optional[datetime] = None
|
||||
|
||||
def should_cleanup(self, queue_size: int) -> Tuple[bool, CleanupMode]:
|
||||
"""Determine if cleanup should run"""
|
||||
now = datetime.utcnow()
|
||||
|
||||
# Check for emergency cleanup
|
||||
if (
|
||||
queue_size > self.config.emergency_threshold and
|
||||
(
|
||||
not self._last_emergency or
|
||||
now - self._last_emergency > timedelta(minutes=5)
|
||||
)
|
||||
):
|
||||
self._last_emergency = now
|
||||
return True, CleanupMode.EMERGENCY
|
||||
|
||||
# Check scheduled cleanup
|
||||
if not self.next_cleanup or now >= self.next_cleanup:
|
||||
self.next_cleanup = now + timedelta(
|
||||
seconds=self.config.cleanup_interval
|
||||
)
|
||||
return True, CleanupMode.NORMAL
|
||||
|
||||
# Check verification
|
||||
if not self.next_verification or now >= self.next_verification:
|
||||
self.next_verification = now + timedelta(
|
||||
seconds=self.config.verification_interval
|
||||
)
|
||||
return True, CleanupMode.MAINTENANCE
|
||||
|
||||
return False, CleanupMode.NORMAL
|
||||
|
||||
class CleanupCoordinator:
|
||||
"""Coordinates cleanup operations"""
|
||||
|
||||
def __init__(self):
|
||||
self.active_cleanups: Set[CleanupPhase] = set()
|
||||
self._cleanup_lock = asyncio.Lock()
|
||||
self._phase_locks: Dict[CleanupPhase, asyncio.Lock] = {
|
||||
phase: asyncio.Lock() for phase in CleanupPhase
|
||||
}
|
||||
|
||||
async def start_cleanup(self, phase: CleanupPhase) -> bool:
|
||||
"""Start a cleanup phase"""
|
||||
async with self._cleanup_lock:
|
||||
if phase in self.active_cleanups:
|
||||
return False
|
||||
self.active_cleanups.add(phase)
|
||||
return True
|
||||
|
||||
async def end_cleanup(self, phase: CleanupPhase) -> None:
|
||||
"""End a cleanup phase"""
|
||||
async with self._cleanup_lock:
|
||||
self.active_cleanups.discard(phase)
|
||||
|
||||
async def acquire_phase(self, phase: CleanupPhase) -> bool:
|
||||
"""Acquire lock for a cleanup phase"""
|
||||
return await self._phase_locks[phase].acquire()
|
||||
|
||||
def release_phase(self, phase: CleanupPhase) -> None:
|
||||
"""Release lock for a cleanup phase"""
|
||||
self._phase_locks[phase].release()
|
||||
|
||||
class CleanupTracker:
|
||||
"""Tracks cleanup operations"""
|
||||
|
||||
def __init__(self, max_history: int = 1000):
|
||||
self.max_history = max_history
|
||||
self.history: List[CleanupResult] = []
|
||||
self.total_items_cleaned = 0
|
||||
self.last_cleanup: Optional[datetime] = None
|
||||
self.cleanup_counts: Dict[CleanupMode, int] = {
|
||||
mode: 0 for mode in CleanupMode
|
||||
}
|
||||
|
||||
def record_cleanup(self, result: CleanupResult) -> None:
|
||||
"""Record a cleanup operation"""
|
||||
self.history.append(result)
|
||||
if len(self.history) > self.max_history:
|
||||
self.history.pop(0)
|
||||
|
||||
self.total_items_cleaned += sum(result.items_cleaned.values())
|
||||
self.last_cleanup = result.timestamp
|
||||
self.cleanup_counts[result.mode] += 1
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
|
||||
"""Get cleanup statistics"""
|
||||
return {
|
||||
"total_cleanups": len(self.history),
|
||||
"total_items_cleaned": self.total_items_cleaned,
|
||||
"last_cleanup": (
|
||||
self.last_cleanup.isoformat()
|
||||
if self.last_cleanup
|
||||
else None
|
||||
),
|
||||
"cleanup_counts": {
|
||||
mode.value: count
|
||||
for mode, count in self.cleanup_counts.items()
|
||||
},
|
||||
"recent_cleanups": [
|
||||
{
|
||||
"timestamp": r.timestamp.isoformat(),
|
||||
"mode": r.mode.value,
|
||||
"duration": r.duration,
|
||||
"items_cleaned": {
|
||||
phase.value: count
|
||||
for phase, count in r.items_cleaned.items()
|
||||
}
|
||||
}
|
||||
for r in self.history[-5:] # Last 5 cleanups
|
||||
]
|
||||
}
|
||||
|
||||
class QueueCleaner:
|
||||
"""Handles cleanup of queue items and tracking data"""
|
||||
|
||||
def __init__(self, config: Optional[CleanupConfig] = None):
|
||||
self.config = config or CleanupConfig()
|
||||
self.scheduler = CleanupScheduler(self.config)
|
||||
self.coordinator = CleanupCoordinator()
|
||||
self.tracker = CleanupTracker()
|
||||
|
||||
# Initialize cleaners
|
||||
self.history_cleaner = HistoryCleaner()
|
||||
self.guild_cleaner = GuildCleaner()
|
||||
self.tracking_cleaner = TrackingCleaner()
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
cleanup_interval: int = 1800, # 30 minutes
|
||||
max_history_age: int = 43200, # 12 hours
|
||||
):
|
||||
self.cleanup_interval = cleanup_interval
|
||||
self.max_history_age = max_history_age
|
||||
self._shutdown = False
|
||||
self._cleanup_task: Optional[asyncio.Task] = None
|
||||
self._last_cleanup_time = datetime.utcnow()
|
||||
|
||||
async def start_cleanup(
|
||||
async def start(
|
||||
self,
|
||||
queue: List[QueueItem],
|
||||
completed: Dict[str, QueueItem],
|
||||
failed: Dict[str, QueueItem],
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
processing: Dict[str, QueueItem],
|
||||
metrics: QueueMetrics,
|
||||
queue_lock: asyncio.Lock
|
||||
state_manager,
|
||||
metrics_manager
|
||||
) -> None:
|
||||
"""Start periodic cleanup process
|
||||
|
||||
Args:
|
||||
queue: Reference to the queue list
|
||||
completed: Reference to completed items dict
|
||||
failed: Reference to failed items dict
|
||||
guild_queues: Reference to guild tracking dict
|
||||
channel_queues: Reference to channel tracking dict
|
||||
processing: Reference to processing dict
|
||||
metrics: Reference to queue metrics
|
||||
queue_lock: Lock for queue operations
|
||||
"""
|
||||
"""Start periodic cleanup process"""
|
||||
if self._cleanup_task is not None:
|
||||
logger.warning("Cleanup task already running")
|
||||
return
|
||||
|
||||
logger.info("Starting queue cleanup task...")
|
||||
self._cleanup_task = asyncio.create_task(
|
||||
self._cleanup_loop(
|
||||
queue,
|
||||
completed,
|
||||
failed,
|
||||
guild_queues,
|
||||
channel_queues,
|
||||
processing,
|
||||
metrics,
|
||||
queue_lock
|
||||
)
|
||||
self._cleanup_loop(state_manager, metrics_manager)
|
||||
)
|
||||
|
||||
async def _cleanup_loop(
|
||||
self,
|
||||
queue: List[QueueItem],
|
||||
completed: Dict[str, QueueItem],
|
||||
failed: Dict[str, QueueItem],
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
processing: Dict[str, QueueItem],
|
||||
metrics: QueueMetrics,
|
||||
queue_lock: asyncio.Lock
|
||||
state_manager,
|
||||
metrics_manager
|
||||
) -> None:
|
||||
"""Main cleanup loop"""
|
||||
while not self._shutdown:
|
||||
try:
|
||||
await self._perform_cleanup(
|
||||
queue,
|
||||
completed,
|
||||
failed,
|
||||
guild_queues,
|
||||
channel_queues,
|
||||
processing,
|
||||
metrics,
|
||||
queue_lock
|
||||
)
|
||||
self._last_cleanup_time = datetime.utcnow()
|
||||
await asyncio.sleep(self.cleanup_interval)
|
||||
# Check if cleanup should run
|
||||
queue_size = len(await state_manager.get_queue())
|
||||
should_run, mode = self.scheduler.should_cleanup(queue_size)
|
||||
|
||||
if should_run:
|
||||
await self._perform_cleanup(
|
||||
state_manager,
|
||||
metrics_manager,
|
||||
mode
|
||||
)
|
||||
|
||||
await asyncio.sleep(1) # Short sleep to prevent CPU hogging
|
||||
|
||||
except asyncio.CancelledError:
|
||||
logger.info("Queue cleanup cancelled")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Error in cleanup loop: {str(e)}")
|
||||
# Shorter sleep on error to retry sooner
|
||||
await asyncio.sleep(30)
|
||||
await asyncio.sleep(30) # Longer sleep on error
|
||||
|
||||
def stop_cleanup(self) -> None:
|
||||
async def stop(self) -> None:
|
||||
"""Stop the cleanup process"""
|
||||
logger.info("Stopping queue cleanup...")
|
||||
self._shutdown = True
|
||||
if self._cleanup_task and not self._cleanup_task.done():
|
||||
self._cleanup_task.cancel()
|
||||
try:
|
||||
await self._cleanup_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
self._cleanup_task = None
|
||||
|
||||
async def _perform_cleanup(
|
||||
self,
|
||||
queue: List[QueueItem],
|
||||
completed: Dict[str, QueueItem],
|
||||
failed: Dict[str, QueueItem],
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
processing: Dict[str, QueueItem],
|
||||
metrics: QueueMetrics,
|
||||
queue_lock: asyncio.Lock
|
||||
state_manager,
|
||||
metrics_manager,
|
||||
mode: CleanupMode
|
||||
) -> None:
|
||||
"""Perform cleanup operations
|
||||
|
||||
Args:
|
||||
queue: Reference to the queue list
|
||||
completed: Reference to completed items dict
|
||||
failed: Reference to failed items dict
|
||||
guild_queues: Reference to guild tracking dict
|
||||
channel_queues: Reference to channel tracking dict
|
||||
processing: Reference to processing dict
|
||||
metrics: Reference to queue metrics
|
||||
queue_lock: Lock for queue operations
|
||||
"""
|
||||
"""Perform cleanup operations"""
|
||||
start_time = datetime.utcnow()
|
||||
items_cleaned: Dict[CleanupPhase, int] = {
|
||||
phase: 0 for phase in CleanupPhase
|
||||
}
|
||||
|
||||
try:
|
||||
current_time = datetime.utcnow()
|
||||
cleanup_cutoff = current_time - timedelta(seconds=self.max_history_age)
|
||||
items_cleaned = 0
|
||||
# Get current state
|
||||
queue = await state_manager.get_queue()
|
||||
processing = await state_manager.get_processing()
|
||||
completed = await state_manager.get_completed()
|
||||
failed = await state_manager.get_failed()
|
||||
guild_queues = await state_manager.get_guild_queues()
|
||||
channel_queues = await state_manager.get_channel_queues()
|
||||
|
||||
async with queue_lock:
|
||||
# Clean up completed items
|
||||
completed_count = len(completed)
|
||||
for url in list(completed.keys()):
|
||||
try:
|
||||
item = completed[url]
|
||||
if not isinstance(item.added_at, datetime):
|
||||
try:
|
||||
if isinstance(item.added_at, str):
|
||||
item.added_at = datetime.fromisoformat(item.added_at)
|
||||
else:
|
||||
item.added_at = current_time
|
||||
except (ValueError, TypeError):
|
||||
item.added_at = current_time
|
||||
|
||||
if item.added_at < cleanup_cutoff:
|
||||
completed.pop(url)
|
||||
items_cleaned += 1
|
||||
except Exception as e:
|
||||
logger.error(f"Error cleaning completed item {url}: {e}")
|
||||
completed.pop(url)
|
||||
items_cleaned += 1
|
||||
# Clean historical items
|
||||
if await self.coordinator.start_cleanup(CleanupPhase.HISTORY):
|
||||
try:
|
||||
await self.coordinator.acquire_phase(CleanupPhase.HISTORY)
|
||||
cleanup_cutoff = self.history_cleaner.get_cleanup_cutoff()
|
||||
|
||||
# Adjust strategy based on mode
|
||||
if mode == CleanupMode.AGGRESSIVE:
|
||||
self.history_cleaner.strategy = HistoryStrategy.AGGRESSIVE
|
||||
elif mode == CleanupMode.MAINTENANCE:
|
||||
self.history_cleaner.strategy = HistoryStrategy.CONSERVATIVE
|
||||
|
||||
completed_cleaned = await self.history_cleaner.cleanup_completed(
|
||||
completed,
|
||||
cleanup_cutoff
|
||||
)
|
||||
failed_cleaned = await self.history_cleaner.cleanup_failed(
|
||||
failed,
|
||||
cleanup_cutoff
|
||||
)
|
||||
items_cleaned[CleanupPhase.HISTORY] = (
|
||||
completed_cleaned + failed_cleaned
|
||||
)
|
||||
finally:
|
||||
self.coordinator.release_phase(CleanupPhase.HISTORY)
|
||||
await self.coordinator.end_cleanup(CleanupPhase.HISTORY)
|
||||
|
||||
# Clean up failed items
|
||||
failed_count = len(failed)
|
||||
for url in list(failed.keys()):
|
||||
try:
|
||||
item = failed[url]
|
||||
if not isinstance(item.added_at, datetime):
|
||||
try:
|
||||
if isinstance(item.added_at, str):
|
||||
item.added_at = datetime.fromisoformat(item.added_at)
|
||||
else:
|
||||
item.added_at = current_time
|
||||
except (ValueError, TypeError):
|
||||
item.added_at = current_time
|
||||
|
||||
if item.added_at < cleanup_cutoff:
|
||||
failed.pop(url)
|
||||
items_cleaned += 1
|
||||
except Exception as e:
|
||||
logger.error(f"Error cleaning failed item {url}: {e}")
|
||||
failed.pop(url)
|
||||
items_cleaned += 1
|
||||
# Clean tracking data
|
||||
if await self.coordinator.start_cleanup(CleanupPhase.TRACKING):
|
||||
try:
|
||||
await self.coordinator.acquire_phase(CleanupPhase.TRACKING)
|
||||
|
||||
# Adjust strategy based on mode
|
||||
if mode == CleanupMode.AGGRESSIVE:
|
||||
self.tracking_cleaner.strategy = TrackingCleanupStrategy.AGGRESSIVE
|
||||
elif mode == CleanupMode.MAINTENANCE:
|
||||
self.tracking_cleaner.strategy = TrackingCleanupStrategy.CONSERVATIVE
|
||||
|
||||
tracking_cleaned, _ = await self.tracking_cleaner.cleanup_tracking(
|
||||
guild_queues,
|
||||
channel_queues,
|
||||
queue,
|
||||
processing
|
||||
)
|
||||
items_cleaned[CleanupPhase.TRACKING] = tracking_cleaned
|
||||
finally:
|
||||
self.coordinator.release_phase(CleanupPhase.TRACKING)
|
||||
await self.coordinator.end_cleanup(CleanupPhase.TRACKING)
|
||||
|
||||
# Clean up guild tracking
|
||||
guild_count = len(guild_queues)
|
||||
for guild_id in list(guild_queues.keys()):
|
||||
original_size = len(guild_queues[guild_id])
|
||||
guild_queues[guild_id] = {
|
||||
url for url in guild_queues[guild_id]
|
||||
if url in queue or url in processing
|
||||
}
|
||||
items_cleaned += original_size - len(guild_queues[guild_id])
|
||||
if not guild_queues[guild_id]:
|
||||
guild_queues.pop(guild_id)
|
||||
# Update state
|
||||
await state_manager.update_state(
|
||||
completed=completed,
|
||||
failed=failed,
|
||||
guild_queues=guild_queues,
|
||||
channel_queues=channel_queues
|
||||
)
|
||||
|
||||
# Clean up channel tracking
|
||||
channel_count = len(channel_queues)
|
||||
for channel_id in list(channel_queues.keys()):
|
||||
original_size = len(channel_queues[channel_id])
|
||||
channel_queues[channel_id] = {
|
||||
url for url in channel_queues[channel_id]
|
||||
if url in queue or url in processing
|
||||
}
|
||||
items_cleaned += original_size - len(channel_queues[channel_id])
|
||||
if not channel_queues[channel_id]:
|
||||
channel_queues.pop(channel_id)
|
||||
# Record cleanup result
|
||||
duration = (datetime.utcnow() - start_time).total_seconds()
|
||||
result = CleanupResult(
|
||||
timestamp=datetime.utcnow(),
|
||||
mode=mode,
|
||||
duration=duration,
|
||||
items_cleaned=items_cleaned
|
||||
)
|
||||
self.tracker.record_cleanup(result)
|
||||
|
||||
# Update metrics
metrics.last_cleanup = current_time
metrics_manager.update_cleanup_time()
|
||||
|
||||
logger.info(
|
||||
f"Queue cleanup completed:\n"
|
||||
f"- Items cleaned: {items_cleaned}\n"
|
||||
f"- Completed items: {completed_count} -> {len(completed)}\n"
|
||||
f"- Failed items: {failed_count} -> {len(failed)}\n"
|
||||
f"- Guild queues: {guild_count} -> {len(guild_queues)}\n"
|
||||
f"- Channel queues: {channel_count} -> {len(channel_queues)}\n"
|
||||
f"- Current queue size: {len(queue)}\n"
|
||||
f"- Processing items: {len(processing)}"
|
||||
)
|
||||
logger.info(
|
||||
f"Cleanup completed ({mode.value}):\n" +
|
||||
"\n".join(
|
||||
f"- {phase.value}: {count} items"
|
||||
for phase, count in items_cleaned.items()
|
||||
if count > 0
|
||||
) +
|
||||
f"\nTotal duration: {duration:.2f}s"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during cleanup: {str(e)}")
|
||||
# Don't re-raise to keep cleanup running
|
||||
duration = (datetime.utcnow() - start_time).total_seconds()
|
||||
self.tracker.record_cleanup(CleanupResult(
|
||||
timestamp=datetime.utcnow(),
|
||||
mode=mode,
|
||||
duration=duration,
|
||||
items_cleaned=items_cleaned,
|
||||
error=str(e)
|
||||
))
|
||||
raise CleanupError(f"Cleanup failed: {str(e)}")
|
||||
|
||||
async def clear_guild_queue(
|
||||
self,
|
||||
guild_id: int,
|
||||
queue: List[QueueItem],
|
||||
processing: Dict[str, QueueItem],
|
||||
completed: Dict[str, QueueItem],
|
||||
failed: Dict[str, QueueItem],
|
||||
guild_queues: Dict[int, Set[str]],
|
||||
channel_queues: Dict[int, Set[str]],
|
||||
queue_lock: asyncio.Lock,
state_manager
|
||||
) -> int:
|
||||
"""Clear all queue items for a specific guild
|
||||
|
||||
Args:
|
||||
guild_id: ID of the guild to clear
|
||||
queue: Reference to the queue list
|
||||
processing: Reference to processing dict
|
||||
completed: Reference to completed items dict
|
||||
failed: Reference to failed items dict
|
||||
guild_queues: Reference to guild tracking dict
|
||||
channel_queues: Reference to channel tracking dict
|
||||
queue_lock: Lock for queue operations
|
||||
|
||||
Returns:
|
||||
Number of items cleared
|
||||
"""
|
||||
"""Clear all queue items for a specific guild"""
|
||||
try:
|
||||
cleared_count = 0
|
||||
async with queue_lock:
|
||||
# Get URLs for this guild
|
||||
guild_urls = guild_queues.get(guild_id, set())
|
||||
initial_counts = {
|
||||
'queue': len([item for item in queue if item.guild_id == guild_id]),
|
||||
'processing': len([item for item in processing.values() if item.guild_id == guild_id]),
|
||||
'completed': len([item for item in completed.values() if item.guild_id == guild_id]),
|
||||
'failed': len([item for item in failed.values() if item.guild_id == guild_id])
|
||||
}
|
||||
if not await self.coordinator.start_cleanup(CleanupPhase.GUILD):
|
||||
raise CleanupError("Guild cleanup already in progress")
|
||||
|
||||
# Clear from pending queue
|
||||
queue[:] = [item for item in queue if item.guild_id != guild_id]
|
||||
try:
|
||||
await self.coordinator.acquire_phase(CleanupPhase.GUILD)
|
||||
|
||||
# Get current state
|
||||
queue = await state_manager.get_queue()
|
||||
processing = await state_manager.get_processing()
|
||||
completed = await state_manager.get_completed()
|
||||
failed = await state_manager.get_failed()
|
||||
guild_queues = await state_manager.get_guild_queues()
|
||||
channel_queues = await state_manager.get_channel_queues()
|
||||
|
||||
# Clear from processing
|
||||
for url in list(processing.keys()):
|
||||
if processing[url].guild_id == guild_id:
|
||||
processing.pop(url)
|
||||
cleared_count += 1
|
||||
|
||||
# Clear from completed
|
||||
for url in list(completed.keys()):
|
||||
if completed[url].guild_id == guild_id:
|
||||
completed.pop(url)
|
||||
cleared_count += 1
|
||||
|
||||
# Clear from failed
|
||||
for url in list(failed.keys()):
|
||||
if failed[url].guild_id == guild_id:
|
||||
failed.pop(url)
|
||||
cleared_count += 1
|
||||
|
||||
# Clear guild tracking
|
||||
if guild_id in guild_queues:
|
||||
cleared_count += len(guild_queues[guild_id])
|
||||
guild_queues.pop(guild_id)
|
||||
|
||||
# Clear channel tracking for this guild's channels
|
||||
for channel_id in list(channel_queues.keys()):
|
||||
channel_queues[channel_id] = {
|
||||
url for url in channel_queues[channel_id]
|
||||
if url not in guild_urls
|
||||
}
|
||||
if not channel_queues[channel_id]:
|
||||
channel_queues.pop(channel_id)
|
||||
|
||||
logger.info(
|
||||
f"Cleared guild {guild_id} queue:\n"
|
||||
f"- Queue: {initial_counts['queue']} items\n"
|
||||
f"- Processing: {initial_counts['processing']} items\n"
|
||||
f"- Completed: {initial_counts['completed']} items\n"
|
||||
f"- Failed: {initial_counts['failed']} items\n"
|
||||
f"Total cleared: {cleared_count} items"
|
||||
# Clear guild items
|
||||
cleared_count, counts = await self.guild_cleaner.clear_guild_items(
|
||||
guild_id,
|
||||
queue,
|
||||
processing,
|
||||
completed,
|
||||
failed,
|
||||
guild_queues,
|
||||
channel_queues
|
||||
)
|
||||
|
||||
# Update state
|
||||
await state_manager.update_state(
|
||||
queue=queue,
|
||||
processing=processing,
|
||||
completed=completed,
|
||||
failed=failed,
|
||||
guild_queues=guild_queues,
|
||||
channel_queues=channel_queues
|
||||
)
|
||||
|
||||
return cleared_count
|
||||
|
||||
finally:
|
||||
self.coordinator.release_phase(CleanupPhase.GUILD)
|
||||
await self.coordinator.end_cleanup(CleanupPhase.GUILD)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error clearing guild queue: {str(e)}")
|
||||
raise CleanupError(f"Failed to clear guild queue: {str(e)}")
|
||||
|
||||
def get_cleaner_stats(self) -> Dict[str, Any]:
|
||||
"""Get comprehensive cleaner statistics"""
|
||||
return {
|
||||
"config": {
|
||||
"cleanup_interval": self.config.cleanup_interval,
|
||||
"max_history_age": self.config.max_history_age,
|
||||
"batch_size": self.config.batch_size,
|
||||
"max_concurrent_cleanups": self.config.max_concurrent_cleanups,
|
||||
"verification_interval": self.config.verification_interval,
|
||||
"emergency_threshold": self.config.emergency_threshold
|
||||
},
|
||||
"scheduler": {
|
||||
"next_cleanup": (
|
||||
self.scheduler.next_cleanup.isoformat()
|
||||
if self.scheduler.next_cleanup
|
||||
else None
|
||||
),
|
||||
"next_verification": (
|
||||
self.scheduler.next_verification.isoformat()
|
||||
if self.scheduler.next_verification
|
||||
else None
|
||||
),
|
||||
"last_emergency": (
|
||||
self.scheduler._last_emergency.isoformat()
|
||||
if self.scheduler._last_emergency
|
||||
else None
|
||||
)
|
||||
},
|
||||
"coordinator": {
|
||||
"active_cleanups": [
|
||||
phase.value for phase in self.coordinator.active_cleanups
|
||||
]
|
||||
},
|
||||
"tracker": self.tracker.get_stats(),
|
||||
"cleaners": {
|
||||
"history": self.history_cleaner.get_cleaner_stats(),
|
||||
"guild": self.guild_cleaner.get_cleaner_stats(),
|
||||
"tracking": self.tracking_cleaner.get_cleaner_stats()
|
||||
}
|
||||
}
|
||||
|
||||
class CleanupError(Exception):
|
||||
"""Base exception for cleanup-related errors"""
|
||||
pass
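For orientation, the guild-clearing pattern used above (drop every item whose guild_id matches, then prune the per-guild URL tracking set) can be sketched in isolation. The helper below is a simplified, hypothetical stand-in rather than the cleaner class itself; it only assumes that items expose url and guild_id attributes.

from dataclasses import dataclass
from typing import Dict, List, Set

@dataclass
class _Item:
    url: str
    guild_id: int

def clear_guild(
    guild_id: int,
    queue: List[_Item],
    processing: Dict[str, _Item],
    guild_queues: Dict[int, Set[str]],
) -> int:
    """Remove a guild's items from the pending queue and processing map."""
    cleared = 0
    # Drop pending items belonging to the guild
    before = len(queue)
    queue[:] = [item for item in queue if item.guild_id != guild_id]
    cleared += before - len(queue)
    # Drop in-flight items belonging to the guild
    for url in list(processing.keys()):
        if processing[url].guild_id == guild_id:
            processing.pop(url)
            cleared += 1
    # Drop the guild's URL tracking set entirely
    cleared += len(guild_queues.pop(guild_id, set()))
    return cleared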
|
||||
|
||||
441
videoarchiver/queue/health_checker.py
Normal file
@@ -0,0 +1,441 @@
|
||||
"""Module for queue health checks"""
|
||||
|
||||
import logging
|
||||
import psutil
|
||||
import time
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, Optional, Tuple, List, Any, Set
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
logger = logging.getLogger("QueueHealthChecker")
|
||||
|
||||
class HealthStatus(Enum):
|
||||
"""Possible health status values"""
|
||||
HEALTHY = "healthy"
|
||||
WARNING = "warning"
|
||||
CRITICAL = "critical"
|
||||
UNKNOWN = "unknown"
|
||||
|
||||
class HealthCategory(Enum):
|
||||
"""Health check categories"""
|
||||
MEMORY = "memory"
|
||||
PERFORMANCE = "performance"
|
||||
ACTIVITY = "activity"
|
||||
ERRORS = "errors"
|
||||
DEADLOCKS = "deadlocks"
|
||||
SYSTEM = "system"
|
||||
|
||||
@dataclass
|
||||
class HealthThresholds:
|
||||
"""Defines thresholds for health checks"""
|
||||
memory_warning_mb: int = 384 # 384MB
|
||||
memory_critical_mb: int = 512 # 512MB
|
||||
deadlock_warning_sec: int = 30 # 30 seconds
|
||||
deadlock_critical_sec: int = 60 # 1 minute
|
||||
error_rate_warning: float = 0.1 # 10% errors
|
||||
error_rate_critical: float = 0.2 # 20% errors
|
||||
inactivity_warning_sec: int = 30
|
||||
inactivity_critical_sec: int = 60
|
||||
cpu_warning_percent: float = 80.0
|
||||
cpu_critical_percent: float = 90.0
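The thresholds are plain dataclass fields, so a deployment can override just the limits it cares about while keeping the defaults above. An illustrative construction (the numbers are arbitrary):

custom_thresholds = HealthThresholds(
    memory_warning_mb=256,
    memory_critical_mb=400,
    cpu_warning_percent=70.0,
)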
|
||||
|
||||
@dataclass
|
||||
class HealthCheckResult:
|
||||
"""Result of a health check"""
|
||||
category: HealthCategory
|
||||
status: HealthStatus
|
||||
message: str
|
||||
value: Optional[float] = None
|
||||
timestamp: datetime = field(default_factory=datetime.utcnow)
|
||||
details: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
class HealthHistory:
|
||||
"""Tracks health check history"""
|
||||
|
||||
def __init__(self, max_history: int = 1000):
|
||||
self.max_history = max_history
|
||||
self.history: List[HealthCheckResult] = []
|
||||
self.status_changes: List[Dict[str, Any]] = []
|
||||
self.critical_events: List[Dict[str, Any]] = []
|
||||
|
||||
def add_result(self, result: HealthCheckResult) -> None:
|
||||
"""Add a health check result"""
|
||||
self.history.append(result)
|
||||
if len(self.history) > self.max_history:
|
||||
self.history.pop(0)
|
||||
|
||||
# Track status changes
|
||||
if len(self.history) >= 2 and self.history[-1].status != self.history[-2].status:
|
||||
self.status_changes.append({
|
||||
"timestamp": result.timestamp,
|
||||
"category": result.category.value,
|
||||
"from_status": self.history[-2].status.value,
|
||||
"to_status": result.status.value,
|
||||
"message": result.message
|
||||
})
|
||||
|
||||
# Track critical events
|
||||
if result.status == HealthStatus.CRITICAL:
|
||||
self.critical_events.append({
|
||||
"timestamp": result.timestamp,
|
||||
"category": result.category.value,
|
||||
"message": result.message,
|
||||
"details": result.details
|
||||
})
|
||||
|
||||
def get_status_summary(self) -> Dict[str, Any]:
|
||||
"""Get summary of health status history"""
|
||||
return {
|
||||
"total_checks": len(self.history),
|
||||
"status_changes": len(self.status_changes),
|
||||
"critical_events": len(self.critical_events),
|
||||
"recent_status_changes": self.status_changes[-5:],
|
||||
"recent_critical_events": self.critical_events[-5:]
|
||||
}
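A status change is recorded whenever two consecutive results differ in status, and every CRITICAL result is also kept in critical_events, so the summary counters stay cheap to maintain. A small illustrative sequence:

hist = HealthHistory()
hist.add_result(HealthCheckResult(HealthCategory.MEMORY, HealthStatus.HEALTHY, "ok"))
hist.add_result(HealthCheckResult(HealthCategory.MEMORY, HealthStatus.CRITICAL, "memory spike"))
summary = hist.get_status_summary()
assert summary["status_changes"] == 1
assert summary["critical_events"] == 1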
|
||||
|
||||
class SystemHealthMonitor:
|
||||
"""Monitors system health metrics"""
|
||||
|
||||
def __init__(self):
|
||||
self.process = psutil.Process()
|
||||
|
||||
async def check_system_health(self) -> Dict[str, Any]:
|
||||
"""Check system health metrics"""
|
||||
try:
|
||||
cpu_percent = self.process.cpu_percent()
|
||||
memory_info = self.process.memory_info()
|
||||
io_counters = self.process.io_counters()
|
||||
|
||||
return {
|
||||
"cpu_percent": cpu_percent,
|
||||
"memory_rss": memory_info.rss / 1024 / 1024, # MB
|
||||
"memory_vms": memory_info.vms / 1024 / 1024, # MB
|
||||
"io_read_mb": io_counters.read_bytes / 1024 / 1024,
|
||||
"io_write_mb": io_counters.write_bytes / 1024 / 1024,
|
||||
"thread_count": self.process.num_threads(),
|
||||
"open_files": len(self.process.open_files()),
|
||||
"connections": len(self.process.connections())
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking system health: {e}")
|
||||
return {}
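Because the monitor wraps a single psutil.Process handle for the current process, it can be exercised on its own; the method is declared async even though it does no awaiting, apparently for symmetry with the other checks. A minimal illustrative run:

import asyncio

async def _system_demo() -> None:
    monitor = SystemHealthMonitor()
    metrics = await monitor.check_system_health()
    # Falls back to 0.0 if metrics could not be collected
    print(f"CPU {metrics.get('cpu_percent', 0.0):.1f}%  RSS {metrics.get('memory_rss', 0.0):.1f}MB")

# asyncio.run(_system_demo())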
|
||||
|
||||
class HealthChecker:
|
||||
"""Handles health checks for the queue system"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
thresholds: Optional[HealthThresholds] = None,
|
||||
history_size: int = 1000
|
||||
):
|
||||
self.thresholds = thresholds or HealthThresholds()
|
||||
self.history = HealthHistory(history_size)
|
||||
self.system_monitor = SystemHealthMonitor()
|
||||
self._last_gc_time: Optional[datetime] = None
|
||||
|
||||
async def check_health(
|
||||
self,
|
||||
metrics: Dict[str, Any],
|
||||
queue_info: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
"""Perform comprehensive health check"""
|
||||
results = []
|
||||
|
||||
# Check memory health
|
||||
memory_result = await self._check_memory_health()
|
||||
results.append(memory_result)
|
||||
|
||||
# Check performance health
|
||||
perf_result = self._check_performance_health(metrics)
|
||||
results.append(perf_result)
|
||||
|
||||
# Check activity health
|
||||
activity_result = self._check_activity_health(
|
||||
queue_info["last_activity"],
|
||||
queue_info["processing_count"] > 0
|
||||
)
|
||||
results.append(activity_result)
|
||||
|
||||
# Check error health
|
||||
error_result = self._check_error_health(metrics)
|
||||
results.append(error_result)
|
||||
|
||||
# Check for deadlocks
|
||||
deadlock_result = self._check_deadlocks(queue_info)
|
||||
results.append(deadlock_result)
|
||||
|
||||
# Check system health
|
||||
system_result = await self._check_system_health()
|
||||
results.append(system_result)
|
||||
|
||||
# Record results
|
||||
for result in results:
|
||||
self.history.add_result(result)
|
||||
|
||||
# Determine overall health
|
||||
overall_status = self._determine_overall_status(results)
|
||||
|
||||
return {
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"overall_status": overall_status.value,
|
||||
"checks": [
|
||||
{
|
||||
"category": r.category.value,
|
||||
"status": r.status.value,
|
||||
"message": r.message,
|
||||
"value": r.value,
|
||||
"details": r.details
|
||||
}
|
||||
for r in results
|
||||
],
|
||||
"history": self.history.get_status_summary()
|
||||
}
|
||||
|
||||
async def _check_memory_health(self) -> HealthCheckResult:
|
||||
"""Check memory health"""
|
||||
try:
|
||||
memory_usage = psutil.Process().memory_info().rss / 1024 / 1024 # MB
|
||||
|
||||
if memory_usage > self.thresholds.memory_critical_mb:
|
||||
if (
|
||||
not self._last_gc_time or
|
||||
datetime.utcnow() - self._last_gc_time > timedelta(minutes=5)
|
||||
):
|
||||
import gc
|
||||
gc.collect()
|
||||
self._last_gc_time = datetime.utcnow()
|
||||
memory_usage = psutil.Process().memory_info().rss / 1024 / 1024
|
||||
|
||||
status = HealthStatus.CRITICAL
|
||||
message = f"Critical memory usage: {memory_usage:.1f}MB"
|
||||
elif memory_usage > self.thresholds.memory_warning_mb:
|
||||
status = HealthStatus.WARNING
|
||||
message = f"High memory usage: {memory_usage:.1f}MB"
|
||||
else:
|
||||
status = HealthStatus.HEALTHY
|
||||
message = f"Normal memory usage: {memory_usage:.1f}MB"
|
||||
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.MEMORY,
|
||||
status=status,
|
||||
message=message,
|
||||
value=memory_usage
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking memory health: {e}")
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.MEMORY,
|
||||
status=HealthStatus.UNKNOWN,
|
||||
message=f"Error checking memory: {str(e)}"
|
||||
)
|
||||
|
||||
def _check_performance_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
|
||||
"""Check performance health"""
|
||||
try:
|
||||
avg_time = metrics.get("avg_processing_time", 0)
|
||||
success_rate = metrics.get("success_rate", 1.0)
|
||||
|
||||
if success_rate < 0.5: # Less than 50% success
|
||||
status = HealthStatus.CRITICAL
|
||||
message = f"Critical performance: {success_rate:.1%} success rate"
|
||||
elif success_rate < 0.8: # Less than 80% success
|
||||
status = HealthStatus.WARNING
|
||||
message = f"Degraded performance: {success_rate:.1%} success rate"
|
||||
else:
|
||||
status = HealthStatus.HEALTHY
|
||||
message = f"Normal performance: {success_rate:.1%} success rate"
|
||||
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.PERFORMANCE,
|
||||
status=status,
|
||||
message=message,
|
||||
value=success_rate,
|
||||
details={"avg_processing_time": avg_time}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking performance health: {e}")
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.PERFORMANCE,
|
||||
status=HealthStatus.UNKNOWN,
|
||||
message=f"Error checking performance: {str(e)}"
|
||||
)
|
||||
|
||||
def _check_activity_health(
|
||||
self,
|
||||
last_activity_time: float,
|
||||
has_processing_items: bool
|
||||
) -> HealthCheckResult:
|
||||
"""Check activity health"""
|
||||
if not has_processing_items:
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.ACTIVITY,
|
||||
status=HealthStatus.HEALTHY,
|
||||
message="No items being processed"
|
||||
)
|
||||
|
||||
inactive_time = time.time() - last_activity_time
|
||||
|
||||
if inactive_time > self.thresholds.inactivity_critical_sec:
|
||||
status = HealthStatus.CRITICAL
|
||||
message = f"No activity for {inactive_time:.1f}s"
|
||||
elif inactive_time > self.thresholds.inactivity_warning_sec:
|
||||
status = HealthStatus.WARNING
|
||||
message = f"Limited activity for {inactive_time:.1f}s"
|
||||
else:
|
||||
status = HealthStatus.HEALTHY
|
||||
message = "Normal activity levels"
|
||||
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.ACTIVITY,
|
||||
status=status,
|
||||
message=message,
|
||||
value=inactive_time
|
||||
)
|
||||
|
||||
def _check_error_health(self, metrics: Dict[str, Any]) -> HealthCheckResult:
|
||||
"""Check error health"""
|
||||
try:
|
||||
error_rate = metrics.get("error_rate", 0.0)
|
||||
error_count = metrics.get("total_errors", 0)
|
||||
|
||||
if error_rate > self.thresholds.error_rate_critical:
|
||||
status = HealthStatus.CRITICAL
|
||||
message = f"Critical error rate: {error_rate:.1%}"
|
||||
elif error_rate > self.thresholds.error_rate_warning:
|
||||
status = HealthStatus.WARNING
|
||||
message = f"High error rate: {error_rate:.1%}"
|
||||
else:
|
||||
status = HealthStatus.HEALTHY
|
||||
message = f"Normal error rate: {error_rate:.1%}"
|
||||
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.ERRORS,
|
||||
status=status,
|
||||
message=message,
|
||||
value=error_rate,
|
||||
details={"error_count": error_count}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking error health: {e}")
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.ERRORS,
|
||||
status=HealthStatus.UNKNOWN,
|
||||
message=f"Error checking errors: {str(e)}"
|
||||
)
|
||||
|
||||
def _check_deadlocks(self, queue_info: Dict[str, Any]) -> HealthCheckResult:
|
||||
"""Check for potential deadlocks"""
|
||||
try:
|
||||
stuck_items = queue_info.get("stuck_items", [])
|
||||
if not stuck_items:
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.DEADLOCKS,
|
||||
status=HealthStatus.HEALTHY,
|
||||
message="No stuck items detected"
|
||||
)
|
||||
|
||||
longest_stuck = max(
|
||||
time.time() - item["start_time"]
|
||||
for item in stuck_items
|
||||
)
|
||||
|
||||
if longest_stuck > self.thresholds.deadlock_critical_sec:
|
||||
status = HealthStatus.CRITICAL
|
||||
message = f"Potential deadlock: {len(stuck_items)} items stuck"
|
||||
elif longest_stuck > self.thresholds.deadlock_warning_sec:
|
||||
status = HealthStatus.WARNING
|
||||
message = f"Slow processing: {len(stuck_items)} items delayed"
|
||||
else:
|
||||
status = HealthStatus.HEALTHY
|
||||
message = "Normal processing time"
|
||||
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.DEADLOCKS,
|
||||
status=status,
|
||||
message=message,
|
||||
value=longest_stuck,
|
||||
details={"stuck_items": len(stuck_items)}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking deadlocks: {e}")
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.DEADLOCKS,
|
||||
status=HealthStatus.UNKNOWN,
|
||||
message=f"Error checking deadlocks: {str(e)}"
|
||||
)
|
||||
|
||||
async def _check_system_health(self) -> HealthCheckResult:
|
||||
"""Check system health"""
|
||||
try:
|
||||
metrics = await self.system_monitor.check_system_health()
|
||||
|
||||
if not metrics:
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.SYSTEM,
|
||||
status=HealthStatus.UNKNOWN,
|
||||
message="Unable to get system metrics"
|
||||
)
|
||||
|
||||
cpu_percent = metrics["cpu_percent"]
|
||||
if cpu_percent > self.thresholds.cpu_critical_percent:
|
||||
status = HealthStatus.CRITICAL
|
||||
message = f"Critical CPU usage: {cpu_percent:.1f}%"
|
||||
elif cpu_percent > self.thresholds.cpu_warning_percent:
|
||||
status = HealthStatus.WARNING
|
||||
message = f"High CPU usage: {cpu_percent:.1f}%"
|
||||
else:
|
||||
status = HealthStatus.HEALTHY
|
||||
message = f"Normal CPU usage: {cpu_percent:.1f}%"
|
||||
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.SYSTEM,
|
||||
status=status,
|
||||
message=message,
|
||||
value=cpu_percent,
|
||||
details=metrics
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking system health: {e}")
|
||||
return HealthCheckResult(
|
||||
category=HealthCategory.SYSTEM,
|
||||
status=HealthStatus.UNKNOWN,
|
||||
message=f"Error checking system: {str(e)}"
|
||||
)
|
||||
|
||||
def _determine_overall_status(
|
||||
self,
|
||||
results: List[HealthCheckResult]
|
||||
) -> HealthStatus:
|
||||
"""Determine overall health status"""
|
||||
if any(r.status == HealthStatus.CRITICAL for r in results):
|
||||
return HealthStatus.CRITICAL
|
||||
if any(r.status == HealthStatus.WARNING for r in results):
|
||||
return HealthStatus.WARNING
|
||||
if any(r.status == HealthStatus.UNKNOWN for r in results):
|
||||
return HealthStatus.UNKNOWN
|
||||
return HealthStatus.HEALTHY
|
||||
|
||||
def format_health_report(
|
||||
self,
|
||||
results: List[HealthCheckResult]
|
||||
) -> str:
|
||||
"""Format a detailed health report"""
|
||||
lines = ["Queue Health Report:"]
|
||||
|
||||
for result in results:
|
||||
lines.append(
|
||||
f"\n{result.category.value.title()}:"
|
||||
f"\n- Status: {result.status.value}"
|
||||
f"\n- {result.message}"
|
||||
)
|
||||
if result.details:
|
||||
for key, value in result.details.items():
|
||||
lines.append(f" - {key}: {value}")
|
||||
|
||||
return "\n".join(lines)
|
||||
@@ -2,274 +2,292 @@
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import Dict, Optional, Set, Tuple, Callable, Any, List
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, Tuple, Dict, Any, List, Set
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from .models import QueueItem, QueueMetrics
|
||||
from .persistence import QueuePersistenceManager, QueueError
|
||||
from .monitoring import QueueMonitor, MonitoringError
|
||||
from .cleanup import QueueCleaner, CleanupError
|
||||
from .state_manager import QueueStateManager
|
||||
from .processor import QueueProcessor
|
||||
from .metrics_manager import QueueMetricsManager
|
||||
from .persistence import QueuePersistenceManager
|
||||
from .monitoring import QueueMonitor, MonitoringLevel
|
||||
from .cleanup import QueueCleaner
|
||||
from .models import QueueItem, QueueError, CleanupError
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
logger = logging.getLogger("QueueManager")
|
||||
|
||||
class QueueState(Enum):
|
||||
"""Queue operational states"""
|
||||
UNINITIALIZED = "uninitialized"
|
||||
INITIALIZING = "initializing"
|
||||
RUNNING = "running"
|
||||
PAUSED = "paused"
|
||||
STOPPING = "stopping"
|
||||
STOPPED = "stopped"
|
||||
ERROR = "error"
|
||||
|
||||
class QueueMode(Enum):
|
||||
"""Queue processing modes"""
|
||||
NORMAL = "normal" # Standard processing
|
||||
BATCH = "batch" # Batch processing
|
||||
PRIORITY = "priority" # Priority-based processing
|
||||
MAINTENANCE = "maintenance" # Maintenance mode
|
||||
|
||||
@dataclass
|
||||
class QueueConfig:
|
||||
"""Queue configuration settings"""
|
||||
max_retries: int = 3
|
||||
retry_delay: int = 5
|
||||
max_queue_size: int = 1000
|
||||
cleanup_interval: int = 3600 # 1 hour
|
||||
max_history_age: int = 86400 # 24 hours
|
||||
deadlock_threshold: int = 300 # 5 minutes
|
||||
check_interval: int = 60 # 1 minute
|
||||
batch_size: int = 10
|
||||
max_concurrent: int = 3
|
||||
persistence_enabled: bool = True
|
||||
monitoring_level: MonitoringLevel = MonitoringLevel.NORMAL
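QueueConfig is the single object handed to the manager below, so tuning is a matter of overriding individual fields. An illustrative construction, assuming the config-based constructor shown later in this diff (the values are arbitrary):

config = QueueConfig(
    max_queue_size=500,
    batch_size=5,
    monitoring_level=MonitoringLevel.INTENSIVE,
)
manager = EnhancedVideoQueueManager(config)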
|
||||
|
||||
@dataclass
|
||||
class QueueStats:
|
||||
"""Queue statistics"""
|
||||
start_time: datetime = field(default_factory=datetime.utcnow)
|
||||
total_processed: int = 0
|
||||
total_failed: int = 0
|
||||
uptime: timedelta = field(default_factory=lambda: timedelta())
|
||||
peak_queue_size: int = 0
|
||||
peak_memory_usage: float = 0.0
|
||||
state_changes: List[Dict[str, Any]] = field(default_factory=list)
|
||||
|
||||
class QueueCoordinator:
|
||||
"""Coordinates queue operations"""
|
||||
|
||||
def __init__(self):
|
||||
self.state = QueueState.UNINITIALIZED
|
||||
self.mode = QueueMode.NORMAL
|
||||
self._state_lock = asyncio.Lock()
|
||||
self._mode_lock = asyncio.Lock()
|
||||
self._paused = asyncio.Event()
|
||||
self._paused.set()
|
||||
|
||||
async def set_state(self, state: QueueState) -> None:
|
||||
"""Set queue state"""
|
||||
async with self._state_lock:
|
||||
self.state = state
|
||||
|
||||
async def set_mode(self, mode: QueueMode) -> None:
|
||||
"""Set queue mode"""
|
||||
async with self._mode_lock:
|
||||
self.mode = mode
|
||||
|
||||
async def pause(self) -> None:
|
||||
"""Pause queue processing"""
|
||||
self._paused.clear()
|
||||
await self.set_state(QueueState.PAUSED)
|
||||
|
||||
async def resume(self) -> None:
|
||||
"""Resume queue processing"""
|
||||
self._paused.set()
|
||||
await self.set_state(QueueState.RUNNING)
|
||||
|
||||
async def wait_if_paused(self) -> None:
|
||||
"""Wait if queue is paused"""
|
||||
await self._paused.wait()
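The pause/resume mechanism rests on an asyncio.Event that starts set: wait_if_paused() returns immediately while the queue is running and blocks only after pause() clears the event. A stripped-down illustration of that behaviour:

import asyncio

async def _pause_demo() -> None:
    coordinator = QueueCoordinator()
    await coordinator.pause()
    waiter = asyncio.create_task(coordinator.wait_if_paused())
    await asyncio.sleep(0.1)
    assert not waiter.done()   # still blocked while paused
    await coordinator.resume()
    await waiter               # released once resumed

# asyncio.run(_pause_demo())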
|
||||
|
||||
class EnhancedVideoQueueManager:
|
||||
"""Enhanced queue manager with improved memory management and performance"""
|
||||
"""Enhanced queue manager with improved organization and maintainability"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
max_retries: int = 3,
|
||||
retry_delay: int = 5,
|
||||
max_queue_size: int = 1000,
|
||||
cleanup_interval: int = 3600, # 1 hour
|
||||
max_history_age: int = 86400, # 24 hours
|
||||
persistence_path: Optional[str] = None,
|
||||
backup_interval: int = 300, # 5 minutes
|
||||
deadlock_threshold: int = 300, # 5 minutes
|
||||
check_interval: int = 60, # 1 minute
|
||||
):
|
||||
"""Initialize queue manager"""
|
||||
# Configuration
|
||||
self.max_retries = max_retries
|
||||
self.retry_delay = retry_delay
|
||||
self.max_queue_size = max_queue_size
|
||||
|
||||
# Queue storage
|
||||
self._queue: List[QueueItem] = []
|
||||
self._processing: Dict[str, QueueItem] = {}
|
||||
self._completed: Dict[str, QueueItem] = {}
|
||||
self._failed: Dict[str, QueueItem] = {}
|
||||
def __init__(self, config: Optional[QueueConfig] = None):
|
||||
"""Initialize queue manager components"""
|
||||
self.config = config or QueueConfig()
|
||||
self.coordinator = QueueCoordinator()
|
||||
self.stats = QueueStats()
|
||||
|
||||
# Tracking
|
||||
self._guild_queues: Dict[int, Set[str]] = {}
|
||||
self._channel_queues: Dict[int, Set[str]] = {}
|
||||
self._active_tasks: Set[asyncio.Task] = set()
|
||||
|
||||
# Single lock for all operations to prevent deadlocks
|
||||
self._lock = asyncio.Lock()
|
||||
|
||||
# State
|
||||
self._shutdown = False
|
||||
self._initialized = False
|
||||
self._init_event = asyncio.Event()
|
||||
self.metrics = QueueMetrics()
|
||||
|
||||
# Components
|
||||
self.persistence = QueuePersistenceManager(persistence_path) if persistence_path else None
|
||||
# Initialize managers
|
||||
self.state_manager = QueueStateManager(self.config.max_queue_size)
|
||||
self.metrics_manager = QueueMetricsManager()
|
||||
self.monitor = QueueMonitor(
|
||||
deadlock_threshold=deadlock_threshold,
|
||||
max_retries=max_retries,
|
||||
check_interval=check_interval
|
||||
deadlock_threshold=self.config.deadlock_threshold,
|
||||
max_retries=self.config.max_retries,
|
||||
check_interval=self.config.check_interval
|
||||
)
|
||||
self.cleaner = QueueCleaner(
|
||||
cleanup_interval=cleanup_interval,
|
||||
max_history_age=max_history_age
|
||||
cleanup_interval=self.config.cleanup_interval,
|
||||
max_history_age=self.config.max_history_age
|
||||
)
|
||||
|
||||
# Initialize persistence if enabled
|
||||
self.persistence = (
|
||||
QueuePersistenceManager()
|
||||
if self.config.persistence_enabled
|
||||
else None
|
||||
)
|
||||
|
||||
# Initialize processor
|
||||
self.processor = QueueProcessor(
|
||||
state_manager=self.state_manager,
|
||||
monitor=self.monitor,
|
||||
max_retries=self.config.max_retries,
|
||||
retry_delay=self.config.retry_delay,
|
||||
batch_size=self.config.batch_size,
|
||||
max_concurrent=self.config.max_concurrent
|
||||
)
|
||||
|
||||
# Background tasks
|
||||
self._maintenance_task: Optional[asyncio.Task] = None
|
||||
self._stats_task: Optional[asyncio.Task] = None
|
||||
|
||||
async def initialize(self) -> None:
|
||||
"""Initialize the queue manager components sequentially"""
|
||||
if self._initialized:
|
||||
"""Initialize the queue manager components"""
|
||||
if self.coordinator.state != QueueState.UNINITIALIZED:
|
||||
logger.info("Queue manager already initialized")
|
||||
return
|
||||
|
||||
try:
|
||||
await self.coordinator.set_state(QueueState.INITIALIZING)
|
||||
logger.info("Starting queue manager initialization...")
|
||||
|
||||
async with self._lock:
|
||||
# Load persisted state first if available
|
||||
if self.persistence:
|
||||
await self._load_persisted_state()
|
||||
|
||||
# Start monitoring task
|
||||
monitor_task = asyncio.create_task(
|
||||
self.monitor.start_monitoring(
|
||||
self._queue,
|
||||
self._processing,
|
||||
self.metrics,
|
||||
self._lock
|
||||
)
|
||||
)
|
||||
self._active_tasks.add(monitor_task)
|
||||
logger.info("Queue monitoring started")
|
||||
|
||||
# Start cleanup task
|
||||
cleanup_task = asyncio.create_task(
|
||||
self.cleaner.start_cleanup(
|
||||
self._queue,
|
||||
self._completed,
|
||||
self._failed,
|
||||
self._guild_queues,
|
||||
self._channel_queues,
|
||||
self._processing,
|
||||
self.metrics,
|
||||
self._lock
|
||||
)
|
||||
)
|
||||
self._active_tasks.add(cleanup_task)
|
||||
logger.info("Queue cleanup started")
|
||||
# Load persisted state if available
|
||||
if self.persistence:
|
||||
await self._load_persisted_state()
|
||||
|
||||
# Start monitoring with configured level
|
||||
self.monitor.strategy.level = self.config.monitoring_level
|
||||
await self.monitor.start(
|
||||
self.state_manager,
|
||||
self.metrics_manager
|
||||
)
|
||||
|
||||
# Start cleanup task
|
||||
await self.cleaner.start(
|
||||
state_manager=self.state_manager,
|
||||
metrics_manager=self.metrics_manager
|
||||
)
|
||||
|
||||
# Signal initialization complete
|
||||
self._initialized = True
|
||||
self._init_event.set()
|
||||
logger.info("Queue manager initialization completed")
|
||||
# Start background tasks
|
||||
self._start_background_tasks()
|
||||
|
||||
await self.coordinator.set_state(QueueState.RUNNING)
|
||||
logger.info("Queue manager initialization completed")
|
||||
|
||||
except Exception as e:
|
||||
await self.coordinator.set_state(QueueState.ERROR)
|
||||
logger.error(f"Failed to initialize queue manager: {e}")
|
||||
self._shutdown = True
|
||||
raise
|
||||
|
||||
async def _load_persisted_state(self) -> None:
|
||||
"""Load persisted queue state"""
|
||||
try:
|
||||
state = self.persistence.load_queue_state()
|
||||
state = await self.persistence.load_queue_state()
|
||||
if state:
|
||||
self._queue = state["queue"]
|
||||
self._completed = state["completed"]
|
||||
self._failed = state["failed"]
|
||||
self._processing = state["processing"]
|
||||
|
||||
# Update metrics
|
||||
metrics_data = state.get("metrics", {})
|
||||
self.metrics.total_processed = metrics_data.get("total_processed", 0)
|
||||
self.metrics.total_failed = metrics_data.get("total_failed", 0)
|
||||
self.metrics.avg_processing_time = metrics_data.get("avg_processing_time", 0.0)
|
||||
self.metrics.success_rate = metrics_data.get("success_rate", 0.0)
|
||||
self.metrics.errors_by_type = metrics_data.get("errors_by_type", {})
|
||||
self.metrics.compression_failures = metrics_data.get("compression_failures", 0)
|
||||
self.metrics.hardware_accel_failures = metrics_data.get("hardware_accel_failures", 0)
|
||||
|
||||
await self.state_manager.restore_state(state)
|
||||
self.metrics_manager.restore_metrics(state.get("metrics", {}))
|
||||
logger.info("Loaded persisted queue state")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load persisted state: {e}")
|
||||
|
||||
async def process_queue(
|
||||
self,
|
||||
processor: Callable[[QueueItem], Tuple[bool, Optional[str]]]
|
||||
) -> None:
|
||||
"""Process items in the queue"""
|
||||
# Wait for initialization to complete
|
||||
await self._init_event.wait()
|
||||
|
||||
logger.info("Queue processor started")
|
||||
last_persist_time = time.time()
|
||||
persist_interval = 60 # Persist state every 60 seconds
|
||||
|
||||
while not self._shutdown:
|
||||
try:
|
||||
items = []
|
||||
async with self._lock:
|
||||
# Get up to 5 items from queue
|
||||
while len(items) < 5 and self._queue:
|
||||
item = self._queue.pop(0)
|
||||
items.append(item)
|
||||
self._processing[item.url] = item
|
||||
# Update activity timestamp
|
||||
self.monitor.update_activity()
|
||||
def _start_background_tasks(self) -> None:
|
||||
"""Start background maintenance tasks"""
|
||||
self._maintenance_task = asyncio.create_task(
|
||||
self._maintenance_loop()
|
||||
)
|
||||
self._stats_task = asyncio.create_task(
|
||||
self._stats_loop()
|
||||
)
|
||||
|
||||
if not items:
|
||||
await asyncio.sleep(0.1)
|
||||
async def _maintenance_loop(self) -> None:
|
||||
"""Background maintenance loop"""
|
||||
while self.coordinator.state not in (QueueState.STOPPED, QueueState.ERROR):
|
||||
try:
|
||||
await asyncio.sleep(300) # Every 5 minutes
|
||||
if self.coordinator.mode == QueueMode.MAINTENANCE:
|
||||
continue
|
||||
|
||||
# Process items concurrently
|
||||
tasks = []
|
||||
for item in items:
|
||||
task = asyncio.create_task(self._process_item(processor, item))
|
||||
tasks.append(task)
|
||||
|
||||
try:
|
||||
await asyncio.gather(*tasks, return_exceptions=True)
|
||||
except asyncio.CancelledError:
|
||||
logger.info("Queue processing cancelled")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Error in queue processing: {e}")
|
||||
|
||||
# Persist state if interval has passed
|
||||
current_time = time.time()
|
||||
if self.persistence and (current_time - last_persist_time) >= persist_interval:
|
||||
await self._persist_state()
|
||||
last_persist_time = current_time
|
||||
# Perform maintenance tasks
|
||||
await self._perform_maintenance()
|
||||
|
||||
except asyncio.CancelledError:
|
||||
logger.info("Queue processing cancelled")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Critical error in queue processor: {e}")
|
||||
await asyncio.sleep(0.1)
|
||||
logger.error(f"Error in maintenance loop: {e}")
|
||||
|
||||
await asyncio.sleep(0)
|
||||
async def _stats_loop(self) -> None:
|
||||
"""Background statistics loop"""
|
||||
while self.coordinator.state not in (QueueState.STOPPED, QueueState.ERROR):
|
||||
try:
|
||||
await asyncio.sleep(60) # Every minute
|
||||
await self._update_stats()
|
||||
|
||||
async def _process_item(
|
||||
self,
|
||||
processor: Callable[[QueueItem], Tuple[bool, Optional[str]]],
|
||||
item: QueueItem
|
||||
) -> None:
|
||||
"""Process a single queue item"""
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Error in stats loop: {e}")
|
||||
|
||||
async def _perform_maintenance(self) -> None:
|
||||
"""Perform maintenance tasks"""
|
||||
try:
|
||||
logger.info(f"Processing queue item: {item.url}")
|
||||
item.start_processing()
|
||||
self.metrics.last_activity_time = time.time()
|
||||
self.monitor.update_activity()
|
||||
|
||||
success, error = await processor(item)
|
||||
|
||||
async with self._lock:
|
||||
item.finish_processing(success, error)
|
||||
self._processing.pop(item.url, None)
|
||||
|
||||
if success:
|
||||
self._completed[item.url] = item
|
||||
logger.info(f"Successfully processed: {item.url}")
|
||||
else:
|
||||
if item.retry_count < self.max_retries:
|
||||
item.retry_count += 1
|
||||
item.status = "pending"
|
||||
item.last_retry = datetime.utcnow()
|
||||
item.priority = max(0, item.priority - 1)
|
||||
self._queue.append(item)
|
||||
logger.warning(f"Retrying: {item.url} (attempt {item.retry_count})")
|
||||
else:
|
||||
self._failed[item.url] = item
|
||||
logger.error(f"Failed after {self.max_retries} attempts: {item.url}")
|
||||
|
||||
self.metrics.update(
|
||||
processing_time=item.processing_time,
|
||||
success=success,
|
||||
error=error
|
||||
)
|
||||
# Switch to maintenance mode
|
||||
previous_mode = self.coordinator.mode
|
||||
await self.coordinator.set_mode(QueueMode.MAINTENANCE)
|
||||
|
||||
# Perform maintenance tasks
|
||||
await self._cleanup_old_data()
|
||||
await self._optimize_queue()
|
||||
await self._persist_state()
|
||||
|
||||
# Restore previous mode
|
||||
await self.coordinator.set_mode(previous_mode)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing {item.url}: {e}")
|
||||
async with self._lock:
|
||||
item.finish_processing(False, str(e))
|
||||
self._processing.pop(item.url, None)
|
||||
self._failed[item.url] = item
|
||||
self.metrics.update(
|
||||
processing_time=item.processing_time,
|
||||
success=False,
|
||||
error=str(e)
|
||||
)
|
||||
logger.error(f"Error during maintenance: {e}")
|
||||
|
||||
async def _persist_state(self) -> None:
|
||||
"""Persist current state to storage"""
|
||||
if not self.persistence:
|
||||
return
|
||||
|
||||
async def _cleanup_old_data(self) -> None:
|
||||
"""Clean up old data"""
|
||||
try:
|
||||
async with self._lock:
|
||||
await self.persistence.persist_queue_state(
|
||||
self._queue,
|
||||
self._processing,
|
||||
self._completed,
|
||||
self._failed,
|
||||
self.metrics
|
||||
)
|
||||
await self.cleaner.cleanup_old_data(
|
||||
self.state_manager,
|
||||
self.metrics_manager
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to persist state: {e}")
|
||||
logger.error(f"Error cleaning up old data: {e}")
|
||||
|
||||
async def _optimize_queue(self) -> None:
|
||||
"""Optimize queue performance"""
|
||||
try:
|
||||
# Reorder queue based on priorities
|
||||
await self.state_manager.optimize_queue()
|
||||
|
||||
# Update monitoring level based on queue size
|
||||
queue_size = len(await self.state_manager.get_all_items())
|
||||
if queue_size > self.config.max_queue_size * 0.8:
|
||||
self.monitor.strategy.level = MonitoringLevel.INTENSIVE
|
||||
elif queue_size < self.config.max_queue_size * 0.2:
|
||||
self.monitor.strategy.level = self.config.monitoring_level
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error optimizing queue: {e}")
|
||||
|
||||
async def _update_stats(self) -> None:
|
||||
"""Update queue statistics"""
|
||||
try:
|
||||
self.stats.uptime = datetime.utcnow() - self.stats.start_time
|
||||
|
||||
# Update peak values
|
||||
queue_size = len(await self.state_manager.get_all_items())
|
||||
self.stats.peak_queue_size = max(
|
||||
self.stats.peak_queue_size,
|
||||
queue_size
|
||||
)
|
||||
|
||||
memory_usage = self.metrics_manager.peak_memory_usage
|
||||
self.stats.peak_memory_usage = max(
|
||||
self.stats.peak_memory_usage,
|
||||
memory_usage
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating stats: {e}")
|
||||
|
||||
async def add_to_queue(
|
||||
self,
|
||||
@@ -281,176 +299,169 @@ class EnhancedVideoQueueManager:
|
||||
priority: int = 0,
|
||||
) -> bool:
|
||||
"""Add a video to the processing queue"""
|
||||
if self._shutdown:
|
||||
raise QueueError("Queue manager is shutting down")
|
||||
if self.coordinator.state in (QueueState.STOPPED, QueueState.ERROR):
|
||||
raise QueueError("Queue manager is not running")
|
||||
|
||||
# Wait for initialization
|
||||
await self._init_event.wait()
|
||||
# Wait if queue is paused
|
||||
await self.coordinator.wait_if_paused()
|
||||
|
||||
try:
|
||||
async with self._lock:
|
||||
if len(self._queue) >= self.max_queue_size:
|
||||
raise QueueError("Queue is full")
|
||||
item = QueueItem(
|
||||
url=url,
|
||||
message_id=message_id,
|
||||
channel_id=channel_id,
|
||||
guild_id=guild_id,
|
||||
author_id=author_id,
|
||||
added_at=datetime.utcnow(),
|
||||
priority=priority,
|
||||
)
|
||||
|
||||
item = QueueItem(
|
||||
url=url,
|
||||
message_id=message_id,
|
||||
channel_id=channel_id,
|
||||
guild_id=guild_id,
|
||||
author_id=author_id,
|
||||
added_at=datetime.utcnow(),
|
||||
priority=priority,
|
||||
)
|
||||
success = await self.state_manager.add_item(item)
|
||||
if success and self.persistence:
|
||||
await self._persist_state()
|
||||
|
||||
if guild_id not in self._guild_queues:
|
||||
self._guild_queues[guild_id] = set()
|
||||
self._guild_queues[guild_id].add(url)
|
||||
|
||||
if channel_id not in self._channel_queues:
|
||||
self._channel_queues[channel_id] = set()
|
||||
self._channel_queues[channel_id].add(url)
|
||||
|
||||
self._queue.append(item)
|
||||
self._queue.sort(key=lambda x: (-x.priority, x.added_at))
|
||||
|
||||
self.metrics.last_activity_time = time.time()
|
||||
self.monitor.update_activity()
|
||||
|
||||
if self.persistence:
|
||||
await self._persist_state()
|
||||
|
||||
logger.info(f"Added to queue: {url} (priority: {priority})")
|
||||
return True
|
||||
return success
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error adding to queue: {e}")
|
||||
raise QueueError(f"Failed to add to queue: {str(e)}")
|
||||
|
||||
def get_queue_status(self, guild_id: int) -> Dict[str, Any]:
|
||||
"""Get current queue status for a guild"""
|
||||
try:
|
||||
pending = len([item for item in self._queue if item.guild_id == guild_id])
|
||||
processing = len([item for item in self._processing.values() if item.guild_id == guild_id])
|
||||
completed = len([item for item in self._completed.values() if item.guild_id == guild_id])
|
||||
failed = len([item for item in self._failed.values() if item.guild_id == guild_id])
|
||||
|
||||
status = self.state_manager.get_guild_status(guild_id)
|
||||
metrics = self.metrics_manager.get_metrics()
|
||||
monitor_stats = self.monitor.get_monitoring_stats()
|
||||
|
||||
return {
|
||||
"pending": pending,
|
||||
"processing": processing,
|
||||
"completed": completed,
|
||||
"failed": failed,
|
||||
"metrics": {
|
||||
"total_processed": self.metrics.total_processed,
|
||||
"total_failed": self.metrics.total_failed,
|
||||
"success_rate": self.metrics.success_rate,
|
||||
"avg_processing_time": self.metrics.avg_processing_time,
|
||||
"peak_memory_usage": self.metrics.peak_memory_usage,
|
||||
"last_cleanup": self.metrics.last_cleanup.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"errors_by_type": self.metrics.errors_by_type,
|
||||
"compression_failures": self.metrics.compression_failures,
|
||||
"hardware_accel_failures": self.metrics.hardware_accel_failures,
|
||||
"last_activity": time.time() - self.metrics.last_activity_time,
|
||||
},
|
||||
**status,
|
||||
"metrics": metrics,
|
||||
"monitoring": monitor_stats,
|
||||
"state": self.coordinator.state.value,
|
||||
"mode": self.coordinator.mode.value,
|
||||
"stats": {
|
||||
"uptime": self.stats.uptime.total_seconds(),
|
||||
"peak_queue_size": self.stats.peak_queue_size,
|
||||
"peak_memory_usage": self.stats.peak_memory_usage,
|
||||
"total_processed": self.stats.total_processed,
|
||||
"total_failed": self.stats.total_failed
|
||||
}
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting queue status: {e}")
|
||||
return {
|
||||
"pending": 0,
|
||||
"processing": 0,
|
||||
"completed": 0,
|
||||
"failed": 0,
|
||||
"metrics": {
|
||||
"total_processed": 0,
|
||||
"total_failed": 0,
|
||||
"success_rate": 0.0,
|
||||
"avg_processing_time": 0.0,
|
||||
"peak_memory_usage": 0.0,
|
||||
"last_cleanup": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"errors_by_type": {},
|
||||
"compression_failures": 0,
|
||||
"hardware_accel_failures": 0,
|
||||
"last_activity": 0,
|
||||
},
|
||||
}
|
||||
return self._get_default_status()
|
||||
|
||||
async def pause(self) -> None:
|
||||
"""Pause queue processing"""
|
||||
await self.coordinator.pause()
|
||||
logger.info("Queue processing paused")
|
||||
|
||||
async def resume(self) -> None:
|
||||
"""Resume queue processing"""
|
||||
await self.coordinator.resume()
|
||||
logger.info("Queue processing resumed")
|
||||
|
||||
async def cleanup(self) -> None:
|
||||
"""Clean up resources and stop queue processing"""
|
||||
try:
|
||||
self._shutdown = True
|
||||
await self.coordinator.set_state(QueueState.STOPPING)
|
||||
logger.info("Starting queue manager cleanup...")
|
||||
|
||||
# Stop monitoring and cleanup tasks
|
||||
self.monitor.stop_monitoring()
|
||||
self.cleaner.stop_cleanup()
|
||||
# Cancel background tasks
|
||||
if self._maintenance_task:
|
||||
self._maintenance_task.cancel()
|
||||
if self._stats_task:
|
||||
self._stats_task.cancel()
|
||||
|
||||
# Stop processor
|
||||
await self.processor.stop_processing()
|
||||
|
||||
# Stop monitoring and cleanup
|
||||
await self.monitor.stop()
|
||||
await self.cleaner.stop()
|
||||
|
||||
# Cancel all active tasks
|
||||
for task in self._active_tasks:
|
||||
if not task.done():
|
||||
task.cancel()
|
||||
# Final state persistence
|
||||
if self.persistence:
|
||||
await self._persist_state()
|
||||
|
||||
await asyncio.gather(*self._active_tasks, return_exceptions=True)
|
||||
# Clear state
|
||||
await self.state_manager.clear_state()
|
||||
|
||||
async with self._lock:
|
||||
# Move processing items back to queue
|
||||
for url, item in self._processing.items():
|
||||
if item.retry_count < self.max_retries:
|
||||
item.status = "pending"
|
||||
item.retry_count += 1
|
||||
self._queue.append(item)
|
||||
else:
|
||||
self._failed[url] = item
|
||||
|
||||
self._processing.clear()
|
||||
|
||||
# Final state persistence
|
||||
if self.persistence:
|
||||
await self._persist_state()
|
||||
|
||||
# Clear collections
|
||||
self._queue.clear()
|
||||
self._completed.clear()
|
||||
self._failed.clear()
|
||||
self._guild_queues.clear()
|
||||
self._channel_queues.clear()
|
||||
self._active_tasks.clear()
|
||||
|
||||
# Reset initialization state
|
||||
self._initialized = False
|
||||
self._init_event.clear()
|
||||
await self.coordinator.set_state(QueueState.STOPPED)
|
||||
logger.info("Queue manager cleanup completed")
|
||||
|
||||
except Exception as e:
|
||||
await self.coordinator.set_state(QueueState.ERROR)
|
||||
logger.error(f"Error during cleanup: {e}")
|
||||
raise CleanupError(f"Failed to clean up queue manager: {str(e)}")
|
||||
|
||||
async def force_stop(self) -> None:
|
||||
"""Force stop all queue operations immediately"""
|
||||
self._shutdown = True
|
||||
await self.coordinator.set_state(QueueState.STOPPING)
|
||||
logger.info("Force stopping queue manager...")
|
||||
|
||||
# Stop monitoring and cleanup
|
||||
self.monitor.stop_monitoring()
|
||||
self.cleaner.stop_cleanup()
|
||||
# Cancel background tasks
|
||||
if self._maintenance_task:
|
||||
self._maintenance_task.cancel()
|
||||
if self._stats_task:
|
||||
self._stats_task.cancel()
|
||||
|
||||
# Cancel all active tasks
|
||||
for task in self._active_tasks:
|
||||
if not task.done():
|
||||
task.cancel()
|
||||
|
||||
# Move processing items back to queue
|
||||
for url, item in self._processing.items():
|
||||
if item.retry_count < self.max_retries:
|
||||
item.status = "pending"
|
||||
item.retry_count += 1
|
||||
self._queue.append(item)
|
||||
else:
|
||||
self._failed[url] = item
|
||||
|
||||
self._processing.clear()
|
||||
self._active_tasks.clear()
|
||||
# Force stop all components
|
||||
await self.processor.stop_processing()
|
||||
await self.monitor.stop()
|
||||
await self.cleaner.stop()
|
||||
|
||||
# Reset initialization state
|
||||
self._initialized = False
|
||||
self._init_event.clear()
|
||||
# Clear state
|
||||
await self.state_manager.clear_state()
|
||||
|
||||
await self.coordinator.set_state(QueueState.STOPPED)
|
||||
logger.info("Queue manager force stopped")
|
||||
|
||||
async def _persist_state(self) -> None:
|
||||
"""Persist current state to storage"""
|
||||
if not self.persistence:
|
||||
return
|
||||
|
||||
try:
|
||||
state = await self.state_manager.get_state_for_persistence()
|
||||
state["metrics"] = self.metrics_manager.get_metrics()
|
||||
state["stats"] = {
|
||||
"uptime": self.stats.uptime.total_seconds(),
|
||||
"peak_queue_size": self.stats.peak_queue_size,
|
||||
"peak_memory_usage": self.stats.peak_memory_usage,
|
||||
"total_processed": self.stats.total_processed,
|
||||
"total_failed": self.stats.total_failed
|
||||
}
|
||||
await self.persistence.persist_queue_state(state)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to persist state: {e}")
|
||||
|
||||
def _get_default_status(self) -> Dict[str, Any]:
|
||||
"""Get default status when error occurs"""
|
||||
return {
|
||||
"pending": 0,
|
||||
"processing": 0,
|
||||
"completed": 0,
|
||||
"failed": 0,
|
||||
"metrics": {
|
||||
"total_processed": 0,
|
||||
"total_failed": 0,
|
||||
"success_rate": 0.0,
|
||||
"avg_processing_time": 0.0,
|
||||
"peak_memory_usage": 0.0,
|
||||
"last_cleanup": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"errors_by_type": {},
|
||||
"compression_failures": 0,
|
||||
"hardware_accel_failures": 0,
|
||||
"last_activity": 0,
|
||||
},
|
||||
"state": QueueState.ERROR.value,
|
||||
"mode": QueueMode.NORMAL.value,
|
||||
"stats": {
|
||||
"uptime": 0,
|
||||
"peak_queue_size": 0,
|
||||
"peak_memory_usage": 0,
|
||||
"total_processed": 0,
|
||||
"total_failed": 0
|
||||
}
|
||||
}
|
||||
|
||||
366
videoarchiver/queue/metrics_manager.py
Normal file
@@ -0,0 +1,366 @@
|
||||
"""Module for managing queue metrics"""
|
||||
|
||||
import time
|
||||
import logging
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, Optional, List, Any, Set
|
||||
from datetime import datetime, timedelta
|
||||
import json
|
||||
|
||||
logger = logging.getLogger("QueueMetricsManager")
|
||||
|
||||
class MetricCategory(Enum):
|
||||
"""Categories of metrics"""
|
||||
PROCESSING = "processing"
|
||||
PERFORMANCE = "performance"
|
||||
ERRORS = "errors"
|
||||
HARDWARE = "hardware"
|
||||
MEMORY = "memory"
|
||||
ACTIVITY = "activity"
|
||||
|
||||
class ErrorCategory(Enum):
|
||||
"""Categories of errors"""
|
||||
NETWORK = "network"
|
||||
TIMEOUT = "timeout"
|
||||
PERMISSION = "permission"
|
||||
MEMORY = "memory"
|
||||
HARDWARE = "hardware"
|
||||
COMPRESSION = "compression"
|
||||
STORAGE = "storage"
|
||||
OTHER = "other"
|
||||
|
||||
@dataclass
|
||||
class ProcessingMetrics:
|
||||
"""Processing-related metrics"""
|
||||
total_processed: int = 0
|
||||
total_failed: int = 0
|
||||
success_rate: float = 0.0
|
||||
avg_processing_time: float = 0.0
|
||||
_total_processing_time: float = 0.0
|
||||
_processing_count: int = 0
|
||||
|
||||
def update(self, processing_time: float, success: bool) -> None:
|
||||
"""Update processing metrics"""
|
||||
self.total_processed += 1
|
||||
if not success:
|
||||
self.total_failed += 1
|
||||
|
||||
self._total_processing_time += processing_time
|
||||
self._processing_count += 1
|
||||
|
||||
self.success_rate = (
|
||||
(self.total_processed - self.total_failed)
|
||||
/ self.total_processed
|
||||
if self.total_processed > 0
|
||||
else 0.0
|
||||
)
|
||||
self.avg_processing_time = (
|
||||
self._total_processing_time / self._processing_count
|
||||
if self._processing_count > 0
|
||||
else 0.0
|
||||
)
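The averages are maintained incrementally, so one update() call per finished item keeps success_rate and avg_processing_time consistent without storing per-item history. A quick illustrative check:

pm = ProcessingMetrics()
pm.update(processing_time=2.0, success=True)
pm.update(processing_time=4.0, success=False)
assert pm.total_processed == 2 and pm.total_failed == 1
assert pm.success_rate == 0.5
assert pm.avg_processing_time == 3.0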
|
||||
|
||||
@dataclass
|
||||
class ErrorMetrics:
|
||||
"""Error-related metrics"""
|
||||
errors_by_type: Dict[str, int] = field(default_factory=dict)
|
||||
errors_by_category: Dict[ErrorCategory, int] = field(default_factory=dict)
|
||||
recent_errors: List[Dict[str, Any]] = field(default_factory=list)
|
||||
error_patterns: Dict[str, int] = field(default_factory=dict)
|
||||
max_recent_errors: int = 100
|
||||
|
||||
def record_error(self, error: str, category: Optional[ErrorCategory] = None) -> None:
|
||||
"""Record an error occurrence"""
|
||||
# Track by exact error
|
||||
self.errors_by_type[error] = self.errors_by_type.get(error, 0) + 1
|
||||
|
||||
# Track by category
|
||||
if category is None:
|
||||
category = self._categorize_error(error)
|
||||
self.errors_by_category[category] = self.errors_by_category.get(category, 0) + 1
|
||||
|
||||
# Track recent errors
|
||||
self.recent_errors.append({
|
||||
"error": error,
|
||||
"category": category.value,
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
})
|
||||
if len(self.recent_errors) > self.max_recent_errors:
|
||||
self.recent_errors.pop(0)
|
||||
|
||||
# Update error patterns
|
||||
pattern = self._extract_error_pattern(error)
|
||||
self.error_patterns[pattern] = self.error_patterns.get(pattern, 0) + 1
|
||||
|
||||
def _categorize_error(self, error: str) -> ErrorCategory:
|
||||
"""Categorize an error message"""
|
||||
error_lower = error.lower()
|
||||
|
||||
if any(word in error_lower for word in ["network", "connection", "dns"]):
|
||||
return ErrorCategory.NETWORK
|
||||
elif "timeout" in error_lower:
|
||||
return ErrorCategory.TIMEOUT
|
||||
elif any(word in error_lower for word in ["permission", "access", "denied"]):
|
||||
return ErrorCategory.PERMISSION
|
||||
elif "memory" in error_lower:
|
||||
return ErrorCategory.MEMORY
|
||||
elif "hardware" in error_lower:
|
||||
return ErrorCategory.HARDWARE
|
||||
elif "compression" in error_lower:
|
||||
return ErrorCategory.COMPRESSION
|
||||
elif any(word in error_lower for word in ["disk", "storage", "space"]):
|
||||
return ErrorCategory.STORAGE
|
||||
return ErrorCategory.OTHER
|
||||
|
||||
def _extract_error_pattern(self, error: str) -> str:
|
||||
"""Extract general pattern from error message"""
|
||||
# This could be enhanced with regex or more sophisticated pattern matching
|
||||
words = error.split()
|
||||
if len(words) > 5:
|
||||
return " ".join(words[:5]) + "..."
|
||||
return error
|
||||
|
||||
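# Possible regex-based refinement of _extract_error_pattern, as hinted by the
# comment above. A sketch only, not part of the module: it masks numbers and
# quoted values so errors that differ only in IDs or paths share one pattern.
#
#   import re
#
#   def extract_error_pattern(error: str) -> str:
#       pattern = re.sub(r"\d+", "<n>", error)               # mask numbers
#       pattern = re.sub(r"'[^']*'", "'<value>'", pattern)   # mask quoted values
#       words = pattern.split()
#       return " ".join(words[:5]) + ("..." if len(words) > 5 else "")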
@dataclass
|
||||
class PerformanceMetrics:
|
||||
"""Performance-related metrics"""
|
||||
peak_memory_usage: float = 0.0
|
||||
compression_failures: int = 0
|
||||
hardware_accel_failures: int = 0
|
||||
peak_queue_size: int = 0
|
||||
peak_processing_time: float = 0.0
|
||||
avg_queue_wait_time: float = 0.0
|
||||
_total_wait_time: float = 0.0
|
||||
_wait_count: int = 0
|
||||
|
||||
def update_memory(self, memory_usage: float) -> None:
|
||||
"""Update memory usage metrics"""
|
||||
self.peak_memory_usage = max(self.peak_memory_usage, memory_usage)
|
||||
|
||||
def record_wait_time(self, wait_time: float) -> None:
|
||||
"""Record queue wait time"""
|
||||
self._total_wait_time += wait_time
|
||||
self._wait_count += 1
|
||||
self.avg_queue_wait_time = (
|
||||
self._total_wait_time / self._wait_count
|
||||
if self._wait_count > 0
|
||||
else 0.0
|
||||
)
|
||||
|
||||
class MetricAggregator:
|
||||
"""Aggregates metrics over time periods"""
|
||||
|
||||
def __init__(self, max_history: int = 1000):
|
||||
self.max_history = max_history
|
||||
self.hourly_metrics: List[Dict[str, Any]] = []
|
||||
self.daily_metrics: List[Dict[str, Any]] = []
|
||||
self.last_aggregation = datetime.utcnow()
|
||||
|
||||
    def aggregate_metrics(self, current_metrics: Dict[str, Any]) -> None:
        """Aggregate current metrics"""
        now = datetime.utcnow()

        # Hourly aggregation
        if now - self.last_aggregation >= timedelta(hours=1):
            self.hourly_metrics.append({
                "timestamp": now.isoformat(),
                "metrics": current_metrics
            })
            if len(self.hourly_metrics) > self.max_history:
                self.hourly_metrics.pop(0)

            # Daily aggregation (compare against the previous aggregation date
            # before the timestamp is advanced)
            if now.date() > self.last_aggregation.date():
                daily_avg = self._calculate_daily_average(
                    self.hourly_metrics,
                    self.last_aggregation.date()
                )
                self.daily_metrics.append(daily_avg)
                if len(self.daily_metrics) > 30:  # Keep last 30 days
                    self.daily_metrics.pop(0)

            # Advance the timestamp only when an hourly bucket is recorded so the
            # hourly window can actually elapse between frequent calls
            self.last_aggregation = now
|
||||
|
||||
def _calculate_daily_average(
|
||||
self,
|
||||
metrics: List[Dict[str, Any]],
|
||||
date: datetime.date
|
||||
) -> Dict[str, Any]:
|
||||
"""Calculate average metrics for a day"""
|
||||
day_metrics = [
|
||||
m for m in metrics
|
||||
if datetime.fromisoformat(m["timestamp"]).date() == date
|
||||
]
|
||||
|
||||
if not day_metrics:
|
||||
return {
|
||||
"date": date.isoformat(),
|
||||
"metrics": {}
|
||||
}
|
||||
|
||||
# Calculate averages for numeric values
|
||||
avg_metrics = {}
|
||||
for key in day_metrics[0]["metrics"].keys():
|
||||
if isinstance(day_metrics[0]["metrics"][key], (int, float)):
|
||||
avg_metrics[key] = sum(
|
||||
m["metrics"][key] for m in day_metrics
|
||||
) / len(day_metrics)
|
||||
else:
|
||||
avg_metrics[key] = day_metrics[-1]["metrics"][key]
|
||||
|
||||
return {
|
||||
"date": date.isoformat(),
|
||||
"metrics": avg_metrics
|
||||
}
|
||||
|
||||
class QueueMetricsManager:
|
||||
"""Manages metrics collection and reporting for the queue system"""
|
||||
|
||||
def __init__(self):
|
||||
self.processing = ProcessingMetrics()
|
||||
self.errors = ErrorMetrics()
|
||||
self.performance = PerformanceMetrics()
|
||||
self.aggregator = MetricAggregator()
|
||||
self.last_activity = time.time()
|
||||
self.last_cleanup = datetime.utcnow()
|
||||
|
||||
def update(
|
||||
self,
|
||||
processing_time: float,
|
||||
success: bool,
|
||||
error: Optional[str] = None
|
||||
) -> None:
|
||||
"""Update metrics with new processing information"""
|
||||
try:
|
||||
# Update processing metrics
|
||||
self.processing.update(processing_time, success)
|
||||
|
||||
# Update error tracking
|
||||
if error:
|
||||
self.errors.record_error(error)
|
||||
|
||||
# Track specific failures
|
||||
if "hardware acceleration" in error.lower():
|
||||
self.performance.hardware_accel_failures += 1
|
||||
elif "compression" in error.lower():
|
||||
self.performance.compression_failures += 1
|
||||
|
||||
# Update activity timestamp
|
||||
self.last_activity = time.time()
|
||||
|
||||
# Aggregate metrics
|
||||
self.aggregator.aggregate_metrics(self.get_metrics())
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating metrics: {e}")
|
||||
|
||||
def get_metrics(self) -> Dict[str, Any]:
|
||||
"""Get current metrics"""
|
||||
return {
|
||||
MetricCategory.PROCESSING.value: {
|
||||
"total_processed": self.processing.total_processed,
|
||||
"total_failed": self.processing.total_failed,
|
||||
"success_rate": self.processing.success_rate,
|
||||
"avg_processing_time": self.processing.avg_processing_time
|
||||
},
|
||||
MetricCategory.ERRORS.value: {
|
||||
"errors_by_type": self.errors.errors_by_type,
|
||||
"errors_by_category": {
|
||||
cat.value: count
|
||||
for cat, count in self.errors.errors_by_category.items()
|
||||
},
|
||||
"error_patterns": self.errors.error_patterns,
|
||||
"recent_errors": self.errors.recent_errors
|
||||
},
|
||||
MetricCategory.PERFORMANCE.value: {
|
||||
"peak_memory_usage": self.performance.peak_memory_usage,
|
||||
"compression_failures": self.performance.compression_failures,
|
||||
"hardware_accel_failures": self.performance.hardware_accel_failures,
|
||||
"peak_queue_size": self.performance.peak_queue_size,
|
||||
"avg_queue_wait_time": self.performance.avg_queue_wait_time
|
||||
},
|
||||
MetricCategory.ACTIVITY.value: {
|
||||
"last_activity": time.time() - self.last_activity,
|
||||
"last_cleanup": self.last_cleanup.isoformat()
|
||||
},
|
||||
"history": {
|
||||
"hourly": self.aggregator.hourly_metrics,
|
||||
"daily": self.aggregator.daily_metrics
|
||||
}
|
||||
}
|
||||
|
||||
def update_memory_usage(self, memory_usage: float) -> None:
|
||||
"""Update peak memory usage"""
|
||||
self.performance.update_memory(memory_usage)
|
||||
|
||||
def update_cleanup_time(self) -> None:
|
||||
"""Update last cleanup timestamp"""
|
||||
self.last_cleanup = datetime.utcnow()
|
||||
|
||||
def reset_metrics(self) -> None:
|
||||
"""Reset all metrics to initial state"""
|
||||
self.processing = ProcessingMetrics()
|
||||
self.errors = ErrorMetrics()
|
||||
self.performance = PerformanceMetrics()
|
||||
self.last_activity = time.time()
|
||||
self.last_cleanup = datetime.utcnow()
|
||||
|
||||
def save_metrics(self, file_path: str) -> None:
|
||||
"""Save metrics to file"""
|
||||
try:
|
||||
metrics = self.get_metrics()
|
||||
with open(file_path, 'w') as f:
|
||||
json.dump(metrics, f, indent=2)
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving metrics: {e}")
|
||||
|
||||
def load_metrics(self, file_path: str) -> None:
|
||||
"""Load metrics from file"""
|
||||
try:
|
||||
with open(file_path, 'r') as f:
|
||||
metrics = json.load(f)
|
||||
self.restore_metrics(metrics)
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading metrics: {e}")
|
||||
|
||||
def restore_metrics(self, metrics_data: Dict[str, Any]) -> None:
|
||||
"""Restore metrics from saved data"""
|
||||
try:
|
||||
# Restore processing metrics
|
||||
proc_data = metrics_data.get(MetricCategory.PROCESSING.value, {})
|
||||
self.processing = ProcessingMetrics(
|
||||
total_processed=proc_data.get("total_processed", 0),
|
||||
total_failed=proc_data.get("total_failed", 0),
|
||||
success_rate=proc_data.get("success_rate", 0.0),
|
||||
avg_processing_time=proc_data.get("avg_processing_time", 0.0)
|
||||
)
|
||||
|
||||
# Restore error metrics
|
||||
error_data = metrics_data.get(MetricCategory.ERRORS.value, {})
|
||||
self.errors = ErrorMetrics(
|
||||
errors_by_type=error_data.get("errors_by_type", {}),
|
||||
errors_by_category={
|
||||
ErrorCategory[k.upper()]: v
|
||||
for k, v in error_data.get("errors_by_category", {}).items()
|
||||
},
|
||||
error_patterns=error_data.get("error_patterns", {}),
|
||||
recent_errors=error_data.get("recent_errors", [])
|
||||
)
|
||||
|
||||
# Restore performance metrics
|
||||
perf_data = metrics_data.get(MetricCategory.PERFORMANCE.value, {})
|
||||
self.performance = PerformanceMetrics(
|
||||
peak_memory_usage=perf_data.get("peak_memory_usage", 0.0),
|
||||
compression_failures=perf_data.get("compression_failures", 0),
|
||||
hardware_accel_failures=perf_data.get("hardware_accel_failures", 0),
|
||||
peak_queue_size=perf_data.get("peak_queue_size", 0),
|
||||
avg_queue_wait_time=perf_data.get("avg_queue_wait_time", 0.0)
|
||||
)
|
||||
|
||||
# Restore history
|
||||
history = metrics_data.get("history", {})
|
||||
self.aggregator.hourly_metrics = history.get("hourly", [])
|
||||
self.aggregator.daily_metrics = history.get("daily", [])
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error restoring metrics: {e}")
|
||||
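# Minimal usage sketch for QueueMetricsManager (illustrative only; the queue
# components call these methods in practice, and the file path is hypothetical):
#
#   manager = QueueMetricsManager()
#   manager.update(processing_time=2.5, success=True)
#   manager.update(processing_time=9.0, success=False, error="compression failed")
#   snapshot = manager.get_metrics()
#   manager.save_metrics("/tmp/queue_metrics.json")
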
@@ -2,221 +2,365 @@

import asyncio
import logging
import psutil
import time
from enum import Enum
from dataclasses import dataclass, field
from typing import Optional, Dict, Any, List, Set
from datetime import datetime, timedelta

from .models import QueueItem, QueueMetrics
from .health_checker import HealthChecker, HealthStatus, HealthCategory
from .recovery_manager import RecoveryManager, RecoveryStrategy, RecoveryThresholds

logger = logging.getLogger("QueueMonitoring")

class MonitoringLevel(Enum):
|
||||
"""Monitoring intensity levels"""
|
||||
LIGHT = "light" # Basic monitoring
|
||||
NORMAL = "normal" # Standard monitoring
|
||||
INTENSIVE = "intensive" # Detailed monitoring
|
||||
DEBUG = "debug" # Debug-level monitoring
|
||||
|
||||
class AlertSeverity(Enum):
|
||||
"""Alert severity levels"""
|
||||
INFO = "info"
|
||||
WARNING = "warning"
|
||||
ERROR = "error"
|
||||
CRITICAL = "critical"
|
||||
|
||||
@dataclass
|
||||
class MonitoringEvent:
|
||||
"""Represents a monitoring event"""
|
||||
timestamp: datetime
|
||||
category: HealthCategory
|
||||
severity: AlertSeverity
|
||||
message: str
|
||||
details: Dict[str, Any] = field(default_factory=dict)
|
||||
resolved: bool = False
|
||||
resolution_time: Optional[datetime] = None
|
||||
|
||||
@dataclass
|
||||
class MonitoringThresholds:
|
||||
"""Monitoring thresholds configuration"""
|
||||
check_interval: int = 15 # 15 seconds
|
||||
deadlock_threshold: int = 60 # 1 minute
|
||||
memory_threshold: int = 512 # 512MB
|
||||
max_retries: int = 3
|
||||
alert_threshold: int = 5 # Max alerts before escalation
|
||||
recovery_timeout: int = 300 # 5 minutes
|
||||
    intensive_threshold: float = 0.8  # 80% resource usage triggers intensive
|
||||
|
||||
class AlertManager:
|
||||
"""Manages monitoring alerts"""
|
||||
|
||||
def __init__(self, max_history: int = 1000):
|
||||
self.max_history = max_history
|
||||
self.active_alerts: Dict[str, MonitoringEvent] = {}
|
||||
self.alert_history: List[MonitoringEvent] = []
|
||||
self.alert_counts: Dict[AlertSeverity, int] = {
|
||||
severity: 0 for severity in AlertSeverity
|
||||
}
|
||||
|
||||
def create_alert(
|
||||
self,
|
||||
category: HealthCategory,
|
||||
severity: AlertSeverity,
|
||||
message: str,
|
||||
details: Dict[str, Any] = None
|
||||
) -> MonitoringEvent:
|
||||
"""Create a new alert"""
|
||||
event = MonitoringEvent(
|
||||
timestamp=datetime.utcnow(),
|
||||
category=category,
|
||||
severity=severity,
|
||||
message=message,
|
||||
details=details or {}
|
||||
)
|
||||
|
||||
alert_id = f"{category.value}_{event.timestamp.timestamp()}"
|
||||
self.active_alerts[alert_id] = event
|
||||
self.alert_counts[severity] += 1
|
||||
|
||||
self.alert_history.append(event)
|
||||
if len(self.alert_history) > self.max_history:
|
||||
self.alert_history.pop(0)
|
||||
|
||||
return event
|
||||
|
||||
def resolve_alert(self, alert_id: str) -> None:
|
||||
"""Mark an alert as resolved"""
|
||||
if alert_id in self.active_alerts:
|
||||
event = self.active_alerts[alert_id]
|
||||
event.resolved = True
|
||||
event.resolution_time = datetime.utcnow()
|
||||
self.active_alerts.pop(alert_id)
|
||||
|
||||
def get_active_alerts(self) -> List[MonitoringEvent]:
|
||||
"""Get currently active alerts"""
|
||||
return list(self.active_alerts.values())
|
||||
|
||||
def get_alert_stats(self) -> Dict[str, Any]:
|
||||
"""Get alert statistics"""
|
||||
return {
|
||||
"active_alerts": len(self.active_alerts),
|
||||
"total_alerts": len(self.alert_history),
|
||||
"alert_counts": {
|
||||
severity.value: count
|
||||
for severity, count in self.alert_counts.items()
|
||||
},
|
||||
"recent_alerts": [
|
||||
{
|
||||
"timestamp": event.timestamp.isoformat(),
|
||||
"category": event.category.value,
|
||||
"severity": event.severity.value,
|
||||
"message": event.message,
|
||||
"resolved": event.resolved
|
||||
}
|
||||
for event in self.alert_history[-10:] # Last 10 alerts
|
||||
]
|
||||
}
|
||||
|
||||
class MonitoringStrategy:
|
||||
"""Determines monitoring behavior"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
level: MonitoringLevel = MonitoringLevel.NORMAL,
|
||||
thresholds: Optional[MonitoringThresholds] = None
|
||||
):
|
||||
self.level = level
|
||||
self.thresholds = thresholds or MonitoringThresholds()
|
||||
self._last_intensive_check = datetime.utcnow()
|
||||
|
||||
def should_check_health(self, metrics: Dict[str, Any]) -> bool:
|
||||
"""Determine if health check should be performed"""
|
||||
if self.level == MonitoringLevel.INTENSIVE:
|
||||
return True
|
||||
elif self.level == MonitoringLevel.LIGHT:
|
||||
return metrics.get("queue_size", 0) > 0
|
||||
else: # NORMAL or DEBUG
|
||||
return True
|
||||
|
||||
def get_check_interval(self) -> float:
|
||||
"""Get the current check interval"""
|
||||
if self.level == MonitoringLevel.INTENSIVE:
|
||||
return self.thresholds.check_interval / 2
|
||||
elif self.level == MonitoringLevel.LIGHT:
|
||||
return self.thresholds.check_interval * 2
|
||||
else: # NORMAL or DEBUG
|
||||
return self.thresholds.check_interval
|
||||
|
||||
def should_escalate(self, alert_count: int) -> bool:
|
||||
"""Determine if monitoring should be escalated"""
|
||||
return (
|
||||
self.level != MonitoringLevel.INTENSIVE and
|
||||
alert_count >= self.thresholds.alert_threshold
|
||||
)
|
||||
|
||||
def should_deescalate(self, alert_count: int) -> bool:
|
||||
"""Determine if monitoring can be deescalated"""
|
||||
return (
|
||||
self.level == MonitoringLevel.INTENSIVE and
|
||||
alert_count == 0 and
|
||||
(datetime.utcnow() - self._last_intensive_check).total_seconds() > 300
|
||||
)
|
||||
|
||||
class QueueMonitor:
    """Monitors queue health and performance"""

    def __init__(
        self,
        strategy: Optional[MonitoringStrategy] = None,
        thresholds: Optional[MonitoringThresholds] = None
    ):
        self.strategy = strategy or MonitoringStrategy()
        self.thresholds = thresholds or MonitoringThresholds()

        # Initialize components
        self.health_checker = HealthChecker(
            memory_threshold=self.thresholds.memory_threshold,
            deadlock_threshold=self.thresholds.deadlock_threshold
        )
        self.recovery_manager = RecoveryManager(
            thresholds=RecoveryThresholds(max_retries=self.thresholds.max_retries)
        )
        self.alert_manager = AlertManager()

        self._shutdown = False
        self._last_active_time = time.time()
        self._monitoring_task: Optional[asyncio.Task] = None

    async def start(self, state_manager, metrics_manager) -> None:
        """Start monitoring queue health"""
        if self._monitoring_task is not None:
            logger.warning("Monitoring task already running")
            return

        logger.info(f"Starting queue monitoring with level: {self.strategy.level.value}")
        self._monitoring_task = asyncio.create_task(
            self._monitor_loop(state_manager, metrics_manager)
        )

    async def _monitor_loop(self, state_manager, metrics_manager) -> None:
        """Main monitoring loop"""
        while not self._shutdown:
            try:
                # Get current metrics
                metrics = metrics_manager.get_metrics()

                # Check if health check should be performed
                if self.strategy.should_check_health(metrics):
                    await self._perform_health_check(
                        state_manager,
                        metrics_manager,
                        metrics
                    )

                # Check for strategy adjustment
                self._adjust_monitoring_strategy(metrics)

                # Wait for next check
                await asyncio.sleep(self.strategy.get_check_interval())

            except asyncio.CancelledError:
                logger.info("Queue monitoring cancelled")
                break
            except Exception as e:
                logger.error(f"Error in monitoring loop: {str(e)}")
                await asyncio.sleep(1)

    async def stop(self) -> None:
        """Stop the monitoring process"""
        logger.info("Stopping queue monitoring...")
        self._shutdown = True
        if self._monitoring_task and not self._monitoring_task.done():
            self._monitoring_task.cancel()
            try:
                await self._monitoring_task
            except asyncio.CancelledError:
                pass
        self._monitoring_task = None

    def update_activity(self) -> None:
        """Update the last active time"""
        self._last_active_time = time.time()

    async def _perform_health_check(
        self,
        state_manager,
        metrics_manager,
        current_metrics: Dict[str, Any]
    ) -> None:
        """Perform health check and recovery if needed"""
        try:
            # Check memory usage
            memory_usage, is_critical = await self.health_checker.check_memory_usage()
            metrics_manager.update_memory_usage(memory_usage)

            if is_critical:
                self.alert_manager.create_alert(
                    category=HealthCategory.MEMORY,
                    severity=AlertSeverity.CRITICAL,
                    message=f"Critical memory usage: {memory_usage:.1f}MB",
                    details={"memory_usage": memory_usage}
                )

            # Get current queue state
            queue_stats = await state_manager.get_queue_stats()
            processing_items = await state_manager.get_all_processing_items()

            # Check for stuck items
            stuck_items = []
            for item in processing_items:
                if self.recovery_manager.should_recover_item(item):
                    stuck_items.append((item.url, item))

            # Handle stuck items if found
            if stuck_items:
                self.alert_manager.create_alert(
                    category=HealthCategory.DEADLOCKS,
                    severity=AlertSeverity.WARNING,
                    message=f"Potential deadlock: {len(stuck_items)} items stuck",
                    details={"stuck_items": [item[0] for item in stuck_items]}
                )

                await self.recovery_manager.recover_stuck_items(
                    stuck_items,
                    state_manager,
                    metrics_manager
                )

            # Check overall queue activity
            if processing_items and self.health_checker.check_queue_activity(
                self._last_active_time,
                bool(processing_items)
            ):
                self.alert_manager.create_alert(
                    category=HealthCategory.ACTIVITY,
                    severity=AlertSeverity.ERROR,
                    message="Queue appears to be hung",
                    details={"last_active": self._last_active_time}
                )

                await self.recovery_manager.perform_emergency_recovery(
                    state_manager,
                    metrics_manager
                )
                self.update_activity()

            # Check error rates
            error_rate = current_metrics.get("error_rate", 0)
            if error_rate > 0.2:  # 20% error rate
                self.alert_manager.create_alert(
                    category=HealthCategory.ERRORS,
                    severity=AlertSeverity.ERROR,
                    message=f"High error rate: {error_rate:.1%}",
                    details={"error_rate": error_rate}
                )

            # Log health report
            if self.strategy.level in (MonitoringLevel.INTENSIVE, MonitoringLevel.DEBUG):
                health_report = self.health_checker.format_health_report(
                    memory_usage=memory_usage,
                    queue_size=queue_stats["queue_size"],
                    processing_count=queue_stats["processing_count"],
                    success_rate=metrics_manager.success_rate,
                    avg_processing_time=metrics_manager.avg_processing_time,
                    peak_memory=metrics_manager.peak_memory_usage,
                    error_distribution=metrics_manager.errors_by_type,
                    last_activity_delta=time.time() - self._last_active_time
                )
                logger.info(health_report)

        except Exception as e:
            logger.error(f"Error performing health check: {str(e)}")
            self.alert_manager.create_alert(
                category=HealthCategory.SYSTEM,
                severity=AlertSeverity.ERROR,
                message=f"Health check error: {str(e)}"
            )

    def _adjust_monitoring_strategy(self, metrics: Dict[str, Any]) -> None:
        """Adjust monitoring strategy based on current state"""
        active_alerts = self.alert_manager.get_active_alerts()

        # Check for escalation
        if self.strategy.should_escalate(len(active_alerts)):
            logger.warning("Escalating to intensive monitoring")
            self.strategy.level = MonitoringLevel.INTENSIVE
            self.strategy._last_intensive_check = datetime.utcnow()

        # Check for de-escalation
        elif self.strategy.should_deescalate(len(active_alerts)):
            logger.info("De-escalating to normal monitoring")
            self.strategy.level = MonitoringLevel.NORMAL

def get_monitoring_stats(self) -> Dict[str, Any]:
|
||||
"""Get comprehensive monitoring statistics"""
|
||||
return {
|
||||
"monitoring_level": self.strategy.level.value,
|
||||
"last_active": self._last_active_time,
|
||||
"alerts": self.alert_manager.get_alert_stats(),
|
||||
"recovery": self.recovery_manager.get_recovery_stats(),
|
||||
"health": self.health_checker.get_health_stats()
|
||||
}
|
||||
|
||||
class MonitoringError(Exception):
|
||||
"""Base exception for monitoring-related errors"""
|
||||
|
||||
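# Illustrative wiring sketch for QueueMonitor (the state manager and metrics
# manager instances are assumed to come from the surrounding queue package):
#
#   monitor = QueueMonitor(
#       strategy=MonitoringStrategy(level=MonitoringLevel.NORMAL),
#       thresholds=MonitoringThresholds(memory_threshold=1024),
#   )
#   await monitor.start(state_manager, metrics_manager)
#   ...
#   await monitor.stop()
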
351
videoarchiver/queue/processor.py
Normal file
@@ -0,0 +1,351 @@
|
||||
"""Module for processing queue items"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass
|
||||
from typing import Callable, Optional, Tuple, List, Set, Dict, Any
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from .models import QueueItem
|
||||
from .state_manager import QueueStateManager, ItemState
|
||||
from .monitoring import QueueMonitor
|
||||
|
||||
logger = logging.getLogger("QueueProcessor")
|
||||
|
||||
class ProcessingStrategy(Enum):
|
||||
"""Processing strategies"""
|
||||
SEQUENTIAL = "sequential" # Process items one at a time
|
||||
CONCURRENT = "concurrent" # Process multiple items concurrently
|
||||
BATCHED = "batched" # Process items in batches
|
||||
PRIORITY = "priority" # Process based on priority
|
||||
|
||||
@dataclass
|
||||
class ProcessingMetrics:
|
||||
"""Metrics for processing operations"""
|
||||
total_processed: int = 0
|
||||
successful: int = 0
|
||||
failed: int = 0
|
||||
retried: int = 0
|
||||
avg_processing_time: float = 0.0
|
||||
peak_concurrent_tasks: int = 0
|
||||
last_processed: Optional[datetime] = None
|
||||
error_counts: Dict[str, int] = None
|
||||
|
||||
def __post_init__(self):
|
||||
self.error_counts = {}
|
||||
|
||||
def record_success(self, processing_time: float) -> None:
|
||||
"""Record successful processing"""
|
||||
self.total_processed += 1
|
||||
self.successful += 1
|
||||
self._update_avg_time(processing_time)
|
||||
self.last_processed = datetime.utcnow()
|
||||
|
||||
def record_failure(self, error: str) -> None:
|
||||
"""Record processing failure"""
|
||||
self.total_processed += 1
|
||||
self.failed += 1
|
||||
self.error_counts[error] = self.error_counts.get(error, 0) + 1
|
||||
self.last_processed = datetime.utcnow()
|
||||
|
||||
def record_retry(self) -> None:
|
||||
"""Record processing retry"""
|
||||
self.retried += 1
|
||||
|
||||
def _update_avg_time(self, new_time: float) -> None:
|
||||
"""Update average processing time"""
|
||||
if self.total_processed == 1:
|
||||
self.avg_processing_time = new_time
|
||||
else:
|
||||
self.avg_processing_time = (
|
||||
(self.avg_processing_time * (self.total_processed - 1) + new_time)
|
||||
/ self.total_processed
|
||||
)
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
|
||||
"""Get processing statistics"""
|
||||
return {
|
||||
"total_processed": self.total_processed,
|
||||
"successful": self.successful,
|
||||
"failed": self.failed,
|
||||
"retried": self.retried,
|
||||
"success_rate": (
|
||||
self.successful / self.total_processed
|
||||
if self.total_processed > 0
|
||||
else 0
|
||||
),
|
||||
"avg_processing_time": self.avg_processing_time,
|
||||
"peak_concurrent_tasks": self.peak_concurrent_tasks,
|
||||
"last_processed": (
|
||||
self.last_processed.isoformat()
|
||||
if self.last_processed
|
||||
else None
|
||||
),
|
||||
"error_distribution": self.error_counts
|
||||
}
|
||||
|
||||
class BatchManager:
|
||||
"""Manages processing batches"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
batch_size: int,
|
||||
max_concurrent: int,
|
||||
timeout: float = 30.0
|
||||
):
|
||||
self.batch_size = batch_size
|
||||
self.max_concurrent = max_concurrent
|
||||
self.timeout = timeout
|
||||
self.current_batch: List[QueueItem] = []
|
||||
self.processing_start: Optional[datetime] = None
|
||||
|
||||
async def process_batch(
|
||||
self,
|
||||
items: List[QueueItem],
|
||||
processor: Callable[[QueueItem], Tuple[bool, Optional[str]]]
|
||||
) -> List[Tuple[QueueItem, bool, Optional[str]]]:
|
||||
"""Process a batch of items"""
|
||||
self.current_batch = items
|
||||
self.processing_start = datetime.utcnow()
|
||||
|
||||
tasks = [
|
||||
asyncio.create_task(self._process_item(processor, item))
|
||||
for item in items
|
||||
]
|
||||
|
||||
try:
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
return [
|
||||
(item, *self._handle_result(result))
|
||||
for item, result in zip(items, results)
|
||||
]
|
||||
finally:
|
||||
self.current_batch = []
|
||||
self.processing_start = None
|
||||
|
||||
async def _process_item(
|
||||
self,
|
||||
processor: Callable[[QueueItem], Tuple[bool, Optional[str]]],
|
||||
item: QueueItem
|
||||
) -> Tuple[bool, Optional[str]]:
|
||||
"""Process a single item with timeout"""
|
||||
try:
|
||||
return await asyncio.wait_for(
|
||||
processor(item),
|
||||
timeout=self.timeout
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
return False, "Processing timeout"
|
||||
except Exception as e:
|
||||
return False, str(e)
|
||||
|
||||
def _handle_result(
|
||||
self,
|
||||
result: Any
|
||||
) -> Tuple[bool, Optional[str]]:
|
||||
"""Handle processing result"""
|
||||
if isinstance(result, tuple) and len(result) == 2:
|
||||
return result
|
||||
if isinstance(result, Exception):
|
||||
return False, str(result)
|
||||
return False, "Unknown error"
|
||||
|
||||
def get_batch_status(self) -> Dict[str, Any]:
|
||||
"""Get current batch status"""
|
||||
return {
|
||||
"batch_size": len(self.current_batch),
|
||||
"processing_time": (
|
||||
(datetime.utcnow() - self.processing_start).total_seconds()
|
||||
if self.processing_start
|
||||
else 0
|
||||
),
|
||||
"items": [item.url for item in self.current_batch]
|
||||
}
|
||||
|
||||
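# Sketch of how BatchManager is driven (illustrative; `fake_processor` stands in
# for the real async download callback passed to QueueProcessor below):
#
#   async def fake_processor(item: QueueItem) -> Tuple[bool, Optional[str]]:
#       await asyncio.sleep(0.1)
#       return True, None
#
#   batch = BatchManager(batch_size=5, max_concurrent=3, timeout=30.0)
#   results = await batch.process_batch(items, fake_processor)
#   # -> list of (item, success, error) tuples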
class QueueProcessor:
|
||||
"""Handles the processing of queue items"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
state_manager: QueueStateManager,
|
||||
monitor: QueueMonitor,
|
||||
strategy: ProcessingStrategy = ProcessingStrategy.CONCURRENT,
|
||||
max_retries: int = 3,
|
||||
retry_delay: int = 5,
|
||||
batch_size: int = 5,
|
||||
max_concurrent: int = 3
|
||||
):
|
||||
self.state_manager = state_manager
|
||||
self.monitor = monitor
|
||||
self.strategy = strategy
|
||||
self.max_retries = max_retries
|
||||
self.retry_delay = retry_delay
|
||||
|
||||
self.batch_manager = BatchManager(batch_size, max_concurrent)
|
||||
self.metrics = ProcessingMetrics()
|
||||
|
||||
self._shutdown = False
|
||||
self._active_tasks: Set[asyncio.Task] = set()
|
||||
self._processing_lock = asyncio.Lock()
|
||||
|
||||
async def start_processing(
|
||||
self,
|
||||
processor: Callable[[QueueItem], Tuple[bool, Optional[str]]]
|
||||
) -> None:
|
||||
"""Start processing items in the queue"""
|
||||
logger.info(f"Queue processor started with strategy: {self.strategy.value}")
|
||||
|
||||
while not self._shutdown:
|
||||
try:
|
||||
if self.strategy == ProcessingStrategy.BATCHED:
|
||||
await self._process_batch(processor)
|
||||
elif self.strategy == ProcessingStrategy.CONCURRENT:
|
||||
await self._process_concurrent(processor)
|
||||
else: # SEQUENTIAL or PRIORITY
|
||||
await self._process_sequential(processor)
|
||||
|
||||
except asyncio.CancelledError:
|
||||
logger.info("Queue processing cancelled")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Critical error in queue processor: {e}")
|
||||
await asyncio.sleep(1) # Delay before retry
|
||||
|
||||
await asyncio.sleep(0)
|
||||
|
||||
async def _process_batch(
|
||||
self,
|
||||
processor: Callable[[QueueItem], Tuple[bool, Optional[str]]]
|
||||
) -> None:
|
||||
"""Process items in batches"""
|
||||
items = await self.state_manager.get_next_items(self.batch_manager.batch_size)
|
||||
if not items:
|
||||
await asyncio.sleep(0.1)
|
||||
return
|
||||
|
||||
start_time = time.time()
|
||||
results = await self.batch_manager.process_batch(items, processor)
|
||||
|
||||
for item, success, error in results:
|
||||
await self._handle_result(
|
||||
item,
|
||||
success,
|
||||
error,
|
||||
time.time() - start_time
|
||||
)
|
||||
|
||||
async def _process_concurrent(
|
||||
self,
|
||||
processor: Callable[[QueueItem], Tuple[bool, Optional[str]]]
|
||||
) -> None:
|
||||
"""Process items concurrently"""
|
||||
if len(self._active_tasks) >= self.batch_manager.max_concurrent:
|
||||
await asyncio.sleep(0.1)
|
||||
return
|
||||
|
||||
items = await self.state_manager.get_next_items(
|
||||
self.batch_manager.max_concurrent - len(self._active_tasks)
|
||||
)
|
||||
|
||||
for item in items:
|
||||
task = asyncio.create_task(self._process_item(processor, item))
|
||||
self._active_tasks.add(task)
|
||||
task.add_done_callback(self._active_tasks.discard)
|
||||
|
||||
self.metrics.peak_concurrent_tasks = max(
|
||||
self.metrics.peak_concurrent_tasks,
|
||||
len(self._active_tasks)
|
||||
)
|
||||
|
||||
async def _process_sequential(
|
||||
self,
|
||||
processor: Callable[[QueueItem], Tuple[bool, Optional[str]]]
|
||||
) -> None:
|
||||
"""Process items sequentially"""
|
||||
items = await self.state_manager.get_next_items(1)
|
||||
if not items:
|
||||
await asyncio.sleep(0.1)
|
||||
return
|
||||
|
||||
await self._process_item(processor, items[0])
|
||||
|
||||
async def _process_item(
|
||||
self,
|
||||
processor: Callable[[QueueItem], Tuple[bool, Optional[str]]],
|
||||
item: QueueItem
|
||||
) -> None:
|
||||
"""Process a single queue item"""
|
||||
try:
|
||||
logger.info(f"Processing queue item: {item.url}")
|
||||
start_time = time.time()
|
||||
|
||||
async with self._processing_lock:
|
||||
item.start_processing()
|
||||
self.monitor.update_activity()
|
||||
|
||||
success, error = await processor(item)
|
||||
|
||||
processing_time = time.time() - start_time
|
||||
await self._handle_result(item, success, error, processing_time)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing {item.url}: {e}")
|
||||
await self._handle_result(item, False, str(e), 0)
|
||||
|
||||
async def _handle_result(
|
||||
self,
|
||||
item: QueueItem,
|
||||
success: bool,
|
||||
error: Optional[str],
|
||||
processing_time: float
|
||||
) -> None:
|
||||
"""Handle processing result"""
|
||||
item.finish_processing(success, error)
|
||||
|
||||
if success:
|
||||
await self.state_manager.mark_completed(item, True)
|
||||
self.metrics.record_success(processing_time)
|
||||
logger.info(f"Successfully processed: {item.url}")
|
||||
else:
|
||||
if item.retry_count < self.max_retries:
|
||||
item.retry_count += 1
|
||||
await self.state_manager.retry_item(item)
|
||||
self.metrics.record_retry()
|
||||
logger.warning(f"Retrying: {item.url} (attempt {item.retry_count})")
|
||||
await asyncio.sleep(self.retry_delay)
|
||||
else:
|
||||
await self.state_manager.mark_completed(item, False, error)
|
||||
self.metrics.record_failure(error or "Unknown error")
|
||||
logger.error(f"Failed after {self.max_retries} attempts: {item.url}")
|
||||
|
||||
async def stop_processing(self) -> None:
|
||||
"""Stop processing queue items"""
|
||||
self._shutdown = True
|
||||
|
||||
# Cancel all active tasks
|
||||
for task in self._active_tasks:
|
||||
if not task.done():
|
||||
task.cancel()
|
||||
|
||||
# Wait for tasks to complete
|
||||
if self._active_tasks:
|
||||
await asyncio.gather(*self._active_tasks, return_exceptions=True)
|
||||
|
||||
self._active_tasks.clear()
|
||||
logger.info("Queue processor stopped")
|
||||
|
||||
def is_processing(self) -> bool:
|
||||
"""Check if the processor is currently processing items"""
|
||||
return bool(self._active_tasks)
|
||||
|
||||
def get_processor_stats(self) -> Dict[str, Any]:
|
||||
"""Get processor statistics"""
|
||||
return {
|
||||
"strategy": self.strategy.value,
|
||||
"active_tasks": len(self._active_tasks),
|
||||
"metrics": self.metrics.get_stats(),
|
||||
"batch_status": self.batch_manager.get_batch_status(),
|
||||
"is_processing": self.is_processing()
|
||||
}
|
||||
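# Minimal end-to-end sketch for QueueProcessor (illustrative; state_manager,
# monitor and fake_processor are assumed to exist as in the sketches above):
#
#   processor = QueueProcessor(
#       state_manager=state_manager,
#       monitor=monitor,
#       strategy=ProcessingStrategy.CONCURRENT,
#   )
#   task = asyncio.create_task(processor.start_processing(fake_processor))
#   ...
#   await processor.stop_processing()
#   print(processor.get_processor_stats())
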
359
videoarchiver/queue/recovery_manager.py
Normal file
@@ -0,0 +1,359 @@
|
||||
"""Module for handling queue item recovery operations"""
|
||||
|
||||
import asyncio
import logging
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Tuple, Dict, Optional, Any, Set
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from .models import QueueItem
|
||||
|
||||
logger = logging.getLogger("QueueRecoveryManager")
|
||||
|
||||
class RecoveryStrategy(Enum):
|
||||
"""Recovery strategies"""
|
||||
RETRY = "retry" # Retry the item
|
||||
FAIL = "fail" # Mark as failed
|
||||
REQUEUE = "requeue" # Add back to queue
|
||||
EMERGENCY = "emergency" # Emergency recovery
|
||||
|
||||
class RecoveryPolicy(Enum):
|
||||
"""Recovery policies"""
|
||||
AGGRESSIVE = "aggressive" # Recover quickly, more retries
|
||||
CONSERVATIVE = "conservative" # Recover slowly, fewer retries
|
||||
BALANCED = "balanced" # Balance between speed and reliability
|
||||
|
||||
@dataclass
|
||||
class RecoveryThresholds:
|
||||
"""Thresholds for recovery operations"""
|
||||
max_retries: int = 3
|
||||
deadlock_threshold: int = 300 # 5 minutes
|
||||
emergency_threshold: int = 600 # 10 minutes
|
||||
backoff_base: int = 5 # Base delay for exponential backoff
|
||||
max_concurrent_recoveries: int = 5
|
||||
|
||||
@dataclass
|
||||
class RecoveryResult:
|
||||
"""Result of a recovery operation"""
|
||||
item_url: str
|
||||
strategy: RecoveryStrategy
|
||||
success: bool
|
||||
error: Optional[str] = None
|
||||
retry_count: int = 0
|
||||
timestamp: datetime = field(default_factory=datetime.utcnow)
|
||||
|
||||
class RecoveryTracker:
|
||||
"""Tracks recovery operations"""
|
||||
|
||||
def __init__(self, max_history: int = 1000):
|
||||
self.max_history = max_history
|
||||
self.history: List[RecoveryResult] = []
|
||||
self.active_recoveries: Set[str] = set()
|
||||
self.recovery_counts: Dict[str, int] = {}
|
||||
self.success_counts: Dict[str, int] = {}
|
||||
self.error_counts: Dict[str, int] = {}
|
||||
|
||||
def record_recovery(self, result: RecoveryResult) -> None:
|
||||
"""Record a recovery operation"""
|
||||
self.history.append(result)
|
||||
if len(self.history) > self.max_history:
|
||||
self.history.pop(0)
|
||||
|
||||
self.recovery_counts[result.item_url] = (
|
||||
self.recovery_counts.get(result.item_url, 0) + 1
|
||||
)
|
||||
|
||||
if result.success:
|
||||
self.success_counts[result.item_url] = (
|
||||
self.success_counts.get(result.item_url, 0) + 1
|
||||
)
|
||||
else:
|
||||
self.error_counts[result.item_url] = (
|
||||
self.error_counts.get(result.item_url, 0) + 1
|
||||
)
|
||||
|
||||
def start_recovery(self, url: str) -> None:
|
||||
"""Start tracking a recovery operation"""
|
||||
self.active_recoveries.add(url)
|
||||
|
||||
def end_recovery(self, url: str) -> None:
|
||||
"""End tracking a recovery operation"""
|
||||
self.active_recoveries.discard(url)
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
|
||||
"""Get recovery statistics"""
|
||||
return {
|
||||
"total_recoveries": len(self.history),
|
||||
"active_recoveries": len(self.active_recoveries),
|
||||
"success_rate": (
|
||||
sum(self.success_counts.values()) /
|
||||
len(self.history) if self.history else 0
|
||||
),
|
||||
"recovery_counts": self.recovery_counts.copy(),
|
||||
"error_counts": self.error_counts.copy(),
|
||||
"recent_recoveries": [
|
||||
{
|
||||
"url": r.item_url,
|
||||
"strategy": r.strategy.value,
|
||||
"success": r.success,
|
||||
"error": r.error,
|
||||
"timestamp": r.timestamp.isoformat()
|
||||
}
|
||||
for r in self.history[-10:] # Last 10 recoveries
|
||||
]
|
||||
}
|
||||
|
||||
class RecoveryManager:
|
||||
"""Handles recovery of stuck or failed queue items"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
thresholds: Optional[RecoveryThresholds] = None,
|
||||
policy: RecoveryPolicy = RecoveryPolicy.BALANCED
|
||||
):
|
||||
self.thresholds = thresholds or RecoveryThresholds()
|
||||
self.policy = policy
|
||||
self.tracker = RecoveryTracker()
|
||||
self._recovery_lock = asyncio.Lock()
|
||||
|
||||
async def recover_stuck_items(
|
||||
self,
|
||||
stuck_items: List[Tuple[str, QueueItem]],
|
||||
state_manager,
|
||||
metrics_manager
|
||||
) -> Tuple[int, int]:
|
||||
"""Recover stuck items"""
|
||||
recovered = 0
|
||||
failed = 0
|
||||
|
||||
try:
|
||||
async with self._recovery_lock:
|
||||
for url, item in stuck_items:
|
||||
if len(self.tracker.active_recoveries) >= self.thresholds.max_concurrent_recoveries:
|
||||
logger.warning("Max concurrent recoveries reached, waiting...")
|
||||
await asyncio.sleep(1)
|
||||
continue
|
||||
|
||||
try:
|
||||
self.tracker.start_recovery(url)
|
||||
strategy = self._determine_strategy(item)
|
||||
|
||||
success = await self._execute_recovery(
|
||||
url,
|
||||
item,
|
||||
strategy,
|
||||
state_manager,
|
||||
metrics_manager
|
||||
)
|
||||
|
||||
if success:
|
||||
recovered += 1
|
||||
else:
|
||||
failed += 1
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error recovering item {url}: {str(e)}")
|
||||
failed += 1
|
||||
finally:
|
||||
self.tracker.end_recovery(url)
|
||||
|
||||
logger.info(f"Recovery complete - Recovered: {recovered}, Failed: {failed}")
|
||||
return recovered, failed
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in recovery process: {str(e)}")
|
||||
return 0, len(stuck_items)
|
||||
|
||||
def _determine_strategy(self, item: QueueItem) -> RecoveryStrategy:
|
||||
"""Determine recovery strategy based on item state"""
|
||||
if item.retry_count >= self.thresholds.max_retries:
|
||||
return RecoveryStrategy.FAIL
|
||||
|
||||
processing_time = (
|
||||
datetime.utcnow().timestamp() - item.start_time
|
||||
if item.start_time
|
||||
else 0
|
||||
)
|
||||
|
||||
if processing_time > self.thresholds.emergency_threshold:
|
||||
return RecoveryStrategy.EMERGENCY
|
||||
elif self.policy == RecoveryPolicy.AGGRESSIVE:
|
||||
return RecoveryStrategy.RETRY
|
||||
elif self.policy == RecoveryPolicy.CONSERVATIVE:
|
||||
return RecoveryStrategy.REQUEUE
|
||||
else: # BALANCED
|
||||
return (
|
||||
RecoveryStrategy.RETRY
|
||||
if item.retry_count < self.thresholds.max_retries // 2
|
||||
else RecoveryStrategy.REQUEUE
|
||||
)
|
||||
|
||||
async def _execute_recovery(
|
||||
self,
|
||||
url: str,
|
||||
item: QueueItem,
|
||||
strategy: RecoveryStrategy,
|
||||
state_manager,
|
||||
metrics_manager
|
||||
) -> bool:
|
||||
"""Execute recovery strategy"""
|
||||
try:
|
||||
if strategy == RecoveryStrategy.FAIL:
|
||||
await self._handle_failed_item(url, item, state_manager, metrics_manager)
|
||||
success = False
|
||||
elif strategy == RecoveryStrategy.RETRY:
|
||||
await self._handle_retry_item(url, item, state_manager)
|
||||
success = True
|
||||
elif strategy == RecoveryStrategy.REQUEUE:
|
||||
await self._handle_requeue_item(url, item, state_manager)
|
||||
success = True
|
||||
else: # EMERGENCY
|
||||
await self._handle_emergency_recovery(url, item, state_manager, metrics_manager)
|
||||
success = True
|
||||
|
||||
self.tracker.record_recovery(RecoveryResult(
|
||||
item_url=url,
|
||||
strategy=strategy,
|
||||
success=success,
|
||||
retry_count=item.retry_count
|
||||
))
|
||||
|
||||
return success
|
||||
|
||||
except Exception as e:
|
||||
self.tracker.record_recovery(RecoveryResult(
|
||||
item_url=url,
|
||||
strategy=strategy,
|
||||
success=False,
|
||||
error=str(e),
|
||||
retry_count=item.retry_count
|
||||
))
|
||||
raise
|
||||
|
||||
async def _handle_failed_item(
|
||||
self,
|
||||
url: str,
|
||||
item: QueueItem,
|
||||
state_manager,
|
||||
metrics_manager
|
||||
) -> None:
|
||||
"""Handle an item that has exceeded retry attempts"""
|
||||
logger.warning(f"Moving stuck item to failed: {url}")
|
||||
|
||||
item.status = "failed"
|
||||
item.error = "Exceeded maximum retries after being stuck"
|
||||
item.last_error = item.error
|
||||
item.last_error_time = datetime.utcnow()
|
||||
|
||||
await state_manager.mark_completed(item, False, item.error)
|
||||
metrics_manager.update(
|
||||
processing_time=item.processing_time or 0,
|
||||
success=False,
|
||||
error=item.error
|
||||
)
|
||||
|
||||
async def _handle_retry_item(
|
||||
self,
|
||||
url: str,
|
||||
item: QueueItem,
|
||||
state_manager
|
||||
) -> None:
|
||||
"""Handle an item that will be retried"""
|
||||
logger.info(f"Recovering stuck item for retry: {url}")
|
||||
|
||||
item.retry_count += 1
|
||||
item.start_time = None
|
||||
item.processing_time = 0
|
||||
item.last_retry = datetime.utcnow()
|
||||
item.status = "pending"
|
||||
item.priority = max(0, item.priority - 2)
|
||||
|
||||
await state_manager.retry_item(item)
|
||||
|
||||
async def _handle_requeue_item(
|
||||
self,
|
||||
url: str,
|
||||
item: QueueItem,
|
||||
state_manager
|
||||
) -> None:
|
||||
"""Handle an item that will be requeued"""
|
||||
logger.info(f"Requeuing stuck item: {url}")
|
||||
|
||||
item.retry_count += 1
|
||||
item.start_time = None
|
||||
item.processing_time = 0
|
||||
item.last_retry = datetime.utcnow()
|
||||
item.status = "pending"
|
||||
item.priority = 0 # Reset priority
|
||||
|
||||
# Calculate backoff delay
|
||||
backoff = self.thresholds.backoff_base * (2 ** (item.retry_count - 1))
|
||||
await asyncio.sleep(min(backoff, 60)) # Cap at 60 seconds
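        # With the default backoff_base of 5 this works out to roughly
        # 5s, 10s, 20s, 40s, then the 60s cap from the fifth retry onward.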
|
||||
|
||||
await state_manager.retry_item(item)
|
||||
|
||||
async def _handle_emergency_recovery(
|
||||
self,
|
||||
url: str,
|
||||
item: QueueItem,
|
||||
state_manager,
|
||||
metrics_manager
|
||||
) -> None:
|
||||
"""Handle emergency recovery of an item"""
|
||||
logger.warning(f"Emergency recovery for item: {url}")
|
||||
|
||||
# Force item cleanup
|
||||
await state_manager.force_cleanup_item(item)
|
||||
|
||||
# Reset item state
|
||||
item.retry_count = 0
|
||||
item.start_time = None
|
||||
item.processing_time = 0
|
||||
item.status = "pending"
|
||||
item.priority = 10 # High priority
|
||||
|
||||
# Add back to queue
|
||||
await state_manager.retry_item(item)
|
||||
|
||||
async def perform_emergency_recovery(
|
||||
self,
|
||||
state_manager,
|
||||
metrics_manager
|
||||
) -> None:
|
||||
"""Perform emergency recovery of all processing items"""
|
||||
try:
|
||||
logger.warning("Performing emergency recovery of all processing items")
|
||||
|
||||
processing_items = await state_manager.get_all_processing_items()
|
||||
|
||||
recovered, failed = await self.recover_stuck_items(
|
||||
[(item.url, item) for item in processing_items],
|
||||
state_manager,
|
||||
metrics_manager
|
||||
)
|
||||
|
||||
logger.info(f"Emergency recovery complete - Recovered: {recovered}, Failed: {failed}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during emergency recovery: {str(e)}")
|
||||
|
||||
def should_recover_item(self, item: QueueItem) -> bool:
|
||||
"""Check if an item should be recovered"""
|
||||
if not hasattr(item, 'start_time') or not item.start_time:
|
||||
return False
|
||||
|
||||
processing_time = datetime.utcnow().timestamp() - item.start_time
|
||||
return processing_time > self.thresholds.deadlock_threshold
|
||||
|
||||
def get_recovery_stats(self) -> Dict[str, Any]:
|
||||
"""Get recovery statistics"""
|
||||
return {
|
||||
"policy": self.policy.value,
|
||||
"thresholds": {
|
||||
"max_retries": self.thresholds.max_retries,
|
||||
"deadlock_threshold": self.thresholds.deadlock_threshold,
|
||||
"emergency_threshold": self.thresholds.emergency_threshold,
|
||||
"max_concurrent": self.thresholds.max_concurrent_recoveries
|
||||
},
|
||||
"tracker": self.tracker.get_stats()
|
||||
}
|
||||
366
videoarchiver/queue/state_manager.py
Normal file
@@ -0,0 +1,366 @@
|
||||
"""Module for managing queue state"""
|
||||
|
||||
import logging
|
||||
import asyncio
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Set, List, Optional, Any
|
||||
from datetime import datetime
|
||||
|
||||
from .models import QueueItem, QueueMetrics
|
||||
|
||||
logger = logging.getLogger("QueueStateManager")
|
||||
|
||||
class ItemState(Enum):
|
||||
"""Possible states for queue items"""
|
||||
PENDING = "pending"
|
||||
PROCESSING = "processing"
|
||||
COMPLETED = "completed"
|
||||
FAILED = "failed"
|
||||
RETRYING = "retrying"
|
||||
|
||||
@dataclass
|
||||
class StateTransition:
|
||||
"""Records a state transition"""
|
||||
item_url: str
|
||||
from_state: ItemState
|
||||
to_state: ItemState
|
||||
timestamp: datetime
|
||||
reason: Optional[str] = None
|
||||
|
||||
class StateSnapshot:
|
||||
"""Represents a point-in-time snapshot of queue state"""
|
||||
|
||||
def __init__(self):
|
||||
self.timestamp = datetime.utcnow()
|
||||
self.queue: List[QueueItem] = []
|
||||
self.processing: Dict[str, QueueItem] = {}
|
||||
self.completed: Dict[str, QueueItem] = {}
|
||||
self.failed: Dict[str, QueueItem] = {}
|
||||
self.guild_queues: Dict[int, Set[str]] = {}
|
||||
self.channel_queues: Dict[int, Set[str]] = {}
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""Convert snapshot to dictionary"""
|
||||
return {
|
||||
"timestamp": self.timestamp.isoformat(),
|
||||
"queue": [item.__dict__ for item in self.queue],
|
||||
"processing": {url: item.__dict__ for url, item in self.processing.items()},
|
||||
"completed": {url: item.__dict__ for url, item in self.completed.items()},
|
||||
"failed": {url: item.__dict__ for url, item in self.failed.items()},
|
||||
"guild_queues": {gid: list(urls) for gid, urls in self.guild_queues.items()},
|
||||
"channel_queues": {cid: list(urls) for cid, urls in self.channel_queues.items()}
|
||||
}
|
||||
|
||||
class StateValidator:
|
||||
"""Validates queue state"""
|
||||
|
||||
@staticmethod
|
||||
def validate_item(item: QueueItem) -> bool:
|
||||
"""Validate a queue item"""
|
||||
return all([
|
||||
isinstance(item.url, str) and item.url,
|
||||
isinstance(item.guild_id, int) and item.guild_id > 0,
|
||||
isinstance(item.channel_id, int) and item.channel_id > 0,
|
||||
isinstance(item.priority, int) and 0 <= item.priority <= 10,
|
||||
isinstance(item.added_at, datetime),
|
||||
isinstance(item.status, str)
|
||||
])
|
||||
|
||||
@staticmethod
|
||||
def validate_transition(
|
||||
item: QueueItem,
|
||||
from_state: ItemState,
|
||||
to_state: ItemState
|
||||
) -> bool:
|
||||
"""Validate a state transition"""
|
||||
valid_transitions = {
|
||||
ItemState.PENDING: {ItemState.PROCESSING, ItemState.FAILED},
|
||||
ItemState.PROCESSING: {ItemState.COMPLETED, ItemState.FAILED, ItemState.RETRYING},
|
||||
ItemState.FAILED: {ItemState.RETRYING},
|
||||
ItemState.RETRYING: {ItemState.PENDING},
|
||||
ItemState.COMPLETED: set() # No transitions from completed
|
||||
}
|
||||
return to_state in valid_transitions.get(from_state, set())
|
||||
|
||||
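# Illustrative checks against the transition table above (not part of the module):
#
#   StateValidator.validate_transition(item, ItemState.PROCESSING, ItemState.COMPLETED)  # True
#   StateValidator.validate_transition(item, ItemState.COMPLETED, ItemState.PENDING)     # False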
class StateTracker:
|
||||
"""Tracks state changes and transitions"""
|
||||
|
||||
def __init__(self, max_history: int = 1000):
|
||||
self.max_history = max_history
|
||||
self.transitions: List[StateTransition] = []
|
||||
self.snapshots: List[StateSnapshot] = []
|
||||
self.state_counts: Dict[ItemState, int] = {state: 0 for state in ItemState}
|
||||
|
||||
def record_transition(
|
||||
self,
|
||||
transition: StateTransition
|
||||
) -> None:
|
||||
"""Record a state transition"""
|
||||
self.transitions.append(transition)
|
||||
if len(self.transitions) > self.max_history:
|
||||
self.transitions.pop(0)
|
||||
|
||||
self.state_counts[transition.from_state] -= 1
|
||||
self.state_counts[transition.to_state] += 1
|
||||
|
||||

    def take_snapshot(self, state_manager: 'QueueStateManager') -> None:
        """Take a snapshot of current state"""
        snapshot = StateSnapshot()
        snapshot.queue = state_manager._queue.copy()
        snapshot.processing = state_manager._processing.copy()
        snapshot.completed = state_manager._completed.copy()
        snapshot.failed = state_manager._failed.copy()
        snapshot.guild_queues = {
            gid: urls.copy() for gid, urls in state_manager._guild_queues.items()
        }
        snapshot.channel_queues = {
            cid: urls.copy() for cid, urls in state_manager._channel_queues.items()
        }

        self.snapshots.append(snapshot)
        if len(self.snapshots) > self.max_history:
            self.snapshots.pop(0)

    def get_state_history(self) -> Dict[str, Any]:
        """Get state history statistics"""
        return {
            "transitions": len(self.transitions),
            "snapshots": len(self.snapshots),
            "state_counts": {
                state.value: count
                for state, count in self.state_counts.items()
            },
            "latest_snapshot": (
                self.snapshots[-1].to_dict()
                if self.snapshots
                else None
            )
        }
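
# Shape of the payload returned by StateTracker.get_state_history()
# (keys taken from the method above; counts are illustrative):
#
#   {
#       "transitions": 42,
#       "snapshots": 3,
#       "state_counts": {<ItemState.value>: <count>, ...},
#       "latest_snapshot": {...} or None
#   }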

class QueueStateManager:
    """Manages the state of the queue system"""

    def __init__(self, max_queue_size: int = 1000):
        self.max_queue_size = max_queue_size

        # Queue storage
        self._queue: List[QueueItem] = []
        self._processing: Dict[str, QueueItem] = {}
        self._completed: Dict[str, QueueItem] = {}
        self._failed: Dict[str, QueueItem] = {}

        # Tracking
        self._guild_queues: Dict[int, Set[str]] = {}
        self._channel_queues: Dict[int, Set[str]] = {}

        # State management
        self._lock = asyncio.Lock()
        self.validator = StateValidator()
        self.tracker = StateTracker()
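
    # Note: the single asyncio.Lock created above serializes every mutating
    # coroutine below, so the plain list/dict/set containers need no extra
    # synchronization of their own.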

    async def add_item(self, item: QueueItem) -> bool:
        """Add an item to the queue"""
        if not self.validator.validate_item(item):
            logger.error(f"Invalid queue item: {item}")
            return False

        async with self._lock:
            if len(self._queue) >= self.max_queue_size:
                return False

            # Record transition
            self.tracker.record_transition(StateTransition(
                item_url=item.url,
                from_state=ItemState.PENDING,
                to_state=ItemState.PENDING,
                timestamp=datetime.utcnow(),
                reason="Initial add"
            ))

            # Add to main queue
            self._queue.append(item)
            self._queue.sort(key=lambda x: (-x.priority, x.added_at))

            # Update tracking
            if item.guild_id not in self._guild_queues:
                self._guild_queues[item.guild_id] = set()
            self._guild_queues[item.guild_id].add(item.url)

            if item.channel_id not in self._channel_queues:
                self._channel_queues[item.channel_id] = set()
            self._channel_queues[item.channel_id].add(item.url)

            # Take snapshot periodically
            if len(self._queue) % 100 == 0:
                self.tracker.take_snapshot(self)

            return True
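
    # Ordering example for the sort key (-priority, added_at) used in add_item:
    # items with priorities 9, 5, 9 (added in that order) are served as the
    # first 9, the second 9, then the 5 -- higher priority first, FIFO on ties.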

    async def get_next_items(self, count: int = 5) -> List[QueueItem]:
        """Get the next batch of items to process"""
        items = []
        async with self._lock:
            while len(items) < count and self._queue:
                item = self._queue.pop(0)
                items.append(item)
                self._processing[item.url] = item

                # Record transition
                self.tracker.record_transition(StateTransition(
                    item_url=item.url,
                    from_state=ItemState.PENDING,
                    to_state=ItemState.PROCESSING,
                    timestamp=datetime.utcnow()
                ))

        return items

    async def mark_completed(
        self,
        item: QueueItem,
        success: bool,
        error: Optional[str] = None
    ) -> None:
        """Mark an item as completed or failed"""
        async with self._lock:
            self._processing.pop(item.url, None)

            to_state = ItemState.COMPLETED if success else ItemState.FAILED
            self.tracker.record_transition(StateTransition(
                item_url=item.url,
                from_state=ItemState.PROCESSING,
                to_state=to_state,
                timestamp=datetime.utcnow(),
                reason=error if error else None
            ))

            if success:
                self._completed[item.url] = item
            else:
                self._failed[item.url] = item

    async def retry_item(self, item: QueueItem) -> None:
        """Add an item back to the queue for retry"""
        if not self.validator.validate_transition(
            item,
            ItemState.FAILED,
            ItemState.RETRYING
        ):
            logger.error(f"Invalid retry transition for item: {item}")
            return

        async with self._lock:
            self._processing.pop(item.url, None)
            item.status = ItemState.PENDING.value
            item.last_retry = datetime.utcnow()
            item.priority = max(0, item.priority - 1)

            # Record transitions
            self.tracker.record_transition(StateTransition(
                item_url=item.url,
                from_state=ItemState.FAILED,
                to_state=ItemState.RETRYING,
                timestamp=datetime.utcnow()
            ))
            self.tracker.record_transition(StateTransition(
                item_url=item.url,
                from_state=ItemState.RETRYING,
                to_state=ItemState.PENDING,
                timestamp=datetime.utcnow()
            ))

            self._queue.append(item)
            self._queue.sort(key=lambda x: (-x.priority, x.added_at))

    async def get_guild_status(self, guild_id: int) -> Dict[str, int]:
        """Get queue status for a specific guild"""
        async with self._lock:
            return {
                "pending": len([
                    item for item in self._queue
                    if item.guild_id == guild_id
                ]),
                "processing": len([
                    item for item in self._processing.values()
                    if item.guild_id == guild_id
                ]),
                "completed": len([
                    item for item in self._completed.values()
                    if item.guild_id == guild_id
                ]),
                "failed": len([
                    item for item in self._failed.values()
                    if item.guild_id == guild_id
                ])
            }

    async def clear_state(self) -> None:
        """Clear all state data"""
        async with self._lock:
            # Take a final snapshot before clearing so the last known state
            # is preserved in the tracker history.
            self.tracker.take_snapshot(self)

            self._queue.clear()
            self._processing.clear()
            self._completed.clear()
            self._failed.clear()
            self._guild_queues.clear()
            self._channel_queues.clear()

    async def get_state_for_persistence(self) -> Dict[str, Any]:
        """Get current state for persistence"""
        async with self._lock:
            # Take snapshot before persistence
            self.tracker.take_snapshot(self)

            return {
                "queue": self._queue,
                "processing": self._processing,
                "completed": self._completed,
                "failed": self._failed,
                "history": self.tracker.get_state_history()
            }

    async def restore_state(self, state: Dict[str, Any]) -> None:
        """Restore state from persisted data"""
        async with self._lock:
            self._queue = state.get("queue", [])
            self._processing = state.get("processing", {})
            self._completed = state.get("completed", {})
            self._failed = state.get("failed", {})

            # Validate restored items, filtering into a new list rather than
            # removing entries from the list being iterated.
            valid_items = []
            for item in self._queue:
                if self.validator.validate_item(item):
                    valid_items.append(item)
                else:
                    logger.warning(f"Removing invalid restored item: {item}")
            self._queue = valid_items

            # Rebuild tracking
            self._rebuild_tracking()

    def _rebuild_tracking(self) -> None:
        """Rebuild guild and channel tracking from queue data"""
        self._guild_queues.clear()
        self._channel_queues.clear()

        for item in self._queue:
            if item.guild_id not in self._guild_queues:
                self._guild_queues[item.guild_id] = set()
            self._guild_queues[item.guild_id].add(item.url)

            if item.channel_id not in self._channel_queues:
                self._channel_queues[item.channel_id] = set()
            self._channel_queues[item.channel_id].add(item.url)

    def get_state_stats(self) -> Dict[str, Any]:
        """Get comprehensive state statistics"""
        return {
            "queue_size": len(self._queue),
            "processing_count": len(self._processing),
            "completed_count": len(self._completed),
            "failed_count": len(self._failed),
            "guild_count": len(self._guild_queues),
            "channel_count": len(self._channel_queues),
            "history": self.tracker.get_state_history()
        }
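
# Illustrative lifecycle sketch (assumes a QueueItem constructed elsewhere with
# a valid url, guild_id, channel_id, priority, added_at and status; the exact
# constructor lives in ..models and is not shown here):
#
#   manager = QueueStateManager(max_queue_size=500)
#   await manager.add_item(item)                      # PENDING
#   batch = await manager.get_next_items(count=5)     # PENDING -> PROCESSING
#   for queued in batch:
#       await manager.mark_completed(queued, success=True)   # -> COMPLETED
#   # On failure: mark_completed(queued, success=False, error="...") then
#   # retry_item(queued) to push it back to PENDING at reduced priority.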