Core Systems:

Component-based architecture with lifecycle management
Enhanced error handling and recovery mechanisms
Comprehensive state management and tracking
Event-driven architecture with monitoring
Queue Management:

Multiple processing strategies for different scenarios
Advanced state management with recovery
Comprehensive metrics and health monitoring
Sophisticated cleanup system with multiple strategies
Processing Pipeline:

Enhanced message handling with validation
Improved URL extraction and processing
Better queue management and monitoring
Advanced cleanup mechanisms
Overall Benefits:

Better code organization and maintainability
Improved error handling and recovery
Enhanced monitoring and reporting
More robust and reliable system
This commit is contained in:
pacnpal
2024-11-16 05:01:29 +00:00
parent 537a325807
commit a4ca6e8ea6
47 changed files with 11085 additions and 2110 deletions

View File

@@ -0,0 +1,500 @@
"""Module for cleaning guild-specific queue items"""
import asyncio
import logging
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Dict, List, Set, Tuple, Any, Optional

from ..models import QueueItem
logger = logging.getLogger("GuildCleaner")
class GuildCleanupStrategy(Enum):
    """How a guild's queue data should be purged."""

    # Clear every item that belongs to the guild.
    FULL = "full"
    # Clear only the categories enabled in the cleanup configuration.
    SELECTIVE = "selective"
    # Clear queued/processing items only once the grace period has elapsed.
    GRACEFUL = "graceful"
class CleanupCategory(Enum):
    """Containers that a guild cleanup can target."""

    QUEUE = "queue"            # pending queue list
    PROCESSING = "processing"  # items currently being processed
    COMPLETED = "completed"    # finished items
    FAILED = "failed"          # items that failed
    TRACKING = "tracking"      # guild/channel URL tracking maps
@dataclass
class GuildCleanupConfig:
    """Tunable options for a guild cleanup run."""

    # Categories purged by the SELECTIVE strategy; defaults to every category.
    categories: Set[CleanupCategory] = field(
        default_factory=lambda: set(CleanupCategory)
    )
    # Seconds an item must have been queued before GRACEFUL removes it (5 minutes).
    grace_period: int = 300
    # When True, completed items are left untouched by SELECTIVE/GRACEFUL runs.
    preserve_completed: bool = False
    # When True, failed items are left untouched by SELECTIVE/GRACEFUL runs.
    preserve_failed: bool = False
    # Number of removals between cooperative yields to the event loop.
    batch_size: int = 100
@dataclass
class GuildCleanupResult:
    """Outcome of one guild cleanup run, as stored by the tracker."""

    guild_id: int
    timestamp: datetime                       # when the result was recorded
    strategy: GuildCleanupStrategy            # strategy that was applied
    items_cleared: int                        # count reported by the cleanup
    categories_cleared: Set[CleanupCategory]  # categories that were processed
    initial_counts: Dict[str, int]            # per-container counts before the run
    final_counts: Dict[str, int]              # per-container counts after the run
    duration: float                           # elapsed seconds
    error: Optional[str] = None               # error text when the run failed
class GuildCleanupTracker:
    """Track guild cleanup operations.

    ``history`` keeps only the newest ``max_history`` results, while the
    counters (``total_cleanups``, ``total_items_cleared`` and
    ``cleanup_counts``) keep accumulating for the tracker's lifetime.
    """

    def __init__(self, max_history: int = 1000):
        """Create an empty tracker.

        Args:
            max_history: Maximum number of results retained in ``history``.
        """
        self.max_history = max_history
        self.history: List[GuildCleanupResult] = []
        self.cleanup_counts: Dict[int, int] = {}  # guild_id -> count
        # Counted separately from len(history) so the figure does not
        # saturate once the history starts being trimmed.
        self.total_cleanups = 0
        self.total_items_cleared = 0
        self.last_cleanup: Optional[datetime] = None

    def record_cleanup(self, result: "GuildCleanupResult") -> None:
        """Record a cleanup operation and update the running totals."""
        self.history.append(result)
        # Drop the oldest entries so at most max_history results remain.
        overflow = len(self.history) - self.max_history
        if overflow > 0:
            del self.history[:overflow]

        self.total_cleanups += 1
        self.cleanup_counts[result.guild_id] = (
            self.cleanup_counts.get(result.guild_id, 0) + 1
        )
        self.total_items_cleared += result.items_cleared
        self.last_cleanup = result.timestamp

    def get_stats(self) -> Dict[str, Any]:
        """Get cleanup statistics.

        Returns:
            A dict with lifetime totals, the ISO timestamp of the last
            cleanup (or None) and a summary of the five newest results.
        """
        recent = [
            {
                "guild_id": r.guild_id,
                "timestamp": r.timestamp.isoformat(),
                "strategy": r.strategy.value,
                "items_cleared": r.items_cleared,
                "categories": [c.value for c in r.categories_cleared]
            }
            for r in self.history[-5:]  # Last 5 cleanups
        ]
        return {
            "total_cleanups": self.total_cleanups,
            "total_items_cleared": self.total_items_cleared,
            "guilds_cleaned": len(self.cleanup_counts),
            "last_cleanup": (
                self.last_cleanup.isoformat()
                if self.last_cleanup
                else None
            ),
            "recent_cleanups": recent
        }
class GuildCleaner:
    """Handles cleanup of guild-specific queue items.

    The configured ``strategy`` selects which containers are purged and
    whether a grace period applies. All containers passed to the cleanup
    methods are modified in place, and every run (successful or not) is
    recorded on ``self.tracker``.
    """

    def __init__(
        self,
        strategy: GuildCleanupStrategy = GuildCleanupStrategy.GRACEFUL,
        config: Optional[GuildCleanupConfig] = None
    ):
        """Create a cleaner.

        Args:
            strategy: Cleanup strategy applied by ``clear_guild_items``.
            config: Cleanup options; a default config is used when omitted.
        """
        self.strategy = strategy
        self.config = config or GuildCleanupConfig()
        self.tracker = GuildCleanupTracker()

    async def clear_guild_items(
        self,
        guild_id: int,
        queue: List[QueueItem],
        processing: Dict[str, QueueItem],
        completed: Dict[str, QueueItem],
        failed: Dict[str, QueueItem],
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]]
    ) -> Tuple[int, Dict[str, int]]:
        """Clear queue items for a specific guild using the configured strategy.

        Args:
            guild_id: Guild whose items are removed.
            queue: Pending items (filtered in place).
            processing: URL -> item map of in-progress items.
            completed: URL -> item map of completed items.
            failed: URL -> item map of failed items.
            guild_queues: guild id -> tracked URLs.
            channel_queues: channel id -> tracked URLs.

        Returns:
            ``(cleared_count, initial_counts)`` where ``cleared_count`` is the
            number of removed items plus the number of guild tracking URLs
            dropped, and ``initial_counts`` holds the per-container counts
            taken before the cleanup.

        Raises:
            Exception: Any error raised during cleanup is logged, recorded on
                the tracker and re-raised.
        """
        start_time = datetime.utcnow()
        cleared_categories: Set[CleanupCategory] = set()

        try:
            initial_counts = self._get_item_counts(
                guild_id, queue, processing, completed, failed
            )

            # Pick the strategy implementation; anything that is not FULL or
            # SELECTIVE falls back to GRACEFUL, as before.
            if self.strategy == GuildCleanupStrategy.FULL:
                run_cleanup = self._full_cleanup
            elif self.strategy == GuildCleanupStrategy.SELECTIVE:
                run_cleanup = self._selective_cleanup
            else:  # GRACEFUL
                run_cleanup = self._graceful_cleanup

            cleared_count = await run_cleanup(
                guild_id,
                queue,
                processing,
                completed,
                failed,
                guild_queues,
                channel_queues,
                cleared_categories
            )

            final_counts = self._get_item_counts(
                guild_id, queue, processing, completed, failed
            )

            # Record cleanup result
            duration = (datetime.utcnow() - start_time).total_seconds()
            self.tracker.record_cleanup(GuildCleanupResult(
                guild_id=guild_id,
                timestamp=datetime.utcnow(),
                strategy=self.strategy,
                items_cleared=cleared_count,
                categories_cleared=cleared_categories,
                initial_counts=initial_counts,
                final_counts=final_counts,
                duration=duration
            ))

            logger.info(self.format_guild_cleanup_report(
                guild_id,
                initial_counts,
                final_counts,
                duration
            ))
            return cleared_count, initial_counts

        except Exception as e:
            logger.error(f"Error clearing guild {guild_id} queue: {e}")
            self.tracker.record_cleanup(GuildCleanupResult(
                guild_id=guild_id,
                timestamp=datetime.utcnow(),
                strategy=self.strategy,
                items_cleared=0,
                categories_cleared=set(),
                initial_counts={},
                final_counts={},
                # Report how long the failed attempt actually ran.
                duration=(datetime.utcnow() - start_time).total_seconds(),
                error=str(e)
            ))
            raise

    @staticmethod
    def _remove_from_queue(queue: List[QueueItem], should_remove) -> int:
        """Drop matching items from ``queue`` in place; return how many were removed."""
        before = len(queue)
        queue[:] = [item for item in queue if not should_remove(item)]
        # Count what was removed, not what is left in the queue.
        return before - len(queue)

    async def _full_cleanup(
        self,
        guild_id: int,
        queue: List[QueueItem],
        processing: Dict[str, QueueItem],
        completed: Dict[str, QueueItem],
        failed: Dict[str, QueueItem],
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]],
        cleared_categories: Set[CleanupCategory]
    ) -> int:
        """Remove the guild's items from every container and its tracking data.

        Returns the number of removed items plus dropped guild tracking URLs;
        processed categories are added to ``cleared_categories``.
        """
        # Clear from pending queue
        cleared_count = self._remove_from_queue(
            queue, lambda item: item.guild_id == guild_id
        )
        cleared_categories.add(CleanupCategory.QUEUE)

        # Clear from processing
        cleared_count += await self._clear_from_dict(
            processing, guild_id, 'processing'
        )
        cleared_categories.add(CleanupCategory.PROCESSING)

        # Clear from completed
        cleared_count += await self._clear_from_dict(
            completed, guild_id, 'completed'
        )
        cleared_categories.add(CleanupCategory.COMPLETED)

        # Clear from failed
        cleared_count += await self._clear_from_dict(
            failed, guild_id, 'failed'
        )
        cleared_categories.add(CleanupCategory.FAILED)

        # Clear tracking
        cleared_count += await self._clear_tracking(
            guild_id, guild_queues, channel_queues
        )
        cleared_categories.add(CleanupCategory.TRACKING)

        return cleared_count

    async def _selective_cleanup(
        self,
        guild_id: int,
        queue: List[QueueItem],
        processing: Dict[str, QueueItem],
        completed: Dict[str, QueueItem],
        failed: Dict[str, QueueItem],
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]],
        cleared_categories: Set[CleanupCategory]
    ) -> int:
        """Remove the guild's items only from the categories enabled in the config.

        Completed/failed items are additionally skipped when the matching
        ``preserve_*`` flag is set.
        """
        cleared_count = 0
        enabled = self.config.categories

        if CleanupCategory.QUEUE in enabled:
            cleared_count += self._remove_from_queue(
                queue, lambda item: item.guild_id == guild_id
            )
            cleared_categories.add(CleanupCategory.QUEUE)

        if CleanupCategory.PROCESSING in enabled:
            cleared_count += await self._clear_from_dict(
                processing, guild_id, 'processing'
            )
            cleared_categories.add(CleanupCategory.PROCESSING)

        if (
            CleanupCategory.COMPLETED in enabled and
            not self.config.preserve_completed
        ):
            cleared_count += await self._clear_from_dict(
                completed, guild_id, 'completed'
            )
            cleared_categories.add(CleanupCategory.COMPLETED)

        if (
            CleanupCategory.FAILED in enabled and
            not self.config.preserve_failed
        ):
            cleared_count += await self._clear_from_dict(
                failed, guild_id, 'failed'
            )
            cleared_categories.add(CleanupCategory.FAILED)

        if CleanupCategory.TRACKING in enabled:
            cleared_count += await self._clear_tracking(
                guild_id, guild_queues, channel_queues
            )
            cleared_categories.add(CleanupCategory.TRACKING)

        return cleared_count

    async def _graceful_cleanup(
        self,
        guild_id: int,
        queue: List[QueueItem],
        processing: Dict[str, QueueItem],
        completed: Dict[str, QueueItem],
        failed: Dict[str, QueueItem],
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]],
        cleared_categories: Set[CleanupCategory]
    ) -> int:
        """Remove the guild's queued/processing items older than the grace period.

        Completed and failed items are removed regardless of age unless the
        matching ``preserve_*`` flag is set.
        """
        cutoff_time = datetime.utcnow().timestamp() - self.config.grace_period

        def expired(item: QueueItem) -> bool:
            # Only this guild's items that were added before the cutoff qualify.
            return (
                item.guild_id == guild_id and
                item.added_at.timestamp() < cutoff_time
            )

        # Clear queue items beyond grace period
        cleared_count = self._remove_from_queue(queue, expired)
        cleared_categories.add(CleanupCategory.QUEUE)

        # Clear processing items beyond grace period (snapshot the keys first
        # because the dict is mutated while we walk it).
        for url in [u for u, item in processing.items() if expired(item)]:
            processing.pop(url)
            cleared_count += 1
        cleared_categories.add(CleanupCategory.PROCESSING)

        # Clear completed and failed based on config
        if not self.config.preserve_completed:
            cleared_count += await self._clear_from_dict(
                completed, guild_id, 'completed'
            )
            cleared_categories.add(CleanupCategory.COMPLETED)

        if not self.config.preserve_failed:
            cleared_count += await self._clear_from_dict(
                failed, guild_id, 'failed'
            )
            cleared_categories.add(CleanupCategory.FAILED)

        # Clear tracking
        # NOTE(review): this drops ALL tracking for the guild even though items
        # still inside the grace period stay in queue/processing — confirm
        # that is intended.
        cleared_count += await self._clear_tracking(
            guild_id, guild_queues, channel_queues
        )
        cleared_categories.add(CleanupCategory.TRACKING)

        return cleared_count

    async def _clear_from_dict(
        self,
        items_dict: Dict[str, QueueItem],
        guild_id: int,
        category: str
    ) -> int:
        """Remove the guild's items from ``items_dict``; return how many were removed.

        ``category`` is only used in the debug log message.
        """
        cleared = 0
        batch_count = 0

        for url in list(items_dict.keys()):
            item = items_dict.get(url)
            # The entry may have vanished while we yielded to the event loop.
            if item is None or item.guild_id != guild_id:
                continue
            items_dict.pop(url)
            cleared += 1
            batch_count += 1

            # Process in batches
            if batch_count >= self.config.batch_size:
                await asyncio.sleep(0)  # Yield to event loop
                batch_count = 0

        logger.debug(f"Cleared {cleared} {category} items for guild {guild_id}")
        return cleared

    async def _clear_tracking(
        self,
        guild_id: int,
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]]
    ) -> int:
        """Drop the guild's tracking entry and its URLs from channel tracking.

        Returns the number of URLs that were tracked for the guild.
        """
        # Clear guild tracking
        guild_urls = guild_queues.pop(guild_id, set())
        cleared = len(guild_urls)

        # Clear channel tracking
        await self._clear_channel_tracking(channel_queues, guild_urls)
        return cleared

    async def _clear_channel_tracking(
        self,
        channel_queues: Dict[int, Set[str]],
        guild_urls: Set[str]
    ) -> None:
        """Remove ``guild_urls`` from every channel and drop channels left empty."""
        batch_count = 0

        for channel_id in list(channel_queues.keys()):
            # The channel may have been removed while we yielded.
            if channel_id not in channel_queues:
                continue

            remaining = {
                url for url in channel_queues[channel_id]
                if url not in guild_urls
            }
            if remaining:
                channel_queues[channel_id] = remaining
            else:
                channel_queues.pop(channel_id)

            batch_count += 1
            if batch_count >= self.config.batch_size:
                await asyncio.sleep(0)  # Yield to event loop
                batch_count = 0

    def _get_item_counts(
        self,
        guild_id: int,
        queue: List[QueueItem],
        processing: Dict[str, QueueItem],
        completed: Dict[str, QueueItem],
        failed: Dict[str, QueueItem]
    ) -> Dict[str, int]:
        """Count the guild's items in each container."""
        def count(items) -> int:
            return sum(1 for item in items if item.guild_id == guild_id)

        return {
            'queue': count(queue),
            'processing': count(processing.values()),
            'completed': count(completed.values()),
            'failed': count(failed.values())
        }

    def format_guild_cleanup_report(
        self,
        guild_id: int,
        initial_counts: Dict[str, int],
        final_counts: Dict[str, int],
        duration: float
    ) -> str:
        """Format a multi-line before/after report for one guild cleanup."""
        total_cleared = sum(initial_counts.values()) - sum(final_counts.values())
        return (
            f"Guild {guild_id} Cleanup Results:\n"
            f"Strategy: {self.strategy.value}\n"
            f"Duration: {duration:.2f}s\n"
            f"Items:\n"
            f"- Queue: {initial_counts['queue']} -> {final_counts['queue']}\n"
            f"- Processing: {initial_counts['processing']} -> {final_counts['processing']}\n"
            f"- Completed: {initial_counts['completed']} -> {final_counts['completed']}\n"
            f"- Failed: {initial_counts['failed']} -> {final_counts['failed']}\n"
            f"Total cleared: {total_cleared} items"
        )

    def get_cleaner_stats(self) -> Dict[str, Any]:
        """Return the strategy, the active configuration and tracker statistics."""
        return {
            "strategy": self.strategy.value,
            "config": {
                "categories": [c.value for c in self.config.categories],
                "grace_period": self.config.grace_period,
                "preserve_completed": self.config.preserve_completed,
                "preserve_failed": self.config.preserve_failed,
                "batch_size": self.config.batch_size
            },
            "tracker": self.tracker.get_stats()
        }

View File

@@ -0,0 +1,336 @@
"""Module for cleaning historical queue items"""
import logging
from enum import Enum
from dataclasses import dataclass, field
from typing import Dict, Optional, List, Any, Set
from datetime import datetime, timedelta
from ..models import QueueItem
logger = logging.getLogger("HistoryCleaner")
class CleanupStrategy(Enum):
    """How eagerly historical items are removed."""

    # Shorter retention; the hybrid policy removes items matching age OR size.
    AGGRESSIVE = "aggressive"
    # Longer retention; the hybrid policy removes items matching age AND size.
    CONSERVATIVE = "conservative"
    # Default retention; the hybrid policy removes items by age only.
    BALANCED = "balanced"
class CleanupPolicy(Enum):
    """Criteria used to pick historical items for removal."""

    AGE = "age"        # select items by age
    SIZE = "size"      # select items by estimated size
    HYBRID = "hybrid"  # combine the age and size selections
@dataclass
class CleanupThresholds:
    """Limits that steer history cleanup."""

    # Base retention age in seconds (12 hours).
    max_history_age: int = 12 * 60 * 60
    max_completed_items: int = 10_000
    max_failed_items: int = 5_000
    # Lower bound, in seconds, for the computed retention age (1 hour).
    min_retention_time: int = 60 * 60
    # Cumulative estimated size in bytes (100MB) used by size-based selection.
    size_threshold: int = 100 * 1024 * 1024
@dataclass
class CleanupResult:
    """Outcome of one history cleanup run, as stored by the tracker."""

    timestamp: datetime        # when the result was recorded
    items_cleaned: int         # number of items removed
    space_freed: int           # estimated bytes released
    duration: float            # elapsed seconds
    strategy: CleanupStrategy  # strategy in effect for the run
    policy: CleanupPolicy      # policy in effect for the run
    # Free-form extras; each result gets its own dict.
    details: Dict[str, Any] = field(default_factory=dict)
class CleanupTracker:
    """Track history cleanup operations.

    ``history`` keeps only the newest ``max_history`` results, while the
    counters (``total_cleanups``, ``total_items_cleaned`` and
    ``total_space_freed``) keep accumulating for the tracker's lifetime.
    """

    def __init__(self, max_history: int = 1000):
        """Create an empty tracker.

        Args:
            max_history: Maximum number of results retained in ``history``.
        """
        self.max_history = max_history
        self.history: List[CleanupResult] = []
        # Counted separately from len(history) so the figure does not
        # saturate once the history starts being trimmed.
        self.total_cleanups = 0
        self.total_items_cleaned = 0
        self.total_space_freed = 0
        self.last_cleanup: Optional[datetime] = None

    def record_cleanup(self, result: "CleanupResult") -> None:
        """Record a cleanup operation and update the running totals."""
        self.history.append(result)
        # Drop the oldest entries so at most max_history results remain.
        overflow = len(self.history) - self.max_history
        if overflow > 0:
            del self.history[:overflow]

        self.total_cleanups += 1
        self.total_items_cleaned += result.items_cleaned
        self.total_space_freed += result.space_freed
        self.last_cleanup = result.timestamp

    def get_stats(self) -> Dict[str, Any]:
        """Get cleanup statistics.

        Returns:
            A dict with lifetime totals, the ISO timestamp of the last
            cleanup (or None) and a summary of the five newest results.
        """
        recent = [
            {
                "timestamp": r.timestamp.isoformat(),
                "items_cleaned": r.items_cleaned,
                "space_freed": r.space_freed,
                "strategy": r.strategy.value,
                "policy": r.policy.value
            }
            for r in self.history[-5:]  # Last 5 cleanups
        ]
        return {
            "total_cleanups": self.total_cleanups,
            "total_items_cleaned": self.total_items_cleaned,
            "total_space_freed": self.total_space_freed,
            "last_cleanup": (
                self.last_cleanup.isoformat()
                if self.last_cleanup
                else None
            ),
            "recent_cleanups": recent
        }
class HistoryCleaner:
    """Handles cleanup of historical (completed/failed) queue items.

    ``policy`` decides how candidates are selected (age, size or both) and
    ``strategy`` tunes how eagerly they are removed. Item maps are modified
    in place and each run is recorded on ``self.tracker``.
    """

    def __init__(
        self,
        strategy: CleanupStrategy = CleanupStrategy.BALANCED,
        policy: CleanupPolicy = CleanupPolicy.HYBRID,
        thresholds: Optional[CleanupThresholds] = None
    ):
        """Create a cleaner.

        Args:
            strategy: Cleanup strategy.
            policy: Selection policy.
            thresholds: Cleanup limits; defaults are used when omitted.
        """
        self.strategy = strategy
        self.policy = policy
        self.thresholds = thresholds or CleanupThresholds()
        self.tracker = CleanupTracker()

    def _normalize_datetime(self, dt_value: Any) -> datetime:
        """Coerce ``dt_value`` to a datetime.

        Datetimes are returned unchanged and ISO-format strings are parsed;
        anything else (including unparsable strings) yields the current UTC
        time.
        """
        if isinstance(dt_value, datetime):
            return dt_value
        if isinstance(dt_value, str):
            try:
                return datetime.fromisoformat(dt_value)
            except (ValueError, TypeError):
                pass
        return datetime.utcnow()

    async def cleanup_completed(
        self,
        completed: Dict[str, QueueItem],
        cleanup_cutoff: datetime
    ) -> int:
        """Clean up completed items.

        Args:
            completed: URL -> item map, modified in place.
            cleanup_cutoff: Items added before this time count as old.

        Returns:
            Number of items removed, or 0 if the cleanup failed.
        """
        return self._cleanup_items(completed, cleanup_cutoff, "completed")

    async def cleanup_failed(
        self,
        failed: Dict[str, QueueItem],
        cleanup_cutoff: datetime
    ) -> int:
        """Clean up failed items.

        Args:
            failed: URL -> item map, modified in place.
            cleanup_cutoff: Items added before this time count as old.

        Returns:
            Number of items removed, or 0 if the cleanup failed.
        """
        return self._cleanup_items(failed, cleanup_cutoff, "failed")

    def _cleanup_items(
        self,
        items: Dict[str, QueueItem],
        cleanup_cutoff: datetime,
        cleanup_type: str
    ) -> int:
        """Select and remove items from ``items``, then record the result.

        ``cleanup_type`` ("completed" or "failed") labels log messages and the
        recorded result.
        """
        start_time = datetime.utcnow()
        items_cleaned = 0
        space_freed = 0
        missing = object()

        try:
            for url in self._select_items(items, cleanup_cutoff):
                # pop with a default so an already-removed entry cannot raise
                # and abort the whole cleanup.
                item = items.pop(url, missing)
                if item is missing:
                    continue
                try:
                    space_freed += self._estimate_item_size(item)
                except Exception as e:
                    # The item is still removed; only its size is not counted.
                    logger.error(f"Error cleaning {cleanup_type} item {url}: {e}")
                items_cleaned += 1

            # Record cleanup
            self._record_cleanup_result(
                items_cleaned,
                space_freed,
                start_time,
                cleanup_type
            )
            logger.debug(f"Cleaned {items_cleaned} {cleanup_type} items")
            return items_cleaned

        except Exception as e:
            logger.error(f"Error during {cleanup_type} items cleanup: {e}")
            return 0

    def _select_items(
        self,
        items: Dict[str, QueueItem],
        cutoff: datetime
    ) -> Set[str]:
        """Return the URLs to remove according to the configured policy."""
        if self.policy == CleanupPolicy.SIZE:
            return self._get_items_by_size(items)
        if self.policy == CleanupPolicy.HYBRID:
            return self._get_items_hybrid(items, cutoff)
        return self._get_items_by_age(items, cutoff)  # AGE policy

    def _get_items_by_age(
        self,
        items: Dict[str, QueueItem],
        cutoff: datetime
    ) -> Set[str]:
        """Get items to clean based on age.

        Side effect: each item's ``added_at`` is replaced by its normalized
        datetime value.
        """
        to_clean = set()
        for url, item in items.items():
            item.added_at = self._normalize_datetime(item.added_at)
            if item.added_at < cutoff:
                to_clean.add(url)
        return to_clean

    def _get_items_by_size(self, items: Dict[str, QueueItem]) -> Set[str]:
        """Get items to clean based on size.

        Items are walked from largest to smallest estimate; every item
        visited once the running total exceeds ``size_threshold`` is selected.
        """
        # NOTE(review): this keeps the largest items and selects the smaller
        # ones that follow them — confirm that is the intended order.
        to_clean = set()
        total_size = 0

        sorted_items = sorted(
            items.items(),
            key=lambda x: self._estimate_item_size(x[1]),
            reverse=True
        )
        for url, item in sorted_items:
            total_size += self._estimate_item_size(item)
            if total_size > self.thresholds.size_threshold:
                to_clean.add(url)
        return to_clean

    def _get_items_hybrid(
        self,
        items: Dict[str, QueueItem],
        cutoff: datetime
    ) -> Set[str]:
        """Combine the age and size selections according to the strategy."""
        by_age = self._get_items_by_age(items, cutoff)
        by_size = self._get_items_by_size(items)

        if self.strategy == CleanupStrategy.AGGRESSIVE:
            return by_age.union(by_size)
        if self.strategy == CleanupStrategy.CONSERVATIVE:
            return by_age.intersection(by_size)
        return by_age  # BALANCED

    def _estimate_item_size(self, item: QueueItem) -> int:
        """Estimate size of an item in bytes (1KB per attempt)."""
        # This could be enhanced with actual file size tracking
        base_size = 1024  # 1KB base size
        return base_size * (item.retry_count + 1)

    def _record_cleanup_result(
        self,
        items_cleaned: int,
        space_freed: int,
        start_time: datetime,
        cleanup_type: str
    ) -> None:
        """Store the outcome of a cleanup run on the tracker."""
        duration = (datetime.utcnow() - start_time).total_seconds()
        self.tracker.record_cleanup(CleanupResult(
            timestamp=datetime.utcnow(),
            items_cleaned=items_cleaned,
            space_freed=space_freed,
            duration=duration,
            strategy=self.strategy,
            policy=self.policy,
            details={"type": cleanup_type}
        ))

    def get_cleanup_cutoff(self) -> datetime:
        """Get the cutoff time for cleanup.

        The base age is halved for AGGRESSIVE and doubled for CONSERVATIVE,
        and is never shorter than ``min_retention_time``.
        """
        if self.strategy == CleanupStrategy.AGGRESSIVE:
            age = self.thresholds.max_history_age // 2
        elif self.strategy == CleanupStrategy.CONSERVATIVE:
            age = self.thresholds.max_history_age * 2
        else:  # BALANCED
            age = self.thresholds.max_history_age

        retention = max(age, self.thresholds.min_retention_time)
        return datetime.utcnow() - timedelta(seconds=retention)

    def format_cleanup_report(
        self,
        initial_completed: int,
        final_completed: int,
        initial_failed: int,
        final_failed: int
    ) -> str:
        """Format a cleanup report.

        The space-freed and total-cleanups figures come from the tracker's
        running statistics, not only from the latest run.
        """
        stats = self.tracker.get_stats()
        total_cleaned = (
            (initial_completed - final_completed) +
            (initial_failed - final_failed)
        )
        return (
            f"History Cleanup Results:\n"
            f"- Completed items: {initial_completed} -> {final_completed}\n"
            f"- Failed items: {initial_failed} -> {final_failed}\n"
            f"- Total items cleaned: {total_cleaned}\n"
            f"- Space freed: {stats['total_space_freed']} bytes\n"
            f"- Strategy: {self.strategy.value}\n"
            f"- Policy: {self.policy.value}\n"
            f"- Total cleanups: {stats['total_cleanups']}"
        )

    def get_cleaner_stats(self) -> Dict[str, Any]:
        """Return the strategy, policy, thresholds and tracker statistics."""
        return {
            "strategy": self.strategy.value,
            "policy": self.policy.value,
            "thresholds": {
                "max_history_age": self.thresholds.max_history_age,
                "max_completed_items": self.thresholds.max_completed_items,
                "max_failed_items": self.thresholds.max_failed_items,
                "min_retention_time": self.thresholds.min_retention_time,
                "size_threshold": self.thresholds.size_threshold
            },
            "tracker": self.tracker.get_stats()
        }

View File

@@ -0,0 +1,452 @@
"""Module for cleaning queue tracking data"""
import logging
import asyncio
from enum import Enum
from dataclasses import dataclass, field
from typing import Dict, List, Set, Tuple, Any, Optional
from datetime import datetime
from ..models import QueueItem
logger = logging.getLogger("TrackingCleaner")
class TrackingCleanupStrategy(Enum):
    """How eagerly stale tracking entries are pruned."""

    # Prune every collection and always validate URL format.
    AGGRESSIVE = "aggressive"
    # Prune only collections whose invalid ratio exceeds the configured limit.
    CONSERVATIVE = "conservative"
    # Prune every collection; URL format validation follows the config.
    BALANCED = "balanced"
class TrackingType(Enum):
    """Kinds of tracking data."""

    GUILD = "guild"      # per-guild tracking
    CHANNEL = "channel"  # per-channel tracking
    URL = "url"          # individual URL entries
@dataclass
class TrackingCleanupConfig:
    """Tunable options for tracking cleanup."""

    # Number of collections handled between yields to the event loop.
    batch_size: int = 100
    retention_period: int = 60 * 60  # 1 hour
    # Whether the BALANCED strategy also checks URL format.
    validate_urls: bool = True
    # Drop guild/channel entries whose URL set becomes empty.
    cleanup_empty: bool = True
    # CONSERVATIVE prunes a collection only above this invalid share (50%).
    max_invalid_ratio: float = 0.5
@dataclass
class TrackingCleanupResult:
    """Outcome of one tracking cleanup run, as stored by the tracker."""

    timestamp: datetime                # when the result was recorded
    strategy: TrackingCleanupStrategy  # strategy that was applied
    items_cleaned: int                 # tracked URLs removed
    guilds_cleaned: int                # guild entries removed
    channels_cleaned: int              # channel entries removed
    duration: float                    # elapsed seconds
    initial_counts: Dict[str, int]     # tracking counts before the run
    final_counts: Dict[str, int]       # tracking counts after the run
    error: Optional[str] = None        # error text when the run failed
class TrackingValidator:
    """Sanity checks for tracking keys and URLs."""

    @staticmethod
    def validate_url(url: str) -> bool:
        """Return True when ``url`` is a non-empty string containing "://"."""
        try:
            if not url or not isinstance(url, str):
                return False
            return "://" in url
        except Exception:
            return False

    @staticmethod
    def validate_id(id_value: int) -> bool:
        """Return True when ``id_value`` is an int greater than zero."""
        try:
            if not isinstance(id_value, int):
                return False
            return bool(id_value > 0)
        except Exception:
            return False
class TrackingCleanupTracker:
    """Track tracking-cleanup operations.

    ``history`` keeps only the newest ``max_history`` results, while the
    counters (``total_cleanups`` and the ``total_*_cleaned`` figures) keep
    accumulating for the tracker's lifetime.
    """

    def __init__(self, max_history: int = 1000):
        """Create an empty tracker.

        Args:
            max_history: Maximum number of results retained in ``history``.
        """
        self.max_history = max_history
        self.history: List[TrackingCleanupResult] = []
        # Counted separately from len(history) so the figure does not
        # saturate once the history starts being trimmed.
        self.total_cleanups = 0
        self.total_items_cleaned = 0
        self.total_guilds_cleaned = 0
        self.total_channels_cleaned = 0
        self.last_cleanup: Optional[datetime] = None

    def record_cleanup(self, result: "TrackingCleanupResult") -> None:
        """Record a cleanup operation and update the running totals."""
        self.history.append(result)
        # Drop the oldest entries so at most max_history results remain.
        overflow = len(self.history) - self.max_history
        if overflow > 0:
            del self.history[:overflow]

        self.total_cleanups += 1
        self.total_items_cleaned += result.items_cleaned
        self.total_guilds_cleaned += result.guilds_cleaned
        self.total_channels_cleaned += result.channels_cleaned
        self.last_cleanup = result.timestamp

    def get_stats(self) -> Dict[str, Any]:
        """Get cleanup statistics.

        Returns:
            A dict with lifetime totals, the ISO timestamp of the last
            cleanup (or None) and a summary of the five newest results.
        """
        recent = [
            {
                "timestamp": r.timestamp.isoformat(),
                "strategy": r.strategy.value,
                "items_cleaned": r.items_cleaned,
                "guilds_cleaned": r.guilds_cleaned,
                "channels_cleaned": r.channels_cleaned,
                "duration": r.duration
            }
            for r in self.history[-5:]  # Last 5 cleanups
        ]
        return {
            "total_cleanups": self.total_cleanups,
            "total_items_cleaned": self.total_items_cleaned,
            "total_guilds_cleaned": self.total_guilds_cleaned,
            "total_channels_cleaned": self.total_channels_cleaned,
            "last_cleanup": (
                self.last_cleanup.isoformat()
                if self.last_cleanup
                else None
            ),
            "recent_cleanups": recent
        }
class TrackingCleaner:
    """Handles cleanup of queue tracking data.

    Tracking maps (guild id -> URLs, channel id -> URLs) are pruned in place
    so that they only reference URLs that are still queued or processing.
    Every run (successful or not) is recorded on ``self.tracker``.
    """

    def __init__(
        self,
        strategy: TrackingCleanupStrategy = TrackingCleanupStrategy.BALANCED,
        config: Optional[TrackingCleanupConfig] = None
    ):
        """Create a cleaner.

        Args:
            strategy: Cleanup strategy applied by ``cleanup_tracking``.
            config: Cleanup options; a default config is used when omitted.
        """
        self.strategy = strategy
        self.config = config or TrackingCleanupConfig()
        self.tracker = TrackingCleanupTracker()
        self.validator = TrackingValidator()

    async def cleanup_tracking(
        self,
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]],
        queue: List[QueueItem],
        processing: Dict[str, QueueItem]
    ) -> Tuple[int, Dict[str, int]]:
        """Clean up tracking data using the configured strategy.

        Args:
            guild_queues: guild id -> tracked URLs (modified in place).
            channel_queues: channel id -> tracked URLs (modified in place).
            queue: Pending items; their URLs count as valid.
            processing: URL -> item map; its keys count as valid.

        Returns:
            ``(items_cleaned, initial_counts)`` where ``initial_counts`` holds
            the tracking counts taken before the cleanup.

        Raises:
            Exception: Any error raised during cleanup is logged, recorded on
                the tracker and re-raised.
        """
        start_time = datetime.utcnow()

        try:
            initial_counts = self._get_tracking_counts(
                guild_queues, channel_queues
            )
            valid_urls = self._get_valid_urls(queue, processing)

            # Anything that is not AGGRESSIVE or CONSERVATIVE is BALANCED.
            if self.strategy == TrackingCleanupStrategy.AGGRESSIVE:
                run_cleanup = self._aggressive_cleanup
            elif self.strategy == TrackingCleanupStrategy.CONSERVATIVE:
                run_cleanup = self._conservative_cleanup
            else:  # BALANCED
                run_cleanup = self._balanced_cleanup

            items_cleaned, guilds_cleaned, channels_cleaned = await run_cleanup(
                guild_queues,
                channel_queues,
                valid_urls
            )

            final_counts = self._get_tracking_counts(
                guild_queues, channel_queues
            )

            # Record cleanup result
            duration = (datetime.utcnow() - start_time).total_seconds()
            self.tracker.record_cleanup(TrackingCleanupResult(
                timestamp=datetime.utcnow(),
                strategy=self.strategy,
                items_cleaned=items_cleaned,
                guilds_cleaned=guilds_cleaned,
                channels_cleaned=channels_cleaned,
                duration=duration,
                initial_counts=initial_counts,
                final_counts=final_counts
            ))

            logger.info(self.format_tracking_cleanup_report(
                initial_counts,
                final_counts,
                duration
            ))
            return items_cleaned, initial_counts

        except Exception as e:
            logger.error(f"Error cleaning tracking data: {e}")
            self.tracker.record_cleanup(TrackingCleanupResult(
                timestamp=datetime.utcnow(),
                strategy=self.strategy,
                items_cleaned=0,
                guilds_cleaned=0,
                channels_cleaned=0,
                # Report how long the failed attempt actually ran.
                duration=(datetime.utcnow() - start_time).total_seconds(),
                initial_counts={},
                final_counts={},
                error=str(e)
            ))
            raise

    async def _aggressive_cleanup(
        self,
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]],
        valid_urls: Set[str]
    ) -> Tuple[int, int, int]:
        """Prune every guild and channel, always validating URL format.

        Returns ``(items_cleaned, guilds_cleaned, channels_cleaned)``.
        """
        guild_items, guilds_cleaned = await self._cleanup_guild_tracking(
            guild_queues,
            valid_urls,
            validate_all=True
        )
        channel_items, channels_cleaned = await self._cleanup_channel_tracking(
            channel_queues,
            valid_urls,
            validate_all=True
        )
        return guild_items + channel_items, guilds_cleaned, channels_cleaned

    @staticmethod
    def _invalid_ratio(urls: Set[str], valid_urls: Set[str]) -> float:
        """Share of ``urls`` that are not in ``valid_urls`` (0 for an empty set)."""
        return len(urls - valid_urls) / len(urls) if urls else 0

    async def _conservative_cleanup(
        self,
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]],
        valid_urls: Set[str]
    ) -> Tuple[int, int, int]:
        """Prune only collections whose invalid ratio exceeds the configured limit.

        Returns ``(items_cleaned, guilds_cleaned, channels_cleaned)``.
        """
        items_cleaned = 0
        guilds_cleaned = 0
        channels_cleaned = 0
        limit = self.config.max_invalid_ratio

        # Only clean if invalid ratio exceeds threshold. The real maps are
        # passed on (restricted to the flagged ids) so that the pruning
        # actually takes effect on them.
        flagged_guilds = [
            guild_id for guild_id, urls in guild_queues.items()
            if self._invalid_ratio(urls, valid_urls) > limit
        ]
        if flagged_guilds:
            cleaned = await self._cleanup_guild_tracking(
                guild_queues,
                valid_urls,
                validate_all=False,
                guild_ids=flagged_guilds
            )
            items_cleaned += cleaned[0]
            guilds_cleaned += cleaned[1]

        flagged_channels = [
            channel_id for channel_id, urls in channel_queues.items()
            if self._invalid_ratio(urls, valid_urls) > limit
        ]
        if flagged_channels:
            cleaned = await self._cleanup_channel_tracking(
                channel_queues,
                valid_urls,
                validate_all=False,
                channel_ids=flagged_channels
            )
            items_cleaned += cleaned[0]
            channels_cleaned += cleaned[1]

        return items_cleaned, guilds_cleaned, channels_cleaned

    async def _balanced_cleanup(
        self,
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]],
        valid_urls: Set[str]
    ) -> Tuple[int, int, int]:
        """Prune every guild and channel; URL format checks follow the config.

        Returns ``(items_cleaned, guilds_cleaned, channels_cleaned)``.
        """
        guild_items, guilds_cleaned = await self._cleanup_guild_tracking(
            guild_queues,
            valid_urls,
            validate_all=self.config.validate_urls
        )
        channel_items, channels_cleaned = await self._cleanup_channel_tracking(
            channel_queues,
            valid_urls,
            validate_all=self.config.validate_urls
        )
        return guild_items + channel_items, guilds_cleaned, channels_cleaned

    async def _prune_tracking(
        self,
        tracking: Dict[int, Set[str]],
        valid_urls: Set[str],
        validate_all: bool,
        label: str,
        only_ids: Optional[List[int]] = None
    ) -> Tuple[int, int]:
        """Prune one tracking map in place.

        Entries with an invalid id are dropped outright. For the rest, only
        URLs present in ``valid_urls`` (and, when ``validate_all`` is set,
        passing the URL format check) are kept; entries left empty are
        dropped when ``cleanup_empty`` is enabled.

        Args:
            tracking: id -> URLs map to prune.
            valid_urls: URLs that are still queued or processing.
            validate_all: Also require URLs to pass the format check.
            label: "guild" or "channel", used in the debug log message.
            only_ids: Restrict pruning to these ids; all ids when None.

        Returns:
            ``(urls_removed, entries_removed)``.
        """
        items_cleaned = 0
        entries_cleaned = 0
        batch_count = 0

        target_ids = list(tracking.keys()) if only_ids is None else list(only_ids)
        for entry_id in target_ids:
            # The entry may have vanished while we yielded to the event loop.
            if entry_id not in tracking:
                continue

            if not self.validator.validate_id(entry_id):
                tracking.pop(entry_id)
                entries_cleaned += 1
                continue

            current = tracking[entry_id]
            kept = {
                url for url in current
                if (
                    (not validate_all or self.validator.validate_url(url)) and
                    url in valid_urls
                )
            }
            items_cleaned += len(current) - len(kept)

            if self.config.cleanup_empty and not kept:
                tracking.pop(entry_id)
                entries_cleaned += 1
            else:
                tracking[entry_id] = kept

            batch_count += 1
            if batch_count >= self.config.batch_size:
                await asyncio.sleep(0)  # Yield to event loop
                batch_count = 0

        logger.debug(f"Cleaned {items_cleaned} {label} tracking items")
        return items_cleaned, entries_cleaned

    async def _cleanup_guild_tracking(
        self,
        guild_queues: Dict[int, Set[str]],
        valid_urls: Set[str],
        validate_all: bool,
        guild_ids: Optional[List[int]] = None
    ) -> Tuple[int, int]:
        """Clean up guild tracking data.

        ``guild_ids`` optionally restricts the cleanup to specific guilds.
        Returns ``(items_cleaned, guilds_cleaned)``.
        """
        return await self._prune_tracking(
            guild_queues, valid_urls, validate_all, "guild", guild_ids
        )

    async def _cleanup_channel_tracking(
        self,
        channel_queues: Dict[int, Set[str]],
        valid_urls: Set[str],
        validate_all: bool,
        channel_ids: Optional[List[int]] = None
    ) -> Tuple[int, int]:
        """Clean up channel tracking data.

        ``channel_ids`` optionally restricts the cleanup to specific channels.
        Returns ``(items_cleaned, channels_cleaned)``.
        """
        return await self._prune_tracking(
            channel_queues, valid_urls, validate_all, "channel", channel_ids
        )

    def _get_valid_urls(
        self,
        queue: List[QueueItem],
        processing: Dict[str, QueueItem]
    ) -> Set[str]:
        """Get the set of URLs that are still queued or being processed."""
        valid_urls = {item.url for item in queue}
        valid_urls.update(processing.keys())
        return valid_urls

    def _get_tracking_counts(
        self,
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]]
    ) -> Dict[str, int]:
        """Count tracked guilds, channels and the URLs under each."""
        return {
            'guilds': len(guild_queues),
            'channels': len(channel_queues),
            'guild_urls': sum(len(urls) for urls in guild_queues.values()),
            'channel_urls': sum(len(urls) for urls in channel_queues.values())
        }

    def format_tracking_cleanup_report(
        self,
        initial_counts: Dict[str, int],
        final_counts: Dict[str, int],
        duration: float
    ) -> str:
        """Format a multi-line before/after report for a tracking cleanup."""
        total_cleaned = (
            (initial_counts['guild_urls'] - final_counts['guild_urls']) +
            (initial_counts['channel_urls'] - final_counts['channel_urls'])
        )
        return (
            f"Tracking Cleanup Results:\n"
            f"Strategy: {self.strategy.value}\n"
            f"Duration: {duration:.2f}s\n"
            f"Items:\n"
            f"- Guild Queues: {initial_counts['guilds']} -> {final_counts['guilds']}\n"
            f"- Channel Queues: {initial_counts['channels']} -> {final_counts['channels']}\n"
            f"- Guild URLs: {initial_counts['guild_urls']} -> {final_counts['guild_urls']}\n"
            f"- Channel URLs: {initial_counts['channel_urls']} -> {final_counts['channel_urls']}\n"
            f"Total items cleaned: {total_cleaned}"
        )

    def get_cleaner_stats(self) -> Dict[str, Any]:
        """Return the strategy, the active configuration and tracker statistics."""
        return {
            "strategy": self.strategy.value,
            "config": {
                "batch_size": self.config.batch_size,
                "retention_period": self.config.retention_period,
                "validate_urls": self.config.validate_urls,
                "cleanup_empty": self.config.cleanup_empty,
                "max_invalid_ratio": self.config.max_invalid_ratio
            },
            "tracker": self.tracker.get_stats()
        }