Mirror of https://github.com/pacnpal/Pac-cogs.git, synced 2025-12-20 02:41:06 -05:00
- Component-based architecture with lifecycle management
- Enhanced error handling and recovery mechanisms
- Comprehensive state management and tracking
- Event-driven architecture with monitoring

Queue Management:
- Multiple processing strategies for different scenarios
- Advanced state management with recovery
- Comprehensive metrics and health monitoring
- Sophisticated cleanup system with multiple strategies

Processing Pipeline:
- Enhanced message handling with validation
- Improved URL extraction and processing
- Better queue management and monitoring
- Advanced cleanup mechanisms

Overall Benefits:
- Better code organization and maintainability
- Improved error handling and recovery
- Enhanced monitoring and reporting
- More robust and reliable system
453 lines
15 KiB
Python
"""Module for cleaning queue tracking data"""
|
|
|
|
import logging
|
|
import asyncio
|
|
from enum import Enum
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict, List, Set, Tuple, Any, Optional
|
|
from datetime import datetime
|
|
|
|
from ..models import QueueItem
|
|
|
|
logger = logging.getLogger("TrackingCleaner")
|
|
|
|


class TrackingCleanupStrategy(Enum):
    """Tracking cleanup strategies"""
    AGGRESSIVE = "aggressive"  # Remove all invalid entries
    CONSERVATIVE = "conservative"  # Keep recent invalid entries
    BALANCED = "balanced"  # Balance between cleanup and retention


class TrackingType(Enum):
    """Types of tracking data"""
    GUILD = "guild"
    CHANNEL = "channel"
    URL = "url"


@dataclass
class TrackingCleanupConfig:
    """Configuration for tracking cleanup"""
    batch_size: int = 100
    retention_period: int = 3600  # 1 hour
    validate_urls: bool = True
    cleanup_empty: bool = True
    max_invalid_ratio: float = 0.5  # 50% invalid threshold


@dataclass
class TrackingCleanupResult:
    """Result of a tracking cleanup operation"""
    timestamp: datetime
    strategy: TrackingCleanupStrategy
    items_cleaned: int
    guilds_cleaned: int
    channels_cleaned: int
    duration: float
    initial_counts: Dict[str, int]
    final_counts: Dict[str, int]
    error: Optional[str] = None


class TrackingValidator:
    """Validates tracking data"""

    @staticmethod
    def validate_url(url: str) -> bool:
        """Validate URL format"""
        try:
            return bool(url and isinstance(url, str) and "://" in url)
        except Exception:
            return False

    @staticmethod
    def validate_id(id_value: int) -> bool:
        """Validate ID format"""
        try:
            return bool(isinstance(id_value, int) and id_value > 0)
        except Exception:
            return False


class TrackingCleanupTracker:
    """Tracks cleanup operations"""

    def __init__(self, max_history: int = 1000):
        self.max_history = max_history
        self.history: List[TrackingCleanupResult] = []
        self.total_items_cleaned = 0
        self.total_guilds_cleaned = 0
        self.total_channels_cleaned = 0
        self.last_cleanup: Optional[datetime] = None

    def record_cleanup(self, result: TrackingCleanupResult) -> None:
        """Record a cleanup operation"""
        self.history.append(result)
        if len(self.history) > self.max_history:
            self.history.pop(0)

        self.total_items_cleaned += result.items_cleaned
        self.total_guilds_cleaned += result.guilds_cleaned
        self.total_channels_cleaned += result.channels_cleaned
        self.last_cleanup = result.timestamp

    def get_stats(self) -> Dict[str, Any]:
        """Get cleanup statistics"""
        return {
            "total_cleanups": len(self.history),
            "total_items_cleaned": self.total_items_cleaned,
            "total_guilds_cleaned": self.total_guilds_cleaned,
            "total_channels_cleaned": self.total_channels_cleaned,
            "last_cleanup": (
                self.last_cleanup.isoformat() if self.last_cleanup else None
            ),
            "recent_cleanups": [
                {
                    "timestamp": r.timestamp.isoformat(),
                    "strategy": r.strategy.value,
                    "items_cleaned": r.items_cleaned,
                    "guilds_cleaned": r.guilds_cleaned,
                    "channels_cleaned": r.channels_cleaned,
                    "duration": r.duration,
                }
                for r in self.history[-5:]  # Last 5 cleanups
            ],
        }
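
    # Illustrative shape of the dict returned by get_stats() (all values below
    # are hypothetical):
    #   {
    #       "total_cleanups": 3,
    #       "total_items_cleaned": 128,
    #       "total_guilds_cleaned": 2,
    #       "total_channels_cleaned": 5,
    #       "last_cleanup": "2025-12-20T07:41:06",
    #       "recent_cleanups": [{"timestamp": "...", "strategy": "balanced", ...}],
    #   }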


class TrackingCleaner:
    """Handles cleanup of queue tracking data"""

    def __init__(
        self,
        strategy: TrackingCleanupStrategy = TrackingCleanupStrategy.BALANCED,
        config: Optional[TrackingCleanupConfig] = None,
    ):
        self.strategy = strategy
        self.config = config or TrackingCleanupConfig()
        self.tracker = TrackingCleanupTracker()
        self.validator = TrackingValidator()

    async def cleanup_tracking(
        self,
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]],
        queue: List[QueueItem],
        processing: Dict[str, QueueItem],
    ) -> Tuple[int, Dict[str, int]]:
        """Clean up tracking data"""
        start_time = datetime.utcnow()

        try:
            # Get initial counts
            initial_counts = self._get_tracking_counts(guild_queues, channel_queues)

            # Get valid URLs
            valid_urls = self._get_valid_urls(queue, processing)

            # Clean tracking data based on strategy
            if self.strategy == TrackingCleanupStrategy.AGGRESSIVE:
                cleaned = await self._aggressive_cleanup(
                    guild_queues, channel_queues, valid_urls
                )
            elif self.strategy == TrackingCleanupStrategy.CONSERVATIVE:
                cleaned = await self._conservative_cleanup(
                    guild_queues, channel_queues, valid_urls
                )
            else:  # BALANCED
                cleaned = await self._balanced_cleanup(
                    guild_queues, channel_queues, valid_urls
                )

            items_cleaned, guilds_cleaned, channels_cleaned = cleaned

            # Get final counts
            final_counts = self._get_tracking_counts(guild_queues, channel_queues)

            # Record cleanup result
            duration = (datetime.utcnow() - start_time).total_seconds()
            result = TrackingCleanupResult(
                timestamp=datetime.utcnow(),
                strategy=self.strategy,
                items_cleaned=items_cleaned,
                guilds_cleaned=guilds_cleaned,
                channels_cleaned=channels_cleaned,
                duration=duration,
                initial_counts=initial_counts,
                final_counts=final_counts,
            )
            self.tracker.record_cleanup(result)

            logger.info(
                self.format_tracking_cleanup_report(initial_counts, final_counts, duration)
            )
            return items_cleaned, initial_counts

        except Exception as e:
            logger.error(f"Error cleaning tracking data: {e}")
            self.tracker.record_cleanup(
                TrackingCleanupResult(
                    timestamp=datetime.utcnow(),
                    strategy=self.strategy,
                    items_cleaned=0,
                    guilds_cleaned=0,
                    channels_cleaned=0,
                    duration=0,
                    initial_counts={},
                    final_counts={},
                    error=str(e),
                )
            )
            raise

    async def _aggressive_cleanup(
        self,
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]],
        valid_urls: Set[str],
    ) -> Tuple[int, int, int]:
        """Perform aggressive cleanup"""
        items_cleaned = 0
        guilds_cleaned = 0
        channels_cleaned = 0

        # Clean guild tracking
        guild_cleaned = await self._cleanup_guild_tracking(
            guild_queues, valid_urls, validate_all=True
        )
        items_cleaned += guild_cleaned[0]
        guilds_cleaned += guild_cleaned[1]

        # Clean channel tracking
        channel_cleaned = await self._cleanup_channel_tracking(
            channel_queues, valid_urls, validate_all=True
        )
        items_cleaned += channel_cleaned[0]
        channels_cleaned += channel_cleaned[1]

        return items_cleaned, guilds_cleaned, channels_cleaned

    async def _conservative_cleanup(
        self,
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]],
        valid_urls: Set[str],
    ) -> Tuple[int, int, int]:
        """Perform conservative cleanup"""
        items_cleaned = 0
        guilds_cleaned = 0
        channels_cleaned = 0

        # Only clean entries whose invalid ratio exceeds the configured threshold
        for guild_id, urls in list(guild_queues.items()):
            invalid_ratio = len(urls - valid_urls) / len(urls) if urls else 0
            if invalid_ratio > self.config.max_invalid_ratio:
                # Clean through a single-entry view, then write the result back
                # so the caller's tracking map is actually updated
                single = {guild_id: urls}
                cleaned = await self._cleanup_guild_tracking(
                    single, valid_urls, validate_all=False
                )
                items_cleaned += cleaned[0]
                guilds_cleaned += cleaned[1]
                if guild_id in single:
                    guild_queues[guild_id] = single[guild_id]
                else:
                    guild_queues.pop(guild_id, None)

        for channel_id, urls in list(channel_queues.items()):
            invalid_ratio = len(urls - valid_urls) / len(urls) if urls else 0
            if invalid_ratio > self.config.max_invalid_ratio:
                single = {channel_id: urls}
                cleaned = await self._cleanup_channel_tracking(
                    single, valid_urls, validate_all=False
                )
                items_cleaned += cleaned[0]
                channels_cleaned += cleaned[1]
                if channel_id in single:
                    channel_queues[channel_id] = single[channel_id]
                else:
                    channel_queues.pop(channel_id, None)

        return items_cleaned, guilds_cleaned, channels_cleaned

    async def _balanced_cleanup(
        self,
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]],
        valid_urls: Set[str],
    ) -> Tuple[int, int, int]:
        """Perform balanced cleanup"""
        items_cleaned = 0
        guilds_cleaned = 0
        channels_cleaned = 0

        # Clean guild tracking with validation
        guild_cleaned = await self._cleanup_guild_tracking(
            guild_queues, valid_urls, validate_all=self.config.validate_urls
        )
        items_cleaned += guild_cleaned[0]
        guilds_cleaned += guild_cleaned[1]

        # Clean channel tracking with validation
        channel_cleaned = await self._cleanup_channel_tracking(
            channel_queues, valid_urls, validate_all=self.config.validate_urls
        )
        items_cleaned += channel_cleaned[0]
        channels_cleaned += channel_cleaned[1]

        return items_cleaned, guilds_cleaned, channels_cleaned

    async def _cleanup_guild_tracking(
        self,
        guild_queues: Dict[int, Set[str]],
        valid_urls: Set[str],
        validate_all: bool,
    ) -> Tuple[int, int]:
        """Clean up guild tracking data"""
        items_cleaned = 0
        guilds_cleaned = 0
        batch_count = 0

        for guild_id in list(guild_queues.keys()):
            if not self.validator.validate_id(guild_id):
                guild_queues.pop(guild_id)
                guilds_cleaned += 1
                continue

            original_size = len(guild_queues[guild_id])
            guild_queues[guild_id] = {
                url
                for url in guild_queues[guild_id]
                if (not validate_all or self.validator.validate_url(url))
                and url in valid_urls
            }
            items_cleaned += original_size - len(guild_queues[guild_id])

            if self.config.cleanup_empty and not guild_queues[guild_id]:
                guild_queues.pop(guild_id)
                guilds_cleaned += 1

            batch_count += 1
            if batch_count >= self.config.batch_size:
                await asyncio.sleep(0)  # Yield to event loop
                batch_count = 0

        logger.debug(f"Cleaned {items_cleaned} guild tracking items")
        return items_cleaned, guilds_cleaned

    async def _cleanup_channel_tracking(
        self,
        channel_queues: Dict[int, Set[str]],
        valid_urls: Set[str],
        validate_all: bool,
    ) -> Tuple[int, int]:
        """Clean up channel tracking data"""
        items_cleaned = 0
        channels_cleaned = 0
        batch_count = 0

        for channel_id in list(channel_queues.keys()):
            if not self.validator.validate_id(channel_id):
                channel_queues.pop(channel_id)
                channels_cleaned += 1
                continue

            original_size = len(channel_queues[channel_id])
            channel_queues[channel_id] = {
                url
                for url in channel_queues[channel_id]
                if (not validate_all or self.validator.validate_url(url))
                and url in valid_urls
            }
            items_cleaned += original_size - len(channel_queues[channel_id])

            if self.config.cleanup_empty and not channel_queues[channel_id]:
                channel_queues.pop(channel_id)
                channels_cleaned += 1

            batch_count += 1
            if batch_count >= self.config.batch_size:
                await asyncio.sleep(0)  # Yield to event loop
                batch_count = 0

        logger.debug(f"Cleaned {items_cleaned} channel tracking items")
        return items_cleaned, channels_cleaned

    def _get_valid_urls(
        self,
        queue: List[QueueItem],
        processing: Dict[str, QueueItem],
    ) -> Set[str]:
        """Get set of valid URLs"""
        valid_urls = {item.url for item in queue}
        valid_urls.update(processing.keys())
        return valid_urls

    def _get_tracking_counts(
        self,
        guild_queues: Dict[int, Set[str]],
        channel_queues: Dict[int, Set[str]],
    ) -> Dict[str, int]:
        """Get tracking data counts"""
        return {
            "guilds": len(guild_queues),
            "channels": len(channel_queues),
            "guild_urls": sum(len(urls) for urls in guild_queues.values()),
            "channel_urls": sum(len(urls) for urls in channel_queues.values()),
        }

    def format_tracking_cleanup_report(
        self,
        initial_counts: Dict[str, int],
        final_counts: Dict[str, int],
        duration: float,
    ) -> str:
        """Format a tracking cleanup report"""
        total_cleaned = (
            (initial_counts["guild_urls"] - final_counts["guild_urls"])
            + (initial_counts["channel_urls"] - final_counts["channel_urls"])
        )

        return (
            f"Tracking Cleanup Results:\n"
            f"Strategy: {self.strategy.value}\n"
            f"Duration: {duration:.2f}s\n"
            f"Items:\n"
            f"- Guild Queues: {initial_counts['guilds']} -> {final_counts['guilds']}\n"
            f"- Channel Queues: {initial_counts['channels']} -> {final_counts['channels']}\n"
            f"- Guild URLs: {initial_counts['guild_urls']} -> {final_counts['guild_urls']}\n"
            f"- Channel URLs: {initial_counts['channel_urls']} -> {final_counts['channel_urls']}\n"
            f"Total items cleaned: {total_cleaned}"
        )
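
    # Example of the report string produced above (numbers are illustrative):
    #
    #   Tracking Cleanup Results:
    #   Strategy: balanced
    #   Duration: 0.04s
    #   Items:
    #   - Guild Queues: 12 -> 10
    #   - Channel Queues: 30 -> 28
    #   - Guild URLs: 140 -> 95
    #   - Channel URLs: 150 -> 101
    #   Total items cleaned: 94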

    def get_cleaner_stats(self) -> Dict[str, Any]:
        """Get comprehensive cleaner statistics"""
        return {
            "strategy": self.strategy.value,
            "config": {
                "batch_size": self.config.batch_size,
                "retention_period": self.config.retention_period,
                "validate_urls": self.config.validate_urls,
                "cleanup_empty": self.config.cleanup_empty,
                "max_invalid_ratio": self.config.max_invalid_ratio,
            },
            "tracker": self.tracker.get_stats(),
        }
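

# -- Usage sketch ------------------------------------------------------------
# Illustrative only: a minimal example of how a queue manager might drive the
# TrackingCleaner above. The guild/channel IDs and URL below are hypothetical
# placeholders, not values used by the cog.
async def _example_cleanup_usage() -> None:
    cleaner = TrackingCleaner(
        strategy=TrackingCleanupStrategy.AGGRESSIVE,
        config=TrackingCleanupConfig(batch_size=50, cleanup_empty=True),
    )
    guild_queues: Dict[int, Set[str]] = {123456789012345678: {"https://example.com/video"}}
    channel_queues: Dict[int, Set[str]] = {234567890123456789: {"https://example.com/video"}}

    # With an empty queue and nothing currently processing, every tracked URL
    # is stale and gets removed by the aggressive strategy.
    items_cleaned, initial_counts = await cleaner.cleanup_tracking(
        guild_queues=guild_queues,
        channel_queues=channel_queues,
        queue=[],        # pending QueueItem objects would normally go here
        processing={},   # url -> QueueItem currently being processed
    )
    logger.debug(
        "Example cleanup removed %d items (initial counts: %s)",
        items_cleaned,
        initial_counts,
    )
    logger.debug("Cleaner stats: %s", cleaner.get_cleaner_stats())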