Core Systems:

Component-based architecture with lifecycle management; enhanced error handling and recovery mechanisms; comprehensive state management and tracking; event-driven architecture with monitoring. Queue Management: multiple processing strategies for different scenarios; advanced state management with recovery; comprehensive metrics and health monitoring; sophisticated cleanup system with multiple strategies. Processing Pipeline: enhanced message handling with validation; improved URL extraction and processing; better queue management and monitoring; advanced cleanup mechanisms. Overall Benefits: better code organization and maintainability; improved error handling and recovery; enhanced monitoring and reporting; more robust and reliable system.
2026-02-05 12:05:12 -05:00 · 2024-11-16 05:01:29 +00:00
parent 537a325807
commit a4ca6e8ea6
47 changed files with 11085 additions and 2110 deletions
--- a/videoarchiver/queue/cleaners/guild_cleaner.py
+++ b/videoarchiver/queue/cleaners/guild_cleaner.py
@@ -0,0 +1,500 @@
+"""Module for cleaning guild-specific queue items"""
+
+import asyncio
+import logging
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+from ..models import QueueItem
+
+logger = logging.getLogger("GuildCleaner")
+
+class GuildCleanupStrategy(Enum):
+    """Strategy GuildCleaner applies when clearing a guild's items.
+
+    clear_guild_items dispatches on this value; any value other than
+    FULL or SELECTIVE is handled as GRACEFUL.
+    """
+    FULL = "full"          # Clear all guild items in every category
+    SELECTIVE = "selective"  # Clear only the categories enabled in the config
+    GRACEFUL = "graceful"   # Clear pending/processing items older than the grace period
+
+class CleanupCategory(Enum):
+    """Groups of guild data that a cleanup pass can clear.
+
+    The cleanup handlers add a member to the set of cleared categories
+    for each group they process.
+    """
+    QUEUE = "queue"            # Pending items in the queue list
+    PROCESSING = "processing"  # Items in the processing dict
+    COMPLETED = "completed"    # Items in the completed dict
+    FAILED = "failed"          # Items in the failed dict
+    TRACKING = "tracking"      # Guild/channel URL tracking sets
+
+@dataclass
+class GuildCleanupConfig:
+    """Settings that control how GuildCleaner clears a guild's data."""
+    # Categories cleared by the SELECTIVE strategy; defaults to every category.
+    categories: Set[CleanupCategory] = field(default_factory=lambda: set(CleanupCategory))
+    # Age in seconds an item must exceed before GRACEFUL cleanup removes it.
+    grace_period: int = 300  # 5 minutes
+    # When True, SELECTIVE and GRACEFUL cleanup leave completed items alone.
+    preserve_completed: bool = False
+    # When True, SELECTIVE and GRACEFUL cleanup leave failed items alone.
+    preserve_failed: bool = False
+    # Number of processed entries between cooperative yields to the event loop.
+    batch_size: int = 100
+
+@dataclass
+class GuildCleanupResult:
+    """Record of one guild cleanup attempt, stored by GuildCleanupTracker."""
+    guild_id: int
+    timestamp: datetime  # When the result was created (naive datetime.utcnow())
+    strategy: GuildCleanupStrategy
+    items_cleared: int  # Count reported by the strategy handler
+    categories_cleared: Set[CleanupCategory]
+    initial_counts: Dict[str, int]  # Per-category guild item counts before cleanup
+    final_counts: Dict[str, int]  # Per-category guild item counts after cleanup
+    duration: float  # Elapsed seconds
+    error: Optional[str] = None  # str() of the exception when the cleanup failed
+
+class GuildCleanupTracker:
+    """Keeps a bounded history of guild cleanups plus running totals."""
+
+    def __init__(self, max_history: int = 1000):
+        """Start with an empty history capped at ``max_history`` entries."""
+        self.max_history = max_history
+        self.history: List[GuildCleanupResult] = []
+        self.cleanup_counts: Dict[int, int] = {}  # guild_id -> count
+        self.total_items_cleared = 0
+        self.last_cleanup: Optional[datetime] = None
+
+    def record_cleanup(self, result: GuildCleanupResult) -> None:
+        """Append ``result`` to the history and update the running totals."""
+        self.history.append(result)
+        # Drop the oldest entry once the cap is exceeded.
+        if len(self.history) > self.max_history:
+            del self.history[0]
+
+        guild = result.guild_id
+        previous = self.cleanup_counts.get(guild, 0)
+        self.cleanup_counts[guild] = previous + 1
+        self.total_items_cleared += result.items_cleared
+        self.last_cleanup = result.timestamp
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Summarise recorded cleanups, including the five most recent."""
+        last = self.last_cleanup.isoformat() if self.last_cleanup else None
+
+        recent = []
+        for entry in self.history[-5:]:  # Last 5 cleanups
+            recent.append({
+                "guild_id": entry.guild_id,
+                "timestamp": entry.timestamp.isoformat(),
+                "strategy": entry.strategy.value,
+                "items_cleared": entry.items_cleared,
+                "categories": [cat.value for cat in entry.categories_cleared]
+            })
+
+        return {
+            "total_cleanups": len(self.history),
+            "total_items_cleared": self.total_items_cleared,
+            "guilds_cleaned": len(self.cleanup_counts),
+            "last_cleanup": last,
+            "recent_cleanups": recent
+        }
+
+class GuildCleaner:
+    """Handles cleanup of guild-specific queue items"""
+
+    def __init__(
+        self,
+        strategy: GuildCleanupStrategy = GuildCleanupStrategy.GRACEFUL,
+        config: Optional[GuildCleanupConfig] = None
+    ):
+        """Create a cleaner bound to a strategy and configuration.
+
+        Args:
+            strategy: Cleanup strategy applied by clear_guild_items.
+            config: Cleanup settings; a default GuildCleanupConfig is
+                created when none is supplied.
+        """
+        self.strategy = strategy
+        self.config = config if config else GuildCleanupConfig()
+        self.tracker = GuildCleanupTracker()
+
+    async def clear_guild_items(
+        self,
+        guild_id: int,
+        queue: List[QueueItem],
+        processing: Dict[str, QueueItem],
+        completed: Dict[str, QueueItem],
+        failed: Dict[str, QueueItem],
+        guild_queues: Dict[int, Set[str]],
+        channel_queues: Dict[int, Set[str]]
+    ) -> Tuple[int, Dict[str, int]]:
+        """Clear a guild's items according to the configured strategy.
+
+        All containers are modified in place, and the outcome (success or
+        failure) is recorded on ``self.tracker``.
+
+        Args:
+            guild_id: Guild whose items are cleared.
+            queue: Pending items.
+            processing: URL -> item currently being processed.
+            completed: URL -> completed item.
+            failed: URL -> failed item.
+            guild_queues: guild id -> tracked URLs.
+            channel_queues: channel id -> tracked URLs.
+
+        Returns:
+            ``(cleared_count, initial_counts)``: the count reported by the
+            strategy handler and the per-category counts taken before the
+            cleanup started.
+
+        Raises:
+            Exception: Any error raised during cleanup is logged, recorded
+                on the tracker and re-raised unchanged.
+        """
+        start_time = datetime.utcnow()
+        cleared_categories: Set[CleanupCategory] = set()
+        # Bound before the try block so the failure record can still report
+        # whatever was measured before the error occurred.
+        initial_counts: Dict[str, int] = {}
+
+        try:
+            initial_counts = self._get_item_counts(
+                guild_id, queue, processing, completed, failed
+            )
+
+            # Pick the handler for the configured strategy.
+            if self.strategy == GuildCleanupStrategy.FULL:
+                cleanup = self._full_cleanup
+            elif self.strategy == GuildCleanupStrategy.SELECTIVE:
+                cleanup = self._selective_cleanup
+            else:  # GRACEFUL
+                cleanup = self._graceful_cleanup
+
+            cleared_count = await cleanup(
+                guild_id,
+                queue,
+                processing,
+                completed,
+                failed,
+                guild_queues,
+                channel_queues,
+                cleared_categories
+            )
+
+            final_counts = self._get_item_counts(
+                guild_id, queue, processing, completed, failed
+            )
+
+            # Record cleanup result
+            duration = (datetime.utcnow() - start_time).total_seconds()
+            self.tracker.record_cleanup(GuildCleanupResult(
+                guild_id=guild_id,
+                timestamp=datetime.utcnow(),
+                strategy=self.strategy,
+                items_cleared=cleared_count,
+                categories_cleared=cleared_categories,
+                initial_counts=initial_counts,
+                final_counts=final_counts,
+                duration=duration
+            ))
+
+            logger.info(self.format_guild_cleanup_report(
+                guild_id,
+                initial_counts,
+                final_counts,
+                duration
+            ))
+            return cleared_count, initial_counts
+
+        except Exception as e:
+            logger.error(f"Error clearing guild {guild_id} queue: {e}")
+            failed_at = datetime.utcnow()
+            self.tracker.record_cleanup(GuildCleanupResult(
+                guild_id=guild_id,
+                timestamp=failed_at,
+                strategy=self.strategy,
+                items_cleared=0,
+                categories_cleared=set(),
+                initial_counts=initial_counts,
+                final_counts={},
+                duration=(failed_at - start_time).total_seconds(),
+                error=str(e)
+            ))
+            raise
+
+    async def _full_cleanup(
+        self,
+        guild_id: int,
+        queue: List[QueueItem],
+        processing: Dict[str, QueueItem],
+        completed: Dict[str, QueueItem],
+        failed: Dict[str, QueueItem],
+        guild_queues: Dict[int, Set[str]],
+        channel_queues: Dict[int, Set[str]],
+        cleared_categories: Set[CleanupCategory]
+    ) -> int:
+        """Remove every item and tracking entry belonging to the guild.
+
+        All containers are modified in place and each processed category
+        is added to ``cleared_categories``.
+
+        Returns:
+            Number of removed queue/dict items plus the number of tracked
+            guild URLs that were dropped.
+        """
+        # Clear from pending queue. The length must be taken before the
+        # list is filtered: afterwards len(queue) is the number of items
+        # that survived, not the number removed.
+        queued_before = len(queue)
+        queue[:] = [item for item in queue if item.guild_id != guild_id]
+        cleared_count = queued_before - len(queue)
+        cleared_categories.add(CleanupCategory.QUEUE)
+
+        # Clear from the processing, completed and failed dicts.
+        for items_dict, label, category in (
+            (processing, 'processing', CleanupCategory.PROCESSING),
+            (completed, 'completed', CleanupCategory.COMPLETED),
+            (failed, 'failed', CleanupCategory.FAILED),
+        ):
+            cleared_count += await self._clear_from_dict(
+                items_dict, guild_id, label
+            )
+            cleared_categories.add(category)
+
+        # Clear tracking
+        cleared_count += await self._clear_tracking(
+            guild_id,
+            guild_queues,
+            channel_queues
+        )
+        cleared_categories.add(CleanupCategory.TRACKING)
+
+        return cleared_count
+
+    async def _selective_cleanup(
+        self,
+        guild_id: int,
+        queue: List[QueueItem],
+        processing: Dict[str, QueueItem],
+        completed: Dict[str, QueueItem],
+        failed: Dict[str, QueueItem],
+        guild_queues: Dict[int, Set[str]],
+        channel_queues: Dict[int, Set[str]],
+        cleared_categories: Set[CleanupCategory]
+    ) -> int:
+        """Clear only the categories enabled in ``self.config.categories``.
+
+        Completed and failed items are additionally skipped when the
+        matching ``preserve_*`` flag is set. Containers are modified in
+        place and each processed category is added to
+        ``cleared_categories``.
+
+        Returns:
+            Number of removed queue/dict items plus the number of tracked
+            guild URLs that were dropped.
+        """
+        wanted = self.config.categories
+        cleared_count = 0
+
+        if CleanupCategory.QUEUE in wanted:
+            # Measure before filtering; len(queue) afterwards would be the
+            # number of surviving items rather than the number removed.
+            queued_before = len(queue)
+            queue[:] = [item for item in queue if item.guild_id != guild_id]
+            cleared_count += queued_before - len(queue)
+            cleared_categories.add(CleanupCategory.QUEUE)
+
+        if CleanupCategory.PROCESSING in wanted:
+            cleared_count += await self._clear_from_dict(
+                processing, guild_id, 'processing'
+            )
+            cleared_categories.add(CleanupCategory.PROCESSING)
+
+        if (
+            CleanupCategory.COMPLETED in wanted and
+            not self.config.preserve_completed
+        ):
+            cleared_count += await self._clear_from_dict(
+                completed, guild_id, 'completed'
+            )
+            cleared_categories.add(CleanupCategory.COMPLETED)
+
+        if (
+            CleanupCategory.FAILED in wanted and
+            not self.config.preserve_failed
+        ):
+            cleared_count += await self._clear_from_dict(
+                failed, guild_id, 'failed'
+            )
+            cleared_categories.add(CleanupCategory.FAILED)
+
+        if CleanupCategory.TRACKING in wanted:
+            cleared_count += await self._clear_tracking(
+                guild_id,
+                guild_queues,
+                channel_queues
+            )
+            cleared_categories.add(CleanupCategory.TRACKING)
+
+        return cleared_count
+
+    async def _graceful_cleanup(
+        self,
+        guild_id: int,
+        queue: List[QueueItem],
+        processing: Dict[str, QueueItem],
+        completed: Dict[str, QueueItem],
+        failed: Dict[str, QueueItem],
+        guild_queues: Dict[int, Set[str]],
+        channel_queues: Dict[int, Set[str]],
+        cleared_categories: Set[CleanupCategory]
+    ) -> int:
+        """Clear the guild's data while sparing recently added work.
+
+        Pending and processing items are removed only when they were added
+        more than ``self.config.grace_period`` seconds ago. Completed and
+        failed items are removed unless the matching ``preserve_*`` flag is
+        set, and tracking data is always cleared. Containers are modified
+        in place.
+
+        Returns:
+            Number of removed queue/dict items plus the number of tracked
+            guild URLs that were dropped.
+        """
+        cutoff_time = datetime.utcnow().timestamp() - self.config.grace_period
+
+        def expired(item) -> bool:
+            """True for this guild's items that are older than the cutoff."""
+            return (
+                item.guild_id == guild_id and
+                item.added_at.timestamp() < cutoff_time
+            )
+
+        # Clear queue items beyond grace period. The length is taken before
+        # filtering; len(queue) afterwards is what remains, not what was
+        # removed.
+        queued_before = len(queue)
+        queue[:] = [item for item in queue if not expired(item)]
+        cleared_count = queued_before - len(queue)
+        cleared_categories.add(CleanupCategory.QUEUE)
+
+        # Clear processing items beyond grace period
+        for url in list(processing):
+            if expired(processing[url]):
+                del processing[url]
+                cleared_count += 1
+        cleared_categories.add(CleanupCategory.PROCESSING)
+
+        # Clear completed and failed based on config
+        if not self.config.preserve_completed:
+            cleared_count += await self._clear_from_dict(
+                completed, guild_id, 'completed'
+            )
+            cleared_categories.add(CleanupCategory.COMPLETED)
+
+        if not self.config.preserve_failed:
+            cleared_count += await self._clear_from_dict(
+                failed, guild_id, 'failed'
+            )
+            cleared_categories.add(CleanupCategory.FAILED)
+
+        # Clear tracking
+        cleared_count += await self._clear_tracking(
+            guild_id,
+            guild_queues,
+            channel_queues
+        )
+        cleared_categories.add(CleanupCategory.TRACKING)
+
+        return cleared_count
+
+    async def _clear_from_dict(
+        self,
+        items_dict: Dict[str, QueueItem],
+        guild_id: int,
+        category: str
+    ) -> int:
+        """Remove the guild's entries from ``items_dict`` in place.
+
+        After every ``self.config.batch_size`` removals the coroutine
+        yields to the event loop so a large cleanup does not block other
+        tasks. This needs the module-level ``asyncio`` import.
+
+        Args:
+            items_dict: URL -> item mapping to clean.
+            guild_id: Guild whose entries are removed.
+            category: Label used only in the debug log message.
+
+        Returns:
+            Number of entries removed.
+        """
+        cleared = 0
+        since_yield = 0
+
+        # Iterate over a snapshot of the keys because the dict is mutated.
+        for url in list(items_dict):
+            item = items_dict.get(url)
+            # Another task may have removed the entry while this coroutine
+            # was suspended at the yield below; skip instead of raising.
+            if item is None or item.guild_id != guild_id:
+                continue
+
+            del items_dict[url]
+            cleared += 1
+            since_yield += 1
+
+            # Process in batches
+            if since_yield >= self.config.batch_size:
+                await asyncio.sleep(0)  # Yield to event loop
+                since_yield = 0
+
+        logger.debug(f"Cleared {cleared} {category} items for guild {guild_id}")
+        return cleared
+
+    async def _clear_tracking(
+        self,
+        guild_id: int,
+        guild_queues: Dict[int, Set[str]],
+        channel_queues: Dict[int, Set[str]]
+    ) -> int:
+        """Drop the guild's tracked URLs from guild and channel tracking.
+
+        Returns:
+            Number of URLs that were tracked for the guild.
+        """
+        # Removing the guild entry hands back its URL set (if any), which is
+        # still needed to prune the per-channel sets.
+        tracked = guild_queues.pop(guild_id, None)
+        if tracked is None:
+            guild_urls = set()
+            cleared = 0
+        else:
+            guild_urls = tracked
+            cleared = len(tracked)
+
+        # Clear channel tracking
+        await self._clear_channel_tracking(channel_queues, guild_urls)
+
+        return cleared
+
+    async def _clear_channel_tracking(
+        self,
+        channel_queues: Dict[int, Set[str]],
+        guild_urls: Set[str]
+    ) -> None:
+        """Remove ``guild_urls`` from every channel's tracked URL set.
+
+        Each channel's set is replaced by a new set without the guild's
+        URLs, and channels left with no URLs are deleted. After every
+        ``self.config.batch_size`` channels the coroutine yields to the
+        event loop; this needs the module-level ``asyncio`` import.
+        """
+        since_yield = 0
+
+        # Iterate over a snapshot of the keys because the dict is mutated.
+        for channel_id in list(channel_queues):
+            urls = channel_queues.get(channel_id)
+            if urls is None:
+                # Removed by another task while this coroutine was
+                # suspended at the yield below.
+                continue
+
+            remaining = set(urls).difference(guild_urls)
+            if remaining:
+                channel_queues[channel_id] = remaining
+            else:
+                del channel_queues[channel_id]
+
+            since_yield += 1
+            if since_yield >= self.config.batch_size:
+                await asyncio.sleep(0)  # Yield to event loop
+                since_yield = 0
+
+    def _get_item_counts(
+        self,
+        guild_id: int,
+        queue: List[QueueItem],
+        processing: Dict[str, QueueItem],
+        completed: Dict[str, QueueItem],
+        failed: Dict[str, QueueItem]
+    ) -> Dict[str, int]:
+        """Count the guild's items in each container.
+
+        Returns:
+            Dict with the keys 'queue', 'processing', 'completed' and
+            'failed'.
+        """
+        def owned(items) -> int:
+            """Number of items in ``items`` that belong to the guild."""
+            return sum(1 for entry in items if entry.guild_id == guild_id)
+
+        counts = {'queue': owned(queue)}
+        counts['processing'] = owned(processing.values())
+        counts['completed'] = owned(completed.values())
+        counts['failed'] = owned(failed.values())
+        return counts
+
+    def format_guild_cleanup_report(
+        self,
+        guild_id: int,
+        initial_counts: Dict[str, int],
+        final_counts: Dict[str, int],
+        duration: float
+    ) -> str:
+        """Build the multi-line summary that is logged after a cleanup.
+
+        Both count dicts must contain the keys 'queue', 'processing',
+        'completed' and 'failed'.
+        """
+        lines = [
+            f"Guild {guild_id} Cleanup Results:",
+            f"Strategy: {self.strategy.value}",
+            f"Duration: {duration:.2f}s",
+            "Items:",
+        ]
+        for label, key in (
+            ("Queue", 'queue'),
+            ("Processing", 'processing'),
+            ("Completed", 'completed'),
+            ("Failed", 'failed'),
+        ):
+            lines.append(f"- {label}: {initial_counts[key]} -> {final_counts[key]}")
+
+        total = sum(initial_counts.values()) - sum(final_counts.values())
+        lines.append(f"Total cleared: {total} items")
+        return "\n".join(lines)
+
+    def get_cleaner_stats(self) -> Dict[str, Any]:
+        """Return the strategy, the active configuration and tracker stats."""
+        cfg = self.config
+        config_view = {
+            "categories": [category.value for category in cfg.categories],
+            "grace_period": cfg.grace_period,
+            "preserve_completed": cfg.preserve_completed,
+            "preserve_failed": cfg.preserve_failed,
+            "batch_size": cfg.batch_size
+        }
+        return {
+            "strategy": self.strategy.value,
+            "config": config_view,
+            "tracker": self.tracker.get_stats()
+        }
--- a/videoarchiver/queue/cleaners/history_cleaner.py
+++ b/videoarchiver/queue/cleaners/history_cleaner.py
@@ -0,0 +1,336 @@
+"""Module for cleaning historical queue items"""
+
+import logging
+from enum import Enum
+from dataclasses import dataclass, field
+from typing import Dict, Optional, List, Any, Set
+from datetime import datetime, timedelta
+
+from ..models import QueueItem
+
+logger = logging.getLogger("HistoryCleaner")
+
+class CleanupStrategy(Enum):
+    """How eagerly HistoryCleaner removes history.
+
+    The strategy scales the age cutoff in get_cleanup_cutoff and decides
+    how the age and size selections are combined under the HYBRID policy.
+    """
+    AGGRESSIVE = "aggressive"    # Half the max history age; hybrid uses age OR size
+    CONSERVATIVE = "conservative"  # Double the max history age; hybrid uses age AND size
+    BALANCED = "balanced"       # Max history age as configured; hybrid uses age only
+
+class CleanupPolicy(Enum):
+    """Criterion HistoryCleaner uses to choose which items to remove."""
+    AGE = "age"           # Items added before the cleanup cutoff
+    SIZE = "size"         # Items past the cumulative estimated-size threshold
+    HYBRID = "hybrid"     # Age and size selections combined per the strategy
+
+@dataclass
+class CleanupThresholds:
+    """Thresholds for cleanup operations"""
+    # Base age limit in seconds; scaled by the strategy in get_cleanup_cutoff.
+    max_history_age: int = 43200  # 12 hours
+    # NOTE(review): only reported by get_cleaner_stats in this module;
+    # presumably enforced elsewhere - confirm.
+    max_completed_items: int = 10000
+    max_failed_items: int = 5000
+    # Lower bound in seconds for the age used to compute the cutoff.
+    min_retention_time: int = 3600  # 1 hour
+    # Cumulative estimated size in bytes above which items are selected
+    # by the size-based policy.
+    size_threshold: int = 100 * 1024 * 1024  # 100MB
+
+@dataclass
+class CleanupResult:
+    """Record of one history cleanup pass, stored by CleanupTracker."""
+    timestamp: datetime  # When the result was created (naive datetime.utcnow())
+    items_cleaned: int  # Number of items removed in this pass
+    space_freed: int  # Sum of the estimated sizes (bytes) of removed items
+    duration: float  # Elapsed seconds
+    strategy: CleanupStrategy
+    policy: CleanupPolicy
+    # Extra context; HistoryCleaner stores {"type": "completed"|"failed"}.
+    details: Dict[str, Any] = field(default_factory=dict)
+
+class CleanupTracker:
+    """Keeps a bounded history of cleanup passes plus running totals."""
+
+    def __init__(self, max_history: int = 1000):
+        """Start with an empty history capped at ``max_history`` entries."""
+        self.max_history = max_history
+        self.history: List[CleanupResult] = []
+        self.total_items_cleaned = 0
+        self.total_space_freed = 0
+        self.last_cleanup: Optional[datetime] = None
+
+    def record_cleanup(self, result: CleanupResult) -> None:
+        """Append ``result`` to the history and update the running totals."""
+        self.history.append(result)
+        # Drop the oldest entry once the cap is exceeded.
+        if len(self.history) > self.max_history:
+            del self.history[0]
+
+        self.total_items_cleaned += result.items_cleaned
+        self.total_space_freed += result.space_freed
+        self.last_cleanup = result.timestamp
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Summarise recorded cleanups, including the five most recent."""
+        last = self.last_cleanup.isoformat() if self.last_cleanup else None
+
+        recent = []
+        for entry in self.history[-5:]:  # Last 5 cleanups
+            recent.append({
+                "timestamp": entry.timestamp.isoformat(),
+                "items_cleaned": entry.items_cleaned,
+                "space_freed": entry.space_freed,
+                "strategy": entry.strategy.value,
+                "policy": entry.policy.value
+            })
+
+        return {
+            "total_cleanups": len(self.history),
+            "total_items_cleaned": self.total_items_cleaned,
+            "total_space_freed": self.total_space_freed,
+            "last_cleanup": last,
+            "recent_cleanups": recent
+        }
+
+class HistoryCleaner:
+    """Handles cleanup of historical queue items"""
+
+    def __init__(
+        self,
+        strategy: CleanupStrategy = CleanupStrategy.BALANCED,
+        policy: CleanupPolicy = CleanupPolicy.HYBRID,
+        thresholds: Optional[CleanupThresholds] = None
+    ):
+        """Create a cleaner with the given strategy, policy and thresholds.
+
+        Args:
+            strategy: How eagerly history is removed.
+            policy: Criterion used to select items for removal.
+            thresholds: Limits used by the policies; default
+                CleanupThresholds are created when none are supplied.
+        """
+        self.strategy = strategy
+        self.policy = policy
+        self.thresholds = thresholds if thresholds else CleanupThresholds()
+        self.tracker = CleanupTracker()
+
+    def _normalize_datetime(self, dt_value: Any) -> datetime:
+        """Coerce ``dt_value`` to a datetime.
+
+        Args:
+            dt_value: A datetime (returned unchanged), an ISO-8601 string
+                (parsed with datetime.fromisoformat), or anything else.
+
+        Returns:
+            The datetime, or the current UTC time (naive) when the value
+            is not a datetime and cannot be parsed.
+        """
+        if isinstance(dt_value, datetime):
+            return dt_value
+
+        if isinstance(dt_value, str):
+            try:
+                return datetime.fromisoformat(dt_value)
+            except (ValueError, TypeError):
+                pass  # fall through to the current-time fallback
+
+        return datetime.utcnow()
+
+    async def cleanup_completed(
+        self,
+        completed: Dict[str, QueueItem],
+        cleanup_cutoff: datetime
+    ) -> int:
+        """Remove completed items selected by the configured policy.
+
+        Args:
+            completed: URL -> item mapping; modified in place.
+            cleanup_cutoff: Items added before this moment are eligible
+                under the AGE and HYBRID policies (unused for SIZE).
+
+        Returns:
+            Number of items removed. Errors are logged rather than raised;
+            if the pass fails part-way, the items removed so far are still
+            counted.
+        """
+        start_time = datetime.utcnow()
+        items_cleaned = 0
+        space_freed = 0
+        missing = object()  # sentinel: distinguishes "no such key" from any value
+
+        try:
+            # Determine cleanup approach based on strategy and policy
+            if self.policy == CleanupPolicy.SIZE:
+                items_to_clean = self._get_items_by_size(completed)
+            elif self.policy == CleanupPolicy.HYBRID:
+                items_to_clean = self._get_items_hybrid(completed, cleanup_cutoff)
+            else:  # AGE policy
+                items_to_clean = self._get_items_by_age(completed, cleanup_cutoff)
+
+            # Clean items
+            for url in items_to_clean:
+                # pop() with a default cannot raise, so an entry that is
+                # already gone no longer aborts the whole pass.
+                item = completed.pop(url, missing)
+                if item is missing:
+                    continue
+                items_cleaned += 1
+                try:
+                    space_freed += self._estimate_item_size(item)
+                except Exception as e:
+                    # The item is removed regardless; only its size
+                    # estimate is lost.
+                    logger.error(f"Error cleaning completed item {url}: {e}")
+
+            # Record cleanup
+            self._record_cleanup_result(
+                items_cleaned,
+                space_freed,
+                start_time,
+                "completed"
+            )
+
+            logger.debug(f"Cleaned {items_cleaned} completed items")
+            return items_cleaned
+
+        except Exception as e:
+            logger.error(f"Error during completed items cleanup: {e}")
+            return items_cleaned
+
+    async def cleanup_failed(
+        self,
+        failed: Dict[str, QueueItem],
+        cleanup_cutoff: datetime
+    ) -> int:
+        """Remove failed items selected by the configured policy.
+
+        Args:
+            failed: URL -> item mapping; modified in place.
+            cleanup_cutoff: Items added before this moment are eligible
+                under the AGE and HYBRID policies (unused for SIZE).
+
+        Returns:
+            Number of items removed. Errors are logged rather than raised;
+            if the pass fails part-way, the items removed so far are still
+            counted.
+        """
+        start_time = datetime.utcnow()
+        items_cleaned = 0
+        space_freed = 0
+        missing = object()  # sentinel: distinguishes "no such key" from any value
+
+        try:
+            # Determine cleanup approach
+            if self.policy == CleanupPolicy.SIZE:
+                items_to_clean = self._get_items_by_size(failed)
+            elif self.policy == CleanupPolicy.HYBRID:
+                items_to_clean = self._get_items_hybrid(failed, cleanup_cutoff)
+            else:  # AGE policy
+                items_to_clean = self._get_items_by_age(failed, cleanup_cutoff)
+
+            # Clean items
+            for url in items_to_clean:
+                # pop() with a default cannot raise, so an entry that is
+                # already gone no longer aborts the whole pass.
+                item = failed.pop(url, missing)
+                if item is missing:
+                    continue
+                items_cleaned += 1
+                try:
+                    space_freed += self._estimate_item_size(item)
+                except Exception as e:
+                    # The item is removed regardless; only its size
+                    # estimate is lost.
+                    logger.error(f"Error cleaning failed item {url}: {e}")
+
+            # Record cleanup
+            self._record_cleanup_result(
+                items_cleaned,
+                space_freed,
+                start_time,
+                "failed"
+            )
+
+            logger.debug(f"Cleaned {items_cleaned} failed items")
+            return items_cleaned
+
+        except Exception as e:
+            logger.error(f"Error during failed items cleanup: {e}")
+            return items_cleaned
+
+    def _get_items_by_age(
+        self,
+        items: Dict[str, QueueItem],
+        cutoff: datetime
+    ) -> Set[str]:
+        """Select the URLs of items added before ``cutoff``.
+
+        Side effect: every item's ``added_at`` is overwritten with its
+        normalized datetime value.
+        """
+        stale = set()
+
+        for url in items:
+            entry = items[url]
+            entry.added_at = self._normalize_datetime(entry.added_at)
+            if cutoff > entry.added_at:
+                stale.add(url)
+
+        return stale
+
+    def _get_items_by_size(self, items: Dict[str, QueueItem]) -> Set[str]:
+        """Select the URLs that push the cumulative size past the threshold.
+
+        Items are walked from the largest estimated size to the smallest
+        while a running total is kept; every item at which the total is
+        above ``self.thresholds.size_threshold`` is selected.
+        """
+        ranked = list(items.items())
+        ranked.sort(
+            key=lambda pair: self._estimate_item_size(pair[1]),
+            reverse=True
+        )
+
+        selected = set()
+        running_total = 0
+        for url, entry in ranked:
+            running_total += self._estimate_item_size(entry)
+            if running_total > self.thresholds.size_threshold:
+                selected.add(url)
+
+        return selected
+
+    def _get_items_hybrid(
+        self,
+        items: Dict[str, QueueItem],
+        cutoff: datetime
+    ) -> Set[str]:
+        """Select items by combining the age and size criteria.
+
+        AGGRESSIVE returns items matching either criterion, CONSERVATIVE
+        only items matching both, and BALANCED the age-based selection
+        alone.
+        """
+        by_age = self._get_items_by_age(items, cutoff)
+
+        # The size ranking sorts every item, so it is only computed for the
+        # strategies that actually use it.
+        if self.strategy == CleanupStrategy.AGGRESSIVE:
+            return by_age.union(self._get_items_by_size(items))
+        if self.strategy == CleanupStrategy.CONSERVATIVE:
+            return by_age.intersection(self._get_items_by_size(items))
+        return by_age  # BALANCED
+
+    def _estimate_item_size(self, item: QueueItem) -> int:
+        """Return a rough size estimate for ``item`` in bytes.
+
+        The estimate is 1KB for the item plus 1KB per retry; it is not
+        based on actual file sizes.
+        """
+        kilobyte = 1024
+        attempts = item.retry_count + 1
+        return kilobyte * attempts
+
+    def _record_cleanup_result(
+        self,
+        items_cleaned: int,
+        space_freed: int,
+        start_time: datetime,
+        cleanup_type: str
+    ) -> None:
+        """Store the outcome of one cleanup pass on the tracker.
+
+        Args:
+            items_cleaned: Number of items removed.
+            space_freed: Estimated bytes freed.
+            start_time: When the pass started; used to compute the duration.
+            cleanup_type: Label stored under ``details["type"]``.
+        """
+        elapsed = datetime.utcnow() - start_time
+
+        self.tracker.record_cleanup(CleanupResult(
+            timestamp=datetime.utcnow(),
+            items_cleaned=items_cleaned,
+            space_freed=space_freed,
+            duration=elapsed.total_seconds(),
+            strategy=self.strategy,
+            policy=self.policy,
+            details={"type": cleanup_type}
+        ))
+
+    def get_cleanup_cutoff(self) -> datetime:
+        """Return the moment before which history is eligible for cleanup.
+
+        The configured max history age is halved for AGGRESSIVE, doubled
+        for CONSERVATIVE and used as-is for BALANCED; the result is never
+        shorter than ``min_retention_time``.
+        """
+        limits = self.thresholds
+        age = limits.max_history_age  # BALANCED
+        if self.strategy == CleanupStrategy.AGGRESSIVE:
+            age = limits.max_history_age // 2
+        elif self.strategy == CleanupStrategy.CONSERVATIVE:
+            age = limits.max_history_age * 2
+
+        retention = max(age, limits.min_retention_time)
+        return datetime.utcnow() - timedelta(seconds=retention)
+
+    def format_cleanup_report(
+        self,
+        initial_completed: int,
+        final_completed: int,
+        initial_failed: int,
+        final_failed: int
+    ) -> str:
+        """Build a multi-line summary of a history cleanup.
+
+        The space-freed and total-cleanups figures come from the tracker's
+        cumulative statistics.
+        """
+        stats = self.tracker.get_stats()
+        removed = (initial_completed - final_completed) + (initial_failed - final_failed)
+
+        lines = [
+            "History Cleanup Results:",
+            f"- Completed items: {initial_completed} -> {final_completed}",
+            f"- Failed items: {initial_failed} -> {final_failed}",
+            f"- Total items cleaned: {removed}",
+            f"- Space freed: {stats['total_space_freed']} bytes",
+            f"- Strategy: {self.strategy.value}",
+            f"- Policy: {self.policy.value}",
+            f"- Total cleanups: {stats['total_cleanups']}",
+        ]
+        return "\n".join(lines)
+
+    def get_cleaner_stats(self) -> Dict[str, Any]:
+        """Return the strategy, policy, thresholds and tracker stats."""
+        threshold_names = (
+            "max_history_age",
+            "max_completed_items",
+            "max_failed_items",
+            "min_retention_time",
+            "size_threshold",
+        )
+        thresholds_view = {
+            name: getattr(self.thresholds, name) for name in threshold_names
+        }
+        return {
+            "strategy": self.strategy.value,
+            "policy": self.policy.value,
+            "thresholds": thresholds_view,
+            "tracker": self.tracker.get_stats()
+        }
--- a/videoarchiver/queue/cleaners/tracking_cleaner.py
+++ b/videoarchiver/queue/cleaners/tracking_cleaner.py
@@ -0,0 +1,452 @@
+"""Module for cleaning queue tracking data"""
+
+import logging
+import asyncio
+from enum import Enum
+from dataclasses import dataclass, field
+from typing import Dict, List, Set, Tuple, Any, Optional
+from datetime import datetime
+
+from ..models import QueueItem
+
+logger = logging.getLogger("TrackingCleaner")
+
+class TrackingCleanupStrategy(Enum):
+    """Tracking cleanup strategies.
+
+    TrackingCleaner.cleanup_tracking() selects its cleanup routine from this
+    value; anything other than AGGRESSIVE or CONSERVATIVE is handled as
+    BALANCED.
+    """
+    AGGRESSIVE = "aggressive"    # Process every ID and always format-check URLs
+    CONSERVATIVE = "conservative"  # Only process IDs whose invalid-URL ratio exceeds max_invalid_ratio
+    BALANCED = "balanced"       # Process every ID; format checks follow config.validate_urls
+
+class TrackingType(Enum):
+    """Types of tracking data.
+
+    Names the kinds of tracking record handled by this module. No other
+    definition in this module references the enum.
+    """
+    GUILD = "guild"
+    CHANNEL = "channel"
+    URL = "url"
+
+@dataclass
+class TrackingCleanupConfig:
+    """Configuration for tracking cleanup.
+
+    Read by TrackingCleaner; every field is also echoed back by
+    TrackingCleaner.get_cleaner_stats().
+    """
+    # Number of guild/channel entries processed between event-loop yields
+    batch_size: int = 100
+    # Only reported in stats; no cleanup routine in this module reads it.
+    # Presumably seconds -- TODO confirm the unit with callers.
+    retention_period: int = 3600  # 1 hour
+    # BALANCED strategy: also format-check each URL when True
+    validate_urls: bool = True
+    # Drop a guild/channel entry once its URL set becomes empty
+    cleanup_empty: bool = True
+    # CONSERVATIVE strategy: an entry is processed only when its share of
+    # URLs missing from the live set is strictly greater than this value
+    max_invalid_ratio: float = 0.5  # 50% invalid threshold
+
+@dataclass
+class TrackingCleanupResult:
+    """Result of a tracking cleanup operation.
+
+    Built by TrackingCleaner.cleanup_tracking() for both successful and
+    failed runs and stored by TrackingCleanupTracker.record_cleanup().
+    """
+    # When the result was created (cleanup_tracking uses datetime.utcnow())
+    timestamp: datetime
+    # Strategy the cleaner was configured with for this run
+    strategy: TrackingCleanupStrategy
+    # Number of URLs removed from guild and channel tracking combined
+    items_cleaned: int
+    # Number of guild entries removed
+    guilds_cleaned: int
+    # Number of channel entries removed
+    channels_cleaned: int
+    # Elapsed time in seconds
+    duration: float
+    # Counts from TrackingCleaner._get_tracking_counts() before cleanup
+    # (keys: guilds, channels, guild_urls, channel_urls)
+    initial_counts: Dict[str, int]
+    # Same shape as initial_counts, taken after cleanup
+    final_counts: Dict[str, int]
+    # str() of the exception for a failed run; None on success
+    error: Optional[str] = None
+
+class TrackingValidator:
+    """Validates tracking data (URL strings and numeric IDs)."""
+
+    @staticmethod
+    def validate_url(url: str) -> bool:
+        """Validate URL format.
+
+        Args:
+            url: Candidate value; anything that is not a ``str`` is rejected.
+
+        Returns:
+            True when ``url`` is a string containing ``"://"``, else False.
+        """
+        # Checking the type first rejects arbitrary objects outright instead
+        # of relying on a broad ``except`` to absorb whatever they raise.
+        return isinstance(url, str) and "://" in url
+
+    @staticmethod
+    def validate_id(id_value: int) -> bool:
+        """Validate ID format.
+
+        Args:
+            id_value: Candidate guild or channel ID.
+
+        Returns:
+            True when ``id_value`` is a strictly positive ``int`` that is
+            not a ``bool``, else False.
+        """
+        # bool is a subclass of int, so True would otherwise pass as ID 1.
+        if isinstance(id_value, bool) or not isinstance(id_value, int):
+            return False
+        return id_value > 0
+
+class TrackingCleanupTracker:
+    """Tracks cleanup operations.
+
+    Keeps a bounded history of the most recent results alongside
+    cumulative totals that keep growing after old results are evicted.
+    """
+
+    def __init__(self, max_history: int = 1000):
+        """Create an empty tracker.
+
+        Args:
+            max_history: Maximum number of results retained in ``history``.
+        """
+        self.max_history = max_history
+        self.history: List[TrackingCleanupResult] = []
+        # Counted separately from len(history) so the total does not
+        # plateau once the bounded history starts evicting old results.
+        self.total_cleanups = 0
+        self.total_items_cleaned = 0
+        self.total_guilds_cleaned = 0
+        self.total_channels_cleaned = 0
+        self.last_cleanup: Optional[datetime] = None
+
+    def record_cleanup(self, result: TrackingCleanupResult) -> None:
+        """Record a cleanup operation.
+
+        Appends ``result`` to the history (evicting the oldest entry when
+        the history exceeds ``max_history``) and updates the running totals
+        and ``last_cleanup``.
+
+        Args:
+            result: Outcome of one cleanup run, successful or failed.
+        """
+        self.history.append(result)
+        if len(self.history) > self.max_history:
+            self.history.pop(0)
+
+        self.total_cleanups += 1
+        self.total_items_cleaned += result.items_cleaned
+        self.total_guilds_cleaned += result.guilds_cleaned
+        self.total_channels_cleaned += result.channels_cleaned
+        self.last_cleanup = result.timestamp
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get cleanup statistics.
+
+        Returns:
+            A dict with the cumulative totals, the ISO-formatted timestamp
+            of the last recorded cleanup (``None`` if nothing was recorded)
+            and summaries of up to the five most recent results.
+        """
+        return {
+            "total_cleanups": self.total_cleanups,
+            "total_items_cleaned": self.total_items_cleaned,
+            "total_guilds_cleaned": self.total_guilds_cleaned,
+            "total_channels_cleaned": self.total_channels_cleaned,
+            "last_cleanup": (
+                self.last_cleanup.isoformat()
+                if self.last_cleanup
+                else None
+            ),
+            "recent_cleanups": [
+                {
+                    "timestamp": r.timestamp.isoformat(),
+                    "strategy": r.strategy.value,
+                    "items_cleaned": r.items_cleaned,
+                    "guilds_cleaned": r.guilds_cleaned,
+                    "channels_cleaned": r.channels_cleaned,
+                    "duration": r.duration
+                }
+                for r in self.history[-5:]  # Last 5 cleanups
+            ]
+        }
+
+class TrackingCleaner:
+    """Handles cleanup of queue tracking data"""
+
+    def __init__(
+        self,
+        strategy: TrackingCleanupStrategy = TrackingCleanupStrategy.BALANCED,
+        config: Optional[TrackingCleanupConfig] = None
+    ):
+        """Set up the cleaner with its strategy, config and collaborators.
+
+        Args:
+            strategy: Cleanup strategy used by ``cleanup_tracking``.
+            config: Cleanup settings; a default ``TrackingCleanupConfig``
+                is created when none is supplied.
+        """
+        if not config:
+            config = TrackingCleanupConfig()
+
+        self.strategy = strategy
+        self.config = config
+        # Fresh collaborators per cleaner instance
+        self.validator = TrackingValidator()
+        self.tracker = TrackingCleanupTracker()
+
+    async def cleanup_tracking(
+        self,
+        guild_queues: Dict[int, Set[str]],
+        channel_queues: Dict[int, Set[str]],
+        queue: List[QueueItem],
+        processing: Dict[str, QueueItem]
+    ) -> Tuple[int, Dict[str, int]]:
+        """Clean up tracking data and record the outcome.
+
+        URLs of items in ``queue`` plus the keys of ``processing`` form the
+        set of live URLs. The routine matching ``self.strategy`` is then run
+        against ``guild_queues`` and ``channel_queues``, and a
+        ``TrackingCleanupResult`` describing the run is handed to the tracker.
+
+        Args:
+            guild_queues: Mapping of guild ID to tracked URLs.
+            channel_queues: Mapping of channel ID to tracked URLs.
+            queue: Items currently waiting in the queue.
+            processing: Items currently being processed, keyed by URL.
+
+        Returns:
+            ``(items_cleaned, initial_counts)``: the number of URLs removed
+            and the tracking counts taken before cleanup started.
+
+        Raises:
+            Exception: Any error raised during cleanup is logged, recorded
+                in the tracker as a failed run, and re-raised unchanged.
+        """
+        start_time = datetime.utcnow()
+        # Bound before the try block so a failure record can still report
+        # whatever was measured before the error occurred.
+        initial_counts: Dict[str, int] = {}
+
+        try:
+            # Get initial counts
+            initial_counts = self._get_tracking_counts(
+                guild_queues,
+                channel_queues
+            )
+
+            # Get valid URLs
+            valid_urls = self._get_valid_urls(queue, processing)
+
+            # Clean tracking data based on strategy
+            if self.strategy == TrackingCleanupStrategy.AGGRESSIVE:
+                run_cleanup = self._aggressive_cleanup
+            elif self.strategy == TrackingCleanupStrategy.CONSERVATIVE:
+                run_cleanup = self._conservative_cleanup
+            else:  # BALANCED
+                run_cleanup = self._balanced_cleanup
+
+            cleaned = await run_cleanup(
+                guild_queues,
+                channel_queues,
+                valid_urls
+            )
+            items_cleaned = cleaned[0]
+            guilds_cleaned = cleaned[1]
+            channels_cleaned = cleaned[2]
+
+            # Get final counts
+            final_counts = self._get_tracking_counts(
+                guild_queues,
+                channel_queues
+            )
+
+            # Record cleanup result
+            duration = (datetime.utcnow() - start_time).total_seconds()
+            result = TrackingCleanupResult(
+                timestamp=datetime.utcnow(),
+                strategy=self.strategy,
+                items_cleaned=items_cleaned,
+                guilds_cleaned=guilds_cleaned,
+                channels_cleaned=channels_cleaned,
+                duration=duration,
+                initial_counts=initial_counts,
+                final_counts=final_counts
+            )
+            self.tracker.record_cleanup(result)
+
+            logger.info(self.format_tracking_cleanup_report(
+                initial_counts,
+                final_counts,
+                duration
+            ))
+            return items_cleaned, initial_counts
+
+        except Exception as e:
+            logger.error(f"Error cleaning tracking data: {e}")
+            failed_at = datetime.utcnow()
+            # Report the real elapsed time and the counts gathered so far
+            # rather than zero/empty placeholders.
+            self.tracker.record_cleanup(TrackingCleanupResult(
+                timestamp=failed_at,
+                strategy=self.strategy,
+                items_cleaned=0,
+                guilds_cleaned=0,
+                channels_cleaned=0,
+                duration=(failed_at - start_time).total_seconds(),
+                initial_counts=initial_counts,
+                final_counts={},
+                error=str(e)
+            ))
+            raise
+
+    async def _aggressive_cleanup(
+        self,
+        guild_queues: Dict[int, Set[str]],
+        channel_queues: Dict[int, Set[str]],
+        valid_urls: Set[str]
+    ) -> Tuple[int, int, int]:
+        """Run guild and channel cleanup with URL format checks forced on.
+
+        Args:
+            guild_queues: Mapping of guild ID to tracked URLs.
+            channel_queues: Mapping of channel ID to tracked URLs.
+            valid_urls: URLs that are still queued or being processed.
+
+        Returns:
+            ``(items_cleaned, guilds_cleaned, channels_cleaned)``.
+        """
+        # Guild pass first, then channels; both with validate_all=True
+        guild_outcome = await self._cleanup_guild_tracking(
+            guild_queues,
+            valid_urls,
+            validate_all=True
+        )
+        channel_outcome = await self._cleanup_channel_tracking(
+            channel_queues,
+            valid_urls,
+            validate_all=True
+        )
+
+        total_urls_removed = guild_outcome[0] + channel_outcome[0]
+        return total_urls_removed, guild_outcome[1], channel_outcome[1]
+
+    async def _conservative_cleanup(
+        self,
+        guild_queues: Dict[int, Set[str]],
+        channel_queues: Dict[int, Set[str]],
+        valid_urls: Set[str]
+    ) -> Tuple[int, int, int]:
+        """Perform conservative cleanup.
+
+        An entry is cleaned only when the share of its URLs missing from
+        ``valid_urls`` is strictly greater than
+        ``self.config.max_invalid_ratio``. Entries with an empty URL set
+        have a ratio of 0 and are left alone. URL format checks are skipped.
+
+        Args:
+            guild_queues: Mapping of guild ID to tracked URLs; updated in place.
+            channel_queues: Mapping of channel ID to tracked URLs; updated
+                in place.
+            valid_urls: URLs that are still queued or being processed.
+
+        Returns:
+            ``(items_cleaned, guilds_cleaned, channels_cleaned)``.
+        """
+        items_cleaned = 0
+        guilds_cleaned = 0
+        channels_cleaned = 0
+
+        # Only clean if invalid ratio exceeds threshold
+        for guild_id, urls in list(guild_queues.items()):
+            invalid_ratio = len(urls - valid_urls) / len(urls) if urls else 0
+            if invalid_ratio <= self.config.max_invalid_ratio:
+                continue
+
+            scoped = {guild_id: urls}
+            cleaned = await self._cleanup_guild_tracking(
+                scoped,
+                valid_urls,
+                validate_all=False
+            )
+            # The helper rebinds or pops entries on the mapping it is given,
+            # so the outcome must be copied back; otherwise the real tracking
+            # data stays untouched while non-zero counts are reported.
+            if guild_id in scoped:
+                guild_queues[guild_id] = scoped[guild_id]
+            else:
+                guild_queues.pop(guild_id, None)
+            items_cleaned += cleaned[0]
+            guilds_cleaned += cleaned[1]
+
+        for channel_id, urls in list(channel_queues.items()):
+            invalid_ratio = len(urls - valid_urls) / len(urls) if urls else 0
+            if invalid_ratio <= self.config.max_invalid_ratio:
+                continue
+
+            scoped = {channel_id: urls}
+            cleaned = await self._cleanup_channel_tracking(
+                scoped,
+                valid_urls,
+                validate_all=False
+            )
+            # Same write-back as for guilds (see above)
+            if channel_id in scoped:
+                channel_queues[channel_id] = scoped[channel_id]
+            else:
+                channel_queues.pop(channel_id, None)
+            items_cleaned += cleaned[0]
+            channels_cleaned += cleaned[1]
+
+        return items_cleaned, guilds_cleaned, channels_cleaned
+
+    async def _balanced_cleanup(
+        self,
+        guild_queues: Dict[int, Set[str]],
+        channel_queues: Dict[int, Set[str]],
+        valid_urls: Set[str]
+    ) -> Tuple[int, int, int]:
+        """Run guild and channel cleanup, format-checking URLs per config.
+
+        Whether URL format checks run is taken from
+        ``self.config.validate_urls`` for each of the two passes.
+
+        Args:
+            guild_queues: Mapping of guild ID to tracked URLs.
+            channel_queues: Mapping of channel ID to tracked URLs.
+            valid_urls: URLs that are still queued or being processed.
+
+        Returns:
+            ``(items_cleaned, guilds_cleaned, channels_cleaned)``.
+        """
+        # Guild pass
+        guild_outcome = await self._cleanup_guild_tracking(
+            guild_queues,
+            valid_urls,
+            validate_all=self.config.validate_urls
+        )
+
+        # Channel pass
+        channel_outcome = await self._cleanup_channel_tracking(
+            channel_queues,
+            valid_urls,
+            validate_all=self.config.validate_urls
+        )
+
+        total_urls_removed = guild_outcome[0] + channel_outcome[0]
+        return total_urls_removed, guild_outcome[1], channel_outcome[1]
+
+    async def _cleanup_guild_tracking(
+        self,
+        guild_queues: Dict[int, Set[str]],
+        valid_urls: Set[str],
+        validate_all: bool
+    ) -> Tuple[int, int]:
+        """Prune guild tracking entries on the given mapping.
+
+        Entries whose ID fails validation are removed outright. For the
+        rest, the URL set is replaced by a new set holding only URLs found
+        in ``valid_urls`` (and, when ``validate_all`` is true, passing the
+        URL format check). Entries left empty are removed when
+        ``self.config.cleanup_empty`` is set.
+
+        Args:
+            guild_queues: Mapping of guild ID to tracked URLs; entries are
+                rebound or removed on this mapping.
+            valid_urls: URLs that are still queued or being processed.
+            validate_all: Also require each URL to pass the format check.
+
+        Returns:
+            ``(items_cleaned, guilds_cleaned)``: URLs dropped and guild
+            entries removed.
+        """
+        items_cleaned = 0
+        removed_guilds = 0
+        since_yield = 0
+
+        for gid in list(guild_queues):
+            if not self.validator.validate_id(gid):
+                del guild_queues[gid]
+                removed_guilds += 1
+                continue
+
+            tracked = guild_queues[gid]
+            size_before = len(tracked)
+
+            kept = set()
+            for url in tracked:
+                if validate_all and not self.validator.validate_url(url):
+                    continue
+                if url in valid_urls:
+                    kept.add(url)
+            guild_queues[gid] = kept
+            items_cleaned += size_before - len(kept)
+
+            if self.config.cleanup_empty and not kept:
+                del guild_queues[gid]
+                removed_guilds += 1
+
+            # Hand control back to the event loop once per batch
+            since_yield += 1
+            if since_yield >= self.config.batch_size:
+                await asyncio.sleep(0)
+                since_yield = 0
+
+        logger.debug(f"Cleaned {items_cleaned} guild tracking items")
+        return items_cleaned, removed_guilds
+
+    async def _cleanup_channel_tracking(
+        self,
+        channel_queues: Dict[int, Set[str]],
+        valid_urls: Set[str],
+        validate_all: bool
+    ) -> Tuple[int, int]:
+        """Prune channel tracking entries on the given mapping.
+
+        Entries whose ID fails validation are removed outright. For the
+        rest, the URL set is replaced by a new set holding only URLs found
+        in ``valid_urls`` (and, when ``validate_all`` is true, passing the
+        URL format check). Entries left empty are removed when
+        ``self.config.cleanup_empty`` is set.
+
+        Args:
+            channel_queues: Mapping of channel ID to tracked URLs; entries
+                are rebound or removed on this mapping.
+            valid_urls: URLs that are still queued or being processed.
+            validate_all: Also require each URL to pass the format check.
+
+        Returns:
+            ``(items_cleaned, channels_cleaned)``: URLs dropped and channel
+            entries removed.
+        """
+        items_cleaned = 0
+        removed_channels = 0
+        since_yield = 0
+
+        for cid in list(channel_queues):
+            if not self.validator.validate_id(cid):
+                del channel_queues[cid]
+                removed_channels += 1
+                continue
+
+            tracked = channel_queues[cid]
+            size_before = len(tracked)
+
+            kept = set()
+            for url in tracked:
+                if validate_all and not self.validator.validate_url(url):
+                    continue
+                if url in valid_urls:
+                    kept.add(url)
+            channel_queues[cid] = kept
+            items_cleaned += size_before - len(kept)
+
+            if self.config.cleanup_empty and not kept:
+                del channel_queues[cid]
+                removed_channels += 1
+
+            # Hand control back to the event loop once per batch
+            since_yield += 1
+            if since_yield >= self.config.batch_size:
+                await asyncio.sleep(0)
+                since_yield = 0
+
+        logger.debug(f"Cleaned {items_cleaned} channel tracking items")
+        return items_cleaned, removed_channels
+
+    def _get_valid_urls(
+        self,
+        queue: List[QueueItem],
+        processing: Dict[str, QueueItem]
+    ) -> Set[str]:
+        """Collect the URLs that are still live.
+
+        Args:
+            queue: Items waiting in the queue; each contributes its ``url``.
+            processing: Items being processed; the mapping's keys are used.
+
+        Returns:
+            The union of the queued items' URLs and the ``processing`` keys.
+        """
+        live_urls: Set[str] = set()
+        for queued_item in queue:
+            live_urls.add(queued_item.url)
+        live_urls |= set(processing.keys())
+        return live_urls
+
+    def _get_tracking_counts(
+        self,
+        guild_queues: Dict[int, Set[str]],
+        channel_queues: Dict[int, Set[str]]
+    ) -> Dict[str, int]:
+        """Summarise how much tracking data is currently held.
+
+        Args:
+            guild_queues: Mapping of guild ID to tracked URLs.
+            channel_queues: Mapping of channel ID to tracked URLs.
+
+        Returns:
+            A dict with the number of guild and channel entries
+            (``guilds``, ``channels``) and the summed size of their URL
+            sets (``guild_urls``, ``channel_urls``).
+        """
+        counts: Dict[str, int] = {}
+        counts['guilds'] = len(guild_queues)
+        counts['channels'] = len(channel_queues)
+        counts['guild_urls'] = sum(map(len, guild_queues.values()))
+        counts['channel_urls'] = sum(map(len, channel_queues.values()))
+        return counts
+
+    def format_tracking_cleanup_report(
+        self,
+        initial_counts: Dict[str, int],
+        final_counts: Dict[str, int],
+        duration: float
+    ) -> str:
+        """Build the multi-line text summary of a tracking cleanup run.
+
+        Args:
+            initial_counts: Counts before cleanup; must contain the keys
+                ``guilds``, ``channels``, ``guild_urls``, ``channel_urls``.
+            final_counts: Counts after cleanup, with the same keys.
+            duration: Elapsed time, rendered with two decimals.
+
+        Returns:
+            A newline-separated report; the final line is the total drop
+            in guild URLs plus channel URLs.
+        """
+        guild_urls_removed = initial_counts['guild_urls'] - final_counts['guild_urls']
+        channel_urls_removed = initial_counts['channel_urls'] - final_counts['channel_urls']
+        total_cleaned = guild_urls_removed + channel_urls_removed
+
+        report_lines = [
+            "Tracking Cleanup Results:",
+            f"Strategy: {self.strategy.value}",
+            f"Duration: {duration:.2f}s",
+            "Items:",
+            f"- Guild Queues: {initial_counts['guilds']} -> {final_counts['guilds']}",
+            f"- Channel Queues: {initial_counts['channels']} -> {final_counts['channels']}",
+            f"- Guild URLs: {initial_counts['guild_urls']} -> {final_counts['guild_urls']}",
+            f"- Channel URLs: {initial_counts['channel_urls']} -> {final_counts['channel_urls']}",
+            f"Total items cleaned: {total_cleaned}",
+        ]
+        return "\n".join(report_lines)
+
+    def get_cleaner_stats(self) -> Dict[str, Any]:
+        """Return a snapshot of this cleaner's settings and tracker stats.
+
+        Returns:
+            A dict with the strategy value, a ``config`` sub-dict copied
+            field by field from ``self.config``, and whatever
+            ``self.tracker.get_stats()`` reports under ``tracker``.
+        """
+        snapshot: Dict[str, Any] = {"strategy": self.strategy.value}
+
+        settings = self.config
+        config_fields = (
+            "batch_size",
+            "retention_period",
+            "validate_urls",
+            "cleanup_empty",
+            "max_invalid_ratio",
+        )
+        snapshot["config"] = {
+            name: getattr(settings, name) for name in config_fields
+        }
+        snapshot["tracker"] = self.tracker.get_stats()
+        return snapshot