mirror of
https://github.com/pacnpal/Pac-cogs.git
synced 2025-12-20 02:41:06 -05:00
Core Systems:
Component-based architecture with lifecycle management Enhanced error handling and recovery mechanisms Comprehensive state management and tracking Event-driven architecture with monitoring Queue Management: Multiple processing strategies for different scenarios Advanced state management with recovery Comprehensive metrics and health monitoring Sophisticated cleanup system with multiple strategies Processing Pipeline: Enhanced message handling with validation Improved URL extraction and processing Better queue management and monitoring Advanced cleanup mechanisms Overall Benefits: Better code organization and maintainability Improved error handling and recovery Enhanced monitoring and reporting More robust and reliable system
This commit is contained in:
336
videoarchiver/queue/cleaners/history_cleaner.py
Normal file
336
videoarchiver/queue/cleaners/history_cleaner.py
Normal file
@@ -0,0 +1,336 @@
|
||||
"""Module for cleaning historical queue items"""
|
||||
|
||||
import logging
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, Optional, List, Any, Set
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from ..models import QueueItem
|
||||
|
||||
logger = logging.getLogger("HistoryCleaner")
|
||||
|
||||
class CleanupStrategy(Enum):
|
||||
"""Cleanup strategies"""
|
||||
AGGRESSIVE = "aggressive" # Remove more aggressively
|
||||
CONSERVATIVE = "conservative" # Remove conservatively
|
||||
BALANCED = "balanced" # Balance between retention and cleanup
|
||||
|
||||
class CleanupPolicy(Enum):
|
||||
"""Cleanup policies"""
|
||||
AGE = "age" # Clean based on age
|
||||
SIZE = "size" # Clean based on size
|
||||
HYBRID = "hybrid" # Consider both age and size
|
||||
|
||||
@dataclass
|
||||
class CleanupThresholds:
|
||||
"""Thresholds for cleanup operations"""
|
||||
max_history_age: int = 43200 # 12 hours
|
||||
max_completed_items: int = 10000
|
||||
max_failed_items: int = 5000
|
||||
min_retention_time: int = 3600 # 1 hour
|
||||
size_threshold: int = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
@dataclass
|
||||
class CleanupResult:
|
||||
"""Result of a cleanup operation"""
|
||||
timestamp: datetime
|
||||
items_cleaned: int
|
||||
space_freed: int
|
||||
duration: float
|
||||
strategy: CleanupStrategy
|
||||
policy: CleanupPolicy
|
||||
details: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
class CleanupTracker:
|
||||
"""Tracks cleanup operations"""
|
||||
|
||||
def __init__(self, max_history: int = 1000):
|
||||
self.max_history = max_history
|
||||
self.history: List[CleanupResult] = []
|
||||
self.total_items_cleaned = 0
|
||||
self.total_space_freed = 0
|
||||
self.last_cleanup: Optional[datetime] = None
|
||||
|
||||
def record_cleanup(self, result: CleanupResult) -> None:
|
||||
"""Record a cleanup operation"""
|
||||
self.history.append(result)
|
||||
if len(self.history) > self.max_history:
|
||||
self.history.pop(0)
|
||||
|
||||
self.total_items_cleaned += result.items_cleaned
|
||||
self.total_space_freed += result.space_freed
|
||||
self.last_cleanup = result.timestamp
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
|
||||
"""Get cleanup statistics"""
|
||||
return {
|
||||
"total_cleanups": len(self.history),
|
||||
"total_items_cleaned": self.total_items_cleaned,
|
||||
"total_space_freed": self.total_space_freed,
|
||||
"last_cleanup": (
|
||||
self.last_cleanup.isoformat()
|
||||
if self.last_cleanup
|
||||
else None
|
||||
),
|
||||
"recent_cleanups": [
|
||||
{
|
||||
"timestamp": r.timestamp.isoformat(),
|
||||
"items_cleaned": r.items_cleaned,
|
||||
"space_freed": r.space_freed,
|
||||
"strategy": r.strategy.value,
|
||||
"policy": r.policy.value
|
||||
}
|
||||
for r in self.history[-5:] # Last 5 cleanups
|
||||
]
|
||||
}
|
||||
|
||||
class HistoryCleaner:
|
||||
"""Handles cleanup of historical queue items"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
strategy: CleanupStrategy = CleanupStrategy.BALANCED,
|
||||
policy: CleanupPolicy = CleanupPolicy.HYBRID,
|
||||
thresholds: Optional[CleanupThresholds] = None
|
||||
):
|
||||
self.strategy = strategy
|
||||
self.policy = policy
|
||||
self.thresholds = thresholds or CleanupThresholds()
|
||||
self.tracker = CleanupTracker()
|
||||
|
||||
def _normalize_datetime(self, dt_value: any) -> datetime:
|
||||
"""Normalize a datetime value"""
|
||||
current_time = datetime.utcnow()
|
||||
|
||||
if not isinstance(dt_value, datetime):
|
||||
try:
|
||||
if isinstance(dt_value, str):
|
||||
return datetime.fromisoformat(dt_value)
|
||||
else:
|
||||
return current_time
|
||||
except (ValueError, TypeError):
|
||||
return current_time
|
||||
return dt_value
|
||||
|
||||
async def cleanup_completed(
|
||||
self,
|
||||
completed: Dict[str, QueueItem],
|
||||
cleanup_cutoff: datetime
|
||||
) -> int:
|
||||
"""Clean up completed items"""
|
||||
start_time = datetime.utcnow()
|
||||
items_cleaned = 0
|
||||
space_freed = 0
|
||||
completed_count = len(completed)
|
||||
|
||||
try:
|
||||
# Determine cleanup approach based on strategy and policy
|
||||
if self.policy == CleanupPolicy.SIZE:
|
||||
items_to_clean = self._get_items_by_size(completed)
|
||||
elif self.policy == CleanupPolicy.HYBRID:
|
||||
items_to_clean = self._get_items_hybrid(completed, cleanup_cutoff)
|
||||
else: # AGE policy
|
||||
items_to_clean = self._get_items_by_age(completed, cleanup_cutoff)
|
||||
|
||||
# Clean items
|
||||
for url in items_to_clean:
|
||||
try:
|
||||
item = completed[url]
|
||||
space_freed += self._estimate_item_size(item)
|
||||
completed.pop(url)
|
||||
items_cleaned += 1
|
||||
except Exception as e:
|
||||
logger.error(f"Error cleaning completed item {url}: {e}")
|
||||
completed.pop(url)
|
||||
items_cleaned += 1
|
||||
|
||||
# Record cleanup
|
||||
self._record_cleanup_result(
|
||||
items_cleaned,
|
||||
space_freed,
|
||||
start_time,
|
||||
"completed"
|
||||
)
|
||||
|
||||
logger.debug(f"Cleaned {items_cleaned} completed items")
|
||||
return items_cleaned
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during completed items cleanup: {e}")
|
||||
return 0
|
||||
|
||||
async def cleanup_failed(
|
||||
self,
|
||||
failed: Dict[str, QueueItem],
|
||||
cleanup_cutoff: datetime
|
||||
) -> int:
|
||||
"""Clean up failed items"""
|
||||
start_time = datetime.utcnow()
|
||||
items_cleaned = 0
|
||||
space_freed = 0
|
||||
failed_count = len(failed)
|
||||
|
||||
try:
|
||||
# Determine cleanup approach
|
||||
if self.policy == CleanupPolicy.SIZE:
|
||||
items_to_clean = self._get_items_by_size(failed)
|
||||
elif self.policy == CleanupPolicy.HYBRID:
|
||||
items_to_clean = self._get_items_hybrid(failed, cleanup_cutoff)
|
||||
else: # AGE policy
|
||||
items_to_clean = self._get_items_by_age(failed, cleanup_cutoff)
|
||||
|
||||
# Clean items
|
||||
for url in items_to_clean:
|
||||
try:
|
||||
item = failed[url]
|
||||
space_freed += self._estimate_item_size(item)
|
||||
failed.pop(url)
|
||||
items_cleaned += 1
|
||||
except Exception as e:
|
||||
logger.error(f"Error cleaning failed item {url}: {e}")
|
||||
failed.pop(url)
|
||||
items_cleaned += 1
|
||||
|
||||
# Record cleanup
|
||||
self._record_cleanup_result(
|
||||
items_cleaned,
|
||||
space_freed,
|
||||
start_time,
|
||||
"failed"
|
||||
)
|
||||
|
||||
logger.debug(f"Cleaned {items_cleaned} failed items")
|
||||
return items_cleaned
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during failed items cleanup: {e}")
|
||||
return 0
|
||||
|
||||
def _get_items_by_age(
|
||||
self,
|
||||
items: Dict[str, QueueItem],
|
||||
cutoff: datetime
|
||||
) -> Set[str]:
|
||||
"""Get items to clean based on age"""
|
||||
to_clean = set()
|
||||
|
||||
for url, item in items.items():
|
||||
item.added_at = self._normalize_datetime(item.added_at)
|
||||
if item.added_at < cutoff:
|
||||
to_clean.add(url)
|
||||
|
||||
return to_clean
|
||||
|
||||
def _get_items_by_size(self, items: Dict[str, QueueItem]) -> Set[str]:
|
||||
"""Get items to clean based on size"""
|
||||
to_clean = set()
|
||||
total_size = 0
|
||||
|
||||
# Sort items by size estimate
|
||||
sorted_items = sorted(
|
||||
items.items(),
|
||||
key=lambda x: self._estimate_item_size(x[1]),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
for url, item in sorted_items:
|
||||
total_size += self._estimate_item_size(item)
|
||||
if total_size > self.thresholds.size_threshold:
|
||||
to_clean.add(url)
|
||||
|
||||
return to_clean
|
||||
|
||||
def _get_items_hybrid(
|
||||
self,
|
||||
items: Dict[str, QueueItem],
|
||||
cutoff: datetime
|
||||
) -> Set[str]:
|
||||
"""Get items to clean using hybrid approach"""
|
||||
by_age = self._get_items_by_age(items, cutoff)
|
||||
by_size = self._get_items_by_size(items)
|
||||
|
||||
if self.strategy == CleanupStrategy.AGGRESSIVE:
|
||||
return by_age.union(by_size)
|
||||
elif self.strategy == CleanupStrategy.CONSERVATIVE:
|
||||
return by_age.intersection(by_size)
|
||||
else: # BALANCED
|
||||
return by_age
|
||||
|
||||
def _estimate_item_size(self, item: QueueItem) -> int:
|
||||
"""Estimate size of an item in bytes"""
|
||||
# This could be enhanced with actual file size tracking
|
||||
base_size = 1024 # 1KB base size
|
||||
return base_size * (item.retry_count + 1)
|
||||
|
||||
def _record_cleanup_result(
|
||||
self,
|
||||
items_cleaned: int,
|
||||
space_freed: int,
|
||||
start_time: datetime,
|
||||
cleanup_type: str
|
||||
) -> None:
|
||||
"""Record cleanup result"""
|
||||
duration = (datetime.utcnow() - start_time).total_seconds()
|
||||
|
||||
result = CleanupResult(
|
||||
timestamp=datetime.utcnow(),
|
||||
items_cleaned=items_cleaned,
|
||||
space_freed=space_freed,
|
||||
duration=duration,
|
||||
strategy=self.strategy,
|
||||
policy=self.policy,
|
||||
details={"type": cleanup_type}
|
||||
)
|
||||
|
||||
self.tracker.record_cleanup(result)
|
||||
|
||||
def get_cleanup_cutoff(self) -> datetime:
|
||||
"""Get the cutoff time for cleanup"""
|
||||
if self.strategy == CleanupStrategy.AGGRESSIVE:
|
||||
age = self.thresholds.max_history_age // 2
|
||||
elif self.strategy == CleanupStrategy.CONSERVATIVE:
|
||||
age = self.thresholds.max_history_age * 2
|
||||
else: # BALANCED
|
||||
age = self.thresholds.max_history_age
|
||||
|
||||
return datetime.utcnow() - timedelta(seconds=max(
|
||||
age,
|
||||
self.thresholds.min_retention_time
|
||||
))
|
||||
|
||||
def format_cleanup_report(
|
||||
self,
|
||||
initial_completed: int,
|
||||
final_completed: int,
|
||||
initial_failed: int,
|
||||
final_failed: int
|
||||
) -> str:
|
||||
"""Format a cleanup report"""
|
||||
stats = self.tracker.get_stats()
|
||||
|
||||
return (
|
||||
f"History Cleanup Results:\n"
|
||||
f"- Completed items: {initial_completed} -> {final_completed}\n"
|
||||
f"- Failed items: {initial_failed} -> {final_failed}\n"
|
||||
f"- Total items cleaned: {(initial_completed - final_completed) + (initial_failed - final_failed)}\n"
|
||||
f"- Space freed: {stats['total_space_freed']} bytes\n"
|
||||
f"- Strategy: {self.strategy.value}\n"
|
||||
f"- Policy: {self.policy.value}\n"
|
||||
f"- Total cleanups: {stats['total_cleanups']}"
|
||||
)
|
||||
|
||||
def get_cleaner_stats(self) -> Dict[str, Any]:
|
||||
"""Get comprehensive cleaner statistics"""
|
||||
return {
|
||||
"strategy": self.strategy.value,
|
||||
"policy": self.policy.value,
|
||||
"thresholds": {
|
||||
"max_history_age": self.thresholds.max_history_age,
|
||||
"max_completed_items": self.thresholds.max_completed_items,
|
||||
"max_failed_items": self.thresholds.max_failed_items,
|
||||
"min_retention_time": self.thresholds.min_retention_time,
|
||||
"size_threshold": self.thresholds.size_threshold
|
||||
},
|
||||
"tracker": self.tracker.get_stats()
|
||||
}
|
||||
Reference in New Issue
Block a user