mirror of
https://github.com/pacnpal/Pac-cogs.git
synced 2025-12-20 02:41:06 -05:00
337 lines
11 KiB
Python
337 lines
11 KiB
Python
"""Module for cleaning historical queue items"""
|
|
|
|
import logging
|
|
from enum import Enum
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict, Optional, List, Any, Set
|
|
from datetime import datetime, timedelta
|
|
|
|
from models import QueueItem
|
|
|
|
logger = logging.getLogger("HistoryCleaner")
|
|
|
|
class CleanupStrategy(Enum):
|
|
"""Cleanup strategies"""
|
|
AGGRESSIVE = "aggressive" # Remove more aggressively
|
|
CONSERVATIVE = "conservative" # Remove conservatively
|
|
BALANCED = "balanced" # Balance between retention and cleanup
|
|
|
|
class CleanupPolicy(Enum):
|
|
"""Cleanup policies"""
|
|
AGE = "age" # Clean based on age
|
|
SIZE = "size" # Clean based on size
|
|
HYBRID = "hybrid" # Consider both age and size
|
|
|
|
@dataclass
|
|
class CleanupThresholds:
|
|
"""Thresholds for cleanup operations"""
|
|
max_history_age: int = 43200 # 12 hours
|
|
max_completed_items: int = 10000
|
|
max_failed_items: int = 5000
|
|
min_retention_time: int = 3600 # 1 hour
|
|
size_threshold: int = 100 * 1024 * 1024 # 100MB
|
|
|
|
@dataclass
|
|
class CleanupResult:
|
|
"""Result of a cleanup operation"""
|
|
timestamp: datetime
|
|
items_cleaned: int
|
|
space_freed: int
|
|
duration: float
|
|
strategy: CleanupStrategy
|
|
policy: CleanupPolicy
|
|
details: Dict[str, Any] = field(default_factory=dict)
|
|
|
|
class CleanupTracker:
|
|
"""Tracks cleanup operations"""
|
|
|
|
def __init__(self, max_history: int = 1000):
|
|
self.max_history = max_history
|
|
self.history: List[CleanupResult] = []
|
|
self.total_items_cleaned = 0
|
|
self.total_space_freed = 0
|
|
self.last_cleanup: Optional[datetime] = None
|
|
|
|
def record_cleanup(self, result: CleanupResult) -> None:
|
|
"""Record a cleanup operation"""
|
|
self.history.append(result)
|
|
if len(self.history) > self.max_history:
|
|
self.history.pop(0)
|
|
|
|
self.total_items_cleaned += result.items_cleaned
|
|
self.total_space_freed += result.space_freed
|
|
self.last_cleanup = result.timestamp
|
|
|
|
def get_stats(self) -> Dict[str, Any]:
|
|
"""Get cleanup statistics"""
|
|
return {
|
|
"total_cleanups": len(self.history),
|
|
"total_items_cleaned": self.total_items_cleaned,
|
|
"total_space_freed": self.total_space_freed,
|
|
"last_cleanup": (
|
|
self.last_cleanup.isoformat()
|
|
if self.last_cleanup
|
|
else None
|
|
),
|
|
"recent_cleanups": [
|
|
{
|
|
"timestamp": r.timestamp.isoformat(),
|
|
"items_cleaned": r.items_cleaned,
|
|
"space_freed": r.space_freed,
|
|
"strategy": r.strategy.value,
|
|
"policy": r.policy.value
|
|
}
|
|
for r in self.history[-5:] # Last 5 cleanups
|
|
]
|
|
}
|
|
|
|
class HistoryCleaner:
|
|
"""Handles cleanup of historical queue items"""
|
|
|
|
def __init__(
|
|
self,
|
|
strategy: CleanupStrategy = CleanupStrategy.BALANCED,
|
|
policy: CleanupPolicy = CleanupPolicy.HYBRID,
|
|
thresholds: Optional[CleanupThresholds] = None
|
|
):
|
|
self.strategy = strategy
|
|
self.policy = policy
|
|
self.thresholds = thresholds or CleanupThresholds()
|
|
self.tracker = CleanupTracker()
|
|
|
|
def _normalize_datetime(self, dt_value: any) -> datetime:
|
|
"""Normalize a datetime value"""
|
|
current_time = datetime.utcnow()
|
|
|
|
if not isinstance(dt_value, datetime):
|
|
try:
|
|
if isinstance(dt_value, str):
|
|
return datetime.fromisoformat(dt_value)
|
|
else:
|
|
return current_time
|
|
except (ValueError, TypeError):
|
|
return current_time
|
|
return dt_value
|
|
|
|
async def cleanup_completed(
|
|
self,
|
|
completed: Dict[str, QueueItem],
|
|
cleanup_cutoff: datetime
|
|
) -> int:
|
|
"""Clean up completed items"""
|
|
start_time = datetime.utcnow()
|
|
items_cleaned = 0
|
|
space_freed = 0
|
|
completed_count = len(completed)
|
|
|
|
try:
|
|
# Determine cleanup approach based on strategy and policy
|
|
if self.policy == CleanupPolicy.SIZE:
|
|
items_to_clean = self._get_items_by_size(completed)
|
|
elif self.policy == CleanupPolicy.HYBRID:
|
|
items_to_clean = self._get_items_hybrid(completed, cleanup_cutoff)
|
|
else: # AGE policy
|
|
items_to_clean = self._get_items_by_age(completed, cleanup_cutoff)
|
|
|
|
# Clean items
|
|
for url in items_to_clean:
|
|
try:
|
|
item = completed[url]
|
|
space_freed += self._estimate_item_size(item)
|
|
completed.pop(url)
|
|
items_cleaned += 1
|
|
except Exception as e:
|
|
logger.error(f"Error cleaning completed item {url}: {e}")
|
|
completed.pop(url)
|
|
items_cleaned += 1
|
|
|
|
# Record cleanup
|
|
self._record_cleanup_result(
|
|
items_cleaned,
|
|
space_freed,
|
|
start_time,
|
|
"completed"
|
|
)
|
|
|
|
logger.debug(f"Cleaned {items_cleaned} completed items")
|
|
return items_cleaned
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error during completed items cleanup: {e}")
|
|
return 0
|
|
|
|
async def cleanup_failed(
|
|
self,
|
|
failed: Dict[str, QueueItem],
|
|
cleanup_cutoff: datetime
|
|
) -> int:
|
|
"""Clean up failed items"""
|
|
start_time = datetime.utcnow()
|
|
items_cleaned = 0
|
|
space_freed = 0
|
|
failed_count = len(failed)
|
|
|
|
try:
|
|
# Determine cleanup approach
|
|
if self.policy == CleanupPolicy.SIZE:
|
|
items_to_clean = self._get_items_by_size(failed)
|
|
elif self.policy == CleanupPolicy.HYBRID:
|
|
items_to_clean = self._get_items_hybrid(failed, cleanup_cutoff)
|
|
else: # AGE policy
|
|
items_to_clean = self._get_items_by_age(failed, cleanup_cutoff)
|
|
|
|
# Clean items
|
|
for url in items_to_clean:
|
|
try:
|
|
item = failed[url]
|
|
space_freed += self._estimate_item_size(item)
|
|
failed.pop(url)
|
|
items_cleaned += 1
|
|
except Exception as e:
|
|
logger.error(f"Error cleaning failed item {url}: {e}")
|
|
failed.pop(url)
|
|
items_cleaned += 1
|
|
|
|
# Record cleanup
|
|
self._record_cleanup_result(
|
|
items_cleaned,
|
|
space_freed,
|
|
start_time,
|
|
"failed"
|
|
)
|
|
|
|
logger.debug(f"Cleaned {items_cleaned} failed items")
|
|
return items_cleaned
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error during failed items cleanup: {e}")
|
|
return 0
|
|
|
|
def _get_items_by_age(
|
|
self,
|
|
items: Dict[str, QueueItem],
|
|
cutoff: datetime
|
|
) -> Set[str]:
|
|
"""Get items to clean based on age"""
|
|
to_clean = set()
|
|
|
|
for url, item in items.items():
|
|
item.added_at = self._normalize_datetime(item.added_at)
|
|
if item.added_at < cutoff:
|
|
to_clean.add(url)
|
|
|
|
return to_clean
|
|
|
|
def _get_items_by_size(self, items: Dict[str, QueueItem]) -> Set[str]:
|
|
"""Get items to clean based on size"""
|
|
to_clean = set()
|
|
total_size = 0
|
|
|
|
# Sort items by size estimate
|
|
sorted_items = sorted(
|
|
items.items(),
|
|
key=lambda x: self._estimate_item_size(x[1]),
|
|
reverse=True
|
|
)
|
|
|
|
for url, item in sorted_items:
|
|
total_size += self._estimate_item_size(item)
|
|
if total_size > self.thresholds.size_threshold:
|
|
to_clean.add(url)
|
|
|
|
return to_clean
|
|
|
|
def _get_items_hybrid(
|
|
self,
|
|
items: Dict[str, QueueItem],
|
|
cutoff: datetime
|
|
) -> Set[str]:
|
|
"""Get items to clean using hybrid approach"""
|
|
by_age = self._get_items_by_age(items, cutoff)
|
|
by_size = self._get_items_by_size(items)
|
|
|
|
if self.strategy == CleanupStrategy.AGGRESSIVE:
|
|
return by_age.union(by_size)
|
|
elif self.strategy == CleanupStrategy.CONSERVATIVE:
|
|
return by_age.intersection(by_size)
|
|
else: # BALANCED
|
|
return by_age
|
|
|
|
def _estimate_item_size(self, item: QueueItem) -> int:
|
|
"""Estimate size of an item in bytes"""
|
|
# This could be enhanced with actual file size tracking
|
|
base_size = 1024 # 1KB base size
|
|
return base_size * (item.retry_count + 1)
|
|
|
|
def _record_cleanup_result(
|
|
self,
|
|
items_cleaned: int,
|
|
space_freed: int,
|
|
start_time: datetime,
|
|
cleanup_type: str
|
|
) -> None:
|
|
"""Record cleanup result"""
|
|
duration = (datetime.utcnow() - start_time).total_seconds()
|
|
|
|
result = CleanupResult(
|
|
timestamp=datetime.utcnow(),
|
|
items_cleaned=items_cleaned,
|
|
space_freed=space_freed,
|
|
duration=duration,
|
|
strategy=self.strategy,
|
|
policy=self.policy,
|
|
details={"type": cleanup_type}
|
|
)
|
|
|
|
self.tracker.record_cleanup(result)
|
|
|
|
def get_cleanup_cutoff(self) -> datetime:
|
|
"""Get the cutoff time for cleanup"""
|
|
if self.strategy == CleanupStrategy.AGGRESSIVE:
|
|
age = self.thresholds.max_history_age // 2
|
|
elif self.strategy == CleanupStrategy.CONSERVATIVE:
|
|
age = self.thresholds.max_history_age * 2
|
|
else: # BALANCED
|
|
age = self.thresholds.max_history_age
|
|
|
|
return datetime.utcnow() - timedelta(seconds=max(
|
|
age,
|
|
self.thresholds.min_retention_time
|
|
))
|
|
|
|
def format_cleanup_report(
|
|
self,
|
|
initial_completed: int,
|
|
final_completed: int,
|
|
initial_failed: int,
|
|
final_failed: int
|
|
) -> str:
|
|
"""Format a cleanup report"""
|
|
stats = self.tracker.get_stats()
|
|
|
|
return (
|
|
f"History Cleanup Results:\n"
|
|
f"- Completed items: {initial_completed} -> {final_completed}\n"
|
|
f"- Failed items: {initial_failed} -> {final_failed}\n"
|
|
f"- Total items cleaned: {(initial_completed - final_completed) + (initial_failed - final_failed)}\n"
|
|
f"- Space freed: {stats['total_space_freed']} bytes\n"
|
|
f"- Strategy: {self.strategy.value}\n"
|
|
f"- Policy: {self.policy.value}\n"
|
|
f"- Total cleanups: {stats['total_cleanups']}"
|
|
)
|
|
|
|
def get_cleaner_stats(self) -> Dict[str, Any]:
|
|
"""Get comprehensive cleaner statistics"""
|
|
return {
|
|
"strategy": self.strategy.value,
|
|
"policy": self.policy.value,
|
|
"thresholds": {
|
|
"max_history_age": self.thresholds.max_history_age,
|
|
"max_completed_items": self.thresholds.max_completed_items,
|
|
"max_failed_items": self.thresholds.max_failed_items,
|
|
"min_retention_time": self.thresholds.min_retention_time,
|
|
"size_threshold": self.thresholds.size_threshold
|
|
},
|
|
"tracker": self.tracker.get_stats()
|
|
}
|