# Pac-cogs/videoarchiver/queue/metrics_manager.py

"""Module for managing queue metrics"""

import copy
import json
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from typing import Dict, Optional, List, Any, Set

logger = logging.getLogger("QueueMetricsManager")

class MetricCategory(Enum):
    """Categories of metrics.

    The string values of PROCESSING, ERRORS, PERFORMANCE and ACTIVITY are used
    as top-level keys of the dict built by ``QueueMetricsManager.get_metrics``
    (and looked up again by ``restore_metrics``), so changing a value changes
    the saved-file format.
    """
    PROCESSING = "processing"  # counts, success rate, average processing time
    PERFORMANCE = "performance"  # peaks, failure counters, average wait time
    ERRORS = "errors"  # per-type / per-category error counts, recent errors
    HARDWARE = "hardware"  # not referenced elsewhere in the visible module
    MEMORY = "memory"  # not referenced elsewhere in the visible module
    ACTIVITY = "activity"  # time since last activity, last cleanup timestamp

class ErrorCategory(Enum):
    """Categories of errors.

    ``ErrorMetrics._categorize_error`` assigns a category by looking for
    keywords in the lower-cased error message (keywords noted per member
    below). The string value is what gets written into recent-error records
    and into the exported ``errors_by_category`` mapping.
    """
    NETWORK = "network"  # "network", "connection", "dns"
    TIMEOUT = "timeout"  # "timeout"
    PERMISSION = "permission"  # "permission", "access", "denied"
    MEMORY = "memory"  # "memory"
    HARDWARE = "hardware"  # "hardware"
    COMPRESSION = "compression"  # "compression"
    STORAGE = "storage"  # "disk", "storage", "space"
    OTHER = "other"  # fallback when no keyword matches

@dataclass
class ProcessingMetrics:
    """Running totals and averages for processed queue items.

    Attributes:
        total_processed: Number of items recorded through ``update``.
        total_failed: Number of those items recorded with ``success=False``.
        success_rate: Fraction (0.0-1.0) of recorded items that succeeded.
        avg_processing_time: Mean of all processing times recorded so far.
        _total_processing_time: Sum of the recorded processing times; the
            accumulator behind ``avg_processing_time``.
        _processing_count: Number of samples in ``_total_processing_time``.
    """
    total_processed: int = 0
    total_failed: int = 0
    success_rate: float = 0.0
    avg_processing_time: float = 0.0
    _total_processing_time: float = 0.0
    _processing_count: int = 0

    def __post_init__(self) -> None:
        """Seed the private accumulators when only public totals were given.

        An instance rebuilt from saved data receives ``total_processed`` and
        ``avg_processing_time`` but not the private accumulators. Without
        seeding, the first ``update`` would replace the restored average with
        the single new sample instead of continuing the running mean.
        """
        if self._processing_count == 0 and self.total_processed > 0:
            self._processing_count = self.total_processed
            self._total_processing_time = (
                self.avg_processing_time * self.total_processed
            )

    def update(self, processing_time: float, success: bool) -> None:
        """Record one processed item and refresh the derived figures.

        Args:
            processing_time: Time spent on the item; added to the running
                sum used for ``avg_processing_time``.
            success: ``False`` counts the item as failed.
        """
        self.total_processed += 1
        if not success:
            self.total_failed += 1

        self._total_processing_time += processing_time
        self._processing_count += 1

        # Both divisors were just incremented; the guards only protect against
        # instances constructed with inconsistent (e.g. negative) counters.
        succeeded = self.total_processed - self.total_failed
        self.success_rate = (
            succeeded / self.total_processed if self.total_processed > 0 else 0.0
        )
        self.avg_processing_time = (
            self._total_processing_time / self._processing_count
            if self._processing_count > 0
            else 0.0
        )

@dataclass
class ErrorMetrics:
    """Error counters plus a bounded log of the most recent errors.

    Attributes:
        errors_by_type: Occurrence count keyed by the exact error message.
        errors_by_category: Occurrence count keyed by ``ErrorCategory``.
        recent_errors: Newest-last list of ``{"error", "category",
            "timestamp"}`` records, capped at ``max_recent_errors``.
        error_patterns: Occurrence count keyed by the shortened message
            produced by ``_extract_error_pattern``.
        max_recent_errors: Maximum number of entries kept in
            ``recent_errors``.
    """
    errors_by_type: Dict[str, int] = field(default_factory=dict)
    errors_by_category: Dict[ErrorCategory, int] = field(default_factory=dict)
    recent_errors: List[Dict[str, Any]] = field(default_factory=list)
    error_patterns: Dict[str, int] = field(default_factory=dict)
    max_recent_errors: int = 100

    def record_error(self, error: str, category: Optional[ErrorCategory] = None) -> None:
        """Record one occurrence of ``error`` in every counter.

        Args:
            error: The error message.
            category: Category to file the error under; when ``None`` it is
                derived from the message by ``_categorize_error``.
        """
        # Track by exact error
        self.errors_by_type[error] = self.errors_by_type.get(error, 0) + 1

        # Track by category
        if category is None:
            category = self._categorize_error(error)
        self.errors_by_category[category] = self.errors_by_category.get(category, 0) + 1

        # Track recent errors (timestamp is naive UTC in ISO-8601 form)
        self.recent_errors.append({
            "error": error,
            "category": category.value,
            "timestamp": datetime.utcnow().isoformat()
        })
        # Drop however many of the oldest entries exceed the cap. Removing a
        # single entry is not enough when the list arrived over-long (restored
        # from a file, or the cap was lowered). Deleting in place keeps any
        # outside reference to the list valid.
        overflow = len(self.recent_errors) - self.max_recent_errors
        if overflow > 0:
            del self.recent_errors[:overflow]

        # Update error patterns
        pattern = self._extract_error_pattern(error)
        self.error_patterns[pattern] = self.error_patterns.get(pattern, 0) + 1

    def _categorize_error(self, error: str) -> ErrorCategory:
        """Map an error message to a category by keyword search.

        The message is lower-cased and checked against the keyword groups
        below in order; the first group with a matching substring wins, so
        the order is significant (e.g. "connection timeout" is NETWORK).
        Returns ``ErrorCategory.OTHER`` when nothing matches.
        """
        error_lower = error.lower()

        keyword_table = (
            (("network", "connection", "dns"), ErrorCategory.NETWORK),
            (("timeout",), ErrorCategory.TIMEOUT),
            (("permission", "access", "denied"), ErrorCategory.PERMISSION),
            (("memory",), ErrorCategory.MEMORY),
            (("hardware",), ErrorCategory.HARDWARE),
            (("compression",), ErrorCategory.COMPRESSION),
            (("disk", "storage", "space"), ErrorCategory.STORAGE),
        )
        for keywords, matched_category in keyword_table:
            if any(word in error_lower for word in keywords):
                return matched_category
        return ErrorCategory.OTHER

    def _extract_error_pattern(self, error: str) -> str:
        """Return a shortened form of ``error`` used to group similar messages.

        Messages longer than five whitespace-separated words are reduced to
        their first five words followed by "..."; shorter messages are
        returned unchanged.
        """
        # This could be enhanced with regex or more sophisticated pattern matching
        words = error.split()
        if len(words) > 5:
            return " ".join(words[:5]) + "..."
        return error

@dataclass
class PerformanceMetrics:
    """Peak values, failure counters and average queue wait time.

    ``_total_wait_time`` and ``_wait_count`` are the accumulators from which
    ``avg_queue_wait_time`` is recomputed on every ``record_wait_time`` call.
    """
    peak_memory_usage: float = 0.0
    compression_failures: int = 0
    hardware_accel_failures: int = 0
    peak_queue_size: int = 0
    peak_processing_time: float = 0.0
    avg_queue_wait_time: float = 0.0
    _total_wait_time: float = 0.0
    _wait_count: int = 0

    def update_memory(self, memory_usage: float) -> None:
        """Raise ``peak_memory_usage`` if ``memory_usage`` exceeds it."""
        if memory_usage > self.peak_memory_usage:
            self.peak_memory_usage = memory_usage

    def record_wait_time(self, wait_time: float) -> None:
        """Add one wait-time sample and recompute the running average."""
        self._wait_count += 1
        self._total_wait_time += wait_time

        if self._wait_count > 0:
            self.avg_queue_wait_time = self._total_wait_time / self._wait_count
        else:
            self.avg_queue_wait_time = 0.0

class MetricAggregator:
    """Keeps hourly snapshots and per-day averages of queue metrics.

    Attributes:
        max_history: Maximum number of hourly snapshots retained.
        max_daily_history: Maximum number of daily averages retained.
        hourly_metrics: Oldest-first list of
            ``{"timestamp": iso_string, "metrics": dict}`` snapshots.
        daily_metrics: Oldest-first list of
            ``{"date": iso_date, "metrics": dict}`` daily averages.
        last_aggregation: Naive-UTC time of the last hourly snapshot (or of
            construction when no snapshot has been taken yet).
    """

    def __init__(self, max_history: int = 1000, max_daily_history: int = 30):
        self.max_history = max_history
        self.max_daily_history = max_daily_history
        self.hourly_metrics: List[Dict[str, Any]] = []
        self.daily_metrics: List[Dict[str, Any]] = []
        self.last_aggregation = datetime.utcnow()
        # Calendar day whose hourly snapshots have not been rolled up yet.
        self._rollup_day = self.last_aggregation.date()

    def aggregate_metrics(self, current_metrics: Dict[str, Any]) -> None:
        """Take an hourly snapshot and/or roll up the previous day if due.

        Safe to call on every metrics update: a snapshot is stored only when
        at least one hour has passed since the previous one, and a daily
        average is produced only on the first call of a new UTC day.

        Args:
            current_metrics: Current metrics mapping. A top-level
                ``"history"`` entry is ignored, because the manager in this
                module embeds this aggregator's own history lists there and
                storing them would make the history contain itself.
        """
        now = datetime.utcnow()

        # Hourly aggregation. ``last_aggregation`` advances only when a
        # snapshot is stored; advancing it on every call would keep the
        # elapsed time below one hour whenever calls are frequent.
        if now - self.last_aggregation >= timedelta(hours=1):
            # Deep copy so later mutation of the caller's live counters does
            # not rewrite snapshots that are already stored.
            snapshot = copy.deepcopy({
                key: value
                for key, value in current_metrics.items()
                if key != "history"
            })
            self.hourly_metrics.append({
                "timestamp": now.isoformat(),
                "metrics": snapshot
            })
            self._trim(self.hourly_metrics, self.max_history)
            self.last_aggregation = now

        # Daily aggregation: on the first call of a new day, average the
        # snapshots of the day that just ended.
        if now.date() > self._rollup_day:
            self.daily_metrics.append(
                self._calculate_daily_average(self.hourly_metrics, self._rollup_day)
            )
            self._trim(self.daily_metrics, self.max_daily_history)
            self._rollup_day = now.date()

    @staticmethod
    def _trim(history: List[Dict[str, Any]], limit: int) -> None:
        """Drop the oldest entries of ``history`` in place down to ``limit``."""
        overflow = len(history) - limit
        if overflow > 0:
            del history[:overflow]

    def _calculate_daily_average(
        self,
        metrics: List[Dict[str, Any]],
        date: datetime.date
    ) -> Dict[str, Any]:
        """Average the snapshots in ``metrics`` that fall on ``date``.

        Args:
            metrics: Snapshots shaped like the entries of ``hourly_metrics``.
            date: Calendar day (a date object) to select snapshots for.

        Returns:
            ``{"date": iso_date, "metrics": averaged}``; ``averaged`` is empty
            when no snapshot matches. Numeric values are averaged, nested
            dicts are averaged recursively, and any other value is taken from
            the latest snapshot that has it.
        """
        day_snapshots = [
            m["metrics"] for m in metrics
            if datetime.fromisoformat(m["timestamp"]).date() == date
        ]

        return {
            "date": date.isoformat(),
            "metrics": self._average_mappings(day_snapshots) if day_snapshots else {}
        }

    @classmethod
    def _average_mappings(cls, mappings: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Combine oldest-first mappings key by key (see ``_calculate_daily_average``)."""
        # Union of keys in first-seen order, so a key that first appears later
        # in the day (e.g. a new error type) is not lost.
        ordered_keys: Dict[str, None] = {}
        for mapping in mappings:
            for key in mapping:
                ordered_keys.setdefault(key, None)

        combined: Dict[str, Any] = {}
        for key in ordered_keys:
            values = [mapping[key] for mapping in mappings if key in mapping]
            if all(isinstance(value, dict) for value in values):
                combined[key] = cls._average_mappings(values)
            elif all(isinstance(value, (int, float)) for value in values):
                # Averaged over the snapshots that actually contain the key.
                combined[key] = sum(values) / len(values)
            else:
                combined[key] = values[-1]
        return combined

class QueueMetricsManager:
    """Manages metrics collection and reporting for the queue system.

    Owns one ``ProcessingMetrics``, ``ErrorMetrics``, ``PerformanceMetrics``
    and ``MetricAggregator`` and exposes them as a single nested dict through
    ``get_metrics``. Can save that dict to, and restore it from, a JSON file.
    """

    def __init__(self):
        self.processing = ProcessingMetrics()
        self.errors = ErrorMetrics()
        self.performance = PerformanceMetrics()
        self.aggregator = MetricAggregator()
        self.last_activity = time.time()
        self.last_cleanup = datetime.utcnow()

    def update(
        self,
        processing_time: float,
        success: bool,
        error: Optional[str] = None
    ) -> None:
        """Update metrics with new processing information.

        Args:
            processing_time: Time spent processing the item.
            success: Whether processing succeeded.
            error: Error message, if any; it is recorded and also checked for
                hardware-acceleration / compression failure keywords.

        Any exception raised while updating is logged and swallowed so that
        metrics bookkeeping never interrupts the caller.
        """
        try:
            # Update processing metrics
            self.processing.update(processing_time, success)

            # Update error tracking
            if error:
                self.errors.record_error(error)

                # Track specific failures
                lowered = error.lower()
                if "hardware acceleration" in lowered:
                    self.performance.hardware_accel_failures += 1
                elif "compression" in lowered:
                    self.performance.compression_failures += 1

            # Update activity timestamp
            self.last_activity = time.time()

            # Aggregate metrics. The "history" section is left out: it holds
            # the aggregator's own lists, and storing them inside a snapshot
            # would make the history contain itself (json.dump then fails
            # with a circular-reference error).
            snapshot = self.get_metrics()
            snapshot.pop("history", None)
            self.aggregator.aggregate_metrics(snapshot)

        except Exception as e:
            logger.error("Error updating metrics: %s", e)

    def get_metrics(self) -> Dict[str, Any]:
        """Get current metrics.

        Returns:
            A new dict keyed by ``MetricCategory`` values plus ``"history"``.
            The inner error and history containers are the live objects held
            by this manager, not copies; copy them before mutating.
            ``"last_activity"`` is the number of seconds elapsed since the
            last ``update`` (or construction/reset), not a timestamp.
        """
        return {
            MetricCategory.PROCESSING.value: {
                "total_processed": self.processing.total_processed,
                "total_failed": self.processing.total_failed,
                "success_rate": self.processing.success_rate,
                "avg_processing_time": self.processing.avg_processing_time
            },
            MetricCategory.ERRORS.value: {
                "errors_by_type": self.errors.errors_by_type,
                # Enum keys are not JSON-serialisable; export their values.
                "errors_by_category": {
                    cat.value: count
                    for cat, count in self.errors.errors_by_category.items()
                },
                "error_patterns": self.errors.error_patterns,
                "recent_errors": self.errors.recent_errors
            },
            MetricCategory.PERFORMANCE.value: {
                "peak_memory_usage": self.performance.peak_memory_usage,
                "compression_failures": self.performance.compression_failures,
                "hardware_accel_failures": self.performance.hardware_accel_failures,
                "peak_queue_size": self.performance.peak_queue_size,
                "avg_queue_wait_time": self.performance.avg_queue_wait_time
            },
            MetricCategory.ACTIVITY.value: {
                "last_activity": time.time() - self.last_activity,
                "last_cleanup": self.last_cleanup.isoformat()
            },
            "history": {
                "hourly": self.aggregator.hourly_metrics,
                "daily": self.aggregator.daily_metrics
            }
        }

    def update_memory_usage(self, memory_usage: float) -> None:
        """Update peak memory usage"""
        self.performance.update_memory(memory_usage)

    def update_cleanup_time(self) -> None:
        """Update last cleanup timestamp"""
        self.last_cleanup = datetime.utcnow()

    def reset_metrics(self) -> None:
        """Reset processing, error and performance metrics and timestamps.

        The aggregator's hourly/daily history is left untouched.
        """
        self.processing = ProcessingMetrics()
        self.errors = ErrorMetrics()
        self.performance = PerformanceMetrics()
        self.last_activity = time.time()
        self.last_cleanup = datetime.utcnow()

    def save_metrics(self, file_path: str) -> None:
        """Save metrics to ``file_path`` as JSON; failures are logged, not raised."""
        try:
            metrics = self.get_metrics()
            with open(file_path, 'w', encoding="utf-8") as f:
                json.dump(metrics, f, indent=2)
        except Exception as e:
            logger.error("Error saving metrics: %s", e)

    def load_metrics(self, file_path: str) -> None:
        """Load metrics from a JSON file; failures are logged, not raised."""
        try:
            with open(file_path, 'r', encoding="utf-8") as f:
                metrics = json.load(f)
            self.restore_metrics(metrics)
        except Exception as e:
            logger.error("Error loading metrics: %s", e)

    def restore_metrics(self, metrics_data: Dict[str, Any]) -> None:
        """Restore metrics from saved data.

        Args:
            metrics_data: Mapping in the shape produced by ``get_metrics``;
                missing sections or fields fall back to their defaults.

        The restore is all-or-nothing: every replacement object is built
        first and only then assigned, so a failure (which is logged, not
        raised) leaves the current metrics unchanged.
        """
        try:
            # Restore processing metrics
            proc_data = metrics_data.get(MetricCategory.PROCESSING.value, {})
            total_processed = proc_data.get("total_processed", 0)
            avg_processing_time = proc_data.get("avg_processing_time", 0.0)
            processing = ProcessingMetrics(
                total_processed=total_processed,
                total_failed=proc_data.get("total_failed", 0),
                success_rate=proc_data.get("success_rate", 0.0),
                avg_processing_time=avg_processing_time,
                # Rebuild the accumulators so the next update continues the
                # running average instead of restarting it from one sample.
                _total_processing_time=avg_processing_time * total_processed,
                _processing_count=total_processed
            )

            # Restore error metrics
            error_data = metrics_data.get(MetricCategory.ERRORS.value, {})
            errors_by_category: Dict[ErrorCategory, int] = {}
            for name, count in error_data.get("errors_by_category", {}).items():
                try:
                    category = ErrorCategory[name.upper()]
                except KeyError:
                    # A category unknown to this version must not abort the
                    # whole restore; keep its count under OTHER.
                    logger.warning("Unknown error category in saved metrics: %s", name)
                    category = ErrorCategory.OTHER
                errors_by_category[category] = errors_by_category.get(category, 0) + count
            errors = ErrorMetrics(
                errors_by_type=error_data.get("errors_by_type", {}),
                errors_by_category=errors_by_category,
                error_patterns=error_data.get("error_patterns", {}),
                recent_errors=error_data.get("recent_errors", [])
            )

            # Restore performance metrics
            perf_data = metrics_data.get(MetricCategory.PERFORMANCE.value, {})
            performance = PerformanceMetrics(
                peak_memory_usage=perf_data.get("peak_memory_usage", 0.0),
                compression_failures=perf_data.get("compression_failures", 0),
                hardware_accel_failures=perf_data.get("hardware_accel_failures", 0),
                peak_queue_size=perf_data.get("peak_queue_size", 0),
                avg_queue_wait_time=perf_data.get("avg_queue_wait_time", 0.0)
            )

            # Restore history
            history = metrics_data.get("history", {})
            hourly = history.get("hourly", [])
            daily = history.get("daily", [])

            # Everything parsed successfully: commit.
            self.processing = processing
            self.errors = errors
            self.performance = performance
            self.aggregator.hourly_metrics = hourly
            self.aggregator.daily_metrics = daily

        except Exception as e:
            logger.error("Error restoring metrics: %s", e)