# thrillwiki_django_no_react/history_tracking/comparison.py

import asyncio
import itertools
import json
import time
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict, Any, Optional

from django.core.cache import cache
from django.db import models
from django.utils import timezone

from .models import VersionTag, ChangeSet

class StructuredDiff:
    """Result record for the comparison of one ordered pair of versions.

    Attributes:
        version1: Identifier of the first version in the pair.
        version2: Identifier of the second version in the pair.
        changes: List of change dictionaries; empty until populated.
        impact_score: Float score, 0.0 until computed.
        computation_time: Seconds spent computing the diff, 0.0 until set.
        timestamp: Creation time taken from ``timezone.now()``.
    """

    def __init__(self, version1: str, version2: str):
        self.version1, self.version2 = version1, version2
        self.changes: List[Dict[str, Any]] = []
        self.impact_score = 0.0
        self.computation_time = 0.0
        self.timestamp = timezone.now()

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict snapshot of this diff.

        Every attribute is copied as-is except ``timestamp``, which is
        emitted as an ISO 8601 string.
        """
        payload: Dict[str, Any] = {
            name: getattr(self, name)
            for name in (
                'version1',
                'version2',
                'changes',
                'impact_score',
                'computation_time',
            )
        }
        payload['timestamp'] = self.timestamp.isoformat()
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'StructuredDiff':
        """Rebuild a diff from a dictionary shaped like ``to_dict()`` output.

        Raises:
            KeyError: If any of the expected keys is missing from ``data``.
        """
        restored = cls(data['version1'], data['version2'])
        for name in ('changes', 'impact_score', 'computation_time'):
            setattr(restored, name, data[name])
        # The constructor stamped "now"; replace it with the stored moment.
        restored.timestamp = timezone.datetime.fromisoformat(data['timestamp'])
        return restored

class ComparisonEngine:
    """Compare version tags pairwise, with result caching and threaded ORM access.

    Finished comparisons are stored in the Django cache as JSON for
    ``cache_ttl`` seconds. Blocking ORM work is pushed onto a private
    four-worker ``ThreadPoolExecutor`` so the event loop is not blocked by it.

    NOTE(review): ``chunk_size`` is stored but not read by any method of this
    class, and the executor is never shut down here.
    """

    def __init__(self, chunk_size: int = 10485760):  # 10MB default chunk size
        self.chunk_size = chunk_size
        self.executor = ThreadPoolExecutor(max_workers=4)
        self.cache_ttl = 300  # 5 minutes cache TTL

    async def compare_versions(self, versions: List[str]) -> "List[StructuredDiff]":
        """Compare every unordered pair drawn from ``versions``.

        Pairs already present in the cache are deserialized; the remaining
        pairs are computed concurrently.

        Args:
            versions: List of version identifiers to compare.
        Returns:
            One StructuredDiff per pair, sorted by ``(version1, version2)``.
            A pair whose comparison failed is still returned; its ``changes``
            hold a single ``{'error': ...}`` entry.
        Raises:
            ValueError: If fewer than two versions are supplied.
        """
        if len(versions) < 2:
            raise ValueError("At least two versions required for comparison")

        results = []
        cache_misses = []

        # combinations() yields (versions[i], versions[j]) for i < j, in the
        # same order as a nested index loop would.
        for v1, v2 in itertools.combinations(versions, 2):
            cached_result = cache.get(self._get_cache_key(v1, v2))
            if cached_result:
                results.append(StructuredDiff.from_dict(json.loads(cached_result)))
            else:
                cache_misses.append((v1, v2))

        # Compute all cache misses concurrently.
        if cache_misses:
            new_results = await asyncio.gather(
                *(self._compare_version_pair(v1, v2) for v1, v2 in cache_misses)
            )
            results.extend(new_results)

        return sorted(results, key=lambda x: (x.version1, x.version2))

    def calculate_impact_score(self, diffs: "List[StructuredDiff]") -> float:
        """Score a set of diffs and return their mean impact.

        Each diff's ``impact_score`` attribute is overwritten with its own
        score as a side effect.

        Args:
            diffs: List of StructuredDiff objects.
        Returns:
            Float impact score (0-1); 0.0 for an empty list.
        """
        if not diffs:
            return 0.0

        total_score = 0.0
        weights = {
            'file_count': 0.3,
            'change_size': 0.3,
            'structural_impact': 0.4
        }

        for diff in diffs:
            # File count impact. Entries without a 'file' key (such as the
            # {'error': ...} placeholder written by _compare_version_pair)
            # are skipped instead of raising KeyError.
            file_count = len({c['file'] for c in diff.changes if 'file' in c})
            file_score = min(file_count / 100, 1.0)  # Normalize to max 100 files

            # Change size impact
            total_changes = sum(
                len(c.get('additions', [])) + len(c.get('deletions', []))
                for c in diff.changes
            )
            size_score = min(total_changes / 1000, 1.0)  # Normalize to max 1000 lines

            # Structural impact (e.g., function/class changes)
            structural_changes = sum(
                1 for c in diff.changes
                if c.get('type') in ('function', 'class', 'schema')
            )
            structural_score = min(structural_changes / 10, 1.0)  # Normalize to max 10 structural changes

            # Weighted average
            diff.impact_score = (
                weights['file_count'] * file_score +
                weights['change_size'] * size_score +
                weights['structural_impact'] * structural_score
            )
            total_score += diff.impact_score

        return total_score / len(diffs)

    async def _compare_version_pair(self, version1: str, version2: str) -> "StructuredDiff":
        """Build the diff for one pair of versions and cache it on success.

        Any exception raised while loading tags, collecting changes, scoring
        or caching (including a JSON serialization failure) is swallowed: the
        returned diff then carries ``changes == [{'error': <message>}]`` and an
        impact score of 0.0, and nothing is cached.
        """
        # Monotonic clock: elapsed time must not be affected by wall-clock
        # adjustments.
        start_time = time.perf_counter()

        diff = StructuredDiff(version1, version2)

        try:
            # The two lookups are independent, so run them concurrently.
            v1_tag, v2_tag = await asyncio.gather(
                self._get_version_tag(version1),
                self._get_version_tag(version2),
            )

            if not v1_tag or not v2_tag:
                raise ValueError("Version tag not found")

            diff.changes = await self._process_version_changes(v1_tag, v2_tag)

            diff.impact_score = self.calculate_impact_score([diff])

            # NOTE: the payload is serialized before computation_time is set
            # below, so entries read back from the cache report 0.0 for it.
            cache.set(
                self._get_cache_key(version1, version2),
                json.dumps(diff.to_dict()),
                self.cache_ttl
            )

        except Exception as e:
            # Deliberate best-effort: report the failure inside the result
            # instead of aborting the whole batch of comparisons.
            diff.changes = [{'error': str(e)}]
            diff.impact_score = 0.0

        diff.computation_time = time.perf_counter() - start_time
        return diff

    async def _get_version_tag(self, version: str) -> "Optional[VersionTag]":
        """Return the first VersionTag whose ``name`` equals ``version``, or None."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self.executor,
            lambda: VersionTag.objects.filter(name=version).first()
        )

    async def _process_version_changes(
        self,
        v1_tag: "VersionTag",
        v2_tag: "VersionTag"
    ) -> List[Dict[str, Any]]:
        """Collect the change entries of every changeset between two tags.

        Changesets are processed one after another, so the returned list
        follows the order returned by ``_get_changesets_between``.
        """
        changes: List[Dict[str, Any]] = []

        changesets = await self._get_changesets_between(v1_tag, v2_tag)

        for changeset in changesets:
            changes.extend(await self._process_changeset(changeset))

        return changes

    async def _get_changesets_between(
        self,
        v1_tag: "VersionTag",
        v2_tag: "VersionTag"
    ) -> "List[ChangeSet]":
        """Fetch applied changesets on ``v2_tag``'s branch between the two tags.

        Selects changesets with ``created_at`` after ``v1_tag.created_at`` and
        up to and including ``v2_tag.created_at``, oldest first.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self.executor,
            lambda: list(ChangeSet.objects.filter(
                branch=v2_tag.branch,
                created_at__gt=v1_tag.created_at,
                created_at__lte=v2_tag.created_at,
                status='applied'
            ).order_by('created_at'))
        )

    async def _process_changeset(self, changeset: "ChangeSet") -> List[Dict[str, Any]]:
        """Turn one changeset into a list of per-field change entries.

        Returns an empty list when the changeset has no historical instance
        or that instance reports no diff against its predecessor.
        """
        loop = asyncio.get_running_loop()

        def process():
            instance = changeset.historical_instance
            if not instance:
                return []

            # Used below as a mapping of field name -> {'old': ..., 'new': ...}.
            field_diff = instance.diff_against_previous
            if not field_diff:
                return []

            model_name = instance._meta.model_name
            return [
                {
                    'type': 'field',
                    'file': f"{model_name}.{field}",
                    'old_value': values['old'],
                    'new_value': values['new']
                }
                for field, values in field_diff.items()
            ]

        # Attribute access may hit the database, so keep it off the event loop.
        return await loop.run_in_executor(self.executor, process)

    def _get_cache_key(self, version1: str, version2: str) -> str:
        """Generate the cache key for an ordered pair of versions.

        The key is order-sensitive: ``(a, b)`` and ``(b, a)`` map to
        different keys.
        """
        return f"version_diff:{version1}:{version2}"