Files
thrillwiki_django_no_react/history_tracking/comparison.py

237 lines
8.4 KiB
Python

import asyncio
import itertools
import json
import logging
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import List, Dict, Any, Optional

from django.core.cache import cache
from django.db import models
from django.utils import timezone

from .models import VersionTag, ChangeSet
class StructuredDiff:
    """Result of comparing two versions.

    The object is a plain data holder that can be converted to and from a
    dictionary of JSON-friendly values via ``to_dict`` / ``from_dict``.
    """

    def __init__(
        self,
        version1: str,
        version2: str,
        timestamp: Optional[datetime] = None,
    ):
        """Create an empty diff between two version identifiers.

        Args:
            version1: Identifier of the first version.
            version2: Identifier of the second version.
            timestamp: Creation time of the diff. Defaults to
                ``timezone.now()`` when omitted.
        """
        self.version1 = version1
        self.version2 = version2
        # One dictionary per detected change; filled in by the caller.
        self.changes: List[Dict[str, Any]] = []
        self.impact_score = 0.0
        self.computation_time = 0.0
        self.timestamp = timezone.now() if timestamp is None else timestamp

    def to_dict(self) -> Dict[str, Any]:
        """Return the diff as a dictionary, with the timestamp in ISO format."""
        return {
            'version1': self.version1,
            'version2': self.version2,
            'changes': self.changes,
            'impact_score': self.impact_score,
            'computation_time': self.computation_time,
            'timestamp': self.timestamp.isoformat()
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'StructuredDiff':
        """Rebuild a diff from a dictionary produced by ``to_dict``.

        Args:
            data: Mapping with the keys emitted by ``to_dict``.

        Returns:
            A new ``StructuredDiff`` carrying the stored values.

        Raises:
            KeyError: If a required key is missing from ``data``.
            ValueError: If ``data['timestamp']`` is not a valid ISO string.
        """
        # Parse with the standard library directly instead of reaching for
        # ``timezone.datetime``, which is only an incidental re-export.
        diff = cls(
            data['version1'],
            data['version2'],
            timestamp=datetime.fromisoformat(data['timestamp']),
        )
        diff.changes = data['changes']
        diff.impact_score = data['impact_score']
        diff.computation_time = data['computation_time']
        return diff
class ComparisonEngine:
    """Handles version comparison operations with background processing and caching.

    Blocking lookups are dispatched to a thread pool and finished comparisons
    are stored in the Django cache as JSON strings.

    NOTE(review): the ORM is called from plain executor threads; Django's
    async documentation describes ``sync_to_async`` for this purpose --
    confirm that database connections used by these threads are cleaned up.
    """

    _logger = logging.getLogger(__name__)

    def __init__(self, chunk_size: int = 10485760):  # 10MB default chunk size
        """Set up the worker pool and cache settings.

        Args:
            chunk_size: Stored on the instance; not read by any method of
                this class.
        """
        self.chunk_size = chunk_size
        self.executor = ThreadPoolExecutor(max_workers=4)
        self.cache_ttl = 300  # 5 minutes cache TTL

    async def compare_versions(self, versions: List[str]) -> List["StructuredDiff"]:
        """
        Compare multiple versions, processing in background and using cache

        Every unordered pair of entries in ``versions`` is compared. A pair
        whose comparison fails is still returned, as a diff whose ``changes``
        is a single ``{'error': message}`` entry.

        Args:
            versions: List of version identifiers to compare

        Returns:
            List of StructuredDiff objects with comparison results, sorted by
            ``(version1, version2)``

        Raises:
            ValueError: If fewer than two versions are given.
        """
        if len(versions) < 2:
            raise ValueError("At least two versions required for comparison")

        results: List["StructuredDiff"] = []
        cache_misses = []

        # Check cache first; combinations() yields the same (i < j) pairs, in
        # the same order, as a nested index loop would.
        for v1, v2 in itertools.combinations(versions, 2):
            cached_diff = self._load_cached_diff(v1, v2)
            if cached_diff is not None:
                results.append(cached_diff)
            else:
                cache_misses.append((v1, v2))

        # Process cache misses concurrently
        if cache_misses:
            comparison_tasks = [
                self._compare_version_pair(v1, v2)
                for v1, v2 in cache_misses
            ]
            results.extend(await asyncio.gather(*comparison_tasks))

        return sorted(results, key=lambda x: (x.version1, x.version2))

    def calculate_impact_score(self, diffs: List["StructuredDiff"]) -> float:
        """
        Calculate impact score for a set of diffs

        Each diff's ``impact_score`` attribute is updated in place with its
        own score; the return value is the mean over all diffs. Change
        entries without a ``'file'`` key (such as the ``{'error': ...}``
        entry recorded for a failed comparison) do not count as files.

        Args:
            diffs: List of StructuredDiff objects

        Returns:
            Float impact score (0-1)
        """
        if not diffs:
            return 0.0

        weights = {
            'file_count': 0.3,
            'change_size': 0.3,
            'structural_impact': 0.4
        }
        structural_types = ('function', 'class', 'schema')
        total_score = 0.0

        for diff in diffs:
            # File count impact
            touched_files = {c['file'] for c in diff.changes if 'file' in c}
            file_score = min(len(touched_files) / 100, 1.0)  # Normalize to max 100 files

            # Change size impact; a missing or None list counts as empty
            total_changes = sum(
                len(c.get('additions') or []) + len(c.get('deletions') or [])
                for c in diff.changes
            )
            size_score = min(total_changes / 1000, 1.0)  # Normalize to max 1000 lines

            # Structural impact (e.g., function/class changes)
            structural_changes = sum(
                1 for c in diff.changes
                if c.get('type') in structural_types
            )
            structural_score = min(structural_changes / 10, 1.0)  # Normalize to max 10 structural changes

            # Weighted average
            diff.impact_score = (
                weights['file_count'] * file_score +
                weights['change_size'] * size_score +
                weights['structural_impact'] * structural_score
            )
            total_score += diff.impact_score

        return total_score / len(diffs)

    async def _compare_version_pair(self, version1: str, version2: str) -> "StructuredDiff":
        """Compare two versions in background.

        Never raises for comparison failures: the error message is stored in
        the returned diff's ``changes`` and the result is not cached.
        """
        start_time = timezone.now()
        # Create diff structure
        diff = StructuredDiff(version1, version2)

        try:
            # Get version data
            v1_tag = await self._get_version_tag(version1)
            v2_tag = await self._get_version_tag(version2)
            if v1_tag is None or v2_tag is None:
                raise ValueError("Version tag not found")

            diff.changes = await self._process_version_changes(v1_tag, v2_tag)
            # Calculate impact score
            diff.impact_score = self.calculate_impact_score([diff])
        except Exception as e:
            # Deliberate best-effort: report the failure inside the result.
            self._logger.exception(
                "Comparison of %s and %s failed", version1, version2
            )
            diff.changes = [{'error': str(e)}]
            diff.impact_score = 0.0
            diff.computation_time = (timezone.now() - start_time).total_seconds()
            return diff

        # Record the elapsed time BEFORE caching so the cached copy carries it.
        diff.computation_time = (timezone.now() - start_time).total_seconds()
        self._store_in_cache(diff)
        return diff

    def _load_cached_diff(self, version1: str, version2: str) -> Optional["StructuredDiff"]:
        """Return the cached diff for a pair, or None on a miss or bad entry."""
        cached_result = cache.get(self._get_cache_key(version1, version2))
        if not cached_result:
            return None
        try:
            return StructuredDiff.from_dict(json.loads(cached_result))
        except (ValueError, KeyError, TypeError):
            # Unreadable entry: fall back to recomputing the comparison.
            self._logger.warning(
                "Ignoring unreadable cache entry for %s and %s",
                version1, version2
            )
            return None

    def _store_in_cache(self, diff: "StructuredDiff") -> None:
        """Cache a finished diff as JSON; failures are logged, not raised."""
        try:
            cache.set(
                self._get_cache_key(diff.version1, diff.version2),
                json.dumps(diff.to_dict()),
                self.cache_ttl
            )
        except Exception:
            # Caching is an optimisation: values that are not JSON
            # serializable or a failing cache backend must not discard a
            # comparison that succeeded.
            self._logger.warning(
                "Could not cache diff for %s and %s",
                diff.version1, diff.version2,
                exc_info=True
            )

    async def _get_version_tag(self, version: str) -> Optional["VersionTag"]:
        """Get version tag by identifier (first tag whose name matches, or None)."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self.executor,
            lambda: VersionTag.objects.filter(name=version).first()
        )

    async def _process_version_changes(
        self,
        v1_tag: "VersionTag",
        v2_tag: "VersionTag"
    ) -> List[Dict[str, Any]]:
        """Collect the change entries of every changeset between two versions."""
        changes: List[Dict[str, Any]] = []
        # Get changesets between versions
        changesets = await self._get_changesets_between(v1_tag, v2_tag)
        for changeset in changesets:
            changes.extend(await self._process_changeset(changeset))
        return changes

    async def _get_changesets_between(
        self,
        v1_tag: "VersionTag",
        v2_tag: "VersionTag"
    ) -> List["ChangeSet"]:
        """Get all changesets between two versions.

        Selects changesets on ``v2_tag.branch`` with status ``'applied'``
        created after ``v1_tag.created_at`` and up to and including
        ``v2_tag.created_at``, ordered by creation time.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self.executor,
            lambda: list(ChangeSet.objects.filter(
                branch=v2_tag.branch,
                created_at__gt=v1_tag.created_at,
                created_at__lte=v2_tag.created_at,
                status='applied'
            ).order_by('created_at'))
        )

    async def _process_changeset(self, changeset: "ChangeSet") -> List[Dict[str, Any]]:
        """Process individual changeset for comparison.

        Returns one ``'field'`` change entry per item of the historical
        instance's ``diff_against_previous``; an empty list when there is no
        historical instance or no diff.
        """
        loop = asyncio.get_running_loop()

        def process() -> List[Dict[str, Any]]:
            instance = changeset.historical_instance
            if not instance:
                return []
            # Get changes from historical record
            # presumably a mapping of field name -> {'old': ..., 'new': ...};
            # verify against the historical model.
            diff = instance.diff_against_previous
            if not diff:
                return []
            return [
                {
                    'type': 'field',
                    'file': f"{instance._meta.model_name}.{field}",
                    'old_value': values['old'],
                    'new_value': values['new']
                }
                for field, values in diff.items()
            ]

        return await loop.run_in_executor(self.executor, process)

    def _get_cache_key(self, version1: str, version2: str) -> str:
        """Generate cache key for version comparison"""
        return f"version_diff:{version1}:{version2}"