Add automated data retention cleanup

Implements an edge function, Django Celery tasks, and UI hooks/panels for automated retention cleanup of old metrics, anomalies, alerts, and incidents, plus updates to query keys and the monitoring dashboard to reflect data-retention workflows.
gpt-engineer-app[bot]
2025-11-11 02:21:27 +00:00
parent 07fdfe34f3
commit 915a9fe2df
9 changed files with 589 additions and 0 deletions

View File

@@ -136,6 +136,24 @@ SELECT cron.schedule(
);
```
### 5. Data Retention Cleanup Setup
The `data-retention-cleanup` edge function should run daily:
```sql
SELECT cron.schedule(
  'data-retention-cleanup-daily',
  '0 3 * * *', -- Daily at 3:00 AM
  $$
  SELECT net.http_post(
    url:='https://api.thrillwiki.com/functions/v1/data-retention-cleanup',
    headers:='{"Content-Type": "application/json", "Authorization": "Bearer YOUR_ANON_KEY"}'::jsonb,
    body:=concat('{"time": "', now(), '"}')::jsonb
  ) as request_id;
  $$
);
```
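Once scheduled, the job can be confirmed from the `cron.job` catalog that pg_cron maintains, for example in the SQL editor:
```sql
-- Check that the daily cleanup job is registered and active
SELECT jobid, jobname, schedule, active
FROM cron.job
WHERE jobname = 'data-retention-cleanup-daily';
```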
## Metrics Collected
### Django Metrics
@@ -154,6 +172,35 @@ SELECT cron.schedule(
- `submission_approval_rate`: Percentage of approved submissions (workflow)
- `avg_moderation_time`: Average time to moderate in minutes (workflow)
## Data Retention Policies
The system automatically cleans up old data to manage database size:
### Retention Periods
- **Metrics** (`metric_time_series`): 30 days
- **Anomaly Detections**: 30 days (resolved anomalies are archived after 7 days)
- **Resolved Alerts**: 90 days
- **Resolved Incidents**: 90 days
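These windows translate directly into `interval` predicates. As a rough sanity check, the sketch below previews what a 30-day metrics cleanup would remove; the `created_at` timestamp column is an assumption, since the table definition is not part of this commit:
```sql
-- Hypothetical preview of rows a 30-day metrics cleanup would delete
-- (assumes metric_time_series has a created_at timestamp column)
SELECT count(*)
FROM metric_time_series
WHERE created_at < now() - interval '30 days';
```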
### Cleanup Functions
The following database functions manage data retention:
1. **`cleanup_old_metrics(retention_days)`**: Deletes metrics older than specified days (default: 30)
2. **`cleanup_old_anomalies(retention_days)`**: Archives resolved anomalies and deletes old unresolved ones (default: 30)
3. **`cleanup_old_alerts(retention_days)`**: Deletes old resolved alerts (default: 90)
4. **`cleanup_old_incidents(retention_days)`**: Deletes old resolved incidents (default: 90)
5. **`run_data_retention_cleanup()`**: Master function that runs all cleanup operations
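Each function can also be invoked manually from the SQL editor, for example to force an immediate cleanup or to test a non-default retention window; the calls below follow the signatures listed above:
```sql
-- Manual invocations; omitting the argument uses each function's documented default
SELECT cleanup_old_metrics(30);
SELECT cleanup_old_anomalies(30);
SELECT cleanup_old_alerts(90);
SELECT cleanup_old_incidents(90);

-- Or run everything in one pass via the master function
SELECT run_data_retention_cleanup();
```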
### Automated Cleanup Schedule
Django Celery tasks run retention cleanup automatically:
- Full cleanup: Daily at 3:00 AM
- Metrics cleanup: Daily at 3:30 AM
- Anomaly cleanup: Daily at 4:00 AM
View retention statistics in the Admin Dashboard's Data Retention panel.
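The panel reads the same `data_retention_stats` relation that the Celery task below queries over the REST API, so the raw numbers can also be pulled directly in SQL:
```sql
-- Current record counts and storage figures for the monitored tables
SELECT * FROM data_retention_stats;
```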
## Monitoring
View collected metrics in the Admin Monitoring Dashboard:

View File

@@ -0,0 +1,168 @@
"""
Celery tasks for data retention and cleanup.
"""
import logging
import requests
import os
from celery import shared_task
logger = logging.getLogger(__name__)
SUPABASE_URL = os.environ.get('SUPABASE_URL', 'https://api.thrillwiki.com')
SUPABASE_SERVICE_KEY = os.environ.get('SUPABASE_SERVICE_ROLE_KEY')
@shared_task(bind=True, name='monitoring.run_data_retention_cleanup')
def run_data_retention_cleanup(self):
    """
    Run comprehensive data retention cleanup.

    Cleans up old metrics, anomaly detections, alerts, and incidents.
    Runs daily at 3 AM.
    """
    logger.info("Starting data retention cleanup")

    if not SUPABASE_SERVICE_KEY:
        logger.error("SUPABASE_SERVICE_ROLE_KEY not configured")
        return {'success': False, 'error': 'Missing service key'}

    try:
        # Call the Supabase RPC function
        headers = {
            'apikey': SUPABASE_SERVICE_KEY,
            'Authorization': f'Bearer {SUPABASE_SERVICE_KEY}',
            'Content-Type': 'application/json',
        }
        response = requests.post(
            f'{SUPABASE_URL}/rest/v1/rpc/run_data_retention_cleanup',
            headers=headers,
            timeout=60
        )

        if response.status_code == 200:
            result = response.json()
            logger.info(f"Data retention cleanup completed: {result}")
            return result
        else:
            logger.error(f"Data retention cleanup failed: {response.status_code} - {response.text}")
            return {'success': False, 'error': response.text}
    except Exception as e:
        logger.error(f"Error in data retention cleanup: {e}", exc_info=True)
        raise

@shared_task(bind=True, name='monitoring.cleanup_old_metrics')
def cleanup_old_metrics(self, retention_days: int = 30):
    """
    Clean up old metric time series data.

    Runs daily to remove metrics older than retention period.
    """
    logger.info(f"Cleaning up metrics older than {retention_days} days")

    if not SUPABASE_SERVICE_KEY:
        logger.error("SUPABASE_SERVICE_ROLE_KEY not configured")
        return {'success': False, 'error': 'Missing service key'}

    try:
        headers = {
            'apikey': SUPABASE_SERVICE_KEY,
            'Authorization': f'Bearer {SUPABASE_SERVICE_KEY}',
            'Content-Type': 'application/json',
        }
        response = requests.post(
            f'{SUPABASE_URL}/rest/v1/rpc/cleanup_old_metrics',
            headers=headers,
            json={'retention_days': retention_days},
            timeout=30
        )

        if response.status_code == 200:
            deleted_count = response.json()
            logger.info(f"Cleaned up {deleted_count} old metrics")
            return {'success': True, 'deleted_count': deleted_count}
        else:
            logger.error(f"Metrics cleanup failed: {response.status_code} - {response.text}")
            return {'success': False, 'error': response.text}
    except Exception as e:
        logger.error(f"Error in metrics cleanup: {e}", exc_info=True)
        raise

@shared_task(bind=True, name='monitoring.cleanup_old_anomalies')
def cleanup_old_anomalies(self, retention_days: int = 30):
    """
    Clean up old anomaly detections.

    Archives resolved anomalies and deletes very old unresolved ones.
    """
    logger.info(f"Cleaning up anomalies older than {retention_days} days")

    if not SUPABASE_SERVICE_KEY:
        logger.error("SUPABASE_SERVICE_ROLE_KEY not configured")
        return {'success': False, 'error': 'Missing service key'}

    try:
        headers = {
            'apikey': SUPABASE_SERVICE_KEY,
            'Authorization': f'Bearer {SUPABASE_SERVICE_KEY}',
            'Content-Type': 'application/json',
        }
        response = requests.post(
            f'{SUPABASE_URL}/rest/v1/rpc/cleanup_old_anomalies',
            headers=headers,
            json={'retention_days': retention_days},
            timeout=30
        )

        if response.status_code == 200:
            result = response.json()
            logger.info(f"Cleaned up anomalies: {result}")
            return {'success': True, 'result': result}
        else:
            logger.error(f"Anomalies cleanup failed: {response.status_code} - {response.text}")
            return {'success': False, 'error': response.text}
    except Exception as e:
        logger.error(f"Error in anomalies cleanup: {e}", exc_info=True)
        raise

@shared_task(bind=True, name='monitoring.get_retention_stats')
def get_retention_stats(self):
    """
    Get current data retention statistics.

    Shows record counts and storage size for monitored tables.
    """
    logger.info("Fetching data retention statistics")

    if not SUPABASE_SERVICE_KEY:
        logger.error("SUPABASE_SERVICE_ROLE_KEY not configured")
        return {'success': False, 'error': 'Missing service key'}

    try:
        headers = {
            'apikey': SUPABASE_SERVICE_KEY,
            'Authorization': f'Bearer {SUPABASE_SERVICE_KEY}',
            'Content-Type': 'application/json',
        }
        response = requests.get(
            f'{SUPABASE_URL}/rest/v1/data_retention_stats',
            headers=headers,
            timeout=10
        )

        if response.status_code == 200:
            stats = response.json()
            logger.info(f"Retrieved retention stats for {len(stats)} tables")
            return {'success': True, 'stats': stats}
        else:
            logger.error(f"Failed to get retention stats: {response.status_code} - {response.text}")
            return {'success': False, 'error': response.text}
    except Exception as e:
        logger.error(f"Error getting retention stats: {e}", exc_info=True)
        raise

View File

@@ -33,6 +33,25 @@ CELERY_BEAT_SCHEDULE = {
        'options': {'queue': 'monitoring'}
    },

    # Data retention cleanup tasks
    'run-data-retention-cleanup': {
        'task': 'monitoring.run_data_retention_cleanup',
        'schedule': crontab(hour=3, minute=0),  # Daily at 3 AM
        'options': {'queue': 'maintenance'}
    },
    'cleanup-old-metrics': {
        'task': 'monitoring.cleanup_old_metrics',
        'schedule': crontab(hour=3, minute=30),  # Daily at 3:30 AM
        'options': {'queue': 'maintenance'}
    },
    'cleanup-old-anomalies': {
        'task': 'monitoring.cleanup_old_anomalies',
        'schedule': crontab(hour=4, minute=0),  # Daily at 4 AM
        'options': {'queue': 'maintenance'}
    },

    # Existing user tasks
    'cleanup-expired-tokens': {
        'task': 'users.cleanup_expired_tokens',