mirror of
https://github.com/pacnpal/thrilltrack-explorer.git
synced 2025-12-24 00:51:12 -05:00
Add automated data retention cleanup
Implements edge function, Django tasks, and UI hooks/panels for automatic retention of old metrics, anomalies, alerts, and incidents, plus updates to query keys and monitoring dashboard to reflect data-retention workflows.
This commit is contained in:
@@ -136,6 +136,24 @@ SELECT cron.schedule(
|
||||
);
|
||||
```
|
||||
|
||||
### 5. Data Retention Cleanup Setup
|
||||
|
||||
The `data-retention-cleanup` edge function should run once a day. Schedule it with `cron.schedule`, replacing `YOUR_ANON_KEY` with your project's anon key:
|
||||
|
||||
```sql
|
||||
SELECT cron.schedule(
|
||||
'data-retention-cleanup-daily',
|
||||
'0 3 * * *', -- Daily at 3:00 AM
|
||||
$$
|
||||
SELECT net.http_post(
|
||||
url:='https://api.thrillwiki.com/functions/v1/data-retention-cleanup',
|
||||
headers:='{"Content-Type": "application/json", "Authorization": "Bearer YOUR_ANON_KEY"}'::jsonb,
|
||||
body:=concat('{"time": "', now(), '"}')::jsonb
|
||||
) as request_id;
|
||||
$$
|
||||
);
|
||||
```
|
||||
|
||||
## Metrics Collected
|
||||
|
||||
### Django Metrics
|
||||
@@ -154,6 +172,35 @@ SELECT cron.schedule(
|
||||
- `submission_approval_rate`: Percentage of approved submissions (workflow)
|
||||
- `avg_moderation_time`: Average time to moderate in minutes (workflow)
|
||||
|
||||
## Data Retention Policies
|
||||
|
||||
The system automatically cleans up old data to manage database size:
|
||||
|
||||
### Retention Periods
|
||||
- **Metrics** (`metric_time_series`): 30 days
|
||||
- **Anomaly Detections**: 30 days (resolved anomalies archived after 7 days)
|
||||
- **Resolved Alerts**: 90 days
|
||||
- **Resolved Incidents**: 90 days
|
||||
|
||||
### Cleanup Functions
|
||||
|
||||
The following database functions manage data retention:
|
||||
|
||||
1. **`cleanup_old_metrics(retention_days)`**: Deletes metrics older than specified days (default: 30)
|
||||
2. **`cleanup_old_anomalies(retention_days)`**: Archives resolved anomalies and deletes old unresolved ones (default: 30)
|
||||
3. **`cleanup_old_alerts(retention_days)`**: Deletes old resolved alerts (default: 90)
|
||||
4. **`cleanup_old_incidents(retention_days)`**: Deletes old resolved incidents (default: 90)
|
||||
5. **`run_data_retention_cleanup()`**: Master function that runs all cleanup operations
|
||||
|
||||
### Automated Cleanup Schedule
|
||||
|
||||
Django Celery tasks run retention cleanup automatically:
|
||||
- Full cleanup: Daily at 3:00 AM
|
||||
- Metrics cleanup: Daily at 3:30 AM
|
||||
- Anomaly cleanup: Daily at 4:00 AM
|
||||
|
||||
View retention statistics in the Admin Dashboard's Data Retention panel.
|
||||
|
||||
## Monitoring
|
||||
|
||||
View collected metrics in the Admin Monitoring Dashboard:
|
||||
|
||||
168
django/apps/monitoring/tasks_retention.py
Normal file
168
django/apps/monitoring/tasks_retention.py
Normal file
@@ -0,0 +1,168 @@
|
||||
"""
|
||||
Celery tasks for data retention and cleanup.
|
||||
"""
|
||||
import logging
|
||||
import requests
|
||||
import os
|
||||
from celery import shared_task
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Base URL used to build every Supabase REST request in this module;
# falls back to the default host when SUPABASE_URL is not set.
SUPABASE_URL = os.getenv('SUPABASE_URL', 'https://api.thrillwiki.com')
# Service-role key sent as both the `apikey` header and the Bearer token.
# None when SUPABASE_SERVICE_ROLE_KEY is not set; each task checks for that
# and returns a failure result instead of calling the API.
SUPABASE_SERVICE_KEY = os.getenv('SUPABASE_SERVICE_ROLE_KEY')
|
||||
|
||||
|
||||
@shared_task(bind=True, name='monitoring.run_data_retention_cleanup')
def run_data_retention_cleanup(self):
    """
    Trigger the full data retention sweep via the Supabase REST RPC endpoint.

    POSTs to ``rpc/run_data_retention_cleanup`` authenticated with the
    service-role key (60 second timeout).

    Returns the decoded JSON body when the endpoint answers HTTP 200.
    Returns ``{'success': False, 'error': ...}`` when the service key is
    missing or the endpoint answers with any other status.

    Any exception raised while calling the endpoint or decoding its
    response is logged with a traceback and re-raised.
    """
    logger.info("Starting data retention cleanup")

    # Without the service key the request cannot be authenticated.
    if not SUPABASE_SERVICE_KEY:
        logger.error("SUPABASE_SERVICE_ROLE_KEY not configured")
        return {'success': False, 'error': 'Missing service key'}

    try:
        endpoint = f'{SUPABASE_URL}/rest/v1/rpc/run_data_retention_cleanup'
        auth_headers = {
            'apikey': SUPABASE_SERVICE_KEY,
            'Authorization': f'Bearer {SUPABASE_SERVICE_KEY}',
            'Content-Type': 'application/json',
        }

        reply = requests.post(endpoint, headers=auth_headers, timeout=60)

        # Anything other than 200 is reported as a failed run, not raised.
        if reply.status_code != 200:
            logger.error(f"Data retention cleanup failed: {reply.status_code} - {reply.text}")
            return {'success': False, 'error': reply.text}

        summary = reply.json()
        logger.info(f"Data retention cleanup completed: {summary}")
        return summary

    except Exception as exc:
        # Log with traceback, then let the error propagate to the caller.
        logger.exception(f"Error in data retention cleanup: {exc}")
        raise
|
||||
|
||||
|
||||
@shared_task(bind=True, name='monitoring.cleanup_old_metrics')
def cleanup_old_metrics(self, retention_days: int = 30):
    """
    Delete metric time series rows older than ``retention_days``.

    POSTs ``{'retention_days': retention_days}`` to the Supabase
    ``rpc/cleanup_old_metrics`` endpoint (30 second timeout).

    Returns ``{'success': True, 'deleted_count': <decoded JSON body>}`` on
    HTTP 200, or ``{'success': False, 'error': ...}`` when the service key
    is missing or the endpoint answers with any other status.

    Any exception raised while calling the endpoint or decoding its
    response is logged with a traceback and re-raised.
    """
    logger.info(f"Cleaning up metrics older than {retention_days} days")

    # Without the service key the request cannot be authenticated.
    if not SUPABASE_SERVICE_KEY:
        logger.error("SUPABASE_SERVICE_ROLE_KEY not configured")
        return {'success': False, 'error': 'Missing service key'}

    try:
        endpoint = f'{SUPABASE_URL}/rest/v1/rpc/cleanup_old_metrics'
        auth_headers = {
            'apikey': SUPABASE_SERVICE_KEY,
            'Authorization': f'Bearer {SUPABASE_SERVICE_KEY}',
            'Content-Type': 'application/json',
        }
        rpc_args = {'retention_days': retention_days}

        reply = requests.post(
            endpoint,
            headers=auth_headers,
            json=rpc_args,
            timeout=30,
        )

        # Anything other than 200 is reported as a failed run, not raised.
        if reply.status_code != 200:
            logger.error(f"Metrics cleanup failed: {reply.status_code} - {reply.text}")
            return {'success': False, 'error': reply.text}

        removed = reply.json()
        logger.info(f"Cleaned up {removed} old metrics")
        return {'success': True, 'deleted_count': removed}

    except Exception as exc:
        # Log with traceback, then let the error propagate to the caller.
        logger.exception(f"Error in metrics cleanup: {exc}")
        raise
|
||||
|
||||
|
||||
@shared_task(bind=True, name='monitoring.cleanup_old_anomalies')
def cleanup_old_anomalies(self, retention_days: int = 30):
    """
    Run the anomaly-detection cleanup for the given retention window.

    POSTs ``{'retention_days': retention_days}`` to the Supabase
    ``rpc/cleanup_old_anomalies`` endpoint (30 second timeout).

    Returns ``{'success': True, 'result': <decoded JSON body>}`` on HTTP
    200, or ``{'success': False, 'error': ...}`` when the service key is
    missing or the endpoint answers with any other status.

    Any exception raised while calling the endpoint or decoding its
    response is logged with a traceback and re-raised.
    """
    logger.info(f"Cleaning up anomalies older than {retention_days} days")

    # Without the service key the request cannot be authenticated.
    if not SUPABASE_SERVICE_KEY:
        logger.error("SUPABASE_SERVICE_ROLE_KEY not configured")
        return {'success': False, 'error': 'Missing service key'}

    try:
        endpoint = f'{SUPABASE_URL}/rest/v1/rpc/cleanup_old_anomalies'
        auth_headers = {
            'apikey': SUPABASE_SERVICE_KEY,
            'Authorization': f'Bearer {SUPABASE_SERVICE_KEY}',
            'Content-Type': 'application/json',
        }
        rpc_args = {'retention_days': retention_days}

        reply = requests.post(
            endpoint,
            headers=auth_headers,
            json=rpc_args,
            timeout=30,
        )

        # Anything other than 200 is reported as a failed run, not raised.
        if reply.status_code != 200:
            logger.error(f"Anomalies cleanup failed: {reply.status_code} - {reply.text}")
            return {'success': False, 'error': reply.text}

        outcome = reply.json()
        logger.info(f"Cleaned up anomalies: {outcome}")
        return {'success': True, 'result': outcome}

    except Exception as exc:
        # Log with traceback, then let the error propagate to the caller.
        logger.exception(f"Error in anomalies cleanup: {exc}")
        raise
|
||||
|
||||
|
||||
@shared_task(bind=True, name='monitoring.get_retention_stats')
def get_retention_stats(self):
    """
    Fetch the current data retention statistics from Supabase.

    GETs the ``data_retention_stats`` REST resource with the service-role
    key (10 second timeout).

    Returns ``{'success': True, 'stats': <decoded JSON body>}`` on HTTP
    200, or ``{'success': False, 'error': ...}`` when the service key is
    missing or the endpoint answers with any other status.

    Any exception raised while calling the endpoint, decoding its response
    or measuring its length is logged with a traceback and re-raised.
    """
    logger.info("Fetching data retention statistics")

    # Without the service key the request cannot be authenticated.
    if not SUPABASE_SERVICE_KEY:
        logger.error("SUPABASE_SERVICE_ROLE_KEY not configured")
        return {'success': False, 'error': 'Missing service key'}

    try:
        endpoint = f'{SUPABASE_URL}/rest/v1/data_retention_stats'
        auth_headers = {
            'apikey': SUPABASE_SERVICE_KEY,
            'Authorization': f'Bearer {SUPABASE_SERVICE_KEY}',
            'Content-Type': 'application/json',
        }

        reply = requests.get(endpoint, headers=auth_headers, timeout=10)

        # Anything other than 200 is reported as a failed fetch, not raised.
        if reply.status_code != 200:
            logger.error(f"Failed to get retention stats: {reply.status_code} - {reply.text}")
            return {'success': False, 'error': reply.text}

        table_stats = reply.json()
        logger.info(f"Retrieved retention stats for {len(table_stats)} tables")
        return {'success': True, 'stats': table_stats}

    except Exception as exc:
        # Log with traceback, then let the error propagate to the caller.
        logger.exception(f"Error getting retention stats: {exc}")
        raise
|
||||
@@ -33,6 +33,25 @@ CELERY_BEAT_SCHEDULE = {
|
||||
'options': {'queue': 'monitoring'}
|
||||
},
|
||||
|
||||
# Data retention cleanup tasks
|
||||
'run-data-retention-cleanup': {
|
||||
'task': 'monitoring.run_data_retention_cleanup',
|
||||
'schedule': crontab(hour=3, minute=0), # Daily at 3 AM
|
||||
'options': {'queue': 'maintenance'}
|
||||
},
|
||||
|
||||
'cleanup-old-metrics': {
|
||||
'task': 'monitoring.cleanup_old_metrics',
|
||||
'schedule': crontab(hour=3, minute=30), # Daily at 3:30 AM
|
||||
'options': {'queue': 'maintenance'}
|
||||
},
|
||||
|
||||
'cleanup-old-anomalies': {
|
||||
'task': 'monitoring.cleanup_old_anomalies',
|
||||
'schedule': crontab(hour=4, minute=0), # Daily at 4 AM
|
||||
'options': {'queue': 'maintenance'}
|
||||
},
|
||||
|
||||
# Existing user tasks
|
||||
'cleanup-expired-tokens': {
|
||||
'task': 'users.cleanup_expired_tokens',
|
||||
|
||||
Reference in New Issue
Block a user