Add automated data retention cleanup

Implements edge function, Django tasks, and UI hooks/panels for automatic retention of old metrics, anomalies, alerts, and incidents, plus updates to query keys and monitoring dashboard to reflect data-retention workflows.
This commit is contained in:
gpt-engineer-app[bot]
2025-11-11 02:21:27 +00:00
parent 07fdfe34f3
commit 915a9fe2df
9 changed files with 589 additions and 0 deletions

View File

@@ -136,6 +136,24 @@ SELECT cron.schedule(
); );
``` ```
### 5. Data Retention Cleanup Setup
The `data-retention-cleanup` edge function should run daily:
```sql
SELECT cron.schedule(
'data-retention-cleanup-daily',
'0 3 * * *', -- Daily at 3:00 AM
$$
SELECT net.http_post(
url:='https://api.thrillwiki.com/functions/v1/data-retention-cleanup',
headers:='{"Content-Type": "application/json", "Authorization": "Bearer YOUR_ANON_KEY"}'::jsonb,
body:=concat('{"time": "', now(), '"}')::jsonb
) as request_id;
$$
);
```
## Metrics Collected ## Metrics Collected
### Django Metrics ### Django Metrics
@@ -154,6 +172,35 @@ SELECT cron.schedule(
- `submission_approval_rate`: Percentage of approved submissions (workflow) - `submission_approval_rate`: Percentage of approved submissions (workflow)
- `avg_moderation_time`: Average time to moderate in minutes (workflow) - `avg_moderation_time`: Average time to moderate in minutes (workflow)
## Data Retention Policies
The system automatically cleans up old data to manage database size:
### Retention Periods
- **Metrics** (`metric_time_series`): 30 days
- **Anomaly Detections**: 30 days (resolved anomalies archived after 7 days)
- **Resolved Alerts**: 90 days
- **Resolved Incidents**: 90 days
### Cleanup Functions
The following database functions manage data retention:
1. **`cleanup_old_metrics(retention_days)`**: Deletes metrics older than specified days (default: 30)
2. **`cleanup_old_anomalies(retention_days)`**: Archives resolved anomalies and deletes old unresolved ones (default: 30)
3. **`cleanup_old_alerts(retention_days)`**: Deletes old resolved alerts (default: 90)
4. **`cleanup_old_incidents(retention_days)`**: Deletes old resolved incidents (default: 90)
5. **`run_data_retention_cleanup()`**: Master function that runs all cleanup operations
### Automated Cleanup Schedule
Django Celery tasks run retention cleanup automatically:
- Full cleanup: Daily at 3:00 AM
- Metrics cleanup: Daily at 3:30 AM
- Anomaly cleanup: Daily at 4:00 AM
View retention statistics in the Admin Dashboard's Data Retention panel.
## Monitoring ## Monitoring
View collected metrics in the Admin Monitoring Dashboard: View collected metrics in the Admin Monitoring Dashboard:

View File

@@ -0,0 +1,168 @@
"""
Celery tasks for data retention and cleanup.
"""
import logging
import requests
import os
from celery import shared_task
logger = logging.getLogger(__name__)
SUPABASE_URL = os.environ.get('SUPABASE_URL', 'https://api.thrillwiki.com')
SUPABASE_SERVICE_KEY = os.environ.get('SUPABASE_SERVICE_ROLE_KEY')
@shared_task(bind=True, name='monitoring.run_data_retention_cleanup')
def run_data_retention_cleanup(self):
    """
    Run comprehensive data retention cleanup.

    Calls the Supabase ``run_data_retention_cleanup`` RPC, which cleans up
    old metrics, anomaly detections, alerts, and incidents in one pass.
    Scheduled daily at 3 AM (see CELERY_BEAT_SCHEDULE).

    Returns:
        dict: The RPC result on success, or ``{'success': False, 'error': ...}``
        when the service key is missing or the HTTP call returns non-200.

    Raises:
        Exception: Re-raised on transport/parsing errors so Celery records
        the task as failed.
    """
    logger.info("Starting data retention cleanup")
    if not SUPABASE_SERVICE_KEY:
        logger.error("SUPABASE_SERVICE_ROLE_KEY not configured")
        return {'success': False, 'error': 'Missing service key'}
    try:
        # Call the Supabase RPC function
        headers = {
            'apikey': SUPABASE_SERVICE_KEY,
            'Authorization': f'Bearer {SUPABASE_SERVICE_KEY}',
            'Content-Type': 'application/json',
        }
        # PostgREST rejects an application/json POST with an empty body, so
        # send an explicit empty JSON object (the sibling cleanup tasks pass
        # their arguments via json= the same way).
        response = requests.post(
            f'{SUPABASE_URL}/rest/v1/rpc/run_data_retention_cleanup',
            headers=headers,
            json={},
            timeout=60
        )
        if response.status_code == 200:
            result = response.json()
            # Lazy %-style args avoid formatting when the level is disabled.
            logger.info("Data retention cleanup completed: %s", result)
            return result
        logger.error(
            "Data retention cleanup failed: %s - %s",
            response.status_code, response.text,
        )
        return {'success': False, 'error': response.text}
    except Exception as e:
        logger.error("Error in data retention cleanup: %s", e, exc_info=True)
        raise
@shared_task(bind=True, name='monitoring.cleanup_old_metrics')
def cleanup_old_metrics(self, retention_days: int = 30):
    """
    Delete metric time-series rows older than ``retention_days``.

    Invokes the Supabase ``cleanup_old_metrics`` RPC. Scheduled daily; the
    RPC returns the number of rows it deleted.
    """
    logger.info(f"Cleaning up metrics older than {retention_days} days")
    if not SUPABASE_SERVICE_KEY:
        logger.error("SUPABASE_SERVICE_ROLE_KEY not configured")
        return {'success': False, 'error': 'Missing service key'}
    try:
        auth_headers = {
            'apikey': SUPABASE_SERVICE_KEY,
            'Authorization': f'Bearer {SUPABASE_SERVICE_KEY}',
            'Content-Type': 'application/json',
        }
        resp = requests.post(
            f'{SUPABASE_URL}/rest/v1/rpc/cleanup_old_metrics',
            headers=auth_headers,
            json={'retention_days': retention_days},
            timeout=30
        )
        # Guard clause: surface the failure payload on any non-200 status.
        if resp.status_code != 200:
            logger.error(f"Metrics cleanup failed: {resp.status_code} - {resp.text}")
            return {'success': False, 'error': resp.text}
        removed = resp.json()
        logger.info(f"Cleaned up {removed} old metrics")
        return {'success': True, 'deleted_count': removed}
    except Exception as e:
        logger.error(f"Error in metrics cleanup: {e}", exc_info=True)
        raise
@shared_task(bind=True, name='monitoring.cleanup_old_anomalies')
def cleanup_old_anomalies(self, retention_days: int = 30):
    """
    Clean up old anomaly detection rows.

    Invokes the Supabase ``cleanup_old_anomalies`` RPC, which archives
    resolved anomalies and deletes very old unresolved ones.
    """
    logger.info(f"Cleaning up anomalies older than {retention_days} days")
    if not SUPABASE_SERVICE_KEY:
        logger.error("SUPABASE_SERVICE_ROLE_KEY not configured")
        return {'success': False, 'error': 'Missing service key'}
    try:
        auth_headers = {
            'apikey': SUPABASE_SERVICE_KEY,
            'Authorization': f'Bearer {SUPABASE_SERVICE_KEY}',
            'Content-Type': 'application/json',
        }
        resp = requests.post(
            f'{SUPABASE_URL}/rest/v1/rpc/cleanup_old_anomalies',
            headers=auth_headers,
            json={'retention_days': retention_days},
            timeout=30
        )
        # Guard clause: report the failure payload on any non-200 status.
        if resp.status_code != 200:
            logger.error(f"Anomalies cleanup failed: {resp.status_code} - {resp.text}")
            return {'success': False, 'error': resp.text}
        cleanup_summary = resp.json()
        logger.info(f"Cleaned up anomalies: {cleanup_summary}")
        return {'success': True, 'result': cleanup_summary}
    except Exception as e:
        logger.error(f"Error in anomalies cleanup: {e}", exc_info=True)
        raise
@shared_task(bind=True, name='monitoring.get_retention_stats')
def get_retention_stats(self):
    """
    Fetch current data-retention statistics.

    Reads the ``data_retention_stats`` view via PostgREST; each row holds
    record counts and storage size for one monitored table.
    """
    logger.info("Fetching data retention statistics")
    if not SUPABASE_SERVICE_KEY:
        logger.error("SUPABASE_SERVICE_ROLE_KEY not configured")
        return {'success': False, 'error': 'Missing service key'}
    try:
        auth_headers = {
            'apikey': SUPABASE_SERVICE_KEY,
            'Authorization': f'Bearer {SUPABASE_SERVICE_KEY}',
            'Content-Type': 'application/json',
        }
        resp = requests.get(
            f'{SUPABASE_URL}/rest/v1/data_retention_stats',
            headers=auth_headers,
            timeout=10
        )
        # Guard clause: report the failure payload on any non-200 status.
        if resp.status_code != 200:
            logger.error(f"Failed to get retention stats: {resp.status_code} - {resp.text}")
            return {'success': False, 'error': resp.text}
        table_stats = resp.json()
        logger.info(f"Retrieved retention stats for {len(table_stats)} tables")
        return {'success': True, 'stats': table_stats}
    except Exception as e:
        logger.error(f"Error getting retention stats: {e}", exc_info=True)
        raise

View File

@@ -33,6 +33,25 @@ CELERY_BEAT_SCHEDULE = {
'options': {'queue': 'monitoring'} 'options': {'queue': 'monitoring'}
}, },
# Data retention cleanup tasks
'run-data-retention-cleanup': {
'task': 'monitoring.run_data_retention_cleanup',
'schedule': crontab(hour=3, minute=0), # Daily at 3 AM
'options': {'queue': 'maintenance'}
},
'cleanup-old-metrics': {
'task': 'monitoring.cleanup_old_metrics',
'schedule': crontab(hour=3, minute=30), # Daily at 3:30 AM
'options': {'queue': 'maintenance'}
},
'cleanup-old-anomalies': {
'task': 'monitoring.cleanup_old_anomalies',
'schedule': crontab(hour=4, minute=0), # Daily at 4 AM
'options': {'queue': 'maintenance'}
},
# Existing user tasks # Existing user tasks
'cleanup-expired-tokens': { 'cleanup-expired-tokens': {
'task': 'users.cleanup_expired_tokens', 'task': 'users.cleanup_expired_tokens',

View File

@@ -0,0 +1,161 @@
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
import { Button } from "@/components/ui/button";
import { Badge } from "@/components/ui/badge";
import { Trash2, Database, Clock, HardDrive, TrendingDown } from "lucide-react";
import { useRetentionStats, useRunCleanup } from "@/hooks/admin/useDataRetention";
import { formatDistanceToNow } from "date-fns";
export function DataRetentionPanel() {
const { data: stats, isLoading } = useRetentionStats();
const runCleanup = useRunCleanup();
if (isLoading) {
return (
<Card>
<CardHeader>
<CardTitle>Data Retention</CardTitle>
<CardDescription>Loading retention statistics...</CardDescription>
</CardHeader>
</Card>
);
}
const totalRecords = stats?.reduce((sum, s) => sum + s.total_records, 0) || 0;
const totalSize = stats?.reduce((sum, s) => {
const size = s.table_size.replace(/[^0-9.]/g, '');
return sum + parseFloat(size);
}, 0) || 0;
return (
<Card>
<CardHeader>
<div className="flex items-center justify-between">
<div>
<CardTitle className="flex items-center gap-2">
<Database className="h-5 w-5" />
Data Retention Management
</CardTitle>
<CardDescription>
Automatic cleanup of old metrics and monitoring data
</CardDescription>
</div>
<Button
onClick={() => runCleanup.mutate()}
disabled={runCleanup.isPending}
variant="destructive"
size="sm"
>
<Trash2 className="h-4 w-4 mr-2" />
Run Cleanup Now
</Button>
</div>
</CardHeader>
<CardContent className="space-y-6">
{/* Summary Stats */}
<div className="grid gap-4 md:grid-cols-3">
<div className="space-y-2">
<div className="flex items-center gap-2 text-sm text-muted-foreground">
<Database className="h-4 w-4" />
Total Records
</div>
<div className="text-2xl font-bold">{totalRecords.toLocaleString()}</div>
</div>
<div className="space-y-2">
<div className="flex items-center gap-2 text-sm text-muted-foreground">
<HardDrive className="h-4 w-4" />
Total Size
</div>
<div className="text-2xl font-bold">{totalSize.toFixed(1)} MB</div>
</div>
<div className="space-y-2">
<div className="flex items-center gap-2 text-sm text-muted-foreground">
<TrendingDown className="h-4 w-4" />
Tables Monitored
</div>
<div className="text-2xl font-bold">{stats?.length || 0}</div>
</div>
</div>
{/* Retention Policies */}
<div>
<h3 className="font-semibold mb-3">Retention Policies</h3>
<div className="space-y-2 text-sm">
<div className="flex justify-between items-center p-2 bg-muted/50 rounded">
<span>Metrics (metric_time_series)</span>
<Badge variant="outline">30 days</Badge>
</div>
<div className="flex justify-between items-center p-2 bg-muted/50 rounded">
<span>Anomaly Detections</span>
<Badge variant="outline">30 days</Badge>
</div>
<div className="flex justify-between items-center p-2 bg-muted/50 rounded">
<span>Resolved Alerts</span>
<Badge variant="outline">90 days</Badge>
</div>
<div className="flex justify-between items-center p-2 bg-muted/50 rounded">
<span>Resolved Incidents</span>
<Badge variant="outline">90 days</Badge>
</div>
</div>
</div>
{/* Table Statistics */}
<div>
<h3 className="font-semibold mb-3">Storage Details</h3>
<div className="space-y-3">
{stats?.map((stat) => (
<div
key={stat.table_name}
className="border rounded-lg p-3 space-y-2"
>
<div className="flex items-center justify-between">
<span className="font-medium">{stat.table_name}</span>
<Badge variant="secondary">{stat.table_size}</Badge>
</div>
<div className="grid grid-cols-3 gap-2 text-xs text-muted-foreground">
<div>
<div>Total</div>
<div className="font-medium text-foreground">
{stat.total_records.toLocaleString()}
</div>
</div>
<div>
<div>Last 7 days</div>
<div className="font-medium text-foreground">
{stat.last_7_days.toLocaleString()}
</div>
</div>
<div>
<div>Last 30 days</div>
<div className="font-medium text-foreground">
{stat.last_30_days.toLocaleString()}
</div>
</div>
</div>
{stat.oldest_record && (
<div className="flex items-center gap-1 text-xs text-muted-foreground">
<Clock className="h-3 w-3" />
Oldest:{" "}
{formatDistanceToNow(new Date(stat.oldest_record), {
addSuffix: true,
})}
</div>
)}
</div>
))}
</div>
</div>
{/* Cleanup Schedule */}
<div className="bg-muted/50 rounded-lg p-4 space-y-2">
<h3 className="font-semibold text-sm">Automated Cleanup Schedule</h3>
<div className="space-y-1 text-sm text-muted-foreground">
<div> Full cleanup runs daily at 3:00 AM</div>
<div> Metrics cleanup at 3:30 AM</div>
<div> Anomaly cleanup at 4:00 AM</div>
</div>
</div>
</CardContent>
</Card>
);
}

View File

@@ -0,0 +1,134 @@
import { useQuery, useMutation, useQueryClient } from "@tanstack/react-query";
import { supabase } from "@/integrations/supabase/client";
import { toast } from "sonner";
/** One row of the `data_retention_stats` view: per-table counts and size. */
interface RetentionStats {
  table_name: string;
  total_records: number;
  // Rows created within the trailing window.
  last_7_days: number;
  last_30_days: number;
  // Timestamps of the extremal rows (string-encoded; fed to `new Date()` by consumers).
  oldest_record: string;
  newest_record: string;
  // Human-readable size string, e.g. "12 MB" — presumably pg_size_pretty
  // output; confirm against the view definition.
  table_size: string;
}
/** Response payload of the `data-retention-cleanup` edge function. */
interface CleanupResult {
  success: boolean;
  // Per-category counts reported by the master cleanup RPC.
  cleanup_results: {
    metrics_deleted: number;
    anomalies_archived: number;
    anomalies_deleted: number;
    alerts_deleted: number;
    incidents_deleted: number;
  };
  timestamp: string;
}
/**
 * Query hook for the `data_retention_stats` view.
 *
 * Returns one row per monitored table; auto-refreshes once a minute so the
 * admin panel stays current.
 */
export function useRetentionStats() {
  // NOTE(review): this ad-hoc key differs from the registry entry
  // queryKeys.monitoring.dataRetentionStats() (['monitoring',
  // 'data-retention-stats']); consider unifying so invalidation stays
  // consistent — confirm with callers before changing.
  return useQuery({
    queryKey: ["dataRetentionStats"],
    queryFn: async () => {
      const response = await supabase.from("data_retention_stats").select("*");
      if (response.error) throw response.error;
      return response.data as RetentionStats[];
    },
    refetchInterval: 60000, // once a minute
  });
}
/**
 * Mutation hook that triggers the `data-retention-cleanup` edge function.
 *
 * On success it toasts a summary of removed records and invalidates the
 * caches the cleanup affects.
 */
export function useRunCleanup() {
  const queryClient = useQueryClient();
  return useMutation({
    mutationFn: async (): Promise<CleanupResult> => {
      const response = await supabase.functions.invoke("data-retention-cleanup");
      if (response.error) throw response.error;
      return response.data as CleanupResult;
    },
    onSuccess: (data) => {
      const r = data.cleanup_results;
      const removedTotal =
        r.metrics_deleted +
        r.anomalies_archived +
        r.anomalies_deleted +
        r.alerts_deleted +
        r.incidents_deleted;
      toast.success(`Cleanup completed: ${removedTotal} records removed`, {
        description: `Metrics: ${r.metrics_deleted}, Anomalies: ${r.anomalies_deleted}, Alerts: ${r.alerts_deleted}`,
      });
      // Cleanup touches several cached datasets; refresh each of them.
      for (const key of ["dataRetentionStats", "anomalyDetections", "systemAlerts"]) {
        queryClient.invalidateQueries({ queryKey: [key] });
      }
    },
    onError: (error: Error) => {
      toast.error("Failed to run cleanup", { description: error.message });
    },
  });
}
/**
 * Mutation hook for the `cleanup_old_metrics` RPC.
 *
 * @param retentionDays metrics older than this many days are deleted (default 30).
 * The RPC returns the number of deleted rows.
 */
export function useCleanupMetrics() {
  const queryClient = useQueryClient();
  return useMutation({
    mutationFn: async (retentionDays: number = 30) => {
      const rpcResponse = await supabase.rpc("cleanup_old_metrics", {
        retention_days: retentionDays,
      });
      if (rpcResponse.error) throw rpcResponse.error;
      return rpcResponse.data;
    },
    onSuccess: (deletedCount) => {
      toast.success(`Cleaned up ${deletedCount} old metrics`);
      // Stats reflect the deletion; refresh them.
      queryClient.invalidateQueries({ queryKey: ["dataRetentionStats"] });
    },
    onError: (error: Error) => {
      toast.error("Failed to cleanup metrics", { description: error.message });
    },
  });
}
/**
 * Mutation hook for the `cleanup_old_anomalies` RPC.
 *
 * @param retentionDays anomalies older than this many days are processed (default 30).
 * The RPC archives resolved anomalies and deletes old unresolved ones.
 */
export function useCleanupAnomalies() {
  const queryClient = useQueryClient();
  return useMutation({
    mutationFn: async (retentionDays: number = 30) => {
      const { data, error } = await supabase.rpc("cleanup_old_anomalies", {
        retention_days: retentionDays,
      });
      if (error) throw error;
      return data;
    },
    onSuccess: (result) => {
      // The RPC's row set arrives as a one-element array. Guard against an
      // empty/missing payload so the success toast cannot crash on
      // `undefined.archived_count`.
      const row = Array.isArray(result) ? result[0] : result;
      const archived = row?.archived_count ?? 0;
      const deleted = row?.deleted_count ?? 0;
      toast.success(
        `Cleaned up anomalies: ${archived} archived, ${deleted} deleted`
      );
      queryClient.invalidateQueries({ queryKey: ["dataRetentionStats"] });
      queryClient.invalidateQueries({ queryKey: ["anomalyDetections"] });
    },
    onError: (error: Error) => {
      toast.error("Failed to cleanup anomalies", {
        description: error.message,
      });
    },
  });
}

View File

@@ -96,5 +96,6 @@ export const queryKeys = {
incidents: (status?: string) => ['monitoring', 'incidents', status] as const, incidents: (status?: string) => ['monitoring', 'incidents', status] as const,
incidentDetails: (incidentId: string) => ['monitoring', 'incident-details', incidentId] as const, incidentDetails: (incidentId: string) => ['monitoring', 'incident-details', incidentId] as const,
anomalyDetections: () => ['monitoring', 'anomaly-detections'] as const, anomalyDetections: () => ['monitoring', 'anomaly-detections'] as const,
dataRetentionStats: () => ['monitoring', 'data-retention-stats'] as const,
}, },
} as const; } as const;

View File

@@ -7,6 +7,7 @@ import { GroupedAlertsPanel } from '@/components/admin/GroupedAlertsPanel';
import { CorrelatedAlertsPanel } from '@/components/admin/CorrelatedAlertsPanel'; import { CorrelatedAlertsPanel } from '@/components/admin/CorrelatedAlertsPanel';
import { IncidentsPanel } from '@/components/admin/IncidentsPanel'; import { IncidentsPanel } from '@/components/admin/IncidentsPanel';
import { AnomalyDetectionPanel } from '@/components/admin/AnomalyDetectionPanel'; import { AnomalyDetectionPanel } from '@/components/admin/AnomalyDetectionPanel';
import { DataRetentionPanel } from '@/components/admin/DataRetentionPanel';
import { MonitoringQuickStats } from '@/components/admin/MonitoringQuickStats'; import { MonitoringQuickStats } from '@/components/admin/MonitoringQuickStats';
import { RecentActivityTimeline } from '@/components/admin/RecentActivityTimeline'; import { RecentActivityTimeline } from '@/components/admin/RecentActivityTimeline';
import { MonitoringNavCards } from '@/components/admin/MonitoringNavCards'; import { MonitoringNavCards } from '@/components/admin/MonitoringNavCards';
@@ -150,6 +151,9 @@ export default function MonitoringOverview() {
isLoading={anomalies.isLoading} isLoading={anomalies.isLoading}
/> />
{/* Data Retention Management */}
<DataRetentionPanel />
{/* Quick Stats Grid */} {/* Quick Stats Grid */}
<MonitoringQuickStats <MonitoringQuickStats
systemHealth={systemHealth.data ?? undefined} systemHealth={systemHealth.data ?? undefined}

View File

@@ -0,0 +1,48 @@
import { createClient } from 'https://esm.sh/@supabase/supabase-js@2.57.4';
// Permissive CORS headers so the function can be invoked from browsers;
// echoed on every response, including the OPTIONS preflight.
const corsHeaders = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Headers': 'authorization, x-client-info, apikey, content-type',
};
// Edge function: runs the master `run_data_retention_cleanup` SQL function
// with the service-role key and returns its per-category results as JSON.
Deno.serve(async (req) => {
  // Answer CORS preflight immediately.
  if (req.method === 'OPTIONS') {
    return new Response(null, { headers: corsHeaders });
  }
  try {
    const supabaseUrl = Deno.env.get('SUPABASE_URL')!;
    const supabaseKey = Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')!;
    const supabase = createClient(supabaseUrl, supabaseKey);
    console.log('Starting data retention cleanup...');
    // Call the master cleanup function
    const { data, error } = await supabase.rpc('run_data_retention_cleanup');
    if (error) {
      console.error('Error running data retention cleanup:', error);
      throw error;
    }
    console.log('Data retention cleanup completed:', data);
    return new Response(
      JSON.stringify({
        success: true,
        // Optional chaining: don't crash the response if the RPC returned
        // an unexpected null payload.
        cleanup_results: data?.cleanup_results,
        timestamp: data?.timestamp,
      }),
      { headers: { ...corsHeaders, 'Content-Type': 'application/json' } }
    );
  } catch (error) {
    console.error('Error in data-retention-cleanup function:', error);
    // `error` is `unknown` in modern TS catch clauses and may not be an
    // Error instance; normalize before reading `.message`.
    const message = error instanceof Error ? error.message : String(error);
    return new Response(
      JSON.stringify({ error: message }),
      {
        status: 500,
        headers: { ...corsHeaders, 'Content-Type': 'application/json' },
      }
    );
  }
});

View File

@@ -0,0 +1,7 @@
-- Fix security warnings: Set search_path for all retention policy functions
-- Pinning search_path prevents search-path hijacking (a malicious schema
-- shadowing table/function names) when these functions are executed with
-- elevated privileges.
-- NOTE(review): assumes all five functions resolve in the default schema
-- search order — confirm they were created in `public`.
ALTER FUNCTION cleanup_old_metrics(INTEGER) SET search_path = public;
ALTER FUNCTION cleanup_old_anomalies(INTEGER) SET search_path = public;
ALTER FUNCTION cleanup_old_alerts(INTEGER) SET search_path = public;
ALTER FUNCTION cleanup_old_incidents(INTEGER) SET search_path = public;
ALTER FUNCTION run_data_retention_cleanup() SET search_path = public;