feat: Implement comprehensive bot detection

2025-12-20 15:31:13 -05:00 · 2025-10-29 20:49:26 +00:00
parent d362fa7537
commit 2918f9d280
5 changed files with 515 additions and 76 deletions
--- a/api/botDetection/index.ts
+++ b/api/botDetection/index.ts
@@ -0,0 +1,144 @@
+/**
+ * Comprehensive bot detection system
+ * Combines user-agent patterns, header analysis, and behavioral heuristics
+ */
+
+import { BOT_PATTERNS, GENERIC_BOT_REGEX } from './userAgentPatterns';
+import { analyzeHeaders } from './headerAnalysis';
+import { analyzeHeuristics } from './heuristics';
+/**
+ * Result object returned by `detectBot`: the final verdict together with the
+ * evidence (signals and per-analyzer scores) that produced it.
+ */
+export interface BotDetectionResult {
+  isBot: boolean; // final verdict; set by detectBot once the score reaches its threshold of 50
+  confidence: 'high' | 'medium' | 'low'; // score bucket used by detectBot: >= 80 high, >= 60 medium, otherwise low
+  platform: string | null; // platform of the matched BOT_PATTERNS entry, or 'unknown-bot' / 'header-detected-bot' / 'heuristic-detected-bot'; null when no source flagged a bot
+  detectionMethod: 'user-agent' | 'header' | 'heuristic' | 'combination'; // source(s) that flagged the request; detectBot starts from 'user-agent', so that value is also reported when nothing matched
+  score: number; // 0-100, rounded to an integer by detectBot
+  metadata: {
+    userAgent: string; // user-agent string the result was computed for
+    signals: string[]; // collected signal tags; header and heuristic signals carry a "header:" / "heuristic:" prefix
+    headerScore: number; // confidence value reported by analyzeHeaders
+    heuristicScore: number; // confidence value reported by analyzeHeuristics
+    uaMatch: boolean; // true when the lower-cased user-agent contained a BOT_PATTERNS pattern
+  };
+}
+
+/**
+ * Classifies a request as bot or non-bot traffic.
+ *
+ * Three sources are consulted:
+ *  1. a substring match of the lower-cased user-agent against BOT_PATTERNS
+ *     (first match wins; patterns are assumed to be stored lower-case - TODO confirm),
+ *  2. `analyzeHeaders(headers)`,
+ *  3. `analyzeHeuristics(userAgent, headers)`.
+ *
+ * With a user-agent match the category base score (95 for social/seo/preview,
+ * 60 for generic, 85 for anything else) is never lowered; it is only raised
+ * when the 70/20/10 weighted blend with the header and heuristic confidences
+ * comes out higher. Without a match the score is the plain average of the
+ * header and heuristic confidences.
+ *
+ * The score is rounded once, and both `isBot` (>= 50) and `confidence`
+ * (>= 80 high, >= 60 medium, otherwise low) are derived from that rounded
+ * value, so the flags can never contradict the `score` that is returned.
+ *
+ * @param userAgent - Raw User-Agent value; a non-string value is treated as ''.
+ * @param headers - Request headers passed on to the header and heuristic analyzers.
+ * @returns The verdict, its score/confidence, and the signals that led to it.
+ */
+export function detectBot(
+  userAgent: string,
+  headers: Record<string, string | string[] | undefined> = {}
+): BotDetectionResult {
+  // A caller outside the type checker may pass undefined/null for a missing
+  // header; fall back to '' instead of throwing on toLowerCase().
+  const safeUserAgent = typeof userAgent === 'string' ? userAgent : '';
+  const userAgentLower = safeUserAgent.toLowerCase();
+  let detectionMethod: BotDetectionResult['detectionMethod'] = 'user-agent';
+  let platform: string | null = null;
+  let rawScore = 0;
+  const signals: string[] = [];
+
+  // 1. User-agent pattern matching
+  let uaMatch = false;
+  for (const { pattern, platform: platformName, category } of BOT_PATTERNS) {
+    if (userAgentLower.includes(pattern)) {
+      uaMatch = true;
+      platform = platformName;
+
+      if (category === 'social' || category === 'seo' || category === 'preview') {
+        // Explicit, well-known crawler families get the highest base score.
+        rawScore = 95;
+        signals.push(`ua-explicit-${category}`);
+      } else if (category === 'generic') {
+        // Generic patterns are weaker evidence, so they start lower.
+        rawScore = 60;
+        signals.push('ua-generic');
+      } else {
+        rawScore = 85;
+        signals.push(`ua-${category}`);
+      }
+
+      break; // First match wins
+    }
+  }
+
+  // 2. Header analysis
+  const headerAnalysis = analyzeHeaders(headers);
+  signals.push(...headerAnalysis.signals.map(s => `header:${s}`));
+
+  // 3. Behavioral heuristics
+  const heuristicAnalysis = analyzeHeuristics(safeUserAgent, headers);
+  signals.push(...heuristicAnalysis.signals.map(s => `heuristic:${s}`));
+
+  // 4. Combine the three sources
+  if (uaMatch) {
+    // The weighted blend may only raise the user-agent base score, never lower it.
+    rawScore = Math.max(
+      rawScore,
+      rawScore * 0.7 + headerAnalysis.confidence * 0.2 + heuristicAnalysis.confidence * 0.1
+    );
+
+    if (headerAnalysis.isBot || heuristicAnalysis.isBot) {
+      detectionMethod = 'combination';
+    }
+  } else {
+    // No user-agent match: rely on header and heuristic analysis equally.
+    rawScore = headerAnalysis.confidence * 0.5 + heuristicAnalysis.confidence * 0.5;
+
+    if (headerAnalysis.isBot && heuristicAnalysis.isBot) {
+      detectionMethod = 'combination';
+      platform = 'unknown-bot';
+    } else if (headerAnalysis.isBot) {
+      detectionMethod = 'header';
+      platform = 'header-detected-bot';
+    } else if (heuristicAnalysis.isBot) {
+      detectionMethod = 'heuristic';
+      platform = 'heuristic-detected-bot';
+    }
+  }
+
+  // Round first, then derive the flags from the rounded value. Thresholding
+  // the unrounded score let e.g. 49.5 be reported as score 50 with
+  // isBot === false, and 79.5 as score 80 with 'medium' confidence.
+  const score = Math.round(rawScore);
+
+  // Final bot determination (50% threshold)
+  const isBot = score >= 50;
+
+  // Confidence bucket
+  let confidence: BotDetectionResult['confidence'];
+  if (score >= 80) {
+    confidence = 'high';
+  } else if (score >= 60) {
+    confidence = 'medium';
+  } else {
+    confidence = 'low';
+  }
+
+  return {
+    isBot,
+    confidence,
+    platform,
+    detectionMethod,
+    score,
+    metadata: {
+      userAgent: safeUserAgent,
+      signals,
+      headerScore: headerAnalysis.confidence,
+      heuristicScore: heuristicAnalysis.confidence,
+      uaMatch,
+    },
+  };
+}
+
+/**
+ * Lightweight bot check for high-traffic paths.
+ *
+ * Looks only at the user-agent: first a case-insensitive substring test
+ * against a short list of common social/SEO crawler names, then the
+ * GENERIC_BOT_REGEX fallback on the original (non-lower-cased) string.
+ *
+ * NOTE(review): if GENERIC_BOT_REGEX is declared with the `g` or `y` flag,
+ * `.test()` keeps state in `lastIndex` between calls - confirm its flags.
+ *
+ * @param userAgent - Raw User-Agent header value.
+ * @returns true when the user-agent looks like a bot.
+ */
+export function quickBotCheck(userAgent: string): boolean {
+  const loweredAgent = userAgent.toLowerCase();
+
+  // Most common social/SEO crawlers, checked before the generic regex
+  const knownCrawlers = [
+    'facebookexternalhit',
+    'twitterbot',
+    'linkedinbot',
+    'slackbot',
+    'discordbot',
+    'telegrambot',
+    'whatsapp',
+    'googlebot',
+    'bingbot',
+  ];
+
+  // Short-circuit: the regex only runs when no known crawler name matched.
+  return (
+    knownCrawlers.some(crawler => loweredAgent.includes(crawler)) ||
+    GENERIC_BOT_REGEX.test(userAgent)
+  );
+}