/**
 * Comprehensive bot detection system
 * Combines user-agent patterns, header analysis, and behavioral heuristics
 */

import { BOT_PATTERNS, GENERIC_BOT_REGEX } from './userAgentPatterns.js';
import { analyzeHeaders } from './headerAnalysis.js';
import { analyzeHeuristics } from './heuristics.js';

/**
 * Outcome of a full bot-detection pass produced by {@link detectBot}.
 */
export interface BotDetectionResult {
  /** True when the combined score reaches the 50-point threshold. */
  isBot: boolean;
  /** Bucketed score: `high` (>= 80), `medium` (>= 60), otherwise `low`. */
  confidence: 'high' | 'medium' | 'low';
  /** Matched platform name, a synthetic `*-bot` label, or null when nothing was flagged. */
  platform: string | null;
  /** Which analysis stage(s) flagged the request. */
  detectionMethod: 'user-agent' | 'header' | 'heuristic' | 'combination';
  /** Rounded combined score, 0-100. */
  score: number;
  metadata: {
    userAgent: string;
    /** Signals from every stage; header/heuristic ones are prefixed `header:` / `heuristic:`. */
    signals: string[];
    headerScore: number;
    heuristicScore: number;
    uaMatch: boolean;
  };
}

/** Score at or above which a request is classified as a bot. */
const BOT_SCORE_THRESHOLD = 50;

/**
 * Well-known social/SEO crawler tokens (lowercase) checked by
 * {@link quickBotCheck}. Hoisted so the array is not re-allocated per call.
 */
const QUICK_BOT_PATTERNS: readonly string[] = [
  'facebookexternalhit',
  'twitterbot',
  'linkedinbot',
  'slackbot',
  'discordbot',
  'telegrambot',
  'whatsapp',
  'googlebot',
  'bingbot',
];

/**
 * Main bot detection function.
 *
 * Runs three stages: substring matching of the lowercased user agent against
 * `BOT_PATTERNS` (first match wins), `analyzeHeaders`, and `analyzeHeuristics`,
 * then blends their scores into a single 0-100 value.
 *
 * @param userAgent - Raw User-Agent value. A missing (non-string) value is
 *   treated as an empty string instead of throwing.
 * @param headers - Request headers passed to the header and heuristic
 *   analyzers. NOTE(review): the type arguments were missing in the original
 *   (`Record`); `<string, string>` is assumed — confirm against the analyzers.
 * @returns The detection verdict together with the signals that produced it.
 */
export function detectBot(
  userAgent: string,
  headers: Record<string, string> = {}
): BotDetectionResult {
  // Requests without a User-Agent header are common; normalize rather than
  // crash on `.toLowerCase()` when an untyped caller passes undefined/null.
  const ua = typeof userAgent === 'string' ? userAgent : '';
  const userAgentLower = ua.toLowerCase();

  let detectionMethod: BotDetectionResult['detectionMethod'] = 'user-agent';
  let platform: string | null = null;
  let score = 0;
  const signals: string[] = [];

  // 1. User-Agent Pattern Matching (most reliable)
  let uaMatch = false;
  for (const { pattern, platform: platformName, category } of BOT_PATTERNS) {
    if (!userAgentLower.includes(pattern)) {
      continue;
    }

    uaMatch = true;
    platform = platformName;

    if (category === 'social' || category === 'seo' || category === 'preview') {
      // High confidence for explicit matches
      score = 95;
      signals.push(`ua-explicit-${category}`);
    } else if (category === 'generic') {
      // Lower confidence for generic patterns
      score = 60;
      signals.push('ua-generic');
    } else {
      score = 85;
      signals.push(`ua-${category}`);
    }
    break; // First match wins
  }

  // 2. Header Analysis
  const headerAnalysis = analyzeHeaders(headers);
  signals.push(...headerAnalysis.signals.map(s => `header:${s}`));

  // 3. Behavioral Heuristics
  const heuristicAnalysis = analyzeHeuristics(ua, headers);
  signals.push(...heuristicAnalysis.signals.map(s => `heuristic:${s}`));

  // 4. Combine scores with weighted approach
  // NOTE(review): the weights assume both analyzers report `confidence` on the
  // same 0-100 scale as `score` — confirm in headerAnalysis/heuristics.
  if (uaMatch) {
    // User-agent match found: other signals may raise the score but never
    // lower it below the base score of the matched category.
    score = Math.max(
      score,
      score * 0.7 +
        headerAnalysis.confidence * 0.2 +
        heuristicAnalysis.confidence * 0.1
    );

    if (headerAnalysis.isBot || heuristicAnalysis.isBot) {
      detectionMethod = 'combination';
    }
  } else {
    // No user-agent match: rely on header and heuristic analysis equally.
    score =
      headerAnalysis.confidence * 0.5 + heuristicAnalysis.confidence * 0.5;

    // NOTE(review): platform/detectionMethod are set from the individual
    // analyzers' verdicts, so they can be populated even when the averaged
    // score ends up below the bot threshold (isBot === false).
    if (headerAnalysis.isBot && heuristicAnalysis.isBot) {
      detectionMethod = 'combination';
      platform = 'unknown-bot';
    } else if (headerAnalysis.isBot) {
      detectionMethod = 'header';
      platform = 'header-detected-bot';
    } else if (heuristicAnalysis.isBot) {
      detectionMethod = 'heuristic';
      platform = 'heuristic-detected-bot';
    }
  }

  // Final bot determination
  const isBot = score >= BOT_SCORE_THRESHOLD;

  // Determine confidence level
  let confidence: BotDetectionResult['confidence'];
  if (score >= 80) {
    confidence = 'high';
  } else if (score >= 60) {
    confidence = 'medium';
  } else {
    confidence = 'low';
  }

  return {
    isBot,
    confidence,
    platform,
    detectionMethod,
    // Keep the reported score inside the documented 0-100 range.
    score: Math.min(100, Math.max(0, Math.round(score))),
    metadata: {
      userAgent: ua,
      signals,
      headerScore: headerAnalysis.confidence,
      heuristicScore: heuristicAnalysis.confidence,
      uaMatch,
    },
  };
}

/**
 * Quick bot check for high-traffic scenarios (lightweight).
 *
 * Checks the lowercased user agent for a short list of common social/SEO
 * crawler tokens, then falls back to `GENERIC_BOT_REGEX` on the raw value.
 *
 * @param userAgent - Raw User-Agent value. A missing (non-string) value is
 *   treated as an empty string.
 * @returns True when a quick pattern or the generic regex matches.
 */
export function quickBotCheck(userAgent: string): boolean {
  const ua = typeof userAgent === 'string' ? userAgent : '';
  const userAgentLower = ua.toLowerCase();

  // Check most common social/SEO bots first
  if (QUICK_BOT_PATTERNS.some(pattern => userAgentLower.includes(pattern))) {
    return true;
  }

  // Generic regex check. `RegExp.prototype.test` resumes from `lastIndex` on
  // global/sticky regexes, so reset it to keep repeated calls on the shared
  // regex deterministic (a no-op for regexes without those flags).
  GENERIC_BOT_REGEX.lastIndex = 0;
  return GENERIC_BOT_REGEX.test(ua);
}