/**
 * Behavioral heuristics for bot detection
 */

/** Outcome of running the heuristic checks against a single request. */
export interface HeuristicResult {
  /** True when `confidence` reaches the detection threshold (40). */
  isBot: boolean;
  /** Sum of the weights of every triggered signal, capped at 100. */
  confidence: number; // 0-100
  /** Identifiers of the heuristics that fired, in evaluation order. */
  signals: string[];
}

/**
 * Analyze user-agent behavior patterns
 *
 * Runs a fixed sequence of independent checks against the user-agent string
 * and the request headers. Each check that fires appends a signal name and
 * adds its weight to the confidence score. The score is capped at 100 and the
 * request is classified as a bot when the score is 40 or higher.
 *
 * @param userAgent - Raw User-Agent string; an empty string is accepted.
 * @param headers - Request headers. Names are matched case-insensitively; for
 *   array values only the first element is considered, and empty/undefined
 *   values are ignored. If two names differ only by case, the later entry wins.
 * @returns The classification, the capped score and the triggered signals.
 */
export function analyzeHeuristics(
  userAgent: string,
  headers: Record<string, string | string[] | undefined>,
): HeuristicResult {
  const signals: string[] = [];
  let confidence = 0;

  // Record a triggered heuristic together with its weight.
  const flag = (signal: string, weight: number): void => {
    signals.push(signal);
    confidence += weight;
  };

  // Lower-cased once; shared by every case-insensitive substring check below.
  const lowerUa = userAgent.toLowerCase();

  // Very short user agent (< 20 chars) - likely a bot
  if (userAgent.length < 20) {
    flag('very-short-ua', 25);
  }

  // Very long user agent (> 400 chars) - suspicious
  if (userAgent.length > 400) {
    flag('very-long-ua', 15);
  }

  // Neither "Mozilla" nor "compatible" present (case-sensitive);
  // almost all browsers send "Mozilla".
  if (!userAgent.includes('Mozilla') && !userAgent.includes('compatible')) {
    flag('no-mozilla', 20);
  }

  // Contains an "http://" or "https://" URL (common in bot UAs)
  if (lowerUa.includes('http://') || lowerUa.includes('https://')) {
    flag('url-in-ua', 30);
  }

  // Contains "@", "[at]" or the word "email" (some bots identify with a contact email)
  if (/@|\[at\]|email/i.test(userAgent)) {
    flag('email-in-ua', 25);
  }

  // Common bot indicators in UA. Substring match; only the first keyword in
  // list order is reported, so this adds at most 10.
  const botKeywords = ['fetch', 'request', 'client', 'library', 'script', 'api', 'scan', 'check', 'monitor', 'test'];
  const matchedKeyword = botKeywords.find((keyword) => lowerUa.includes(keyword));
  if (matchedKeyword !== undefined) {
    flag(`keyword-${matchedKeyword}`, 10);
  }

  // Programming language identifiers. Substring match; only the first hit in
  // list order is reported.
  const langIdentifiers = ['python', 'java', 'ruby', 'perl', 'go-http', 'php'];
  const matchedLang = langIdentifiers.find((lang) => lowerUa.includes(lang));
  if (matchedLang !== undefined) {
    flag(`lang-${matchedLang}`, 15);
  }

  // Version number patterns typical of bots (e.g., "v1.0", "version/2.3")
  // NOTE(review): this also matches the "Version/x.y" token sent by Safari,
  // so genuine Safari requests pick up this +10 - confirm that is acceptable.
  if (/\b(v|version)[/\s]?\d+\.\d+/i.test(userAgent)) {
    flag('version-pattern', 10);
  }

  // Contains a plus (+) sign and no "<digits>+" sequence anywhere in the UA
  // (a leading "+" before a URL is common in bot UAs)
  if (userAgent.includes('+') && !/\d+\+/.test(userAgent)) {
    flag('plus-sign', 15);
  }

  // No space character at all in a UA longer than 5 chars - very bot-like
  if (!userAgent.includes(' ') && userAgent.length > 5) {
    flag('no-spaces', 20);
  }

  // Normalize headers: lower-case the names and reduce multi-valued headers
  // to their first value. Empty or missing values are dropped so they can
  // neither count as "present" nor overwrite an earlier non-empty entry.
  const normalizedHeaders: Record<string, string> = {};
  for (const [key, value] of Object.entries(headers)) {
    const first = Array.isArray(value) ? value[0] : value;
    if (first) {
      normalizedHeaders[key.toLowerCase()] = first;
    }
  }

  // Accept is present but Accept-Language is not (bots often forget this)
  if (!normalizedHeaders['accept-language'] && normalizedHeaders['accept']) {
    flag('missing-language-header', 15);
  }

  // Accept is exactly "*/*" and the UA is shorter than 50 chars
  // (lazy bot implementation)
  if (normalizedHeaders['accept'] === '*/*' && userAgent.length < 50) {
    flag('lazy-accept-header', 20);
  }

  // Cap confidence at 100
  confidence = Math.min(confidence, 100);

  const isBot = confidence >= 40; // Threshold for heuristic-based detection

  return {
    isBot,
    confidence,
    signals,
  };
}