Files
thrilltrack-explorer/api/botDetection/heuristics.ts
gpt-engineer-app[bot] 41f4e3b920 Fix ESLint errors
2025-10-29 23:27:37 +00:00

117 lines
3.3 KiB
TypeScript

/**
* Behavioral heuristics for bot detection
*/
/**
* Result of a heuristic bot check: the verdict plus the evidence behind it.
*/
export interface HeuristicResult {
/** Whether the request was classified as a bot. */
isBot: boolean;
/** Accumulated heuristic score, from 0 to 100. */
confidence: number; // 0-100
/** Names of the heuristics that matched, e.g. 'very-short-ua'. */
signals: string[];
}
/**
 * Score a request's User-Agent string and headers against simple bot heuristics.
 *
 * Every matching heuristic appends a signal name to `signals` and adds a fixed
 * weight to `confidence`. The total is capped at 100 and the request is flagged
 * as a bot once the score reaches 40.
 *
 * @param userAgent - Raw User-Agent string of the request.
 * @param headers - Request headers. Names are matched case-insensitively and
 *   only the first value of a multi-valued header is considered.
 * @returns The verdict, the capped confidence score, and the triggered signals
 *   in evaluation order.
 */
export function analyzeHeuristics(userAgent: string, headers: Record<string, string | string[] | undefined>): HeuristicResult {
  const signals: string[] = [];
  let confidence = 0;

  // Lower-cased once; reused by every case-insensitive substring check below.
  const lowerUserAgent = userAgent.toLowerCase();

  // Very short user agent (< 20 chars) - likely a bot
  if (userAgent.length < 20) {
    signals.push('very-short-ua');
    confidence += 25;
  }

  // Very long user agent (> 400 chars) - suspicious
  if (userAgent.length > 400) {
    signals.push('very-long-ua');
    confidence += 15;
  }

  // Neither "Mozilla" nor "compatible" present (almost all browsers send "Mozilla")
  if (!userAgent.includes('Mozilla') && !userAgent.includes('compatible')) {
    signals.push('no-mozilla');
    confidence += 20;
  }

  // Contains an http:// or https:// URL (common in bot UAs)
  if (lowerUserAgent.includes('http://') || lowerUserAgent.includes('https://')) {
    signals.push('url-in-ua');
    confidence += 30;
  }

  // Contains an email hint: "@", "[at]" or the word "email"
  if (/@|\[at\]|email/i.test(userAgent)) {
    signals.push('email-in-ua');
    confidence += 25;
  }

  // Common bot indicators in UA; only the first matching keyword is counted.
  const botKeywords = ['fetch', 'request', 'client', 'library', 'script', 'api', 'scan', 'check', 'monitor', 'test'];
  const matchedKeyword = botKeywords.find((keyword) => lowerUserAgent.includes(keyword));
  if (matchedKeyword !== undefined) {
    signals.push(`keyword-${matchedKeyword}`);
    confidence += 10;
  }

  // Programming language identifiers; only the first match is counted.
  const langIdentifiers = ['python', 'java', 'ruby', 'perl', 'go-http', 'php'];
  const matchedLang = langIdentifiers.find((lang) => lowerUserAgent.includes(lang));
  if (matchedLang !== undefined) {
    signals.push(`lang-${matchedLang}`);
    confidence += 15;
  }

  // Version number patterns typical of bots (e.g., "v1.0", "version/2.3").
  // NOTE(review): desktop Safari UAs carry a "Version/x.y" token that also
  // matches this pattern - confirm the +10 for those browsers is intended.
  if (/\b(v|version)[/\s]?\d+\.\d+/i.test(userAgent)) {
    signals.push('version-pattern');
    confidence += 10;
  }

  // Contains a plus sign that is not a version suffix such as "4.4+".
  // A "+" counts when it starts the string or follows a non-digit, so a
  // version-style plus elsewhere in the UA no longer hides a "+http://..." style one.
  if (/(?:^|\D)\+/.test(userAgent)) {
    signals.push('plus-sign');
    confidence += 15;
  }

  // No spaces at all in a UA longer than 5 chars (e.g. "curl/7.68.0") - very bot-like
  if (!userAgent.includes(' ') && userAgent.length > 5) {
    signals.push('no-spaces');
    confidence += 20;
  }

  // Normalize headers: lower-case the names and keep the first value only.
  // Missing, empty, or empty-array values are skipped so they can never
  // store `undefined` or overwrite a real value from a differently-cased key.
  const normalizedHeaders: Record<string, string> = {};
  for (const [key, value] of Object.entries(headers)) {
    const firstValue = Array.isArray(value) ? value[0] : value;
    if (firstValue) {
      normalizedHeaders[key.toLowerCase()] = firstValue;
    }
  }

  // Missing Accept-Language while Accept is present (bots often forget this)
  if (!normalizedHeaders['accept-language'] && normalizedHeaders['accept']) {
    signals.push('missing-language-header');
    confidence += 15;
  }

  // Accept: */* combined with a short (< 50 chars) UA (lazy bot implementation)
  if (normalizedHeaders['accept'] === '*/*' && userAgent.length < 50) {
    signals.push('lazy-accept-header');
    confidence += 20;
  }

  // Cap confidence at 100
  confidence = Math.min(confidence, 100);

  // Threshold for heuristic-based detection
  const isBot = confidence >= 40;

  return {
    isBot,
    confidence,
    signals,
  };
}