Files
thrilltrack-explorer/api/botDetection/index.ts
2025-10-29 20:49:26 +00:00

145 lines
4.1 KiB
TypeScript

/**
* Comprehensive bot detection system
* Combines user-agent patterns, header analysis, and behavioral heuristics
*/
import { BOT_PATTERNS, GENERIC_BOT_REGEX } from './userAgentPatterns';
import { analyzeHeaders } from './headerAnalysis';
import { analyzeHeuristics } from './heuristics';
/**
 * Result object returned by `detectBot`: the verdict, how confident it is,
 * which analysis it is attributed to, and the raw inputs behind the decision.
 */
export interface BotDetectionResult {
/** True when the combined score reaches the 50-point threshold applied by `detectBot`. */
isBot: boolean;
/** Score band assigned by `detectBot`: 'high' from 80, 'medium' from 60, otherwise 'low'. */
confidence: 'high' | 'medium' | 'low';
/**
 * Platform name of the matched user-agent pattern; one of the labels
 * 'unknown-bot', 'header-detected-bot' or 'heuristic-detected-bot' when only
 * the header/heuristic analyzers flagged the request; null otherwise.
 */
platform: string | null;
/** Analysis the verdict is attributed to; `detectBot` starts from 'user-agent'. */
detectionMethod: 'user-agent' | 'header' | 'heuristic' | 'combination';
score: number; // 0-100, rounded to the nearest integer before being returned
/** Diagnostic details describing how the score was reached. */
metadata: {
/** User-agent string the detection was run against. */
userAgent: string;
/**
 * Labels collected during detection: `ua-…` entries from the user-agent
 * match, plus analyzer signals prefixed with `header:` or `heuristic:`.
 */
signals: string[];
/** Confidence value reported by the header analyzer. */
headerScore: number;
/** Confidence value reported by the heuristic analyzer. */
heuristicScore: number;
/** True when the user agent contained one of the known bot patterns. */
uaMatch: boolean;
};
}
/**
 * Main bot detection function.
 *
 * Runs three analyses and blends them into a single 0-100 score:
 *  1. substring match of the lower-cased user agent against `BOT_PATTERNS`
 *     (the first matching entry wins),
 *  2. `analyzeHeaders` on the request headers,
 *  3. `analyzeHeuristics` on the user agent plus headers.
 *
 * The blended score is rounded exactly once; the `isBot` verdict, the
 * confidence band and the reported `score` are all derived from that rounded
 * value, so the returned object cannot contradict itself (for example
 * `score: 50` together with `isBot: false`).
 *
 * @param userAgent - Raw User-Agent header value. A non-string value (such as
 *   `undefined` forwarded by an untyped caller when the header is absent) is
 *   treated as an empty string instead of throwing.
 * @param headers - Request headers handed to the header and heuristic analyzers.
 * @returns The verdict, its confidence band, and the signals that produced it.
 */
export function detectBot(
  userAgent: string,
  headers: Record<string, string | string[] | undefined> = {}
): BotDetectionResult {
  // Requests without a User-Agent header reach us as undefined from untyped
  // callers; normalise so the rest of the function can rely on a string.
  const safeUserAgent = typeof userAgent === 'string' ? userAgent : '';
  const userAgentLower = safeUserAgent.toLowerCase();

  let detectionMethod: BotDetectionResult['detectionMethod'] = 'user-agent';
  let platform: string | null = null;
  let score = 0;
  const signals: string[] = [];

  // 1. User-Agent pattern matching (most reliable)
  let uaMatch = false;
  for (const { pattern, platform: platformName, category } of BOT_PATTERNS) {
    if (userAgentLower.includes(pattern)) {
      uaMatch = true;
      platform = platformName;

      if (category === 'social' || category === 'seo' || category === 'preview') {
        // High confidence for explicit, well-known crawlers
        score = 95;
        signals.push(`ua-explicit-${category}`);
      } else if (category === 'generic') {
        // Generic substrings are weaker evidence
        score = 60;
        signals.push('ua-generic');
      } else {
        score = 85;
        signals.push(`ua-${category}`);
      }
      break; // First match wins
    }
  }

  // 2. Header analysis
  const headerAnalysis = analyzeHeaders(headers);
  signals.push(...headerAnalysis.signals.map(s => `header:${s}`));

  // 3. Behavioral heuristics
  const heuristicAnalysis = analyzeHeuristics(safeUserAgent, headers);
  signals.push(...heuristicAnalysis.signals.map(s => `heuristic:${s}`));

  // 4. Combine scores with a weighted approach
  if (uaMatch) {
    // Other signals may raise the user-agent score but never lower it
    score = Math.max(
      score,
      score * 0.7 + headerAnalysis.confidence * 0.2 + heuristicAnalysis.confidence * 0.1
    );
    if (headerAnalysis.isBot || heuristicAnalysis.isBot) {
      detectionMethod = 'combination';
    }
  } else {
    // No user-agent match - rely on header and heuristic analysis equally
    score = headerAnalysis.confidence * 0.5 + heuristicAnalysis.confidence * 0.5;
    if (headerAnalysis.isBot && heuristicAnalysis.isBot) {
      detectionMethod = 'combination';
      platform = 'unknown-bot';
    } else if (headerAnalysis.isBot) {
      detectionMethod = 'header';
      platform = 'header-detected-bot';
    } else if (heuristicAnalysis.isBot) {
      detectionMethod = 'heuristic';
      platform = 'heuristic-detected-bot';
    }
  }

  // Round once so the verdict, the confidence band and the reported score
  // are all computed from the same number.
  const finalScore = Math.round(score);

  // Final bot determination: 50% confidence threshold
  const isBot = finalScore >= 50;

  // Neither analyzer flagged the request on its own, yet their blended score
  // crossed the threshold: attribute the verdict to the combination rather
  // than leaving the 'user-agent' default with no matched platform.
  if (isBot && !uaMatch && platform === null) {
    detectionMethod = 'combination';
    platform = 'unknown-bot';
  }

  // Determine confidence level
  let confidence: BotDetectionResult['confidence'];
  if (finalScore >= 80) {
    confidence = 'high';
  } else if (finalScore >= 60) {
    confidence = 'medium';
  } else {
    confidence = 'low';
  }

  return {
    isBot,
    confidence,
    platform,
    detectionMethod,
    score: finalScore,
    metadata: {
      userAgent: safeUserAgent,
      signals,
      headerScore: headerAnalysis.confidence,
      heuristicScore: heuristicAnalysis.confidence,
      uaMatch,
    },
  };
}
/**
 * Lower-cased substrings of the most common social/SEO crawlers.
 * Kept at module level so the hot path does not rebuild the list per call.
 */
const QUICK_BOT_PATTERNS: readonly string[] = [
  'facebookexternalhit', 'twitterbot', 'linkedinbot', 'slackbot',
  'discordbot', 'telegrambot', 'whatsapp', 'googlebot', 'bingbot',
];

/**
 * Quick bot check for high-traffic scenarios (lightweight).
 *
 * Looks for a handful of well-known crawler substrings first and falls back
 * to `GENERIC_BOT_REGEX` when none of them is present.
 *
 * @param userAgent - Raw User-Agent header value. A non-string value (such as
 *   `undefined` when the header is absent) is treated as an empty string
 *   instead of throwing.
 * @returns True when the user agent looks like a bot.
 */
export function quickBotCheck(userAgent: string): boolean {
  const safeUserAgent = typeof userAgent === 'string' ? userAgent : '';
  const userAgentLower = safeUserAgent.toLowerCase();

  // Check the most common social/SEO bots first
  if (QUICK_BOT_PATTERNS.some(pattern => userAgentLower.includes(pattern))) {
    return true;
  }

  // GENERIC_BOT_REGEX is defined in another module. If it carries the `g` or
  // `y` flag, test() resumes from the previous call's lastIndex and can miss
  // a match; resetting it is harmless for a regex without those flags.
  GENERIC_BOT_REGEX.lastIndex = 0;
  return GENERIC_BOT_REGEX.test(safeUserAgent);
}