// Mirror of https://github.com/pacnpal/thrilltrack-explorer.git
// Synced 2025-12-20 08:11:13 -05:00
// TypeScript, 145 lines, 4.1 KiB
/**
 * Comprehensive bot detection system.
 * Combines user-agent patterns, header analysis, and behavioral heuristics.
 */
|
|
|
|
import { BOT_PATTERNS, GENERIC_BOT_REGEX } from './userAgentPatterns.js';
|
|
import { analyzeHeaders } from './headerAnalysis.js';
|
|
import { analyzeHeuristics } from './heuristics.js';
|
|
|
|
/**
 * Verdict produced by `detectBot` for a single request.
 */
export interface BotDetectionResult {
  /** True when the combined score reaches the bot threshold (50). */
  isBot: boolean;
  /** Coarse bucket of the score: 'high' from 80, 'medium' from 60, otherwise 'low'. */
  confidence: 'high' | 'medium' | 'low';
  /**
   * Platform name of the matched user-agent pattern, or one of the synthetic
   * labels 'unknown-bot', 'header-detected-bot' or 'heuristic-detected-bot'
   * when only header/heuristic analysis flagged the request; null otherwise.
   */
  platform: string | null;
  /**
   * Which analysis the verdict is attributed to. 'user-agent' is also the
   * initial value, so it is what gets reported when no analysis flags the
   * request at all.
   */
  detectionMethod: 'user-agent' | 'header' | 'heuristic' | 'combination';
  /** Combined score rounded to an integer, 0-100. */
  score: number;
  /** Raw inputs and intermediate results, kept for logging and debugging. */
  metadata: {
    /** The user-agent string that was analysed. */
    userAgent: string;
    /**
     * Every signal that fired: user-agent signals start with `ua-`, header
     * signals with `header:` and heuristic signals with `heuristic:`.
     */
    signals: string[];
    /** Confidence value reported by the header analysis. */
    headerScore: number;
    /** Confidence value reported by the heuristic analysis. */
    heuristicScore: number;
    /** True when the user agent matched a known bot pattern. */
    uaMatch: boolean;
  };
}
|
|
|
|
/**
 * Main bot detection function.
 *
 * Runs three analyses and merges them into a single verdict:
 *  1. substring match of the lower-cased user agent against `BOT_PATTERNS`
 *     (the first matching entry wins),
 *  2. `analyzeHeaders(headers)`,
 *  3. `analyzeHeuristics(userAgent, headers)`.
 *
 * With a user-agent match, the pattern's base score (95 for social/seo/preview,
 * 60 for generic, 85 for any other category) can only be raised, never lowered,
 * by the 70/20/10 blend with the header and heuristic confidences. Without a
 * match, the score is the plain average of those two confidences.
 *
 * @param userAgent - Raw User-Agent value. A non-string value (for example a
 *   missing header passed through from untyped code) is treated as ''.
 * @param headers - Request headers handed to the header and heuristic analysers.
 * @returns The verdict. `isBot` and `confidence` are derived from the same
 *   rounded `score` that is returned, so the three always agree.
 */
export function detectBot(
  userAgent: string,
  headers: Record<string, string | string[] | undefined> = {}
): BotDetectionResult {
  // The parameter is typed as string, but the value normally comes from a
  // request header that may be absent; fall back to '' instead of throwing.
  const ua = typeof userAgent === 'string' ? userAgent : '';
  const userAgentLower = ua.toLowerCase();

  let detectionMethod: BotDetectionResult['detectionMethod'] = 'user-agent';
  let platform: string | null = null;
  let score = 0;
  const signals: string[] = [];

  // 1. User-agent pattern matching (most reliable)
  let uaMatch = false;
  for (const { pattern, platform: platformName, category } of BOT_PATTERNS) {
    if (!userAgentLower.includes(pattern)) {
      continue;
    }

    uaMatch = true;
    platform = platformName;

    if (category === 'social' || category === 'seo' || category === 'preview') {
      // Explicit, well-known crawlers: high confidence
      score = 95;
      signals.push(`ua-explicit-${category}`);
    } else if (category === 'generic') {
      // Generic patterns are more likely to be false positives
      score = 60;
      signals.push('ua-generic');
    } else {
      score = 85;
      signals.push(`ua-${category}`);
    }

    break; // First match wins
  }

  // 2. Header analysis
  const headerAnalysis = analyzeHeaders(headers);
  signals.push(...headerAnalysis.signals.map(s => `header:${s}`));

  // 3. Behavioral heuristics
  const heuristicAnalysis = analyzeHeuristics(ua, headers);
  signals.push(...heuristicAnalysis.signals.map(s => `heuristic:${s}`));

  // 4. Combine the scores
  if (uaMatch) {
    // Math.max keeps the pattern's base score as a floor: supporting signals
    // may raise it, but weak header/heuristic results never pull it down.
    score = Math.max(
      score,
      score * 0.7 + headerAnalysis.confidence * 0.2 + heuristicAnalysis.confidence * 0.1
    );

    if (headerAnalysis.isBot || heuristicAnalysis.isBot) {
      detectionMethod = 'combination';
    }
  } else {
    // No user-agent match: rely on header and heuristic analysis alone
    score = headerAnalysis.confidence * 0.5 + heuristicAnalysis.confidence * 0.5;

    if (headerAnalysis.isBot && heuristicAnalysis.isBot) {
      detectionMethod = 'combination';
      platform = 'unknown-bot';
    } else if (headerAnalysis.isBot) {
      detectionMethod = 'header';
      platform = 'header-detected-bot';
    } else if (heuristicAnalysis.isBot) {
      detectionMethod = 'heuristic';
      platform = 'heuristic-detected-bot';
    }
  }

  // Round once and derive everything from the rounded value. Thresholding the
  // raw score while returning the rounded one produced self-contradictory
  // results at .5 boundaries (e.g. score 50 reported with isBot false).
  const finalScore = Math.round(score);

  // Final bot determination: 50-point threshold
  const isBot = finalScore >= 50;

  let confidence: BotDetectionResult['confidence'];
  if (finalScore >= 80) {
    confidence = 'high';
  } else if (finalScore >= 60) {
    confidence = 'medium';
  } else {
    confidence = 'low';
  }

  return {
    isBot,
    confidence,
    platform,
    detectionMethod,
    score: finalScore,
    metadata: {
      userAgent: ua,
      signals,
      headerScore: headerAnalysis.confidence,
      heuristicScore: heuristicAnalysis.confidence,
      uaMatch,
    },
  };
}
|
|
|
|
/**
 * Lower-cased substrings of the most common social/SEO crawlers, checked
 * before falling back to the generic regex. Built once at module load rather
 * than on every call.
 */
const QUICK_BOT_PATTERNS: readonly string[] = [
  'facebookexternalhit', 'twitterbot', 'linkedinbot', 'slackbot',
  'discordbot', 'telegrambot', 'whatsapp', 'googlebot', 'bingbot',
];

/**
 * Quick bot check for high-traffic scenarios (lightweight).
 *
 * Looks only at the user agent: first a case-insensitive substring check
 * against a short list of well-known crawlers, then `GENERIC_BOT_REGEX`.
 *
 * @param userAgent - Raw User-Agent value. A non-string value (for example a
 *   missing header passed through from untyped code) is treated as ''.
 * @returns True when the user agent matches a known crawler name or the
 *   generic bot regex.
 */
export function quickBotCheck(userAgent: string): boolean {
  // Guard against an absent header reaching this function from untyped code.
  const ua = typeof userAgent === 'string' ? userAgent : '';
  const userAgentLower = ua.toLowerCase();

  // Check most common social/SEO bots first
  if (QUICK_BOT_PATTERNS.some(pattern => userAgentLower.includes(pattern))) {
    return true;
  }

  // Generic regex check. search() always scans from index 0 and restores
  // lastIndex afterwards, so the answer does not depend on earlier calls even
  // if the shared regex carries the `g` or `y` flag (RegExp.prototype.test
  // would then resume from the previous match position).
  return ua.search(GENERIC_BOT_REGEX) !== -1;
}
|