mirror of
https://github.com/pacnpal/thrilltrack-explorer.git
synced 2025-12-20 15:31:13 -05:00
feat: Implement comprehensive bot detection
This commit is contained in:
144
api/botDetection/index.ts
Normal file
144
api/botDetection/index.ts
Normal file
@@ -0,0 +1,144 @@
|
||||
/**
 * Comprehensive bot detection system
 * Combines user-agent patterns, header analysis, and behavioral heuristics
 */
|
||||
|
||||
import { BOT_PATTERNS, GENERIC_BOT_REGEX } from './userAgentPatterns';
|
||||
import { analyzeHeaders } from './headerAnalysis';
|
||||
import { analyzeHeuristics } from './heuristics';
|
||||
|
||||
/**
 * Result returned by {@link detectBot}: the final verdict together with the
 * intermediate values that produced it.
 */
export interface BotDetectionResult {
  /** Final verdict for the request. */
  isBot: boolean;

  /** Coarse bucket derived from `score`. */
  confidence: 'high' | 'medium' | 'low';

  /** Name of the detected bot platform, or `null` when none was identified. */
  platform: string | null;

  /** Which detection stage (or mix of stages) the verdict is attributed to. */
  detectionMethod: 'user-agent' | 'header' | 'heuristic' | 'combination';

  /** Combined bot score, 0-100. */
  score: number;

  /** Raw inputs and per-stage values, kept for logging and debugging. */
  metadata: {
    /** The user-agent string exactly as it was passed in. */
    userAgent: string;
    /** Every signal raised by the user-agent, header and heuristic stages. */
    signals: string[];
    /** Confidence value reported by the header analysis. */
    headerScore: number;
    /** Confidence value reported by the heuristic analysis. */
    heuristicScore: number;
    /** Whether the user-agent matched one of the known bot patterns. */
    uaMatch: boolean;
  };
}
|
||||
|
||||
/** Outcome of matching a user-agent against the known bot patterns. */
interface UserAgentMatch {
  platform: string;
  score: number;
  signal: string;
}

/**
 * Finds the first `BOT_PATTERNS` entry whose pattern occurs in the given
 * lower-cased user-agent and maps its category to a base score and signal.
 * Returns `null` when no pattern matches.
 */
function matchUserAgent(userAgentLower: string): UserAgentMatch | null {
  for (const { pattern, platform, category } of BOT_PATTERNS) {
    if (!userAgentLower.includes(pattern)) {
      continue;
    }

    // First match wins, so the order of BOT_PATTERNS decides ties.
    if (category === 'social' || category === 'seo' || category === 'preview') {
      // High confidence for explicit matches
      return { platform, score: 95, signal: `ua-explicit-${category}` };
    }
    if (category === 'generic') {
      // Lower confidence for generic patterns
      return { platform, score: 60, signal: 'ua-generic' };
    }
    return { platform, score: 85, signal: `ua-${category}` };
  }

  return null;
}

/**
 * Main bot detection function.
 *
 * Runs three stages - user-agent pattern matching, header analysis and
 * behavioral heuristics - and merges them into one score:
 *
 * - With a user-agent match, the pattern's base score is kept unless the
 *   70/20/10 weighted blend with the header and heuristic confidences is
 *   higher.
 * - Without a match, the score is the plain average of the header and
 *   heuristic confidences.
 *
 * A request is classified as a bot when the score is at least 50.
 *
 * NOTE(review): the blend assumes `analyzeHeaders` / `analyzeHeuristics`
 * report `confidence` on a 0-100 scale - confirm against those modules. The
 * final score is clamped to 0-100 so the documented range of
 * `BotDetectionResult.score` holds regardless.
 *
 * @param userAgent - Raw `User-Agent` value; matched case-insensitively.
 * @param headers - Request headers passed to the header and heuristic stages.
 * @returns The verdict plus per-stage metadata. `platform` is `null` whenever
 *   `isBot` is `false`.
 */
export function detectBot(
  userAgent: string,
  headers: Record<string, string | string[] | undefined> = {}
): BotDetectionResult {
  const signals: string[] = [];
  let detectionMethod: BotDetectionResult['detectionMethod'] = 'user-agent';
  let platform: string | null = null;
  let score = 0;

  // 1. User-Agent Pattern Matching (most reliable)
  const uaHit = matchUserAgent(userAgent.toLowerCase());
  const uaMatch = uaHit !== null;
  if (uaHit !== null) {
    platform = uaHit.platform;
    score = uaHit.score;
    signals.push(uaHit.signal);
  }

  // 2. Header Analysis
  const headerAnalysis = analyzeHeaders(headers);
  signals.push(...headerAnalysis.signals.map((s) => `header:${s}`));

  // 3. Behavioral Heuristics
  const heuristicAnalysis = analyzeHeuristics(userAgent, headers);
  signals.push(...heuristicAnalysis.signals.map((s) => `heuristic:${s}`));

  // 4. Combine scores with weighted approach
  if (uaMatch) {
    // The other stages may raise the score of a UA match but never lower it.
    const blended =
      score * 0.7 + headerAnalysis.confidence * 0.2 + heuristicAnalysis.confidence * 0.1;
    score = Math.max(score, blended);

    if (headerAnalysis.isBot || heuristicAnalysis.isBot) {
      detectionMethod = 'combination';
    }
  } else {
    // No user-agent match - rely on header and heuristic analysis
    score = headerAnalysis.confidence * 0.5 + heuristicAnalysis.confidence * 0.5;

    if (headerAnalysis.isBot && heuristicAnalysis.isBot) {
      detectionMethod = 'combination';
      platform = 'unknown-bot';
    } else if (headerAnalysis.isBot) {
      detectionMethod = 'header';
      platform = 'header-detected-bot';
    } else if (heuristicAnalysis.isBot) {
      detectionMethod = 'heuristic';
      platform = 'heuristic-detected-bot';
    }
  }

  // Keep the score inside the documented 0-100 range even if an analyzer
  // reports an out-of-range confidence.
  score = Math.min(100, Math.max(0, score));

  // Final bot determination (50% confidence threshold)
  const isBot = score >= 50;

  // A single analyzer can flag the request while the averaged score stays
  // under the threshold; do not report a bot platform for a non-bot verdict.
  if (!isBot) {
    platform = null;
  }

  // Determine confidence level
  let confidence: BotDetectionResult['confidence'];
  if (score >= 80) {
    confidence = 'high';
  } else if (score >= 60) {
    confidence = 'medium';
  } else {
    confidence = 'low';
  }

  return {
    isBot,
    confidence,
    platform,
    detectionMethod,
    score: Math.round(score),
    metadata: {
      userAgent,
      signals,
      headerScore: headerAnalysis.confidence,
      heuristicScore: heuristicAnalysis.confidence,
      uaMatch,
    },
  };
}
|
||||
|
||||
/**
 * Lower-cased user-agent fragments of the most common social/SEO bots.
 * Kept at module scope so the list is allocated once instead of on every
 * call to {@link quickBotCheck}.
 */
const QUICK_BOT_PATTERNS: readonly string[] = [
  'facebookexternalhit', 'twitterbot', 'linkedinbot', 'slackbot',
  'discordbot', 'telegrambot', 'whatsapp', 'googlebot', 'bingbot',
];

/**
 * Quick bot check for high-traffic scenarios (lightweight).
 *
 * Looks for a short list of well-known bot names first and falls back to
 * `GENERIC_BOT_REGEX` when none of them is present. No header or heuristic
 * analysis is performed.
 *
 * @param userAgent - Raw `User-Agent` value; the name list is matched
 *   case-insensitively, the regex is applied to the value as given.
 * @returns `true` when the user-agent looks like a bot.
 */
export function quickBotCheck(userAgent: string): boolean {
  const userAgentLower = userAgent.toLowerCase();

  // Check most common social/SEO bots first
  if (QUICK_BOT_PATTERNS.some((pattern) => userAgentLower.includes(pattern))) {
    return true;
  }

  // Generic regex check. `search` always scans from index 0 and ignores the
  // regex's `lastIndex`, so the result cannot depend on earlier calls even if
  // the shared regex is declared with the `g` or `y` flag (unlike `test`).
  return userAgent.search(GENERIC_BOT_REGEX) !== -1;
}
|
||||
Reference in New Issue
Block a user