mirror of
https://github.com/pacnpal/thrilltrack-explorer.git
synced 2025-12-20 20:51:13 -05:00
feat: Implement comprehensive bot detection
This commit is contained in:
116
api/botDetection/heuristics.ts
Normal file
116
api/botDetection/heuristics.ts
Normal file
@@ -0,0 +1,116 @@
|
||||
/**
|
||||
* Behavioral heuristics for bot detection
|
||||
*/
|
||||
|
||||
/**
 * Outcome of running the behavioral heuristics over a single request.
 */
export interface HeuristicResult {
  /** Whether the heuristics classified the request as coming from a bot. */
  isBot: boolean;
  /** Accumulated suspicion score, in the range 0-100. */
  confidence: number;
  /** Identifiers of the individual heuristics that fired (e.g. `no-mozilla`). */
  signals: string[];
}
|
||||
|
||||
/** Substrings (lower-case) that commonly appear in HTTP-library / tooling user agents. */
const BOT_KEYWORDS: readonly string[] = [
  'fetch', 'request', 'client', 'library', 'script', 'api', 'scan', 'check', 'monitor', 'test',
];

/** Substrings (lower-case) that name a programming language or its default HTTP client. */
const LANG_IDENTIFIERS: readonly string[] = ['python', 'java', 'ruby', 'perl', 'go-http', 'php'];

/** Minimum capped score at which a request is reported as a bot. */
const BOT_CONFIDENCE_THRESHOLD = 40;

/**
 * Score a request's User-Agent string and headers against behavioral bot heuristics.
 *
 * Each heuristic that fires appends an identifier to `signals` and adds a fixed
 * weight to the score. The score is capped at 100 and the request is reported as
 * a bot once the capped score reaches 40.
 *
 * @param userAgent - Raw User-Agent header value. A missing (non-string) value is
 *   treated as an empty string rather than throwing.
 * @param headers - Request headers; names are matched case-insensitively and only
 *   the first entry of a multi-valued header is considered. A missing object is
 *   treated as "no headers".
 * @returns The verdict, the capped score and the list of fired signals, in the
 *   order the heuristics are evaluated.
 */
export function analyzeHeuristics(userAgent: string, headers: Record<string, string | string[] | undefined>): HeuristicResult {
  // The signature promises a string, but the header can be absent at runtime;
  // fall back to '' so the length checks below classify it instead of throwing.
  const ua = typeof userAgent === 'string' ? userAgent : '';
  // Lower-cased once and shared by every case-insensitive substring check.
  const uaLower = ua.toLowerCase();

  const signals: string[] = [];
  let confidence = 0;

  // Very short user agent (< 20 chars) - likely a bot
  if (ua.length < 20) {
    signals.push('very-short-ua');
    confidence += 25;
  }

  // Very long user agent (> 400 chars) - suspicious
  if (ua.length > 400) {
    signals.push('very-long-ua');
    confidence += 15;
  }

  // Neither "Mozilla" nor "compatible" present (case-sensitive check)
  if (!ua.includes('Mozilla') && !ua.includes('compatible')) {
    signals.push('no-mozilla');
    confidence += 20;
  }

  // Contains an http:// or https:// URL (bots often link to an info page)
  if (uaLower.includes('http://') || uaLower.includes('https://')) {
    signals.push('url-in-ua');
    confidence += 30;
  }

  // Contains something that looks like a contact address: "@", "[at]" or "email"
  if (/@|\[at\]|email/i.test(ua)) {
    signals.push('email-in-ua');
    confidence += 25;
  }

  // Common bot indicators in UA; only the first matching keyword is counted
  const keyword = BOT_KEYWORDS.find((candidate) => uaLower.includes(candidate));
  if (keyword !== undefined) {
    signals.push(`keyword-${keyword}`);
    confidence += 10;
  }

  // Programming language identifiers; only the first match is counted
  const lang = LANG_IDENTIFIERS.find((candidate) => uaLower.includes(candidate));
  if (lang !== undefined) {
    signals.push(`lang-${lang}`);
    confidence += 15;
  }

  // Version number patterns such as "v1.0" or "version/2.3".
  // NOTE(review): a "Version/<major>.<minor>" token, as Safari is believed to
  // send, also matches and scores +10 - confirm this is intended.
  if (/\b(v|version)[\/\s]?\d+\.\d+/i.test(ua)) {
    signals.push('version-pattern');
    confidence += 10;
  }

  // Contains a plus sign, and no digit is directly followed by "+" anywhere in the UA
  if (ua.includes('+') && !/\d+\+/.test(ua)) {
    signals.push('plus-sign');
    confidence += 15;
  }

  // No space characters at all in a UA longer than 5 chars - very bot-like
  if (!ua.includes(' ') && ua.length > 5) {
    signals.push('no-spaces');
    confidence += 20;
  }

  // Normalize headers: lower-case names, first value of multi-valued headers.
  // Empty values (including empty arrays) are skipped so the map only ever
  // holds real strings.
  const normalizedHeaders: Record<string, string> = {};
  for (const [key, value] of Object.entries(headers ?? {})) {
    const first = Array.isArray(value) ? value[0] : value;
    if (first) {
      normalizedHeaders[key.toLowerCase()] = first;
    }
  }

  // Missing Accept-Language but has an Accept header (bots often forget this)
  if (!normalizedHeaders['accept-language'] && normalizedHeaders['accept']) {
    signals.push('missing-language-header');
    confidence += 15;
  }

  // Accept is exactly "*/*" combined with a short (< 50 chars) user agent
  if (normalizedHeaders['accept'] === '*/*' && ua.length < 50) {
    signals.push('lazy-accept-header');
    confidence += 20;
  }

  // Cap confidence at 100
  confidence = Math.min(confidence, 100);

  // Threshold for heuristic-based detection
  const isBot = confidence >= BOT_CONFIDENCE_THRESHOLD;

  return {
    isBot,
    confidence,
    signals,
  };
}
|
||||
Reference in New Issue
Block a user