mirror of
https://github.com/pacnpal/thrilltrack-explorer.git
synced 2025-12-20 06:11:11 -05:00
117 lines
3.3 KiB
TypeScript
117 lines
3.3 KiB
TypeScript
/**
 * Behavioral heuristics for bot detection
 */
|
|
|
|
/**
 * Outcome of running the behavioral bot heuristics against a single request.
 */
export interface HeuristicResult {
  /** Whether the heuristics classified the request as coming from a bot. */
  isBot: boolean;

  /** Accumulated heuristic score, on a 0-100 scale. */
  confidence: number;

  /** Names of the individual heuristics that fired. */
  signals: string[];
}
|
|
|
|
/**
 * Score a request's user-agent string and headers against a set of bot heuristics.
 *
 * Every rule that fires appends its signal name to the result and adds a fixed
 * weight to the confidence score. The score is capped at 100, and the request
 * is reported as a bot once the score reaches 40.
 *
 * @param userAgent - Raw User-Agent string of the request.
 * @param headers - Request headers. Names are compared case-insensitively,
 *   empty/undefined values are ignored, and only the first entry of an
 *   array-valued header is considered.
 * @returns The bot verdict, the capped confidence score and the fired signals
 *   in the order the rules were evaluated.
 */
export function analyzeHeuristics(userAgent: string, headers: Record<string, string | string[] | undefined>): HeuristicResult {
  const signals: string[] = [];
  let confidence = 0;

  // Record a fired heuristic together with its contribution to the score.
  const flag = (signal: string, weight: number): void => {
    signals.push(signal);
    confidence += weight;
  };

  const uaLength = userAgent.length;
  // Lower-cased once; all case-insensitive substring rules share it.
  const lowered = userAgent.toLowerCase();

  // --- Length-based rules -------------------------------------------------
  // Fewer than 20 characters: likely a bot.
  if (uaLength < 20) flag('very-short-ua', 25);
  // More than 400 characters: suspicious.
  if (uaLength > 400) flag('very-long-ua', 15);

  // --- Content-based rules ------------------------------------------------
  // Neither "Mozilla" nor "compatible" present (almost all browsers have one).
  const looksLikeBrowser = userAgent.includes('Mozilla') || userAgent.includes('compatible');
  if (!looksLikeBrowser) flag('no-mozilla', 20);

  // A URL embedded in the UA is common for self-identifying bots.
  if (lowered.includes('http://') || lowered.includes('https://')) flag('url-in-ua', 30);

  // Contact e-mail hints ("@", "[at]" or the word "email").
  if (/@|\[at\]|email/i.test(userAgent)) flag('email-in-ua', 25);

  // Generic tooling keywords; only the first one found (in list order) counts.
  const toolKeywords = ['fetch', 'request', 'client', 'library', 'script', 'api', 'scan', 'check', 'monitor', 'test'];
  const keywordHit = toolKeywords.find((candidate) => lowered.includes(candidate));
  if (keywordHit !== undefined) flag(`keyword-${keywordHit}`, 10);

  // Programming-language / HTTP-library identifiers; first match only.
  const languageMarkers = ['python', 'java', 'ruby', 'perl', 'go-http', 'php'];
  const languageHit = languageMarkers.find((candidate) => lowered.includes(candidate));
  if (languageHit !== undefined) flag(`lang-${languageHit}`, 15);

  // Version tokens such as "v1.0" or "version/2.3".
  if (/\b(v|version)[/\s]?\d+\.\d+/i.test(userAgent)) flag('version-pattern', 10);

  // A "+" that is not directly preceded by a digit anywhere in the UA.
  if (userAgent.includes('+') && !/\d+\+/.test(userAgent)) flag('plus-sign', 15);

  // No spaces at all in a UA longer than 5 characters.
  if (uaLength > 5 && !userAgent.includes(' ')) flag('no-spaces', 20);

  // --- Header-based rules -------------------------------------------------
  // Collapse headers to lower-cased names -> first value. Later entries with
  // the same lower-cased name overwrite earlier ones; falsy values are skipped.
  const firstValues = new Map<string, string>();
  for (const [name, raw] of Object.entries(headers)) {
    if (!raw) continue;
    firstValues.set(name.toLowerCase(), Array.isArray(raw) ? raw[0] : raw);
  }
  const accept = firstValues.get('accept');
  const acceptLanguage = firstValues.get('accept-language');

  // An Accept header without Accept-Language.
  if (accept && !acceptLanguage) flag('missing-language-header', 15);

  // Wildcard Accept combined with a short (< 50 chars) user agent.
  if (accept === '*/*' && uaLength < 50) flag('lazy-accept-header', 20);

  // Cap the score at 100; 40 is the threshold for heuristic-based detection.
  confidence = Math.min(confidence, 100);

  return {
    isBot: confidence >= 40,
    confidence,
    signals,
  };
}
|