Files
thrilltrack-explorer/api/botDetection/headerAnalysis.ts
2025-10-29 20:49:26 +00:00

107 lines
2.9 KiB
TypeScript

/**
* Header-based bot detection
*/
/**
 * Outcome of a header-based bot check, as returned by `analyzeHeaders`.
 */
export interface HeaderAnalysisResult {
  /** Final verdict; `analyzeHeaders` sets this when `confidence` reaches its threshold. */
  isBot: boolean;
  /** Sum of the weights of all matched heuristics, on a 0-100 scale. */
  confidence: number; // 0-100
  /** Labels of the heuristics that matched, in the order they were evaluated. */
  signals: string[];
}
/**
 * Analyze request headers for bot indicators.
 *
 * Every heuristic that matches appends a label to `signals` and adds a fixed
 * weight to `confidence`. The total is capped at 100, and the request is
 * reported as a bot once the capped score reaches 30.
 *
 * Header names are matched case-insensitively. A header supplied as an array
 * (repeated field lines) is combined into a single comma-separated value, so
 * every value takes part in the checks rather than only the first one.
 * Headers that are `undefined`, an empty string, or an array without any
 * non-empty entry are treated as absent.
 *
 * @param headers - Request headers keyed by name; values may be a single
 *   string, a list of strings, or `undefined`.
 * @returns The verdict, the capped score, and the labels of matched heuristics
 *   in evaluation order.
 */
export function analyzeHeaders(headers: Record<string, string | string[] | undefined>): HeaderAnalysisResult {
  // Capped score at or above which the request is reported as a bot.
  const botThreshold = 30;
  const maxConfidence = 100;

  // Lower-cased name -> combined value. Only non-empty values are stored, so
  // "present" and "has a usable value" are the same thing below.
  const normalized = new Map<string, string>();
  for (const [name, raw] of Object.entries(headers)) {
    const value = Array.isArray(raw)
      ? raw.filter((part) => Boolean(part)).join(', ')
      : raw;
    if (value) {
      normalized.set(name.toLowerCase(), value);
    }
  }
  const has = (name: string): boolean => normalized.has(name);

  const signals: string[] = [];
  let confidence = 0;
  const flag = (signal: string, weight: number): void => {
    signals.push(signal);
    confidence += weight;
  };

  // Explicit bot-identifying header.
  if (normalized.get('x-purpose') === 'preview') {
    flag('x-purpose-preview', 40);
  }

  // Header attached by the Chrome DevTools Protocol (headless automation).
  if (has('x-devtools-emulate-network-conditions-client-id')) {
    flag('devtools-protocol', 30);
  }

  // Headers that ordinary browsers normally send.
  if (!has('accept-language')) {
    flag('missing-accept-language', 15);
  }
  if (!has('accept-encoding')) {
    flag('missing-accept-encoding', 10);
  }

  // Accept header that asks for neither HTML nor a wildcard.
  const accept = normalized.get('accept');
  if (accept !== undefined && !accept.includes('text/html') && !accept.includes('*/*')) {
    flag('non-html-accept', 15);
  }

  // Direct access without a referer; both spellings are accepted.
  if (!has('referer') && !has('referrer')) {
    flag('no-referer', 5);
  }

  // XHR marker: may be legitimate AJAX, but is also common in automation.
  if (normalized.get('x-requested-with') === 'XMLHttpRequest') {
    flag('xhr-request', 5);
  }

  // Bare Accept value typical of scrapers. This intentionally stacks with
  // 'non-html-accept' for 'application/json'.
  if (accept === '*/*' || accept === 'application/json') {
    flag('simple-accept', 10);
  }

  // Neither cookies nor a DNT preference.
  if (!has('cookie') && !has('dnt')) {
    flag('no-cookie-or-dnt', 5);
  }

  // Forwarding header added by proxies/CDNs.
  if (has('x-forwarded-for')) {
    flag('has-x-forwarded-for', 5);
  }

  // Cloudflare on its own is normal; only the combination of a resolved
  // country with no Accept-Language is counted.
  if (has('cf-ray') && has('cf-ipcountry') && !has('accept-language')) {
    flag('cloudflare-without-language', 10);
  }

  const capped = Math.min(confidence, maxConfidence);
  return {
    isBot: capped >= botThreshold,
    confidence: capped,
    signals,
  };
}