// Mirror of https://github.com/pacnpal/thrilltrack-explorer.git
// Synced 2025-12-20 16:11:12 -05:00
// 107 lines, 2.9 KiB, TypeScript
/**
 * Header-based bot detection
 */

/**
 * Outcome of a header-based bot check, as produced by `analyzeHeaders`.
 */
export interface HeaderAnalysisResult {
  /** Whether the headers look automated (`analyzeHeaders` sets this when `confidence` is 30 or more). */
  isBot: boolean;
  /** Accumulated weight of all matched signals, on a 0-100 scale. */
  confidence: number; // 0-100
  /** Labels of the heuristics that matched, in the order they were evaluated. */
  signals: string[];
}

/**
 * Analyze request headers for bot indicators.
 *
 * Every heuristic that matches appends a label to `signals` and adds a fixed
 * weight to `confidence`. The total is capped at 100, and the request is
 * reported as a bot once the total reaches 30.
 *
 * Header names are matched case-insensitively. A header supplied as an array
 * of values is combined into a single comma-separated value (the way repeated
 * HTTP field lines are combined), so no value is silently dropped; empty
 * strings and empty arrays count as "header not present".
 *
 * @param headers - Request headers keyed by name; values may be a single
 *   string, a list of strings, or `undefined`.
 * @returns The bot verdict, the capped confidence score and the matched signals.
 */
export function analyzeHeaders(headers: Record<string, string | string[] | undefined>): HeaderAnalysisResult {
  const signals: string[] = [];
  let confidence = 0;

  // Record a matched heuristic together with its weight.
  const flag = (signal: string, weight: number): void => {
    signals.push(signal);
    confidence += weight;
  };

  // Normalize header names to lowercase and collapse multi-valued headers
  // into one comma-separated string.
  const normalizedHeaders: Record<string, string> = {};
  for (const [key, value] of Object.entries(headers)) {
    const combined = Array.isArray(value) ? value.filter((part) => part).join(', ') : value;
    if (combined) {
      normalizedHeaders[key.toLowerCase()] = combined;
    }
  }

  // True when any comma-separated element of the header equals `expected`.
  // For a plain single value this is the same as a strict equality check.
  const hasListValue = (name: string, expected: string): boolean => {
    const raw = normalizedHeaders[name];
    return raw !== undefined && raw.split(',').some((part) => part.trim() === expected);
  };

  // Check for explicit bot-identifying headers
  if (hasListValue('x-purpose', 'preview')) {
    flag('x-purpose-preview', 40);
  }

  // Check for headless Chrome DevTools Protocol
  if (normalizedHeaders['x-devtools-emulate-network-conditions-client-id']) {
    flag('devtools-protocol', 30);
  }

  // Missing typical browser headers
  const hasAcceptLanguage = Boolean(normalizedHeaders['accept-language']);
  if (!hasAcceptLanguage) {
    flag('missing-accept-language', 15);
  }

  if (!normalizedHeaders['accept-encoding']) {
    flag('missing-accept-encoding', 10);
  }

  // Suspicious Accept header (not typical browser)
  const accept = normalizedHeaders['accept'];
  if (accept && !accept.includes('text/html') && !accept.includes('*/*')) {
    flag('non-html-accept', 15);
  }

  // Direct access without referer (common for bots)
  if (!normalizedHeaders['referer'] && !normalizedHeaders['referrer']) {
    flag('no-referer', 5);
  }

  // XHR requests might be AJAX but also automation
  if (hasListValue('x-requested-with', 'XMLHttpRequest')) {
    flag('xhr-request', 5);
  }

  // Very simple Accept header (typical of scrapers)
  if (accept === '*/*' || accept === 'application/json') {
    flag('simple-accept', 10);
  }

  // No DNT or cookie-related headers (bots often don't send these)
  if (!normalizedHeaders['cookie'] && !normalizedHeaders['dnt']) {
    flag('no-cookie-or-dnt', 5);
  }

  // Forward headers from proxies/CDNs (could indicate bot)
  if (normalizedHeaders['x-forwarded-for']) {
    flag('has-x-forwarded-for', 5);
  }

  // Cloudflare being present is normal on its own; it only counts when the
  // request carries a country header but no Accept-Language.
  if (normalizedHeaders['cf-ray'] && normalizedHeaders['cf-ipcountry'] && !hasAcceptLanguage) {
    flag('cloudflare-without-language', 10);
  }

  // Cap confidence at 100
  confidence = Math.min(confidence, 100);

  // Threshold for header-based detection
  const isBot = confidence >= 30;

  return {
    isBot,
    confidence,
    signals,
  };
}