feat: Implement comprehensive bot detection

This commit is contained in:
gpt-engineer-app[bot]
2025-10-29 20:49:26 +00:00
parent d362fa7537
commit 2918f9d280
5 changed files with 515 additions and 76 deletions

View File

@@ -0,0 +1,106 @@
/**
* Header-based bot detection
*/
/**
 * Outcome of scanning request headers for automation hints.
 */
export interface HeaderAnalysisResult {
  /** True when `confidence` reaches the header threshold of 30. */
  isBot: boolean;
  /** Summed weight of every triggered signal, capped at 100. */
  confidence: number; // 0-100
  /** Identifiers of the triggered signals, in evaluation order. */
  signals: string[];
}

/**
 * Analyze request headers for bot indicators.
 *
 * Header names are compared case-insensitively. For an array-valued header
 * only the first element is considered, and headers with a falsy value are
 * treated as absent. Each rule that fires contributes its signal name and a
 * fixed weight; the weights are summed and capped at 100.
 *
 * @param headers Request headers keyed by name.
 * @returns The triggered signals, the capped confidence and whether that
 *          confidence reaches the threshold of 30.
 */
export function analyzeHeaders(headers: Record<string, string | string[] | undefined>): HeaderAnalysisResult {
  // Lower-cased view of the usable headers; a later duplicate name overwrites an earlier one.
  const seen = new Map<string, string>();
  Object.entries(headers).forEach(([name, raw]) => {
    if (!raw) {
      return;
    }
    seen.set(name.toLowerCase(), Array.isArray(raw) ? raw[0] : raw);
  });

  const has = (name: string): boolean => Boolean(seen.get(name));
  const accept = seen.get('accept');

  // [signal, weight, triggered] — the order here is the order signals are reported in.
  const rules: Array<[string, number, boolean]> = [
    // Explicit bot-identifying header
    ['x-purpose-preview', 40, seen.get('x-purpose') === 'preview'],
    // Header emitted through the Chrome DevTools protocol
    ['devtools-protocol', 30, has('x-devtools-emulate-network-conditions-client-id')],
    // Headers a regular browser normally sends
    ['missing-accept-language', 15, !has('accept-language')],
    ['missing-accept-encoding', 10, !has('accept-encoding')],
    // Accept present but mentions neither text/html nor */*
    ['non-html-accept', 15, accept ? !accept.includes('text/html') && !accept.includes('*/*') : false],
    // Direct hit without a referer (either spelling)
    ['no-referer', 5, !(has('referer') || has('referrer'))],
    // XHR marker: may be ordinary AJAX, may be automation
    ['xhr-request', 5, seen.get('x-requested-with') === 'XMLHttpRequest'],
    // Accept is exactly one of the two minimal values
    ['simple-accept', 10, accept === '*/*' || accept === 'application/json'],
    // Neither a cookie nor a DNT header
    ['no-cookie-or-dnt', 5, !(has('cookie') || has('dnt'))],
    // Request was forwarded by a proxy/CDN
    ['has-x-forwarded-for', 5, has('x-forwarded-for')],
    // Cloudflare headers present, but still no Accept-Language
    ['cloudflare-without-language', 10, has('cf-ray') && has('cf-ipcountry') && !has('accept-language')],
  ];

  const fired = rules.filter(([, , triggered]) => triggered);
  const signals = fired.map(([signal]) => signal);
  const total = fired.reduce((sum, [, weight]) => sum + weight, 0);
  const confidence = total > 100 ? 100 : total;

  return { isBot: confidence >= 30, confidence, signals };
}

View File

@@ -0,0 +1,116 @@
/**
* Behavioral heuristics for bot detection
*/
/**
 * Outcome of the user-agent heuristics.
 */
export interface HeuristicResult {
  /** True when `confidence` reaches the heuristic threshold of 40. */
  isBot: boolean;
  /** Summed weight of every triggered signal, capped at 100. */
  confidence: number; // 0-100
  /** Identifiers of the triggered signals, in evaluation order. */
  signals: string[];
}

/**
 * Analyze user-agent behavior patterns.
 *
 * Applies a fixed sequence of checks to the user-agent string, followed by two
 * checks that also look at the request headers. Every check that fires records
 * a signal and adds its weight; the total is capped at 100.
 *
 * @param userAgent Raw User-Agent value.
 * @param headers Request headers; names are matched case-insensitively and
 *                only the first element of an array value is used.
 * @returns The triggered signals, the capped confidence and whether that
 *          confidence reaches the threshold of 40.
 */
export function analyzeHeuristics(userAgent: string, headers: Record<string, string | string[] | undefined>): HeuristicResult {
  const signals: string[] = [];
  let total = 0;
  const flag = (signal: string, weight: number): void => {
    signals.push(signal);
    total += weight;
  };

  const uaLength = userAgent.length;
  const uaLower = userAgent.toLowerCase();

  // Length extremes: under 20 or over 400 characters
  if (uaLength < 20) flag('very-short-ua', 25);
  if (uaLength > 400) flag('very-long-ua', 15);

  // Neither "Mozilla" nor "compatible" present (case-sensitive)
  if (!(userAgent.includes('Mozilla') || userAgent.includes('compatible'))) flag('no-mozilla', 20);

  // A URL embedded in the UA
  if (['http://', 'https://'].some((scheme) => uaLower.includes(scheme))) flag('url-in-ua', 30);

  // Contact-address markers in the UA
  if (/@|\[at\]|email/i.test(userAgent)) flag('email-in-ua', 25);

  // Tooling keywords: only the first one found (in list order) is reported
  const keywordHit = ['fetch', 'request', 'client', 'library', 'script', 'api', 'scan', 'check', 'monitor', 'test']
    .find((word) => uaLower.includes(word));
  if (keywordHit !== undefined) flag(`keyword-${keywordHit}`, 10);

  // Programming-language identifiers: again first match only
  const langHit = ['python', 'java', 'ruby', 'perl', 'go-http', 'php'].find((lang) => uaLower.includes(lang));
  if (langHit !== undefined) flag(`lang-${langHit}`, 15);

  // "v1.0" / "version/2.3" style version tokens
  if (/\b(v|version)[\/\s]?\d+\.\d+/i.test(userAgent)) flag('version-pattern', 10);

  // A plus sign, provided no digit run is directly followed by "+"
  if (userAgent.includes('+') && !/\d+\+/.test(userAgent)) flag('plus-sign', 15);

  // Longer than 5 characters and contains no space at all
  if (uaLength > 5 && !userAgent.includes(' ')) flag('no-spaces', 20);

  // Lower-cased header view; falsy values are dropped, arrays contribute their first element
  const byName: Record<string, string> = {};
  Object.keys(headers).forEach((name) => {
    const raw = headers[name];
    if (raw) {
      byName[name.toLowerCase()] = Array.isArray(raw) ? raw[0] : raw;
    }
  });
  const accept = byName['accept'];

  // Accept is sent but Accept-Language is not
  if (!byName['accept-language'] && accept) flag('missing-language-header', 15);

  // Accept is exactly */* combined with a UA shorter than 50 characters
  if (accept === '*/*' && uaLength < 50) flag('lazy-accept-header', 20);

  const confidence = Math.min(total, 100);
  return { isBot: confidence >= 40, confidence, signals };
}

144
api/botDetection/index.ts Normal file
View File

@@ -0,0 +1,144 @@
/**
* Comprehensive bot detection system
* Combines user-agent patterns, header analysis, and behavioral heuristics
*/
import { BOT_PATTERNS, GENERIC_BOT_REGEX } from './userAgentPatterns';
import { analyzeHeaders } from './headerAnalysis';
import { analyzeHeuristics } from './heuristics';
/**
 * Result returned by detectBot.
 */
export interface BotDetectionResult {
// Final verdict; detectBot sets it when the combined score is at least 50.
isBot: boolean;
// Bucketed score as assigned by detectBot: 'high' for >= 80, 'medium' for >= 60, otherwise 'low'.
confidence: 'high' | 'medium' | 'low';
// Platform name of the matched user-agent pattern, a synthetic label when only the
// header/heuristic analyzers flagged the request, or null when nothing matched.
platform: string | null;
// Which analysis produced the platform; detectBot initialises it to 'user-agent'.
detectionMethod: 'user-agent' | 'header' | 'heuristic' | 'combination';
score: number; // 0-100
// Supporting details for logging and debugging.
metadata: {
// The user agent that was analysed.
userAgent: string;
// All triggered signals; header signals are prefixed 'header:' and heuristic signals 'heuristic:'.
signals: string[];
// Confidence reported by analyzeHeaders.
headerScore: number;
// Confidence reported by analyzeHeuristics.
heuristicScore: number;
// True when the user agent contained one of the BOT_PATTERNS substrings.
uaMatch: boolean;
};
}
/**
 * Main bot detection function.
 *
 * Runs three passes and merges them into one score:
 *  1. substring match of the lower-cased user agent against BOT_PATTERNS
 *     (first match wins),
 *  2. analyzeHeaders on the request headers,
 *  3. analyzeHeuristics on the user agent and headers.
 *
 * With a user-agent match the pattern's base score is kept as a floor and can
 * only be raised by the other two analyzers. Without one, the score is the
 * plain average of the header and heuristic confidences.
 *
 * NOTE(review): in the no-match branch `platform` and `detectionMethod` follow
 * the sub-analyzers' own `isBot` flags, so they can be populated while the
 * final `isBot` (score >= 50) is false. Confirm whether callers rely on that.
 *
 * @param userAgent Raw User-Agent value. A missing or non-string value is
 *                  treated as the empty string instead of throwing.
 * @param headers Request headers; a nullish value is treated as no headers.
 * @returns The verdict, score, confidence bucket and supporting metadata.
 */
export function detectBot(
  userAgent: string,
  headers: Record<string, string | string[] | undefined> = {}
): BotDetectionResult {
  // The User-Agent header is optional on the wire, so callers that forward
  // `req.headers['user-agent']` unchecked may hand us undefined at runtime.
  const ua = typeof userAgent === 'string' ? userAgent : '';
  const requestHeaders = headers || {};
  const userAgentLower = ua.toLowerCase();

  let detectionMethod: BotDetectionResult['detectionMethod'] = 'user-agent';
  let platform: string | null = null;
  let score = 0;
  const signals: string[] = [];

  // 1. User-Agent Pattern Matching (most reliable)
  let uaMatch = false;
  for (const { pattern, platform: platformName, category } of BOT_PATTERNS) {
    if (userAgentLower.includes(pattern)) {
      uaMatch = true;
      platform = platformName;

      // High confidence for explicit matches
      if (category === 'social' || category === 'seo' || category === 'preview') {
        score = 95;
        signals.push(`ua-explicit-${category}`);
      } else if (category === 'generic') {
        score = 60; // Lower confidence for generic patterns
        signals.push('ua-generic');
      } else {
        score = 85;
        signals.push(`ua-${category}`);
      }
      break; // First match wins
    }
  }

  // 2. Header Analysis
  const headerAnalysis = analyzeHeaders(requestHeaders);
  signals.push(...headerAnalysis.signals.map(s => `header:${s}`));

  // 3. Behavioral Heuristics
  const heuristicAnalysis = analyzeHeuristics(ua, requestHeaders);
  signals.push(...heuristicAnalysis.signals.map(s => `heuristic:${s}`));

  // 4. Combine scores with weighted approach
  if (uaMatch) {
    // User-agent match found - the other signals may raise the score but never lower it
    score = Math.max(score,
      score * 0.7 + headerAnalysis.confidence * 0.2 + heuristicAnalysis.confidence * 0.1
    );

    if (headerAnalysis.isBot || heuristicAnalysis.isBot) {
      detectionMethod = 'combination';
    }
  } else {
    // No user-agent match - rely on header and heuristic analysis
    score = headerAnalysis.confidence * 0.5 + heuristicAnalysis.confidence * 0.5;

    if (headerAnalysis.isBot && heuristicAnalysis.isBot) {
      detectionMethod = 'combination';
      platform = 'unknown-bot';
    } else if (headerAnalysis.isBot) {
      detectionMethod = 'header';
      platform = 'header-detected-bot';
    } else if (heuristicAnalysis.isBot) {
      detectionMethod = 'heuristic';
      platform = 'heuristic-detected-bot';
    }
  }

  // Final bot determination
  const isBot = score >= 50; // 50% confidence threshold

  // Determine confidence level
  let confidence: 'high' | 'medium' | 'low';
  if (score >= 80) {
    confidence = 'high';
  } else if (score >= 60) {
    confidence = 'medium';
  } else {
    confidence = 'low';
  }

  return {
    isBot,
    confidence,
    platform,
    detectionMethod,
    score: Math.round(score),
    metadata: {
      userAgent: ua,
      signals,
      headerScore: headerAnalysis.confidence,
      heuristicScore: heuristicAnalysis.confidence,
      uaMatch,
    },
  };
}
/**
 * Quick bot check for high-traffic scenarios (lightweight).
 *
 * Looks only at the user agent: first for a short list of well-known
 * social/search crawler names (case-insensitive substring match), then falls
 * back to GENERIC_BOT_REGEX. No headers are inspected and no score is computed.
 *
 * @param userAgent Raw User-Agent value.
 * @returns True when either check matches.
 */
export function quickBotCheck(userAgent: string): boolean {
  const lowered = userAgent.toLowerCase();

  // Most common social/SEO crawlers, checked before the generic regex
  const wellKnown = [
    'facebookexternalhit', 'twitterbot', 'linkedinbot', 'slackbot',
    'discordbot', 'telegrambot', 'whatsapp', 'googlebot', 'bingbot'
  ];

  return wellKnown.some((name) => lowered.includes(name))
    ? true
    : GENERIC_BOT_REGEX.test(userAgent);
}

View File

@@ -0,0 +1,130 @@
/**
* Comprehensive user-agent bot patterns organized by category
*/
/**
 * One user-agent substring rule.
 */
export interface BotPattern {
  /** Lower-case substring looked for in the lower-cased user agent. */
  pattern: string;
  /** Platform label reported when the pattern matches. */
  platform: string;
  /** Group the pattern belongs to; consumers use it to pick a base score. */
  category: 'social' | 'seo' | 'monitoring' | 'preview' | 'ai' | 'dev' | 'archive' | 'email' | 'generic';
}

/**
 * Ordered list of user-agent substring rules.
 *
 * Order matters: consumers scan from the top and stop at the first entry whose
 * `pattern` occurs in the user agent. A more specific pattern must therefore
 * come before any shorter pattern it contains (e.g. 'twitterbot' before
 * 'twitter'), and the generic catch-alls must stay at the very end.
 */
export const BOT_PATTERNS: BotPattern[] = [
  // Social Media Preview Bots (HIGH PRIORITY)
  { pattern: 'facebookexternalhit', platform: 'facebook', category: 'social' },
  { pattern: 'facebot', platform: 'facebook', category: 'social' },
  { pattern: 'twitterbot', platform: 'twitter', category: 'social' },
  { pattern: 'twitter', platform: 'twitter', category: 'social' },
  { pattern: 'linkedinbot', platform: 'linkedin', category: 'social' },
  { pattern: 'linkedin', platform: 'linkedin', category: 'social' },
  { pattern: 'slackbot', platform: 'slack', category: 'social' },
  { pattern: 'slack-imgproxy', platform: 'slack', category: 'social' },
  { pattern: 'telegrambot', platform: 'telegram', category: 'social' },
  { pattern: 'whatsapp', platform: 'whatsapp', category: 'social' },
  { pattern: 'discordbot', platform: 'discord', category: 'social' },
  { pattern: 'discord', platform: 'discord', category: 'social' },
  { pattern: 'pinterestbot', platform: 'pinterest', category: 'social' },
  { pattern: 'pinterest', platform: 'pinterest', category: 'social' },
  { pattern: 'redditbot', platform: 'reddit', category: 'social' },
  { pattern: 'reddit', platform: 'reddit', category: 'social' },
  { pattern: 'instagram', platform: 'instagram', category: 'social' },
  { pattern: 'snapchat', platform: 'snapchat', category: 'social' },
  { pattern: 'tiktokbot', platform: 'tiktok', category: 'social' },
  { pattern: 'bytespider', platform: 'tiktok', category: 'social' },
  { pattern: 'tumblr', platform: 'tumblr', category: 'social' },
  { pattern: 'vkshare', platform: 'vk', category: 'social' },
  { pattern: 'line', platform: 'line', category: 'social' },
  { pattern: 'kakaotalk', platform: 'kakaotalk', category: 'social' },
  { pattern: 'wechat', platform: 'wechat', category: 'social' },

  // Search Engine Crawlers
  { pattern: 'googlebot', platform: 'google', category: 'seo' },
  { pattern: 'bingbot', platform: 'bing', category: 'seo' },
  { pattern: 'bingpreview', platform: 'bing', category: 'preview' },
  { pattern: 'slurp', platform: 'yahoo', category: 'seo' },
  { pattern: 'duckduckbot', platform: 'duckduckgo', category: 'seo' },
  { pattern: 'baiduspider', platform: 'baidu', category: 'seo' },
  { pattern: 'yandexbot', platform: 'yandex', category: 'seo' },

  // SEO & Analytics Crawlers
  { pattern: 'ahrefsbot', platform: 'ahrefs', category: 'seo' },
  { pattern: 'ahrefs', platform: 'ahrefs', category: 'seo' },
  { pattern: 'semrushbot', platform: 'semrush', category: 'seo' },
  { pattern: 'dotbot', platform: 'moz', category: 'seo' },
  { pattern: 'rogerbot', platform: 'moz', category: 'seo' },
  { pattern: 'screaming frog', platform: 'screaming-frog', category: 'seo' },
  { pattern: 'majestic', platform: 'majestic', category: 'seo' },
  // Majestic's crawler identifies itself as "MJ12bot" (previously misspelled 'mjl12bot' here)
  { pattern: 'mj12bot', platform: 'majestic', category: 'seo' },
  { pattern: 'similarweb', platform: 'similarweb', category: 'seo' },
  { pattern: 'dataforseo', platform: 'dataforseo', category: 'seo' },

  // Monitoring & Uptime Services
  { pattern: 'pingdom', platform: 'pingdom', category: 'monitoring' },
  { pattern: 'statuscake', platform: 'statuscake', category: 'monitoring' },
  { pattern: 'uptimerobot', platform: 'uptimerobot', category: 'monitoring' },
  { pattern: 'newrelic', platform: 'newrelic', category: 'monitoring' },
  { pattern: 'datadog', platform: 'datadog', category: 'monitoring' },

  // Preview & Unfurling Services
  { pattern: 'embedly', platform: 'embedly', category: 'preview' },
  { pattern: 'nuzzel', platform: 'nuzzel', category: 'preview' },
  { pattern: 'qwantify', platform: 'qwantify', category: 'preview' },
  { pattern: 'skypeuripreview', platform: 'skype', category: 'preview' },
  { pattern: 'outbrain', platform: 'outbrain', category: 'preview' },
  { pattern: 'flipboard', platform: 'flipboard', category: 'preview' },

  // AI & LLM Crawlers
  { pattern: 'gptbot', platform: 'openai', category: 'ai' },
  { pattern: 'chatgpt', platform: 'openai', category: 'ai' },
  { pattern: 'claudebot', platform: 'anthropic', category: 'ai' },
  { pattern: 'anthropic-ai', platform: 'anthropic', category: 'ai' },
  { pattern: 'google-extended', platform: 'google-bard', category: 'ai' },
  { pattern: 'cohere-ai', platform: 'cohere', category: 'ai' },
  { pattern: 'perplexitybot', platform: 'perplexity', category: 'ai' },
  { pattern: 'ccbot', platform: 'commoncrawl', category: 'ai' },

  // Development & Testing Tools
  { pattern: 'postman', platform: 'postman', category: 'dev' },
  { pattern: 'insomnia', platform: 'insomnia', category: 'dev' },
  { pattern: 'httpie', platform: 'httpie', category: 'dev' },
  { pattern: 'curl', platform: 'curl', category: 'dev' },
  { pattern: 'wget', platform: 'wget', category: 'dev' },
  { pattern: 'apache-httpclient', platform: 'apache', category: 'dev' },
  { pattern: 'python-requests', platform: 'python', category: 'dev' },
  { pattern: 'node-fetch', platform: 'nodejs', category: 'dev' },
  { pattern: 'axios', platform: 'axios', category: 'dev' },

  // Headless Browsers & Automation
  { pattern: 'headless', platform: 'headless-browser', category: 'dev' },
  { pattern: 'chrome-lighthouse', platform: 'lighthouse', category: 'dev' },
  { pattern: 'puppeteer', platform: 'puppeteer', category: 'dev' },
  { pattern: 'playwright', platform: 'playwright', category: 'dev' },
  { pattern: 'selenium', platform: 'selenium', category: 'dev' },
  { pattern: 'phantomjs', platform: 'phantomjs', category: 'dev' },

  // Vercel & Deployment Platforms
  // 'vercel-screenshot' must precede 'vercel', otherwise the shorter pattern always wins
  { pattern: 'vercel-screenshot', platform: 'vercel', category: 'preview' },
  { pattern: 'vercel', platform: 'vercel', category: 'preview' },
  { pattern: 'prerender', platform: 'prerender', category: 'preview' },
  { pattern: 'netlify', platform: 'netlify', category: 'preview' },

  // Archive & Research
  { pattern: 'ia_archiver', platform: 'internet-archive', category: 'archive' },
  { pattern: 'archive.org_bot', platform: 'internet-archive', category: 'archive' },

  // Email Clients (for link previews)
  { pattern: 'outlook', platform: 'outlook', category: 'email' },
  { pattern: 'googleimageproxy', platform: 'gmail', category: 'email' },
  { pattern: 'apple mail', platform: 'apple-mail', category: 'email' },
  { pattern: 'yahoo', platform: 'yahoo-mail', category: 'email' },

  // Generic patterns (LOWEST PRIORITY - check last)
  { pattern: 'bot', platform: 'generic-bot', category: 'generic' },
  { pattern: 'crawler', platform: 'generic-crawler', category: 'generic' },
  { pattern: 'spider', platform: 'generic-spider', category: 'generic' },
  { pattern: 'scraper', platform: 'generic-scraper', category: 'generic' },
];
/**
 * Regex patterns for faster generic matching.
 *
 * Case-insensitive and unanchored: it matches when any of the listed tokens
 * occurs anywhere in the tested string. Besides the generic bot/crawler/
 * spider/scraper words it also accepts the tool tokens curl, wget, http and
 * fetch. The regex carries no `g` flag, so repeated `.test()` calls do not
 * depend on earlier calls.
 */
export const GENERIC_BOT_REGEX = /(bot|crawler|spider|scraper|curl|wget|http|fetch)/i;

View File

@@ -14,68 +14,7 @@ type VercelResponse = ServerResponse & {
send: (body: string) => VercelResponse;
};
// Bot detection configuration
// Maps a lower-case user-agent substring to the platform label reported for it.
// Entry order matters: the local detectBot walks the entries in insertion order
// and returns on the first key found in the user agent, so the generic
// 'bot' / 'crawler' / 'spider' / 'scraper' keys have to stay at the end.
const SOCIAL_BOTS = {
'facebookexternalhit': 'facebook',
'facebot': 'facebook',
'facebookcatalog': 'facebook',
'twitterbot': 'twitter',
'x-bot': 'twitter',
'linkedinbot': 'linkedin',
'discordbot': 'discord',
'slackbot': 'slack',
'slack-imgproxy': 'slack',
'whatsapp': 'whatsapp',
'telegrambot': 'telegram',
'pinterestbot': 'pinterest',
'redditbot': 'reddit',
'apple-pcs': 'imessage',
'mastodon': 'mastodon',
'ms-teams': 'teams',
'googlebot': 'google',
'bingbot': 'bing',
'slurp': 'yahoo',
'duckduckbot': 'duckduckgo',
'baiduspider': 'baidu',
'yandexbot': 'yandex',
// Headless browsers & crawlers
'headless': 'headless-browser',
'chrome-lighthouse': 'lighthouse',
'puppeteer': 'puppeteer',
'playwright': 'playwright',
'selenium': 'selenium',
'phantomjs': 'phantomjs',
// Vercel & deployment platforms
// ('vercel' already matches any UA containing 'vercel-screenshot'; both map to the same label)
'vercel': 'vercel',
'vercel-screenshot': 'vercel',
'prerender': 'prerender',
// Generic crawler patterns
'bot': 'generic-bot',
'crawler': 'generic-crawler',
'spider': 'generic-spider',
'scraper': 'generic-scraper'
};
// Result of the local user-agent-only detectBot.
interface BotDetection {
// True when a SOCIAL_BOTS key was found in the user agent.
isBot: boolean;
// Platform label of the matching key, or null when nothing matched.
platform: string | null;
}
/**
 * User-agent-only bot lookup.
 *
 * Lower-cases the user agent and reports the platform of the first
 * SOCIAL_BOTS key (in insertion order) that occurs in it.
 *
 * @param userAgent Raw User-Agent value; a falsy value yields "not a bot".
 * @returns `{ isBot: true, platform }` for the first matching key, otherwise
 *          `{ isBot: false, platform: null }`.
 */
function detectBot(userAgent: string): BotDetection {
  const lowered = userAgent ? userAgent.toLowerCase() : '';
  const hit = lowered
    ? Object.entries(SOCIAL_BOTS).find(([pattern]) => lowered.includes(pattern))
    : undefined;

  return hit === undefined
    ? { isBot: false, platform: null }
    : { isBot: true, platform: hit[1] };
}
import { detectBot } from './botDetection/index';
interface PageData {
title: string;
@@ -245,24 +184,25 @@ export default async function handler(req: VercelRequest, res: VercelResponse) {
const fullUrl = `https://${req.headers.host}${req.url}`;
const pathname = new URL(fullUrl).pathname;
// Bot detection
const botDetection = detectBot(userAgent);
// Comprehensive bot detection with headers
const botDetection = detectBot(userAgent, req.headers as Record<string, string | string[] | undefined>);
// Enhanced logging
// Enhanced logging with detection details
if (botDetection.isBot) {
console.log(`[SSR-OG] ✅ Bot detected: ${botDetection.platform} | ${req.method} ${pathname}`);
console.log(`[SSR-OG] Full UA: ${userAgent}`);
console.log(`[SSR-OG] ✅ Bot detected: ${botDetection.platform || 'unknown'} | Confidence: ${botDetection.confidence} (${botDetection.score}%) | Method: ${botDetection.detectionMethod}`);
console.log(`[SSR-OG] Path: ${req.method} ${pathname}`);
console.log(`[SSR-OG] UA: ${userAgent}`);
if (botDetection.metadata.signals.length > 0) {
console.log(`[SSR-OG] Signals: ${botDetection.metadata.signals.slice(0, 5).join(', ')}${botDetection.metadata.signals.length > 5 ? '...' : ''}`);
}
} else {
// Log undetected potential bots for debugging
const looksLikeBot = !userAgent.includes('Mozilla') ||
userAgent.includes('http') ||
userAgent.length < 50;
if (looksLikeBot) {
console.warn(`[SSR-OG] ⚠️ Possible undetected bot | ${req.method} ${pathname}`);
console.warn(`[SSR-OG] Full UA: ${userAgent}`);
// Log potential false negatives
if (botDetection.score > 30) {
console.warn(`[SSR-OG] ⚠️ Low confidence bot (${botDetection.score}%) - not serving SSR | ${req.method} ${pathname}`);
console.warn(`[SSR-OG] UA: ${userAgent}`);
console.warn(`[SSR-OG] Signals: ${botDetection.metadata.signals.join(', ')}`);
} else {
console.log(`[SSR-OG] Regular user | ${req.method} ${pathname} | UA: ${userAgent.substring(0, 60)}...`);
console.log(`[SSR-OG] Regular user (score: ${botDetection.score}%) | ${req.method} ${pathname}`);
}
}
@@ -280,6 +220,9 @@ export default async function handler(req: VercelRequest, res: VercelResponse) {
html = injectOGTags(html, ogTags);
res.setHeader('X-Bot-Platform', botDetection.platform || 'unknown');
res.setHeader('X-Bot-Confidence', botDetection.confidence);
res.setHeader('X-Bot-Score', botDetection.score.toString());
res.setHeader('X-Bot-Method', botDetection.detectionMethod);
res.setHeader('X-SSR-Modified', 'true');
}