mirror of
https://github.com/pacnpal/thrilltrack-explorer.git
synced 2025-12-20 08:31:12 -05:00
feat: Implement comprehensive bot detection
This commit is contained in:
106
api/botDetection/headerAnalysis.ts
Normal file
106
api/botDetection/headerAnalysis.ts
Normal file
@@ -0,0 +1,106 @@
|
||||
/**
|
||||
* Header-based bot detection
|
||||
*/
|
||||
|
||||
/**
 * Outcome of scoring one request's headers for bot-like traits.
 */
export interface HeaderAnalysisResult {
  /** True once `confidence` reaches the header-only threshold of 30. */
  isBot: boolean;
  /** Sum of the weights of every signal that fired, capped at 100. */
  confidence: number; // 0-100
  /** Names of the rules that fired, in evaluation order. */
  signals: string[];
}

/**
 * Analyze request headers for bot indicators.
 *
 * Each rule that fires appends a signal name and adds a fixed weight to the
 * confidence score; the score is capped at 100 and compared against a
 * threshold of 30.
 *
 * @param headers Header map keyed by header name (any casing). Array values
 *   are joined with ", " so every value is inspected, not just the first.
 * @returns The fired signals, the capped score and the threshold verdict.
 *   When the map contains no non-empty header at all there is nothing to
 *   analyze, so a neutral result (score 0, not a bot) is returned instead of
 *   counting every "missing header" rule against the caller.
 */
export function analyzeHeaders(headers: Record<string, string | string[] | undefined>): HeaderAnalysisResult {
  const signals: string[] = [];
  let confidence = 0;
  const flag = (signal: string, weight: number): void => {
    signals.push(signal);
    confidence += weight;
  };

  // Normalize header names to lowercase and flatten multi-valued headers.
  const normalizedHeaders: Record<string, string> = {};
  for (const [key, value] of Object.entries(headers)) {
    const text = Array.isArray(value) ? value.join(', ') : value;
    if (text) {
      normalizedHeaders[key.toLowerCase()] = text;
    }
  }

  // No header information was supplied: absence of data is not evidence.
  if (Object.keys(normalizedHeaders).length === 0) {
    return { isBot: false, confidence: 0, signals };
  }

  // Values below are compared case-insensitively.
  const purpose = normalizedHeaders['x-purpose']?.trim().toLowerCase();
  const accept = normalizedHeaders['accept']?.trim().toLowerCase();
  const requestedWith = normalizedHeaders['x-requested-with']?.trim().toLowerCase();
  const hasAcceptLanguage = Boolean(normalizedHeaders['accept-language']);

  // Check for explicit bot-identifying headers
  if (purpose === 'preview') {
    flag('x-purpose-preview', 40);
  }

  // Check for headless Chrome DevTools Protocol
  if (normalizedHeaders['x-devtools-emulate-network-conditions-client-id']) {
    flag('devtools-protocol', 30);
  }

  // Missing typical browser headers
  if (!hasAcceptLanguage) {
    flag('missing-accept-language', 15);
  }
  if (!normalizedHeaders['accept-encoding']) {
    flag('missing-accept-encoding', 10);
  }

  // Suspicious Accept header (not typical browser)
  if (accept && !accept.includes('text/html') && !accept.includes('*/*')) {
    flag('non-html-accept', 15);
  }

  // Direct access without referer (common for bots); the misspelled
  // "referrer" name is accepted as well.
  if (!normalizedHeaders['referer'] && !normalizedHeaders['referrer']) {
    flag('no-referer', 5);
  }

  // XHR requests might be AJAX but also automation
  if (requestedWith === 'xmlhttprequest') {
    flag('xhr-request', 5);
  }

  // Very simple Accept header (typical of scrapers)
  if (accept === '*/*' || accept === 'application/json') {
    flag('simple-accept', 10);
  }

  // No DNT or cookie-related headers (bots often don't send these)
  if (!normalizedHeaders['cookie'] && !normalizedHeaders['dnt']) {
    flag('no-cookie-or-dnt', 5);
  }

  // Forward headers from proxies/CDNs (could indicate bot)
  if (normalizedHeaders['x-forwarded-for']) {
    flag('has-x-forwarded-for', 5);
  }

  // Cloudflare is present (normal on its own); only the combination of a
  // country header with no Accept-Language adds weight.
  if (normalizedHeaders['cf-ray'] && normalizedHeaders['cf-ipcountry'] && !hasAcceptLanguage) {
    flag('cloudflare-without-language', 10);
  }

  // Cap confidence at 100
  confidence = Math.min(confidence, 100);

  return {
    isBot: confidence >= 30, // Threshold for header-based detection
    confidence,
    signals,
  };
}
|
||||
116
api/botDetection/heuristics.ts
Normal file
116
api/botDetection/heuristics.ts
Normal file
@@ -0,0 +1,116 @@
|
||||
/**
|
||||
* Behavioral heuristics for bot detection
|
||||
*/
|
||||
|
||||
/**
 * Outcome of the user-agent heuristics.
 */
export interface HeuristicResult {
  /** True once `confidence` reaches the heuristic threshold of 40. */
  isBot: boolean;
  /** Sum of the weights of every signal that fired, capped at 100. */
  confidence: number; // 0-100
  /** Names of the rules that fired, in evaluation order. */
  signals: string[];
}

/**
 * Analyze user-agent behavior patterns.
 *
 * Scores the shape of the user-agent string (length, tokens, keywords) plus
 * two header-based rules. Each rule that fires appends a signal name and adds
 * a fixed weight; the total is capped at 100 and compared against 40.
 *
 * @param userAgent Raw User-Agent string of the request.
 * @param headers Header map keyed by header name (any casing). Array values
 *   are joined with ", " before being inspected.
 * @returns The fired signals, the capped score and the threshold verdict.
 */
export function analyzeHeuristics(userAgent: string, headers: Record<string, string | string[] | undefined>): HeuristicResult {
  const signals: string[] = [];
  let confidence = 0;
  const flag = (signal: string, weight: number): void => {
    signals.push(signal);
    confidence += weight;
  };

  // Lowercase once instead of on every comparison.
  const uaLower = userAgent.toLowerCase();

  // Very short user agent (< 20 chars) - likely a bot
  if (userAgent.length < 20) {
    flag('very-short-ua', 25);
  }

  // Very long user agent (> 400 chars) - suspicious
  if (userAgent.length > 400) {
    flag('very-long-ua', 15);
  }

  // Neither "Mozilla" nor "compatible" in user agent (almost all browsers have one)
  if (!userAgent.includes('Mozilla') && !userAgent.includes('compatible')) {
    flag('no-mozilla', 20);
  }

  // Contains an http:// or https:// URL in UA (common in bot UAs)
  if (uaLower.includes('http://') || uaLower.includes('https://')) {
    flag('url-in-ua', 30);
  }

  // Contains email in UA (some bots identify with contact email)
  if (/@|\[at\]|email/i.test(userAgent)) {
    flag('email-in-ua', 25);
  }

  // Common bot indicators in UA; only the first matching keyword counts.
  const botKeywords = ['fetch', 'request', 'client', 'library', 'script', 'api', 'scan', 'check', 'monitor', 'test'];
  const keyword = botKeywords.find((candidate) => uaLower.includes(candidate));
  if (keyword !== undefined) {
    flag(`keyword-${keyword}`, 10);
  }

  // Programming language identifiers; only the first match counts.
  const langIdentifiers = ['python', 'java', 'ruby', 'perl', 'go-http', 'php'];
  const lang = langIdentifiers.find((candidate) => uaLower.includes(candidate));
  if (lang !== undefined) {
    flag(`lang-${lang}`, 15);
  }

  // Version number patterns typical of bots (e.g., "v1.0", "version/2.3").
  // Safari-family browsers legitimately send "Version/17.0 ... Safari/605.1.15",
  // so that token is removed before probing to avoid penalising real browsers.
  const looksLikeWebKitBrowser = userAgent.includes('AppleWebKit/') && userAgent.includes('Safari/');
  const versionProbe = looksLikeWebKitBrowser
    ? userAgent.replace(/\bVersion\/[\d.]+/g, ' ')
    : userAgent;
  if (/\b(v|version)[\/\s]?\d+\.\d+/i.test(versionProbe)) {
    flag('version-pattern', 10);
  }

  // Contains plus (+) sign that is not directly preceded by digits anywhere
  // in the UA (common in bot UAs, e.g. "+https://...").
  if (userAgent.includes('+') && !/\d+\+/.test(userAgent)) {
    flag('plus-sign', 15);
  }

  // No spaces at all in a UA longer than 5 chars - very bot-like
  if (!userAgent.includes(' ') && userAgent.length > 5) {
    flag('no-spaces', 20);
  }

  // Normalize header names to lowercase and flatten multi-valued headers.
  const normalizedHeaders: Record<string, string> = {};
  for (const [key, value] of Object.entries(headers)) {
    const text = Array.isArray(value) ? value.join(', ') : value;
    if (text) {
      normalizedHeaders[key.toLowerCase()] = text;
    }
  }
  const accept = normalizedHeaders['accept'];

  // Missing Accept-Language but has an Accept header (bots often forget this)
  if (!normalizedHeaders['accept-language'] && accept) {
    flag('missing-language-header', 15);
  }

  // Accept: */* combined with a short UA (lazy bot implementation)
  if (accept === '*/*' && userAgent.length < 50) {
    flag('lazy-accept-header', 20);
  }

  // Cap confidence at 100
  confidence = Math.min(confidence, 100);

  return {
    isBot: confidence >= 40, // Threshold for heuristic-based detection
    confidence,
    signals,
  };
}
|
||||
144
api/botDetection/index.ts
Normal file
144
api/botDetection/index.ts
Normal file
@@ -0,0 +1,144 @@
|
||||
/**
|
||||
* Comprehensive bot detection system
|
||||
* Combines user-agent patterns, header analysis, and behavioral heuristics
|
||||
*/
|
||||
|
||||
import { BOT_PATTERNS, GENERIC_BOT_REGEX } from './userAgentPatterns';
|
||||
import { analyzeHeaders } from './headerAnalysis';
|
||||
import { analyzeHeuristics } from './heuristics';
|
||||
|
||||
/**
 * Combined verdict produced by {@link detectBot}.
 */
export interface BotDetectionResult {
  /** True when the combined `score` is at least 50. */
  isBot: boolean;
  /** Bucketed score: 'high' at 80 and above, 'medium' at 60 and above, otherwise 'low'. */
  confidence: 'high' | 'medium' | 'low';
  /** Matched platform name, a synthetic "...-bot" label, or null when not classified as a bot. */
  platform: string | null;
  /** Which analysis stage drove the classification. */
  detectionMethod: 'user-agent' | 'header' | 'heuristic' | 'combination';
  /** Combined score, rounded to an integer. */
  score: number; // 0-100
  /** Raw inputs and per-stage scores, useful for logging. */
  metadata: {
    userAgent: string;
    signals: string[];
    headerScore: number;
    heuristicScore: number;
    uaMatch: boolean;
  };
}

/**
 * Main bot detection function.
 *
 * Runs three stages and merges them into one score:
 *  1. substring match of the lowercased user agent against `BOT_PATTERNS`
 *     (first match wins),
 *  2. header analysis,
 *  3. user-agent heuristics.
 *
 * With a pattern match the pattern's base score is kept as a floor; without
 * one the score is the average of the header and heuristic scores.
 *
 * @param userAgent Raw User-Agent string; a non-string value is treated as ''.
 * @param headers Request headers; defaults to an empty map.
 * @returns The verdict. `platform` is null whenever `isBot` is false, so the
 *   two fields never contradict each other. `detectionMethod` still reports
 *   which stage (if any) tripped its own threshold.
 */
export function detectBot(
  userAgent: string,
  headers: Record<string, string | string[] | undefined> = {}
): BotDetectionResult {
  // Defensive: a request without a User-Agent header must not throw here.
  const ua = typeof userAgent === 'string' ? userAgent : '';
  const userAgentLower = ua.toLowerCase();
  let detectionMethod: BotDetectionResult['detectionMethod'] = 'user-agent';
  let platform: string | null = null;
  let score = 0;
  const signals: string[] = [];

  // 1. User-Agent Pattern Matching (most reliable)
  let uaMatch = false;
  for (const { pattern, platform: platformName, category } of BOT_PATTERNS) {
    if (!userAgentLower.includes(pattern)) {
      continue;
    }

    uaMatch = true;
    platform = platformName;

    if (category === 'social' || category === 'seo' || category === 'preview') {
      // High confidence for explicit matches
      score = 95;
      signals.push(`ua-explicit-${category}`);
    } else if (category === 'generic') {
      // Lower confidence for generic patterns
      score = 60;
      signals.push('ua-generic');
    } else {
      score = 85;
      signals.push(`ua-${category}`);
    }

    break; // First match wins
  }

  // 2. Header Analysis
  const headerAnalysis = analyzeHeaders(headers);
  signals.push(...headerAnalysis.signals.map(s => `header:${s}`));

  // 3. Behavioral Heuristics
  const heuristicAnalysis = analyzeHeuristics(ua, headers);
  signals.push(...heuristicAnalysis.signals.map(s => `heuristic:${s}`));

  // 4. Combine scores with weighted approach
  if (uaMatch) {
    // User-agent match found - other signals can raise the score but never
    // lower it below the pattern's base score.
    const blended = score * 0.7 + headerAnalysis.confidence * 0.2 + heuristicAnalysis.confidence * 0.1;
    score = Math.max(score, blended);

    if (headerAnalysis.isBot || heuristicAnalysis.isBot) {
      detectionMethod = 'combination';
    }
  } else {
    // No user-agent match - rely on header and heuristic analysis
    score = headerAnalysis.confidence * 0.5 + heuristicAnalysis.confidence * 0.5;

    if (headerAnalysis.isBot && heuristicAnalysis.isBot) {
      detectionMethod = 'combination';
      platform = 'unknown-bot';
    } else if (headerAnalysis.isBot) {
      detectionMethod = 'header';
      platform = 'header-detected-bot';
    } else if (heuristicAnalysis.isBot) {
      detectionMethod = 'heuristic';
      platform = 'heuristic-detected-bot';
    }
  }

  // Final bot determination
  const isBot = score >= 50; // 50% confidence threshold

  // A stage can trip its own (lower) threshold while the combined score stays
  // under 50; do not report a bot platform for a request classified as human.
  if (!isBot) {
    platform = null;
  }

  // Determine confidence level
  let confidence: BotDetectionResult['confidence'];
  if (score >= 80) {
    confidence = 'high';
  } else if (score >= 60) {
    confidence = 'medium';
  } else {
    confidence = 'low';
  }

  return {
    isBot,
    confidence,
    platform,
    detectionMethod,
    score: Math.round(score),
    metadata: {
      userAgent: ua,
      signals,
      headerScore: headerAnalysis.confidence,
      heuristicScore: heuristicAnalysis.confidence,
      uaMatch,
    },
  };
}
|
||||
|
||||
/**
|
||||
* Quick bot check for high-traffic scenarios (lightweight)
|
||||
*/
|
||||
/**
 * Lightweight bot check for hot paths: user-agent only, no header analysis.
 *
 * @param userAgent Raw User-Agent string.
 * @returns True when the lowercased UA contains one of the well-known
 *   social/SEO crawler names, or when the raw UA matches `GENERIC_BOT_REGEX`.
 */
export function quickBotCheck(userAgent: string): boolean {
  const lowered = userAgent.toLowerCase();

  // Most common social/SEO bots, tried before the generic regex.
  const knownCrawlers = [
    'facebookexternalhit', 'twitterbot', 'linkedinbot', 'slackbot',
    'discordbot', 'telegrambot', 'whatsapp', 'googlebot', 'bingbot'
  ];
  const isKnownCrawler = knownCrawlers.some((name) => lowered.includes(name));

  // Fall back to the generic regex only when no known crawler matched.
  return isKnownCrawler || GENERIC_BOT_REGEX.test(userAgent);
}
|
||||
130
api/botDetection/userAgentPatterns.ts
Normal file
130
api/botDetection/userAgentPatterns.ts
Normal file
@@ -0,0 +1,130 @@
|
||||
/**
|
||||
* Comprehensive user-agent bot patterns organized by category
|
||||
*/
|
||||
|
||||
/**
 * One user-agent substring rule.
 */
export interface BotPattern {
  /** Lowercase substring looked up in the lowercased user agent. */
  pattern: string;
  /** Platform label reported when the pattern matches. */
  platform: string;
  /** Category of the matching client; consumers derive the base score from it. */
  category: 'social' | 'seo' | 'monitoring' | 'preview' | 'ai' | 'dev' | 'archive' | 'email' | 'generic';
}

/**
 * Ordered pattern table. Consumers take the FIRST entry whose `pattern` is a
 * substring of the lowercased user agent, so order matters: specific names
 * must precede broader ones that they contain, and the generic patterns stay
 * at the very end.
 *
 * NOTE(review): several short substrings here ('line', 'twitter', 'instagram',
 * 'linkedin', 'yahoo', ...) presumably also occur in in-app browser user
 * agents of real users - confirm whether that is intended.
 */
export const BOT_PATTERNS: BotPattern[] = [
  // Social Media Preview Bots (HIGH PRIORITY)
  { pattern: 'facebookexternalhit', platform: 'facebook', category: 'social' },
  { pattern: 'facebot', platform: 'facebook', category: 'social' },
  { pattern: 'facebookcatalog', platform: 'facebook', category: 'social' },
  { pattern: 'twitterbot', platform: 'twitter', category: 'social' },
  { pattern: 'twitter', platform: 'twitter', category: 'social' },
  { pattern: 'linkedinbot', platform: 'linkedin', category: 'social' },
  { pattern: 'linkedin', platform: 'linkedin', category: 'social' },
  { pattern: 'slackbot', platform: 'slack', category: 'social' },
  { pattern: 'slack-imgproxy', platform: 'slack', category: 'social' },
  { pattern: 'telegrambot', platform: 'telegram', category: 'social' },
  { pattern: 'whatsapp', platform: 'whatsapp', category: 'social' },
  { pattern: 'discordbot', platform: 'discord', category: 'social' },
  { pattern: 'discord', platform: 'discord', category: 'social' },
  { pattern: 'pinterestbot', platform: 'pinterest', category: 'social' },
  { pattern: 'pinterest', platform: 'pinterest', category: 'social' },
  { pattern: 'redditbot', platform: 'reddit', category: 'social' },
  { pattern: 'reddit', platform: 'reddit', category: 'social' },
  // Carried over from the table this list replaced in ssrOG.ts; placed ahead
  // of the short 'line' substring so they are not shadowed by it.
  { pattern: 'apple-pcs', platform: 'imessage', category: 'social' },
  { pattern: 'mastodon', platform: 'mastodon', category: 'social' },
  { pattern: 'ms-teams', platform: 'teams', category: 'social' },
  { pattern: 'instagram', platform: 'instagram', category: 'social' },
  { pattern: 'snapchat', platform: 'snapchat', category: 'social' },
  { pattern: 'tiktokbot', platform: 'tiktok', category: 'social' },
  { pattern: 'bytespider', platform: 'tiktok', category: 'social' },
  { pattern: 'tumblr', platform: 'tumblr', category: 'social' },
  { pattern: 'vkshare', platform: 'vk', category: 'social' },
  { pattern: 'line', platform: 'line', category: 'social' },
  { pattern: 'kakaotalk', platform: 'kakaotalk', category: 'social' },
  { pattern: 'wechat', platform: 'wechat', category: 'social' },

  // Search Engine Crawlers
  { pattern: 'googlebot', platform: 'google', category: 'seo' },
  { pattern: 'bingbot', platform: 'bing', category: 'seo' },
  { pattern: 'bingpreview', platform: 'bing', category: 'preview' },
  { pattern: 'slurp', platform: 'yahoo', category: 'seo' },
  { pattern: 'duckduckbot', platform: 'duckduckgo', category: 'seo' },
  { pattern: 'baiduspider', platform: 'baidu', category: 'seo' },
  { pattern: 'yandexbot', platform: 'yandex', category: 'seo' },

  // SEO & Analytics Crawlers
  { pattern: 'ahrefsbot', platform: 'ahrefs', category: 'seo' },
  { pattern: 'ahrefs', platform: 'ahrefs', category: 'seo' },
  { pattern: 'semrushbot', platform: 'semrush', category: 'seo' },
  { pattern: 'dotbot', platform: 'moz', category: 'seo' },
  { pattern: 'rogerbot', platform: 'moz', category: 'seo' },
  { pattern: 'screaming frog', platform: 'screaming-frog', category: 'seo' },
  { pattern: 'majestic', platform: 'majestic', category: 'seo' },
  // Majestic's crawler token is "MJ12bot" (no letter "l").
  { pattern: 'mj12bot', platform: 'majestic', category: 'seo' },
  { pattern: 'similarweb', platform: 'similarweb', category: 'seo' },
  { pattern: 'dataforseo', platform: 'dataforseo', category: 'seo' },

  // Monitoring & Uptime Services
  { pattern: 'pingdom', platform: 'pingdom', category: 'monitoring' },
  { pattern: 'statuscake', platform: 'statuscake', category: 'monitoring' },
  { pattern: 'uptimerobot', platform: 'uptimerobot', category: 'monitoring' },
  { pattern: 'newrelic', platform: 'newrelic', category: 'monitoring' },
  { pattern: 'datadog', platform: 'datadog', category: 'monitoring' },

  // Preview & Unfurling Services
  { pattern: 'embedly', platform: 'embedly', category: 'preview' },
  { pattern: 'nuzzel', platform: 'nuzzel', category: 'preview' },
  { pattern: 'qwantify', platform: 'qwantify', category: 'preview' },
  { pattern: 'skypeuripreview', platform: 'skype', category: 'preview' },
  { pattern: 'outbrain', platform: 'outbrain', category: 'preview' },
  { pattern: 'flipboard', platform: 'flipboard', category: 'preview' },

  // AI & LLM Crawlers
  { pattern: 'gptbot', platform: 'openai', category: 'ai' },
  { pattern: 'chatgpt', platform: 'openai', category: 'ai' },
  { pattern: 'claudebot', platform: 'anthropic', category: 'ai' },
  { pattern: 'anthropic-ai', platform: 'anthropic', category: 'ai' },
  { pattern: 'google-extended', platform: 'google-bard', category: 'ai' },
  { pattern: 'cohere-ai', platform: 'cohere', category: 'ai' },
  { pattern: 'perplexitybot', platform: 'perplexity', category: 'ai' },
  { pattern: 'ccbot', platform: 'commoncrawl', category: 'ai' },

  // Development & Testing Tools
  { pattern: 'postman', platform: 'postman', category: 'dev' },
  { pattern: 'insomnia', platform: 'insomnia', category: 'dev' },
  { pattern: 'httpie', platform: 'httpie', category: 'dev' },
  { pattern: 'curl', platform: 'curl', category: 'dev' },
  { pattern: 'wget', platform: 'wget', category: 'dev' },
  { pattern: 'apache-httpclient', platform: 'apache', category: 'dev' },
  { pattern: 'python-requests', platform: 'python', category: 'dev' },
  { pattern: 'node-fetch', platform: 'nodejs', category: 'dev' },
  { pattern: 'axios', platform: 'axios', category: 'dev' },

  // Headless Browsers & Automation
  { pattern: 'headless', platform: 'headless-browser', category: 'dev' },
  { pattern: 'chrome-lighthouse', platform: 'lighthouse', category: 'dev' },
  { pattern: 'puppeteer', platform: 'puppeteer', category: 'dev' },
  { pattern: 'playwright', platform: 'playwright', category: 'dev' },
  { pattern: 'selenium', platform: 'selenium', category: 'dev' },
  { pattern: 'phantomjs', platform: 'phantomjs', category: 'dev' },

  // Vercel & Deployment Platforms
  // 'vercel-screenshot' contains 'vercel', so it must come first to be reachable.
  { pattern: 'vercel-screenshot', platform: 'vercel', category: 'preview' },
  { pattern: 'vercel', platform: 'vercel', category: 'preview' },
  { pattern: 'prerender', platform: 'prerender', category: 'preview' },
  { pattern: 'netlify', platform: 'netlify', category: 'preview' },

  // Archive & Research
  { pattern: 'ia_archiver', platform: 'internet-archive', category: 'archive' },
  { pattern: 'archive.org_bot', platform: 'internet-archive', category: 'archive' },

  // Email Clients (for link previews)
  { pattern: 'outlook', platform: 'outlook', category: 'email' },
  { pattern: 'googleimageproxy', platform: 'gmail', category: 'email' },
  { pattern: 'apple mail', platform: 'apple-mail', category: 'email' },
  { pattern: 'yahoo', platform: 'yahoo-mail', category: 'email' },

  // Generic patterns (LOWEST PRIORITY - check last)
  { pattern: 'bot', platform: 'generic-bot', category: 'generic' },
  { pattern: 'crawler', platform: 'generic-crawler', category: 'generic' },
  { pattern: 'spider', platform: 'generic-spider', category: 'generic' },
  { pattern: 'scraper', platform: 'generic-scraper', category: 'generic' },
];
|
||||
|
||||
/**
|
||||
* Regex patterns for faster generic matching
|
||||
*/
|
||||
// Case-insensitive, non-global (so `.test` keeps no state between calls):
// matches any user agent containing one of the listed substrings.
export const GENERIC_BOT_REGEX = new RegExp('(bot|crawler|spider|scraper|curl|wget|http|fetch)', 'i');
|
||||
95
api/ssrOG.ts
95
api/ssrOG.ts
@@ -14,68 +14,7 @@ type VercelResponse = ServerResponse & {
|
||||
send: (body: string) => VercelResponse;
|
||||
};
|
||||
|
||||
// Bot detection configuration
|
||||
// Lowercase user-agent substring -> platform label. Entries are tried in
// declaration order and the first hit wins, so the generic patterns stay last.
const SOCIAL_BOTS = {
  'facebookexternalhit': 'facebook',
  'facebot': 'facebook',
  'facebookcatalog': 'facebook',
  'twitterbot': 'twitter',
  'x-bot': 'twitter',
  'linkedinbot': 'linkedin',
  'discordbot': 'discord',
  'slackbot': 'slack',
  'slack-imgproxy': 'slack',
  'whatsapp': 'whatsapp',
  'telegrambot': 'telegram',
  'pinterestbot': 'pinterest',
  'redditbot': 'reddit',
  'apple-pcs': 'imessage',
  'mastodon': 'mastodon',
  'ms-teams': 'teams',
  'googlebot': 'google',
  'bingbot': 'bing',
  'slurp': 'yahoo',
  'duckduckbot': 'duckduckgo',
  'baiduspider': 'baidu',
  'yandexbot': 'yandex',
  // Headless browsers & crawlers
  'headless': 'headless-browser',
  'chrome-lighthouse': 'lighthouse',
  'puppeteer': 'puppeteer',
  'playwright': 'playwright',
  'selenium': 'selenium',
  'phantomjs': 'phantomjs',
  // Vercel & deployment platforms
  'vercel': 'vercel',
  'vercel-screenshot': 'vercel',
  'prerender': 'prerender',
  // Generic crawler patterns
  'bot': 'generic-bot',
  'crawler': 'generic-crawler',
  'spider': 'generic-spider',
  'scraper': 'generic-scraper'
};

/** Verdict of the user-agent-only bot check. */
interface BotDetection {
  isBot: boolean;
  platform: string | null;
}

/**
 * Classify a request by user agent alone.
 *
 * @param userAgent Raw User-Agent string; an empty value is never a bot.
 * @returns The platform of the first `SOCIAL_BOTS` key contained in the
 *   lowercased user agent, or a non-bot verdict when none matches.
 */
function detectBot(userAgent: string): BotDetection {
  const notABot: BotDetection = { isBot: false, platform: null };
  if (!userAgent) {
    return notABot;
  }

  const lowered = userAgent.toLowerCase();
  const hit = Object.entries(SOCIAL_BOTS).find(([needle]) => lowered.includes(needle));

  return hit ? { isBot: true, platform: hit[1] } : notABot;
}
|
||||
import { detectBot } from './botDetection/index';
|
||||
|
||||
interface PageData {
|
||||
title: string;
|
||||
@@ -245,24 +184,25 @@ export default async function handler(req: VercelRequest, res: VercelResponse) {
|
||||
const fullUrl = `https://${req.headers.host}${req.url}`;
|
||||
const pathname = new URL(fullUrl).pathname;
|
||||
|
||||
// Bot detection
|
||||
const botDetection = detectBot(userAgent);
|
||||
// Comprehensive bot detection with headers
|
||||
const botDetection = detectBot(userAgent, req.headers as Record<string, string | string[] | undefined>);
|
||||
|
||||
// Enhanced logging
|
||||
// Enhanced logging with detection details
|
||||
if (botDetection.isBot) {
|
||||
console.log(`[SSR-OG] ✅ Bot detected: ${botDetection.platform} | ${req.method} ${pathname}`);
|
||||
console.log(`[SSR-OG] Full UA: ${userAgent}`);
|
||||
console.log(`[SSR-OG] ✅ Bot detected: ${botDetection.platform || 'unknown'} | Confidence: ${botDetection.confidence} (${botDetection.score}%) | Method: ${botDetection.detectionMethod}`);
|
||||
console.log(`[SSR-OG] Path: ${req.method} ${pathname}`);
|
||||
console.log(`[SSR-OG] UA: ${userAgent}`);
|
||||
if (botDetection.metadata.signals.length > 0) {
|
||||
console.log(`[SSR-OG] Signals: ${botDetection.metadata.signals.slice(0, 5).join(', ')}${botDetection.metadata.signals.length > 5 ? '...' : ''}`);
|
||||
}
|
||||
} else {
|
||||
// Log undetected potential bots for debugging
|
||||
const looksLikeBot = !userAgent.includes('Mozilla') ||
|
||||
userAgent.includes('http') ||
|
||||
userAgent.length < 50;
|
||||
|
||||
if (looksLikeBot) {
|
||||
console.warn(`[SSR-OG] ⚠️ Possible undetected bot | ${req.method} ${pathname}`);
|
||||
console.warn(`[SSR-OG] Full UA: ${userAgent}`);
|
||||
// Log potential false negatives
|
||||
if (botDetection.score > 30) {
|
||||
console.warn(`[SSR-OG] ⚠️ Low confidence bot (${botDetection.score}%) - not serving SSR | ${req.method} ${pathname}`);
|
||||
console.warn(`[SSR-OG] UA: ${userAgent}`);
|
||||
console.warn(`[SSR-OG] Signals: ${botDetection.metadata.signals.join(', ')}`);
|
||||
} else {
|
||||
console.log(`[SSR-OG] Regular user | ${req.method} ${pathname} | UA: ${userAgent.substring(0, 60)}...`);
|
||||
console.log(`[SSR-OG] Regular user (score: ${botDetection.score}%) | ${req.method} ${pathname}`);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -280,6 +220,9 @@ export default async function handler(req: VercelRequest, res: VercelResponse) {
|
||||
html = injectOGTags(html, ogTags);
|
||||
|
||||
res.setHeader('X-Bot-Platform', botDetection.platform || 'unknown');
|
||||
res.setHeader('X-Bot-Confidence', botDetection.confidence);
|
||||
res.setHeader('X-Bot-Score', botDetection.score.toString());
|
||||
res.setHeader('X-Bot-Method', botDetection.detectionMethod);
|
||||
res.setHeader('X-SSR-Modified', 'true');
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user