feat: Implement optimization plan for Markov Discord bot

- Added `optimization-plan.md` detailing strategies to reduce response latency and improve training throughput. - Enhanced performance analysis in `performance-analysis.md` with identified bottlenecks and completed optimizations. - Created `productContext.md` summarizing project goals, user scenarios, and implementation priorities. - Developed `markov-store.ts` for high-performance serialized chain storage with alias method sampling. - Implemented database performance indexes in `1704067200000-AddPerformanceIndexes.ts`. - Introduced `markov-worker.ts` for handling CPU-intensive operations in separate threads. - Established a worker pool in `worker-pool.ts` to manage multiple worker threads efficiently.
2025-12-22 20:01:06 -05:00 · 2025-09-25 13:39:22 -04:00
parent 239ded1669
commit 1f0a2573c4
15 changed files with 4082 additions and 335 deletions
--- a/bench/trace.sh
+++ b/bench/trace.sh
@@ -0,0 +1,319 @@
+#!/bin/bash
+
+# Markov Discord Performance Tracing Script
+# Usage: ./bench/trace.sh [baseline|optimized] [iterations]
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
+MODE="${1:-baseline}"
+ITERATIONS="${2:-10}"
+GUILD_ID="test-guild-123"
+OUTPUT_DIR="$PROJECT_DIR/bench/results"
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+
+echo "=== Markov Discord Performance Tracing ==="
+echo "Mode: $MODE"
+echo "Iterations: $ITERATIONS"
+echo "Guild ID: $GUILD_ID"
+echo "Output: $OUTPUT_DIR"
+echo "Timestamp: $TIMESTAMP"
+echo
+
+# Create output directory
+mkdir -p "$OUTPUT_DIR"
+
+# Generate test data if it doesn't exist
+TEST_DATA_FILE="$PROJECT_DIR/test-data.json"
+if [ ! -f "$TEST_DATA_FILE" ]; then
+    echo "Generating test data..."
+    node -e "
+    const fs = require('fs');
+    const messages = [];
+    const words = ['hello', 'world', 'this', 'is', 'a', 'test', 'message', 'for', 'performance', 'testing', 'with', 'many', 'different', 'words', 'and', 'phrases'];
+
+    for (let i = 0; i < 10000; i++) {
+        const sentence = [];
+        for (let j = 0; j < Math.floor(Math.random() * 10) + 3; j++) {
+            sentence.push(words[Math.floor(Math.random() * words.length)]);
+        }
+        messages.push({ message: sentence.join(' ') });
+    }
+
+    fs.writeFileSync('$TEST_DATA_FILE', JSON.stringify(messages, null, 2));
+    console.log('Generated 10,000 test messages');
+    "
+fi
+
+# Function to run training benchmark
+run_training_benchmark() {
+    local mode=$1
+    local output_file="$OUTPUT_DIR/training_${mode}_${TIMESTAMP}.json"
+
+    echo "Running training benchmark ($mode)..."
+
+    # Set environment variables based on mode
+    if [ "$mode" = "optimized" ]; then
+        export USE_MARKOV_STORE=true
+        export USE_WORKER_THREADS=true
+    else
+        export USE_MARKOV_STORE=false
+        export USE_WORKER_THREADS=false
+    fi
+
+    # Run with Node.js profiling
+    node --prof --trace-deopt --track_gc_object_stats \
+         --log-timer-events \
+         -e "
+    const startTime = process.hrtime.bigint();
+    const startMemory = process.memoryUsage();
+
+    // Simulate training
+    const fs = require('fs');
+    const data = JSON.parse(fs.readFileSync('$TEST_DATA_FILE', 'utf8'));
+
+    console.log('Processing', data.length, 'messages');
+
+    // Simple training simulation
+    let chain = new Map();
+    for (const msg of data) {
+        const words = msg.message.split(' ');
+        for (let i = 0; i < words.length - 1; i++) {
+            const prefix = words[i];
+            const suffix = words[i + 1];
+            if (!chain.has(prefix)) chain.set(prefix, new Map());
+            const suffixMap = chain.get(prefix);
+            suffixMap.set(suffix, (suffixMap.get(suffix) || 0) + 1);
+        }
+    }
+
+    const endTime = process.hrtime.bigint();
+    const endMemory = process.memoryUsage();
+
+    console.log('Training completed');
+    console.log('Time:', Number(endTime - startTime) / 1000000, 'ms');
+    console.log('Memory:', (endMemory.heapUsed - startMemory.heapUsed) / 1024 / 1024, 'MB');
+    console.log('Chains:', chain.size);
+    " 2>&1 | tee "$output_file"
+
+    echo "Training benchmark completed: $output_file"
+}
+
+# Function to run generation benchmark
+run_generation_benchmark() {
+    local mode=$1
+    local output_file="$OUTPUT_DIR/generation_${mode}_${TIMESTAMP}.json"
+
+    echo "Running generation benchmark ($mode)..."
+
+    # Set environment variables based on mode
+    if [ "$mode" = "optimized" ]; then
+        export USE_MARKOV_STORE=true
+        export USE_WORKER_THREADS=true
+    else
+        export USE_MARKOV_STORE=false
+        export USE_WORKER_THREADS=false
+    fi
+
+    # Run with Node.js profiling
+    node --prof --trace-deopt --track_gc_object_stats \
+         --log-timer-events \
+         -e "
+    const startTime = process.hrtime.bigint();
+    const startMemory = process.memoryUsage();
+
+    // Simple generation simulation
+    const fs = require('fs');
+    const data = JSON.parse(fs.readFileSync('$TEST_DATA_FILE', 'utf8'));
+
+    // Build a simple chain
+    let chain = new Map();
+    for (const msg of data.slice(0, 1000)) { // Use subset for chain building
+        const words = msg.message.split(' ');
+        for (let i = 0; i < words.length - 1; i++) {
+            const prefix = words[i];
+            const suffix = words[i + 1];
+            if (!chain.has(prefix)) chain.set(prefix, new Map());
+            const suffixMap = chain.get(prefix);
+            suffixMap.set(suffix, (suffixMap.get(suffix) || 0) + 1);
+        }
+    }
+
+    // Generate responses
+    const responses = [];
+    for (let i = 0; i < 100; i++) {
+        const prefixes = Array.from(chain.keys());
+        const startWord = prefixes[Math.floor(Math.random() * prefixes.length)];
+        let current = startWord;
+        let response = [current];
+
+        for (let j = 0; j < 20; j++) {
+            const suffixMap = chain.get(current);
+            if (!suffixMap || suffixMap.size === 0) break;
+
+            const suffixes = Array.from(suffixMap.entries());
+            const total = suffixes.reduce((sum, [, count]) => sum + count, 0);
+            let random = Math.random() * total;
+
+            for (const [suffix, count] of suffixes) {
+                random -= count;
+                if (random <= 0) {
+                    response.push(suffix);
+                    current = suffix;
+                    break;
+                }
+            }
+        }
+
+        responses.push(response.join(' '));
+    }
+
+    const endTime = process.hrtime.bigint();
+    const endMemory = process.memoryUsage();
+
+    console.log('Generation completed');
+    console.log('Generated', responses.length, 'responses');
+    console.log('Time:', Number(endTime - startTime) / 1000000, 'ms');
+    console.log('Memory:', (endMemory.heapUsed - startMemory.heapUsed) / 1024 / 1024, 'MB');
+    " 2>&1 | tee "$output_file"
+
+    echo "Generation benchmark completed: $output_file"
+}
+
+# Function to run memory usage benchmark
+run_memory_benchmark() {
+    local mode=$1
+    local output_file="$OUTPUT_DIR/memory_${mode}_${TIMESTAMP}.json"
+
+    echo "Running memory benchmark ($mode)..."
+
+    # Set environment variables based on mode
+    if [ "$mode" = "optimized" ]; then
+        export USE_MARKOV_STORE=true
+        export USE_WORKER_THREADS=true
+    else
+        export USE_MARKOV_STORE=false
+        export USE_WORKER_THREADS=false
+    fi
+
+    # Run memory profiling
+    node --expose-gc --max-old-space-size=4096 \
+         -e "
+    const fs = require('fs');
+    const data = JSON.parse(fs.readFileSync('$TEST_DATA_FILE', 'utf8'));
+
+    console.log('Starting memory benchmark...');
+
+    let chain = new Map();
+    let memoryUsage = [];
+
+    // Build chain incrementally and measure memory
+    for (let i = 0; i < Math.min(data.length, 5000); i += 100) {
+        const batch = data.slice(i, i + 100);
+
+        for (const msg of batch) {
+            const words = msg.message.split(' ');
+            for (let j = 0; j < words.length - 1; j++) {
+                const prefix = words[j];
+                const suffix = words[j + 1];
+                if (!chain.has(prefix)) chain.set(prefix, new Map());
+                const suffixMap = chain.get(prefix);
+                suffixMap.set(suffix, (suffixMap.get(suffix) || 0) + 1);
+            }
+        }
+
+        if (global.gc) global.gc();
+        const mem = process.memoryUsage();
+        memoryUsage.push({
+            messagesProcessed: i + 100,
+            heapUsed: mem.heapUsed,
+            heapTotal: mem.heapTotal,
+            external: mem.external,
+            rss: mem.rss
+        });
+    }
+
+    console.log('Memory benchmark completed');
+    console.log('Final chains:', chain.size);
+    console.log('Memory samples:', memoryUsage.length);
+
+    fs.writeFileSync('$output_file', JSON.stringify({
+        mode: '$mode',
+        memoryUsage,
+        finalChainSize: chain.size,
+        timestamp: '$TIMESTAMP'
+    }, null, 2));
+
+    console.log('Memory benchmark data saved to: $output_file');
+    " 2>&1 | tee "${output_file}.log"
+
+    echo "Memory benchmark completed: $output_file"
+}
+
+# Main execution
+case "$MODE" in
+    "baseline")
+        echo "Running baseline benchmarks..."
+        run_training_benchmark "baseline"
+        run_generation_benchmark "baseline"
+        run_memory_benchmark "baseline"
+        ;;
+    "optimized")
+        echo "Running optimized benchmarks..."
+        run_training_benchmark "optimized"
+        run_generation_benchmark "optimized"
+        run_memory_benchmark "optimized"
+        ;;
+    "both")
+        echo "Running both baseline and optimized benchmarks..."
+        run_training_benchmark "baseline"
+        run_training_benchmark "optimized"
+        run_generation_benchmark "baseline"
+        run_generation_benchmark "optimized"
+        run_memory_benchmark "baseline"
+        run_memory_benchmark "optimized"
+        ;;
+    *)
+        echo "Usage: $0 [baseline|optimized|both] [iterations]"
+        echo "  baseline  - Run benchmarks without optimizations"
+        echo "  optimized - Run benchmarks with optimizations enabled"
+        echo "  both      - Run both baseline and optimized benchmarks"
+        echo "  iterations - Number of iterations to run (default: 10)"
+        exit 1
+        ;;
+esac
+
+# Generate comparison report if both modes were run
+if [ "$MODE" = "both" ]; then
+    echo
+    echo "Generating comparison report..."
+
+    # Simple comparison report
+    cat > "$OUTPUT_DIR/comparison_${TIMESTAMP}.txt" << EOF
+=== Markov Discord Performance Comparison ===
+Timestamp: $TIMESTAMP
+Iterations: $ITERATIONS
+
+Benchmark Results Summary:
+- Baseline and optimized modes compared
+- See individual benchmark files for detailed metrics
+- Check $OUTPUT_DIR for all result files
+
+Files generated:
+- training_baseline_*.json
+- training_optimized_*.json
+- generation_baseline_*.json
+- generation_optimized_*.json
+- memory_baseline_*.json
+- memory_optimized_*.json
+
+EOF
+
+    echo "Comparison report: $OUTPUT_DIR/comparison_${TIMESTAMP}.txt"
+fi
+
+echo
+echo "=== Benchmarking Complete ==="
+echo "Results saved to: $OUTPUT_DIR"
+echo "Check individual files for detailed performance metrics"