thrillwiki_django_no_react/shared/scripts/vm/deploy-automation.sh

#!/usr/bin/env bash
#
# ThrillWiki Deployment Automation Service Script
# Comprehensive automated deployment management with preset integration
#
# Features:
# - Cross-shell compatible (bash/zsh)
# - Deployment preset integration
# - Health monitoring and recovery
# - Smart deployment coordination
# - Systemd service integration
# - GitHub authentication management
# - Server lifecycle management
#

# errexit: abort the script when a command fails outside of a tested context
# (conditions of if/while and the non-final parts of && / || lists are exempt).
set -e

# =============================================================================
# SCRIPT CONFIGURATION
# =============================================================================

# Locate this script in a way that works across shells: bash exposes the path
# through BASH_SOURCE, zsh through the %x prompt expansion, and any other
# shell falls back to $0.
if [ -n "${BASH_SOURCE:-}" ]; then
    _deploy_script_path="${BASH_SOURCE[0]}"
elif [ -n "${ZSH_NAME:-}" ]; then
    _deploy_script_path="${(%):-%x}"
else
    _deploy_script_path="$0"
fi

# Absolute directory holding the script, and the script's file name.
SCRIPT_DIR="$(cd "$(dirname "$_deploy_script_path")" && pwd)"
SCRIPT_NAME="$(basename "$_deploy_script_path")"
unset _deploy_script_path

# The project root is taken to be two directory levels above the script.
PROJECT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Default configuration (can be overridden by environment)
DEPLOYMENT_PRESET="${DEPLOYMENT_PRESET:-dev}"

# Preset-controlled settings: keep whatever the environment supplied, but do
# NOT force a default here. apply_preset_configuration fills these in with
# `${VAR:-preset_value}`, which can only take effect while the variable is
# still empty; defaulting them at this point made every preset value
# unreachable. Readers that run before a preset is applied (deploy_debug)
# already fall back to false/INFO on their own.
PULL_INTERVAL="${PULL_INTERVAL:-}"
HEALTH_CHECK_INTERVAL="${HEALTH_CHECK_INTERVAL:-}"
DEBUG_MODE="${DEBUG_MODE:-}"
LOG_LEVEL="${LOG_LEVEL:-}"

# Recovery tuning (not set by any preset)
MAX_RESTART_ATTEMPTS="${MAX_RESTART_ATTEMPTS:-3}"
RESTART_COOLDOWN="${RESTART_COOLDOWN:-300}"

# Logging configuration
LOG_DIR="${LOG_DIR:-$PROJECT_DIR/logs}"
LOG_FILE="${LOG_FILE:-$LOG_DIR/deployment-automation.log}"
LOCK_FILE="${LOCK_FILE:-/tmp/thrillwiki-deployment.lock}"

# =============================================================================
# COLOR DEFINITIONS
# =============================================================================
# ANSI colour sequences kept as literal backslash strings (single-quoted);
# they only become control codes when printed by a command that interprets
# escapes, such as `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m' # No Color

# =============================================================================
# LOGGING FUNCTIONS
# =============================================================================

# Write one log record to the log file and to the console or service output.
# Globals:   LOG_FILE (read), NC (read), SYSTEMD_EXEC_PID (read)
# Arguments: $1 - level label, e.g. INFO
#            $2 - colour escape sequence for the console prefix
#            $3 - message text (always printed verbatim)
deploy_log() {
    local level="$1"
    local color="$2"
    local message="$3"
    # Declared separately so the assignment does not mask date's exit status.
    local timestamp
    timestamp="$(date '+%Y-%m-%d %H:%M:%S')"

    # Ensure log directory exists
    mkdir -p "$(dirname "$LOG_FILE")"

    # Log to file (without colors)
    printf '[%s] [%s] [DEPLOY-AUTO] %s\n' "$timestamp" "$level" "$message" >> "$LOG_FILE"

    if [ -n "${SYSTEMD_EXEC_PID:-}" ]; then
        # SYSTEMD_EXEC_PID is set: emit the bare message on stdout, with no
        # timestamp or colour.
        printf '%s\n' "$message"
    elif [ -t 1 ]; then
        # Interactive terminal: only the colour codes go through %b, so
        # backslashes inside the message are never interpreted as escapes.
        printf '%b[%s] [DEPLOY-AUTO-%s]%b %s\n' "$color" "$timestamp" "$level" "$NC" "$message"
    fi
}

# Log an informational message, using the blue colour code.
deploy_info() {
    local text="$1"

    deploy_log "INFO" "$BLUE" "$text"
}

# Log a success message, using the green colour code and a check mark.
deploy_success() {
    local text="$1"

    deploy_log "SUCCESS" "$GREEN" "✅ $text"
}

# Log a warning message, using the yellow colour code and a warning sign.
deploy_warning() {
    local text="$1"

    deploy_log "WARNING" "$YELLOW" "⚠️  $text"
}

# Log an error message, using the red colour code and a cross mark.
deploy_error() {
    local text="$1"

    deploy_log "ERROR" "$RED" "❌ $text"
}

# Log a debug message, but only when DEBUG_MODE is "true" or LOG_LEVEL is
# "DEBUG"; otherwise do nothing and succeed.
deploy_debug() {
    if [ "${DEBUG_MODE:-false}" != "true" ] && [ "${LOG_LEVEL:-INFO}" != "DEBUG" ]; then
        return 0
    fi

    deploy_log "DEBUG" "$PURPLE" "🔍 $1"
}

# Log a progress message, using the cyan colour code and a rocket.
deploy_progress() {
    local text="$1"

    deploy_log "PROGRESS" "$CYAN" "🚀 $text"
}

# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

# Report whether a command name resolves to something runnable.
# Arguments: $1 - command name to look up
# Returns:   0 when `command -v` finds it, 1 otherwise; prints nothing
command_exists() {
    local tool="$1"

    if command -v "$tool" >/dev/null 2>&1; then
        return 0
    fi
    return 1
}

# Lock file management
#
# Take the single-instance lock by writing this shell's PID into LOCK_FILE.
# Globals: LOCK_FILE (read)
# Returns: 0 when the lock file now holds our PID,
#          1 when a live process holds it or the file cannot be created
acquire_lock() {
    local lock_pid=""

    # Create the file with noclobber set, so the redirection fails when the
    # file already exists instead of overwriting it. The subshell keeps the
    # option from leaking; $$ is still the PID of the main shell there.
    if ( set -o noclobber; echo $$ > "$LOCK_FILE" ) 2>/dev/null; then
        deploy_debug "Lock acquired (PID: $$)"
        return 0
    fi

    # Creation failed and nothing exists: the location is not writable.
    if [ ! -f "$LOCK_FILE" ]; then
        deploy_warning "Cannot create lock file: $LOCK_FILE"
        return 1
    fi

    lock_pid=$(cat "$LOCK_FILE" 2>/dev/null || echo "")

    # Only a plain decimal PID is ever handed to kill; anything else
    # (empty, garbage, negative numbers) is treated as a stale lock.
    case "$lock_pid" in
        '' | *[!0-9]*) lock_pid="" ;;
    esac

    if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then
        deploy_warning "Another deployment automation instance is already running (PID: $lock_pid)"
        return 1
    fi

    deploy_info "Removing stale lock file"
    rm -f "$LOCK_FILE"

    # Second and last try; it fails if another instance re-created the file
    # in the meantime.
    if ( set -o noclobber; echo $$ > "$LOCK_FILE" ) 2>/dev/null; then
        deploy_debug "Lock acquired (PID: $$)"
        return 0
    fi

    deploy_warning "Could not create lock file: $LOCK_FILE"
    return 1
}

# Delete the lock file if one is present; a missing file is not an error.
# Globals: LOCK_FILE (read)
release_lock() {
    # Nothing to clean up.
    [ -f "$LOCK_FILE" ] || return 0

    rm -f "$LOCK_FILE"
    deploy_debug "Lock released"
}

# Shutdown handler: announce the stop, drop the lock file, and terminate
# the script with a success status.
cleanup_and_exit() {
    deploy_info "Deployment automation service stopping"

    release_lock

    exit 0
}

# =============================================================================
# PRESET CONFIGURATION FUNCTIONS
# =============================================================================

# Apply deployment preset configuration
#
# Picks a set of default values based on DEPLOYMENT_PRESET and assigns each
# one only where the corresponding variable is still unset or empty, so
# values that are already set always win.
# Globals: DEPLOYMENT_PRESET (read); PULL_INTERVAL, HEALTH_CHECK_INTERVAL,
#          DEBUG_MODE, LOG_LEVEL, AUTO_MIGRATE, AUTO_UPDATE_DEPENDENCIES
#          (written when empty)
# NOTE(review): the AUTO_* values are plain shell variables and are not
# exported here - confirm whether child scripts are expected to see them.
apply_preset_configuration() {
    local preset="${DEPLOYMENT_PRESET:-dev}"
    local pull_default health_default debug_default level_default
    local migrate_default="true"
    local update_deps_default="true"

    deploy_info "Applying deployment preset: $preset"

    case "$preset" in
        prod)
            pull_default=300
            health_default=60
            debug_default=false
            level_default=WARNING
            update_deps_default=false
            ;;
        demo)
            pull_default=120
            health_default=45
            debug_default=false
            level_default=INFO
            ;;
        testing)
            pull_default=180
            health_default=30
            debug_default=true
            level_default=DEBUG
            ;;
        *)
            # "dev" and every unrecognised name share the development values,
            # including the AUTO_* defaults (the unknown-preset branch used to
            # leave those two unset).
            if [ "$preset" != "dev" ]; then
                deploy_warning "Unknown preset '$preset', using development defaults"
            fi
            pull_default=60
            health_default=30
            debug_default=true
            level_default=DEBUG
            ;;
    esac

    PULL_INTERVAL="${PULL_INTERVAL:-$pull_default}"
    HEALTH_CHECK_INTERVAL="${HEALTH_CHECK_INTERVAL:-$health_default}"
    DEBUG_MODE="${DEBUG_MODE:-$debug_default}"
    LOG_LEVEL="${LOG_LEVEL:-$level_default}"
    AUTO_MIGRATE="${AUTO_MIGRATE:-$migrate_default}"
    AUTO_UPDATE_DEPENDENCIES="${AUTO_UPDATE_DEPENDENCIES:-$update_deps_default}"

    deploy_success "Preset configuration applied successfully"
    deploy_debug "Configuration: interval=${PULL_INTERVAL}s, health=${HEALTH_CHECK_INTERVAL}s, debug=$DEBUG_MODE"
}

# =============================================================================
# HEALTH CHECK FUNCTIONS
# =============================================================================

# Verify that the smart deployment tooling is usable.
# Globals: PROJECT_DIR (read)
# Returns: 1 when the smart-deploy script is missing or not executable, or
#          when systemctl exists and reports the timer as not active;
#          0 otherwise (including when systemctl is unavailable)
check_smart_deployment_health() {
    local deploy_script="$PROJECT_DIR/scripts/smart-deploy.sh"

    deploy_debug "Checking smart deployment service health"

    # The script itself has to be present and executable.
    if ! [ -x "$deploy_script" ]; then
        deploy_warning "Smart deployment script not found or not executable: $deploy_script"
        return 1
    fi

    # No systemctl means there is no timer to inspect.
    if ! command_exists systemctl; then
        return 0
    fi

    if ! systemctl is-active --quiet thrillwiki-smart-deploy.timer 2>/dev/null; then
        deploy_warning "Smart deployment timer is not active"
        return 1
    fi

    deploy_debug "Smart deployment timer is active"
    return 0
}

# Check if development server is healthy
#
# Requests HEALTH_CHECK_URL with curl and treats the server as healthy when
# the request completes and the response is not an HTTP 5xx.
# Globals: HEALTH_CHECK_URL (read, default http://localhost:8000/),
#          HEALTH_CHECK_TIMEOUT (read, default 30 seconds)
# Returns: 0 when healthy; 1 when curl is missing, the request fails or
#          times out, or the server answers with a 5xx status
check_development_server_health() {
    deploy_debug "Checking development server health"

    local health_url="${HEALTH_CHECK_URL:-http://localhost:8000/}"
    local timeout="${HEALTH_CHECK_TIMEOUT:-30}"
    local http_status=""

    if ! command_exists curl; then
        deploy_warning "curl not available for health checks"
        return 1
    fi

    # --max-time bounds the whole request: --connect-timeout alone lets a
    # server that accepts the connection but never answers block the caller.
    # -w '%{http_code}' exposes the status code, because plain `curl -s`
    # exits 0 even for a 500 response.
    if ! http_status=$(curl -s -o /dev/null -w '%{http_code}' \
            --connect-timeout "$timeout" --max-time "$timeout" \
            "$health_url" 2>/dev/null); then
        deploy_warning "Development server health check failed"
        return 1
    fi

    case "$http_status" in
        5[0-9][0-9])
            deploy_debug "Health endpoint answered with HTTP $http_status"
            deploy_warning "Development server health check failed"
            return 1
            ;;
    esac

    deploy_debug "Development server health check passed"
    return 0
}

# Check GitHub authentication
#
# Looks for a token first in the token file, then in GITHUB_TOKEN, and
# verifies it by requesting https://api.github.com/user.
# Globals: GITHUB_TOKEN_FILE (read, default $PROJECT_DIR/.github-pat),
#          GITHUB_TOKEN (read), GITHUB_CHECK_TIMEOUT (read, default 30 seconds)
# Returns: 0 when the API response contains a "login" field; 1 when no token
#          is found, curl is unavailable, or verification fails
check_github_authentication() {
    deploy_debug "Checking GitHub authentication"

    local token_file="${GITHUB_TOKEN_FILE:-$PROJECT_DIR/.github-pat}"
    local timeout="${GITHUB_CHECK_TIMEOUT:-30}"
    local github_token=""
    local response=""

    # Try to get token from file (newlines and carriage returns stripped).
    # 2>/dev/null comes first so an unreadable file stays quiet.
    if [ -f "$token_file" ]; then
        github_token=$(tr -d '\n\r' 2>/dev/null < "$token_file") || github_token=""
    fi

    # Try environment variable
    if [ -z "$github_token" ] && [ -n "${GITHUB_TOKEN:-}" ]; then
        github_token="$GITHUB_TOKEN"
    fi

    if [ -z "$github_token" ]; then
        deploy_warning "No GitHub token found"
        return 1
    fi

    if ! command_exists curl; then
        deploy_warning "Cannot verify GitHub authentication - curl not available"
        return 1
    fi

    # Test GitHub API access. The Authorization header is fed to curl as a
    # config line on stdin (-K -), so the token never appears in curl's
    # argument list where process listings could expose it; printf is a shell
    # builtin and spawns no process of its own. A failed request yields an
    # empty response instead of aborting under errexit.
    response=$(printf 'header = "Authorization: token %s"\n' "$github_token" |
        curl -s --connect-timeout "$timeout" --max-time "$timeout" \
            -K - https://api.github.com/user 2>/dev/null) || response=""

    if printf '%s' "$response" | grep -q '"login"'; then
        deploy_debug "GitHub authentication verified"
        return 0
    fi

    deploy_warning "GitHub authentication failed"
    return 1
}

# Comprehensive system health check
#
# Runs the smart deployment, development server and GitHub authentication
# checks; every check runs even when an earlier one fails.
# Returns: 0 when all checks pass, 1 when at least one fails
perform_health_check() {
    deploy_debug "Performing comprehensive health check"

    local health_issues=0
    local check

    for check in \
        check_smart_deployment_health \
        check_development_server_health \
        check_github_authentication; do
        if ! "$check"; then
            # Plain arithmetic assignment: `((health_issues++))` returns a
            # non-zero status when the old value is 0, which aborts the
            # script whenever errexit is active for this function.
            health_issues=$((health_issues + 1))
        fi
    done

    if [ "$health_issues" -eq 0 ]; then
        deploy_success "All health checks passed"
        return 0
    fi

    deploy_warning "Health check found $health_issues issue(s)"
    return 1
}

# =============================================================================
# RECOVERY FUNCTIONS
# =============================================================================

# Restart the thrillwiki-smart-deploy.timer unit through systemctl.
# Returns: 0 when the restart command succeeds; 1 when systemctl is not
#          available or the restart fails
restart_smart_deployment() {
    deploy_info "Restarting smart deployment timer"

    # Without systemctl there is nothing this function can do.
    if ! command_exists systemctl; then
        deploy_warning "systemctl not available - cannot restart smart deployment"
        return 1
    fi

    if ! systemctl restart thrillwiki-smart-deploy.timer 2>/dev/null; then
        deploy_error "Failed to restart smart deployment timer"
        return 1
    fi

    deploy_success "Smart deployment timer restarted"
    return 0
}

# Restart development server through smart deployment
#
# Runs "$PROJECT_DIR/scripts/smart-deploy.sh restart-server", forwards each
# line of its combined stdout/stderr to the debug log, and reports the
# script's own exit status.
# Globals: PROJECT_DIR (read)
# Returns: 0 when the script exits successfully; 1 when it is missing, not
#          executable, or exits with a non-zero status
restart_development_server() {
    deploy_info "Restarting development server"

    local smart_deploy_script="$PROJECT_DIR/scripts/smart-deploy.sh"
    local output=""
    local status=0
    local line

    if [ ! -x "$smart_deploy_script" ]; then
        deploy_warning "Smart deployment script not available"
        return 1
    fi

    # Capture the output first so the script's exit status is what gets
    # tested. Piping straight into the logging loop made the pipeline's
    # status that of the loop, so a failing script was reported as a
    # successful restart.
    output=$("$smart_deploy_script" restart-server 2>&1) || status=$?

    if [ -n "$output" ]; then
        printf '%s\n' "$output" | while IFS= read -r line; do
            deploy_debug "Smart deploy: $line"
        done
    fi

    if [ "$status" -ne 0 ]; then
        deploy_error "Failed to restart development server"
        return 1
    fi

    deploy_success "Development server restart initiated"
    return 0
}

# Run one recovery round: restart the smart deployment timer, then the
# development server, then re-run the health check, pausing after each
# restart. The round stops at the first step that fails.
# Arguments: $1 - number of this attempt (used in log messages)
#            $2 - maximum number of attempts (used in log messages)
# Returns:   0 when every step succeeds and the health check passes; 1 otherwise
attempt_recovery() {
    local attempt="$1"
    local max_attempts="$2"

    deploy_info "Attempting recovery (attempt $attempt/$max_attempts)"

    if ! restart_smart_deployment; then
        deploy_warning "Recovery attempt $attempt failed"
        return 1
    fi
    sleep 30  # Wait for service to stabilize

    if ! restart_development_server; then
        deploy_warning "Recovery attempt $attempt failed"
        return 1
    fi
    sleep 60  # Wait for server to start

    if ! perform_health_check; then
        deploy_warning "Recovery attempt $attempt failed"
        return 1
    fi

    deploy_success "Recovery successful"
    return 0
}

# =============================================================================
# MAIN AUTOMATION LOOP
# =============================================================================

# Main deployment automation service
#
# Endless monitoring loop: runs a health check every HEALTH_CHECK_INTERVAL
# seconds and, after three consecutive failures, starts a recovery round of
# up to MAX_RESTART_ATTEMPTS attempts. A new round is only started once
# RESTART_COOLDOWN seconds have passed since the previous round ended.
# Globals: DEPLOYMENT_PRESET, PULL_INTERVAL, HEALTH_CHECK_INTERVAL,
#          RESTART_COOLDOWN, MAX_RESTART_ATTEMPTS (all read)
# Returns: never returns on its own
run_deployment_automation() {
    deploy_info "Starting deployment automation service"
    deploy_info "Preset: $DEPLOYMENT_PRESET, Pull interval: ${PULL_INTERVAL}s, Health check: ${HEALTH_CHECK_INTERVAL}s"

    local consecutive_failures=0
    local last_recovery_attempt=0
    local current_time
    local recovery_attempt
    local recovered

    while true; do
        # Perform health check
        if perform_health_check; then
            consecutive_failures=0
            deploy_debug "System healthy - continuing monitoring"
        else
            # Plain arithmetic assignment: `((consecutive_failures++))`
            # returns status 1 when the old value is 0, which under `set -e`
            # terminated the whole service on the first failed health check.
            consecutive_failures=$((consecutive_failures + 1))
            deploy_warning "Health check failed (consecutive failures: $consecutive_failures)"

            # Attempt recovery if we have consecutive failures
            if [ "$consecutive_failures" -ge 3 ]; then
                current_time=$(date +%s)

                # Check if enough time has passed since last recovery attempt
                if [ $((current_time - last_recovery_attempt)) -ge "$RESTART_COOLDOWN" ]; then
                    deploy_info "Too many consecutive failures, attempting recovery"

                    recovered="false"
                    recovery_attempt=1
                    while [ "$recovery_attempt" -le "$MAX_RESTART_ATTEMPTS" ]; do
                        if attempt_recovery "$recovery_attempt" "$MAX_RESTART_ATTEMPTS"; then
                            recovered="true"
                            break
                        fi

                        recovery_attempt=$((recovery_attempt + 1))
                        if [ "$recovery_attempt" -le "$MAX_RESTART_ATTEMPTS" ]; then
                            sleep 60  # Wait between recovery attempts
                        fi
                    done

                    if [ "$recovered" != "true" ]; then
                        deploy_error "All recovery attempts failed - manual intervention may be required"
                    fi

                    # Reset the failure count either way so recovery is not
                    # retried on every cycle, and stamp the cooldown from the
                    # END of the round: a round can itself last longer than
                    # the cooldown, which would otherwise make it ineffective.
                    consecutive_failures=0
                    last_recovery_attempt=$(date +%s)
                else
                    deploy_debug "Recovery cooldown in effect, waiting before next attempt"
                fi
            fi
        fi

        # Wait for next health check cycle
        sleep "$HEALTH_CHECK_INTERVAL"
    done
}

# =============================================================================
# INITIALIZATION AND STARTUP
# =============================================================================

# Initialize deployment automation
#
# Changes into the project directory, applies the preset, takes the
# single-instance lock, installs the INT/TERM handler and runs a first
# health check. Exits the script with status 1 when the project directory
# cannot be entered or the lock cannot be acquired.
# Globals: PROJECT_DIR (read)
initialize_automation() {
    deploy_info "Initializing ThrillWiki deployment automation"

    # Ensure we're in the project directory
    cd "$PROJECT_DIR" || {
        deploy_error "Cannot change to project directory: $PROJECT_DIR"
        exit 1
    }

    # Apply preset configuration
    apply_preset_configuration

    # Acquire lock
    if ! acquire_lock; then
        deploy_error "Failed to acquire deployment lock"
        exit 1
    fi

    # Set up signal handlers only once the lock is ours: the handler removes
    # the lock file, so installing it earlier let an instance that never
    # acquired the lock delete the one owned by the running service.
    trap cleanup_and_exit INT TERM

    # Perform initial health check
    deploy_info "Performing initial system health check"
    if ! perform_health_check; then
        deploy_warning "Initial health check detected issues - will monitor and attempt recovery"
    fi

    deploy_success "Deployment automation initialized successfully"
}

# =============================================================================
# COMMAND HANDLING
# =============================================================================

# Handle script commands
#
# Dispatches on the first argument (default: start).
#   start                 initialise, then run the monitoring loop
#   health-check          run one health check; exit 0 if healthy, 1 if not
#   restart-smart-deploy  restart the smart deployment timer
#   restart-server        restart the development server
#   status                report whether the PID in LOCK_FILE is alive
#   stop                  send TERM (then KILL after 5s) to that PID
# The dispatch lives in a function because `local` is only valid inside one:
# at top level it is an error, and with `set -e` that error aborted the
# status and stop commands before they did anything.
main() {
    local lock_pid=""

    case "${1:-start}" in
        start)
            initialize_automation
            run_deployment_automation
            ;;
        health-check)
            if perform_health_check; then
                echo "System is healthy"
                exit 0
            else
                echo "System health check failed"
                exit 1
            fi
            ;;
        restart-smart-deploy)
            restart_smart_deployment
            ;;
        restart-server)
            restart_development_server
            ;;
        status)
            if [ -f "$LOCK_FILE" ]; then
                lock_pid=$(cat "$LOCK_FILE" 2>/dev/null || echo "")
                if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then
                    echo "Deployment automation is running (PID: $lock_pid)"
                    exit 0
                else
                    echo "Deployment automation is not running (stale lock file)"
                    exit 1
                fi
            else
                echo "Deployment automation is not running"
                exit 1
            fi
            ;;
        stop)
            if [ -f "$LOCK_FILE" ]; then
                lock_pid=$(cat "$LOCK_FILE" 2>/dev/null || echo "")
                if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then
                    echo "Stopping deployment automation (PID: $lock_pid)"
                    # The process may exit between the liveness probe and the
                    # signal; that must not abort the cleanup under errexit.
                    kill -TERM "$lock_pid" 2>/dev/null || true
                    sleep 5
                    if kill -0 "$lock_pid" 2>/dev/null; then
                        kill -KILL "$lock_pid" 2>/dev/null || true
                    fi
                    rm -f "$LOCK_FILE"
                    echo "Deployment automation stopped"
                else
                    echo "Deployment automation is not running"
                    rm -f "$LOCK_FILE"
                fi
            else
                echo "Deployment automation is not running"
            fi
            ;;
        *)
            echo "Usage: $0 {start|stop|status|health-check|restart-smart-deploy|restart-server}"
            exit 1
            ;;
    esac
}

main "$@"