Pac-cogs/videoarchiver/utils/video_downloader.py
pacnpal e66af6e844 Added timeout handling and force cleanup in VideoArchiver cog
Added proper cancellation and requeuing of downloads in VideoProcessor
Added cancellable logger and process cleanup in VideoDownloader
Added shutdown flag and force_stop capability in EnhancedQueueManager
Added process tracking and kill_all_processes method in FFmpegManager
The changes ensure that:

Active downloads are paused and requeued when unload is called
If cleanup takes too long, force cleanup kicks in (sketched below)
All resources are properly cleaned up, even in case of timeout
Downloads can be safely cancelled and resumed later
No processes are left hanging during unload
2024-11-15 15:48:03 +00:00
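
In practice, that unload flow amounts to bounding the graceful path with the downloader's own SHUTDOWN_TIMEOUT and escalating on expiry. A minimal sketch (assuming `downloader` is the cog's VideoDownloader instance, running inside an async unload handler):

    try:
        await asyncio.wait_for(downloader.cleanup(), timeout=VideoDownloader.SHUTDOWN_TIMEOUT)
    except asyncio.TimeoutError:
        await downloader.force_cleanup()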


"""Video download and processing utilities"""
import os
import re
import logging
import asyncio
import ffmpeg
import yt_dlp
import shutil
import subprocess
import json
import signal
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Tuple, Callable, Set, Any
from pathlib import Path
from datetime import datetime
from videoarchiver.ffmpeg.ffmpeg_manager import FFmpegManager
from videoarchiver.ffmpeg.exceptions import (
    FFmpegError,
    CompressionError,
    VerificationError,
    FFprobeError,
    TimeoutError,
    handle_ffmpeg_error,
)
from videoarchiver.utils.exceptions import VideoVerificationError
from videoarchiver.utils.file_ops import secure_delete_file
from videoarchiver.utils.path_manager import temp_path_context
logger = logging.getLogger("VideoArchiver")


# Add a custom yt-dlp logger to handle cancellation
class CancellableYTDLLogger:
    def __init__(self):
        self.cancelled = False

    def debug(self, msg):
        if self.cancelled:
            raise Exception("Download cancelled")
        logger.debug(msg)

    def warning(self, msg):
        if self.cancelled:
            raise Exception("Download cancelled")
        logger.warning(msg)

    def error(self, msg):
        if self.cancelled:
            raise Exception("Download cancelled")
        logger.error(msg)
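
# Hypothetical illustration (not part of the class above): cancelling an
# in-flight download only requires flipping the flag; yt-dlp calls the logger
# frequently, so the raised exception unwinds the worker thread promptly:
#
#     downloader.ytdl_logger.cancelled = True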


def is_video_url_pattern(url: str) -> bool:
    """Check if URL matches common video platform patterns"""
    video_patterns = [
        r"youtube\.com/watch\?v=",
        r"youtu\.be/",
        r"vimeo\.com/",
        r"tiktok\.com/",
        r"twitter\.com/.*/video/",
        r"x\.com/.*/video/",
        r"bsky\.app/",
        r"facebook\.com/.*/videos/",
        r"instagram\.com/.*/(tv|reel|p)/",
        r"twitch\.tv/.*/clip/",
        r"streamable\.com/",
        r"v\.redd\.it/",
        r"clips\.twitch\.tv/",
        r"dailymotion\.com/video/",
        r"\.mp4$",
        r"\.webm$",
        r"\.mov$",
    ]
    return any(re.search(pattern, url, re.IGNORECASE) for pattern in video_patterns)


class VideoDownloader:
    MAX_RETRIES = 5
    RETRY_DELAY = 10
    FILE_OP_RETRIES = 3
    FILE_OP_RETRY_DELAY = 1
    SHUTDOWN_TIMEOUT = 15  # seconds

    def __init__(
        self,
        download_path: str,
        video_format: str,
        max_quality: int,
        max_file_size: int,
        enabled_sites: Optional[List[str]] = None,
        concurrent_downloads: int = 2,
        ffmpeg_mgr: Optional[FFmpegManager] = None,
    ):
        self.download_path = Path(download_path)
        self.download_path.mkdir(parents=True, exist_ok=True)
        os.chmod(str(self.download_path), 0o755)
        self.video_format = video_format
        self.max_quality = max_quality
        self.max_file_size = max_file_size
        self.enabled_sites = enabled_sites
        self.ffmpeg_mgr = ffmpeg_mgr or FFmpegManager()

        # Create thread pool with proper naming
        self.download_pool = ThreadPoolExecutor(
            max_workers=max(1, min(3, concurrent_downloads)),
            thread_name_prefix="videoarchiver_download",
        )

        # Track active downloads and processes (the set holds the asyncio
        # subprocesses spawned in _try_compression)
        self.active_downloads: Dict[str, Dict[str, Any]] = {}
        self._downloads_lock = asyncio.Lock()
        self._active_processes: Set[asyncio.subprocess.Process] = set()
        self._processes_lock = asyncio.Lock()
        self._shutting_down = False

        # Create cancellable logger
        self.ytdl_logger = CancellableYTDLLogger()

        # Configure yt-dlp options
        self.ydl_opts = {
            "format": f"bv*[height<={max_quality}][ext=mp4]+ba[ext=m4a]/b[height<={max_quality}]/best",
            "outtmpl": "%(title)s.%(ext)s",
            "merge_output_format": video_format,
            "quiet": True,
            "no_warnings": True,
            "extract_flat": True,
            "concurrent_fragment_downloads": 1,
            "retries": self.MAX_RETRIES,
            "fragment_retries": self.MAX_RETRIES,
            "file_access_retries": self.FILE_OP_RETRIES,
            "extractor_retries": self.MAX_RETRIES,
            "postprocessor_hooks": [self._check_file_size],
            "progress_hooks": [self._progress_hook, self._detailed_progress_hook],
            "ffmpeg_location": str(self.ffmpeg_mgr.get_ffmpeg_path()),
            "ffprobe_location": str(self.ffmpeg_mgr.get_ffprobe_path()),
            "paths": {"home": str(self.download_path)},
            "logger": self.ytdl_logger,
            "ignoreerrors": True,
            "no_color": True,
            "geo_bypass": True,
            "socket_timeout": 60,
            "http_chunk_size": 1048576,
            "external_downloader_args": {
                "ffmpeg": ["-timeout", "60000000"]
            },
            "max_sleep_interval": 5,
            "sleep_interval": 1,
            "max_filesize": max_file_size * 1024 * 1024,
        }

    async def cleanup(self) -> None:
        """Clean up resources with proper shutdown"""
        self._shutting_down = True
        try:
            # Cancel active downloads
            self.ytdl_logger.cancelled = True

            # Kill any active FFmpeg processes
            async with self._processes_lock:
                for process in self._active_processes:
                    try:
                        process.terminate()
                        await asyncio.sleep(0.1)  # Give process time to terminate
                        # asyncio processes expose returncode rather than poll()
                        if process.returncode is None:
                            process.kill()  # Force kill if still running
                    except Exception as e:
                        logger.error(f"Error killing process: {e}")
                self._active_processes.clear()

            # Clean up thread pool
            self.download_pool.shutdown(wait=False, cancel_futures=True)

            # Clean up active downloads
            async with self._downloads_lock:
                self.active_downloads.clear()
        except Exception as e:
            logger.error(f"Error during downloader cleanup: {e}")
        finally:
            self._shutting_down = False

    async def force_cleanup(self) -> None:
        """Force cleanup of all resources"""
        try:
            # Force cancel all downloads
            self.ytdl_logger.cancelled = True

            # Kill all processes immediately
            async with self._processes_lock:
                for process in self._active_processes:
                    try:
                        process.kill()
                    except Exception as e:
                        logger.error(f"Error force killing process: {e}")
                self._active_processes.clear()

            # Force shutdown thread pool
            self.download_pool.shutdown(wait=False, cancel_futures=True)

            # Clear all tracking
            async with self._downloads_lock:
                self.active_downloads.clear()
        except Exception as e:
            logger.error(f"Error during force cleanup: {e}")

    def _detailed_progress_hook(self, d):
        """Handle detailed download progress tracking"""
        try:
            if d["status"] == "downloading":
                # Get URL from info dict
                url = d.get("info_dict", {}).get("webpage_url", "unknown")

                # Update global progress tracking
                from videoarchiver.processor import _download_progress

                if url in _download_progress:
                    _download_progress[url].update({
                        'active': True,
                        'percent': float(d.get("_percent_str", "0").replace('%', '')),
                        'speed': d.get("_speed_str", "N/A"),
                        'eta': d.get("_eta_str", "N/A"),
                        'downloaded_bytes': d.get("downloaded_bytes", 0),
                        'total_bytes': d.get("total_bytes", 0) or d.get("total_bytes_estimate", 0),
                        'retries': d.get("retry_count", 0),
                        'fragment_count': d.get("fragment_count", 0),
                        'fragment_index': d.get("fragment_index", 0),
                        'video_title': d.get("info_dict", {}).get("title", "Unknown"),
                        'extractor': d.get("info_dict", {}).get("extractor", "Unknown"),
                        'format': d.get("info_dict", {}).get("format", "Unknown"),
                        'resolution': d.get("info_dict", {}).get("resolution", "Unknown"),
                        'fps': d.get("info_dict", {}).get("fps", "Unknown"),
                        'last_update': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
                    })
                    logger.debug(
                        f"Detailed progress for {url}: "
                        f"{_download_progress[url]['percent']}% at {_download_progress[url]['speed']}, "
                        f"ETA: {_download_progress[url]['eta']}"
                    )
        except Exception as e:
            logger.error(f"Error in detailed progress hook: {str(e)}")

    def _progress_hook(self, d):
        """Handle download progress"""
        if d["status"] == "finished":
            logger.info(f"Download completed: {d['filename']}")
        elif d["status"] == "downloading":
            try:
                percent = float(d.get("_percent_str", "0").replace("%", ""))
                speed = d.get("_speed_str", "N/A")
                eta = d.get("_eta_str", "N/A")
                downloaded = d.get("downloaded_bytes", 0)
                total = d.get("total_bytes", 0) or d.get("total_bytes_estimate", 0)
                logger.debug(
                    f"Download progress: {percent}% at {speed}, "
                    f"ETA: {eta}, Downloaded: {downloaded}/{total} bytes"
                )
            except Exception as e:
                logger.debug(f"Error logging progress: {str(e)}")

    def is_supported_url(self, url: str) -> bool:
        """Check if URL is supported by attempting a simulated download"""
        if not is_video_url_pattern(url):
            return False
        try:
            simulate_opts = {
                **self.ydl_opts,
                "simulate": True,
                "quiet": True,
                "no_warnings": True,
                "extract_flat": True,
                "skip_download": True,
                "format": "best",
            }
            with yt_dlp.YoutubeDL(simulate_opts) as ydl:
                try:
                    info = ydl.extract_info(url, download=False)
                    if info is None:
                        return False
                    if self.enabled_sites:
                        extractor = info.get("extractor", "").lower()
                        if not any(
                            site.lower() in extractor for site in self.enabled_sites
                        ):
                            logger.info(f"Site {extractor} not in enabled sites list")
                            return False
                    logger.info(
                        f"URL supported: {url} (Extractor: {info.get('extractor', 'unknown')})"
                    )
                    return True
                except yt_dlp.utils.UnsupportedError:
                    return False
                except Exception as e:
                    if "Unsupported URL" not in str(e):
                        logger.error(f"Error checking URL {url}: {str(e)}")
                    return False
        except Exception as e:
            logger.error(f"Error during URL check: {str(e)}")
            return False

    async def download_video(
        self, url: str, progress_callback: Optional[Callable[[float], None]] = None
    ) -> Tuple[bool, str, str]:
        """Download and process a video with improved error handling"""
        if self._shutting_down:
            return False, "", "Downloader is shutting down"

        # Initialize progress tracking for this URL
        from videoarchiver.processor import _download_progress

        _download_progress[url] = {
            'active': True,
            'start_time': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
            'percent': 0,
            'speed': 'N/A',
            'eta': 'N/A',
            'downloaded_bytes': 0,
            'total_bytes': 0,
            'retries': 0,
            'fragment_count': 0,
            'fragment_index': 0,
            'video_title': 'Unknown',
            'extractor': 'Unknown',
            'format': 'Unknown',
            'resolution': 'Unknown',
            'fps': 'Unknown',
            'last_update': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        }

        original_file = None
        compressed_file = None
        temp_dir = None
        hardware_accel_failed = False
        compression_params = None
        try:
            with temp_path_context() as temp_dir:
                # Download the video
                success, file_path, error = await self._safe_download(
                    url, temp_dir, progress_callback
                )
                if not success:
                    return False, "", error

                original_file = file_path
                async with self._downloads_lock:
                    self.active_downloads[url] = {
                        'file_path': original_file,
                        'start_time': datetime.utcnow()
                    }

                # Check file size and compress if needed
                file_size = os.path.getsize(original_file)
                if file_size > (self.max_file_size * 1024 * 1024):
                    logger.info(f"Compressing video: {original_file}")
                    try:
                        # Get optimal compression parameters
                        compression_params = self.ffmpeg_mgr.get_compression_params(
                            original_file, self.max_file_size
                        )
                        compressed_file = os.path.join(
                            self.download_path,
                            f"compressed_{os.path.basename(original_file)}",
                        )

                        # Try hardware acceleration first
                        success = await self._try_compression(
                            original_file,
                            compressed_file,
                            compression_params,
                            progress_callback,
                            use_hardware=True,
                        )

                        # If hardware acceleration fails, fall back to CPU
                        if not success:
                            hardware_accel_failed = True
                            logger.warning(
                                "Hardware acceleration failed, falling back to CPU encoding"
                            )
                            success = await self._try_compression(
                                original_file,
                                compressed_file,
                                compression_params,
                                progress_callback,
                                use_hardware=False,
                            )

                        if not success:
                            raise CompressionError(
                                "Failed to compress with both hardware and CPU encoding",
                                file_size,
                                self.max_file_size * 1024 * 1024,
                            )

                        # Verify compressed file
                        if not self._verify_video_file(compressed_file):
                            raise VideoVerificationError(
                                "Compressed file verification failed"
                            )

                        compressed_size = os.path.getsize(compressed_file)
                        if compressed_size <= (self.max_file_size * 1024 * 1024):
                            await self._safe_delete_file(original_file)
                            return True, compressed_file, ""
                        else:
                            await self._safe_delete_file(compressed_file)
                            raise CompressionError(
                                "Failed to compress to target size",
                                file_size,
                                self.max_file_size * 1024 * 1024,
                            )
                    except Exception as e:
                        error_msg = str(e)
                        if hardware_accel_failed:
                            error_msg = f"Hardware acceleration failed, CPU fallback error: {error_msg}"
                        if compressed_file and os.path.exists(compressed_file):
                            await self._safe_delete_file(compressed_file)
                        return False, "", error_msg
                else:
                    # Move file to final location
                    final_path = os.path.join(
                        self.download_path, os.path.basename(original_file)
                    )
                    success = await self._safe_move_file(original_file, final_path)
                    if not success:
                        return False, "", "Failed to move file to final location"
                    return True, final_path, ""
        except Exception as e:
            logger.error(f"Download error: {str(e)}")
            return False, "", str(e)
        finally:
            # Clean up
            async with self._downloads_lock:
                self.active_downloads.pop(url, None)
            if url in _download_progress:
                _download_progress[url]['active'] = False
            try:
                if original_file and os.path.exists(original_file):
                    await self._safe_delete_file(original_file)
                if (
                    compressed_file
                    and os.path.exists(compressed_file)
                    # download_path is a Path, so compare against its string form
                    and not compressed_file.startswith(str(self.download_path))
                ):
                    await self._safe_delete_file(compressed_file)
            except Exception as e:
                logger.error(f"Error during file cleanup: {str(e)}")

    async def _try_compression(
        self,
        input_file: str,
        output_file: str,
        params: Dict[str, str],
        progress_callback: Optional[Callable[[float], None]] = None,
        use_hardware: bool = True,
    ) -> bool:
        """Attempt video compression with given parameters"""
        if self._shutting_down:
            return False

        # Import here to avoid a circular import; done before the try block so
        # the finally clause below can always reference the progress dict
        from videoarchiver.processor import _compression_progress

        try:
            # Build FFmpeg command
            ffmpeg_path = str(self.ffmpeg_mgr.get_ffmpeg_path())
            cmd = [ffmpeg_path, "-y", "-i", input_file]

            # Add progress monitoring
            cmd.extend(["-progress", "pipe:1"])

            # Modify parameters based on hardware acceleration preference
            if use_hardware:
                gpu_info = self.ffmpeg_mgr.gpu_info
                if gpu_info["nvidia"] and params.get("c:v") == "libx264":
                    params["c:v"] = "h264_nvenc"
                elif gpu_info["amd"] and params.get("c:v") == "libx264":
                    params["c:v"] = "h264_amf"
                elif gpu_info["intel"] and params.get("c:v") == "libx264":
                    params["c:v"] = "h264_qsv"
            else:
                params["c:v"] = "libx264"

            # Add all parameters to command
            for key, value in params.items():
                cmd.extend([f"-{key}", str(value)])

            # Add output file
            cmd.append(output_file)

            # Get video duration for progress calculation
            duration = self._get_video_duration(input_file)

            # Get input file size
            input_size = os.path.getsize(input_file)

            # Initialize compression progress
            _compression_progress[input_file] = {
                'active': True,
                'filename': os.path.basename(input_file),
                'start_time': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
                'percent': 0,
                'elapsed_time': '0:00',
                'input_size': input_size,
                'current_size': 0,
                'target_size': self.max_file_size * 1024 * 1024,
                'codec': params.get('c:v', 'unknown'),
                'hardware_accel': use_hardware,
                'preset': params.get('preset', 'unknown'),
                'crf': params.get('crf', 'unknown'),
                'duration': duration,
                'bitrate': params.get('b:v', 'unknown'),
                'audio_codec': params.get('c:a', 'unknown'),
                'audio_bitrate': params.get('b:a', 'unknown'),
                'last_update': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
            }

            # Run compression with progress monitoring; stderr is discarded so
            # an unread pipe cannot fill up and stall FFmpeg
            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.DEVNULL,
            )

            # Track the process
            async with self._processes_lock:
                self._active_processes.add(process)

            start_time = datetime.utcnow()
            try:
                while True:
                    if self._shutting_down:
                        process.terminate()
                        return False
                    line = await process.stdout.readline()
                    if not line:
                        break
                    try:
                        line = line.decode().strip()
                        if line.startswith("out_time_ms="):
                            # Convert microseconds to seconds
                            current_time = int(line.split("=")[1]) / 1000000
                            if duration > 0:
                                progress = min(100, (current_time / duration) * 100)

                                # Update compression progress
                                elapsed = datetime.utcnow() - start_time
                                _compression_progress[input_file].update({
                                    'percent': progress,
                                    'elapsed_time': str(elapsed).split('.')[0],  # Drop microseconds
                                    'current_size': os.path.getsize(output_file) if os.path.exists(output_file) else 0,
                                    'current_time': current_time,
                                    'last_update': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
                                })

                                if progress_callback:
                                    await progress_callback(progress)
                    except Exception as e:
                        logger.error(f"Error parsing FFmpeg progress: {e}")

                await process.wait()
                # Require a clean exit in addition to the output file existing
                success = process.returncode == 0 and os.path.exists(output_file)

                # Update final status
                if success and input_file in _compression_progress:
                    _compression_progress[input_file].update({
                        'active': False,
                        'percent': 100,
                        'current_size': os.path.getsize(output_file),
                        'last_update': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
                    })
                return success
            finally:
                # Remove process from tracking
                async with self._processes_lock:
                    self._active_processes.discard(process)
        except Exception as e:
            logger.error(f"Compression attempt failed: {str(e)}")
            return False
        finally:
            # Ensure compression progress is marked as inactive
            if input_file in _compression_progress:
                _compression_progress[input_file]['active'] = False

    def _get_video_duration(self, file_path: str) -> float:
        """Get video duration in seconds"""
        try:
            ffprobe_path = str(self.ffmpeg_mgr.get_ffprobe_path())
            cmd = [
                ffprobe_path,
                "-v",
                "quiet",
                "-print_format",
                "json",
                "-show_format",
                file_path,
            ]
            result = subprocess.run(cmd, capture_output=True, text=True)
            data = json.loads(result.stdout)
            return float(data["format"]["duration"])
        except Exception as e:
            logger.error(f"Error getting video duration: {e}")
            return 0

    def _check_file_size(self, info):
        """Check if file size is within limits"""
        if info.get("filepath") and os.path.exists(info["filepath"]):
            try:
                size = os.path.getsize(info["filepath"])
                if size > (self.max_file_size * 1024 * 1024):
                    logger.info(
                        f"File exceeds size limit, will compress: {info['filepath']}"
                    )
            except OSError as e:
                logger.error(f"Error checking file size: {str(e)}")

    def _verify_video_file(self, file_path: str) -> bool:
        """Verify video file integrity"""
        try:
            ffprobe_path = str(self.ffmpeg_mgr.get_ffprobe_path())
            cmd = [
                ffprobe_path,
                "-v",
                "quiet",
                "-print_format",
                "json",
                "-show_format",
                "-show_streams",
                file_path,
            ]
            result = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=30,
            )
            if result.returncode != 0:
                raise VideoVerificationError(f"FFprobe failed: {result.stderr}")

            probe = json.loads(result.stdout)

            # Verify video stream
            video_streams = [s for s in probe["streams"] if s["codec_type"] == "video"]
            if not video_streams:
                raise VideoVerificationError("No video streams found")

            # Verify duration
            duration = float(probe["format"].get("duration", 0))
            if duration <= 0:
                raise VideoVerificationError("Invalid video duration")

            # Verify file is readable
            with open(file_path, "rb") as f:
                f.seek(0, 2)
                if f.tell() == 0:
                    raise VideoVerificationError("Empty file")

            return True
        except Exception as e:
            logger.error(f"Error verifying video file {file_path}: {e}")
            return False

    async def _safe_download(
        self,
        url: str,
        temp_dir: str,
        progress_callback: Optional[Callable[[float], None]] = None,
    ) -> Tuple[bool, str, str]:
        """Safely download video with retries"""
        if self._shutting_down:
            return False, "", "Downloader is shutting down"

        loop = asyncio.get_running_loop()
        last_error = None
        for attempt in range(self.MAX_RETRIES):
            try:
                ydl_opts = self.ydl_opts.copy()
                ydl_opts["outtmpl"] = os.path.join(temp_dir, ydl_opts["outtmpl"])

                # Add progress callback
                if progress_callback:
                    original_progress_hook = ydl_opts["progress_hooks"][0]

                    def combined_progress_hook(d):
                        original_progress_hook(d)
                        if d["status"] == "downloading":
                            try:
                                percent = float(
                                    d.get("_percent_str", "0").replace("%", "")
                                )
                                # The hook runs in a worker thread, so hand the
                                # coroutine back to the event loop thread-safely
                                asyncio.run_coroutine_threadsafe(
                                    progress_callback(percent), loop
                                )
                            except Exception as e:
                                logger.error(f"Error in progress callback: {e}")

                    ydl_opts["progress_hooks"] = [combined_progress_hook]

                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    info = await loop.run_in_executor(
                        self.download_pool, lambda: ydl.extract_info(url, download=True)
                    )
                    if info is None:
                        raise Exception("Failed to extract video information")

                    file_path = os.path.join(temp_dir, ydl.prepare_filename(info))
                    if not os.path.exists(file_path):
                        raise FileNotFoundError("Download completed but file not found")
                    if not self._verify_video_file(file_path):
                        raise VideoVerificationError("Downloaded file is not a valid video")
                    return True, file_path, ""
            except Exception as e:
                last_error = str(e)
                logger.error(f"Download attempt {attempt + 1} failed: {str(e)}")
                if attempt < self.MAX_RETRIES - 1:
                    # Exponential backoff with a small linear increment
                    delay = self.RETRY_DELAY * (2**attempt) + (attempt * 2)
                    await asyncio.sleep(delay)
                else:
                    return False, "", f"All download attempts failed: {last_error}"

    async def _safe_delete_file(self, file_path: str) -> bool:
        """Safely delete a file with retries"""
        for attempt in range(self.FILE_OP_RETRIES):
            try:
                if await secure_delete_file(file_path):
                    return True
                await asyncio.sleep(self.FILE_OP_RETRY_DELAY * (attempt + 1))
            except Exception as e:
                logger.error(f"Delete attempt {attempt + 1} failed: {str(e)}")
                if attempt == self.FILE_OP_RETRIES - 1:
                    return False
                await asyncio.sleep(self.FILE_OP_RETRY_DELAY * (attempt + 1))
        return False

    async def _safe_move_file(self, src: str, dst: str) -> bool:
        """Safely move a file with retries"""
        for attempt in range(self.FILE_OP_RETRIES):
            try:
                os.makedirs(os.path.dirname(dst), exist_ok=True)
                shutil.move(src, dst)
                return True
            except Exception as e:
                logger.error(f"Move attempt {attempt + 1} failed: {str(e)}")
                if attempt == self.FILE_OP_RETRIES - 1:
                    return False
                await asyncio.sleep(self.FILE_OP_RETRY_DELAY * (attempt + 1))
        return False
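

# A minimal usage sketch (hypothetical, not part of the cog). It assumes the
# full videoarchiver package is importable, since download_video reports its
# progress through videoarchiver.processor; the path and URL are placeholders.
if __name__ == "__main__":

    async def _demo() -> None:
        downloader = VideoDownloader(
            download_path="/tmp/videoarchiver",  # hypothetical directory
            video_format="mp4",
            max_quality=720,
            max_file_size=8,  # MB
        )
        try:
            ok, path, err = await downloader.download_video("https://example.com/clip.mp4")
            print(f"success={ok} path={path!r} error={err!r}")
        finally:
            # In the cog, unload bounds this with asyncio.wait_for and falls
            # back to force_cleanup() on timeout (see the header notes)
            await downloader.cleanup()

    asyncio.run(_demo())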