From 3e50faec75b281c6bd7dc640f6cb79ae309549c8 Mon Sep 17 00:00:00 2001 From: pacnpal <183241239+pacnpal@users.noreply.github.com> Date: Fri, 15 Nov 2024 03:40:53 +0000 Subject: [PATCH] Switching from regex patterns to yt-dlp simulation for URL detection: Removed the regex-based URL detection Added proper yt-dlp simulation to check if URLs are supported Added better error handling for URL checking Added detailed logging of URL detection results Improving FFmpeg integration: Added proper ffprobe binary download alongside FFmpeg Added better verification of both binaries Added retry mechanisms for binary downloads Added proper cleanup of failed downloads Enhancing error handling and logging: Added detailed logging throughout the system Added better error recovery mechanisms Added proper cleanup of temporary files Added better tracking of failed operations --- videoarchiver/ffmpeg/ffmpeg_downloader.py | 128 +++++++++++++--------- videoarchiver/processor.py | 13 ++- videoarchiver/utils/video_downloader.py | 98 +++++++---------- 3 files changed, 128 insertions(+), 111 deletions(-) diff --git a/videoarchiver/ffmpeg/ffmpeg_downloader.py b/videoarchiver/ffmpeg/ffmpeg_downloader.py index 4d60f95..82d9d04 100644 --- a/videoarchiver/ffmpeg/ffmpeg_downloader.py +++ b/videoarchiver/ffmpeg/ffmpeg_downloader.py @@ -12,7 +12,7 @@ import platform import hashlib from pathlib import Path from contextlib import contextmanager -from typing import Optional +from typing import Optional, Dict, List import time from .exceptions import DownloadError @@ -37,31 +37,31 @@ class FFmpegDownloader: "Windows": { "x86_64": { "url": "https://github.com/BtbN/FFmpeg-Builds/releases/download/latest/ffmpeg-master-latest-win64-gpl.zip", - "bin_name": "ffmpeg.exe", + "bin_names": ["ffmpeg.exe", "ffprobe.exe"], } }, "Linux": { "x86_64": { "url": "https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-amd64-static.tar.xz", - "bin_name": "ffmpeg", + "bin_names": ["ffmpeg", "ffprobe"], }, "aarch64": { "url": "https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-arm64-static.tar.xz", - "bin_name": "ffmpeg", + "bin_names": ["ffmpeg", "ffprobe"], }, "armv7l": { "url": "https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-armhf-static.tar.xz", - "bin_name": "ffmpeg", + "bin_names": ["ffmpeg", "ffprobe"], }, }, "Darwin": { "x86_64": { "url": "https://evermeet.cx/ffmpeg/getrelease/zip", - "bin_name": "ffmpeg", + "bin_names": ["ffmpeg", "ffprobe"], }, "arm64": { "url": "https://evermeet.cx/ffmpeg/getrelease/zip", - "bin_name": "ffmpeg", + "bin_names": ["ffmpeg", "ffprobe"], }, }, } @@ -73,15 +73,17 @@ class FFmpegDownloader: if self.machine == "arm64": self.machine = "aarch64" # Normalize ARM64 naming self.base_dir = base_dir - self.ffmpeg_path = self.base_dir / self._get_binary_name() + self.ffmpeg_path = self.base_dir / self._get_binary_names()[0] + self.ffprobe_path = self.base_dir / self._get_binary_names()[1] logger.info(f"Initialized FFmpeg downloader for {system}/{machine}") logger.info(f"FFmpeg binary path: {self.ffmpeg_path}") + logger.info(f"FFprobe binary path: {self.ffprobe_path}") - def _get_binary_name(self) -> str: - """Get the appropriate binary name for the current system""" + def _get_binary_names(self) -> List[str]: + """Get the appropriate binary names for the current system""" try: - return self.FFMPEG_URLS[self.system][self.machine]["bin_name"] + return self.FFMPEG_URLS[self.system][self.machine]["bin_names"] except KeyError: raise DownloadError(f"Unsupported system/architecture: {self.system}/{self.machine}") @@ -92,8 +94,8 @@ class FFmpegDownloader: except KeyError: raise DownloadError(f"Unsupported system/architecture: {self.system}/{self.machine}") - def download(self) -> Path: - """Download and set up FFmpeg binary with retries""" + def download(self) -> Dict[str, Path]: + """Download and set up FFmpeg and FFprobe binaries with retries""" max_retries = 3 retry_delay = 5 last_error = None @@ -106,12 +108,13 @@ class FFmpegDownloader: self.base_dir.mkdir(parents=True, exist_ok=True) os.chmod(str(self.base_dir), 0o777) - # Clean up any existing file or directory - if self.ffmpeg_path.exists(): - if self.ffmpeg_path.is_dir(): - shutil.rmtree(str(self.ffmpeg_path)) - else: - self.ffmpeg_path.unlink() + # Clean up any existing files + for binary_path in [self.ffmpeg_path, self.ffprobe_path]: + if binary_path.exists(): + if binary_path.is_dir(): + shutil.rmtree(str(binary_path)) + else: + binary_path.unlink() with temp_path_context() as temp_dir: # Download archive @@ -121,18 +124,23 @@ class FFmpegDownloader: if not self._verify_download(archive_path): raise DownloadError("Downloaded file verification failed") - # Extract FFmpeg binary - self._extract_binary(archive_path, temp_dir) + # Extract binaries + self._extract_binaries(archive_path, temp_dir) # Set proper permissions - os.chmod(str(self.ffmpeg_path), 0o755) + for binary_path in [self.ffmpeg_path, self.ffprobe_path]: + os.chmod(str(binary_path), 0o755) - # Verify binary + # Verify binaries if not self.verify(): - raise DownloadError("FFmpeg binary verification failed") + raise DownloadError("Binary verification failed") logger.info(f"Successfully downloaded FFmpeg to {self.ffmpeg_path}") - return self.ffmpeg_path + logger.info(f"Successfully downloaded FFprobe to {self.ffprobe_path}") + return { + "ffmpeg": self.ffmpeg_path, + "ffprobe": self.ffprobe_path + } except Exception as e: last_error = str(e) @@ -193,9 +201,9 @@ class FFmpegDownloader: logger.error(f"Download verification failed: {str(e)}") return False - def _extract_binary(self, archive_path: Path, temp_dir: str): - """Extract FFmpeg binary from archive""" - logger.info("Extracting FFmpeg binary") + def _extract_binaries(self, archive_path: Path, temp_dir: str): + """Extract FFmpeg and FFprobe binaries from archive""" + logger.info("Extracting FFmpeg and FFprobe binaries") if self.system == "Windows": self._extract_zip(archive_path, temp_dir) @@ -205,50 +213,70 @@ class FFmpegDownloader: def _extract_zip(self, archive_path: Path, temp_dir: str): """Extract from zip archive (Windows)""" with zipfile.ZipFile(archive_path, "r") as zip_ref: - ffmpeg_files = [f for f in zip_ref.namelist() if self._get_binary_name() in f] - if not ffmpeg_files: - raise DownloadError("FFmpeg binary not found in archive") - - zip_ref.extract(ffmpeg_files[0], temp_dir) - extracted_path = Path(temp_dir) / ffmpeg_files[0] - shutil.copy2(extracted_path, self.ffmpeg_path) + binary_names = self._get_binary_names() + for binary_name in binary_names: + binary_files = [f for f in zip_ref.namelist() if binary_name in f] + if not binary_files: + raise DownloadError(f"{binary_name} not found in archive") + + zip_ref.extract(binary_files[0], temp_dir) + extracted_path = Path(temp_dir) / binary_files[0] + target_path = self.base_dir / binary_name + shutil.copy2(extracted_path, target_path) def _extract_tar(self, archive_path: Path, temp_dir: str): """Extract from tar archive (Linux/macOS)""" with tarfile.open(archive_path, "r:xz") as tar_ref: - ffmpeg_files = [f for f in tar_ref.getnames() if f.endswith("/ffmpeg")] - if not ffmpeg_files: - raise DownloadError("FFmpeg binary not found in archive") - - tar_ref.extract(ffmpeg_files[0], temp_dir) - extracted_path = Path(temp_dir) / ffmpeg_files[0] - shutil.copy2(extracted_path, self.ffmpeg_path) + binary_names = self._get_binary_names() + for binary_name in binary_names: + binary_files = [f for f in tar_ref.getnames() if f.endswith(f"/{binary_name}")] + if not binary_files: + raise DownloadError(f"{binary_name} not found in archive") + + tar_ref.extract(binary_files[0], temp_dir) + extracted_path = Path(temp_dir) / binary_files[0] + target_path = self.base_dir / binary_name + shutil.copy2(extracted_path, target_path) def verify(self) -> bool: - """Verify FFmpeg binary works""" + """Verify FFmpeg and FFprobe binaries work""" try: - if not self.ffmpeg_path.exists(): + if not self.ffmpeg_path.exists() or not self.ffprobe_path.exists(): return False # Ensure proper permissions os.chmod(str(self.ffmpeg_path), 0o755) + os.chmod(str(self.ffprobe_path), 0o755) # Test FFmpeg functionality - result = subprocess.run( + ffmpeg_result = subprocess.run( [str(self.ffmpeg_path), "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=5 ) - if result.returncode == 0: - version = result.stdout.decode().split('\n')[0] - logger.info(f"FFmpeg verification successful: {version}") + # Test FFprobe functionality + ffprobe_result = subprocess.run( + [str(self.ffprobe_path), "-version"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=5 + ) + + if ffmpeg_result.returncode == 0 and ffprobe_result.returncode == 0: + ffmpeg_version = ffmpeg_result.stdout.decode().split('\n')[0] + ffprobe_version = ffprobe_result.stdout.decode().split('\n')[0] + logger.info(f"FFmpeg verification successful: {ffmpeg_version}") + logger.info(f"FFprobe verification successful: {ffprobe_version}") return True else: - logger.error(f"FFmpeg verification failed: {result.stderr.decode()}") + if ffmpeg_result.returncode != 0: + logger.error(f"FFmpeg verification failed: {ffmpeg_result.stderr.decode()}") + if ffprobe_result.returncode != 0: + logger.error(f"FFprobe verification failed: {ffprobe_result.stderr.decode()}") return False except Exception as e: - logger.error(f"FFmpeg verification failed: {e}") + logger.error(f"Binary verification failed: {e}") return False diff --git a/videoarchiver/processor.py b/videoarchiver/processor.py index 73153a3..76f2665 100644 --- a/videoarchiver/processor.py +++ b/videoarchiver/processor.py @@ -316,15 +316,22 @@ class VideoProcessor: if monitored_channels and message.channel.id not in monitored_channels: return - # Find all video URLs in message with improved pattern matching + # Find all video URLs in message using yt-dlp simulation logger.info(f"Checking message {message.id} for video URLs...") urls = [] if message.guild.id in self.components: downloader = self.components[message.guild.id]["downloader"] if downloader: + # Check each word in the message for word in message.content.split(): - if downloader.is_supported_url(word): - urls.append(word) + # Use yt-dlp simulation to check if URL is supported + try: + if downloader.is_supported_url(word): + logger.info(f"Found supported URL: {word}") + urls.append(word) + except Exception as e: + logger.error(f"Error checking URL {word}: {str(e)}") + continue if urls: logger.info(f"Found {len(urls)} video URLs in message {message.id}") diff --git a/videoarchiver/utils/video_downloader.py b/videoarchiver/utils/video_downloader.py index d9cac92..8834640 100644 --- a/videoarchiver/utils/video_downloader.py +++ b/videoarchiver/utils/video_downloader.py @@ -36,7 +36,6 @@ class VideoDownloader: # Ensure download path exists with proper permissions self.download_path = Path(download_path) self.download_path.mkdir(parents=True, exist_ok=True) - # Ensure directory has rwx permissions for user and rx for group/others os.chmod(str(self.download_path), 0o755) logger.info(f"Initialized download directory: {self.download_path}") @@ -44,14 +43,10 @@ class VideoDownloader: self.max_quality = max_quality self.max_file_size = max_file_size self.enabled_sites = enabled_sites - self.url_patterns = self._get_url_patterns() # Initialize FFmpeg manager self.ffmpeg_mgr = FFmpegManager() - ffmpeg_path = self.ffmpeg_mgr.get_ffmpeg_path() - if not os.path.exists(ffmpeg_path): - raise FileNotFoundError(f"FFmpeg not found at {ffmpeg_path}") - logger.info(f"Using FFmpeg from: {ffmpeg_path}") + logger.info(f"FFmpeg path: {self.ffmpeg_mgr.get_ffmpeg_path()}") # Create thread pool for this instance self.download_pool = ThreadPoolExecutor( @@ -63,7 +58,7 @@ class VideoDownloader: self.active_downloads: Dict[str, str] = {} self._downloads_lock = asyncio.Lock() - # Configure yt-dlp options with absolute FFmpeg path + # Configure yt-dlp options self.ydl_opts = { "format": f"bv*[height<={max_quality}][ext=mp4]+ba[ext=m4a]/b[height<={max_quality}]/best", # More flexible format "outtmpl": "%(title)s.%(ext)s", # Base filename only, path added later @@ -79,55 +74,55 @@ class VideoDownloader: "extractor_retries": self.MAX_RETRIES, "postprocessor_hooks": [self._check_file_size], "progress_hooks": [self._progress_hook], - "ffmpeg_location": str(ffmpeg_path), # Convert Path to string - "prefer_ffmpeg": True, # Force use of FFmpeg - "hls_prefer_ffmpeg": True, # Use FFmpeg for HLS + "ffmpeg_location": self.ffmpeg_mgr.get_ffmpeg_path(), "logger": logger, # Use our logger "ignoreerrors": True, # Don't stop on download errors "no_color": True, # Disable ANSI colors in output "geo_bypass": True, # Try to bypass geo-restrictions "socket_timeout": 30, # Increase timeout - "external_downloader": { - "m3u8": "ffmpeg", # Use FFmpeg for m3u8 downloads - }, - "external_downloader_args": { - "ffmpeg": ["-v", "warning"], # Reduce FFmpeg verbosity - } } - logger.info("VideoDownloader initialized successfully") - def __del__(self): - """Ensure thread pool is shutdown and files are cleaned up""" + def is_supported_url(self, url: str) -> bool: + """Check if URL is supported by attempting a simulated download""" try: - # Cancel all active downloads - for file_path in self.active_downloads.values(): + # Configure yt-dlp for simulation + simulate_opts = { + **self.ydl_opts, + "simulate": True, # Only simulate download + "quiet": True, # Reduce output noise + "no_warnings": True, + "extract_flat": True, # Don't download video info + "skip_download": True, # Skip actual download + "format": "best", # Don't spend time finding best format + } + + # Create a new yt-dlp instance for simulation + with yt_dlp.YoutubeDL(simulate_opts) as ydl: try: - secure_delete_file(file_path) + # Try to extract info without downloading + info = ydl.extract_info(url, download=False) + if info is None: + logger.debug(f"URL not supported: {url}") + return False + + # Check if site is enabled (if enabled_sites is configured) + if self.enabled_sites: + extractor = info.get('extractor', '').lower() + if not any(site.lower() in extractor for site in self.enabled_sites): + logger.info(f"Site {extractor} not in enabled sites list") + return False + + logger.info(f"URL supported: {url} (Extractor: {info.get('extractor', 'unknown')})") + return True + except Exception as e: - logger.error(f"Error deleting file during cleanup: {str(e)}") - self.active_downloads.clear() - - # Shutdown thread pool - if hasattr(self, "download_pool"): - self.download_pool.shutdown(wait=True) + if "Unsupported URL" not in str(e): + logger.error(f"Error checking URL {url}: {str(e)}") + return False + except Exception as e: - logger.error(f"Error during VideoDownloader cleanup: {str(e)}") - - def _get_url_patterns(self) -> List[Tuple[str, str]]: - """Get URL patterns and names for supported sites""" - patterns = [] - try: - with yt_dlp.YoutubeDL() as ydl: - for ie in ydl._ies: - if hasattr(ie, "_VALID_URL") and ie._VALID_URL: - if not self.enabled_sites or any( - site.lower() in ie.IE_NAME.lower() - for site in self.enabled_sites - ): - patterns.append((ie._VALID_URL, ie.IE_NAME)) - except Exception as e: - logger.error(f"Error getting URL patterns: {str(e)}") - return patterns + logger.error(f"Error during URL check: {str(e)}") + return False def _check_file_size(self, info): """Check if file size is within limits""" @@ -342,16 +337,3 @@ class VideoDownloader: return False await asyncio.sleep(self.FILE_OP_RETRY_DELAY * (attempt + 1)) return False - - def is_supported_url(self, url: str) -> bool: - """Check if URL is supported using regex patterns""" - try: - # Try each pattern - for pattern, site_name in self.url_patterns: - if re.match(pattern, url): - logger.debug(f"URL matched pattern for {site_name}") - return True - return False - except Exception as e: - logger.error(f"Error checking URL support: {str(e)}") - return False