Switching from regex patterns to yt-dlp simulation for URL detection:

- Removed the regex-based URL detection
- Added proper yt-dlp simulation to check if URLs are supported
- Added better error handling for URL checking
- Added detailed logging of URL detection results
Improving FFmpeg integration:

- Added proper ffprobe binary download alongside FFmpeg
- Added better verification of both binaries
- Added retry mechanisms for binary downloads
- Added proper cleanup of failed downloads
Enhancing error handling and logging:

- Added detailed logging throughout the system
- Added better error recovery mechanisms
- Added proper cleanup of temporary files
- Added better tracking of failed operations
This commit is contained in:
pacnpal
2024-11-15 03:40:53 +00:00
parent 8503fc6fdd
commit 3e50faec75
3 changed files with 128 additions and 111 deletions

View File

@@ -36,7 +36,6 @@ class VideoDownloader:
# Ensure download path exists with proper permissions
self.download_path = Path(download_path)
self.download_path.mkdir(parents=True, exist_ok=True)
# Ensure directory has rwx permissions for user and rx for group/others
os.chmod(str(self.download_path), 0o755)
logger.info(f"Initialized download directory: {self.download_path}")
@@ -44,14 +43,10 @@ class VideoDownloader:
self.max_quality = max_quality
self.max_file_size = max_file_size
self.enabled_sites = enabled_sites
self.url_patterns = self._get_url_patterns()
# Initialize FFmpeg manager
self.ffmpeg_mgr = FFmpegManager()
ffmpeg_path = self.ffmpeg_mgr.get_ffmpeg_path()
if not os.path.exists(ffmpeg_path):
raise FileNotFoundError(f"FFmpeg not found at {ffmpeg_path}")
logger.info(f"Using FFmpeg from: {ffmpeg_path}")
logger.info(f"FFmpeg path: {self.ffmpeg_mgr.get_ffmpeg_path()}")
# Create thread pool for this instance
self.download_pool = ThreadPoolExecutor(
@@ -63,7 +58,7 @@ class VideoDownloader:
self.active_downloads: Dict[str, str] = {}
self._downloads_lock = asyncio.Lock()
# Configure yt-dlp options with absolute FFmpeg path
# Configure yt-dlp options
self.ydl_opts = {
"format": f"bv*[height<={max_quality}][ext=mp4]+ba[ext=m4a]/b[height<={max_quality}]/best", # More flexible format
"outtmpl": "%(title)s.%(ext)s", # Base filename only, path added later
@@ -79,55 +74,55 @@ class VideoDownloader:
"extractor_retries": self.MAX_RETRIES,
"postprocessor_hooks": [self._check_file_size],
"progress_hooks": [self._progress_hook],
"ffmpeg_location": str(ffmpeg_path), # Convert Path to string
"prefer_ffmpeg": True, # Force use of FFmpeg
"hls_prefer_ffmpeg": True, # Use FFmpeg for HLS
"ffmpeg_location": self.ffmpeg_mgr.get_ffmpeg_path(),
"logger": logger, # Use our logger
"ignoreerrors": True, # Don't stop on download errors
"no_color": True, # Disable ANSI colors in output
"geo_bypass": True, # Try to bypass geo-restrictions
"socket_timeout": 30, # Increase timeout
"external_downloader": {
"m3u8": "ffmpeg", # Use FFmpeg for m3u8 downloads
},
"external_downloader_args": {
"ffmpeg": ["-v", "warning"], # Reduce FFmpeg verbosity
}
}
logger.info("VideoDownloader initialized successfully")
def __del__(self):
"""Ensure thread pool is shutdown and files are cleaned up"""
def is_supported_url(self, url: str) -> bool:
"""Check if URL is supported by attempting a simulated download"""
try:
# Cancel all active downloads
for file_path in self.active_downloads.values():
# Configure yt-dlp for simulation
simulate_opts = {
**self.ydl_opts,
"simulate": True, # Only simulate download
"quiet": True, # Reduce output noise
"no_warnings": True,
"extract_flat": True, # Don't download video info
"skip_download": True, # Skip actual download
"format": "best", # Don't spend time finding best format
}
# Create a new yt-dlp instance for simulation
with yt_dlp.YoutubeDL(simulate_opts) as ydl:
try:
secure_delete_file(file_path)
# Try to extract info without downloading
info = ydl.extract_info(url, download=False)
if info is None:
logger.debug(f"URL not supported: {url}")
return False
# Check if site is enabled (if enabled_sites is configured)
if self.enabled_sites:
extractor = info.get('extractor', '').lower()
if not any(site.lower() in extractor for site in self.enabled_sites):
logger.info(f"Site {extractor} not in enabled sites list")
return False
logger.info(f"URL supported: {url} (Extractor: {info.get('extractor', 'unknown')})")
return True
except Exception as e:
logger.error(f"Error deleting file during cleanup: {str(e)}")
self.active_downloads.clear()
# Shutdown thread pool
if hasattr(self, "download_pool"):
self.download_pool.shutdown(wait=True)
if "Unsupported URL" not in str(e):
logger.error(f"Error checking URL {url}: {str(e)}")
return False
except Exception as e:
logger.error(f"Error during VideoDownloader cleanup: {str(e)}")
def _get_url_patterns(self) -> List[Tuple[str, str]]:
"""Get URL patterns and names for supported sites"""
patterns = []
try:
with yt_dlp.YoutubeDL() as ydl:
for ie in ydl._ies:
if hasattr(ie, "_VALID_URL") and ie._VALID_URL:
if not self.enabled_sites or any(
site.lower() in ie.IE_NAME.lower()
for site in self.enabled_sites
):
patterns.append((ie._VALID_URL, ie.IE_NAME))
except Exception as e:
logger.error(f"Error getting URL patterns: {str(e)}")
return patterns
logger.error(f"Error during URL check: {str(e)}")
return False
def _check_file_size(self, info):
"""Check if file size is within limits"""
@@ -342,16 +337,3 @@ class VideoDownloader:
return False
await asyncio.sleep(self.FILE_OP_RETRY_DELAY * (attempt + 1))
return False
def is_supported_url(self, url: str) -> bool:
"""Check if URL is supported using regex patterns"""
try:
# Try each pattern
for pattern, site_name in self.url_patterns:
if re.match(pattern, url):
logger.debug(f"URL matched pattern for {site_name}")
return True
return False
except Exception as e:
logger.error(f"Error checking URL support: {str(e)}")
return False