Update and rename export-docs.py to astro_docs_to_pdf.py

2026-02-04 19:45:12 -05:00 · 2024-12-09 21:01:16 -05:00
parent 056c7956e7
commit 5889ad9440
2 changed files with 700 additions and 440 deletions
--- a/astro_docs_to_pdf.py
+++ b/astro_docs_to_pdf.py
@@ -0,0 +1,700 @@
+"""
+Astro Documentation PDF Generator
+Modified version of Docs-Exporter for Astro documentation
+
+Original work Copyright (C) 2024 Riyooo
+Modified work Copyright (C) 2024 PacNPal
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Modifications:
+- Replaced wkhtmltopdf with Playwright for PDF generation
+- Added support for Astro's MDX format and frontmatter
+- Enhanced frontmatter parsing
+- Added automatic CSS generation
+- Improved error handling and reporting
+"""
+
+import os
+import markdown
+import tempfile
+import yaml
+import re
+import html
+import shutil
+from pathlib import Path
+from git import Repo, RemoteProgress, GitCommandError
+from datetime import datetime
+from packaging import version
+from tqdm import tqdm
+from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
+
+def get_license_notice():
+    """Return the plain-text AGPL notice that add_license_page() embeds in the generated PDF."""
+    return """
+This PDF was generated by Astro Documentation PDF Generator
+Original work Copyright (C) 2024 Riyooo
+Modified work Copyright (C) 2024 PacNPal
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License version 3.
+Source code is available at: [Your Repository URL]
+"""  # NOTE(review): "[Your Repository URL]" is an unfilled placeholder and is returned verbatim
+
+def add_license_page(html_content):
+    """Return html_content with a license-notice <div> (styled to start on a new page) appended at the end."""
+    license_html = f"""
+    <div class="license-notice" style="page-break-before: always;">
+        <h2>License Notice</h2>
+        <pre style="white-space: pre-wrap; font-family: monospace;">
+            {get_license_notice()}
+        </pre>
+        <p>Complete source code for this program is available at: [Your Repository URL]</p>
+        <p>This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you 
+        are welcome to redistribute it under certain conditions. See the GNU Affero General 
+        Public License version 3 for details.</p>
+    </div>
+    """
+    return html_content + license_html
+
+class DocumentationProcessingError(Exception):
+    """Raised when a processing step in this file fails; main() prints it and returns exit status 1."""
+    pass
+
+
+class CloneProgress(RemoteProgress):  # forwards git progress callbacks to a tqdm progress bar
+    def __init__(self):
+        super().__init__()
+        self.pbar = tqdm()  # created without a total; update() sets it once max_count is reported
+
+    def update(self, op_code, cur_count, max_count=None, message=''):
+        if max_count is not None:
+            self.pbar.total = max_count
+        self.pbar.update(cur_count - self.pbar.n)  # pass the difference between the reported count and the bar's current position
+
+    def finalize(self):
+        self.pbar.close()  # clone_repo() calls this from its finally block
+
+
+def cleanup_directory(directory):
+    """Remove directory and everything in it if it exists; a failure is printed as a warning, never raised."""
+    try:
+        if os.path.exists(directory):
+            shutil.rmtree(directory)
+    except Exception as e:
+        print(f"Warning: Failed to clean up directory {directory}: {e}")
+
+
+def process_image_paths(md_content):
+    """Rewrite MDX imports, relative markdown images and <Image> tags into absolute docs.astro.build image links."""
+    try:
+        # Turn `import name from 'path'` into a markdown image. NOTE(review): matches every default-style import, not only image files
+        mdx_pattern = r'import\s+(\w+)\s+from\s+[\'"]([^\'"]+)[\'"]'
+        def mdx_replace(match):
+            var_name = match.group(1)
+            image_path = match.group(2)
+            image_path = re.sub(r'^[./~]+', '', image_path)  # drop leading "./", "../", "~/" before appending to the site root
+            return f'![{var_name}](https://docs.astro.build/{image_path})'
+        
+        md_content = re.sub(mdx_pattern, mdx_replace, md_content)
+        
+        # Handle relative markdown images and SVGs; the negative lookahead leaves absolute http(s) URLs untouched
+        md_pattern = r'!\[(.*?)\]\((?!https?:)(.+?)\)'  # was "[^http]", a character class that skipped any path starting with h, t or p
+        def md_replace(match):
+            alt_text = match.group(1)
+            image_path = match.group(2)
+            image_path = re.sub(r'^[./~]+', '', image_path)
+            return f'![{alt_text}](https://docs.astro.build/{image_path})'
+        
+        md_content = re.sub(md_pattern, md_replace, md_content)
+        
+        # Handle Astro <Image src={...} alt="..."> components (src must come directly before alt)
+        component_pattern = r'<Image\s+src={([^}]+)}\s+alt="([^"]+)"[^>]*>'
+        def component_replace(match):
+            src = match.group(1).strip('{}').strip('"\'')
+            alt = match.group(2)
+            src = re.sub(r'^[./~]+', '', src)
+            return f'![{alt}](https://docs.astro.build/{src})'
+        
+        return re.sub(component_pattern, component_replace, md_content)
+    except Exception as e:
+        raise DocumentationProcessingError(f"Error processing image paths: {e}")
+
+
+def preprocess_code_blocks(md_content):
+    """Move each fenced block's title/language into a <div class="code-header"> and re-emit a plain language-only fence."""
+    try:
+        pattern = r'```(\w+)?[ \t]*([^\n`]*)\n(.*?)```'  # info string stays on the fence line, so code starting with "{" is not taken for attributes
+        
+        def replace(match):
+            language = match.group(1) or ''
+            attributes = match.group(2) or ''
+            code_block = match.group(3)
+            
+            # Pull an optional title="..." out of the info string (with or without surrounding braces)
+            title = ''
+            if attributes:
+                title_match = re.search(r'title="([^"]+)"', attributes)
+                if title_match:
+                    title = title_match.group(1)
+            
+            # Build header: "title (language)", either part optional
+            header_parts = []
+            if title:
+                header_parts.append(title)
+            if language:
+                header_parts.append(f"({language})")
+            
+            header_html = (f'<div class="code-header"><i>{" ".join(header_parts)}</i></div>' 
+                         if header_parts else '')
+            
+            return f'{header_html}\n```{language}\n{code_block.strip()}\n```'
+        
+        return re.sub(pattern, replace, md_content, flags=re.DOTALL)
+    except Exception as e:
+        raise DocumentationProcessingError(f"Error processing code blocks: {e}")
+
+
+def parse_frontmatter(md_content):  # NOTE(review): rebound by the second parse_frontmatter defined later in this file
+    """Split leading '---'-delimited frontmatter from the body; returns (frontmatter, content) or (None, md_content)."""
+    try:
+        lines = md_content.split('\n')
+        if lines[0].strip() == '---':
+            try:
+                end_of_frontmatter = lines[1:].index('---') + 1  # index of the closing marker within lines
+                frontmatter = '\n'.join(lines[1:end_of_frontmatter])
+                content = '\n'.join(lines[end_of_frontmatter + 1:])
+                
+                # Remove import statements from content (deletes from any "import" up to the next newline)
+                content = re.sub(r'import.*?\n', '', content)
+                
+                return frontmatter, content
+            except ValueError:  # no closing '---' line: treat the whole text as content
+                return None, md_content
+        return None, md_content
+    except Exception as e:
+        raise DocumentationProcessingError(f"Error parsing frontmatter: {e}")
+
+
+def safe_load_frontmatter(frontmatter_content):
+    """Return a metadata dict (regex-extracted title/description overlaid with the cleaned YAML), or None if empty."""
+    if not frontmatter_content:
+        return None
+
+    try:
+        # First pass: extract title and description by regex so they survive even if the YAML parse below fails
+        title_match = re.search(r'title:\s*[\'"]?(.*?)(?:[\'"]?\s*$|[\'"]?\s+\w+:)', frontmatter_content, re.MULTILINE)
+        desc_match = re.search(r'description:\s*[\'"]?(.*?)(?:[\'"]?\s*$|[\'"]?\s+\w+:)', frontmatter_content, re.MULTILINE)
+        
+        # Initialize metadata with found values
+        metadata = {}
+        if title_match:
+            title = title_match.group(1).strip(' \'"')
+            metadata['title'] = title
+        if desc_match:
+            description = desc_match.group(1).strip(' \'"')
+            metadata['description'] = description
+
+        # Clean up the frontmatter: keep only "key: value" lines that look safe to parse
+        lines = []
+        for line in frontmatter_content.split('\n'):
+            # Skip problematic lines
+            if any(skip in line for skip in ['githubIntegrationURL:', 'label:', 'maxHeadingLevel:']):
+                continue
+                
+            # Clean up quotes and URLs. NOTE(review): lines without ':' (e.g. YAML list items) are never appended
+            if ':' in line:
+                key, *value_parts = line.split(':', 1)
+                if value_parts:
+                    value = value_parts[0]
+                    # Handle truncated URLs
+                    if 'http' in value and "'" in value and not value.endswith("'"):
+                        continue
+                    # Handle other truncated quoted strings
+                    if value.count("'") == 1 or value.count('"') == 1:
+                        continue
+                    lines.append(line)
+                else:
+                    lines.append(line)
+
+        # Try to parse cleaned content
+        cleaned_content = '\n'.join(lines)
+        try:
+            parsed_data = yaml.safe_load(cleaned_content)
+            if isinstance(parsed_data, dict):
+                # Update metadata with any additional valid fields (parsed values win over the regex ones)
+                metadata.update(parsed_data)
+        except Exception:
+            pass  # best effort: keep the regex-extracted metadata, but let KeyboardInterrupt/SystemExit propagate
+
+        return metadata if metadata else None
+
+    except Exception as e:
+        print(f"Warning: Error processing frontmatter: {e}")
+        return None
+
+
+def parse_frontmatter(md_content):
+    """Split leading '---'-delimited frontmatter from the body; returns (frontmatter, content) or (None, md_content)."""
+    try:
+        lines = md_content.split('\n')
+        if lines[0].strip() == '---':
+            try:
+                # Find the closing --- marker, tolerating trailing whitespace just as the opening check does
+                end_of_frontmatter = [line.rstrip() for line in lines[1:]].index('---') + 1
+                frontmatter = '\n'.join(lines[1:end_of_frontmatter])
+                content = '\n'.join(lines[end_of_frontmatter + 1:])
+                
+                # Clean up frontmatter before it is handed to the YAML loader
+                frontmatter = re.sub(r'\s+i18nReady:\s*true', '', frontmatter)
+                frontmatter = re.sub(r':\s*\|', ': ', frontmatter)  # Handle YAML block indicators
+                
+                return frontmatter, content
+            except ValueError:  # no closing marker: treat the whole text as content
+                return None, md_content
+        return None, md_content
+    except Exception as e:
+        print(f"Warning: Error in document structure: {e}")
+        return None, md_content
+
+def clone_repo(repo_url, branch, docs_dir, repo_dir):
+    """Update repo_dir if it exists; otherwise init it with a sparse checkout limited to docs_dir, fetch and check out branch."""
+    progress = None
+    try:
+        if os.path.exists(repo_dir):
+            print("Updating existing repository...")
+            repo = Repo(repo_dir)
+            origin = repo.remotes.origin
+            origin.fetch()
+            repo.git.checkout(branch)
+            origin.pull()
+            print("Repository updated successfully.")
+            return
+
+        print("Cloning repository...")
+        os.makedirs(repo_dir, exist_ok=True)
+        progress = CloneProgress()
+        
+        # Initialize an empty repository and enable sparse checkout before anything is fetched
+        repo = Repo.init(repo_dir)
+        with repo.config_writer() as git_config:
+            git_config.set_value("core", "sparseCheckout", "true")
+
+        # Write the sparse-checkout pattern that selects only the documentation directory
+        sparse_checkout_path = Path(repo_dir) / ".git" / "info" / "sparse-checkout"
+        sparse_checkout_path.parent.mkdir(exist_ok=True)
+        sparse_checkout_path.write_text(f"/{docs_dir}/*\n")
+
+        # Add the remote, fetch with progress reporting, then check out the requested branch
+        origin = repo.create_remote("origin", repo_url)
+        origin.fetch(progress=progress)
+        repo.git.checkout(branch)
+        print("Repository cloned successfully.")
+
+    except GitCommandError as e:
+        cleanup_directory(repo_dir)  # NOTE(review): also removes a pre-existing checkout when the update path fails
+        raise DocumentationProcessingError(f"Git operation failed: {e}")
+    except Exception as e:
+        cleanup_directory(repo_dir)
+        raise DocumentationProcessingError(f"Repository operation failed: {e}")
+    finally:
+        if progress:  # only set on the fresh-clone path
+            progress.finalize()
+
+
+def get_files_sorted(root_dir):
+    """Return .md/.mdx paths under root_dir, sorted by a key that puts each directory's index file before its siblings."""
+    try:
+        all_files = []
+        excluded_dirs = {'node_modules', '.git', '_internal', 'dist', 'temp', '__pycache__'}
+        
+        for root, dirs, files in os.walk(root_dir):
+            # Prune in place so os.walk never descends into excluded or underscore-prefixed directories
+            dirs[:] = [d for d in dirs if d not in excluded_dirs and not d.startswith('_')]
+            
+            for file in files:
+                if file.endswith(('.md', '.mdx')):
+                    full_path = os.path.join(root, file)
+                    
+                    # Also skip when any component of the path (including root_dir's own) is an excluded name
+                    if any(x in Path(full_path).parts for x in excluded_dirs):
+                        continue
+                        
+                    # Index pages (.md or .mdx) get a '0' prefix so they sort ahead of the '1'-prefixed siblings
+                    is_index = file in ('index.md', 'index.mdx')
+                    dir_path = os.path.dirname(full_path)
+                    sort_key = f"{dir_path}/{'0' if is_index else '1'}{file}"
+                    all_files.append((full_path, sort_key))
+        
+        if not all_files:
+            raise DocumentationProcessingError(f"No markdown files found in {root_dir}")
+        
+        all_files.sort(key=lambda x: x[1])
+        return [full_path for full_path, _ in all_files]
+    except Exception as e:
+        raise DocumentationProcessingError(f"Error getting sorted files: {e}")
+
+
+def create_default_css():
+    """Write the default stylesheet to ./styles.css, overwriting any existing file; callers check for existence first."""
+    css_content = """
+body { 
+    font-family: 'Arial', sans-serif; 
+    line-height: 1.6; 
+    margin: 0; 
+    padding: 20px;
+    color: #1a1a1a;
+}
+.master-container { 
+    display: flex; 
+    justify-content: center; 
+    align-items: center; 
+    min-height: 100vh;
+}
+.container { 
+    text-align: center; 
+    max-width: 800px;
+    margin: 0 auto;
+}
+.title { 
+    font-size: 28px; 
+    font-weight: bold; 
+    margin-bottom: 20px;
+    color: #000;
+}
+.date { 
+    font-size: 16px; 
+    color: #666;
+}
+.page-break { 
+    page-break-after: always;
+}
+code { 
+    background-color: #f4f4f4; 
+    padding: 2px 4px; 
+    border-radius: 4px;
+    font-family: 'Courier New', monospace;
+    font-size: 0.9em;
+}
+pre { 
+    background-color: #f8f8f8; 
+    padding: 15px; 
+    border-radius: 5px; 
+    overflow-x: auto;
+    border: 1px solid #e1e1e1;
+}
+.code-header { 
+    background-color: #e0e0e0; 
+    padding: 8px 15px; 
+    border-radius: 5px 5px 0 0;
+    font-size: 0.9em;
+    color: #333;
+}
+table { 
+    border-collapse: collapse; 
+    width: 100%; 
+    margin: 15px 0;
+}
+th, td { 
+    border: 1px solid #ddd; 
+    padding: 12px; 
+    text-align: left;
+}
+th { 
+    background-color: #f5f5f5;
+    font-weight: bold;
+}
+img { 
+    max-width: 100%; 
+    height: auto; 
+    margin: 10px 0;
+    border-radius: 5px;
+}
+h1, h2, h3, h4, h5, h6 {
+    color: #2c3e50;
+    margin-top: 24px;
+    margin-bottom: 16px;
+}
+h1 { font-size: 2em; border-bottom: 1px solid #eee; padding-bottom: 0.3em; }
+h2 { font-size: 1.5em; }
+h3 { font-size: 1.25em; }
+a { color: #0366d6; text-decoration: none; }
+a:hover { text-decoration: underline; }
+blockquote {
+    margin: 0;
+    padding: 0 1em;
+    color: #6a737d;
+    border-left: 0.25em solid #dfe2e5;
+}
+.doc-path {
+    color: #666;
+    font-size: 0.9em;
+    margin-bottom: 20px;
+    padding: 8px;
+    background-color: #f8f9fa;
+    border-radius: 4px;
+}
+"""
+    try:
+        with open('styles.css', 'w', encoding='utf8') as f:
+            f.write(css_content.strip())  # strip() drops the blank first and last lines of the literal
+    except Exception as e:
+        raise DocumentationProcessingError(f"Failed to create CSS file: {e}")
+
+
+def process_files(files, repo_dir, docs_dir):
+    """Render files into one HTML document (numbered TOC, then one section per file); files without frontmatter are skipped."""
+    try:
+        if not os.path.exists('styles.css'):
+            create_default_css()
+            
+        with open('styles.css', 'r', encoding='utf8') as f:
+            css_content = f.read()
+            
+        toc = []
+        html_all_pages_content = []
+        numbering = [0]  # one counter per nesting depth, joined below into "1.2.3"-style section numbers
+
+        html_header = f"""
+        <!DOCTYPE html>
+        <html lang="en">
+        <head>
+            <meta charset="UTF-8">
+            <meta name="viewport" content="width=device-width, initial-scale=1.0">
+            <style>{css_content}</style>
+        </head>
+        <body>
+        """
+
+        for index, file_path in enumerate(files):
+            try:
+                with open(file_path, 'r', encoding='utf8') as f:
+                    md_content = f.read()
+
+                md_content = process_image_paths(md_content)
+                md_content = preprocess_code_blocks(md_content)
+                frontmatter, md_content = parse_frontmatter(md_content)
+
+                if frontmatter:
+                    data = safe_load_frontmatter(frontmatter)
+                    if data is not None:
+                        rel_path = os.path.relpath(file_path, os.path.join(repo_dir, docs_dir))
+                        depth = rel_path.count(os.sep)  # nesting level = number of directories below docs_dir
+                        
+                        if os.path.basename(file_path) in ('index.md', 'index.mdx') and depth > 0:
+                            depth -= 1  # an index page is numbered at its directory's level, not one below it
+                            
+                        indent = '&nbsp;' * 5 * depth
+
+                        while len(numbering) <= depth:  # grow the counter list down to this depth
+                            numbering.append(0)
+
+                        numbering[depth] += 1
+                        for i in range(depth + 1, len(numbering)):  # a new section restarts all deeper counters
+                            numbering[i] = 0
+
+                        toc_numbering = '.'.join(map(str, numbering[:depth + 1]))
+                        toc_title = html.escape(str(data.get('title') or Path(file_path).stem.title()))  # escaped: used as text and inside quoted attributes
+                        toc_full_title = f"{toc_numbering} - {toc_title}"
+                        
+                        toc.append(f"{indent}<a href='#{toc_full_title}'>{toc_full_title}</a><br/>")
+
+                        html_page_content = [
+                            f"<h1 id='{toc_full_title}'>{toc_full_title}</h1>",
+                            f"<div class='doc-path'><p>Documentation path: {Path(file_path).relative_to(Path(repo_dir) / docs_dir).as_posix()}</p></div>"
+                        ]
+
+                        if 'description' in data:
+                            html_page_content.append(f"<p><strong>Description:</strong> {html.escape(str(data['description']))}</p>")
+                            html_page_content.append('<br/>')
+                        
+                        # Convert Markdown to HTML with extended features
+                        html_page_content.append(markdown.markdown(
+                            md_content,
+                            extensions=['fenced_code', 'codehilite', 'tables', 'footnotes', 'toc', 'attr_list', 'def_list']
+                        ))
+                        
+                        html_all_pages_content.append('\n'.join(html_page_content))
+
+                        if index < len(files) - 1:  # no page break after the last file
+                            html_all_pages_content.append('<div class="page-break"></div>')
+                            
+            except Exception as e:  # one unreadable or malformed file must not abort the whole run
+                print(f"Warning: Error processing file {file_path}: {e}")
+                continue
+
+        if not html_all_pages_content:
+            raise DocumentationProcessingError("No content was successfully processed")
+
+        # Create table of contents; the trailing <div> wraps all page content and is closed when the parts are joined
+        toc_html = f"""
+        <div style="padding-bottom: 10px">
+            <div style="padding-bottom: 20px">
+                <h1>Table of Contents</h1>
+            </div>
+            {''.join(toc)}
+        </div>
+        <div style="page-break-before: always;">
+        """
+
+        # Combine all content and close the wrapper <div> opened at the end of toc_html
+        final_content = '\n'.join(html_all_pages_content)
+        html_all_content = f"{html_header}{toc_html}{final_content}</div></body></html>"
+
+        return html_all_content
+
+    except Exception as e:
+        raise DocumentationProcessingError(f"Error processing documentation: {e}")
+
+
+def generate_pdf(html_content, output_pdf, format_options=None):
+    """Render html_content in Playwright's Chromium and write it to output_pdf; any failure is raised as DocumentationProcessingError."""
+    default_format = {
+        'format': 'A4',
+        'margin': {
+            'top': '50px',
+            'right': '50px',
+            'bottom': '50px',
+            'left': '50px'
+        },
+        'print_background': True,
+        'display_header_footer': True,
+        'header_template': '<div style="font-size: 10px; text-align: right; width: 100%; padding-right: 20px; margin-top: 20px;"><span class="pageNumber"></span> of <span class="totalPages"></span></div>',
+        'footer_template': '<div style="font-size: 10px; text-align: center; width: 100%; margin-bottom: 20px;"><span class="url"></span></div>'
+    }
+    
+    format_options = format_options or default_format  # a caller-supplied dict replaces the defaults wholesale; nothing is merged
+
+    try:
+        playwright = sync_playwright().start()
+        browser = playwright.chromium.launch()
+        context = browser.new_context()
+        page = context.new_page()
+        
+        # Set viewport size for consistent rendering
+        page.set_viewport_size({"width": 1280, "height": 1024})
+        
+        # Increase timeouts for better reliability
+        page.set_default_timeout(120000)  # 2 minutes
+        
+        # Set content and wait for loading
+        page.set_content(html_content, wait_until='networkidle')
+        
+        # Additional waits for content
+        page.wait_for_load_state('networkidle')
+        page.wait_for_load_state('domcontentloaded')
+        
+        # Generate PDF; the options dict is expanded into keyword arguments
+        page.pdf(path=output_pdf, **format_options)
+        
+    except Exception as e:
+        raise DocumentationProcessingError(f"Error generating PDF: {str(e)}")
+    finally:
+        if 'page' in locals():  # each name exists only if its assignment in the try block ran
+            page.close()
+        if 'context' in locals():
+            context.close()
+        if 'browser' in locals():
+            browser.close()
+        if 'playwright' in locals():
+            playwright.stop()
+
+def main():
+    """Fetch the Astro docs, render them to one HTML string and write a dated PDF; returns 1 on failure, None on success."""
+    repo_dir = "astro-docs"
+    repo_url = "https://github.com/withastro/docs.git"
+    branch = "main"
+    docs_dir = "src/content/docs/en"
+    
+    output_pdf = f"Astro_Documentation_{datetime.now().strftime('%Y-%m-%d')}.pdf"
+    temp_dir = None
+
+    try:
+        # Add version and license information to console
+        print(f"""
+Astro Documentation PDF Generator v1.0.0
+Copyright (C) 2024 PacNPal
+This program comes with ABSOLUTELY NO WARRANTY; for details see the LICENSE file.
+This is free software, and you are welcome to redistribute it
+under certain conditions; see the LICENSE file for details.
+""")
+        # Create CSS if it doesn't exist
+        if not os.path.exists('styles.css'):
+            print("Creating default styles.css...")
+            create_default_css()
+            print("Default CSS file created.")
+        
+        # Create temporary directory. NOTE(review): nothing below writes into it; it is only removed in finally
+        temp_dir = tempfile.mkdtemp()
+        
+        # Clone repository (or update it when repo_dir already exists)
+        print("Cloning Astro documentation repository...")
+        clone_repo(repo_url, branch, docs_dir, repo_dir)
+
+        # Get and sort files
+        print("Finding and sorting documentation files...")
+        docs_dir_full_path = os.path.join(repo_dir, docs_dir)
+        files_to_process = get_files_sorted(docs_dir_full_path)
+        print(f"Found {len(files_to_process)} files to process")
+
+        # Create cover page (a complete HTML document of its own, styled with the same CSS)
+        with open('styles.css', 'r', encoding='utf8') as f:
+            css_content = f.read()
+            
+        cover_html = f"""
+        <!DOCTYPE html>
+        <html lang="en">
+        <head>
+            <meta charset="UTF-8">
+            <style>{css_content}</style>
+        </head>
+        <body>
+            <div class="master-container">
+                <div class="container">
+                    <div class="title">Astro Documentation</div>
+                    <div class="date">Generated on {datetime.now().strftime('%Y-%m-%d')}</div>
+                </div>
+            </div>
+        </body>
+        </html>
+        """
+
+        # Process files and generate HTML
+        print("Processing documentation files...")
+        html_content = process_files(files_to_process, repo_dir, docs_dir)
+        
+        # Combine cover and content: two full documents are concatenated and the license block is appended after them
+        final_html = f"{cover_html}<div class='page-break'></div>{html_content}"
+        final_html = add_license_page(final_html)
+
+        # Generate PDF
+        print(f"Generating PDF: {output_pdf}")
+        generate_pdf(final_html, output_pdf)
+        
+        print(f"Documentation successfully generated: {output_pdf}")
+        
+    except DocumentationProcessingError as e:
+        print(f"Error: {e}")
+        return 1
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        print("This program is licensed under AGPL-3.0. Source code is available at: [Your Repository URL]")
+        return 1
+    finally:
+        # Cleanup of the temporary directory created above
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir, ignore_errors=True)
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # same exit status as exit(main()), without relying on the site-provided exit() helper
--- a/export-docs.py
+++ b/export-docs.py
@@ -1,440 +0,0 @@
-"""
-Nextjs Documentation PDF Generator
-Modified version of Docs-Exporter for Astro documentation
-
-Original work Copyright (C) 2024 Riyooo
-Modified work Copyright (C) 2024 PacNPal
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU Affero General Public License for more details.
-
-You should have received a copy of the GNU Affero General Public License
-along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-Modifications:
- Replaced wkhtmltopdf with Playwright for PDF generation
- Improved error handling and reporting
- Added support for custom headers, footers, and styles in the generated PDFs.
- Enhanced error handling with `try-except` blocks, especially for frontmatter parsing and file operations.
-"""
-
-import os
-import markdown
-import tempfile
-import yaml
-import re
-import html
-from git import Repo, RemoteProgress
-from datetime import datetime
-from packaging import version
-from tqdm import tqdm
-from playwright.sync_api import sync_playwright
-
-def get_license_notice():
-    """Return a formatted license notice for inclusion in output"""
-    return """
-This PDF was generated by Nextjs Documentation PDF Generator
-Original work Copyright (C) 2024 Riyooo
-Modified work Copyright (C) 2024 PacNPal
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License version 3.
-Source code is available at: https://github.com/pacnpal/Docs-Exporter
-"""
-
-def add_license_page(html_content):
-    """Add a license notice page to the HTML content"""
-    license_html = f"""
-    <div class="license-notice" style="page-break-before: always;">
-        <h2>License Notice</h2>
-        <pre style="white-space: pre-wrap; font-family: monospace;">
-            {get_license_notice()}
-        </pre>
-        <p>Complete source code for this program is available at: https://github.com/pacnpal/Docs-Exporter</p>
-        <p>This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you 
-        are welcome to redistribute it under certain conditions. See the GNU Affero General 
-        Public License version 3 for details.</p>
-    </div>
-    """
-    return html_content + license_html
-
-def process_image_paths(md_content):
-    # Define a regular expression pattern to find image tags
-    pattern = r'src(?:Light|Dark)="(.*?)"'
-
-    # Function to replace the relative path with an absolute path
-    def replace(match):
-        relative_path = match.group(1)
-        absolute_path = f'{base_path}{relative_path}{path_args}'
-
-        # Print the original and new image URLs for debugging
-        return f'src="{absolute_path}"'
-
-    # Use the sub method to replace all occurrences
-    return re.sub(pattern, replace, md_content)
-
-
-def preprocess_code_blocks(md_content):
-    # Regular expression to match extended code blocks with filename and language
-    pattern = r'```(\w+)?\s+filename="([^"]+)"\s*(switcher)?\n(.*?)```'
-
-    def replace(match):
-        language = match.group(1) if match.group(1) else ''
-        filename = match.group(2)
-        code_block = match.group(4)
-
-        # Format the header with filename and language
-        header = f'<div class="code-header"><i>{filename} ({language})</i></div>' if language else f'<div class="code-header"><i>{filename}</i></div>'
-
-        return f'{header}\n```{language}\n{code_block}\n```'
-
-    # Replace all occurrences in the content
-    return re.sub(pattern, replace, md_content, flags=re.DOTALL)
-
-
-def safe_load_frontmatter(frontmatter_content):
-    try:
-        return yaml.safe_load(frontmatter_content)
-    except yaml.YAMLError:
-        return None
-
-
-def preprocess_mdx_content(md_content):
-    # Replace HTML tags in frontmatter
-    md_content = re.sub(r'<(/?\w+)>', lambda m: html.escape(m.group(0)), md_content)
-    return md_content
-
-
-def parse_frontmatter(md_content):
-    lines = md_content.split('\n')
-    if lines[0].strip() == '---':
-        end_of_frontmatter = lines.index('---', 1)
-        frontmatter = '\n'.join(lines[1:end_of_frontmatter])
-        content = '\n'.join(lines[end_of_frontmatter + 1:])
-        return frontmatter, content
-    return None, md_content
-
-
-class CloneProgress(RemoteProgress):
-    def __init__(self):
-        super().__init__()
-        self.pbar = tqdm()
-
-    def update(self, op_code, cur_count, max_count=None, message=''):
-        if max_count is not None:
-            self.pbar.total = max_count
-        self.pbar.update(cur_count - self.pbar.n)
-
-    def finalize(self):
-        self.pbar.close()
-
-
-def clone_repo(repo_url, branch, docs_dir, repo_dir):
-    if not os.path.isdir(repo_dir):
-        os.makedirs(repo_dir, exist_ok=True)
-        print("Cloning repository...")
-        repo = Repo.init(repo_dir)
-        with repo.config_writer() as git_config:
-            git_config.set_value("core", "sparseCheckout", "true")
-
-        with open(os.path.join(repo_dir, ".git/info/sparse-checkout"), "w") as sparse_checkout_file:
-            sparse_checkout_file.write(f"/{docs_dir}\n")
-
-        origin = repo.create_remote("origin", repo_url)
-        origin.fetch(progress=CloneProgress())
-        repo.git.checkout(branch)
-        print("Repository cloned.")
-    else:
-        print("Repository already exists. Updating...")
-        repo = Repo(repo_dir)
-        origin = repo.remotes.origin
-        origin.fetch(progress=CloneProgress())
-        repo.git.checkout(branch)
-        origin.pull(progress=CloneProgress())
-        print("Repository updated.")
-
-
-def is_file_open(file_path):
-    if not os.path.exists(file_path):
-        return False
-    try:
-        with open(file_path, 'a'):
-            pass
-        return False
-    except PermissionError:
-        return True
-
-
-def get_files_sorted(root_dir):
-    all_files = []
-    for root, _, files in os.walk(root_dir):
-        for file in files:
-            full_path = os.path.join(root, file)
-            modified_basename = '!!!' + file if file in ['index.mdx', 'index.md'] else file
-            sort_key = os.path.join(root, modified_basename)
-            all_files.append((full_path, sort_key))
-    all_files.sort(key=lambda x: x[1])
-    return [full_path for full_path, _ in all_files]
-
-
-def preprocess_frontmatter(frontmatter):
-    html_tags = {}
-
-    def replace_tag(match):
-        tag = match.group(0)
-        placeholder = f"HTML_TAG_{len(html_tags)}"
-        html_tags[placeholder] = tag
-        return placeholder
-
-    modified_frontmatter = re.sub(r'<[^>]+>', replace_tag, frontmatter)
-    return modified_frontmatter, html_tags
-
-
-def restore_html_tags(parsed_data, html_tags):
-    if isinstance(parsed_data, dict):
-        for key, value in parsed_data.items():
-            if isinstance(value, str):
-                for placeholder, tag in html_tags.items():
-                    value = value.replace(placeholder, tag)
-                value = html.escape(value)
-                parsed_data[key] = value
-    return parsed_data
-
-
-def process_files(files, repo_dir, docs_dir):
-    toc = ""
-    html_all_pages_content = ""
-
-    html_header = f"""
-    <html>
-    <head>
-        <style>
-            {open('styles.css').read()}
-        </style>
-    </head>
-    <body>
-    """
-
-    numbering = [0]
-
-    for index, file_path in enumerate(files):
-        with open(file_path, 'r', encoding='utf8') as f:
-            md_content = f.read()
-
-            if Change_img_url:
-                md_content = process_image_paths(md_content)
-
-            md_content = preprocess_code_blocks(md_content)
-            frontmatter, md_content = parse_frontmatter(md_content)
-
-            if frontmatter:
-                frontmatter, html_tags = preprocess_frontmatter(frontmatter)
-                data = safe_load_frontmatter(frontmatter)
-                if data is not None:
-                    data = restore_html_tags(data, html_tags)
-                    rel_path = os.path.relpath(file_path, os.path.join(repo_dir, docs_dir))
-                    depth = rel_path.count(os.sep)
-                    file_basename = os.path.basename(file_path)
-                    if file_basename.startswith("index.") and depth > 0:
-                        depth += -1
-                    indent = '&nbsp;' * 5 * depth
-
-                    while len(numbering) <= depth:
-                        numbering.append(0)
-
-                    numbering[depth] += 1
-
-                    for i in range(depth + 1, len(numbering)):
-                        numbering[i] = 0
-                    
-                    toc_numbering = f"{'.'.join(map(str, numbering[:depth + 1]))}"
-                    toc_title = data.get('title', os.path.splitext(os.path.basename(file_path))[0].title())
-                    toc_full_title = f"{toc_numbering} - {toc_title}"
-                    toc += f"{indent}<a href='#{toc_full_title}'>{toc_full_title}</a><br/>"
-
-                    html_page_content = f"""
-                    <h1>{toc_full_title}</h1>
-                    <div class="doc-path"><p>Documentation path: {file_path.replace(chr(92),'/').replace('.mdx', '').replace(repo_dir + '/' + docs_dir,'')}</p></div>
-                    <p><strong>Description:</strong> {data.get('description', 'No description')}</p>
-                    """
-                    if data.get('related', {}):
-                        html_page_content += f"""
-                        <div style="margin-left:20px;">
-                            <p><strong>Related:</strong></p>
-                            <p><strong>Title:</strong> {data.get('related', {}).get('title', 'Related')}</p>
-                            <p><strong>Related Description:</strong> {data.get('related', {}).get('description', 'No related description')}</p>
-                            <p><strong>Links:</strong></p>
-                        <ul>
-                            {''.join([f'<li>{link}</li>' for link in data.get('related', {}).get('links', [])])}
-                        </ul>
-                        </div>
-                        """
-                    html_page_content += '</br>'
-                else:
-                    html_page_content = ""
-            else:
-                html_page_content = ""
-
-            html_page_content += markdown.markdown(md_content, extensions=['fenced_code', 'codehilite', 'tables', 'footnotes', 'toc', 'abbr', 'attr_list', 'def_list', 'smarty', 'admonition'])
-            html_all_pages_content += html_page_content
-
-            if index < len(files) - 1:
-                html_all_pages_content += '<div class="page-break"></div>'
-    
-    toc_html = f"""<div style="padding-bottom: 10px"><div style="padding-bottom: 20px"><h1>Table of Contents</h1></div>{toc}</div><div style="page-break-before: always;">"""
-    html_all_content = toc_html + html_all_pages_content
-
-    html_all_pages_content = html_header + html_all_pages_content + "</body></html>"
-    toc_html = html_header + toc_html + "</body></html>"
-    html_all_content = html_header + html_all_content + "</body></html>"
-
-    return(html_all_content, toc_html, html_all_pages_content)
-
-
-def find_latest_version(html_content):
-    version_pattern = re.compile(r"v(\d+\.\d+\.\d+)")
-    versions = version_pattern.findall(html_content)
-    unique_versions = sorted(set(versions), key=lambda v: version.parse(v), reverse=True)
-    return unique_versions[0] if unique_versions else None
-
-
-def generate_pdf(html_content, output_pdf, format_options=None):
-    """
-    Generate PDF from HTML content using Playwright
-    """
-    default_format = {
-        'format': 'A4',
-        'margin': {
-            'top': '50px',
-            'right': '50px',
-            'bottom': '50px',
-            'left': '50px'
-        },
-        'print_background': True,
-        'display_header_footer': True,
-        'header_template': '<div style="font-size: 10px; text-align: right; width: 100%; padding-right: 20px; margin-top: 20px;"><span class="pageNumber"></span> of <span class="totalPages"></span></div>',
-        'footer_template': '<div style="font-size: 10px; text-align: center; width: 100%; margin-bottom: 20px;"><span class="url"></span></div>'
-    }
-    
-    format_options = format_options or default_format
-
-    with sync_playwright() as p:
-        browser = p.chromium.launch()
-        page = browser.new_page()
-        
-        # Set viewport size to ensure consistent rendering
-        page.set_viewport_size({"width": 1280, "height": 1024})
-        
-        # Set content and wait for network idle
-        page.set_content(html_content, wait_until='networkidle')
-        
-        # Wait for any images and fonts to load
-        page.wait_for_load_state('networkidle')
-        page.wait_for_load_state('domcontentloaded')
-        
-        # Generate PDF
-        page.pdf(path=output_pdf, **format_options)
-        
-        browser.close()
-
-
-if __name__ == "__main__":
-    export_html = False
-
-    repo_dir = "nextjs-docs" 
-    repo_url = "https://github.com/vercel/next.js.git"
-    branch = "canary"
-    docs_dir = "docs"
-
-    Change_img_url = True
-    base_path = "https://nextjs.org/_next/image?url="
-    path_args = "&w=1920&q=75"
-
-    clone_repo(repo_url, branch, docs_dir, repo_dir)
-    print(f"""
-    Nextjs Documentation PDF Generator v1.0.0
-    Copyright (C) 2024 PacNPal
-    This program comes with ABSOLUTELY NO WARRANTY; for details see the LICENSE file.
-    This is free software, and you are welcome to redistribute it
-    under certain conditions; see the LICENSE file for details.
-    """)
-    print("Converting the Documentation to HTML...")
-    docs_dir_full_path = os.path.join(repo_dir, docs_dir)
-    files_to_process = get_files_sorted(docs_dir_full_path)
-    html_all_content, _, _ = process_files(files_to_process, repo_dir, docs_dir)
-    print("Converted all MDX to HTML.")
-
-    if export_html:
-        with open('output.html', 'w', encoding='utf8') as f:
-            f.write(html_all_content)
-            print("HTML Content exported.")
-
-    latest_version = find_latest_version(html_all_content)
-    if latest_version:
-        project_title = f"""Next.js Documentation v{latest_version}"""
-        output_pdf = f"""Next.js_Docs_v{latest_version}_{datetime.now().strftime("%Y-%m-%d")}.pdf"""
-    else:
-        project_title = "Next.js Documentation"
-        output_pdf = "Next.js_Documentation.pdf"
-
-    cover_html = f"""
-    <html>
-        <head>
-            <style>
-                {open('styles.css').read()}
-            </style>
-        </head>
-        <body>
-            <div class="master-container">
-                <div class="container">
-                    <div class="title">{project_title}</div>
-                    <div class="date">Date: {datetime.now().strftime("%Y-%m-%d")}</div>
-                </div>
-            </div>
-        </body>
-    </html>
-    """
-
-    format_options = {
-                'format': 'A4',
-                'margin': {
-                    'top': '50px',
-                    'right': '50px',
-                    'bottom': '50px',
-                    'left': '50px'
-                },
-                'print_background': True,
-                'display_header_footer': True,
-                'header_template': f'''
-                    <div style="font-size: 10px; padding: 10px 20px; margin-top: 20px;">
-                        <span style="float: left;">{project_title}</span>
-                        <span style="float: right;">Page <span class="pageNumber"></span> of <span class="totalPages"></span></span>
-                    </div>
-                ''',
-                'footer_template': f'''
-                    <div style="font-size: 10px; padding: 10px 20px; margin-bottom: 20px; text-align: center;">
-                        Generated on {datetime.now().strftime("%Y-%m-%d")}
-                    </div>
-                '''
-            }
-
-            # Check if file is open
-    if is_file_open(output_pdf):
-                print("The output file is already open in another process. Please close it and try again.")
-    else:
-                try:
-                    print("Generating PDF...")
-                    # Generate PDF with cover page and content
-                    html_all_content = add_license_page(html_all_content)
-                    generate_pdf(cover_html + html_all_content, output_pdf, format_options)
-                    print("Created the PDF file successfully.")
-
-                except Exception as e:
-                    print(f"Error generating PDF: {str(e)}")