From 5889ad944082bd18504aec54bb5e32106e42ff33 Mon Sep 17 00:00:00 2001 From: pacnpal <183241239+pacnpal@users.noreply.github.com> Date: Mon, 9 Dec 2024 21:01:16 -0500 Subject: [PATCH] Update and rename export-docs.py to astro_docs_to_pdf.py --- astro_docs_to_pdf.py | 700 +++++++++++++++++++++++++++++++++++++++++++ export-docs.py | 440 --------------------------- 2 files changed, 700 insertions(+), 440 deletions(-) create mode 100644 astro_docs_to_pdf.py delete mode 100644 export-docs.py diff --git a/astro_docs_to_pdf.py b/astro_docs_to_pdf.py new file mode 100644 index 0000000..da58476 --- /dev/null +++ b/astro_docs_to_pdf.py @@ -0,0 +1,700 @@ +""" +Astro Documentation PDF Generator +Modified version of Docs-Exporter for Astro documentation + +Original work Copyright (C) 2024 Riyooo +Modified work Copyright (C) 2024 PacNPal + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . + +Modifications: +- Replaced wkhtmltopdf with Playwright for PDF generation +- Added support for Astro's MDX format and frontmatter +- Enhanced frontmatter parsing +- Added automatic CSS generation +- Improved error handling and reporting +""" + +import os +import markdown +import tempfile +import yaml +import re +import html +import shutil +from pathlib import Path +from git import Repo, RemoteProgress, GitCommandError +from datetime import datetime +from packaging import version +from tqdm import tqdm +from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError + +def get_license_notice(): + """Return a formatted license notice for inclusion in output""" + return """ +This PDF was generated by Astro Documentation PDF Generator +Original work Copyright (C) 2024 Riyooo +Modified work Copyright (C) 2024 PacNPal + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License version 3. +Source code is available at: [Your Repository URL] +""" + +def add_license_page(html_content): + """Add a license notice page to the HTML content""" + license_html = f""" +
+

License Notice

+
+            {get_license_notice()}
+        
+

Complete source code for this program is available at: [Your Repository URL]

+

This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you + are welcome to redistribute it under certain conditions. See the GNU Affero General + Public License version 3 for details.

+
+ """ + return html_content + license_html + +class DocumentationProcessingError(Exception): + """Custom exception for documentation processing errors""" + pass + + +class CloneProgress(RemoteProgress): + def __init__(self): + super().__init__() + self.pbar = tqdm() + + def update(self, op_code, cur_count, max_count=None, message=''): + if max_count is not None: + self.pbar.total = max_count + self.pbar.update(cur_count - self.pbar.n) + + def finalize(self): + self.pbar.close() + + +def cleanup_directory(directory): + """Safely clean up a directory""" + try: + if os.path.exists(directory): + shutil.rmtree(directory) + except Exception as e: + print(f"Warning: Failed to clean up directory {directory}: {e}") + + +def process_image_paths(md_content): + """Process both MDX imports and markdown images, including SVGs""" + try: + # Handle MDX image imports + mdx_pattern = r'import\s+(\w+)\s+from\s+[\'"]([^\'"]+)[\'"]' + def mdx_replace(match): + var_name = match.group(1) + image_path = match.group(2) + image_path = re.sub(r'^[./~]+', '', image_path) + return f'![{var_name}](https://docs.astro.build/{image_path})' + + md_content = re.sub(mdx_pattern, mdx_replace, md_content) + + # Handle standard markdown images and SVGs + md_pattern = r'!\[(.*?)\]\(([^http].*?)\)' + def md_replace(match): + alt_text = match.group(1) + image_path = match.group(2) + image_path = re.sub(r'^[./~]+', '', image_path) + return f'![{alt_text}](https://docs.astro.build/{image_path})' + + md_content = re.sub(md_pattern, md_replace, md_content) + + # Handle Astro image components + component_pattern = r']*>' + def component_replace(match): + src = match.group(1).strip('{}').strip('"\'') + alt = match.group(2) + src = re.sub(r'^[./~]+', '', src) + return f'![{alt}](https://docs.astro.build/{src})' + + return re.sub(component_pattern, component_replace, md_content) + except Exception as e: + raise DocumentationProcessingError(f"Error processing image paths: {e}") + + +def preprocess_code_blocks(md_content): + """Handle Astro's code blocks with advanced features""" + try: + pattern = r'```(?:(\w+))?\s*(?:\{([^}]*)\})?\s*(.*?)```' + + def replace(match): + language = match.group(1) or '' + attributes = match.group(2) or '' + code_block = match.group(3) + + # Parse attributes + title = '' + if attributes: + title_match = re.search(r'title="([^"]+)"', attributes) + if title_match: + title = title_match.group(1) + + # Build header + header_parts = [] + if title: + header_parts.append(title) + if language: + header_parts.append(f"({language})") + + header_html = (f'
{" ".join(header_parts)}
' + if header_parts else '') + + return f'{header_html}\n```{language}\n{code_block.strip()}\n```' + + return re.sub(pattern, replace, md_content, flags=re.DOTALL) + except Exception as e: + raise DocumentationProcessingError(f"Error processing code blocks: {e}") + + +def parse_frontmatter(md_content): + """Parse frontmatter with support for Astro's format""" + try: + lines = md_content.split('\n') + if lines[0].strip() == '---': + try: + end_of_frontmatter = lines[1:].index('---') + 1 + frontmatter = '\n'.join(lines[1:end_of_frontmatter]) + content = '\n'.join(lines[end_of_frontmatter + 1:]) + + # Remove import statements from content + content = re.sub(r'import.*?\n', '', content) + + return frontmatter, content + except ValueError: + return None, md_content + return None, md_content + except Exception as e: + raise DocumentationProcessingError(f"Error parsing frontmatter: {e}") + + +def safe_load_frontmatter(frontmatter_content): + """Safely load YAML frontmatter with robust error handling""" + if not frontmatter_content: + return None + + try: + # First pass: try to extract title and description using regex + title_match = re.search(r'title:\s*[\'"]?(.*?)(?:[\'"]?\s*$|[\'"]?\s+\w+:)', frontmatter_content, re.MULTILINE) + desc_match = re.search(r'description:\s*[\'"]?(.*?)(?:[\'"]?\s*$|[\'"]?\s+\w+:)', frontmatter_content, re.MULTILINE) + + # Initialize metadata with found values + metadata = {} + if title_match: + title = title_match.group(1).strip(' \'"') + metadata['title'] = title + if desc_match: + description = desc_match.group(1).strip(' \'"') + metadata['description'] = description + + # Clean up the frontmatter + lines = [] + for line in frontmatter_content.split('\n'): + # Skip problematic lines + if any(skip in line for skip in ['githubIntegrationURL:', 'label:', 'maxHeadingLevel:']): + continue + + # Clean up quotes and URLs + if ':' in line: + key, *value_parts = line.split(':', 1) + if value_parts: + value = value_parts[0] + # Handle truncated URLs + if 'http' in value and "'" in value and not value.endswith("'"): + continue + # Handle other truncated quoted strings + if value.count("'") == 1 or value.count('"') == 1: + continue + lines.append(line) + else: + lines.append(line) + + # Try to parse cleaned content + cleaned_content = '\n'.join(lines) + try: + parsed_data = yaml.safe_load(cleaned_content) + if isinstance(parsed_data, dict): + # Update metadata with any additional valid fields + metadata.update(parsed_data) + except: + pass # Use regex-extracted metadata if YAML parsing fails + + return metadata if metadata else None + + except Exception as e: + print(f"Warning: Error processing frontmatter: {e}") + return None + + +def parse_frontmatter(md_content): + """Parse frontmatter with improved error handling""" + try: + lines = md_content.split('\n') + if lines[0].strip() == '---': + try: + # Find the closing --- marker + end_of_frontmatter = lines[1:].index('---') + 1 + frontmatter = '\n'.join(lines[1:end_of_frontmatter]) + content = '\n'.join(lines[end_of_frontmatter + 1:]) + + # Clean up frontmatter + frontmatter = re.sub(r'\s+i18nReady:\s*true', '', frontmatter) + frontmatter = re.sub(r':\s*\|', ': ', frontmatter) # Handle YAML block indicators + + return frontmatter, content + except ValueError: + return None, md_content + return None, md_content + except Exception as e: + print(f"Warning: Error in document structure: {e}") + return None, md_content + +def clone_repo(repo_url, branch, docs_dir, repo_dir): + """Clone repository with proper error handling and cleanup""" + progress = None + try: + if os.path.exists(repo_dir): + print("Updating existing repository...") + repo = Repo(repo_dir) + origin = repo.remotes.origin + origin.fetch() + repo.git.checkout(branch) + origin.pull() + print("Repository updated successfully.") + return + + print("Cloning repository...") + os.makedirs(repo_dir, exist_ok=True) + progress = CloneProgress() + + # Initialize repository + repo = Repo.init(repo_dir) + with repo.config_writer() as git_config: + git_config.set_value("core", "sparseCheckout", "true") + + # Setup sparse checkout + sparse_checkout_path = Path(repo_dir) / ".git" / "info" / "sparse-checkout" + sparse_checkout_path.parent.mkdir(exist_ok=True) + sparse_checkout_path.write_text(f"/{docs_dir}/*\n") + + # Clone and checkout + origin = repo.create_remote("origin", repo_url) + origin.fetch(progress=progress) + repo.git.checkout(branch) + print("Repository cloned successfully.") + + except GitCommandError as e: + cleanup_directory(repo_dir) + raise DocumentationProcessingError(f"Git operation failed: {e}") + except Exception as e: + cleanup_directory(repo_dir) + raise DocumentationProcessingError(f"Repository operation failed: {e}") + finally: + if progress: + progress.finalize() + + +def get_files_sorted(root_dir): + """Get sorted files with comprehensive filtering""" + try: + all_files = [] + excluded_dirs = {'node_modules', '.git', '_internal', 'dist', 'temp', '__pycache__'} + + for root, dirs, files in os.walk(root_dir): + # Skip excluded directories + dirs[:] = [d for d in dirs if d not in excluded_dirs and not d.startswith('_')] + + for file in files: + if file.endswith(('.md', '.mdx')): + full_path = os.path.join(root, file) + + # Skip excluded paths + if any(x in Path(full_path).parts for x in excluded_dirs): + continue + + # Prioritize index files within their directories + is_index = file == 'index.md' + dir_path = os.path.dirname(full_path) + sort_key = f"{dir_path}/{'0' if is_index else '1'}{file}" + all_files.append((full_path, sort_key)) + + if not all_files: + raise DocumentationProcessingError(f"No markdown files found in {root_dir}") + + all_files.sort(key=lambda x: x[1]) + return [full_path for full_path, _ in all_files] + except Exception as e: + raise DocumentationProcessingError(f"Error getting sorted files: {e}") + + +def create_default_css(): + """Create a default CSS file if it doesn't exist""" + css_content = """ +body { + font-family: 'Arial', sans-serif; + line-height: 1.6; + margin: 0; + padding: 20px; + color: #1a1a1a; +} +.master-container { + display: flex; + justify-content: center; + align-items: center; + min-height: 100vh; +} +.container { + text-align: center; + max-width: 800px; + margin: 0 auto; +} +.title { + font-size: 28px; + font-weight: bold; + margin-bottom: 20px; + color: #000; +} +.date { + font-size: 16px; + color: #666; +} +.page-break { + page-break-after: always; +} +code { + background-color: #f4f4f4; + padding: 2px 4px; + border-radius: 4px; + font-family: 'Courier New', monospace; + font-size: 0.9em; +} +pre { + background-color: #f8f8f8; + padding: 15px; + border-radius: 5px; + overflow-x: auto; + border: 1px solid #e1e1e1; +} +.code-header { + background-color: #e0e0e0; + padding: 8px 15px; + border-radius: 5px 5px 0 0; + font-size: 0.9em; + color: #333; +} +table { + border-collapse: collapse; + width: 100%; + margin: 15px 0; +} +th, td { + border: 1px solid #ddd; + padding: 12px; + text-align: left; +} +th { + background-color: #f5f5f5; + font-weight: bold; +} +img { + max-width: 100%; + height: auto; + margin: 10px 0; + border-radius: 5px; +} +h1, h2, h3, h4, h5, h6 { + color: #2c3e50; + margin-top: 24px; + margin-bottom: 16px; +} +h1 { font-size: 2em; border-bottom: 1px solid #eee; padding-bottom: 0.3em; } +h2 { font-size: 1.5em; } +h3 { font-size: 1.25em; } +a { color: #0366d6; text-decoration: none; } +a:hover { text-decoration: underline; } +blockquote { + margin: 0; + padding: 0 1em; + color: #6a737d; + border-left: 0.25em solid #dfe2e5; +} +.doc-path { + color: #666; + font-size: 0.9em; + margin-bottom: 20px; + padding: 8px; + background-color: #f8f9fa; + border-radius: 4px; +} +""" + try: + with open('styles.css', 'w', encoding='utf8') as f: + f.write(css_content.strip()) + except Exception as e: + raise DocumentationProcessingError(f"Failed to create CSS file: {e}") + + +def process_files(files, repo_dir, docs_dir): + """Process markdown files into HTML with error handling""" + try: + if not os.path.exists('styles.css'): + create_default_css() + + with open('styles.css', 'r', encoding='utf8') as f: + css_content = f.read() + + toc = [] + html_all_pages_content = [] + numbering = [0] + + html_header = f""" + + + + + + + + + """ + + for index, file_path in enumerate(files): + try: + with open(file_path, 'r', encoding='utf8') as f: + md_content = f.read() + + md_content = process_image_paths(md_content) + md_content = preprocess_code_blocks(md_content) + frontmatter, md_content = parse_frontmatter(md_content) + + if frontmatter: + data = safe_load_frontmatter(frontmatter) + if data is not None: + rel_path = os.path.relpath(file_path, os.path.join(repo_dir, docs_dir)) + depth = rel_path.count(os.sep) + + if os.path.basename(file_path) == 'index.md' and depth > 0: + depth -= 1 + + indent = ' ' * 5 * depth + + while len(numbering) <= depth: + numbering.append(0) + + numbering[depth] += 1 + for i in range(depth + 1, len(numbering)): + numbering[i] = 0 + + toc_numbering = '.'.join(map(str, numbering[:depth + 1])) + toc_title = data.get('title', Path(file_path).stem.title()) + toc_full_title = f"{toc_numbering} - {toc_title}" + + toc.append(f"{indent}{toc_full_title}
") + + html_page_content = [ + f"

{toc_full_title}

", + f"

Documentation path: {Path(file_path).relative_to(Path(repo_dir) / docs_dir).as_posix()}

" + ] + + if 'description' in data: + html_page_content.append(f"

Description: {data['description']}

") + html_page_content.append('
') + + # Convert Markdown to HTML with extended features + html_page_content.append(markdown.markdown( + md_content, + extensions=['fenced_code', 'codehilite', 'tables', 'footnotes', 'toc', 'attr_list', 'def_list'] + )) + + html_all_pages_content.append('\n'.join(html_page_content)) + + if index < len(files) - 1: + html_all_pages_content.append('
') + + except Exception as e: + print(f"Warning: Error processing file {file_path}: {e}") + continue + + if not html_all_pages_content: + raise DocumentationProcessingError("No content was successfully processed") + + # Create table of contents + toc_html = f""" +
+
+

Table of Contents

+
+ {''.join(toc)} +
+
+ """ + + # Combine all content + final_content = '\n'.join(html_all_pages_content) + html_all_content = f"{html_header}{toc_html}{final_content}" + + return html_all_content + + except Exception as e: + raise DocumentationProcessingError(f"Error processing documentation: {e}") + + +def generate_pdf(html_content, output_pdf, format_options=None): + """Generate PDF using Playwright with enhanced error handling""" + default_format = { + 'format': 'A4', + 'margin': { + 'top': '50px', + 'right': '50px', + 'bottom': '50px', + 'left': '50px' + }, + 'print_background': True, + 'display_header_footer': True, + 'header_template': '
of
', + 'footer_template': '
' + } + + format_options = format_options or default_format + + try: + playwright = sync_playwright().start() + browser = playwright.chromium.launch() + context = browser.new_context() + page = context.new_page() + + # Set viewport size for consistent rendering + page.set_viewport_size({"width": 1280, "height": 1024}) + + # Increase timeouts for better reliability + page.set_default_timeout(120000) # 2 minutes + + # Set content and wait for loading + page.set_content(html_content, wait_until='networkidle') + + # Additional waits for content + page.wait_for_load_state('networkidle') + page.wait_for_load_state('domcontentloaded') + + # Generate PDF + page.pdf(path=output_pdf, **format_options) + + except Exception as e: + raise DocumentationProcessingError(f"Error generating PDF: {str(e)}") + finally: + if 'page' in locals(): + page.close() + if 'context' in locals(): + context.close() + if 'browser' in locals(): + browser.close() + if 'playwright' in locals(): + playwright.stop() + +def main(): + """Main execution function with error handling""" + repo_dir = "astro-docs" + repo_url = "https://github.com/withastro/docs.git" + branch = "main" + docs_dir = "src/content/docs/en" + + output_pdf = f"Astro_Documentation_{datetime.now().strftime('%Y-%m-%d')}.pdf" + temp_dir = None + + try: + # Add version and license information to console + print(f""" +Astro Documentation PDF Generator v1.0.0 +Copyright (C) 2024 PacNPal +This program comes with ABSOLUTELY NO WARRANTY; for details see the LICENSE file. +This is free software, and you are welcome to redistribute it +under certain conditions; see the LICENSE file for details. +""") + # Create CSS if it doesn't exist + if not os.path.exists('styles.css'): + print("Creating default styles.css...") + create_default_css() + print("Default CSS file created.") + + # Create temporary directory for processing + temp_dir = tempfile.mkdtemp() + + # Clone repository + print("Cloning Astro documentation repository...") + clone_repo(repo_url, branch, docs_dir, repo_dir) + + # Get and sort files + print("Finding and sorting documentation files...") + docs_dir_full_path = os.path.join(repo_dir, docs_dir) + files_to_process = get_files_sorted(docs_dir_full_path) + print(f"Found {len(files_to_process)} files to process") + + # Create cover page + with open('styles.css', 'r', encoding='utf8') as f: + css_content = f.read() + + cover_html = f""" + + + + + + + +
+
+
Astro Documentation
+
Generated on {datetime.now().strftime('%Y-%m-%d')}
+
+
+ + + """ + + # Process files and generate HTML + print("Processing documentation files...") + html_content = process_files(files_to_process, repo_dir, docs_dir) + + # Combine cover and content + final_html = f"{cover_html}
{html_content}" + final_html = add_license_page(final_html) + + # Generate PDF + print(f"Generating PDF: {output_pdf}") + generate_pdf(final_html, output_pdf) + + print(f"Documentation successfully generated: {output_pdf}") + + except DocumentationProcessingError as e: + print(f"Error: {e}") + return 1 + except Exception as e: + print(f"Unexpected error: {e}") + print("This program is licensed under AGPL-3.0. Source code is available at: [Your Repository URL]") + return 1 + finally: + # Cleanup + if temp_dir and os.path.exists(temp_dir): + shutil.rmtree(temp_dir, ignore_errors=True) + +if __name__ == "__main__": + exit(main()) diff --git a/export-docs.py b/export-docs.py deleted file mode 100644 index fa9a7a6..0000000 --- a/export-docs.py +++ /dev/null @@ -1,440 +0,0 @@ -""" -Nextjs Documentation PDF Generator -Modified version of Docs-Exporter for Astro documentation - -Original work Copyright (C) 2024 Riyooo -Modified work Copyright (C) 2024 PacNPal - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see . - -Modifications: -- Replaced wkhtmltopdf with Playwright for PDF generation -- Improved error handling and reporting -- Added support for custom headers, footers, and styles in the generated PDFs. -- Enhanced error handling with `try-except` blocks, especially for frontmatter parsing and file operations. -""" - -import os -import markdown -import tempfile -import yaml -import re -import html -from git import Repo, RemoteProgress -from datetime import datetime -from packaging import version -from tqdm import tqdm -from playwright.sync_api import sync_playwright - -def get_license_notice(): - """Return a formatted license notice for inclusion in output""" - return """ -This PDF was generated by Nextjs Documentation PDF Generator -Original work Copyright (C) 2024 Riyooo -Modified work Copyright (C) 2024 PacNPal - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License version 3. -Source code is available at: https://github.com/pacnpal/Docs-Exporter -""" - -def add_license_page(html_content): - """Add a license notice page to the HTML content""" - license_html = f""" -
-

License Notice

-
-            {get_license_notice()}
-        
-

Complete source code for this program is available at: https://github.com/pacnpal/Docs-Exporter

-

This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you - are welcome to redistribute it under certain conditions. See the GNU Affero General - Public License version 3 for details.

-
- """ - return html_content + license_html - -def process_image_paths(md_content): - # Define a regular expression pattern to find image tags - pattern = r'src(?:Light|Dark)="(.*?)"' - - # Function to replace the relative path with an absolute path - def replace(match): - relative_path = match.group(1) - absolute_path = f'{base_path}{relative_path}{path_args}' - - # Print the original and new image URLs for debugging - return f'src="{absolute_path}"' - - # Use the sub method to replace all occurrences - return re.sub(pattern, replace, md_content) - - -def preprocess_code_blocks(md_content): - # Regular expression to match extended code blocks with filename and language - pattern = r'```(\w+)?\s+filename="([^"]+)"\s*(switcher)?\n(.*?)```' - - def replace(match): - language = match.group(1) if match.group(1) else '' - filename = match.group(2) - code_block = match.group(4) - - # Format the header with filename and language - header = f'
{filename} ({language})
' if language else f'
{filename}
' - - return f'{header}\n```{language}\n{code_block}\n```' - - # Replace all occurrences in the content - return re.sub(pattern, replace, md_content, flags=re.DOTALL) - - -def safe_load_frontmatter(frontmatter_content): - try: - return yaml.safe_load(frontmatter_content) - except yaml.YAMLError: - return None - - -def preprocess_mdx_content(md_content): - # Replace HTML tags in frontmatter - md_content = re.sub(r'<(/?\w+)>', lambda m: html.escape(m.group(0)), md_content) - return md_content - - -def parse_frontmatter(md_content): - lines = md_content.split('\n') - if lines[0].strip() == '---': - end_of_frontmatter = lines.index('---', 1) - frontmatter = '\n'.join(lines[1:end_of_frontmatter]) - content = '\n'.join(lines[end_of_frontmatter + 1:]) - return frontmatter, content - return None, md_content - - -class CloneProgress(RemoteProgress): - def __init__(self): - super().__init__() - self.pbar = tqdm() - - def update(self, op_code, cur_count, max_count=None, message=''): - if max_count is not None: - self.pbar.total = max_count - self.pbar.update(cur_count - self.pbar.n) - - def finalize(self): - self.pbar.close() - - -def clone_repo(repo_url, branch, docs_dir, repo_dir): - if not os.path.isdir(repo_dir): - os.makedirs(repo_dir, exist_ok=True) - print("Cloning repository...") - repo = Repo.init(repo_dir) - with repo.config_writer() as git_config: - git_config.set_value("core", "sparseCheckout", "true") - - with open(os.path.join(repo_dir, ".git/info/sparse-checkout"), "w") as sparse_checkout_file: - sparse_checkout_file.write(f"/{docs_dir}\n") - - origin = repo.create_remote("origin", repo_url) - origin.fetch(progress=CloneProgress()) - repo.git.checkout(branch) - print("Repository cloned.") - else: - print("Repository already exists. Updating...") - repo = Repo(repo_dir) - origin = repo.remotes.origin - origin.fetch(progress=CloneProgress()) - repo.git.checkout(branch) - origin.pull(progress=CloneProgress()) - print("Repository updated.") - - -def is_file_open(file_path): - if not os.path.exists(file_path): - return False - try: - with open(file_path, 'a'): - pass - return False - except PermissionError: - return True - - -def get_files_sorted(root_dir): - all_files = [] - for root, _, files in os.walk(root_dir): - for file in files: - full_path = os.path.join(root, file) - modified_basename = '!!!' + file if file in ['index.mdx', 'index.md'] else file - sort_key = os.path.join(root, modified_basename) - all_files.append((full_path, sort_key)) - all_files.sort(key=lambda x: x[1]) - return [full_path for full_path, _ in all_files] - - -def preprocess_frontmatter(frontmatter): - html_tags = {} - - def replace_tag(match): - tag = match.group(0) - placeholder = f"HTML_TAG_{len(html_tags)}" - html_tags[placeholder] = tag - return placeholder - - modified_frontmatter = re.sub(r'<[^>]+>', replace_tag, frontmatter) - return modified_frontmatter, html_tags - - -def restore_html_tags(parsed_data, html_tags): - if isinstance(parsed_data, dict): - for key, value in parsed_data.items(): - if isinstance(value, str): - for placeholder, tag in html_tags.items(): - value = value.replace(placeholder, tag) - value = html.escape(value) - parsed_data[key] = value - return parsed_data - - -def process_files(files, repo_dir, docs_dir): - toc = "" - html_all_pages_content = "" - - html_header = f""" - - - - - - """ - - numbering = [0] - - for index, file_path in enumerate(files): - with open(file_path, 'r', encoding='utf8') as f: - md_content = f.read() - - if Change_img_url: - md_content = process_image_paths(md_content) - - md_content = preprocess_code_blocks(md_content) - frontmatter, md_content = parse_frontmatter(md_content) - - if frontmatter: - frontmatter, html_tags = preprocess_frontmatter(frontmatter) - data = safe_load_frontmatter(frontmatter) - if data is not None: - data = restore_html_tags(data, html_tags) - rel_path = os.path.relpath(file_path, os.path.join(repo_dir, docs_dir)) - depth = rel_path.count(os.sep) - file_basename = os.path.basename(file_path) - if file_basename.startswith("index.") and depth > 0: - depth += -1 - indent = ' ' * 5 * depth - - while len(numbering) <= depth: - numbering.append(0) - - numbering[depth] += 1 - - for i in range(depth + 1, len(numbering)): - numbering[i] = 0 - - toc_numbering = f"{'.'.join(map(str, numbering[:depth + 1]))}" - toc_title = data.get('title', os.path.splitext(os.path.basename(file_path))[0].title()) - toc_full_title = f"{toc_numbering} - {toc_title}" - toc += f"{indent}{toc_full_title}
" - - html_page_content = f""" -

{toc_full_title}

-

Documentation path: {file_path.replace(chr(92),'/').replace('.mdx', '').replace(repo_dir + '/' + docs_dir,'')}

-

Description: {data.get('description', 'No description')}

- """ - if data.get('related', {}): - html_page_content += f""" -
-

Related:

-

Title: {data.get('related', {}).get('title', 'Related')}

-

Related Description: {data.get('related', {}).get('description', 'No related description')}

-

Links:

-
    - {''.join([f'
  • {link}
  • ' for link in data.get('related', {}).get('links', [])])} -
-
- """ - html_page_content += '
' - else: - html_page_content = "" - else: - html_page_content = "" - - html_page_content += markdown.markdown(md_content, extensions=['fenced_code', 'codehilite', 'tables', 'footnotes', 'toc', 'abbr', 'attr_list', 'def_list', 'smarty', 'admonition']) - html_all_pages_content += html_page_content - - if index < len(files) - 1: - html_all_pages_content += '
' - - toc_html = f"""

Table of Contents

{toc}
""" - html_all_content = toc_html + html_all_pages_content - - html_all_pages_content = html_header + html_all_pages_content + "" - toc_html = html_header + toc_html + "" - html_all_content = html_header + html_all_content + "" - - return(html_all_content, toc_html, html_all_pages_content) - - -def find_latest_version(html_content): - version_pattern = re.compile(r"v(\d+\.\d+\.\d+)") - versions = version_pattern.findall(html_content) - unique_versions = sorted(set(versions), key=lambda v: version.parse(v), reverse=True) - return unique_versions[0] if unique_versions else None - - -def generate_pdf(html_content, output_pdf, format_options=None): - """ - Generate PDF from HTML content using Playwright - """ - default_format = { - 'format': 'A4', - 'margin': { - 'top': '50px', - 'right': '50px', - 'bottom': '50px', - 'left': '50px' - }, - 'print_background': True, - 'display_header_footer': True, - 'header_template': '
of
', - 'footer_template': '
' - } - - format_options = format_options or default_format - - with sync_playwright() as p: - browser = p.chromium.launch() - page = browser.new_page() - - # Set viewport size to ensure consistent rendering - page.set_viewport_size({"width": 1280, "height": 1024}) - - # Set content and wait for network idle - page.set_content(html_content, wait_until='networkidle') - - # Wait for any images and fonts to load - page.wait_for_load_state('networkidle') - page.wait_for_load_state('domcontentloaded') - - # Generate PDF - page.pdf(path=output_pdf, **format_options) - - browser.close() - - -if __name__ == "__main__": - export_html = False - - repo_dir = "nextjs-docs" - repo_url = "https://github.com/vercel/next.js.git" - branch = "canary" - docs_dir = "docs" - - Change_img_url = True - base_path = "https://nextjs.org/_next/image?url=" - path_args = "&w=1920&q=75" - - clone_repo(repo_url, branch, docs_dir, repo_dir) - print(f""" - Nextjs Documentation PDF Generator v1.0.0 - Copyright (C) 2024 PacNPal - This program comes with ABSOLUTELY NO WARRANTY; for details see the LICENSE file. - This is free software, and you are welcome to redistribute it - under certain conditions; see the LICENSE file for details. - """) - print("Converting the Documentation to HTML...") - docs_dir_full_path = os.path.join(repo_dir, docs_dir) - files_to_process = get_files_sorted(docs_dir_full_path) - html_all_content, _, _ = process_files(files_to_process, repo_dir, docs_dir) - print("Converted all MDX to HTML.") - - if export_html: - with open('output.html', 'w', encoding='utf8') as f: - f.write(html_all_content) - print("HTML Content exported.") - - latest_version = find_latest_version(html_all_content) - if latest_version: - project_title = f"""Next.js Documentation v{latest_version}""" - output_pdf = f"""Next.js_Docs_v{latest_version}_{datetime.now().strftime("%Y-%m-%d")}.pdf""" - else: - project_title = "Next.js Documentation" - output_pdf = "Next.js_Documentation.pdf" - - cover_html = f""" - - - - - -
-
-
{project_title}
-
Date: {datetime.now().strftime("%Y-%m-%d")}
-
-
- - - """ - - format_options = { - 'format': 'A4', - 'margin': { - 'top': '50px', - 'right': '50px', - 'bottom': '50px', - 'left': '50px' - }, - 'print_background': True, - 'display_header_footer': True, - 'header_template': f''' -
- {project_title} - Page of -
- ''', - 'footer_template': f''' -
- Generated on {datetime.now().strftime("%Y-%m-%d")} -
- ''' - } - - # Check if file is open - if is_file_open(output_pdf): - print("The output file is already open in another process. Please close it and try again.") - else: - try: - print("Generating PDF...") - # Generate PDF with cover page and content - html_all_content = add_license_page(html_all_content) - generate_pdf(cover_html + html_all_content, output_pdf, format_options) - print("Created the PDF file successfully.") - - except Exception as e: - print(f"Error generating PDF: {str(e)}") \ No newline at end of file