From bc59a325a5c9b7c4e607b53c13ec737930ed8dd5 Mon Sep 17 00:00:00 2001 From: pacnpal <183241239+pacnpal@users.noreply.github.com> Date: Mon, 9 Dec 2024 20:26:35 -0500 Subject: [PATCH] Refactored code for Playwright Refactored the code for Playwright, replacing wkhtmltopdf --- .gitignore | 1 + CHANGELOG.md | 56 +++++++++++++ README.md | 199 +++++++++++++++++++++++++++++++++++++++++------ export-docs.py | 183 ++++++++++++++++++++----------------------- requirements.txt | 13 ++-- 5 files changed, 325 insertions(+), 127 deletions(-) create mode 100644 CHANGELOG.md diff --git a/.gitignore b/.gitignore index ac2ba2d..6719c6c 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,4 @@ cython_debug/ /*-docs *.pdf *.html +.DS_Store \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..c74d545 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,56 @@ +# CHANGELOG.md + +--- + +## General Changes +1. **Code Refactoring** + - Significant restructuring of the functions to improve readability and maintainability. + - Added Playwright to replace wkhtmltopdf. + - Introduction of meaningful function and variable names for better clarity. + +2. **Error Handling** + - Enhanced error handling with `try-except` blocks, especially for frontmatter parsing and file operations. + +--- + +## New Features +1. **HTML Preprocessing** + - Added functions `preprocess_code_blocks` and `process_image_paths` to handle custom Markdown syntax and image path updates for rendering consistency. + +2. **Frontmatter Parsing** + - Introduced `parse_frontmatter`, `preprocess_frontmatter`, and `restore_html_tags` functions to manage YAML frontmatter in Markdown files, enhancing metadata handling. + +3. **Repository Cloning** + - Added `clone_repo` to handle Git repository cloning with sparse checkout support, improving integration with remote documentation sources. + +4. 
**PDF Generation** + - Integrated Playwright for rendering and generating PDFs from HTML content. + - Added support for custom headers, footers, and styles in the generated PDFs. + +5. **Table of Contents (ToC)** + - Automatically generates a ToC from parsed metadata with proper hierarchy and numbering. + +--- + +## Bug Fixes +1. **File Sorting** + - Fixed sorting issues in `get_files_sorted` to prioritize `index.md` and `index.mdx` files. + +2. **Open File Check** + - Added `is_file_open` to ensure output files are not already open, preventing write conflicts. + +3. **Version Detection** + - Improved `find_latest_version` logic to detect and sort unique versions from HTML content. + +--- + +## Removed Features +- Any obsolete or unused features from `export-docs.old.py` were removed to streamline the codebase. + +--- + +## Performance Improvements +- Optimized file processing loop to handle large repositories more efficiently. +- Improved Playwright's rendering performance by preloading images and resources. + +--- \ No newline at end of file diff --git a/README.md b/README.md index bf0cf9a..19e79d3 100644 --- a/README.md +++ b/README.md @@ -1,38 +1,191 @@ -# Docs-Exporter +# README.md +# Documentation to PDF Converter -This script automates the process of exporting Next.js documentation from the GitHub repository, converting it to HTML, and then compiling it into a PDF document. It also ensures that all visual content, including images used in the online documentation, and crucial formatting, such as code blocks and tables, are accurately fetched and included. +A Python script that clones documentation from a Git repository (default: Next.js), processes it, and generates a well-formatted PDF with table of contents, proper formatting, and consistent styling. ## Features -- **Accurate Content Replication**: Clones the Next.js documentation from the Canary channel of the GitHub repository and preserves its layout. 
-- **Image Handling**: Fetches and embeds the exact images used in the online documentation, ensuring that all visual explanations and illustrations are retained. -- **Advanced Formatting**: Maintains the integrity of advanced formatting elements such as code blocks, tables, and special markdown features, ensuring that the educational value of the documentation is preserved. -- **Custom PDF Styling**: Generates a styled PDF document with a cover page and a detailed table of contents, formatted through an external CSS file. +- Clones specific documentation directories from Git repositories +- Processes Markdown and MDX files +- Generates table of contents with proper numbering +- Handles code blocks with filename annotations +- Processes frontmatter for metadata +- Supports image path transformations +- Creates PDF with customizable headers and footers +- Includes cover page and proper page breaks -## Prerequisites +## Requirements -- Python -- Git -- wkhtmltopdf +### System Requirements +- Python 3.8+ +- Git installed and accessible from command line +- Internet connection for cloning repositories -## Installation -- Install `wkhtmltopdf` which is required for PDF generation. You can download it from [wkhtmltopdf downloads](https://wkhtmltopdf.org/downloads.html) and follow the installation instructions for your operating system. -- Clone the Repository -```bash -git clone https://github.com/Riyooo/Docs-Exporter.git -``` -- Go into the Directory -```bash -cd Docs-Exporter -``` -- Install Python Dependencies +### Python Dependencies +Install all required packages using: ```bash pip install -r requirements.txt ``` +Then install Playwright's browser: +```bash +playwright install chromium +``` + +## Setup + +1. Clone this repository: +```bash +git clone <repository-url> +cd <repository-directory> +``` + +2. Install dependencies: +```bash +pip install -r requirements.txt +playwright install chromium +``` + +3. Ensure you have a `styles.css` file in the same directory as the script. 
This file should contain your desired CSS styling for the PDF output. + ## Usage -To run the script, execute the following command from the root of the repository: +1. Basic usage with default settings (Next.js documentation): ```bash -python export-docs.py +python export-docs.py ``` + +2. The script will: + - Clone/update the specified Git repository + - Process all documentation files + - Generate a PDF with proper formatting + - Include a cover page and table of contents + +## Configuration + +You can modify these variables in the script for different configurations: + +```python +repo_dir = "nextjs-docs" # Local directory for cloned repo +repo_url = "https://github.com/vercel/next.js.git" # Repository URL +branch = "canary" # Branch to clone +docs_dir = "docs" # Directory containing documentation + +# Image URL transformation settings +Change_img_url = True +base_path = "https://nextjs.org/_next/image?url=" +path_args = "&w=1920&q=75" +``` + +## PDF Output Settings + +The PDF generation includes: +- A4 format +- Custom margins +- Page numbers in header +- Generation date in footer +- Background colors/images +- Proper page breaks between sections + +## File Organization + +- `export-docs.py`: Main script file +- `requirements.txt`: Python dependencies +- `styles.css`: CSS styling for PDF output +- `README.md`: This documentation + +## Troubleshooting + +1. If the PDF file is locked: + - Ensure the output PDF is not open in any application + - Check file permissions + +2. If images are not loading: + - Verify internet connection + - Check if image URLs are accessible + - Adjust the `wait_for_load_state` timing if needed + +3. 
If the repository won't clone: + - Verify Git is installed and accessible + - Check internet connection + - Ensure you have access to the repository + +## Notes + +- The script creates temporary files during processing +- Large documentation sets may take several minutes to process +- Memory usage depends on the size of the documentation +- The script requires active internet connection for repository cloning and image processing + +## CSS Recommendations + +Your `styles.css` should include at least these basic styles for proper PDF formatting: + +```css +body { + font-family: Arial, sans-serif; + line-height: 1.6; + margin: 0; + padding: 20px; +} + +.master-container { + display: flex; + justify-content: center; + align-items: center; + min-height: 100vh; +} + +.container { + text-align: center; +} + +.title { + font-size: 24px; + font-weight: bold; + margin-bottom: 20px; +} + +.date { + font-size: 16px; +} + +.page-break { + page-break-after: always; +} + +code { + background-color: #f4f4f4; + padding: 2px 4px; + border-radius: 4px; +} + +pre { + background-color: #f8f8f8; + padding: 15px; + border-radius: 5px; + overflow-x: auto; +} + +.code-header { + background-color: #e0e0e0; + padding: 5px 10px; + border-radius: 5px 5px 0 0; +} + +table { + border-collapse: collapse; + width: 100%; + margin: 15px 0; +} + +th, td { + border: 1px solid #ddd; + padding: 8px; + text-align: left; +} + +th { + background-color: #f5f5f5; +} +``` \ No newline at end of file diff --git a/export-docs.py b/export-docs.py index 5da9347..4ca16f4 100644 --- a/export-docs.py +++ b/export-docs.py @@ -1,6 +1,5 @@ import os import markdown -import pdfkit import tempfile import yaml import re @@ -9,6 +8,7 @@ from git import Repo, RemoteProgress from datetime import datetime from packaging import version from tqdm import tqdm +from playwright.sync_api import sync_playwright def process_image_paths(md_content): @@ -76,14 +76,13 @@ class CloneProgress(RemoteProgress): def update(self, op_code, 
cur_count, max_count=None, message=''): if max_count is not None: self.pbar.total = max_count - self.pbar.update(cur_count - self.pbar.n) # increment the pbar with the increment + self.pbar.update(cur_count - self.pbar.n) def finalize(self): self.pbar.close() -# Clone a specific directory of a repository / branch + def clone_repo(repo_url, branch, docs_dir, repo_dir): - # Initialize and configure the repository for sparse checkout if not os.path.isdir(repo_dir): os.makedirs(repo_dir, exist_ok=True) print("Cloning repository...") @@ -91,17 +90,13 @@ def clone_repo(repo_url, branch, docs_dir, repo_dir): with repo.config_writer() as git_config: git_config.set_value("core", "sparseCheckout", "true") - # Define the sparse checkout settings with open(os.path.join(repo_dir, ".git/info/sparse-checkout"), "w") as sparse_checkout_file: sparse_checkout_file.write(f"/{docs_dir}\n") - # Pull the specific directory from the repository origin = repo.create_remote("origin", repo_url) origin.fetch(progress=CloneProgress()) repo.git.checkout(branch) print("Repository cloned.") - - # Update the repository if it already exists else: print("Repository already exists. Updating...") repo = Repo(repo_dir) @@ -114,54 +109,37 @@ def clone_repo(repo_url, branch, docs_dir, repo_dir): def is_file_open(file_path): if not os.path.exists(file_path): - return False # File does not exist, so it's not open - + return False try: - # Try to open the file in append mode. If the file is open in another program, this might fail with open(file_path, 'a'): pass return False except PermissionError: - # If a PermissionError is raised, it's likely the file is open elsewhere return True def get_files_sorted(root_dir): all_files = [] - - # Step 1: Traverse the directory structure for root, _, files in os.walk(root_dir): for file in files: full_path = os.path.join(root, file) - - # Step 2: Prioritize 'index.mdx' or 'index.md' within the same folder modified_basename = '!!!' 
+ file if file in ['index.mdx', 'index.md'] else file sort_key = os.path.join(root, modified_basename) - - # Add tuple to the list all_files.append((full_path, sort_key)) - - # Step 3: Perform a global sort based on modified basename all_files.sort(key=lambda x: x[1]) - - # Step 4: Return the full paths in sorted order return [full_path for full_path, _ in all_files] def preprocess_frontmatter(frontmatter): - # Dictionary to store HTML tags and their placeholders html_tags = {} - # Function to replace HTML tags with placeholders def replace_tag(match): tag = match.group(0) placeholder = f"HTML_TAG_{len(html_tags)}" html_tags[placeholder] = tag return placeholder - # Replace HTML tags with placeholders modified_frontmatter = re.sub(r'<[^>]+>', replace_tag, frontmatter) - return modified_frontmatter, html_tags @@ -171,18 +149,15 @@ def restore_html_tags(parsed_data, html_tags): if isinstance(value, str): for placeholder, tag in html_tags.items(): value = value.replace(placeholder, tag) - # if key == 'title': # Escape HTML characters for titles value = html.escape(value) parsed_data[key] = value return parsed_data def process_files(files, repo_dir, docs_dir): - # Initialize the Table of Contents - toc = "" + toc = "" html_all_pages_content = "" - # Initialize an empty string to hold all the HTML content & Include the main CSS directly in the HTML html_header = f""" @@ -193,63 +168,43 @@ def process_files(files, repo_dir, docs_dir): """ - numbering = [0] # Starting with the first level + numbering = [0] for index, file_path in enumerate(files): with open(file_path, 'r', encoding='utf8') as f: md_content = f.read() - # Process the markdown content for image paths if Change_img_url: md_content = process_image_paths(md_content) - # Process the markdown content for non standard code blocks md_content = preprocess_code_blocks(md_content) - - # Parse the frontmatter and markdown frontmatter, md_content = parse_frontmatter(md_content) if frontmatter: - # Preprocessing: 
replaces HTML tags with unique placeholders and stores the mappings frontmatter, html_tags = preprocess_frontmatter(frontmatter) - - # Parse the YAML frontmatter data = safe_load_frontmatter(frontmatter) if data is not None: - - # Preprocessing: After parsing the YAML, restore the HTML tags in place of the placeholders data = restore_html_tags(data, html_tags) - - # Depth Level: Calculate relative path, directory depth and TOC rel_path = os.path.relpath(file_path, os.path.join(repo_dir, docs_dir)) - - # Depth Level: Calculate the depth of each section - depth = rel_path.count(os.sep) # Count separators to determine depth - file_basename = os.path.basename(file_path) + depth = rel_path.count(os.sep) + file_basename = os.path.basename(file_path) if file_basename.startswith("index.") and depth > 0: - depth += -1 # or another title for the main index - indent = ' ' * 5 * depth # Adjust indentation based on depth + depth += -1 + indent = ' ' * 5 * depth - # Numbering: Ensure numbering has enough levels while len(numbering) <= depth: numbering.append(0) - # Numbering: Increment at the current level numbering[depth] += 1 - # Numbering: Reset for any lower levels for i in range(depth + 1, len(numbering)): numbering[i] = 0 - # Numbering: Create entry toc_numbering = f"{'.'.join(map(str, numbering[:depth + 1]))}" - - # TOC: Generate the section title toc_title = data.get('title', os.path.splitext(os.path.basename(file_path))[0].title()) toc_full_title = f"{toc_numbering} - {toc_title}" toc += f"{indent}{toc_full_title}
" - # Page Content: Format the parsed YAML to HTML html_page_content = f"""

{toc_full_title}

Documentation path: {file_path.replace(chr(92),'/').replace('.mdx', '').replace(repo_dir + '/' + docs_dir,'')}

@@ -268,78 +223,99 @@ def process_files(files, repo_dir, docs_dir): """ html_page_content += '
' - else: html_page_content = "" else: html_page_content = "" - # Convert Markdown to HTML with table support and add content to the identified header html_page_content += markdown.markdown(md_content, extensions=['fenced_code', 'codehilite', 'tables', 'footnotes', 'toc', 'abbr', 'attr_list', 'def_list', 'smarty', 'admonition']) - - # Add page content to all cumulated pages content html_all_pages_content += html_page_content - # Add a page break unless it is the last file if index < len(files) - 1: html_all_pages_content += '
' - # Prepend the ToC to the beginning of the HTML content toc_html = f"""

Table of Contents

{toc}
""" html_all_content = toc_html + html_all_pages_content - # Finalize html formatting - html_all_pages_content = html_header + html_all_pages_content + "" - toc_html = html_header + toc_html + "" - html_all_content = html_header + html_all_content + "" + html_all_pages_content = html_header + html_all_pages_content + "" + toc_html = html_header + toc_html + "" + html_all_content = html_header + html_all_content + "" return(html_all_content, toc_html, html_all_pages_content) def find_latest_version(html_content): - # Regular expression to find versions like v14.2.0 version_pattern = re.compile(r"v(\d+\.\d+\.\d+)") versions = version_pattern.findall(html_content) - # Remove duplicates and sort versions unique_versions = sorted(set(versions), key=lambda v: version.parse(v), reverse=True) return unique_versions[0] if unique_versions else None -if __name__ == "__main__": +def generate_pdf(html_content, output_pdf, format_options=None): + """ + Generate PDF from HTML content using Playwright + """ + default_format = { + 'format': 'A4', + 'margin': { + 'top': '50px', + 'right': '50px', + 'bottom': '50px', + 'left': '50px' + }, + 'print_background': True, + 'display_header_footer': True, + 'header_template': '
of
', + 'footer_template': '
' + } + + format_options = format_options or default_format - # Define the output PDF file name - # project_title = "Next.js v14 Documentation" - # output_pdf = "Next.js_v14_Documentation.pdf" + with sync_playwright() as p: + browser = p.chromium.launch() + page = browser.new_page() + + # Set viewport size to ensure consistent rendering + page.set_viewport_size({"width": 1280, "height": 1024}) + + # Set content and wait for network idle + page.set_content(html_content, wait_until='networkidle') + + # Wait for any images and fonts to load + page.wait_for_load_state('networkidle') + page.wait_for_load_state('domcontentloaded') + + # Generate PDF + page.pdf(path=output_pdf, **format_options) + + browser.close() + + +if __name__ == "__main__": export_html = False - # Clone the repository and checkout the canary branch repo_dir = "nextjs-docs" repo_url = "https://github.com/vercel/next.js.git" branch = "canary" docs_dir = "docs" - # Define a base path and quality for the image URLs Change_img_url = True base_path = "https://nextjs.org/_next/image?url=" path_args = "&w=1920&q=75" - # Clone the repository clone_repo(repo_url, branch, docs_dir, repo_dir) - # Traverse the docs directory and convert each markdown file to HTML - print ("Converting the Documentation to HTML...") + print("Converting the Documentation to HTML...") docs_dir_full_path = os.path.join(repo_dir, docs_dir) files_to_process = get_files_sorted(docs_dir_full_path) html_all_content, _, _ = process_files(files_to_process, repo_dir, docs_dir) print("Converted all MDX to HTML.") - # Save the HTML content to a file for inspection if export_html: with open('output.html', 'w', encoding='utf8') as f: f.write(html_all_content) print("HTML Content exported.") - # Find the latest version in the HTML content latest_version = find_latest_version(html_all_content) if latest_version: project_title = f"""Next.js Documentation v{latest_version}""" @@ -348,7 +324,6 @@ if __name__ == "__main__": project_title = "Next.js 
Documentation" output_pdf = "Next.js_Documentation.pdf" - # Define the cover HTML with local CSS file cover_html = f""" @@ -367,26 +342,38 @@ if __name__ == "__main__": """ - # Write the cover HTML to a temporary file - with tempfile.NamedTemporaryFile(delete=False, suffix='.html') as cover_file: - cover_file.write(cover_html.encode('utf-8')) - print("HTML Cover exported.") + format_options = { + 'format': 'A4', + 'margin': { + 'top': '50px', + 'right': '50px', + 'bottom': '50px', + 'left': '50px' + }, + 'print_background': True, + 'display_header_footer': True, + 'header_template': f''' +
+ {project_title} + Page of +
+ ''', + 'footer_template': f''' +
+ Generated on {datetime.now().strftime("%Y-%m-%d")} +
+ ''' + } - # Convert the combined HTML content to PDF with a cover and a table of contents + # Check if file is open if is_file_open(output_pdf): - print("The output file is already open in another process. Please close it and try again.") + print("The output file is already open in another process. Please close it and try again.") else: - options = { - 'encoding': 'UTF-8', - 'page-size': 'A4', - 'quiet': '', - 'image-dpi': 150, # General reco.: printer - hq, 300 dpi| ebook - low quality, 150 dpi| screen-view-only quality, 72 dpi - 'image-quality': 75, - # 'no-outline': None, - # 'no-images': None, - } - pdfkit.from_string(html_all_content, output_pdf, options=options, cover=cover_file.name, toc={}) - print("Created the PDF file successfully.") + try: + print("Generating PDF...") + # Generate PDF with cover page and content + generate_pdf(cover_html + html_all_content, output_pdf, format_options) + print("Created the PDF file successfully.") - # Delete the temporary file - os.unlink(cover_file.name) \ No newline at end of file + except Exception as e: + print(f"Error generating PDF: {str(e)}") \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 7db731a..8b4b4e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ -GitPython -Markdown -pdfkit -PyYAML -packaging -tqdm \ No newline at end of file +# requirements.txt +gitpython==3.1.40 +markdown==3.5.1 +packaging==23.2 +playwright==1.40.0 +PyYAML==6.0.1 +tqdm==4.66.1 \ No newline at end of file