Update and rename export-docs.py to astro_docs_to_pdf.py

This commit is contained in:
pacnpal
2024-12-09 21:01:16 -05:00
committed by GitHub
parent 056c7956e7
commit 5889ad9440
2 changed files with 700 additions and 440 deletions

700
astro_docs_to_pdf.py Normal file
View File

@@ -0,0 +1,700 @@
"""
Astro Documentation PDF Generator
Modified version of Docs-Exporter for Astro documentation
Original work Copyright (C) 2024 Riyooo
Modified work Copyright (C) 2024 PacNPal
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Modifications:
- Replaced wkhtmltopdf with Playwright for PDF generation
- Added support for Astro's MDX format and frontmatter
- Enhanced frontmatter parsing
- Added automatic CSS generation
- Improved error handling and reporting
"""
import os
import markdown
import tempfile
import yaml
import re
import html
import shutil
from pathlib import Path
from git import Repo, RemoteProgress, GitCommandError
from datetime import datetime
from packaging import version
from tqdm import tqdm
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
def get_license_notice():
"""Return a formatted license notice for inclusion in output"""
return """
This PDF was generated by Astro Documentation PDF Generator
Original work Copyright (C) 2024 Riyooo
Modified work Copyright (C) 2024 PacNPal
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License version 3.
Source code is available at: [Your Repository URL]
"""
def add_license_page(html_content):
"""Add a license notice page to the HTML content"""
license_html = f"""
<div class="license-notice" style="page-break-before: always;">
<h2>License Notice</h2>
<pre style="white-space: pre-wrap; font-family: monospace;">
{get_license_notice()}
</pre>
<p>Complete source code for this program is available at: [Your Repository URL]</p>
<p>This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you
are welcome to redistribute it under certain conditions. See the GNU Affero General
Public License version 3 for details.</p>
</div>
"""
return html_content + license_html
class DocumentationProcessingError(Exception):
"""Custom exception for documentation processing errors"""
pass
class CloneProgress(RemoteProgress):
def __init__(self):
super().__init__()
self.pbar = tqdm()
def update(self, op_code, cur_count, max_count=None, message=''):
if max_count is not None:
self.pbar.total = max_count
self.pbar.update(cur_count - self.pbar.n)
def finalize(self):
self.pbar.close()
def cleanup_directory(directory):
"""Safely clean up a directory"""
try:
if os.path.exists(directory):
shutil.rmtree(directory)
except Exception as e:
print(f"Warning: Failed to clean up directory {directory}: {e}")
def process_image_paths(md_content):
"""Process both MDX imports and markdown images, including SVGs"""
try:
# Handle MDX image imports
mdx_pattern = r'import\s+(\w+)\s+from\s+[\'"]([^\'"]+)[\'"]'
def mdx_replace(match):
var_name = match.group(1)
image_path = match.group(2)
image_path = re.sub(r'^[./~]+', '', image_path)
return f'![{var_name}](https://docs.astro.build/{image_path})'
md_content = re.sub(mdx_pattern, mdx_replace, md_content)
# Handle standard markdown images and SVGs
md_pattern = r'!\[(.*?)\]\(([^http].*?)\)'
def md_replace(match):
alt_text = match.group(1)
image_path = match.group(2)
image_path = re.sub(r'^[./~]+', '', image_path)
return f'![{alt_text}](https://docs.astro.build/{image_path})'
md_content = re.sub(md_pattern, md_replace, md_content)
# Handle Astro image components
component_pattern = r'<Image\s+src={([^}]+)}\s+alt="([^"]+)"[^>]*>'
def component_replace(match):
src = match.group(1).strip('{}').strip('"\'')
alt = match.group(2)
src = re.sub(r'^[./~]+', '', src)
return f'![{alt}](https://docs.astro.build/{src})'
return re.sub(component_pattern, component_replace, md_content)
except Exception as e:
raise DocumentationProcessingError(f"Error processing image paths: {e}")
def preprocess_code_blocks(md_content):
"""Handle Astro's code blocks with advanced features"""
try:
pattern = r'```(?:(\w+))?\s*(?:\{([^}]*)\})?\s*(.*?)```'
def replace(match):
language = match.group(1) or ''
attributes = match.group(2) or ''
code_block = match.group(3)
# Parse attributes
title = ''
if attributes:
title_match = re.search(r'title="([^"]+)"', attributes)
if title_match:
title = title_match.group(1)
# Build header
header_parts = []
if title:
header_parts.append(title)
if language:
header_parts.append(f"({language})")
header_html = (f'<div class="code-header"><i>{" ".join(header_parts)}</i></div>'
if header_parts else '')
return f'{header_html}\n```{language}\n{code_block.strip()}\n```'
return re.sub(pattern, replace, md_content, flags=re.DOTALL)
except Exception as e:
raise DocumentationProcessingError(f"Error processing code blocks: {e}")
def parse_frontmatter(md_content):
"""Parse frontmatter with support for Astro's format"""
try:
lines = md_content.split('\n')
if lines[0].strip() == '---':
try:
end_of_frontmatter = lines[1:].index('---') + 1
frontmatter = '\n'.join(lines[1:end_of_frontmatter])
content = '\n'.join(lines[end_of_frontmatter + 1:])
# Remove import statements from content
content = re.sub(r'import.*?\n', '', content)
return frontmatter, content
except ValueError:
return None, md_content
return None, md_content
except Exception as e:
raise DocumentationProcessingError(f"Error parsing frontmatter: {e}")
def safe_load_frontmatter(frontmatter_content):
"""Safely load YAML frontmatter with robust error handling"""
if not frontmatter_content:
return None
try:
# First pass: try to extract title and description using regex
title_match = re.search(r'title:\s*[\'"]?(.*?)(?:[\'"]?\s*$|[\'"]?\s+\w+:)', frontmatter_content, re.MULTILINE)
desc_match = re.search(r'description:\s*[\'"]?(.*?)(?:[\'"]?\s*$|[\'"]?\s+\w+:)', frontmatter_content, re.MULTILINE)
# Initialize metadata with found values
metadata = {}
if title_match:
title = title_match.group(1).strip(' \'"')
metadata['title'] = title
if desc_match:
description = desc_match.group(1).strip(' \'"')
metadata['description'] = description
# Clean up the frontmatter
lines = []
for line in frontmatter_content.split('\n'):
# Skip problematic lines
if any(skip in line for skip in ['githubIntegrationURL:', 'label:', 'maxHeadingLevel:']):
continue
# Clean up quotes and URLs
if ':' in line:
key, *value_parts = line.split(':', 1)
if value_parts:
value = value_parts[0]
# Handle truncated URLs
if 'http' in value and "'" in value and not value.endswith("'"):
continue
# Handle other truncated quoted strings
if value.count("'") == 1 or value.count('"') == 1:
continue
lines.append(line)
else:
lines.append(line)
# Try to parse cleaned content
cleaned_content = '\n'.join(lines)
try:
parsed_data = yaml.safe_load(cleaned_content)
if isinstance(parsed_data, dict):
# Update metadata with any additional valid fields
metadata.update(parsed_data)
except:
pass # Use regex-extracted metadata if YAML parsing fails
return metadata if metadata else None
except Exception as e:
print(f"Warning: Error processing frontmatter: {e}")
return None
def parse_frontmatter(md_content):
"""Parse frontmatter with improved error handling"""
try:
lines = md_content.split('\n')
if lines[0].strip() == '---':
try:
# Find the closing --- marker
end_of_frontmatter = lines[1:].index('---') + 1
frontmatter = '\n'.join(lines[1:end_of_frontmatter])
content = '\n'.join(lines[end_of_frontmatter + 1:])
# Clean up frontmatter
frontmatter = re.sub(r'\s+i18nReady:\s*true', '', frontmatter)
frontmatter = re.sub(r':\s*\|', ': ', frontmatter) # Handle YAML block indicators
return frontmatter, content
except ValueError:
return None, md_content
return None, md_content
except Exception as e:
print(f"Warning: Error in document structure: {e}")
return None, md_content
def clone_repo(repo_url, branch, docs_dir, repo_dir):
"""Clone repository with proper error handling and cleanup"""
progress = None
try:
if os.path.exists(repo_dir):
print("Updating existing repository...")
repo = Repo(repo_dir)
origin = repo.remotes.origin
origin.fetch()
repo.git.checkout(branch)
origin.pull()
print("Repository updated successfully.")
return
print("Cloning repository...")
os.makedirs(repo_dir, exist_ok=True)
progress = CloneProgress()
# Initialize repository
repo = Repo.init(repo_dir)
with repo.config_writer() as git_config:
git_config.set_value("core", "sparseCheckout", "true")
# Setup sparse checkout
sparse_checkout_path = Path(repo_dir) / ".git" / "info" / "sparse-checkout"
sparse_checkout_path.parent.mkdir(exist_ok=True)
sparse_checkout_path.write_text(f"/{docs_dir}/*\n")
# Clone and checkout
origin = repo.create_remote("origin", repo_url)
origin.fetch(progress=progress)
repo.git.checkout(branch)
print("Repository cloned successfully.")
except GitCommandError as e:
cleanup_directory(repo_dir)
raise DocumentationProcessingError(f"Git operation failed: {e}")
except Exception as e:
cleanup_directory(repo_dir)
raise DocumentationProcessingError(f"Repository operation failed: {e}")
finally:
if progress:
progress.finalize()
def get_files_sorted(root_dir):
"""Get sorted files with comprehensive filtering"""
try:
all_files = []
excluded_dirs = {'node_modules', '.git', '_internal', 'dist', 'temp', '__pycache__'}
for root, dirs, files in os.walk(root_dir):
# Skip excluded directories
dirs[:] = [d for d in dirs if d not in excluded_dirs and not d.startswith('_')]
for file in files:
if file.endswith(('.md', '.mdx')):
full_path = os.path.join(root, file)
# Skip excluded paths
if any(x in Path(full_path).parts for x in excluded_dirs):
continue
# Prioritize index files within their directories
is_index = file == 'index.md'
dir_path = os.path.dirname(full_path)
sort_key = f"{dir_path}/{'0' if is_index else '1'}{file}"
all_files.append((full_path, sort_key))
if not all_files:
raise DocumentationProcessingError(f"No markdown files found in {root_dir}")
all_files.sort(key=lambda x: x[1])
return [full_path for full_path, _ in all_files]
except Exception as e:
raise DocumentationProcessingError(f"Error getting sorted files: {e}")
def create_default_css():
"""Create a default CSS file if it doesn't exist"""
css_content = """
body {
font-family: 'Arial', sans-serif;
line-height: 1.6;
margin: 0;
padding: 20px;
color: #1a1a1a;
}
.master-container {
display: flex;
justify-content: center;
align-items: center;
min-height: 100vh;
}
.container {
text-align: center;
max-width: 800px;
margin: 0 auto;
}
.title {
font-size: 28px;
font-weight: bold;
margin-bottom: 20px;
color: #000;
}
.date {
font-size: 16px;
color: #666;
}
.page-break {
page-break-after: always;
}
code {
background-color: #f4f4f4;
padding: 2px 4px;
border-radius: 4px;
font-family: 'Courier New', monospace;
font-size: 0.9em;
}
pre {
background-color: #f8f8f8;
padding: 15px;
border-radius: 5px;
overflow-x: auto;
border: 1px solid #e1e1e1;
}
.code-header {
background-color: #e0e0e0;
padding: 8px 15px;
border-radius: 5px 5px 0 0;
font-size: 0.9em;
color: #333;
}
table {
border-collapse: collapse;
width: 100%;
margin: 15px 0;
}
th, td {
border: 1px solid #ddd;
padding: 12px;
text-align: left;
}
th {
background-color: #f5f5f5;
font-weight: bold;
}
img {
max-width: 100%;
height: auto;
margin: 10px 0;
border-radius: 5px;
}
h1, h2, h3, h4, h5, h6 {
color: #2c3e50;
margin-top: 24px;
margin-bottom: 16px;
}
h1 { font-size: 2em; border-bottom: 1px solid #eee; padding-bottom: 0.3em; }
h2 { font-size: 1.5em; }
h3 { font-size: 1.25em; }
a { color: #0366d6; text-decoration: none; }
a:hover { text-decoration: underline; }
blockquote {
margin: 0;
padding: 0 1em;
color: #6a737d;
border-left: 0.25em solid #dfe2e5;
}
.doc-path {
color: #666;
font-size: 0.9em;
margin-bottom: 20px;
padding: 8px;
background-color: #f8f9fa;
border-radius: 4px;
}
"""
try:
with open('styles.css', 'w', encoding='utf8') as f:
f.write(css_content.strip())
except Exception as e:
raise DocumentationProcessingError(f"Failed to create CSS file: {e}")
def process_files(files, repo_dir, docs_dir):
"""Process markdown files into HTML with error handling"""
try:
if not os.path.exists('styles.css'):
create_default_css()
with open('styles.css', 'r', encoding='utf8') as f:
css_content = f.read()
toc = []
html_all_pages_content = []
numbering = [0]
html_header = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<style>{css_content}</style>
</head>
<body>
"""
for index, file_path in enumerate(files):
try:
with open(file_path, 'r', encoding='utf8') as f:
md_content = f.read()
md_content = process_image_paths(md_content)
md_content = preprocess_code_blocks(md_content)
frontmatter, md_content = parse_frontmatter(md_content)
if frontmatter:
data = safe_load_frontmatter(frontmatter)
if data is not None:
rel_path = os.path.relpath(file_path, os.path.join(repo_dir, docs_dir))
depth = rel_path.count(os.sep)
if os.path.basename(file_path) == 'index.md' and depth > 0:
depth -= 1
indent = '&nbsp;' * 5 * depth
while len(numbering) <= depth:
numbering.append(0)
numbering[depth] += 1
for i in range(depth + 1, len(numbering)):
numbering[i] = 0
toc_numbering = '.'.join(map(str, numbering[:depth + 1]))
toc_title = data.get('title', Path(file_path).stem.title())
toc_full_title = f"{toc_numbering} - {toc_title}"
toc.append(f"{indent}<a href='#{toc_full_title}'>{toc_full_title}</a><br/>")
html_page_content = [
f"<h1 id='{toc_full_title}'>{toc_full_title}</h1>",
f"<div class='doc-path'><p>Documentation path: {Path(file_path).relative_to(Path(repo_dir) / docs_dir).as_posix()}</p></div>"
]
if 'description' in data:
html_page_content.append(f"<p><strong>Description:</strong> {data['description']}</p>")
html_page_content.append('<br/>')
# Convert Markdown to HTML with extended features
html_page_content.append(markdown.markdown(
md_content,
extensions=['fenced_code', 'codehilite', 'tables', 'footnotes', 'toc', 'attr_list', 'def_list']
))
html_all_pages_content.append('\n'.join(html_page_content))
if index < len(files) - 1:
html_all_pages_content.append('<div class="page-break"></div>')
except Exception as e:
print(f"Warning: Error processing file {file_path}: {e}")
continue
if not html_all_pages_content:
raise DocumentationProcessingError("No content was successfully processed")
# Create table of contents
toc_html = f"""
<div style="padding-bottom: 10px">
<div style="padding-bottom: 20px">
<h1>Table of Contents</h1>
</div>
{''.join(toc)}
</div>
<div style="page-break-before: always;">
"""
# Combine all content
final_content = '\n'.join(html_all_pages_content)
html_all_content = f"{html_header}{toc_html}{final_content}</body></html>"
return html_all_content
except Exception as e:
raise DocumentationProcessingError(f"Error processing documentation: {e}")
def generate_pdf(html_content, output_pdf, format_options=None):
"""Generate PDF using Playwright with enhanced error handling"""
default_format = {
'format': 'A4',
'margin': {
'top': '50px',
'right': '50px',
'bottom': '50px',
'left': '50px'
},
'print_background': True,
'display_header_footer': True,
'header_template': '<div style="font-size: 10px; text-align: right; width: 100%; padding-right: 20px; margin-top: 20px;"><span class="pageNumber"></span> of <span class="totalPages"></span></div>',
'footer_template': '<div style="font-size: 10px; text-align: center; width: 100%; margin-bottom: 20px;"><span class="url"></span></div>'
}
format_options = format_options or default_format
try:
playwright = sync_playwright().start()
browser = playwright.chromium.launch()
context = browser.new_context()
page = context.new_page()
# Set viewport size for consistent rendering
page.set_viewport_size({"width": 1280, "height": 1024})
# Increase timeouts for better reliability
page.set_default_timeout(120000) # 2 minutes
# Set content and wait for loading
page.set_content(html_content, wait_until='networkidle')
# Additional waits for content
page.wait_for_load_state('networkidle')
page.wait_for_load_state('domcontentloaded')
# Generate PDF
page.pdf(path=output_pdf, **format_options)
except Exception as e:
raise DocumentationProcessingError(f"Error generating PDF: {str(e)}")
finally:
if 'page' in locals():
page.close()
if 'context' in locals():
context.close()
if 'browser' in locals():
browser.close()
if 'playwright' in locals():
playwright.stop()
def main():
"""Main execution function with error handling"""
repo_dir = "astro-docs"
repo_url = "https://github.com/withastro/docs.git"
branch = "main"
docs_dir = "src/content/docs/en"
output_pdf = f"Astro_Documentation_{datetime.now().strftime('%Y-%m-%d')}.pdf"
temp_dir = None
try:
# Add version and license information to console
print(f"""
Astro Documentation PDF Generator v1.0.0
Copyright (C) 2024 PacNPal
This program comes with ABSOLUTELY NO WARRANTY; for details see the LICENSE file.
This is free software, and you are welcome to redistribute it
under certain conditions; see the LICENSE file for details.
""")
# Create CSS if it doesn't exist
if not os.path.exists('styles.css'):
print("Creating default styles.css...")
create_default_css()
print("Default CSS file created.")
# Create temporary directory for processing
temp_dir = tempfile.mkdtemp()
# Clone repository
print("Cloning Astro documentation repository...")
clone_repo(repo_url, branch, docs_dir, repo_dir)
# Get and sort files
print("Finding and sorting documentation files...")
docs_dir_full_path = os.path.join(repo_dir, docs_dir)
files_to_process = get_files_sorted(docs_dir_full_path)
print(f"Found {len(files_to_process)} files to process")
# Create cover page
with open('styles.css', 'r', encoding='utf8') as f:
css_content = f.read()
cover_html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<style>{css_content}</style>
</head>
<body>
<div class="master-container">
<div class="container">
<div class="title">Astro Documentation</div>
<div class="date">Generated on {datetime.now().strftime('%Y-%m-%d')}</div>
</div>
</div>
</body>
</html>
"""
# Process files and generate HTML
print("Processing documentation files...")
html_content = process_files(files_to_process, repo_dir, docs_dir)
# Combine cover and content
final_html = f"{cover_html}<div class='page-break'></div>{html_content}"
final_html = add_license_page(final_html)
# Generate PDF
print(f"Generating PDF: {output_pdf}")
generate_pdf(final_html, output_pdf)
print(f"Documentation successfully generated: {output_pdf}")
except DocumentationProcessingError as e:
print(f"Error: {e}")
return 1
except Exception as e:
print(f"Unexpected error: {e}")
print("This program is licensed under AGPL-3.0. Source code is available at: [Your Repository URL]")
return 1
finally:
# Cleanup
if temp_dir and os.path.exists(temp_dir):
shutil.rmtree(temp_dir, ignore_errors=True)
if __name__ == "__main__":
exit(main())

View File

@@ -1,440 +0,0 @@
"""
Nextjs Documentation PDF Generator
Modified version of Docs-Exporter for Astro documentation
Original work Copyright (C) 2024 Riyooo
Modified work Copyright (C) 2024 PacNPal
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Modifications:
- Replaced wkhtmltopdf with Playwright for PDF generation
- Improved error handling and reporting
- Added support for custom headers, footers, and styles in the generated PDFs.
- Enhanced error handling with `try-except` blocks, especially for frontmatter parsing and file operations.
"""
import os
import markdown
import tempfile
import yaml
import re
import html
from git import Repo, RemoteProgress
from datetime import datetime
from packaging import version
from tqdm import tqdm
from playwright.sync_api import sync_playwright
def get_license_notice():
"""Return a formatted license notice for inclusion in output"""
return """
This PDF was generated by Nextjs Documentation PDF Generator
Original work Copyright (C) 2024 Riyooo
Modified work Copyright (C) 2024 PacNPal
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License version 3.
Source code is available at: https://github.com/pacnpal/Docs-Exporter
"""
def add_license_page(html_content):
"""Add a license notice page to the HTML content"""
license_html = f"""
<div class="license-notice" style="page-break-before: always;">
<h2>License Notice</h2>
<pre style="white-space: pre-wrap; font-family: monospace;">
{get_license_notice()}
</pre>
<p>Complete source code for this program is available at: https://github.com/pacnpal/Docs-Exporter</p>
<p>This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you
are welcome to redistribute it under certain conditions. See the GNU Affero General
Public License version 3 for details.</p>
</div>
"""
return html_content + license_html
def process_image_paths(md_content):
# Define a regular expression pattern to find image tags
pattern = r'src(?:Light|Dark)="(.*?)"'
# Function to replace the relative path with an absolute path
def replace(match):
relative_path = match.group(1)
absolute_path = f'{base_path}{relative_path}{path_args}'
# Print the original and new image URLs for debugging
return f'src="{absolute_path}"'
# Use the sub method to replace all occurrences
return re.sub(pattern, replace, md_content)
def preprocess_code_blocks(md_content):
# Regular expression to match extended code blocks with filename and language
pattern = r'```(\w+)?\s+filename="([^"]+)"\s*(switcher)?\n(.*?)```'
def replace(match):
language = match.group(1) if match.group(1) else ''
filename = match.group(2)
code_block = match.group(4)
# Format the header with filename and language
header = f'<div class="code-header"><i>{filename} ({language})</i></div>' if language else f'<div class="code-header"><i>{filename}</i></div>'
return f'{header}\n```{language}\n{code_block}\n```'
# Replace all occurrences in the content
return re.sub(pattern, replace, md_content, flags=re.DOTALL)
def safe_load_frontmatter(frontmatter_content):
try:
return yaml.safe_load(frontmatter_content)
except yaml.YAMLError:
return None
def preprocess_mdx_content(md_content):
# Replace HTML tags in frontmatter
md_content = re.sub(r'<(/?\w+)>', lambda m: html.escape(m.group(0)), md_content)
return md_content
def parse_frontmatter(md_content):
lines = md_content.split('\n')
if lines[0].strip() == '---':
end_of_frontmatter = lines.index('---', 1)
frontmatter = '\n'.join(lines[1:end_of_frontmatter])
content = '\n'.join(lines[end_of_frontmatter + 1:])
return frontmatter, content
return None, md_content
class CloneProgress(RemoteProgress):
def __init__(self):
super().__init__()
self.pbar = tqdm()
def update(self, op_code, cur_count, max_count=None, message=''):
if max_count is not None:
self.pbar.total = max_count
self.pbar.update(cur_count - self.pbar.n)
def finalize(self):
self.pbar.close()
def clone_repo(repo_url, branch, docs_dir, repo_dir):
if not os.path.isdir(repo_dir):
os.makedirs(repo_dir, exist_ok=True)
print("Cloning repository...")
repo = Repo.init(repo_dir)
with repo.config_writer() as git_config:
git_config.set_value("core", "sparseCheckout", "true")
with open(os.path.join(repo_dir, ".git/info/sparse-checkout"), "w") as sparse_checkout_file:
sparse_checkout_file.write(f"/{docs_dir}\n")
origin = repo.create_remote("origin", repo_url)
origin.fetch(progress=CloneProgress())
repo.git.checkout(branch)
print("Repository cloned.")
else:
print("Repository already exists. Updating...")
repo = Repo(repo_dir)
origin = repo.remotes.origin
origin.fetch(progress=CloneProgress())
repo.git.checkout(branch)
origin.pull(progress=CloneProgress())
print("Repository updated.")
def is_file_open(file_path):
if not os.path.exists(file_path):
return False
try:
with open(file_path, 'a'):
pass
return False
except PermissionError:
return True
def get_files_sorted(root_dir):
all_files = []
for root, _, files in os.walk(root_dir):
for file in files:
full_path = os.path.join(root, file)
modified_basename = '!!!' + file if file in ['index.mdx', 'index.md'] else file
sort_key = os.path.join(root, modified_basename)
all_files.append((full_path, sort_key))
all_files.sort(key=lambda x: x[1])
return [full_path for full_path, _ in all_files]
def preprocess_frontmatter(frontmatter):
html_tags = {}
def replace_tag(match):
tag = match.group(0)
placeholder = f"HTML_TAG_{len(html_tags)}"
html_tags[placeholder] = tag
return placeholder
modified_frontmatter = re.sub(r'<[^>]+>', replace_tag, frontmatter)
return modified_frontmatter, html_tags
def restore_html_tags(parsed_data, html_tags):
if isinstance(parsed_data, dict):
for key, value in parsed_data.items():
if isinstance(value, str):
for placeholder, tag in html_tags.items():
value = value.replace(placeholder, tag)
value = html.escape(value)
parsed_data[key] = value
return parsed_data
def process_files(files, repo_dir, docs_dir):
toc = ""
html_all_pages_content = ""
html_header = f"""
<html>
<head>
<style>
{open('styles.css').read()}
</style>
</head>
<body>
"""
numbering = [0]
for index, file_path in enumerate(files):
with open(file_path, 'r', encoding='utf8') as f:
md_content = f.read()
if Change_img_url:
md_content = process_image_paths(md_content)
md_content = preprocess_code_blocks(md_content)
frontmatter, md_content = parse_frontmatter(md_content)
if frontmatter:
frontmatter, html_tags = preprocess_frontmatter(frontmatter)
data = safe_load_frontmatter(frontmatter)
if data is not None:
data = restore_html_tags(data, html_tags)
rel_path = os.path.relpath(file_path, os.path.join(repo_dir, docs_dir))
depth = rel_path.count(os.sep)
file_basename = os.path.basename(file_path)
if file_basename.startswith("index.") and depth > 0:
depth += -1
indent = '&nbsp;' * 5 * depth
while len(numbering) <= depth:
numbering.append(0)
numbering[depth] += 1
for i in range(depth + 1, len(numbering)):
numbering[i] = 0
toc_numbering = f"{'.'.join(map(str, numbering[:depth + 1]))}"
toc_title = data.get('title', os.path.splitext(os.path.basename(file_path))[0].title())
toc_full_title = f"{toc_numbering} - {toc_title}"
toc += f"{indent}<a href='#{toc_full_title}'>{toc_full_title}</a><br/>"
html_page_content = f"""
<h1>{toc_full_title}</h1>
<div class="doc-path"><p>Documentation path: {file_path.replace(chr(92),'/').replace('.mdx', '').replace(repo_dir + '/' + docs_dir,'')}</p></div>
<p><strong>Description:</strong> {data.get('description', 'No description')}</p>
"""
if data.get('related', {}):
html_page_content += f"""
<div style="margin-left:20px;">
<p><strong>Related:</strong></p>
<p><strong>Title:</strong> {data.get('related', {}).get('title', 'Related')}</p>
<p><strong>Related Description:</strong> {data.get('related', {}).get('description', 'No related description')}</p>
<p><strong>Links:</strong></p>
<ul>
{''.join([f'<li>{link}</li>' for link in data.get('related', {}).get('links', [])])}
</ul>
</div>
"""
html_page_content += '</br>'
else:
html_page_content = ""
else:
html_page_content = ""
html_page_content += markdown.markdown(md_content, extensions=['fenced_code', 'codehilite', 'tables', 'footnotes', 'toc', 'abbr', 'attr_list', 'def_list', 'smarty', 'admonition'])
html_all_pages_content += html_page_content
if index < len(files) - 1:
html_all_pages_content += '<div class="page-break"></div>'
toc_html = f"""<div style="padding-bottom: 10px"><div style="padding-bottom: 20px"><h1>Table of Contents</h1></div>{toc}</div><div style="page-break-before: always;">"""
html_all_content = toc_html + html_all_pages_content
html_all_pages_content = html_header + html_all_pages_content + "</body></html>"
toc_html = html_header + toc_html + "</body></html>"
html_all_content = html_header + html_all_content + "</body></html>"
return(html_all_content, toc_html, html_all_pages_content)
def find_latest_version(html_content):
version_pattern = re.compile(r"v(\d+\.\d+\.\d+)")
versions = version_pattern.findall(html_content)
unique_versions = sorted(set(versions), key=lambda v: version.parse(v), reverse=True)
return unique_versions[0] if unique_versions else None
def generate_pdf(html_content, output_pdf, format_options=None):
"""
Generate PDF from HTML content using Playwright
"""
default_format = {
'format': 'A4',
'margin': {
'top': '50px',
'right': '50px',
'bottom': '50px',
'left': '50px'
},
'print_background': True,
'display_header_footer': True,
'header_template': '<div style="font-size: 10px; text-align: right; width: 100%; padding-right: 20px; margin-top: 20px;"><span class="pageNumber"></span> of <span class="totalPages"></span></div>',
'footer_template': '<div style="font-size: 10px; text-align: center; width: 100%; margin-bottom: 20px;"><span class="url"></span></div>'
}
format_options = format_options or default_format
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
# Set viewport size to ensure consistent rendering
page.set_viewport_size({"width": 1280, "height": 1024})
# Set content and wait for network idle
page.set_content(html_content, wait_until='networkidle')
# Wait for any images and fonts to load
page.wait_for_load_state('networkidle')
page.wait_for_load_state('domcontentloaded')
# Generate PDF
page.pdf(path=output_pdf, **format_options)
browser.close()
if __name__ == "__main__":
export_html = False
repo_dir = "nextjs-docs"
repo_url = "https://github.com/vercel/next.js.git"
branch = "canary"
docs_dir = "docs"
Change_img_url = True
base_path = "https://nextjs.org/_next/image?url="
path_args = "&w=1920&q=75"
clone_repo(repo_url, branch, docs_dir, repo_dir)
print(f"""
Nextjs Documentation PDF Generator v1.0.0
Copyright (C) 2024 PacNPal
This program comes with ABSOLUTELY NO WARRANTY; for details see the LICENSE file.
This is free software, and you are welcome to redistribute it
under certain conditions; see the LICENSE file for details.
""")
print("Converting the Documentation to HTML...")
docs_dir_full_path = os.path.join(repo_dir, docs_dir)
files_to_process = get_files_sorted(docs_dir_full_path)
html_all_content, _, _ = process_files(files_to_process, repo_dir, docs_dir)
print("Converted all MDX to HTML.")
if export_html:
with open('output.html', 'w', encoding='utf8') as f:
f.write(html_all_content)
print("HTML Content exported.")
latest_version = find_latest_version(html_all_content)
if latest_version:
project_title = f"""Next.js Documentation v{latest_version}"""
output_pdf = f"""Next.js_Docs_v{latest_version}_{datetime.now().strftime("%Y-%m-%d")}.pdf"""
else:
project_title = "Next.js Documentation"
output_pdf = "Next.js_Documentation.pdf"
cover_html = f"""
<html>
<head>
<style>
{open('styles.css').read()}
</style>
</head>
<body>
<div class="master-container">
<div class="container">
<div class="title">{project_title}</div>
<div class="date">Date: {datetime.now().strftime("%Y-%m-%d")}</div>
</div>
</div>
</body>
</html>
"""
format_options = {
'format': 'A4',
'margin': {
'top': '50px',
'right': '50px',
'bottom': '50px',
'left': '50px'
},
'print_background': True,
'display_header_footer': True,
'header_template': f'''
<div style="font-size: 10px; padding: 10px 20px; margin-top: 20px;">
<span style="float: left;">{project_title}</span>
<span style="float: right;">Page <span class="pageNumber"></span> of <span class="totalPages"></span></span>
</div>
''',
'footer_template': f'''
<div style="font-size: 10px; padding: 10px 20px; margin-bottom: 20px; text-align: center;">
Generated on {datetime.now().strftime("%Y-%m-%d")}
</div>
'''
}
# Check if file is open
if is_file_open(output_pdf):
print("The output file is already open in another process. Please close it and try again.")
else:
try:
print("Generating PDF...")
# Generate PDF with cover page and content
html_all_content = add_license_page(html_all_content)
generate_pdf(cover_html + html_all_content, output_pdf, format_options)
print("Created the PDF file successfully.")
except Exception as e:
print(f"Error generating PDF: {str(e)}")