"""
Nextjs Documentation PDF Generator
Modified version of Docs-Exporter for Astro documentation
Original work Copyright (C) 2024 Riyooo
Modified work Copyright (C) 2024 PacFactory
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Modifications:
- Replaced wkhtmltopdf with Playwright for PDF generation
- Improved error handling and reporting
- Added support for custom headers, footers, and styles in the generated PDFs.
- Enhanced error handling with `try-except` blocks, especially for frontmatter parsing and file operations.
"""
import os
import markdown
import tempfile
import yaml
import re
import html
from git import Repo, RemoteProgress
from datetime import datetime
from packaging import version
from tqdm import tqdm
from playwright.sync_api import sync_playwright
def get_license_notice():
    """Build the AGPL license notice text embedded in generated output."""
    notice = """
This PDF was generated by Nextjs Documentation PDF Generator
Original work Copyright (C) 2024 Riyooo
Modified work Copyright (C) 2024 PacFactory
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License version 3.
Source code is available at: https://github.com/PacFactory/Docs-Exporter
"""
    return notice
def add_license_page(html_content):
    """Append a license-notice section to the end of the given HTML document."""
    notice_block = f"""
License Notice
{get_license_notice()}
Complete source code for this program is available at: https://github.com/PacFactory/Docs-Exporter
This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you
are welcome to redistribute it under certain conditions. See the GNU Affero General
Public License version 3 for details.
"""
    return html_content + notice_block
def process_image_paths(md_content, base=None, args=None):
    """Rewrite srcLight/srcDark image attributes to absolute src="..." URLs.

    Args:
        md_content: Markdown/MDX text containing srcLight="..." or
            srcDark="..." attributes with relative paths.
        base: URL prefix to prepend. Defaults to the module-level
            ``base_path`` setting (presumably defined in the config section
            of this file — the original read it as a global).
        args: Query-string suffix appended after the path. Defaults to the
            module-level ``path_args`` setting.

    Returns:
        The content with every srcLight/srcDark attribute replaced by a
        plain src attribute pointing at the absolute URL.
    """
    # Fall back to the module-level settings the original relied on, so
    # existing single-argument callers behave identically.
    prefix = base_path if base is None else base
    suffix = path_args if args is None else args
    pattern = r'src(?:Light|Dark)="(.*?)"'

    def replace(match):
        relative_path = match.group(1)
        return f'src="{prefix}{relative_path}{suffix}"'

    return re.sub(pattern, replace, md_content)
def preprocess_code_blocks(md_content):
    """Normalize extended fenced code blocks (filename=... metadata) to plain fences."""
    fence_re = r'```(\w+)?\s+filename="([^"]+)"\s*(switcher)?\n(.*?)```'

    def rebuild(match):
        lang = match.group(1) or ''
        code = match.group(4)
        # NOTE(review): both branches produce an empty header and the
        # filename captured in group(2) is discarded — markup may have been
        # lost upstream. Behavior is preserved exactly as-is.
        header = f'' if lang else f''
        return f'{header}\n```{lang}\n{code}\n```'

    # DOTALL lets the body capture span multiple lines.
    return re.sub(fence_re, rebuild, md_content, flags=re.DOTALL)
def safe_load_frontmatter(frontmatter_content):
    """Parse YAML frontmatter, returning None instead of raising on bad input."""
    try:
        parsed = yaml.safe_load(frontmatter_content)
    except yaml.YAMLError:
        return None
    return parsed
def preprocess_mdx_content(md_content):
    """HTML-escape bare tags like <Tag> or </Tag> so they survive markdown rendering."""
    bare_tag = re.compile(r'<(/?\w+)>')
    return bare_tag.sub(lambda m: html.escape(m.group(0)), md_content)
def parse_frontmatter(md_content):
    """Split a leading '---' YAML frontmatter block from the body.

    Args:
        md_content: Raw markdown/MDX text.

    Returns:
        A (frontmatter, body) tuple. ``frontmatter`` is the text between the
        two '---' delimiters (without them), or None when the content has no
        frontmatter — including the malformed case where the opening '---'
        is never closed (the original raised ValueError there).
    """
    lines = md_content.split('\n')
    if lines[0].strip() != '---':
        return None, md_content
    try:
        # Find the closing delimiter; a missing one means malformed
        # frontmatter, which we treat as "no frontmatter" rather than crash.
        end = lines.index('---', 1)
    except ValueError:
        return None, md_content
    frontmatter = '\n'.join(lines[1:end])
    body = '\n'.join(lines[end + 1:])
    return frontmatter, body
class CloneProgress(RemoteProgress):
    """git fetch/pull progress reporter backed by a tqdm progress bar."""

    def __init__(self):
        super().__init__()
        self.pbar = tqdm()

    def update(self, op_code, cur_count, max_count=None, message=''):
        # Ignore callbacks that carry no total; otherwise advance the bar
        # by the delta since the last reported count.
        if max_count is None:
            return
        self.pbar.total = max_count
        self.pbar.update(cur_count - self.pbar.n)

    def finalize(self):
        self.pbar.close()
def clone_repo(repo_url, branch, docs_dir, repo_dir):
    """Sparse-clone only `docs_dir` from `repo_url` into `repo_dir`, or update an existing clone."""
    if os.path.isdir(repo_dir):
        # Existing checkout: refresh it in place.
        print("Repository already exists. Updating...")
        repo = Repo(repo_dir)
        origin = repo.remotes.origin
        origin.fetch(progress=CloneProgress())
        repo.git.checkout(branch)
        origin.pull(progress=CloneProgress())
        print("Repository updated.")
        return
    os.makedirs(repo_dir, exist_ok=True)
    print("Cloning repository...")
    repo = Repo.init(repo_dir)
    # Enable sparse checkout so only the docs directory is materialized.
    with repo.config_writer() as git_config:
        git_config.set_value("core", "sparseCheckout", "true")
    sparse_path = os.path.join(repo_dir, ".git/info/sparse-checkout")
    with open(sparse_path, "w") as sparse_file:
        sparse_file.write(f"/{docs_dir}\n")
    origin = repo.create_remote("origin", repo_url)
    origin.fetch(progress=CloneProgress())
    repo.git.checkout(branch)
    print("Repository cloned.")
def is_file_open(file_path):
    """Best-effort check for whether another process holds `file_path` open.

    Attempts to open the file for appending; a PermissionError (e.g. a
    Windows exclusive lock) indicates the file is in use. A missing file
    counts as not open.
    """
    if not os.path.exists(file_path):
        return False
    try:
        handle = open(file_path, 'a')
    except PermissionError:
        return True
    handle.close()
    return False
def get_files_sorted(root_dir):
    """Walk `root_dir` and return file paths sorted so index files lead their folder.

    index.md / index.mdx get a '!!!' prefix on their basename in the sort
    key only, which orders them before sibling files; the returned paths
    are the real, unmodified paths.
    """
    keyed = []
    for dirpath, _, filenames in os.walk(root_dir):
        for name in filenames:
            key_name = '!!!' + name if name in ('index.mdx', 'index.md') else name
            keyed.append((os.path.join(dirpath, name), os.path.join(dirpath, key_name)))
    return [path for path, _ in sorted(keyed, key=lambda pair: pair[1])]
def preprocess_frontmatter(frontmatter):
    """Swap HTML tags in frontmatter for placeholders so YAML parsing succeeds.

    Returns the modified text plus a placeholder->tag mapping for later
    restoration via restore_html_tags().
    """
    tag_map = {}

    def stash(match):
        key = f"HTML_TAG_{len(tag_map)}"
        tag_map[key] = match.group(0)
        return key

    return re.sub(r'<[^>]+>', stash, frontmatter), tag_map
def restore_html_tags(parsed_data, html_tags):
    """Put stashed HTML tags back into string values, then HTML-escape them.

    Only top-level string values of a dict are touched; nested structures
    and non-string values pass through unchanged. Non-dict input is
    returned as-is.
    """
    if not isinstance(parsed_data, dict):
        return parsed_data
    for key in parsed_data:
        value = parsed_data[key]
        if not isinstance(value, str):
            continue
        for placeholder, tag in html_tags.items():
            value = value.replace(placeholder, tag)
        parsed_data[key] = html.escape(value)
    return parsed_data
def process_files(files, repo_dir, docs_dir):
toc = ""
html_all_pages_content = ""
html_header = f"""
"""
numbering = [0]
for index, file_path in enumerate(files):
with open(file_path, 'r', encoding='utf8') as f:
md_content = f.read()
if Change_img_url:
md_content = process_image_paths(md_content)
md_content = preprocess_code_blocks(md_content)
frontmatter, md_content = parse_frontmatter(md_content)
if frontmatter:
frontmatter, html_tags = preprocess_frontmatter(frontmatter)
data = safe_load_frontmatter(frontmatter)
if data is not None:
data = restore_html_tags(data, html_tags)
rel_path = os.path.relpath(file_path, os.path.join(repo_dir, docs_dir))
depth = rel_path.count(os.sep)
file_basename = os.path.basename(file_path)
if file_basename.startswith("index.") and depth > 0:
depth += -1
indent = ' ' * 5 * depth
while len(numbering) <= depth:
numbering.append(0)
numbering[depth] += 1
for i in range(depth + 1, len(numbering)):
numbering[i] = 0
toc_numbering = f"{'.'.join(map(str, numbering[:depth + 1]))}"
toc_title = data.get('title', os.path.splitext(os.path.basename(file_path))[0].title())
toc_full_title = f"{toc_numbering} - {toc_title}"
toc += f"{indent}{toc_full_title}
"
html_page_content = f"""
{toc_full_title}
Documentation path: {file_path.replace(chr(92),'/').replace('.mdx', '').replace(repo_dir + '/' + docs_dir,'')}
Description: {data.get('description', 'No description')}
"""
if data.get('related', {}):
html_page_content += f"""
Related:
Title: {data.get('related', {}).get('title', 'Related')}
Related Description: {data.get('related', {}).get('description', 'No related description')}
Links:
{''.join([f'- {link}
' for link in data.get('related', {}).get('links', [])])}
"""
html_page_content += ''
else:
html_page_content = ""
else:
html_page_content = ""
html_page_content += markdown.markdown(md_content, extensions=['fenced_code', 'codehilite', 'tables', 'footnotes', 'toc', 'abbr', 'attr_list', 'def_list', 'smarty', 'admonition'])
html_all_pages_content += html_page_content
if index < len(files) - 1:
html_all_pages_content += ''
toc_html = f""""""
html_all_content = toc_html + html_all_pages_content
html_all_pages_content = html_header + html_all_pages_content + ""
toc_html = html_header + toc_html + "