# Files
# Docs-Exporter-Nextjs/export-docs.py
# 2024-05-17 01:44:30 +02:00
#
# 392 lines
# 15 KiB
# Python

import os
import markdown
import pdfkit
import tempfile
import yaml
import re
import html
from git import Repo, RemoteProgress
from datetime import datetime
from packaging import version
from tqdm import tqdm
def process_image_paths(md_content):
    """Rewrite themed image attributes into absolute image URLs.

    Each ``srcLight="..."`` or ``srcDark="..."`` attribute found in
    *md_content* is replaced by a plain ``src="..."`` attribute whose value is
    the captured path placed between the module-level ``base_path`` and
    ``path_args`` strings.

    Returns the rewritten text.
    """
    themed_src = re.compile(r'src(?:Light|Dark)="(.*?)"')

    def to_absolute(found):
        # base_path and path_args are module globals assigned in the script's
        # entry-point section.
        return f'src="{base_path}{found.group(1)}{path_args}"'

    return themed_src.sub(to_absolute, md_content)
def preprocess_code_blocks(md_content):
    """Turn extended code fences into a header line plus a standard fence.

    A fence of the form ```` ```lang filename="name" [switcher] ```` is
    rewritten as a ``<div class="code-header">`` element showing the filename
    (and the language, when one is given), followed by a regular fenced code
    block carrying only the language. Fences without a ``filename`` attribute
    are left untouched.
    """
    extended_fence = re.compile(
        r'```(\w+)?\s+filename="([^"]+)"\s*(switcher)?\n(.*?)```',
        re.DOTALL,
    )

    def rebuild(found):
        lang, file_name, _switcher, body = found.groups()
        lang = lang or ''
        if lang:
            label = f'{file_name} ({lang})'
        else:
            label = file_name
        banner = f'<div class="code-header"><i>{label}</i></div>'
        return f'{banner}\n```{lang}\n{body}\n```'

    return extended_fence.sub(rebuild, md_content)
def safe_load_frontmatter(frontmatter_content):
    """Parse frontmatter text as YAML.

    Returns whatever ``yaml.safe_load`` produces, or ``None`` when the text is
    not valid YAML (``yaml.YAMLError``).
    """
    try:
        parsed = yaml.safe_load(frontmatter_content)
    except yaml.YAMLError:
        parsed = None
    return parsed
def preprocess_mdx_content(md_content):
    """HTML-escape simple tags in *md_content*.

    Only attribute-less tags of the form ``<name>`` or ``</name>`` are
    matched; each one is replaced by its ``html.escape``d text. Everything
    else is returned unchanged.
    """
    simple_tag = re.compile(r'<(/?\w+)>')

    def escape_tag(found):
        return html.escape(found.group(0))

    return simple_tag.sub(escape_tag, md_content)
def parse_frontmatter(md_content):
    """Split a document into its frontmatter text and its remaining content.

    The document has frontmatter when its first line is ``---`` (surrounding
    whitespace ignored) and a later line is ``---`` (trailing whitespace
    ignored).

    Returns:
        ``(frontmatter, content)`` where *frontmatter* is the text between the
        two delimiter lines and *content* is everything after the closing
        delimiter. When there is no opening delimiter, or the closing
        delimiter is missing, returns ``(None, md_content)`` instead of
        raising.
    """
    lines = md_content.split('\n')
    if lines[0].strip() != '---':
        return None, md_content

    for position in range(1, len(lines)):
        # Only trailing whitespace is ignored so that an indented '---'
        # inside a YAML block is not mistaken for the closing delimiter.
        if lines[position].rstrip() == '---':
            frontmatter = '\n'.join(lines[1:position])
            content = '\n'.join(lines[position + 1:])
            return frontmatter, content

    # Opening delimiter without a closing one: treat the file as plain content.
    return None, md_content
class CloneProgress(RemoteProgress):
    """Git progress handler that mirrors remote progress onto a tqdm bar."""

    def __init__(self):
        super().__init__()
        self.pbar = tqdm()

    def update(self, op_code, cur_count, max_count=None, message=''):
        """Move the bar to *cur_count*, adopting *max_count* as the total when given."""
        if max_count is not None:
            self.pbar.total = max_count
        # tqdm advances by increments, so feed it the distance from its
        # current position to the reported count.
        step = cur_count - self.pbar.n
        self.pbar.update(step)

    def finalize(self):
        """Close the progress bar."""
        self.pbar.close()
def clone_repo(repo_url, branch, docs_dir, repo_dir):
    """Fetch one directory of a repository branch into *repo_dir*.

    If *repo_dir* already exists it is opened as a repository, fetched,
    switched to *branch* and pulled. Otherwise a new repository is initialised
    there with sparse checkout restricted to ``/<docs_dir>``, an ``origin``
    remote pointing at *repo_url* is created and fetched, and *branch* is
    checked out.
    """
    if os.path.isdir(repo_dir):
        # Existing working copy: just bring it up to date.
        print("Repository already exists. Updating...")
        existing = Repo(repo_dir)
        remote = existing.remotes.origin
        remote.fetch(progress=CloneProgress())
        existing.git.checkout(branch)
        remote.pull(progress=CloneProgress())
        print("Repository updated.")
        return

    # Fresh working copy configured for sparse checkout.
    os.makedirs(repo_dir, exist_ok=True)
    print("Cloning repository...")
    fresh = Repo.init(repo_dir)
    with fresh.config_writer() as git_config:
        git_config.set_value("core", "sparseCheckout", "true")

    # Restrict the checkout to the documentation directory.
    sparse_path = os.path.join(repo_dir, ".git/info/sparse-checkout")
    with open(sparse_path, "w") as sparse_checkout_file:
        sparse_checkout_file.write(f"/{docs_dir}\n")

    remote = fresh.create_remote("origin", repo_url)
    remote.fetch(progress=CloneProgress())
    fresh.git.checkout(branch)
    print("Repository cloned.")
def is_file_open(file_path):
    """Report whether *file_path* appears to be locked by another program.

    Returns ``False`` when the file does not exist or can be opened in append
    mode, and ``True`` when opening it raises ``PermissionError``.
    """
    if os.path.exists(file_path):
        try:
            # Append mode leaves the existing contents untouched.
            open(file_path, 'a').close()
        except PermissionError:
            # Most likely held open elsewhere.
            return True
    return False
def get_files_sorted(root_dir):
    """List every file under *root_dir* in document order.

    Paths are ordered by their full path string, except that ``index.mdx`` and
    ``index.md`` are ranked ahead of their siblings by prefixing their name
    with ``!!!`` in the sort key.

    Returns the list of full paths.
    """
    index_names = ['index.mdx', 'index.md']
    keyed_paths = []
    for folder, _, names in os.walk(root_dir):
        for name in names:
            ranked_name = '!!!' + name if name in index_names else name
            keyed_paths.append(
                (os.path.join(folder, ranked_name), os.path.join(folder, name))
            )

    # Sort on the key only, so entries with equal keys keep their walk order.
    keyed_paths.sort(key=lambda pair: pair[0])
    return [path for _, path in keyed_paths]
def preprocess_frontmatter(frontmatter):
    """Swap every ``<...>`` tag in *frontmatter* for a numbered placeholder.

    Returns:
        ``(text, html_tags)`` where *text* is the frontmatter with each tag
        replaced by ``HTML_TAG_<n>`` and *html_tags* maps each placeholder to
        the tag it stands for.
    """
    html_tags = {}

    def stash(found):
        # Number placeholders by how many tags have been stored so far.
        token = f"HTML_TAG_{len(html_tags)}"
        html_tags[token] = found.group(0)
        return token

    text = re.sub(r'<[^>]+>', stash, frontmatter)
    return text, html_tags
def restore_html_tags(parsed_data, html_tags):
    """Put the original HTML tags back into parsed frontmatter and escape them.

    Every string value in *parsed_data* has its ``HTML_TAG_<n>`` placeholders
    replaced by the tag recorded in *html_tags*, and is then passed through
    ``html.escape`` so the tags appear literally when embedded in HTML.
    Strings nested inside dicts and lists are processed as well.

    Args:
        parsed_data: The parsed frontmatter. Only a dict is processed; any
            other value is returned unchanged.
        html_tags: Mapping of placeholder to original tag text.

    Returns:
        *parsed_data* itself, updated in place.
    """
    # Match the whole number so that HTML_TAG_1 is never substituted inside
    # HTML_TAG_10 (which sequential str.replace calls would do).
    placeholder_re = re.compile(r'HTML_TAG_\d+')

    def restore_text(text):
        restored = placeholder_re.sub(
            lambda found: html_tags.get(found.group(0), found.group(0)),
            text,
        )
        return html.escape(restored)

    def restore(node):
        if isinstance(node, str):
            return restore_text(node)
        if isinstance(node, dict):
            # Reassigning existing keys is safe while iterating items().
            for key, value in node.items():
                node[key] = restore(value)
        elif isinstance(node, list):
            for position, item in enumerate(node):
                node[position] = restore(item)
        return node

    if isinstance(parsed_data, dict):
        restore(parsed_data)
    return parsed_data
def _render_related_html(related):
    """Render the frontmatter ``related`` mapping as an indented HTML block."""
    link_items = ''.join(f'<li>{link}</li>' for link in (related.get('links') or []))
    return (
        '\n<div style="margin-left:20px;">\n'
        '<p><strong>Related:</strong></p>\n'
        f"<p><strong>Title:</strong> {related.get('title', 'Related')}</p>\n"
        f"<p><strong>Related Description:</strong> {related.get('description', 'No related description')}</p>\n"
        '<p><strong>Links:</strong></p>\n'
        '<ul>\n'
        f'{link_items}\n'
        '</ul>\n'
        '</div>\n'
    )


def process_files(files, repo_dir, docs_dir):
    """Convert the documentation files into combined HTML documents.

    Each file is read as UTF-8, has its image paths (when the module-level
    ``Change_img_url`` flag is set) and extended code fences preprocessed, and
    is split into frontmatter and Markdown. Files whose frontmatter parses to
    a mapping get a numbered heading, a table-of-contents entry, their
    documentation path, description and optional "related" block; the Markdown
    body is then converted to HTML and appended. Pages are separated by a
    ``page-break`` div.

    Args:
        files: Ordered list of file paths to convert.
        repo_dir: Directory of the local repository checkout.
        docs_dir: Documentation directory inside *repo_dir*; used to compute
            each file's depth and displayed path.

    Returns:
        A tuple ``(html_all_content, toc_html, html_all_pages_content)``:
        the table of contents followed by all pages, the table of contents
        alone, and all pages alone — each wrapped in a full HTML document that
        inlines ``styles.css`` from the current working directory.
    """
    # Inline the stylesheet; read it through a context manager so the handle
    # is closed, and as UTF-8 to match the encoding of the generated HTML.
    with open('styles.css', 'r', encoding='utf8') as css_file:
        stylesheet = css_file.read()
    html_header = (
        "\n<html>\n<head>\n<style>\n"
        f"{stylesheet}\n"
        "</style>\n</head>\n<body>\n"
    )

    toc_entries = []
    page_chunks = []
    numbering = [0]  # One counter per depth level, starting with the first
    last_index = len(files) - 1
    docs_root = os.path.join(repo_dir, docs_dir)

    for index, file_path in enumerate(files):
        with open(file_path, 'r', encoding='utf8') as f:
            md_content = f.read()

        if Change_img_url:
            md_content = process_image_paths(md_content)
        md_content = preprocess_code_blocks(md_content)
        frontmatter, md_content = parse_frontmatter(md_content)

        data = None
        if frontmatter:
            # Shield HTML tags from the YAML parser, then parse.
            frontmatter, html_tags = preprocess_frontmatter(frontmatter)
            data = safe_load_frontmatter(frontmatter)

        html_page_content = ""
        # Frontmatter that is invalid, or valid YAML but not a mapping (a bare
        # scalar or list), yields a page without a generated header instead of
        # crashing on data.get().
        if isinstance(data, dict):
            data = restore_html_tags(data, html_tags)

            # Depth: number of path separators below the docs root; an index
            # file counts as belonging to its parent level.
            rel_path = os.path.relpath(file_path, docs_root)
            depth = rel_path.count(os.sep)
            if os.path.basename(file_path).startswith("index.") and depth > 0:
                depth -= 1
            indent = '&nbsp;' * 5 * depth

            # Numbering: grow to the needed depth, bump this level, reset
            # every deeper level.
            while len(numbering) <= depth:
                numbering.append(0)
            numbering[depth] += 1
            for level in range(depth + 1, len(numbering)):
                numbering[level] = 0
            toc_numbering = '.'.join(map(str, numbering[:depth + 1]))

            default_title = os.path.splitext(os.path.basename(file_path))[0].title()
            toc_title = data.get('title', default_title)
            toc_full_title = f"{toc_numbering} - {toc_title}"

            # The heading carries an id derived from the (unique) section
            # number so that the table-of-contents link has a real target.
            anchor = f"doc-section-{toc_numbering.replace('.', '-')}"
            toc_entries.append(f"{indent}<a href='#{anchor}'>{toc_full_title}</a><br/>")

            doc_path = file_path.replace(chr(92), '/').replace('.mdx', '').replace(repo_dir + '/' + docs_dir, '')
            html_page_content = (
                f'\n<h1 id="{anchor}">{toc_full_title}</h1>\n'
                f'<div class="doc-path"><p>Documentation path: {doc_path}</p></div>\n'
                f"<p><strong>Description:</strong> {data.get('description', 'No description')}</p>\n"
            )

            related = data.get('related')
            if related and isinstance(related, dict):
                html_page_content += _render_related_html(related)
            html_page_content += '</br>'

        # Convert Markdown to HTML and append it below the generated header.
        html_page_content += markdown.markdown(md_content, extensions=['fenced_code', 'codehilite', 'tables', 'footnotes', 'toc', 'abbr', 'attr_list', 'def_list', 'smarty', 'admonition'])
        page_chunks.append(html_page_content)

        # Page break after every file except the last.
        if index < last_index:
            page_chunks.append('<div class="page-break"></div>')

    toc = ''.join(toc_entries)
    pages_html = ''.join(page_chunks)

    # The table of contents ends by opening a wrapper div that forces a page
    # break before the pages; it is closed again in the footer below.
    toc_html = f"""<div style="padding-bottom: 10px"><div style="padding-bottom: 20px"><h1>Table of Contents</h1></div>{toc}</div><div style="page-break-before: always;">"""
    footer = "</body></html>"

    html_all_content = html_header + toc_html + pages_html + "</div>" + footer
    html_all_pages_content = html_header + pages_html + footer
    toc_html = html_header + toc_html + "</div>" + footer
    return (html_all_content, toc_html, html_all_pages_content)
def find_latest_version(html_content):
    """Return the highest ``vX.Y.Z`` version mentioned in *html_content*.

    The result is the version string without the leading ``v``, compared with
    ``packaging.version`` ordering, or ``None`` when no version is present.
    """
    mentioned = set(re.findall(r"v(\d+\.\d+\.\d+)", html_content))
    if not mentioned:
        return None
    return max(mentioned, key=lambda candidate: version.parse(candidate))
if __name__ == "__main__":
    # NOTE: the assignments in this section are module-level globals.
    # process_image_paths reads base_path / path_args and process_files reads
    # Change_img_url, so they must stay at module scope (do not move them into
    # a function).

    # Set to True to also write the combined HTML to output.html
    export_html = False

    # Repository and branch holding the documentation
    repo_dir = "nextjs-docs"
    repo_url = "https://github.com/vercel/next.js.git"
    branch = "canary"
    docs_dir = "docs"

    # Base path and quality arguments for the rewritten image URLs
    Change_img_url = True
    base_path = "https://nextjs.org/_next/image?url="
    path_args = "&w=1920&q=75"

    # Clone (or update) the repository
    clone_repo(repo_url, branch, docs_dir, repo_dir)

    # Traverse the docs directory and convert each markdown file to HTML
    print ("Converting the Documentation to HTML...")
    docs_dir_full_path = os.path.join(repo_dir, docs_dir)
    files_to_process = get_files_sorted(docs_dir_full_path)
    html_all_content, _, _ = process_files(files_to_process, repo_dir, docs_dir)
    print("Converted all MDX to HTML.")

    # Save the HTML content to a file for inspection
    if export_html:
        with open('output.html', 'w', encoding='utf8') as f:
            f.write(html_all_content)
        print("HTML Content exported.")

    # Take the date once so the file name and the cover page always agree.
    generated_on = datetime.now().strftime("%Y-%m-%d")

    # Name the document after the latest version mentioned in the content
    latest_version = find_latest_version(html_all_content)
    if latest_version:
        project_title = f"""Next.js Documentation v{latest_version}"""
        output_pdf = f"""Next.js_Docs_v{latest_version}_{generated_on}.pdf"""
    else:
        project_title = "Next.js Documentation"
        output_pdf = "Next.js_Documentation.pdf"

    # Build the cover page with the local stylesheet inlined; read it through
    # a context manager so the file handle is closed.
    with open('styles.css', 'r', encoding='utf8') as css_file:
        cover_css = css_file.read()
    cover_html = (
        "\n<html>\n<head>\n<style>\n"
        f"{cover_css}\n"
        "</style>\n</head>\n<body>\n"
        '<div class="master-container">\n'
        '<div class="container">\n'
        f'<div class="title">{project_title}</div>\n'
        f'<div class="date">Date: {generated_on}</div>\n'
        "</div>\n</div>\n</body>\n</html>\n"
    )

    # Write the cover HTML to a temporary file (kept after close so that the
    # PDF converter can read it by name).
    with tempfile.NamedTemporaryFile(delete=False, suffix='.html') as cover_file:
        cover_file.write(cover_html.encode('utf-8'))
    print("HTML Cover exported.")

    try:
        # Convert the combined HTML content to PDF with a cover and a table of contents
        if is_file_open(output_pdf):
            print("The output file is already open in another process. Please close it and try again.")
        else:
            # Optional flags that can be added: 'no-outline': None, 'no-images': None
            options = {
                'encoding': 'UTF-8',
                'page-size': 'A4',
                'quiet': '',
                'image-dpi': 150,  # General reco.: printer - hq, 300 dpi| ebook - low quality, 150 dpi| screen-view-only quality, 72 dpi
                'image-quality': 75,
            }
            pdfkit.from_string(html_all_content, output_pdf, options=options, cover=cover_file.name, toc={})
            print("Created the PDF file successfully.")
    finally:
        # Always delete the temporary cover file, even if the conversion fails.
        os.unlink(cover_file.name)