mirror of
https://github.com/PacFactory/Docs-Exporter-Nextjs.git
synced 2025-12-20 03:31:05 -05:00
392 lines
15 KiB
Python
392 lines
15 KiB
Python
import os
|
|
import markdown
|
|
import pdfkit
|
|
import tempfile
|
|
import yaml
|
|
import re
|
|
import html
|
|
from git import Repo, RemoteProgress
|
|
from datetime import datetime
|
|
from packaging import version
|
|
from tqdm import tqdm
|
|
|
|
|
|
def process_image_paths(md_content):
|
|
# Define a regular expression pattern to find image tags
|
|
pattern = r'src(?:Light|Dark)="(.*?)"'
|
|
|
|
# Function to replace the relative path with an absolute path
|
|
def replace(match):
|
|
relative_path = match.group(1)
|
|
absolute_path = f'{base_path}{relative_path}{path_args}'
|
|
|
|
# Print the original and new image URLs for debugging
|
|
return f'src="{absolute_path}"'
|
|
|
|
# Use the sub method to replace all occurrences
|
|
return re.sub(pattern, replace, md_content)
|
|
|
|
|
|
def preprocess_code_blocks(md_content):
|
|
# Regular expression to match extended code blocks with filename and language
|
|
pattern = r'```(\w+)?\s+filename="([^"]+)"\s*(switcher)?\n(.*?)```'
|
|
|
|
def replace(match):
|
|
language = match.group(1) if match.group(1) else ''
|
|
filename = match.group(2)
|
|
code_block = match.group(4)
|
|
|
|
# Format the header with filename and language
|
|
header = f'<div class="code-header"><i>{filename} ({language})</i></div>' if language else f'<div class="code-header"><i>{filename}</i></div>'
|
|
|
|
return f'{header}\n```{language}\n{code_block}\n```'
|
|
|
|
# Replace all occurrences in the content
|
|
return re.sub(pattern, replace, md_content, flags=re.DOTALL)
|
|
|
|
|
|
def safe_load_frontmatter(frontmatter_content):
|
|
try:
|
|
return yaml.safe_load(frontmatter_content)
|
|
except yaml.YAMLError:
|
|
return None
|
|
|
|
|
|
def preprocess_mdx_content(md_content):
|
|
# Replace HTML tags in frontmatter
|
|
md_content = re.sub(r'<(/?\w+)>', lambda m: html.escape(m.group(0)), md_content)
|
|
return md_content
|
|
|
|
|
|
def parse_frontmatter(md_content):
|
|
lines = md_content.split('\n')
|
|
if lines[0].strip() == '---':
|
|
end_of_frontmatter = lines.index('---', 1)
|
|
frontmatter = '\n'.join(lines[1:end_of_frontmatter])
|
|
content = '\n'.join(lines[end_of_frontmatter + 1:])
|
|
return frontmatter, content
|
|
return None, md_content
|
|
|
|
|
|
class CloneProgress(RemoteProgress):
|
|
def __init__(self):
|
|
super().__init__()
|
|
self.pbar = tqdm()
|
|
|
|
def update(self, op_code, cur_count, max_count=None, message=''):
|
|
if max_count is not None:
|
|
self.pbar.total = max_count
|
|
self.pbar.update(cur_count - self.pbar.n) # increment the pbar with the increment
|
|
|
|
def finalize(self):
|
|
self.pbar.close()
|
|
|
|
# Clone a specific directory of a repository / branch
|
|
def clone_repo(repo_url, branch, docs_dir, repo_dir):
|
|
# Initialize and configure the repository for sparse checkout
|
|
if not os.path.isdir(repo_dir):
|
|
os.makedirs(repo_dir, exist_ok=True)
|
|
print("Cloning repository...")
|
|
repo = Repo.init(repo_dir)
|
|
with repo.config_writer() as git_config:
|
|
git_config.set_value("core", "sparseCheckout", "true")
|
|
|
|
# Define the sparse checkout settings
|
|
with open(os.path.join(repo_dir, ".git/info/sparse-checkout"), "w") as sparse_checkout_file:
|
|
sparse_checkout_file.write(f"/{docs_dir}\n")
|
|
|
|
# Pull the specific directory from the repository
|
|
origin = repo.create_remote("origin", repo_url)
|
|
origin.fetch(progress=CloneProgress())
|
|
repo.git.checkout(branch)
|
|
print("Repository cloned.")
|
|
|
|
# Update the repository if it already exists
|
|
else:
|
|
print("Repository already exists. Updating...")
|
|
repo = Repo(repo_dir)
|
|
origin = repo.remotes.origin
|
|
origin.fetch(progress=CloneProgress())
|
|
repo.git.checkout(branch)
|
|
origin.pull(progress=CloneProgress())
|
|
print("Repository updated.")
|
|
|
|
|
|
def is_file_open(file_path):
|
|
if not os.path.exists(file_path):
|
|
return False # File does not exist, so it's not open
|
|
|
|
try:
|
|
# Try to open the file in append mode. If the file is open in another program, this might fail
|
|
with open(file_path, 'a'):
|
|
pass
|
|
return False
|
|
except PermissionError:
|
|
# If a PermissionError is raised, it's likely the file is open elsewhere
|
|
return True
|
|
|
|
|
|
def get_files_sorted(root_dir):
|
|
all_files = []
|
|
|
|
# Step 1: Traverse the directory structure
|
|
for root, _, files in os.walk(root_dir):
|
|
for file in files:
|
|
full_path = os.path.join(root, file)
|
|
|
|
# Step 2: Prioritize 'index.mdx' or 'index.md' within the same folder
|
|
modified_basename = '!!!' + file if file in ['index.mdx', 'index.md'] else file
|
|
sort_key = os.path.join(root, modified_basename)
|
|
|
|
# Add tuple to the list
|
|
all_files.append((full_path, sort_key))
|
|
|
|
# Step 3: Perform a global sort based on modified basename
|
|
all_files.sort(key=lambda x: x[1])
|
|
|
|
# Step 4: Return the full paths in sorted order
|
|
return [full_path for full_path, _ in all_files]
|
|
|
|
|
|
def preprocess_frontmatter(frontmatter):
|
|
# Dictionary to store HTML tags and their placeholders
|
|
html_tags = {}
|
|
|
|
# Function to replace HTML tags with placeholders
|
|
def replace_tag(match):
|
|
tag = match.group(0)
|
|
placeholder = f"HTML_TAG_{len(html_tags)}"
|
|
html_tags[placeholder] = tag
|
|
return placeholder
|
|
|
|
# Replace HTML tags with placeholders
|
|
modified_frontmatter = re.sub(r'<[^>]+>', replace_tag, frontmatter)
|
|
|
|
return modified_frontmatter, html_tags
|
|
|
|
|
|
def restore_html_tags(parsed_data, html_tags):
|
|
if isinstance(parsed_data, dict):
|
|
for key, value in parsed_data.items():
|
|
if isinstance(value, str):
|
|
for placeholder, tag in html_tags.items():
|
|
value = value.replace(placeholder, tag)
|
|
# if key == 'title': # Escape HTML characters for titles
|
|
value = html.escape(value)
|
|
parsed_data[key] = value
|
|
return parsed_data
|
|
|
|
|
|
def process_files(files, repo_dir, docs_dir):
|
|
# Initialize the Table of Contents
|
|
toc = ""
|
|
html_all_pages_content = ""
|
|
|
|
# Initialize an empty string to hold all the HTML content & Include the main CSS directly in the HTML
|
|
html_header = f"""
|
|
<html>
|
|
<head>
|
|
<style>
|
|
{open('styles.css').read()}
|
|
</style>
|
|
</head>
|
|
<body>
|
|
"""
|
|
|
|
numbering = [0] # Starting with the first level
|
|
|
|
for index, file_path in enumerate(files):
|
|
with open(file_path, 'r', encoding='utf8') as f:
|
|
md_content = f.read()
|
|
|
|
# Process the markdown content for image paths
|
|
if Change_img_url:
|
|
md_content = process_image_paths(md_content)
|
|
|
|
# Process the markdown content for non standard code blocks
|
|
md_content = preprocess_code_blocks(md_content)
|
|
|
|
# Parse the frontmatter and markdown
|
|
frontmatter, md_content = parse_frontmatter(md_content)
|
|
|
|
if frontmatter:
|
|
# Preprocessing: replaces HTML tags with unique placeholders and stores the mappings
|
|
frontmatter, html_tags = preprocess_frontmatter(frontmatter)
|
|
|
|
# Parse the YAML frontmatter
|
|
data = safe_load_frontmatter(frontmatter)
|
|
if data is not None:
|
|
|
|
# Preprocessing: After parsing the YAML, restore the HTML tags in place of the placeholders
|
|
data = restore_html_tags(data, html_tags)
|
|
|
|
# Depth Level: Calculate relative path, directory depth and TOC
|
|
rel_path = os.path.relpath(file_path, os.path.join(repo_dir, docs_dir))
|
|
|
|
# Depth Level: Calculate the depth of each section
|
|
depth = rel_path.count(os.sep) # Count separators to determine depth
|
|
file_basename = os.path.basename(file_path)
|
|
if file_basename.startswith("index.") and depth > 0:
|
|
depth += -1 # or another title for the main index
|
|
indent = ' ' * 5 * depth # Adjust indentation based on depth
|
|
|
|
# Numbering: Ensure numbering has enough levels
|
|
while len(numbering) <= depth:
|
|
numbering.append(0)
|
|
|
|
# Numbering: Increment at the current level
|
|
numbering[depth] += 1
|
|
|
|
# Numbering: Reset for any lower levels
|
|
for i in range(depth + 1, len(numbering)):
|
|
numbering[i] = 0
|
|
|
|
# Numbering: Create entry
|
|
toc_numbering = f"{'.'.join(map(str, numbering[:depth + 1]))}"
|
|
|
|
# TOC: Generate the section title
|
|
toc_title = data.get('title', os.path.splitext(os.path.basename(file_path))[0].title())
|
|
toc_full_title = f"{toc_numbering} - {toc_title}"
|
|
toc += f"{indent}<a href='#{toc_full_title}'>{toc_full_title}</a><br/>"
|
|
|
|
# Page Content: Format the parsed YAML to HTML
|
|
html_page_content = f"""
|
|
<h1>{toc_full_title}</h1>
|
|
<div class="doc-path"><p>Documentation path: {file_path.replace(chr(92),'/').replace('.mdx', '').replace(repo_dir + '/' + docs_dir,'')}</p></div>
|
|
<p><strong>Description:</strong> {data.get('description', 'No description')}</p>
|
|
"""
|
|
if data.get('related', {}):
|
|
html_page_content += f"""
|
|
<div style="margin-left:20px;">
|
|
<p><strong>Related:</strong></p>
|
|
<p><strong>Title:</strong> {data.get('related', {}).get('title', 'Related')}</p>
|
|
<p><strong>Related Description:</strong> {data.get('related', {}).get('description', 'No related description')}</p>
|
|
<p><strong>Links:</strong></p>
|
|
<ul>
|
|
{''.join([f'<li>{link}</li>' for link in data.get('related', {}).get('links', [])])}
|
|
</ul>
|
|
</div>
|
|
"""
|
|
html_page_content += '</br>'
|
|
|
|
else:
|
|
html_page_content = ""
|
|
else:
|
|
html_page_content = ""
|
|
|
|
# Convert Markdown to HTML with table support and add content to the identified header
|
|
html_page_content += markdown.markdown(md_content, extensions=['fenced_code', 'codehilite', 'tables', 'footnotes', 'toc', 'abbr', 'attr_list', 'def_list', 'smarty', 'admonition'])
|
|
|
|
# Add page content to all cumulated pages content
|
|
html_all_pages_content += html_page_content
|
|
|
|
# Add a page break unless it is the last file
|
|
if index < len(files) - 1:
|
|
html_all_pages_content += '<div class="page-break"></div>'
|
|
|
|
# Prepend the ToC to the beginning of the HTML content
|
|
toc_html = f"""<div style="padding-bottom: 10px"><div style="padding-bottom: 20px"><h1>Table of Contents</h1></div>{toc}</div><div style="page-break-before: always;">"""
|
|
html_all_content = toc_html + html_all_pages_content
|
|
|
|
# Finalize html formatting
|
|
html_all_pages_content = html_header + html_all_pages_content + "</body></html>"
|
|
toc_html = html_header + toc_html + "</body></html>"
|
|
html_all_content = html_header + html_all_content + "</body></html>"
|
|
|
|
return(html_all_content, toc_html, html_all_pages_content)
|
|
|
|
|
|
def find_latest_version(html_content):
|
|
# Regular expression to find versions like v14.2.0
|
|
version_pattern = re.compile(r"v(\d+\.\d+\.\d+)")
|
|
versions = version_pattern.findall(html_content)
|
|
# Remove duplicates and sort versions
|
|
unique_versions = sorted(set(versions), key=lambda v: version.parse(v), reverse=True)
|
|
return unique_versions[0] if unique_versions else None
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
# Define the output PDF file name
|
|
# project_title = "Next.js v14 Documentation"
|
|
# output_pdf = "Next.js_v14_Documentation.pdf"
|
|
export_html = False
|
|
|
|
# Clone the repository and checkout the canary branch
|
|
repo_dir = "nextjs-docs"
|
|
repo_url = "https://github.com/vercel/next.js.git"
|
|
branch = "canary"
|
|
docs_dir = "docs"
|
|
|
|
# Define a base path and quality for the image URLs
|
|
Change_img_url = True
|
|
base_path = "https://nextjs.org/_next/image?url="
|
|
path_args = "&w=1920&q=75"
|
|
|
|
# Clone the repository
|
|
clone_repo(repo_url, branch, docs_dir, repo_dir)
|
|
|
|
# Traverse the docs directory and convert each markdown file to HTML
|
|
print ("Converting the Documentation to HTML...")
|
|
docs_dir_full_path = os.path.join(repo_dir, docs_dir)
|
|
files_to_process = get_files_sorted(docs_dir_full_path)
|
|
html_all_content, _, _ = process_files(files_to_process, repo_dir, docs_dir)
|
|
print("Converted all MDX to HTML.")
|
|
|
|
# Save the HTML content to a file for inspection
|
|
if export_html:
|
|
with open('output.html', 'w', encoding='utf8') as f:
|
|
f.write(html_all_content)
|
|
print("HTML Content exported.")
|
|
|
|
# Find the latest version in the HTML content
|
|
latest_version = find_latest_version(html_all_content)
|
|
if latest_version:
|
|
project_title = f"""Next.js Documentation v{latest_version}"""
|
|
output_pdf = f"""Next.js_Docs_v{latest_version}_{datetime.now().strftime("%Y-%m-%d")}.pdf"""
|
|
else:
|
|
project_title = "Next.js Documentation"
|
|
output_pdf = "Next.js_Documentation.pdf"
|
|
|
|
# Define the cover HTML with local CSS file
|
|
cover_html = f"""
|
|
<html>
|
|
<head>
|
|
<style>
|
|
{open('styles.css').read()}
|
|
</style>
|
|
</head>
|
|
<body>
|
|
<div class="master-container">
|
|
<div class="container">
|
|
<div class="title">{project_title}</div>
|
|
<div class="date">Date: {datetime.now().strftime("%Y-%m-%d")}</div>
|
|
</div>
|
|
</div>
|
|
</body>
|
|
</html>
|
|
"""
|
|
|
|
# Write the cover HTML to a temporary file
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.html') as cover_file:
|
|
cover_file.write(cover_html.encode('utf-8'))
|
|
print("HTML Cover exported.")
|
|
|
|
# Convert the combined HTML content to PDF with a cover and a table of contents
|
|
if is_file_open(output_pdf):
|
|
print("The output file is already open in another process. Please close it and try again.")
|
|
else:
|
|
options = {
|
|
'encoding': 'UTF-8',
|
|
'page-size': 'A4',
|
|
'quiet': '',
|
|
'image-dpi': 150, # General reco.: printer - hq, 300 dpi| ebook - low quality, 150 dpi| screen-view-only quality, 72 dpi
|
|
'image-quality': 75,
|
|
# 'no-outline': None,
|
|
# 'no-images': None,
|
|
}
|
|
pdfkit.from_string(html_all_content, output_pdf, options=options, cover=cover_file.name, toc={})
|
|
print("Created the PDF file successfully.")
|
|
|
|
# Delete the temporary file
|
|
os.unlink(cover_file.name) |