Initial commit

2025-12-20 02:31:04 -05:00 · 2024-05-03 01:18:58 +01:00
commit e0395ed877
6 changed files with 1400 additions and 0 deletions
--- a/export-docs.py
+++ b/export-docs.py
@@ -0,0 +1,392 @@
+import os
+import markdown
+import pdfkit
+import tempfile
+import yaml
+import re
+import html
+from git import Repo, RemoteProgress
+from datetime import datetime
+from packaging import version
+from tqdm import tqdm
+
+
+def process_image_paths(md_content):
+    """Rewrite themed image attributes into a single absolute ``src``.
+
+    Every ``srcLight="..."`` or ``srcDark="..."`` attribute found in
+    ``md_content`` is replaced by ``src="..."`` whose value is the captured
+    path placed between the module-level ``base_path`` and ``path_args``
+    settings. Both settings must exist as globals when this is called.
+
+    Returns the rewritten text; text without such attributes is unchanged.
+    """
+    themed_src = re.compile(r'src(?:Light|Dark)="(.*?)"')
+
+    def to_absolute(found):
+        # Prefix and suffix come from module-level configuration.
+        return f'src="{base_path}{found.group(1)}{path_args}"'
+
+    return themed_src.sub(to_absolute, md_content)
+
+
+def preprocess_code_blocks(md_content):
+    """Turn extended fenced code blocks into a header plus a plain fence.
+
+    A block is "extended" when its opening fence carries a ``filename="..."``
+    attribute, optionally preceded by a language and followed by the word
+    ``switcher``. Each such block is rewritten as an HTML header line showing
+    the filename (and the language, when given) followed by a standard fenced
+    block that keeps only the language. The ``switcher`` marker is dropped.
+
+    Returns the rewritten text; other content is left untouched.
+    """
+    extended_fence = re.compile(
+        r'```(\w+)?\s+filename="([^"]+)"\s*(switcher)?\n(.*?)```',
+        re.DOTALL,
+    )
+
+    def rebuild(found):
+        language = found.group(1) or ''
+        filename = found.group(2)
+        body = found.group(4)
+
+        # The header mentions the language only when one was declared.
+        label = f'{filename} ({language})' if language else filename
+        header = f'<div class="code-header"><i>{label}</i></div>'
+
+        return header + '\n```' + language + '\n' + body + '\n```'
+
+    return extended_fence.sub(rebuild, md_content)
+
+
+def safe_load_frontmatter(frontmatter_content):
+    """Parse YAML frontmatter with ``yaml.safe_load``.
+
+    Returns the parsed value, or ``None`` when the text is not valid YAML
+    (a ``yaml.YAMLError`` is swallowed rather than propagated).
+    """
+    try:
+        parsed = yaml.safe_load(frontmatter_content)
+    except yaml.YAMLError:
+        parsed = None
+    return parsed
+
+
+def preprocess_mdx_content(md_content):
+    """Escape bare, attribute-less tags such as ``<div>`` or ``</Tab>``.
+
+    Only tags consisting of word characters (optionally preceded by ``/``)
+    are matched; each one is replaced by its ``html.escape`` form. The whole
+    input string is scanned. Tags that carry attributes are left as they are.
+    """
+    def escape_tag(found):
+        return html.escape(found.group(0))
+
+    return re.sub(r'<(/?\w+)>', escape_tag, md_content)
+
+
+def parse_frontmatter(md_content):
+    """Split a document into its frontmatter text and its body.
+
+    Frontmatter is recognised when the first line is ``---`` and a later
+    line is ``---`` as well; surrounding whitespace on either delimiter line
+    (including a trailing carriage return) is ignored.
+
+    Args:
+        md_content: Full text of a Markdown/MDX document.
+
+    Returns:
+        ``(frontmatter, content)`` where ``frontmatter`` is the text between
+        the two delimiter lines and ``content`` is everything after the
+        closing one. When the document does not start with a delimiter, or
+        the closing delimiter is missing, returns ``(None, md_content)``.
+    """
+    lines = md_content.split('\n')
+    if lines[0].strip() != '---':
+        return None, md_content
+
+    # Compare stripped lines so the closing delimiter is detected the same
+    # way as the opening one (trailing spaces or CRLF line endings).
+    closing = next(
+        (i for i, line in enumerate(lines[1:], start=1) if line.strip() == '---'),
+        None,
+    )
+    if closing is None:
+        # Unterminated frontmatter: treat everything as body instead of
+        # letting list.index() raise ValueError.
+        return None, md_content
+
+    frontmatter = '\n'.join(lines[1:closing])
+    content = '\n'.join(lines[closing + 1:])
+    return frontmatter, content
+
+
+class CloneProgress(RemoteProgress):
+    """Remote-progress handler that mirrors reported counts onto a tqdm bar."""
+
+    def __init__(self):
+        super().__init__()
+        # The bar starts without a total; update() fills it in once known.
+        self.pbar = tqdm()
+
+    def update(self, op_code, cur_count, max_count=None, message=''):
+        """Move the bar to ``cur_count`` and adopt ``max_count`` as its total."""
+        bar = self.pbar
+        if max_count is not None:
+            bar.total = max_count
+        # tqdm.update() expects an increment, so pass the distance between
+        # the reported absolute count and the bar's current position.
+        bar.update(cur_count - bar.n)
+
+    def finalize(self):
+        """Close the underlying progress bar."""
+        self.pbar.close()
+
+# Clone a specific directory of a repository / branch
+def clone_repo(repo_url, branch, docs_dir, repo_dir):
+    """Create or refresh a sparse checkout of ``docs_dir`` in ``repo_dir``.
+
+    When ``repo_dir`` already exists it is opened as a repository, fetched,
+    switched to ``branch`` and pulled. Otherwise the directory is created,
+    initialised as a repository with sparse checkout restricted to
+    ``/<docs_dir>``, an ``origin`` remote pointing at ``repo_url`` is added
+    and fetched, and ``branch`` is checked out.
+    """
+    # Existing checkout: just bring it up to date.
+    if os.path.isdir(repo_dir):
+        print("Repository already exists. Updating...")
+        repo = Repo(repo_dir)
+        remote = repo.remotes.origin
+        remote.fetch(progress=CloneProgress())
+        repo.git.checkout(branch)
+        remote.pull(progress=CloneProgress())
+        print("Repository updated.")
+        return
+
+    # Fresh checkout: initialise and configure the repository first.
+    os.makedirs(repo_dir, exist_ok=True)
+    print("Cloning repository...")
+    repo = Repo.init(repo_dir)
+    with repo.config_writer() as git_config:
+        git_config.set_value("core", "sparseCheckout", "true")
+
+    # Restrict the working tree to the documentation directory.
+    sparse_path = os.path.join(repo_dir, ".git/info/sparse-checkout")
+    with open(sparse_path, "w") as sparse_checkout_file:
+        sparse_checkout_file.write(f"/{docs_dir}\n")
+
+    # Fetch from the remote and materialise the requested branch.
+    remote = repo.create_remote("origin", repo_url)
+    remote.fetch(progress=CloneProgress())
+    repo.git.checkout(branch)
+    print("Repository cloned.")
+
+
+def is_file_open(file_path):
+    """Report whether an existing file refuses to be opened for appending.
+
+    Returns ``False`` when the path does not exist or when it can be opened
+    in append mode, and ``True`` when that attempt raises ``PermissionError``
+    (taken as a sign that another program is holding the file). The probe
+    never creates the file and writes nothing to it.
+    """
+    if os.path.exists(file_path):
+        try:
+            # Opening in append mode leaves existing content untouched.
+            with open(file_path, 'a'):
+                pass
+        except PermissionError:
+            return True
+
+    # Either the file is missing or the append probe succeeded.
+    return False
+
+
+def get_files_sorted(root_dir):
+    """List every file under ``root_dir`` with index pages first per folder.
+
+    The tree is walked with ``os.walk`` and all files are ordered by their
+    path, except that ``index.mdx`` / ``index.md`` are compared as if their
+    name started with ``!!!``.
+
+    Returns the real file paths in that order.
+    """
+    index_names = ['index.mdx', 'index.md']
+    keyed_paths = []
+
+    for folder, _, names in os.walk(root_dir):
+        for name in names:
+            # Only the sort key is altered; the returned path is the real one.
+            sort_name = '!!!' + name if name in index_names else name
+            keyed_paths.append(
+                (os.path.join(folder, sort_name), os.path.join(folder, name))
+            )
+
+    # Order globally by the adjusted path only.
+    keyed_paths.sort(key=lambda pair: pair[0])
+
+    return [real_path for _, real_path in keyed_paths]
+
+
+def preprocess_frontmatter(frontmatter):
+    """Swap every ``<...>`` tag in the frontmatter for a numbered placeholder.
+
+    Placeholders are named ``HTML_TAG_0``, ``HTML_TAG_1``, ... in order of
+    appearance.
+
+    Returns ``(modified_frontmatter, html_tags)`` where ``html_tags`` maps
+    each placeholder to the original tag text it replaced.
+    """
+    placeholders = {}
+
+    def stash(found):
+        # The running size of the mapping gives each tag a unique index.
+        token = f"HTML_TAG_{len(placeholders)}"
+        placeholders[token] = found.group(0)
+        return token
+
+    cleaned = re.sub(r'<[^>]+>', stash, frontmatter)
+    return cleaned, placeholders
+
+
+def restore_html_tags(parsed_data, html_tags):
+    """Put original tags back in place of placeholders and HTML-escape them.
+
+    String values anywhere inside ``parsed_data`` (including nested
+    dictionaries and lists) have their ``HTML_TAG_<n>`` placeholders replaced
+    by the tag text stored in ``html_tags`` and are then passed through
+    ``html.escape``. Non-string leaf values are left as they are.
+
+    Args:
+        parsed_data: Parsed frontmatter; only a ``dict`` is processed.
+        html_tags: Mapping of placeholder -> original tag text.
+
+    Returns:
+        ``parsed_data`` itself (dictionaries and lists are updated in place).
+        Anything that is not a ``dict`` is returned unchanged.
+    """
+    placeholder_re = re.compile(r'HTML_TAG_\d+')
+
+    def lookup(found):
+        # Unknown placeholders are kept verbatim.
+        return html_tags.get(found.group(0), found.group(0))
+
+    def restore(value):
+        if isinstance(value, str):
+            # One regex pass matches whole placeholders, so HTML_TAG_1 can
+            # never clobber the prefix of HTML_TAG_10.
+            return html.escape(placeholder_re.sub(lookup, value))
+        if isinstance(value, dict):
+            for key, item in value.items():
+                value[key] = restore(item)
+        elif isinstance(value, list):
+            value[:] = [restore(item) for item in value]
+        return value
+
+    if isinstance(parsed_data, dict):
+        restore(parsed_data)
+    return parsed_data
+
+
+def process_files(files, repo_dir, docs_dir):
+    """Render the given Markdown/MDX files into combined HTML documents.
+
+    Each file is read as UTF-8, passed through the image-path rewrite (when
+    the module-level ``Change_img_url`` flag is true) and the code-block
+    preprocessing, split into frontmatter and body, and converted to HTML.
+    Files whose frontmatter parses to a mapping additionally get a numbered
+    heading, a documentation path, a description, an optional "Related"
+    section and an entry in the table of contents.
+
+    Args:
+        files: Paths of the files to render, in output order.
+        repo_dir: Directory of the checked-out repository.
+        docs_dir: Documentation directory relative to ``repo_dir``.
+
+    Returns:
+        A tuple ``(html_all_content, toc_html, html_all_pages_content)``:
+        the table of contents followed by all pages, the table of contents
+        alone, and the pages alone. Each is a complete HTML document with
+        the content of ``styles.css`` inlined.
+    """
+    # Read the stylesheet once and close the handle right away.
+    with open('styles.css', encoding='utf8') as css_file:
+        css = css_file.read()
+
+    # Shared document header with the main CSS included directly
+    html_header = f"""
+    <html>
+    <head>
+        <style>
+            {css}
+        </style>
+    </head>
+    <body>
+    """
+
+    md_extensions = ['fenced_code', 'codehilite', 'tables', 'footnotes', 'toc', 'abbr', 'attr_list', 'def_list', 'smarty', 'admonition']
+    docs_root = os.path.join(repo_dir, docs_dir)
+    last_index = len(files) - 1
+
+    # Collect fragments in lists and join once at the end.
+    toc_entries = []
+    pages = []
+    numbering = [0]  # Section counters, one slot per depth level
+
+    for index, file_path in enumerate(files):
+        with open(file_path, 'r', encoding='utf8') as f:
+            md_content = f.read()
+
+        # Process the markdown content for image paths
+        if Change_img_url:
+            md_content = process_image_paths(md_content)
+
+        # Process the markdown content for non standard code blocks
+        md_content = preprocess_code_blocks(md_content)
+
+        # Parse the frontmatter and markdown
+        frontmatter, md_content = parse_frontmatter(md_content)
+
+        data = None
+        if frontmatter:
+            # Replace HTML tags with placeholders so the YAML parser accepts them
+            frontmatter, html_tags = preprocess_frontmatter(frontmatter)
+            data = safe_load_frontmatter(frontmatter)
+
+        html_page_content = ""
+
+        # Only a mapping can provide title/description; anything else
+        # (invalid YAML, a bare scalar, a list) is rendered without a header.
+        if isinstance(data, dict):
+            # Restore the HTML tags in place of the placeholders
+            data = restore_html_tags(data, html_tags)
+
+            # Depth Level: separators in the path relative to the docs root
+            rel_path = os.path.relpath(file_path, docs_root)
+            depth = rel_path.count(os.sep)
+            if os.path.basename(file_path).startswith("index.") and depth > 0:
+                depth -= 1  # An index page is numbered at its folder's level
+            indent = '&nbsp;' * 5 * depth
+
+            # Numbering: make sure a counter exists for this depth, bump it,
+            # and drop deeper counters so they restart from zero.
+            while len(numbering) <= depth:
+                numbering.append(0)
+            numbering[depth] += 1
+            del numbering[depth + 1:]
+            toc_numbering = '.'.join(map(str, numbering))
+
+            # TOC: the heading carries an id so that the entry's link
+            # actually resolves to it.
+            anchor_id = f"section-{toc_numbering}"
+            toc_title = data.get('title', os.path.splitext(os.path.basename(file_path))[0].title())
+            toc_full_title = f"{toc_numbering} - {toc_title}"
+            toc_entries.append(f"{indent}<a href='#{anchor_id}'>{toc_full_title}</a><br/>")
+
+            # Page Content: Format the parsed YAML to HTML
+            doc_path = file_path.replace('\\', '/').replace('.mdx', '').replace(repo_dir + '/' + docs_dir, '')
+            html_page_content = f"""
+                    <h1 id="{anchor_id}">{toc_full_title}</h1>
+                    <div class="doc-path"><p>Documentation path: {doc_path}</p></div>
+                    <p><strong>Description:</strong> {data.get('description', 'No description')}</p>
+                    """
+
+            related = data.get('related')
+            if isinstance(related, dict) and related:
+                link_items = ''.join(f'<li>{link}</li>' for link in related.get('links') or [])
+                html_page_content += f"""
+                        <div style="margin-left:20px;">
+                            <p><strong>Related:</strong></p>
+                            <p><strong>Title:</strong> {related.get('title', 'Related')}</p>
+                            <p><strong>Related Description:</strong> {related.get('description', 'No related description')}</p>
+                            <p><strong>Links:</strong></p>
+                        <ul>
+                            {link_items}
+                        </ul>
+                        </div>
+                        """
+            html_page_content += '<br/>'
+
+        # Convert Markdown to HTML with table support and add it after the header
+        html_page_content += markdown.markdown(md_content, extensions=md_extensions)
+        pages.append(html_page_content)
+
+        # Add a page break unless it is the last file
+        if index < last_index:
+            pages.append('<div class="page-break"></div>')
+
+    toc = ''.join(toc_entries)
+    pages_html = ''.join(pages)
+    toc_block = f"""<div style="padding-bottom: 10px"><div style="padding-bottom: 20px"><h1>Table of Contents</h1></div>{toc}</div>"""
+    html_footer = "</body></html>"
+
+    # Finalize html formatting. In the combined document the pages start on
+    # a new page after the ToC; the wrapper div is closed explicitly.
+    html_all_pages_content = html_header + pages_html + html_footer
+    toc_html = html_header + toc_block + html_footer
+    html_all_content = (
+        html_header
+        + toc_block
+        + '<div style="page-break-before: always;">'
+        + pages_html
+        + '</div>'
+        + html_footer
+    )
+
+    return (html_all_content, toc_html, html_all_pages_content)
+
+
+def find_latest_version(html_content):
+    """Return the highest ``vX.Y.Z`` version mentioned in ``html_content``.
+
+    Args:
+        html_content: Text to scan for versions such as ``v14.2.0``.
+
+    Returns:
+        The highest version as a string without the leading ``v`` (for
+        example ``"14.2.0"``), or ``None`` when no version is found.
+    """
+    # Regular expression to find versions like v14.2.0
+    versions = set(re.findall(r"v(\d+\.\d+\.\d+)", html_content))
+    if not versions:
+        return None
+
+    # Every match is exactly three dot-separated integers, so comparing the
+    # integer triples orders them correctly (14.10.0 > 14.9.0) and a single
+    # max() pass replaces sorting the whole set.
+    return max(versions, key=lambda v: tuple(int(part) for part in v.split('.')))
+
+
+if __name__ == "__main__":
+
+    # Set to True to also write the combined HTML to output.html for inspection
+    export_html = False
+
+    # Repository, branch and directory that hold the documentation
+    repo_dir = "nextjs-docs"
+    repo_url = "https://github.com/vercel/next.js.git"
+    branch = "canary"
+    docs_dir = "docs"
+
+    # Base path and quality arguments for the image URLs. These are read as
+    # module globals by process_files() and process_image_paths().
+    Change_img_url = True
+    base_path = "https://nextjs.org/_next/image?url="
+    path_args = "&w=1920&q=75"
+
+    # Clone the repository
+    clone_repo(repo_url, branch, docs_dir, repo_dir)
+
+    # Traverse the docs directory and convert each markdown file to HTML
+    print ("Converting the Documentation to HTML...")
+    docs_dir_full_path = os.path.join(repo_dir, docs_dir)
+    files_to_process = get_files_sorted(docs_dir_full_path)
+    html_all_content, _, _ = process_files(files_to_process, repo_dir, docs_dir)
+    print("Converted all MDX to HTML.")
+
+    # Save the HTML content to a file for inspection
+    if export_html:
+        with open('output.html', 'w', encoding='utf8') as f:
+            f.write(html_all_content)
+        print("HTML Content exported.")
+
+    # Take the date once so the file name and the cover always agree
+    today = datetime.now().strftime("%Y-%m-%d")
+
+    # Find the latest version in the HTML content
+    latest_version = find_latest_version(html_all_content)
+    if latest_version:
+        project_title = f"Next.js Documentation v{latest_version}"
+        output_pdf = f"Next.js_Docs_v{latest_version}_{today}.pdf"
+    else:
+        project_title = "Next.js Documentation"
+        output_pdf = "Next.js_Documentation.pdf"
+
+    # Define the cover HTML with local CSS file
+    with open('styles.css', encoding='utf8') as css_file:
+        cover_css = css_file.read()
+    cover_html = f"""
+    <html>
+        <head>
+            <style>
+                {cover_css}
+            </style>
+        </head>
+        <body>
+            <div class="master-container">
+                <div class="container">
+                    <div class="title">{project_title}</div>
+                    <div class="date">Date: {today}</div>
+                </div>
+            </div>
+        </body>
+    </html>
+    """
+
+    # Write the cover HTML to a temporary file. delete=False keeps the file
+    # on disk after the handle is closed so it can be passed on by name.
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.html') as cover_file:
+        cover_file.write(cover_html.encode('utf-8'))
+    print("HTML Cover exported.")
+
+    try:
+        # Convert the combined HTML content to PDF with a cover and a table of contents
+        if is_file_open(output_pdf):
+            print("The output file is already open in another process. Please close it and try again.")
+        else:
+            options = {
+                'encoding': 'UTF-8',
+                'page-size': 'A4',
+                'quiet': '',
+                'image-dpi': 150, # General reco.: printer - hq, 300 dpi| ebook - low quality, 150 dpi| screen-view-only quality, 72 dpi
+                'image-quality': 75,
+                # 'no-outline': None,
+                # 'no-images': None,
+            }
+            pdfkit.from_string(html_all_content, output_pdf, options=options, cover=cover_file.name, toc={})
+            print("Created the PDF file successfully.")
+    finally:
+        # Delete the temporary file even when the PDF conversion fails
+        os.unlink(cover_file.name)