import os import markdown import pdfkit import tempfile import yaml import re import html from git import Repo, RemoteProgress from datetime import datetime from packaging import version from tqdm import tqdm def process_image_paths(md_content): # Define a regular expression pattern to find image tags pattern = r'src(?:Light|Dark)="(.*?)"' # Function to replace the relative path with an absolute path def replace(match): relative_path = match.group(1) absolute_path = f'{base_path}{relative_path}{path_args}' # Print the original and new image URLs for debugging return f'src="{absolute_path}"' # Use the sub method to replace all occurrences return re.sub(pattern, replace, md_content) def preprocess_code_blocks(md_content): # Regular expression to match extended code blocks with filename and language pattern = r'```(\w+)?\s+filename="([^"]+)"\s*(switcher)?\n(.*?)```' def replace(match): language = match.group(1) if match.group(1) else '' filename = match.group(2) code_block = match.group(4) # Format the header with filename and language header = f'
{filename} ({language})
' if language else f'
{filename}
' return f'{header}\n```{language}\n{code_block}\n```' # Replace all occurrences in the content return re.sub(pattern, replace, md_content, flags=re.DOTALL) def safe_load_frontmatter(frontmatter_content): try: return yaml.safe_load(frontmatter_content) except yaml.YAMLError: return None def preprocess_mdx_content(md_content): # Replace HTML tags in frontmatter md_content = re.sub(r'<(/?\w+)>', lambda m: html.escape(m.group(0)), md_content) return md_content def parse_frontmatter(md_content): lines = md_content.split('\n') if lines[0].strip() == '---': end_of_frontmatter = lines.index('---', 1) frontmatter = '\n'.join(lines[1:end_of_frontmatter]) content = '\n'.join(lines[end_of_frontmatter + 1:]) return frontmatter, content return None, md_content class CloneProgress(RemoteProgress): def __init__(self): super().__init__() self.pbar = tqdm() def update(self, op_code, cur_count, max_count=None, message=''): if max_count is not None: self.pbar.total = max_count self.pbar.update(cur_count - self.pbar.n) # increment the pbar with the increment def finalize(self): self.pbar.close() # Clone a specific directory of a repository / branch def clone_repo(repo_url, branch, docs_dir, repo_dir): # Initialize and configure the repository for sparse checkout if not os.path.isdir(repo_dir): os.makedirs(repo_dir, exist_ok=True) print("Cloning repository...") repo = Repo.init(repo_dir) with repo.config_writer() as git_config: git_config.set_value("core", "sparseCheckout", "true") # Define the sparse checkout settings with open(os.path.join(repo_dir, ".git/info/sparse-checkout"), "w") as sparse_checkout_file: sparse_checkout_file.write(f"/{docs_dir}\n") # Pull the specific directory from the repository origin = repo.create_remote("origin", repo_url) origin.fetch(progress=CloneProgress()) repo.git.checkout(branch) print("Repository cloned.") # Update the repository if it already exists else: print("Repository already exists. Updating...") repo = Repo(repo_dir) origin = repo.remotes.origin origin.fetch(progress=CloneProgress()) repo.git.checkout(branch) origin.pull(progress=CloneProgress()) print("Repository updated.") def is_file_open(file_path): if not os.path.exists(file_path): return False # File does not exist, so it's not open try: # Try to open the file in append mode. If the file is open in another program, this might fail with open(file_path, 'a'): pass return False except PermissionError: # If a PermissionError is raised, it's likely the file is open elsewhere return True def get_files_sorted(root_dir): all_files = [] # Step 1: Traverse the directory structure for root, _, files in os.walk(root_dir): for file in files: full_path = os.path.join(root, file) # Step 2: Prioritize 'index.mdx' or 'index.md' within the same folder modified_basename = '!!!' + file if file in ['index.mdx', 'index.md'] else file sort_key = os.path.join(root, modified_basename) # Add tuple to the list all_files.append((full_path, sort_key)) # Step 3: Perform a global sort based on modified basename all_files.sort(key=lambda x: x[1]) # Step 4: Return the full paths in sorted order return [full_path for full_path, _ in all_files] def preprocess_frontmatter(frontmatter): # Dictionary to store HTML tags and their placeholders html_tags = {} # Function to replace HTML tags with placeholders def replace_tag(match): tag = match.group(0) placeholder = f"HTML_TAG_{len(html_tags)}" html_tags[placeholder] = tag return placeholder # Replace HTML tags with placeholders modified_frontmatter = re.sub(r'<[^>]+>', replace_tag, frontmatter) return modified_frontmatter, html_tags def restore_html_tags(parsed_data, html_tags): if isinstance(parsed_data, dict): for key, value in parsed_data.items(): if isinstance(value, str): for placeholder, tag in html_tags.items(): value = value.replace(placeholder, tag) # if key == 'title': # Escape HTML characters for titles value = html.escape(value) parsed_data[key] = value return parsed_data def process_files(files, repo_dir, docs_dir): # Initialize the Table of Contents toc = "" html_all_pages_content = "" # Initialize an empty string to hold all the HTML content & Include the main CSS directly in the HTML html_header = f""" """ numbering = [0] # Starting with the first level for index, file_path in enumerate(files): with open(file_path, 'r', encoding='utf8') as f: md_content = f.read() # Process the markdown content for image paths if Change_img_url: md_content = process_image_paths(md_content) # Process the markdown content for non standard code blocks md_content = preprocess_code_blocks(md_content) # Parse the frontmatter and markdown frontmatter, md_content = parse_frontmatter(md_content) if frontmatter: # Preprocessing: replaces HTML tags with unique placeholders and stores the mappings frontmatter, html_tags = preprocess_frontmatter(frontmatter) # Parse the YAML frontmatter data = safe_load_frontmatter(frontmatter) if data is not None: # Preprocessing: After parsing the YAML, restore the HTML tags in place of the placeholders data = restore_html_tags(data, html_tags) # Depth Level: Calculate relative path, directory depth and TOC rel_path = os.path.relpath(file_path, os.path.join(repo_dir, docs_dir)) # Depth Level: Calculate the depth of each section depth = rel_path.count(os.sep) # Count separators to determine depth file_basename = os.path.basename(file_path) if file_basename.startswith("index.") and depth > 0: depth += -1 # or another title for the main index indent = ' ' * 5 * depth # Adjust indentation based on depth # Numbering: Ensure numbering has enough levels while len(numbering) <= depth: numbering.append(0) # Numbering: Increment at the current level numbering[depth] += 1 # Numbering: Reset for any lower levels for i in range(depth + 1, len(numbering)): numbering[i] = 0 # Numbering: Create entry toc_numbering = f"{'.'.join(map(str, numbering[:depth + 1]))}" # TOC: Generate the section title toc_title = data.get('title', os.path.splitext(os.path.basename(file_path))[0].title()) toc_full_title = f"{toc_numbering} - {toc_title}" toc += f"{indent}{toc_full_title}
" # Page Content: Format the parsed YAML to HTML html_page_content = f"""

{toc_full_title}

Documentation path: {file_path.replace(chr(92),'/').replace('.mdx', '').replace(repo_dir + '/' + docs_dir,'')}

Description: {data.get('description', 'No description')}

""" if data.get('related', {}): html_page_content += f"""

Related:

Title: {data.get('related', {}).get('title', 'Related')}

Related Description: {data.get('related', {}).get('description', 'No related description')}

Links:

""" html_page_content += '
' else: html_page_content = "" else: html_page_content = "" # Convert Markdown to HTML with table support and add content to the identified header html_page_content += markdown.markdown(md_content, extensions=['fenced_code', 'codehilite', 'tables', 'footnotes', 'toc', 'abbr', 'attr_list', 'def_list', 'smarty', 'admonition']) # Add page content to all cumulated pages content html_all_pages_content += html_page_content # Add a page break unless it is the last file if index < len(files) - 1: html_all_pages_content += '
' # Prepend the ToC to the beginning of the HTML content toc_html = f"""

Table of Contents

{toc}
""" html_all_content = toc_html + html_all_pages_content # Finalize html formatting html_all_pages_content = html_header + html_all_pages_content + "" toc_html = html_header + toc_html + "" html_all_content = html_header + html_all_content + "" return(html_all_content, toc_html, html_all_pages_content) def find_latest_version(html_content): # Regular expression to find versions like v14.2.0 version_pattern = re.compile(r"v(\d+\.\d+\.\d+)") versions = version_pattern.findall(html_content) # Remove duplicates and sort versions unique_versions = sorted(set(versions), key=lambda v: version.parse(v), reverse=True) return unique_versions[0] if unique_versions else None if __name__ == "__main__": # Define the output PDF file name # project_title = "Next.js v14 Documentation" # output_pdf = "Next.js_v14_Documentation.pdf" export_html = False # Clone the repository and checkout the canary branch repo_dir = "nextjs-docs" repo_url = "https://github.com/vercel/next.js.git" branch = "canary" docs_dir = "docs" # Define a base path and quality for the image URLs Change_img_url = True base_path = "https://nextjs.org/_next/image?url=" path_args = "&w=1920&q=75" # Clone the repository clone_repo(repo_url, branch, docs_dir, repo_dir) # Traverse the docs directory and convert each markdown file to HTML print ("Converting the Documentation to HTML...") docs_dir_full_path = os.path.join(repo_dir, docs_dir) files_to_process = get_files_sorted(docs_dir_full_path) html_all_content, _, _ = process_files(files_to_process, repo_dir, docs_dir) print("Converted all MDX to HTML.") # Save the HTML content to a file for inspection if export_html: with open('output.html', 'w', encoding='utf8') as f: f.write(html_all_content) print("HTML Content exported.") # Find the latest version in the HTML content latest_version = find_latest_version(html_all_content) if latest_version: project_title = f"""Next.js Documentation v{latest_version}""" output_pdf = f"""Next.js_Docs_v{latest_version}_{datetime.now().strftime("%Y-%m-%d")}.pdf""" else: project_title = "Next.js Documentation" output_pdf = "Next.js_Documentation.pdf" # Define the cover HTML with local CSS file cover_html = f"""
{project_title}
Date: {datetime.now().strftime("%Y-%m-%d")}
""" # Write the cover HTML to a temporary file with tempfile.NamedTemporaryFile(delete=False, suffix='.html') as cover_file: cover_file.write(cover_html.encode('utf-8')) print("HTML Cover exported.") # Convert the combined HTML content to PDF with a cover and a table of contents if is_file_open(output_pdf): print("The output file is already open in another process. Please close it and try again.") else: options = { 'encoding': 'UTF-8', 'page-size': 'A4', 'quiet': '', 'image-dpi': 150, # General reco.: printer - hq, 300 dpi| ebook - low quality, 150 dpi| screen-view-only quality, 72 dpi 'image-quality': 75, # 'no-outline': None, # 'no-images': None, } pdfkit.from_string(html_all_content, output_pdf, options=options, cover=cover_file.name, toc={}) print("Created the PDF file successfully.") # Delete the temporary file os.unlink(cover_file.name)