From c40fae4cfb1d51d9703b7abdb1703a2196c7d747 Mon Sep 17 00:00:00 2001 From: Saoud Rizwan <7799382+saoudrizwan@users.noreply.github.com> Date: Sat, 27 Jul 2024 07:54:44 -0400 Subject: [PATCH] Optimize language parser loading by only loading once for all files --- src/AnalyzeProject/index.ts | 109 ++------------------ src/AnalyzeProject/languageParser.ts | 145 +++++++++++++++++++++++++++ 2 files changed, 156 insertions(+), 98 deletions(-) create mode 100644 src/AnalyzeProject/languageParser.ts diff --git a/src/AnalyzeProject/index.ts b/src/AnalyzeProject/index.ts index 88cd80f..0d143dc 100644 --- a/src/AnalyzeProject/index.ts +++ b/src/AnalyzeProject/index.ts @@ -1,22 +1,7 @@ -import * as path from "path" -import { globby } from "globby" import * as fs from "fs/promises" -import Parser from "web-tree-sitter" - -import { - javascriptQuery, - typescriptQuery, - pythonQuery, - rustQuery, - goQuery, - cppQuery, - cQuery, - csharpQuery, - rubyQuery, - javaQuery, - phpQuery, - swiftQuery, -} from "./tree-sitter-queries/tags" +import { globby } from "globby" +import * as path from "path" +import { LanguageParser, loadAllLanguages } from "./languageParser" async function analyzeProject(dirPath: string): Promise { let result = "" @@ -27,11 +12,14 @@ async function analyzeProject(dirPath: string): Promise { // Separate files to parse and remaining files const { filesToParse, remainingFiles } = separateFiles(allFiles) + // Load only the necessary language parsers + const languageParsers = await loadAllLanguages(filesToParse) + // Parse specific files and generate result result += "Files parsed with ASTs:\n" for (const file of filesToParse) { result += `File: ${file}\n` - const ast = await parseFile(file) + const ast = await parseFile(file, languageParsers) result += `AST: ${JSON.stringify(ast, null, 2)}\n\n` } @@ -121,84 +109,13 @@ This approach allows us to focus on the most relevant parts of the code (defined - https://github.com/tree-sitter/tree-sitter/blob/master/lib/binding_web/test/helper.js - https://tree-sitter.github.io/tree-sitter/code-navigation-systems */ -async function parseFile(filePath: string): Promise { +async function parseFile(filePath: string, languageParsers: LanguageParser): Promise { const fileContent = await fs.readFile(filePath, "utf8") const ext = path.extname(filePath).toLowerCase().slice(1) - await Parser.init() - const parser = new Parser() - let query: Parser.Query - switch (ext) { - case "js": - case "jsx": - const JavaScript = await loadLanguage("javascript") - parser.setLanguage(JavaScript) - query = JavaScript.query(javascriptQuery) - break - case "ts": - const TypeScript = await loadLanguage("typescript") - parser.setLanguage(TypeScript) - query = TypeScript.query(typescriptQuery) - break - case "tsx": - const Tsx = await loadLanguage("tsx") - parser.setLanguage(Tsx) - query = Tsx.query(typescriptQuery) - break - case "py": - const Python = await loadLanguage("python") - parser.setLanguage(Python) - query = Python.query(pythonQuery) - break - case "rs": - const Rust = await loadLanguage("rust") - parser.setLanguage(Rust) - query = Rust.query(rustQuery) - break - case "go": - const Go = await loadLanguage("go") - parser.setLanguage(Go) - query = Go.query(goQuery) - break - case "cpp": - case "hpp": - const Cpp = await loadLanguage("cpp") - parser.setLanguage(Cpp) - query = Cpp.query(cppQuery) - break - case "c": - case "h": - const C = await loadLanguage("c") - parser.setLanguage(C) - query = C.query(cQuery) - break - case "cs": - const CSharp = await loadLanguage("c_sharp") - parser.setLanguage(CSharp) - query = CSharp.query(csharpQuery) - break - case "rb": - const Ruby = await loadLanguage("ruby") - parser.setLanguage(Ruby) - query = Ruby.query(rubyQuery) - break - case "java": - const Java = await loadLanguage("java") - parser.setLanguage(Java) - query = Java.query(javaQuery) - break - case "php": - const PHP = await loadLanguage("php") - parser.setLanguage(PHP) - query = PHP.query(phpQuery) - break - case "swift": - const Swift = await loadLanguage("swift") - parser.setLanguage(Swift) - query = Swift.query(swiftQuery) - break - default: - return `Unsupported file type: ${filePath}` + const { parser, query } = languageParsers[ext] || {} + if (!parser || !query) { + return `Unsupported file type: ${filePath}` } let formattedOutput = `${filePath}:\n|----\n` @@ -247,8 +164,4 @@ async function parseFile(filePath: string): Promise { return formattedOutput } -async function loadLanguage(langName: string) { - return await Parser.Language.load(path.join(__dirname, `tree-sitter-${langName}.wasm`)) -} - export { analyzeProject } diff --git a/src/AnalyzeProject/languageParser.ts b/src/AnalyzeProject/languageParser.ts new file mode 100644 index 0000000..d19bdb6 --- /dev/null +++ b/src/AnalyzeProject/languageParser.ts @@ -0,0 +1,145 @@ +import * as path from "path" +import Parser from "web-tree-sitter" +import { + javascriptQuery, + typescriptQuery, + pythonQuery, + rustQuery, + goQuery, + cppQuery, + cQuery, + csharpQuery, + rubyQuery, + javaQuery, + phpQuery, + swiftQuery, +} from "./tree-sitter-queries/tags" + +export interface LanguageParser { + [key: string]: { + parser: Parser + query: Parser.Query + } +} + +async function loadLanguage(langName: string) { + return await Parser.Language.load(path.join(__dirname, `tree-sitter-${langName}.wasm`)) +} + +/* +Using node bindings for tree-sitter is problematic in vscode extensions +because of incompatibility with electron. Going the .wasm route has the +advantage of not having to build for multiple architectures. + +We use web-tree-sitter and tree-sitter-wasms which provides auto-updating prebuilt WASM binaries for tree-sitter's language parsers. + +This function loads WASM modules for relevant language parsers based on input files: +1. Extracts unique file extensions +2. Maps extensions to language names +3. Loads corresponding WASM files (containing grammar rules) +4. Uses WASM modules to initialize tree-sitter parsers + +This approach optimizes performance by loading only necessary parsers once for all relevant files. + +Sources: +- https://github.com/tree-sitter/node-tree-sitter/issues/169 +- https://github.com/tree-sitter/node-tree-sitter/issues/168 +- https://github.com/Gregoor/tree-sitter-wasms/blob/main/README.md +*/ +export async function loadAllLanguages(filesToParse: string[]): Promise { + await Parser.init() + + const extensionsToLoad = new Set(filesToParse.map((file) => path.extname(file).toLowerCase().slice(1))) + + const languageMap: { [key: string]: string } = { + js: "javascript", + jsx: "javascript", + ts: "typescript", + tsx: "tsx", + py: "python", + rs: "rust", + go: "go", + cpp: "cpp", + hpp: "cpp", + c: "c", + h: "c", + cs: "c_sharp", + rb: "ruby", + java: "java", + php: "php", + swift: "swift", + } + + const languages: { [key: string]: Parser.Language } = {} + + for (const ext of extensionsToLoad) { + if (ext in languageMap) { + const langName = languageMap[ext as keyof typeof languageMap] + if (!languages[langName]) { + languages[langName] = await loadLanguage(langName) + } + } + } + + const parsers: LanguageParser = {} + + for (const ext of extensionsToLoad) { + if (ext in languageMap) { + const langName = languageMap[ext as keyof typeof languageMap] + const lang = languages[langName] + + const parser = new Parser() + parser.setLanguage(lang) + let query: Parser.Query + + switch (ext) { + case "js": + case "jsx": + query = lang.query(javascriptQuery) + break + case "ts": + case "tsx": + query = lang.query(typescriptQuery) + break + case "py": + query = lang.query(pythonQuery) + break + case "rs": + query = lang.query(rustQuery) + break + case "go": + query = lang.query(goQuery) + break + case "cpp": + case "hpp": + query = lang.query(cppQuery) + break + case "c": + case "h": + query = lang.query(cQuery) + break + case "cs": + query = lang.query(csharpQuery) + break + case "rb": + query = lang.query(rubyQuery) + break + case "java": + query = lang.query(javaQuery) + break + case "php": + query = lang.query(phpQuery) + break + case "swift": + query = lang.query(swiftQuery) + break + default: + throw new Error(`Unsupported language: ${ext}`) + } + + parsers[ext] = { parser, query } + } + } + + return parsers +}