Optimize language parser loading by only loading once for all files

This commit is contained in:
Saoud Rizwan
2024-07-27 07:54:44 -04:00
parent 303da320aa
commit c40fae4cfb
2 changed files with 156 additions and 98 deletions

View File

@@ -1,22 +1,7 @@
import * as path from "path"
import { globby } from "globby"
import * as fs from "fs/promises"
import Parser from "web-tree-sitter"
import {
javascriptQuery,
typescriptQuery,
pythonQuery,
rustQuery,
goQuery,
cppQuery,
cQuery,
csharpQuery,
rubyQuery,
javaQuery,
phpQuery,
swiftQuery,
} from "./tree-sitter-queries/tags"
import { globby } from "globby"
import * as path from "path"
import { LanguageParser, loadAllLanguages } from "./languageParser"
async function analyzeProject(dirPath: string): Promise<string> {
let result = ""
@@ -27,11 +12,14 @@ async function analyzeProject(dirPath: string): Promise<string> {
// Separate files to parse and remaining files
const { filesToParse, remainingFiles } = separateFiles(allFiles)
// Load only the necessary language parsers
const languageParsers = await loadAllLanguages(filesToParse)
// Parse specific files and generate result
result += "Files parsed with ASTs:\n"
for (const file of filesToParse) {
result += `File: ${file}\n`
const ast = await parseFile(file)
const ast = await parseFile(file, languageParsers)
result += `AST: ${JSON.stringify(ast, null, 2)}\n\n`
}
@@ -121,83 +109,12 @@ This approach allows us to focus on the most relevant parts of the code (defined
- https://github.com/tree-sitter/tree-sitter/blob/master/lib/binding_web/test/helper.js
- https://tree-sitter.github.io/tree-sitter/code-navigation-systems
*/
async function parseFile(filePath: string): Promise<string> {
async function parseFile(filePath: string, languageParsers: LanguageParser): Promise<string> {
const fileContent = await fs.readFile(filePath, "utf8")
const ext = path.extname(filePath).toLowerCase().slice(1)
await Parser.init()
const parser = new Parser()
let query: Parser.Query
switch (ext) {
case "js":
case "jsx":
const JavaScript = await loadLanguage("javascript")
parser.setLanguage(JavaScript)
query = JavaScript.query(javascriptQuery)
break
case "ts":
const TypeScript = await loadLanguage("typescript")
parser.setLanguage(TypeScript)
query = TypeScript.query(typescriptQuery)
break
case "tsx":
const Tsx = await loadLanguage("tsx")
parser.setLanguage(Tsx)
query = Tsx.query(typescriptQuery)
break
case "py":
const Python = await loadLanguage("python")
parser.setLanguage(Python)
query = Python.query(pythonQuery)
break
case "rs":
const Rust = await loadLanguage("rust")
parser.setLanguage(Rust)
query = Rust.query(rustQuery)
break
case "go":
const Go = await loadLanguage("go")
parser.setLanguage(Go)
query = Go.query(goQuery)
break
case "cpp":
case "hpp":
const Cpp = await loadLanguage("cpp")
parser.setLanguage(Cpp)
query = Cpp.query(cppQuery)
break
case "c":
case "h":
const C = await loadLanguage("c")
parser.setLanguage(C)
query = C.query(cQuery)
break
case "cs":
const CSharp = await loadLanguage("c_sharp")
parser.setLanguage(CSharp)
query = CSharp.query(csharpQuery)
break
case "rb":
const Ruby = await loadLanguage("ruby")
parser.setLanguage(Ruby)
query = Ruby.query(rubyQuery)
break
case "java":
const Java = await loadLanguage("java")
parser.setLanguage(Java)
query = Java.query(javaQuery)
break
case "php":
const PHP = await loadLanguage("php")
parser.setLanguage(PHP)
query = PHP.query(phpQuery)
break
case "swift":
const Swift = await loadLanguage("swift")
parser.setLanguage(Swift)
query = Swift.query(swiftQuery)
break
default:
const { parser, query } = languageParsers[ext] || {}
if (!parser || !query) {
return `Unsupported file type: ${filePath}`
}
@@ -247,8 +164,4 @@ async function parseFile(filePath: string): Promise<string> {
return formattedOutput
}
async function loadLanguage(langName: string) {
return await Parser.Language.load(path.join(__dirname, `tree-sitter-${langName}.wasm`))
}
export { analyzeProject }

View File

@@ -0,0 +1,145 @@
import * as path from "path"
import Parser from "web-tree-sitter"
import {
javascriptQuery,
typescriptQuery,
pythonQuery,
rustQuery,
goQuery,
cppQuery,
cQuery,
csharpQuery,
rubyQuery,
javaQuery,
phpQuery,
swiftQuery,
} from "./tree-sitter-queries/tags"
/**
 * Lookup table of ready-to-use tree-sitter objects.
 *
 * Keys are the strings `loadAllLanguages` stores entries under: lowercased
 * file extensions without the leading dot (e.g. `"ts"`, `"py"`).
 */
export interface LanguageParser {
	[extension: string]: {
		/** Parser instance whose language has already been set. */
		parser: Parser
		/** Query compiled from the tags query for that language. */
		query: Parser.Query
	}
}
/**
 * Loads the prebuilt tree-sitter grammar for a single language.
 *
 * @param langName Grammar name; the file loaded is `tree-sitter-<langName>.wasm`.
 * @returns Whatever `Parser.Language.load` resolves to for that file, which is
 *   looked up next to this module (relative to `__dirname`).
 */
async function loadLanguage(langName: string) {
	const wasmFile = `tree-sitter-${langName}.wasm`
	const wasmPath = path.join(__dirname, wasmFile)
	const language = await Parser.Language.load(wasmPath)
	return language
}
/*
Using node bindings for tree-sitter is problematic in vscode extensions
because of incompatibility with electron. Going the .wasm route has the
advantage of not having to build for multiple architectures.
We use web-tree-sitter together with tree-sitter-wasms, which provides auto-updating prebuilt WASM binaries for tree-sitter's language parsers.
This function loads WASM modules for relevant language parsers based on input files:
1. Extracts unique file extensions
2. Maps extensions to language names
3. Loads corresponding WASM files (containing grammar rules)
4. Uses WASM modules to initialize tree-sitter parsers
This approach optimizes performance by loading only necessary parsers once for all relevant files.
Sources:
- https://github.com/tree-sitter/node-tree-sitter/issues/169
- https://github.com/tree-sitter/node-tree-sitter/issues/168
- https://github.com/Gregoor/tree-sitter-wasms/blob/main/README.md
*/
/**
 * Builds one parser/query pair per supported file extension found in `filesToParse`.
 *
 * Each grammar is loaded at most once, even when several extensions share it
 * (e.g. `js` and `jsx`). Extensions that are not in the table below are skipped,
 * so the returned object simply has no entry for them.
 *
 * @param filesToParse Paths whose extensions decide which grammars get loaded.
 * @returns Object keyed by lowercased extension (no leading dot) holding the
 *   configured parser and compiled tags query for that extension.
 * @throws Propagates any rejection from `Parser.init`, from loading a grammar
 *   file, or from compiling a query.
 */
export async function loadAllLanguages(filesToParse: string[]): Promise<LanguageParser> {
	await Parser.init()

	// Single source of truth: extension -> grammar name + tags query.
	// A Map is used instead of an object literal with `in`, because `in` also
	// matches inherited keys such as "constructor" or "__proto__", which would
	// send a non-string "language name" to loadLanguage and reject the whole load.
	const grammarByExtension = new Map<string, { langName: string; querySource: string }>([
		["js", { langName: "javascript", querySource: javascriptQuery }],
		["jsx", { langName: "javascript", querySource: javascriptQuery }],
		["ts", { langName: "typescript", querySource: typescriptQuery }],
		// tsx has its own grammar but shares the TypeScript tags query.
		["tsx", { langName: "tsx", querySource: typescriptQuery }],
		["py", { langName: "python", querySource: pythonQuery }],
		["rs", { langName: "rust", querySource: rustQuery }],
		["go", { langName: "go", querySource: goQuery }],
		["cpp", { langName: "cpp", querySource: cppQuery }],
		["hpp", { langName: "cpp", querySource: cppQuery }],
		["c", { langName: "c", querySource: cQuery }],
		["h", { langName: "c", querySource: cQuery }],
		["cs", { langName: "c_sharp", querySource: csharpQuery }],
		["rb", { langName: "ruby", querySource: rubyQuery }],
		["java", { langName: "java", querySource: javaQuery }],
		["php", { langName: "php", querySource: phpQuery }],
		["swift", { langName: "swift", querySource: swiftQuery }],
	])

	// Unique, lowercased extensions without the dot ("" for files that have none).
	const extensionsToLoad = new Set(filesToParse.map((file) => path.extname(file).toLowerCase().slice(1)))

	// Cache of already-loaded grammars, keyed by grammar name.
	const languages = new Map<string, Parser.Language>()
	const parsers: LanguageParser = {}

	for (const ext of extensionsToLoad) {
		const grammar = grammarByExtension.get(ext)
		if (!grammar) {
			// Unsupported extension: leave it out of the result.
			continue
		}

		let lang = languages.get(grammar.langName)
		if (!lang) {
			lang = await loadLanguage(grammar.langName)
			languages.set(grammar.langName, lang)
		}

		// Every extension gets its own Parser instance, bound to the shared grammar.
		const parser = new Parser()
		parser.setLanguage(lang)
		parsers[ext] = { parser, query: lang.query(grammar.querySource) }
	}

	return parsers
}