Refactor analyze-project to parse-source-code

This commit is contained in:
Saoud Rizwan
2024-07-31 09:52:36 -04:00
parent 71a9ed1d1e
commit 3daca3d68f
16 changed files with 1 additions and 1 deletions

View File

@@ -0,0 +1,194 @@
import * as fs from "fs/promises"
import { globby } from "globby"
import * as path from "path"
import { LanguageParser, loadRequiredLanguageParsers } from "./languageParser"
// TODO: implement caching behavior to avoid having to keep analyzing project for new tasks.
/**
 * Builds a textual outline of the source-code definitions found in the entries directly inside `dirPath`.
 *
 * @param dirPath Directory whose immediate entries are inspected (the listing is not recursive).
 * @returns One section per file that yielded output (the path relative to `dirPath`, then the
 * formatted definitions), or "No source code definitions found." when nothing was produced.
 */
export async function parseSourceCodeForDefinitions(dirPath: string): Promise<string> {
	// Non-recursive listing of dirPath's entries
	const entries = await listFiles(dirPath, false)

	// Only files with a supported extension are parsed; the remaining entries are currently not reported
	const { filesToParse } = separateFiles(entries)
	const languageParsers = await loadRequiredLanguageParsers(filesToParse)

	// Files are parsed one at a time, in listing order
	const sections: string[] = []
	for (const sourceFile of filesToParse) {
		const outline = await parseFile(sourceFile, languageParsers)
		if (!outline) {
			continue
		}
		sections.push(`${path.relative(dirPath, sourceFile)}\n${outline}\n`)
	}

	const combined = sections.join("")
	return combined.length > 0 ? combined : "No source code definitions found."
}
/**
 * Lists the entries of a directory with globby.
 *
 * @param dirPath Directory to list; it is also passed to globby as `cwd`.
 * @param recursive When true, globs "**" (files only) with gitignore handling and the default ignore
 * list below. When false, globs "*" and also returns directories, with no ignore rules.
 * @returns Absolute paths, with "/" appended to directories. When `dirPath` resolves to the filesystem
 * root, only the root itself is returned and nothing is globbed.
 */
export async function listFiles(dirPath: string, recursive: boolean): Promise<string[]> {
	const resolvedDir = path.resolve(dirPath)
	const fsRoot = process.platform === "win32" ? path.parse(resolvedDir).root : "/"
	// Guard clause: the filesystem root is never globbed
	if (resolvedDir === fsRoot) {
		return [fsRoot]
	}

	const ignoredDirNames = [
		"node_modules",
		"__pycache__",
		"env",
		"venv",
		"target/dependency",
		"build/dependencies",
		"dist",
		"out",
		"bundle",
		"vendor",
		"tmp",
		"temp",
		"deps",
		"pkg",
		"Pods",
		// '!**/.*' would exclude hidden directories entirely, while '!**/.*/**' excludes only their contents,
		// so hidden directories themselves are still visible in the listing.
		".*",
	]
	const ignoreGlobs = ignoredDirNames.map((name) => `**/${name}/**`)

	// "*" matches entries of one directory, "**" also descends into nested directories
	const pattern = recursive ? "**" : "*"
	const options = {
		cwd: dirPath,
		dot: true, // include hidden files/directories
		absolute: true,
		markDirectories: true, // append a / to matched directories
		gitignore: recursive, // let globby skip gitignored files
		ignore: recursive ? ignoreGlobs : undefined, // sensible defaults in case there is no gitignore
		onlyFiles: recursive, // false lists directories on their own too
	}
	return await globby(pattern, options)
}
/**
 * Splits a file listing into the files that will be parsed and everything else.
 *
 * Extensions are compared case-insensitively so that e.g. "Main.PY" is selected; parseFile and
 * loadRequiredLanguageParsers already lowercase the extension before looking up a parser.
 *
 * @param allFiles Paths to classify.
 * @returns `filesToParse`: the first 50 paths (in input order) with a supported extension;
 * `remainingFiles`: every path that is not in `filesToParse`, in input order.
 */
function separateFiles(allFiles: string[]): { filesToParse: string[]; remainingFiles: string[] } {
	const maxFilesToParse = 50
	const supportedExtensions = new Set(
		[
			"js",
			"jsx",
			"ts",
			"tsx",
			"py",
			// Rust
			"rs",
			"go",
			// C
			"c",
			"h",
			// C++
			"cpp",
			"hpp",
			// C#
			"cs",
			// Ruby
			"rb",
			"java",
			"php",
			"swift",
		].map((e) => `.${e}`)
	)
	const filesToParse = allFiles
		.filter((file) => supportedExtensions.has(path.extname(file).toLowerCase()))
		.slice(0, maxFilesToParse)
	// Set lookup keeps the second pass linear instead of scanning filesToParse for every path
	const selected = new Set(filesToParse)
	const remainingFiles = allFiles.filter((file) => !selected.has(file))
	return { filesToParse, remainingFiles }
}
/*
Parsing files using tree-sitter
1. Parse the file content into an AST (Abstract Syntax Tree) using the appropriate language grammar (set of rules that define how the components of a language like keywords, expressions, and statements can be combined to create valid programs).
2. Create a query using a language-specific query string, and run it against the AST's root node to capture specific syntax elements.
- We use tag queries to identify named entities in a program, and then use a syntax capture to label the entity and its name. A notable example of this is GitHub's search-based code navigation.
- Our custom tag queries are based on tree-sitter's default tag queries, but modified to only capture definitions.
3. Sort the captures by their position in the file, output the name of the definition, and format the output, e.g. by adding "|----\n" for gaps between captured sections.
This approach allows us to focus on the most relevant parts of the code (defined by our language-specific queries) and provides a concise yet informative view of the file's structure and key elements.
- https://github.com/tree-sitter/node-tree-sitter/blob/master/test/query_test.js
- https://github.com/tree-sitter/tree-sitter/blob/master/lib/binding_web/test/query-test.js
- https://github.com/tree-sitter/tree-sitter/blob/master/lib/binding_web/test/helper.js
- https://tree-sitter.github.io/tree-sitter/code-navigation-systems
*/
/**
 * Produces a condensed outline of one source file: the first line of every captured definition name,
 * with "|----" separators where there is a gap between captures.
 *
 * @param filePath Path of the file to read (as UTF-8) and parse.
 * @param languageParsers Parsers and queries keyed by lowercase file extension (without the dot).
 * @returns The outline wrapped in "|----" lines; an "Unsupported file type: …" message when no
 * parser/query is registered for the extension; or undefined when nothing was captured or the file
 * could not be read or parsed (the error is logged).
 */
async function parseFile(filePath: string, languageParsers: LanguageParser): Promise<string | undefined> {
	const ext = path.extname(filePath).toLowerCase().slice(1)
	const { parser, query } = languageParsers[ext] || {}
	if (!parser || !query) {
		return `Unsupported file type: ${filePath}`
	}
	let formattedOutput = ""
	try {
		// Reading happens inside the try so that one unreadable file is logged and skipped
		// instead of rejecting and aborting the analysis of every other file
		const fileContent = await fs.readFile(filePath, "utf8")
		// Parse the file content into an Abstract Syntax Tree (AST), a tree-like representation of the code
		const tree = parser.parse(fileContent)
		// Apply the query to the AST; each capture is an AST node matched by one of our query patterns
		const captures = query.captures(tree.rootNode)
		// Sort captures by the row they start on
		captures.sort((a, b) => a.node.startPosition.row - b.node.startPosition.row)
		// Split on LF or CRLF so that lines of Windows-style files do not carry a trailing "\r" into the output
		const lines = fileContent.split(/\r?\n/)
		// Last row covered by the previously processed capture (-1 until the first capture)
		let lastLine = -1
		for (const { node, name } of captures) {
			const startLine = node.startPosition.row
			const endLine = node.endPosition.row
			// Add a separator if there's a gap between this capture and the previous one
			if (lastLine !== -1 && startLine > lastLine + 1) {
				formattedOutput += "|----\n"
			}
			// Only the first line of a definition name is emitted. Query captures include both the definition
			// name and the definition body, but we only want the name. The check is `includes("name")` because
			// of discrepancies between languages, i.e. javascript names are 'name' while typescript names are
			// 'name.definition.*'; a stricter `startsWith("name.definition.")` excluded relevant definitions.
			if (name.includes("name") && lines[startLine]) {
				formattedOutput += `${lines[startLine]}\n`
			}
			lastLine = endLine
		}
	} catch (error) {
		console.log(`Error parsing file: ${error}\n`)
	}
	if (formattedOutput.length > 0) {
		return `|----\n${formattedOutput}|----\n`
	}
	return undefined
}

View File

@@ -0,0 +1,122 @@
import * as path from "path"
import Parser from "web-tree-sitter"
import {
javascriptQuery,
typescriptQuery,
pythonQuery,
rustQuery,
goQuery,
cppQuery,
cQuery,
csharpQuery,
rubyQuery,
javaQuery,
phpQuery,
swiftQuery,
} from "./queries"
/**
 * Lookup table of loaded tree-sitter parsers.
 * loadRequiredLanguageParsers fills it using the lowercase file extension without the leading dot as key.
 */
export interface LanguageParser {
[key: string]: {
// Parser instance on which setLanguage has been called for this extension's grammar
parser: Parser
// Query created from that language with the matching query string
query: Parser.Query
}
}
/**
 * Loads the grammar file `tree-sitter-<langName>.wasm` located next to this module.
 *
 * @param langName Grammar name used in the .wasm file name (e.g. "javascript", "c_sharp").
 * @returns The result of `Parser.Language.load` for that file.
 */
async function loadLanguage(langName: string) {
	const wasmFile = `tree-sitter-${langName}.wasm`
	const wasmPath = path.join(__dirname, wasmFile)
	return Parser.Language.load(wasmPath)
}
/*
Using node bindings for tree-sitter is problematic in vscode extensions
because of incompatibility with electron. Going the .wasm route has the
advantage of not having to build for multiple architectures.
We use web-tree-sitter and tree-sitter-wasms which provides auto-updating prebuilt WASM binaries for tree-sitter's language parsers.
This function loads WASM modules for relevant language parsers based on input files:
1. Extracts unique file extensions
2. Maps extensions to language names
3. Loads corresponding WASM files (containing grammar rules)
4. Uses WASM modules to initialize tree-sitter parsers
This approach optimizes performance by loading only necessary parsers once for all relevant files.
Sources:
- https://github.com/tree-sitter/node-tree-sitter/issues/169
- https://github.com/tree-sitter/node-tree-sitter/issues/168
- https://github.com/Gregoor/tree-sitter-wasms/blob/main/README.md
- https://github.com/tree-sitter/tree-sitter/blob/master/lib/binding_web/README.md
- https://github.com/tree-sitter/tree-sitter/blob/master/lib/binding_web/test/query-test.js
*/
/**
 * Creates one parser/query pair for every distinct file extension found in `filesToParse`.
 *
 * @param filesToParse Paths whose (lowercased, dot-less) extensions decide which grammars are loaded.
 * @returns A map from extension to its parser and query.
 * @throws Error "Unsupported language: <ext>" for an extension that has no grammar mapping.
 */
export async function loadRequiredLanguageParsers(filesToParse: string[]): Promise<LanguageParser> {
	await Parser.init()

	// extension -> grammar name (as used in the .wasm file name) and the query string to compile for it.
	// A Map is used so that only the keys listed here can ever match.
	const grammarByExtension = new Map<string, { grammar: string; queryText: string }>([
		["js", { grammar: "javascript", queryText: javascriptQuery }],
		["jsx", { grammar: "javascript", queryText: javascriptQuery }],
		["ts", { grammar: "typescript", queryText: typescriptQuery }],
		["tsx", { grammar: "tsx", queryText: typescriptQuery }],
		["py", { grammar: "python", queryText: pythonQuery }],
		["rs", { grammar: "rust", queryText: rustQuery }],
		["go", { grammar: "go", queryText: goQuery }],
		["cpp", { grammar: "cpp", queryText: cppQuery }],
		["hpp", { grammar: "cpp", queryText: cppQuery }],
		["c", { grammar: "c", queryText: cQuery }],
		["h", { grammar: "c", queryText: cQuery }],
		["cs", { grammar: "c_sharp", queryText: csharpQuery }],
		["rb", { grammar: "ruby", queryText: rubyQuery }],
		["java", { grammar: "java", queryText: javaQuery }],
		["php", { grammar: "php", queryText: phpQuery }],
		["swift", { grammar: "swift", queryText: swiftQuery }],
	])

	// Distinct extensions, in order of first appearance
	const requiredExtensions = new Set<string>()
	for (const file of filesToParse) {
		requiredExtensions.add(path.extname(file).toLowerCase().slice(1))
	}

	const parsers: LanguageParser = {}
	for (const ext of requiredExtensions) {
		const spec = grammarByExtension.get(ext)
		if (spec === undefined) {
			throw new Error(`Unsupported language: ${ext}`)
		}
		// Grammars are loaded one after another, once per extension
		const language = await loadLanguage(spec.grammar)
		const query = language.query(spec.queryText)
		const parser = new Parser()
		parser.setLanguage(language)
		parsers[ext] = { parser, query }
	}
	return parsers
}

View File

@@ -0,0 +1,23 @@
/*
Query string whose patterns capture the identifier as @name.definition.<kind>
and the whole declaration node as @definition.<kind>:
- class declarations
- interface declarations
- method declarations
- namespace declarations (labelled "module")
*/
export default `
(class_declaration
name: (identifier) @name.definition.class
) @definition.class
(interface_declaration
name: (identifier) @name.definition.interface
) @definition.interface
(method_declaration
name: (identifier) @name.definition.method
) @definition.method
(namespace_declaration
name: (identifier) @name.definition.module
) @definition.module
`

View File

@@ -0,0 +1,15 @@
/*
Query string whose patterns capture the identifier as @name.definition.<kind>
and the matched node as @definition.<kind>:
- struct declarations (labelled "class"; the pattern requires a body)
- union declarations (labelled "class"; matched as the type of a declaration)
- function declarations (matched on the function declarator)
- typedef declarations
*/
export default `
(struct_specifier name: (type_identifier) @name.definition.class body:(_)) @definition.class
(declaration type: (union_specifier name: (type_identifier) @name.definition.class)) @definition.class
(function_declarator declarator: (identifier) @name.definition.function) @definition.function
(type_definition declarator: (type_identifier) @name.definition.type) @definition.type
`

View File

@@ -0,0 +1,23 @@
/*
Query string whose patterns capture the identifier as @name.definition.<kind>
and the matched node as @definition.<kind>:
- struct declarations (labelled "class"; the pattern requires a body)
- union declarations (labelled "class"; matched as the type of a declaration)
- function declarations (declarator is an identifier or a field identifier)
- method declarations (with namespace scope; the namespace identifier is captured as @scope)
- typedef declarations
- class declarations
*/
export default `
(struct_specifier name: (type_identifier) @name.definition.class body:(_)) @definition.class
(declaration type: (union_specifier name: (type_identifier) @name.definition.class)) @definition.class
(function_declarator declarator: (identifier) @name.definition.function) @definition.function
(function_declarator declarator: (field_identifier) @name.definition.function) @definition.function
(function_declarator declarator: (qualified_identifier scope: (namespace_identifier) @scope name: (identifier) @name.definition.method)) @definition.method
(type_definition declarator: (type_identifier) @name.definition.type) @definition.type
(class_specifier name: (type_identifier) @name.definition.class) @definition.class
`

View File

@@ -0,0 +1,27 @@
/*
Query string whose patterns capture the identifier as @name.definition.<kind>
and the whole declaration node as @definition.<kind>:
- function declarations (with associated comments, captured as @doc)
- method declarations (with associated comments, captured as @doc)
- type specifications
The #strip! and #set-adjacent! predicates are attached to the two comment-aware patterns only.
*/
export default `
(
(comment)* @doc
.
(function_declaration
name: (identifier) @name.definition.function) @definition.function
(#strip! @doc "^//\\s*")
(#set-adjacent! @doc @definition.function)
)
(
(comment)* @doc
.
(method_declaration
name: (field_identifier) @name.definition.method) @definition.method
(#strip! @doc "^//\\s*")
(#set-adjacent! @doc @definition.method)
)
(type_spec
name: (type_identifier) @name.definition.type) @definition.type
`

View File

@@ -0,0 +1,12 @@
// Re-exports the default export of each per-language module under a named `<language>Query` export.
export { default as phpQuery } from "./php"
export { default as typescriptQuery } from "./typescript"
export { default as pythonQuery } from "./python"
export { default as javascriptQuery } from "./javascript"
export { default as javaQuery } from "./java"
export { default as rustQuery } from "./rust"
export { default as rubyQuery } from "./ruby"
export { default as cppQuery } from "./cpp"
export { default as cQuery } from "./c"
export { default as csharpQuery } from "./c-sharp"
export { default as goQuery } from "./go"
export { default as swiftQuery } from "./swift"

View File

@@ -0,0 +1,15 @@
/*
Query string whose patterns capture the identifier as @name.definition.<kind>
and the whole declaration node as @definition.<kind>:
- class declarations
- method declarations
- interface declarations
*/
export default `
(class_declaration
name: (identifier) @name.definition.class) @definition.class
(method_declaration
name: (identifier) @name.definition.method) @definition.method
(interface_declaration
name: (identifier) @name.definition.interface) @definition.interface
`

View File

@@ -0,0 +1,65 @@
/*
Query string whose patterns capture the name as plain @name (no ".definition" suffix),
the defining node as @definition.<kind>, and any comments directly before it as @doc:
- class definitions (class expressions and class declarations)
- method definitions (a #not-eq? predicate excludes names equal to "constructor")
- named function declarations (including generator function declarations)
- arrow functions and function expressions assigned to variables
  (both lexical and variable declarations; @definition.function is placed on the variable declarator)
*/
export default `
(
(comment)* @doc
.
(method_definition
name: (property_identifier) @name) @definition.method
(#not-eq? @name "constructor")
(#strip! @doc "^[\\s\\*/]+|^[\\s\\*/]$")
(#select-adjacent! @doc @definition.method)
)
(
(comment)* @doc
.
[
(class
name: (_) @name)
(class_declaration
name: (_) @name)
] @definition.class
(#strip! @doc "^[\\s\\*/]+|^[\\s\\*/]$")
(#select-adjacent! @doc @definition.class)
)
(
(comment)* @doc
.
[
(function_declaration
name: (identifier) @name)
(generator_function_declaration
name: (identifier) @name)
] @definition.function
(#strip! @doc "^[\\s\\*/]+|^[\\s\\*/]$")
(#select-adjacent! @doc @definition.function)
)
(
(comment)* @doc
.
(lexical_declaration
(variable_declarator
name: (identifier) @name
value: [(arrow_function) (function_expression)]) @definition.function)
(#strip! @doc "^[\\s\\*/]+|^[\\s\\*/]$")
(#select-adjacent! @doc @definition.function)
)
(
(comment)* @doc
.
(variable_declaration
(variable_declarator
name: (identifier) @name
value: [(arrow_function) (function_expression)]) @definition.function)
(#strip! @doc "^[\\s\\*/]+|^[\\s\\*/]$")
(#select-adjacent! @doc @definition.function)
)
`

View File

@@ -0,0 +1,15 @@
/*
Query string whose patterns capture the name as @name.definition.<kind>
and the whole node as @definition.<kind>:
- class declarations
- function definitions
- method declarations (labelled "function", the same label as function definitions)
*/
export default `
(class_declaration
name: (name) @name.definition.class) @definition.class
(function_definition
name: (name) @name.definition.function) @definition.function
(method_declaration
name: (name) @name.definition.function) @definition.function
`

View File

@@ -0,0 +1,11 @@
/*
Query string whose patterns capture the identifier as @name.definition.<kind>
and the whole definition node as @definition.<kind>:
- class definitions
- function definitions
*/
export default `
(class_definition
name: (identifier) @name.definition.class) @definition.class
(function_definition
name: (identifier) @name.definition.function) @definition.function
`

View File

@@ -0,0 +1,52 @@
/*
Query string whose patterns capture the name as @name.definition.<kind>
and the whole node as @definition.<kind>:
- method definitions (including singleton methods, with associated comments captured as @doc)
- aliases (labelled "method"; this pattern has no @doc capture)
- class definitions (including singleton classes, with associated comments captured as @doc)
- module definitions
Class and module names are matched either as a constant or as the name part of a scope resolution.
*/
export default `
(
(comment)* @doc
.
[
(method
name: (_) @name.definition.method) @definition.method
(singleton_method
name: (_) @name.definition.method) @definition.method
]
(#strip! @doc "^#\\s*")
(#select-adjacent! @doc @definition.method)
)
(alias
name: (_) @name.definition.method) @definition.method
(
(comment)* @doc
.
[
(class
name: [
(constant) @name.definition.class
(scope_resolution
name: (_) @name.definition.class)
]) @definition.class
(singleton_class
value: [
(constant) @name.definition.class
(scope_resolution
name: (_) @name.definition.class)
]) @definition.class
]
(#strip! @doc "^#\\s*")
(#select-adjacent! @doc @definition.class)
)
(
(module
name: [
(constant) @name.definition.module
(scope_resolution
name: (_) @name.definition.module)
]) @definition.module
)
`

View File

@@ -0,0 +1,16 @@
/*
Query string whose patterns capture the identifier as @name.definition.<kind>:
- struct definitions (labelled "class")
- method definitions (function items inside a declaration list;
  @definition.method is placed on the enclosing declaration list, not on the function item)
- function definitions (any function item, so items matched as methods are matched here as well)
*/
export default `
(struct_item
name: (type_identifier) @name.definition.class) @definition.class
(declaration_list
(function_item
name: (identifier) @name.definition.method)) @definition.method
(function_item
name: (identifier) @name.definition.function) @definition.function
`

View File

@@ -0,0 +1,45 @@
/*
Query string whose patterns capture the name as plain @name (no ".definition" suffix):
- class declarations
- protocol declarations (labelled "interface")
- method declarations inside a class body (functions, subscripts, initializers and deinitializers;
  @definition.method is placed on the enclosing class declaration)
- property declarations (matched both inside a class body, where @definition.property is placed
  on the enclosing class declaration, and anywhere else on the property declaration itself)
- function declarations
*/
export default `
(class_declaration
name: (type_identifier) @name) @definition.class
(protocol_declaration
name: (type_identifier) @name) @definition.interface
(class_declaration
(class_body
[
(function_declaration
name: (simple_identifier) @name
)
(subscript_declaration
(parameter (simple_identifier) @name)
)
(init_declaration "init" @name)
(deinit_declaration "deinit" @name)
]
)
) @definition.method
(class_declaration
(class_body
[
(property_declaration
(pattern (simple_identifier) @name)
)
]
)
) @definition.property
(property_declaration
(pattern (simple_identifier) @name)
) @definition.property
(function_declaration
name: (simple_identifier) @name) @definition.function
`

View File

@@ -0,0 +1,32 @@
/*
Query string whose patterns capture the name as @name.definition.<kind>
and the whole node as @definition.<kind>:
- function signatures and declarations
- method signatures and definitions
- abstract method signatures
- class declarations (including abstract classes)
- module declarations
There is no pattern for interface declarations themselves; only the method signatures are matched.
*/
export default `
(function_signature
name: (identifier) @name.definition.function) @definition.function
(method_signature
name: (property_identifier) @name.definition.method) @definition.method
(abstract_method_signature
name: (property_identifier) @name.definition.method) @definition.method
(abstract_class_declaration
name: (type_identifier) @name.definition.class) @definition.class
(module
name: (identifier) @name.definition.module) @definition.module
(function_declaration
name: (identifier) @name.definition.function) @definition.function
(method_definition
name: (property_identifier) @name.definition.method) @definition.method
(class_declaration
name: (type_identifier) @name.definition.class) @definition.class
`