feat: Parser & Definitions

This commit is contained in:
Bogdan Buduroiu 2025-03-30 16:46:45 +08:00
parent 39e9f4d2c8
commit 1c688e5e45
Signed by: bruvduroiu
GPG key ID: A8722B2334DE9499
4 changed files with 254 additions and 28 deletions

1
.gitignore vendored
View file

@ -20,3 +20,4 @@ zig-out/
# Although this was renamed to .zig-cache, let's leave it here for a few # Although this was renamed to .zig-cache, let's leave it here for a few
# releases to make it less annoying to work with multiple branches. # releases to make it less annoying to work with multiple branches.
zig-cache/ zig-cache/
.aider*

72
src/definitions.zig Normal file
View file

@ -0,0 +1,72 @@
const std = @import("std");
/// Owned description of a function definition.
///
/// Every string field is a private copy made by `init` using `allocator`
/// and is released again by `destroy`.
pub const Function = struct {
    name: []u8,
    params: []u8,
    return_type: []u8,
    access_modifier: []u8,
    documentation: []u8,
    /// Allocator that owns all string fields above.
    allocator: std.mem.Allocator,

    /// Builds a `Function` by duplicating each argument with `allocator`.
    ///
    /// The inputs are only read; the caller keeps ownership of them.
    /// Returns `error.OutOfMemory` if any copy fails, in which case every
    /// copy made so far is released before returning.
    pub fn init(
        allocator: std.mem.Allocator,
        name: []const u8,
        params: []const u8,
        return_type: []const u8,
        access_modifier: []const u8,
        documentation: []const u8,
    ) !Function {
        // Each successful copy is guarded so that a later failure cannot
        // leak the earlier ones.
        const name_copy = try allocator.dupe(u8, name);
        errdefer allocator.free(name_copy);
        const params_copy = try allocator.dupe(u8, params);
        errdefer allocator.free(params_copy);
        const return_type_copy = try allocator.dupe(u8, return_type);
        errdefer allocator.free(return_type_copy);
        const access_modifier_copy = try allocator.dupe(u8, access_modifier);
        errdefer allocator.free(access_modifier_copy);
        const documentation_copy = try allocator.dupe(u8, documentation);

        return .{
            .name = name_copy,
            .params = params_copy,
            .return_type = return_type_copy,
            .access_modifier = access_modifier_copy,
            .documentation = documentation_copy,
            .allocator = allocator,
        };
    }

    /// Frees every string owned by this value. The value must not be used
    /// afterwards.
    pub fn destroy(self: *Function) void {
        self.allocator.free(self.name);
        self.allocator.free(self.params);
        self.allocator.free(self.return_type);
        self.allocator.free(self.access_modifier);
        self.allocator.free(self.documentation);
    }

    /// Emits `func <name>() -> <return_type>;` through `writer.print`.
    ///
    /// NOTE(review): the value returned by `writer.print` is not propagated,
    /// so this only suits writers whose `print` returns `void` (the parser
    /// passes `std.debug`). Switching to a fallible writer requires changing
    /// the return type to `!void` together with the callers.
    pub fn print(self: Function, writer: anytype) void {
        writer.print("func {s}() -> {s};", .{ self.name, self.return_type });
    }
};
/// Owned description of a property; currently only its name is recorded.
pub const Property = struct {
    name: []u8,
    /// Allocator that owns `name`.
    allocator: std.mem.Allocator,

    /// Copies `name` with `allocator` and wraps it in a `Property`.
    /// Fails with `error.OutOfMemory` when the copy cannot be made.
    pub fn init(allocator: std.mem.Allocator, name: []const u8) !Property {
        const owned_name = try allocator.dupe(u8, name);
        return .{ .name = owned_name, .allocator = allocator };
    }

    /// Releases the copied name.
    pub fn destroy(self: *Property) void {
        const owner = self.allocator;
        owner.free(self.name);
    }
};
/// A single extracted definition; one variant per supported kind.
pub const Definition = union(enum) {
    function: Function,

    /// Forwards to the active variant's `print`.
    pub fn print(self: Definition, writer: anytype) !void {
        switch (self) {
            inline else => |case| return case.print(writer),
        }
    }

    /// Forwards to the active variant's `destroy`, releasing what it owns.
    pub fn destroy(self: *Definition) void {
        // Switch on the pointee and capture the payload by pointer: the
        // variants' `destroy` methods take a mutable receiver.
        switch (self.*) {
            inline else => |*case| case.destroy(),
        }
    }
};

View file

@ -1,36 +1,19 @@
const std = @import("std"); const std = @import("std");
const ts = @import("tree-sitter"); const ts = @import("tree-sitter");
const Parser = @import("parser.zig");
extern fn tree_sitter_zig() callconv(.C) *ts.Language; extern fn tree_sitter_zig() callconv(.C) *ts.Language;
pub fn main() !void { pub fn main() !void {
// Create a parser for the zig language var gpa = std.heap.GeneralPurposeAllocator(.{}){};
const language = tree_sitter_zig(); defer _ = gpa.deinit();
defer language.destroy(); const allocator = gpa.allocator();
const parser = ts.Parser.create(); const file_path = "/Users/bogdanbuduroiu/development/aurelio-labs/semantic-router/semantic_router/route.py";
var parser = try Parser.create(allocator, file_path);
defer parser.destroy(); defer parser.destroy();
try parser.setLanguage(language); const definitions = try parser.extractDefinitions();
_ = definitions; // autofix
// Parse some source code and get the root node
const tree = parser.parseString("pub fn main() !void {}", null);
defer tree.?.destroy();
const node = tree.?.rootNode();
std.debug.assert(std.mem.eql(u8, node.kind(), "source_file"));
std.debug.print("{s}", .{node.kind()});
// Create a query and execute it
var error_offset: u32 = 0;
const query = try ts.Query.create(language, "name: (identifier) @name", &error_offset);
defer query.destroy();
const cursor = ts.QueryCursor.create();
defer cursor.destroy();
cursor.exec(query, node);
// Get the captured node of the first match
const match = cursor.nextMatch().?;
const capture = match.captures[0].node;
std.debug.assert(std.mem.eql(u8, capture.kind(), "identifier"));
} }

170
src/parser.zig Normal file
View file

@ -0,0 +1,170 @@
const std = @import("std");
const ts = @import("tree-sitter");
const Allocator = std.mem.Allocator;
const MultiArrayList = std.MultiArrayList;
const definitions = @import("definitions.zig");
const Definition = definitions.Definition;
const Function = definitions.Function;
const Self = @This();
/// Tree-sitter parser handle; created in `create`, released in `destroy`.
parser: *ts.Parser,
/// Key passed to `getQueryForLanguage` to pick the query text.
language_name: []const u8,
/// Contents of the file read by `create`; freed with `allocator` in `destroy`.
source: []const u8,
/// Allocator used for `source`, for this struct itself, and for extracted definitions.
allocator: Allocator,
/// Creates a heap-allocated parser for the file at `file_path`.
///
/// The grammar is chosen from the file extension and the file contents are
/// read (up to 10 MiB) with `allocator`. The returned pointer must be
/// released with `destroy`.
///
/// Errors: `error.UnsupportedLanguage` for unknown extensions, plus any
/// error from setting the language, reading the file, or allocating.
pub fn create(allocator: Allocator, file_path: []const u8) !*Self {
    const ext = std.fs.path.extension(file_path);
    // Resolve the grammar first so an unsupported extension fails before
    // any resource is acquired.
    const language = try getLanguageForExtension(ext);

    const parser = ts.Parser.create();
    errdefer parser.destroy();
    try parser.setLanguage(language);

    const source = try std.fs.cwd().readFileAlloc(allocator, file_path, 1024 * 1024 * 10); // 10MB max
    errdefer allocator.free(source);

    const p = try allocator.create(Self);
    p.* = .{
        .parser = parser,
        .source = source,
        .allocator = allocator,
        // Keep the query selection in step with the grammar chosen above
        // instead of always assuming Python.
        .language_name = languageNameForExtension(ext),
    };
    return p;
}

/// Maps a file extension to the language key understood by
/// `getQueryForLanguage`. Extensions other than `.zig`, `.c` and `.h` map to
/// "python"; `create` has already rejected unsupported extensions by then.
fn languageNameForExtension(ext: []const u8) []const u8 {
    if (std.mem.eql(u8, ext, ".zig")) return "zig";
    if (std.mem.eql(u8, ext, ".c") or std.mem.eql(u8, ext, ".h")) return "c";
    return "python";
}
/// Releases the tree-sitter parser, the source buffer, and finally the
/// struct itself. `self` is invalid after this call.
pub fn destroy(self: *Self) void {
    // Copy what is needed out of `self` up front; `self` is freed last.
    const owner = self.allocator;
    const source_bytes = self.source;

    self.parser.destroy();
    owner.free(source_bytes);
    owner.destroy(self);
}
/// Parses `self.source` and collects a `Definition` for every node captured
/// as `@function` by the language query.
///
/// Each collected definition is also printed through `std.debug`.
///
/// Ownership: the caller owns the returned list. It must call `destroy` on
/// every element and then `deinit` the list with `self.allocator`.
///
/// Errors: `error.ParseFailed` when tree-sitter returns no tree, plus any
/// error from query creation or allocation. On error nothing is leaked.
pub fn extractDefinitions(self: *Self) !MultiArrayList(Definition) {
    var defs = MultiArrayList(Definition){};
    // Only clean up on failure; on success the list is handed to the caller.
    errdefer destroyDefinitions(&defs, self.allocator);

    // Parse the source code
    const tree = self.parser.parseString(self.source, null) orelse return error.ParseFailed;
    defer tree.destroy();
    const root_node = tree.rootNode();

    // Get the appropriate query for this language
    const query_string = try getQueryForLanguage(self.language_name);
    var error_offset: u32 = 0;
    const query = try ts.Query.create(self.parser.getLanguage() orelse tree_sitter_python(), query_string, &error_offset);
    defer query.destroy();

    // Execute the query
    const cursor = ts.QueryCursor.create();
    defer cursor.destroy();
    cursor.exec(query, root_node);

    while (cursor.nextMatch()) |match| {
        for (match.captures) |capture| {
            // Captures without a resolvable name cannot be classified.
            const capture_name = query.captureNameForId(capture.index) orelse continue;
            if (!std.mem.eql(u8, capture_name, "function")) continue;

            const node = capture.node;
            // Prefer the node's `name` field; fall back to the whole node text.
            const name = if (node.childByFieldName("name")) |name_node|
                self.source[name_node.startByte()..name_node.endByte()]
            else
                self.source[node.startByte()..node.endByte()];

            var func_def = try Function.init(self.allocator, name, "", "", "", "");
            // Ownership moves into the list on success; release it here
            // only when the append itself fails.
            defs.append(self.allocator, .{ .function = func_def }) catch |err| {
                func_def.destroy();
                return err;
            };
        }
    }

    // MultiArrayList is not directly iterable; walk it by index.
    for (0..defs.len) |i| {
        try defs.get(i).print(std.debug);
    }
    return defs;
}

/// Destroys every definition stored in `defs` and then frees the list's
/// own storage with `allocator`.
fn destroyDefinitions(defs: *MultiArrayList(Definition), allocator: Allocator) void {
    for (0..defs.len) |i| {
        var def = defs.get(i);
        switch (def) {
            .function => |*func| func.destroy(),
        }
    }
    defs.deinit(allocator);
}
// Helper
/// Returns the tree-sitter grammar for a file extension (including the
/// leading dot): `.zig`, `.c`/`.h`, or `.py`.
/// Any other extension yields `error.UnsupportedLanguage`.
fn getLanguageForExtension(ext: []const u8) !*ts.Language {
    if (std.mem.eql(u8, ext, ".zig")) return tree_sitter_zig();

    const is_c_family = std.mem.eql(u8, ext, ".c") or std.mem.eql(u8, ext, ".h");
    if (is_c_family) return tree_sitter_c();

    if (std.mem.eql(u8, ext, ".py")) return tree_sitter_python();

    return error.UnsupportedLanguage;
}
/// Returns the tree-sitter query text for `language_name`.
///
/// "python" gets a dedicated query; every other name receives a small
/// generic query. The query text is embedded here rather than loaded from
/// files. No error is currently produced; the error union is part of the
/// signature.
fn getQueryForLanguage(language_name: []const u8) ![]const u8 {
    const python_query: []const u8 =
        \\;; Capture top-level functions, class, and method definitions
        \\(module
        \\ (expression_statement
        \\ (assignment) @assignment
        \\ )
        \\)
        \\(module
        \\ (function_definition) @function
        \\)
        \\(module
        \\ (decorated_definition
        \\ definition: (function_definition) @function
        \\ )
        \\)
        \\(module
        \\ (class_definition
        \\ body: (block
        \\ (expression_statement
        \\ (assignment) @class_assignment
        \\ )
        \\ )
        \\ ) @class
        \\)
        \\(module
        \\ (class_definition
        \\ body: (block
        \\ (function_definition) @method
        \\ )
        \\ ) @class
        \\)
        \\(module
        \\ (class_definition
        \\ body: (block
        \\ (expression_statement
        \\ (string) @docstring
        \\ )
        \\ )
        \\ ) @class
        \\)
        \\(module
        \\ (class_definition
        \\ body: (block
        \\ (decorated_definition
        \\ definition: (function_definition) @method
        \\ )
        \\ )
        \\ ) @class
        \\)
    ;
    const generic_query: []const u8 =
        \\(function_definition name: (identifier) @function)
        \\(class_definition name: (identifier) @class)
        \\(method_definition name: (identifier) @method)
    ;

    const wants_python = std.mem.eql(u8, language_name, "python");
    return if (wants_python) python_query else generic_query;
}
// External C functions for tree-sitter languages.
// Each is declared with the C calling convention and returns a pointer to
// the `ts.Language` for its grammar; the definitions are expected to come
// from the linked grammar libraries (not visible in this file).
extern fn tree_sitter_zig() callconv(.C) *ts.Language;
extern fn tree_sitter_c() callconv(.C) *ts.Language;
extern fn tree_sitter_python() callconv(.C) *ts.Language;