diff --git a/CLAUDE.md b/CLAUDE.md index 8146f6b6..4eeb1edc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -27,7 +27,7 @@ landing) and `c630245` (release infra). - **Go 1.25.10** — toolchain pin; module min is 1.25.0 (clamped by the MCP SDK's own `go` directive). -- **Kuzu 0.7.1** (`github.com/kuzudb/go-kuzu`) — embedded graph DB. - CGO. v0.7.1 quirks documented in `## Gotchas` below. +- **Kuzu 0.11.3** (`github.com/kuzudb/go-kuzu`) — embedded graph DB. + CGO. v0.11.3 capability matrix documented in `## Gotchas` below. - **`mattn/go-sqlite3` 1.14.22** — SQLite analysis cache. CGO. - **`smacker/go-tree-sitter`** — AST parsing for Java / Python / TypeScript / Go. @@ -357,26 +357,34 @@ Release pipeline: silently produced 0 nodes pre-fix. Test: `codeiq plugins` lists every detector by name; new ones must appear. -### Kuzu v0.7.1 quirks - -- FTS extension not bundled, not downloadable offline. `INSTALL fts` - errors with "fts is not an official extension". `CreateIndexes()` - no-ops FTS; `SearchByLabel` / `SearchLexical` use case-insensitive - `CONTAINS` predicates. -- LIMIT / SKIP can't be parameterized. Inline as literals; - parameterize the needle only. -- Uses `lower()` (SQL) not `toLower()` (openCypher). -- `RETURN DISTINCT` scope tighter than openCypher; `ORDER BY` must - reference the projected alias, not the bound variable. +### Kuzu v0.11.3 (current pin) + +**Lifted in 0.11.3** — `CLAUDE.md` previously documented these as 0.7.1 +quirks; they were unwound in the post-bump cleanup: + +- FTS extension ships bundled. `CreateIndexes()` runs `INSTALL fts; LOAD + EXTENSION fts;` then `CALL CREATE_FTS_INDEX`. `SearchByLabel` / + `SearchLexical` query via `CALL QUERY_FTS_INDEX` with BM25 ranking; + CONTAINS predicates remain as fallback for pre-enrich graphs. +- `LIMIT $param` and `SKIP $param` work as bound parameters. No more + `fmt.Sprintf` for integer literals. +- `toLower()` works (use it; `lower()` still accepted for SQL parity). +- Go binding accepts `[]string` for `IN $param` directly. The + `stringsToAny` widener is gone. 
+ +**Still present in 0.11.3** — keep workarounds: + - List comprehension binder rejects out-of-scope variables. Use `properties(nodes(p), 'id')` instead of `[n IN nodes(p) | n.id]`. - `EXISTS { … }` subquery doesn't see outer-scope `$param`. Inline static lists as rel-pattern alternations. -- Go binding's `goValueToKuzuValue` accepts `[]any` only. Added - `stringsToAny` widener for `IN $param` use cases. - Multi-label rel alternation + kleene-star in the same recursive pattern breaks the binder. BlastRadius uses an anonymous recursive pattern. +- Recursive pattern upper bound `[*1..N]` must be a literal, not a + parameter — only LIMIT/SKIP are now bindable. +- Mutation gate allows `CALL QUERY_FTS_INDEX` but blocks + `CALL CREATE_FTS_INDEX` / `CALL DROP_FTS_INDEX` (catalog writes). ### MCP SDK v1.6 diff --git a/go/internal/graph/indexes.go b/go/internal/graph/indexes.go index f4df0d2a..06b81fdf 100644 --- a/go/internal/graph/indexes.go +++ b/go/internal/graph/indexes.go @@ -7,78 +7,127 @@ import ( "github.com/randomcodespace/codeiq/go/internal/model" ) +// FTS index names. CreateIndexes builds these after enrich. Read paths +// query them via QUERY_FTS_INDEX. +const ( + ftsLabelIndex = "code_node_label_fts" + ftsLexicalIndex = "code_node_lexical_fts" +) + // CreateIndexes installs the fulltext-search indexes the read side relies -// on. Mirrors GraphStore.createIndexes() on the Java side, which declares -// two Neo4j fulltext indexes: +// on. Two indexes are created: // -// - search_index: covers label_lower + fqn_lower. Powers /api/search and -// the search_graph MCP tool. -// - lexical_index: covers prop_lex_comment + prop_lex_config_keys. +// - code_node_label_fts: covers label + fqn_lower. Powers SearchByLabel +// and the search_graph MCP tool surface. +// - code_node_lexical_fts: covers prop_lex_comment + prop_lex_config_keys. // Powers LexicalQueryService's doc-comment / config-key search. 
// -// Implementation note (Kuzu version gap): Kuzu's official FTS extension -// ships pre-bundled from v0.11.3 onwards. We pin go-kuzu v0.7.1 (Kuzu -// 0.7.x runtime), which requires a network INSTALL of the FTS extension — -// incompatible with the air-gapped build policy. We therefore expose the -// same SearchByLabel / SearchLexical surface and back it with Cypher -// CONTAINS predicates. When we bump Kuzu past 0.11.3 the implementation -// swaps to CALL CREATE_FTS_INDEX / QUERY_FTS_INDEX without touching the -// caller surface. +// Idempotent: existing indexes are dropped before re-create. The enrich +// pipeline calls this once after BulkLoadNodes / BulkLoadEdges complete, +// so the indexes always reflect the latest snapshot. // -// Because there is no actual index to create at this version, CreateIndexes -// is a no-op that returns nil. It stays in the API so call sites in the -// enrich command line up with the eventual FTS implementation. +// FTS bundled in Kuzu 0.11.3+ (no network install needed — air-gapped safe). func (s *Store) CreateIndexes() error { - // Touch the property columns to make sure schema is in place. We do - // NOT attempt INSTALL fts here — that path requires network access - // the air-gapped build policy forbids (see playbooks/build.md). + // FTS extension ships bundled but still needs LOAD to register the + // catalog functions. INSTALL is a no-op when bundled. + if _, err := s.Cypher("INSTALL fts;"); err != nil { + return fmt.Errorf("graph: install fts: %w", err) + } + if _, err := s.Cypher("LOAD EXTENSION fts;"); err != nil { + return fmt.Errorf("graph: load fts: %w", err) + } + // Drop-then-create — idempotent across re-enrich. DROP fails on a missing + // index, so every DROP error is discarded; only CREATE errors are returned. 
+ for _, idx := range []string{ftsLabelIndex, ftsLexicalIndex} { + _, _ = s.Cypher(fmt.Sprintf("CALL DROP_FTS_INDEX('CodeNode', '%s');", idx)) + } + if _, err := s.Cypher(fmt.Sprintf( + `CALL CREATE_FTS_INDEX('CodeNode', '%s', ['label', 'fqn_lower']);`, + ftsLabelIndex)); err != nil { + return fmt.Errorf("graph: create fts label index: %w", err) + } + if _, err := s.Cypher(fmt.Sprintf( + `CALL CREATE_FTS_INDEX('CodeNode', '%s', ['prop_lex_comment', 'prop_lex_config_keys']);`, + ftsLexicalIndex)); err != nil { + return fmt.Errorf("graph: create fts lexical index: %w", err) + } return nil } -// SearchByLabel runs a case-insensitive substring search across -// label_lower and fqn_lower. Returns up to `limit` nodes ordered by id for -// stable test output. Behaviour matches the Java search_index contract at -// the API surface; ranking differs (no BM25 until Kuzu FTS lands). +// SearchByLabel runs a fulltext search across the label + fqn_lower index. +// The query is auto-suffixed with '*' to give prefix matching (so 'auth' +// matches 'AuthService' identifiers). Results are ranked by BM25 score. +// Falls back to CONTAINS predicate when the FTS index hasn't been built +// (pre-enrich or enrich aborted before CreateIndexes). func (s *Store) SearchByLabel(q string, limit int) ([]*model.CodeNode, error) { - needle := strings.ToLower(q) - // Kuzu 0.7.1 rejects parameter binding on LIMIT — the value must be - // an inline literal. Coerce `limit` to a non-negative int and inline - // it via fmt; the user-supplied needle still goes through prepared - // parameter binding. + return s.ftsSearch(ftsLabelIndex, q, limit, s.searchByLabelFallback) +} + +// SearchLexical runs a fulltext search across the prose columns +// (prop_lex_comment + prop_lex_config_keys). BM25 ranks results. Same +// CONTAINS fallback as SearchByLabel for pre-enrich graphs. 
+func (s *Store) SearchLexical(q string, limit int) ([]*model.CodeNode, error) { + return s.ftsSearch(ftsLexicalIndex, q, limit, s.searchLexicalFallback) +} + +// ftsSearch is the shared FTS path for SearchByLabel and SearchLexical. +// On any FTS error (missing index, malformed query, etc.) it routes to the +// caller-supplied CONTAINS fallback. +func (s *Store) ftsSearch(idx, q string, limit int, + fallback func(string, int) ([]*model.CodeNode, error)) ([]*model.CodeNode, error) { if limit < 0 { limit = 0 } - rows, err := s.Cypher(fmt.Sprintf(` + needle := strings.TrimSpace(strings.ToLower(q)) + // Prefix-search via wildcard: "auth" → "auth*". Skip if user already + // supplied a wildcard or a multi-token query (Kuzu FTS ORs terms unless conjunctive := true). + if needle != "" && !strings.ContainsAny(needle, "* ") { + needle += "*" + } + rows, err := s.Cypher(` + CALL QUERY_FTS_INDEX('CodeNode', $idx, $q) + WITH node AS n, score + RETURN n.id AS id, n.kind AS kind, n.label AS label, + n.file_path AS file_path, n.layer AS layer, score + ORDER BY score DESC, n.id + LIMIT $lim`, + map[string]any{"idx": idx, "q": needle, "lim": int64(limit)}) + if err != nil { + return fallback(needle, limit) + } + return rowsToNodes(rows), nil +} + +// searchByLabelFallback uses CONTAINS — same shape as pre-FTS code, retained +// for graphs where CreateIndexes has not run. Strips the trailing '*' added +// by ftsSearch since CONTAINS is already substring-y. 
+func (s *Store) searchByLabelFallback(needle string, limit int) ([]*model.CodeNode, error) { + q := strings.TrimSuffix(needle, "*") + rows, err := s.Cypher(` MATCH (n:CodeNode) WHERE n.label_lower CONTAINS $q OR n.fqn_lower CONTAINS $q RETURN n.id AS id, n.kind AS kind, n.label AS label, n.file_path AS file_path, n.layer AS layer - ORDER BY n.id LIMIT %d`, limit), - map[string]any{"q": needle}) + ORDER BY n.id LIMIT $lim`, + map[string]any{"q": q, "lim": int64(limit)}) if err != nil { return nil, fmt.Errorf("graph: search by label: %w", err) } return rowsToNodes(rows), nil } -// SearchLexical runs a case-insensitive substring search across -// prop_lex_comment and prop_lex_config_keys — the two columns -// LexicalEnricher fills with doc-comment text and surfaced config keys. -// Same Kuzu version caveat as SearchByLabel above. -func (s *Store) SearchLexical(q string, limit int) ([]*model.CodeNode, error) { - needle := strings.ToLower(q) - if limit < 0 { - limit = 0 - } - // Kuzu 0.7.1 uses SQL-style `lower()`, not `toLower()`. - rows, err := s.Cypher(fmt.Sprintf(` +// searchLexicalFallback uses CONTAINS with toLower() over prose columns. +// Retained for graphs that haven't run enrich/CreateIndexes. 
+func (s *Store) searchLexicalFallback(needle string, limit int) ([]*model.CodeNode, error) { + q := strings.TrimSuffix(needle, "*") + rows, err := s.Cypher(` MATCH (n:CodeNode) - WHERE lower(n.prop_lex_comment) CONTAINS $q - OR lower(n.prop_lex_config_keys) CONTAINS $q + WHERE toLower(n.prop_lex_comment) CONTAINS $q + OR toLower(n.prop_lex_config_keys) CONTAINS $q RETURN n.id AS id, n.kind AS kind, n.label AS label, n.file_path AS file_path, n.layer AS layer - ORDER BY n.id LIMIT %d`, limit), - map[string]any{"q": needle}) + ORDER BY n.id LIMIT $lim`, + map[string]any{"q": q, "lim": int64(limit)}) if err != nil { return nil, fmt.Errorf("graph: search lexical: %w", err) } diff --git a/go/internal/graph/mutation.go b/go/internal/graph/mutation.go index cb186a84..6a814c96 100644 --- a/go/internal/graph/mutation.go +++ b/go/internal/graph/mutation.go @@ -41,13 +41,16 @@ var callRE = regexp.MustCompile(`(?i)\bCALL\s+(\w+(?:\.\w+)?)`) // readOnlyCallPrefixes are case-insensitive procedure-name prefixes that // are permitted under CALL. db.* covers Neo4j's read-only schema // procedures (db.indexes, db.constraints, db.labels); show_/table_/ -// current_setting/table_info cover Kuzu's introspection helpers. +// current_setting/table_info cover Kuzu's introspection helpers; +// query_fts_index is Kuzu 0.11's read-only FTS search procedure +// (create_/drop_fts_index stay blocked because they mutate the catalog). var readOnlyCallPrefixes = []string{ "db.", "show_", "table_", "current_setting", "table_info", + "query_fts_index", } // blockCommentRE matches /* … */ and line comments. Both are stripped diff --git a/go/internal/graph/reads.go b/go/internal/graph/reads.go index 6fa7dde4..cca31af2 100644 --- a/go/internal/graph/reads.go +++ b/go/internal/graph/reads.go @@ -11,8 +11,7 @@ import ( // GraphController. All return projections through rowsToNodes (defined in // indexes.go) — `id`, `kind`, `label`, and optionally `file_path` / `layer`. 
// -// Kuzu 0.7.1 caveats relevant here: -// - LIMIT/SKIP values must be inlined literals, not bound parameters. +// Kuzu caveats relevant here: // - count(*) on rels works fine across all rel tables via // `MATCH ()-[r]->()` — Kuzu treats the wildcard as the union of every // declared rel type. @@ -107,13 +106,12 @@ func (s *Store) FindByKindPaginated(kind string, offset, limit int) ([]*model.Co if limit < 0 { limit = 0 } - // Kuzu 0.7.1 disallows parameter binding on SKIP/LIMIT — inline them. - rows, err := s.Cypher(fmt.Sprintf(` + rows, err := s.Cypher(` MATCH (n:CodeNode) WHERE n.kind = $k RETURN n.id AS id, n.kind AS kind, n.label AS label, n.file_path AS file_path, n.layer AS layer - ORDER BY n.id SKIP %d LIMIT %d`, offset, limit), - map[string]any{"k": kind}) + ORDER BY n.id SKIP $skip LIMIT $lim`, + map[string]any{"k": kind, "skip": int64(offset), "lim": int64(limit)}) if err != nil { return nil, fmt.Errorf("graph: find by kind: %w", err) } diff --git a/go/internal/mcp/tools_graph.go b/go/internal/mcp/tools_graph.go index c245992e..b3fdd7df 100644 --- a/go/internal/mcp/tools_graph.go +++ b/go/internal/mcp/tools_graph.go @@ -181,21 +181,15 @@ func toolQueryEdges(d *Deps) Tool { // the anonymous-rel pattern. 
cypher := `MATCH (a:CodeNode)-[r]->(b:CodeNode) RETURN a.id AS source, b.id AS target, LABEL(r) AS kind - ORDER BY source, kind, target LIMIT ` + intLiteral(limit) - args := map[string]any{} + ORDER BY source, kind, target LIMIT $lim` + args := map[string]any{"lim": int64(limit)} if p.Kind != "" { cypher = `MATCH (a:CodeNode)-[r]->(b:CodeNode) WHERE LABEL(r) = $k RETURN a.id AS source, b.id AS target, LABEL(r) AS kind - ORDER BY source, kind, target LIMIT ` + intLiteral(limit) + ORDER BY source, kind, target LIMIT $lim` args["k"] = p.Kind } - var rows []map[string]any - var err error - if len(args) == 0 { - rows, err = d.Store.Cypher(cypher) - } else { - rows, err = d.Store.Cypher(cypher, args) - } + rows, err := d.Store.Cypher(cypher, args) if err != nil { return NewErrorEnvelope(CodeInternalError, err, RequestID(ctx)), nil } @@ -275,11 +269,11 @@ func toolGetEgoGraph(d *Deps) Tool { } depth := CapDepth(p.Radius, d.MaxDepth) // Variable-length match centered on Center, walking outbound up to - // depth. Kuzu 0.7's binder is fussy about projecting properties - // from the endpoint of a variable-length pattern; the supported - // shape is `properties(nodes(p), 'id')` over the named path. - // Splitting outbound + inbound queries keeps the rows shape - // uniform (both sides projected through nodes(p)). + // depth. Kuzu's binder is fussy about projecting properties from + // the endpoint of a variable-length pattern; the supported shape + // is `properties(nodes(p), 'id')` over the named path. The + // recursive `[*1..N]` upper bound must be a literal (binder gap); + // LIMIT goes through parameter binding fine. 
limit := CapResults(0, d.MaxResults) cypher := fmt.Sprintf(` MATCH p = (c:CodeNode {id: $center})-[*1..%d]-(:CodeNode) @@ -289,8 +283,10 @@ func toolGetEgoGraph(d *Deps) Tool { WHERE n.id <> $center RETURN n.id AS id, n.kind AS kind, n.label AS label, n.file_path AS file_path, n.layer AS layer - ORDER BY n.id LIMIT %d`, depth, limit) - rows, err := d.Store.Cypher(cypher, map[string]any{"center": p.Center}) + ORDER BY n.id LIMIT $lim`, depth) + rows, err := d.Store.Cypher(cypher, map[string]any{ + "center": p.Center, "lim": int64(limit), + }) if err != nil { return NewErrorEnvelope(CodeInternalError, err, RequestID(ctx)), nil } @@ -621,7 +617,7 @@ func toolFindRelatedEndpoints(d *Deps) Tool { // Endpoints that share a service container with the identifier // (file path / class / fqn) — the simplest semantic match that // works across languages. - cypher := fmt.Sprintf(` + cypher := ` MATCH (target:CodeNode) WHERE target.file_path = $i OR target.label = $i OR target.id = $i OR target.fqn = $i MATCH (target)<-[:CONTAINS]-(svc:CodeNode {kind: 'service'})-[:CONTAINS]->(ep:CodeNode) @@ -629,8 +625,8 @@ func toolFindRelatedEndpoints(d *Deps) Tool { RETURN DISTINCT ep.id AS id, ep.kind AS kind, ep.label AS label, ep.file_path AS file_path, ep.layer AS layer, svc.label AS service - ORDER BY ep.id LIMIT %d`, limit) - rows, err := d.Store.Cypher(cypher, map[string]any{"i": p.Identifier}) + ORDER BY ep.id LIMIT $lim` + rows, err := d.Store.Cypher(cypher, map[string]any{"i": p.Identifier, "lim": int64(limit)}) if err != nil { return NewErrorEnvelope(CodeInternalError, err, RequestID(ctx)), nil } @@ -754,13 +750,4 @@ func toolReadFile(d *Deps) Tool { } } -// intLiteral renders a non-negative int as a Cypher literal. Kuzu 0.7.1 -// rejects parameter binding on LIMIT — the value must be inline. The cap -// floor is 1 to match Kuzu's `LIMIT 0` failure mode. 
-func intLiteral(n int) string { - if n < 1 { - n = 1 - } - return fmt.Sprintf("%d", n) -} diff --git a/go/internal/query/service.go b/go/internal/query/service.go index 1c79db7c..626f0133 100644 --- a/go/internal/query/service.go +++ b/go/internal/query/service.go @@ -145,12 +145,14 @@ func (s *Service) FindCycles(limit int) ([][]string, error) { if limit <= 0 { limit = 100 } - // Same Kuzu 0.7 list-comprehension caveat — `properties(nodes(p), 'id')` - // is the supported shape for projecting recursive-rel paths. - rows, err := s.store.Cypher(fmt.Sprintf(` + // Same list-comprehension caveat as FindShortestPath — + // `properties(nodes(p), 'id')` is the supported shape for projecting + // recursive-rel paths. + rows, err := s.store.Cypher(` MATCH p = (a:CodeNode)-[* 2..10]->(b:CodeNode) WHERE a.id = b.id - RETURN properties(nodes(p), 'id') AS ids LIMIT %d`, limit)) + RETURN properties(nodes(p), 'id') AS ids LIMIT $lim`, + map[string]any{"lim": int64(limit)}) if err != nil { return nil, fmt.Errorf("query: find cycles: %w", err) } @@ -201,13 +203,13 @@ func (s *Service) FindDeadCode(kinds []string, limit int) ([]*model.CodeNode, er limit = 100 } - // Kuzu 0.7 binder gap: parameters declared at the outer scope are not - // visible inside an `EXISTS { MATCH ... WHERE ... }` subquery, so a - // `LABEL(r) IN $semanticKinds` predicate inside the EXISTS fails with - // "Parameter semanticKinds not found". Workaround: inline the semantic - // edges as a rel-pattern alternation, which is bound at parse time. - // Outer-scope parameters ($kinds / $excludeKinds) work fine because - // they live in the top-level WHERE clause. + // Kuzu binder gap (still present in 0.11): parameters declared at the + // outer scope are not visible inside an `EXISTS { MATCH ... WHERE ... }` + // subquery, so a `LABEL(r) IN $semanticKinds` predicate inside EXISTS + // fails with "Parameter semanticKinds not found". 
Workaround: inline the + // semantic edges as a rel-pattern alternation, which is bound at parse + // time. Outer-scope $kinds / $excludeKinds work fine because they live + // in the top-level WHERE clause. semanticPat := ":" + strings.Join(semanticEdgeKinds, "|") q := fmt.Sprintf(` MATCH (n:CodeNode) @@ -218,13 +220,12 @@ func (s *Service) FindDeadCode(kinds []string, limit int) ([]*model.CodeNode, er } RETURN n.id AS id, n.kind AS kind, n.label AS label, n.file_path AS file_path, n.layer AS layer - ORDER BY n.id LIMIT %d`, semanticPat, limit) + ORDER BY n.id LIMIT $lim`, semanticPat) - // Kuzu 0.7's Go binding only accepts []any for list parameters; []string - // trips "unsupported type" in goValueToKuzuValue. Convert via stringsToAny. rows, err := s.store.Cypher(q, map[string]any{ - "kinds": stringsToAny(kinds), - "excludeKinds": stringsToAny(entryPointKinds), + "kinds": kinds, + "excludeKinds": entryPointKinds, + "lim": int64(limit), }) if err != nil { return nil, fmt.Errorf("query: find dead code: %w", err) @@ -232,16 +233,6 @@ func (s *Service) FindDeadCode(kinds []string, limit int) ([]*model.CodeNode, er return rowsToNodes(rows), nil } -// stringsToAny widens []string to []any so Kuzu's parameter binder accepts -// it as a LIST. Kuzu 0.7's goValueToKuzuValue switch only matches []any. -func stringsToAny(xs []string) []any { - out := make([]any, len(xs)) - for i, x := range xs { - out[i] = x - } - return out -} - // rowsToNodes mirrors graph.rowsToNodes — kept package-local here to avoid // exporting the helper. Projects the canonical {id,kind,label,file_path, // layer} columns onto CodeNode shells. 
diff --git a/go/internal/query/topology.go b/go/internal/query/topology.go index f08ac214..39557884 100644 --- a/go/internal/query/topology.go +++ b/go/internal/query/topology.go @@ -295,7 +295,7 @@ func (t *Topology) childNodesByKinds(serviceName string, kinds []string) ([]map[ RETURN n.id AS id, n.kind AS kind, n.label AS label, n.file_path AS file_path, n.layer AS layer ORDER BY n.id`, - map[string]any{"name": serviceName, "kinds": stringsToAny(kinds)}) + map[string]any{"name": serviceName, "kinds": kinds}) if err != nil { return nil, fmt.Errorf("topology: childNodesByKinds %s: %w", serviceName, err) } @@ -652,7 +652,7 @@ func (t *Topology) servicesContainingNodes(nodeIDs []string) ([]string, error) { WHERE s.kind = 'service' AND n.id IN $ids RETURN DISTINCT s.label AS name ORDER BY name`, - map[string]any{"ids": stringsToAny(nodeIDs)}) + map[string]any{"ids": nodeIDs}) if err != nil { return nil, fmt.Errorf("topology: services containing: %w", err) }