From e1d6a6cd1f9da92d411a8c66b0db82c090b0a34b Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Wed, 13 May 2026 13:50:08 +0000 Subject: [PATCH] perf(enrich): --memprofile + --max-buffer-pool + --copy-threads flags (Phase C) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase C of the OOM fix plan. Surfaces the memory-budgeting knobs that Phase A baked into the codebase but kept as compile-time defaults. Empirical finding: Phase A+B alone brought ~/projects/-scale enrich (49k files) from 9-15 GB peak RSS (OOM-killed exit 137) to 3.12 GB — well under the 4 GiB acceptance bar. The full streaming refactor originally scoped for this phase is not load-bearing at current scale; it remains a worthwhile future investment for 10M+ node graphs but ships separately if/when that scale arrives. Flags added to `codeiq enrich`: - --memprofile=<path> Write a heap profile after enrich completes. For OOM debugging — pair with /usr/bin/time -v. - --max-buffer-pool=N Cap Kuzu BufferPoolSize in bytes (default 2 GiB). For hosts where 2 GiB is still too much. - --copy-threads=N Cap Kuzu COPY FROM parallelism (default min(4, GOMAXPROCS)). EnrichOptions struct extended with StoreBufferPoolBytes + StoreCopyThreads; analyzer.Enrich now routes through graph.OpenWithOptions with those values. Plan: docs/superpowers/plans/2026-05-13-enrich-oom-fix.md Phase C. Verification: - go test ./... -count=1: 877 pass. - /tmp/codeiq-c enrich ~/projects/polyglot-bench/airflow recorded 1.27 GB peak RSS via /usr/bin/time -v (down from pre-Phase-A 3.8 GB observed by the research pprof agent). - ~/projects/ enrich peak RSS: 3.12 GB (below 4 GiB acceptance bar). 
--- go/internal/analyzer/enrich.go | 11 +++++++++- go/internal/cli/enrich.go | 37 ++++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/go/internal/analyzer/enrich.go b/go/internal/analyzer/enrich.go index 916520aa..59bd5a52 100644 --- a/go/internal/analyzer/enrich.go +++ b/go/internal/analyzer/enrich.go @@ -23,6 +23,12 @@ type EnrichOptions struct { // GraphDir overrides the Kuzu output directory. When "", the default // `/.codeiq/graph/codeiq.kuzu` is used. GraphDir string + // StoreBufferPoolBytes caps Kuzu's buffer pool. Zero -> graph package + // default (2 GiB). + StoreBufferPoolBytes uint64 + // StoreCopyThreads caps Kuzu COPY FROM parallelism. Zero -> graph + // package default (min(4, GOMAXPROCS)). + StoreCopyThreads uint64 } // EnrichSummary reports per-run counters from a successful Enrich. @@ -109,7 +115,10 @@ func Enrich(root string, c *cache.Cache, opts EnrichOptions) (EnrichSummary, err // 6. Bulk-load Kuzu — schema + nodes + edges + indexes. The store is // closed when this function returns; read-side commands re-open it. 
- store, err := graph.Open(opts.GraphDir) + store, err := graph.OpenWithOptions(opts.GraphDir, graph.OpenOptions{ + BufferPoolBytes: opts.StoreBufferPoolBytes, + MaxThreads: opts.StoreCopyThreads, + }) if err != nil { return EnrichSummary{}, fmt.Errorf("enrich: open graph: %w", err) } diff --git a/go/internal/cli/enrich.go b/go/internal/cli/enrich.go index 4aba1aac..77b24162 100644 --- a/go/internal/cli/enrich.go +++ b/go/internal/cli/enrich.go @@ -2,7 +2,10 @@ package cli import ( "fmt" + "os" "path/filepath" + "runtime" + "runtime/pprof" "github.com/randomcodespace/codeiq/go/internal/analyzer" "github.com/randomcodespace/codeiq/go/internal/cache" @@ -11,7 +14,12 @@ import ( func init() { registerSubcommand(func() *cobra.Command { - var graphDir string + var ( + graphDir string + memProfile string + maxBufferPool int64 + copyThreads int + ) cmd := &cobra.Command{ Use: "enrich [path]", Short: "Load the SQLite cache into Kuzu and run linkers, classifiers, intelligence.", @@ -48,10 +56,29 @@ become available and the stdio MCP server can serve clients.`, return fmt.Errorf("open cache %s: %w", cachePath, err) } defer c.Close() - summary, err := analyzer.Enrich(root, c, analyzer.EnrichOptions{GraphDir: graphDir}) + opts := analyzer.EnrichOptions{GraphDir: graphDir} + if maxBufferPool > 0 { + opts.StoreBufferPoolBytes = uint64(maxBufferPool) + } + if copyThreads > 0 { + opts.StoreCopyThreads = uint64(copyThreads) + } + summary, err := analyzer.Enrich(root, c, opts) if err != nil { return err } + if memProfile != "" { + runtime.GC() + f, ferr := os.Create(memProfile) + if ferr != nil { + return fmt.Errorf("create mem profile: %w", ferr) + } + defer f.Close() + if perr := pprof.WriteHeapProfile(f); perr != nil { + return fmt.Errorf("write mem profile: %w", perr) + } + fmt.Fprintf(cmd.ErrOrStderr(), "heap profile written to %s\n", memProfile) + } fmt.Fprintf(cmd.OutOrStdout(), "enrich complete: %d nodes, %d edges, %d services\n", summary.Nodes, summary.Edges, 
summary.Services) @@ -60,6 +87,12 @@ become available and the stdio MCP server can serve clients.`, } cmd.Flags().StringVar(&graphDir, "graph-dir", "", "Output directory for the Kuzu graph store (default: /.codeiq/graph/codeiq.kuzu).") + cmd.Flags().StringVar(&memProfile, "memprofile", "", + "Write a heap profile to this path after enrich completes. For OOM debugging — use with /usr/bin/time -v.") + cmd.Flags().Int64Var(&maxBufferPool, "max-buffer-pool", 0, + "Cap Kuzu BufferPoolSize in bytes (default: 2 GiB; 0 means default).") + cmd.Flags().IntVar(&copyThreads, "copy-threads", 0, + "Cap Kuzu COPY FROM parallelism (default: min(4, GOMAXPROCS); 0 means default).") return cmd }) }