From cbfc0bcd5bdefe6a4f061bf857709bdb1d6a8735 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 15 May 2026 06:59:11 +0000 Subject: [PATCH 01/12] feat(cache): add GetFileByPath lookup --- internal/cache/lookup.go | 11 ++++++++ internal/cache/lookup_test.go | 49 +++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 internal/cache/lookup.go create mode 100644 internal/cache/lookup_test.go diff --git a/internal/cache/lookup.go b/internal/cache/lookup.go new file mode 100644 index 00000000..3466fd10 --- /dev/null +++ b/internal/cache/lookup.go @@ -0,0 +1,11 @@ +package cache + +// GetFileByPath returns the cached (content_hash, parsed_at) for the given +// file path. Returns ok=false when no row exists for that path. +func (c *Cache) GetFileByPath(path string) (hash, parsedAt string, ok bool) { + row := c.db.QueryRow(`SELECT content_hash, parsed_at FROM files WHERE path = ? LIMIT 1`, path) + if err := row.Scan(&hash, &parsedAt); err != nil { + return "", "", false + } + return hash, parsedAt, true +} diff --git a/internal/cache/lookup_test.go b/internal/cache/lookup_test.go new file mode 100644 index 00000000..64e7967c --- /dev/null +++ b/internal/cache/lookup_test.go @@ -0,0 +1,49 @@ +package cache + +import ( + "path/filepath" + "testing" + + "github.com/randomcodespace/codeiq/internal/model" +) + +func TestGetFileByPathReturnsHashWhenPresent(t *testing.T) { + c, err := Open(filepath.Join(t.TempDir(), "c.sqlite")) + if err != nil { + t.Fatalf("open: %v", err) + } + defer c.Close() + + e := &Entry{ + ContentHash: "abc123", + Path: "foo/bar.go", + Language: "go", + ParsedAt: "2026-05-15T00:00:00Z", + Nodes: []*model.CodeNode{model.NewCodeNode("n1", model.NodeClass, "Bar")}, + } + if err := c.Put(e); err != nil { + t.Fatalf("put: %v", err) + } + + hash, parsedAt, ok := c.GetFileByPath("foo/bar.go") + if !ok { + t.Fatal("ok = false, want true") + } + if hash != "abc123" { + t.Fatalf("hash = %q, want %q", hash, "abc123") + } + if parsedAt != "2026-05-15T00:00:00Z" { + t.Fatalf("parsedAt = %q, want %q", parsedAt, "2026-05-15T00:00:00Z") + } +} + +func TestGetFileByPathReturnsFalseWhenAbsent(t *testing.T) { + c, err := Open(filepath.Join(t.TempDir(), "c.sqlite")) + if err != nil { + t.Fatalf("open: %v", err) + } + defer c.Close() + if _, _, ok := c.GetFileByPath("does/not/exist"); ok { + t.Fatal("ok = true, want false for missing path") + } +} From e380162fa0a6020c1934a85bde3e1bc81f464b6c Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 15 May 2026 06:59:53 +0000 Subject: [PATCH 02/12] feat(cache): add AllFiles streaming iterator --- internal/cache/lookup.go | 21 +++++++++++ internal/cache/lookup_test.go | 65 +++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/internal/cache/lookup.go b/internal/cache/lookup.go index 3466fd10..bcf81ccb 100644 --- a/internal/cache/lookup.go +++ b/internal/cache/lookup.go @@ -9,3 +9,24 @@ func (c *Cache) GetFileByPath(path string) (hash, parsedAt string, ok bool) { } return hash, parsedAt, true } + +// AllFiles invokes fn once per cached file in path order. fn returning a +// non-nil error stops iteration and propagates the error. Stream-iterated +// via rows.Next(); the whole cache never lives in memory at once. +func (c *Cache) AllFiles(fn func(path, hash string) error) error { + rows, err := c.db.Query(`SELECT path, content_hash FROM files ORDER BY path`) + if err != nil { + return err + } + defer rows.Close() + for rows.Next() { + var path, hash string + if err := rows.Scan(&path, &hash); err != nil { + return err + } + if err := fn(path, hash); err != nil { + return err + } + } + return rows.Err() +} diff --git a/internal/cache/lookup_test.go b/internal/cache/lookup_test.go index 64e7967c..dec36408 100644 --- a/internal/cache/lookup_test.go +++ b/internal/cache/lookup_test.go @@ -47,3 +47,68 @@ func TestGetFileByPathReturnsFalseWhenAbsent(t *testing.T) { t.Fatal("ok = true, want false for missing path") } } + +func TestAllFilesYieldsEveryRow(t *testing.T) { + c, err := Open(filepath.Join(t.TempDir(), "c.sqlite")) + if err != nil { + t.Fatalf("open: %v", err) + } + defer c.Close() + for _, e := range []*Entry{ + {ContentHash: "h1", Path: "a.go", Language: "go", ParsedAt: "t"}, + {ContentHash: "h2", Path: "b.go", Language: "go", ParsedAt: "t"}, + {ContentHash: "h3", Path: "c.go", Language: "go", ParsedAt: "t"}, + } { + if err := c.Put(e); err != nil { + t.Fatalf("put: %v", err) + } + } + got := map[string]string{} + if err := c.AllFiles(func(path, hash string) error { + got[path] = hash + return nil + }); err != nil { + t.Fatalf("AllFiles: %v", err) + } + want := map[string]string{"a.go": "h1", "b.go": "h2", "c.go": "h3"} + if len(got) != len(want) { + t.Fatalf("got %d entries, want %d: %v", len(got), len(want), got) + } + for k, v := range want { + if got[k] != v { + t.Errorf("got[%q]=%q, want %q", k, got[k], v) + } + } +} + +func TestAllFilesIteratesInPathOrder(t *testing.T) { + c, err := Open(filepath.Join(t.TempDir(), "c.sqlite")) + if err != nil { + t.Fatalf("open: %v", err) + } + defer c.Close() + // Insert in non-alphabetical order on purpose. + for _, e := range []*Entry{ + {ContentHash: "h-c", Path: "c.go", Language: "go", ParsedAt: "t"}, + {ContentHash: "h-a", Path: "a.go", Language: "go", ParsedAt: "t"}, + {ContentHash: "h-b", Path: "b.go", Language: "go", ParsedAt: "t"}, + } { + if err := c.Put(e); err != nil { + t.Fatalf("put: %v", err) + } + } + var paths []string + _ = c.AllFiles(func(path, _ string) error { + paths = append(paths, path) + return nil + }) + want := []string{"a.go", "b.go", "c.go"} + if len(paths) != len(want) { + t.Fatalf("len(paths)=%d, want %d", len(paths), len(want)) + } + for i := range paths { + if paths[i] != want[i] { + t.Errorf("paths[%d]=%q, want %q", i, paths[i], want[i]) + } + } +} From e9cc8a52c02ba69aaf08f96aea2ca2a2245653d4 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 15 May 2026 07:00:33 +0000 Subject: [PATCH 03/12] feat(cache): add PurgeByPath for incremental cleanup --- internal/cache/lookup.go | 25 +++++++++++++ internal/cache/lookup_test.go | 70 +++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/internal/cache/lookup.go b/internal/cache/lookup.go index bcf81ccb..49edc4f1 100644 --- a/internal/cache/lookup.go +++ b/internal/cache/lookup.go @@ -10,6 +10,31 @@ func (c *Cache) GetFileByPath(path string) (hash, parsedAt string, ok bool) { return hash, parsedAt, true } +// PurgeByPath deletes every row associated with the file at path: the +// files row, all nodes joined by its content_hash, and all edges joined +// by its content_hash. Idempotent — a missing path returns nil. +func (c *Cache) PurgeByPath(path string) error { + tx, err := c.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() + var hash string + if err := tx.QueryRow(`SELECT content_hash FROM files WHERE path = ?`, path).Scan(&hash); err != nil { + return nil + } + if _, err := tx.Exec(`DELETE FROM nodes WHERE content_hash = ?`, hash); err != nil { + return err + } + if _, err := tx.Exec(`DELETE FROM edges WHERE content_hash = ?`, hash); err != nil { + return err + } + if _, err := tx.Exec(`DELETE FROM files WHERE path = ?`, path); err != nil { + return err + } + return tx.Commit() +} + // AllFiles invokes fn once per cached file in path order. fn returning a // non-nil error stops iteration and propagates the error. Stream-iterated // via rows.Next(); the whole cache never lives in memory at once. diff --git a/internal/cache/lookup_test.go b/internal/cache/lookup_test.go index dec36408..bdc5ea83 100644 --- a/internal/cache/lookup_test.go +++ b/internal/cache/lookup_test.go @@ -81,6 +81,76 @@ func TestAllFilesYieldsEveryRow(t *testing.T) { } } +func TestPurgeByPathRemovesAllRows(t *testing.T) { + c, err := Open(filepath.Join(t.TempDir(), "c.sqlite")) + if err != nil { + t.Fatalf("open: %v", err) + } + defer c.Close() + e := &Entry{ + ContentHash: "abc", + Path: "foo.go", + Language: "go", + ParsedAt: "t", + Nodes: []*model.CodeNode{model.NewCodeNode("n1", model.NodeClass, "Foo")}, + Edges: []*model.CodeEdge{model.NewCodeEdge("e1", model.EdgeContains, "n1", "n2")}, + } + if err := c.Put(e); err != nil { + t.Fatalf("put: %v", err) + } + if err := c.PurgeByPath("foo.go"); err != nil { + t.Fatalf("purge: %v", err) + } + if _, _, ok := c.GetFileByPath("foo.go"); ok { + t.Fatal("file row still present after purge") + } + if c.Has("abc") { + t.Fatal("cache.Has returns true after purge") + } +} + +func TestPurgeByPathIsNoOpForMissingPath(t *testing.T) { + c, err := Open(filepath.Join(t.TempDir(), "c.sqlite")) + if err != nil { + t.Fatalf("open: %v", err) + } + defer c.Close() + if err := c.PurgeByPath("nope.go"); err != nil { + t.Fatalf("purge of missing path should be no-op, got: %v", err) + } +} + +func TestPurgeByPathLeavesUnrelatedRows(t *testing.T) { + c, err := Open(filepath.Join(t.TempDir(), "c.sqlite")) + if err != nil { + t.Fatalf("open: %v", err) + } + defer c.Close() + _ = c.Put(&Entry{ + ContentHash: "h-keep", + Path: "keep.go", + Language: "go", + ParsedAt: "t", + Nodes: []*model.CodeNode{model.NewCodeNode("k", model.NodeClass, "K")}, + }) + _ = c.Put(&Entry{ + ContentHash: "h-drop", + Path: "drop.go", + Language: "go", + ParsedAt: "t", + Nodes: []*model.CodeNode{model.NewCodeNode("d", model.NodeClass, "D")}, + }) + if err := c.PurgeByPath("drop.go"); err != nil { + t.Fatal(err) + } + if !c.Has("h-keep") { + t.Fatal("unrelated file purged") + } + if c.Has("h-drop") { + t.Fatal("target hash still present") + } +} + func TestAllFilesIteratesInPathOrder(t *testing.T) { c, err := Open(filepath.Join(t.TempDir(), "c.sqlite")) if err != nil { From cbdffd9030b05e3a536ac8ca65a82fc50d8691d3 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 15 May 2026 07:01:07 +0000 Subject: [PATCH 04/12] feat(cache): add ManifestHash for graph-freshness short-circuit --- internal/cache/manifest.go | 31 +++++++++++ internal/cache/manifest_test.go | 93 +++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 internal/cache/manifest.go create mode 100644 internal/cache/manifest_test.go diff --git a/internal/cache/manifest.go b/internal/cache/manifest.go new file mode 100644 index 00000000..47dbe8cc --- /dev/null +++ b/internal/cache/manifest.go @@ -0,0 +1,31 @@ +package cache + +import ( + "crypto/sha256" + "encoding/hex" +) + +// ManifestHash returns a deterministic SHA-256 hex digest over the sorted +// (path, content_hash) tuples in the cache. Two caches produce the same +// manifest iff they hold the same set of file→hash bindings. +// +// Used by the enrich pipeline to short-circuit: if the graph's stored +// manifest matches the cache's current manifest, the graph is fresh and +// enrich can exit immediately. +// +// Format: "\x00\x00" per row, concatenated in path order. +// The NUL separator makes collisions impossible regardless of path chars. +func (c *Cache) ManifestHash() (string, error) { + h := sha256.New() + err := c.AllFiles(func(path, hash string) error { + h.Write([]byte(path)) + h.Write([]byte{0}) + h.Write([]byte(hash)) + h.Write([]byte{0}) + return nil + }) + if err != nil { + return "", err + } + return hex.EncodeToString(h.Sum(nil)), nil +} diff --git a/internal/cache/manifest_test.go b/internal/cache/manifest_test.go new file mode 100644 index 00000000..ec3fac1d --- /dev/null +++ b/internal/cache/manifest_test.go @@ -0,0 +1,93 @@ +package cache + +import ( + "path/filepath" + "testing" +) + +func TestManifestHashIsDeterministic(t *testing.T) { + c, err := Open(filepath.Join(t.TempDir(), "c.sqlite")) + if err != nil { + t.Fatalf("open: %v", err) + } + defer c.Close() + for _, e := range []*Entry{ + {ContentHash: "h1", Path: "a.go", Language: "go", ParsedAt: "t"}, + {ContentHash: "h2", Path: "b.go", Language: "go", ParsedAt: "t"}, + } { + if err := c.Put(e); err != nil { + t.Fatalf("put: %v", err) + } + } + a, err := c.ManifestHash() + if err != nil { + t.Fatalf("manifest: %v", err) + } + b, _ := c.ManifestHash() + if a != b { + t.Fatalf("ManifestHash not deterministic: %s != %s", a, b) + } + if len(a) != 64 { + t.Fatalf("manifest hash len = %d, want 64 (sha256 hex)", len(a)) + } +} + +func TestManifestHashChangesWhenFileChanges(t *testing.T) { + c, err := Open(filepath.Join(t.TempDir(), "c.sqlite")) + if err != nil { + t.Fatalf("open: %v", err) + } + defer c.Close() + _ = c.Put(&Entry{ContentHash: "h1", Path: "a.go", Language: "go", ParsedAt: "t"}) + before, _ := c.ManifestHash() + if err := c.PurgeByPath("a.go"); err != nil { + t.Fatal(err) + } + _ = c.Put(&Entry{ContentHash: "h2", Path: "a.go", Language: "go", ParsedAt: "t"}) + after, _ := c.ManifestHash() + if before == after { + t.Fatal("ManifestHash unchanged after file mutation") + } +} + +func TestManifestHashEmptyCache(t *testing.T) { + c, err := Open(filepath.Join(t.TempDir(), "c.sqlite")) + if err != nil { + t.Fatalf("open: %v", err) + } + defer c.Close() + h, _ := c.ManifestHash() + want := "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + if h != want { + t.Fatalf("empty manifest = %q, want %q", h, want) + } +} + +func TestManifestHashIndependentOfInsertOrder(t *testing.T) { + a, err := Open(filepath.Join(t.TempDir(), "a.sqlite")) + if err != nil { + t.Fatal(err) + } + defer a.Close() + b, err := Open(filepath.Join(t.TempDir(), "b.sqlite")) + if err != nil { + t.Fatal(err) + } + defer b.Close() + files := []*Entry{ + {ContentHash: "h1", Path: "a.go", Language: "go", ParsedAt: "t"}, + {ContentHash: "h2", Path: "b.go", Language: "go", ParsedAt: "t"}, + {ContentHash: "h3", Path: "c.go", Language: "go", ParsedAt: "t"}, + } + for _, e := range files { + _ = a.Put(e) + } + for i := len(files) - 1; i >= 0; i-- { + _ = b.Put(files[i]) + } + ha, _ := a.ManifestHash() + hb, _ := b.ManifestHash() + if ha != hb { + t.Fatalf("manifest depends on insert order: %s != %s", ha, hb) + } +} From 44f9936bb1ab8294effbcba4887deb9d4fec20fd Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 15 May 2026 07:02:52 +0000 Subject: [PATCH 05/12] feat(analyzer): add Diff for incremental file classification --- internal/analyzer/diff.go | 66 ++++++++++++++ internal/analyzer/diff_test.go | 155 +++++++++++++++++++++++++++++++++ 2 files changed, 221 insertions(+) create mode 100644 internal/analyzer/diff.go create mode 100644 internal/analyzer/diff_test.go diff --git a/internal/analyzer/diff.go b/internal/analyzer/diff.go new file mode 100644 index 00000000..840516e4 --- /dev/null +++ b/internal/analyzer/diff.go @@ -0,0 +1,66 @@ +package analyzer + +import ( + "fmt" + "os" + + "github.com/randomcodespace/codeiq/internal/cache" +) + +// Delta is the result of comparing the on-disk file set to the cache state. +// All slices are sorted by path (FileDiscovery sorts; AllFiles iterates in +// path order) so callers can rely on stable order. +type Delta struct { + Added []string // on disk, not in cache + Modified []string // path in cache but content_hash differs from disk + Deleted []string // in cache, missing from disk + Unchanged []string // path + content_hash match cache exactly +} + +// Diff walks the project root via FileDiscovery and classifies each file +// against the cache. UNCHANGED files cost one hash per file; nothing else +// is parsed or detected. +// +// Returns Delta with empty slices (not nil) when there is no work in a +// bucket. +func (a *Analyzer) Diff(root string) (Delta, error) { + d := Delta{} + if a.opts.Cache == nil { + return d, fmt.Errorf("diff: cache is required") + } + disc := NewFileDiscovery() + files, err := disc.Discover(root) + if err != nil { + return d, fmt.Errorf("file discovery: %w", err) + } + + seen := make(map[string]bool, len(files)) + for _, f := range files { + seen[f.RelPath] = true + content, err := os.ReadFile(f.AbsPath) + if err != nil { + fmt.Fprintf(os.Stderr, "codeiq: diff: %s: %v\n", f.RelPath, err) + continue + } + curHash := cache.HashString(string(content)) + cachedHash, _, ok := a.opts.Cache.GetFileByPath(f.RelPath) + switch { + case !ok: + d.Added = append(d.Added, f.RelPath) + case cachedHash == curHash: + d.Unchanged = append(d.Unchanged, f.RelPath) + default: + d.Modified = append(d.Modified, f.RelPath) + } + } + + if err := a.opts.Cache.AllFiles(func(path, _ string) error { + if !seen[path] { + d.Deleted = append(d.Deleted, path) + } + return nil + }); err != nil { + return d, err + } + return d, nil +} diff --git a/internal/analyzer/diff_test.go b/internal/analyzer/diff_test.go new file mode 100644 index 00000000..14b897d8 --- /dev/null +++ b/internal/analyzer/diff_test.go @@ -0,0 +1,155 @@ +package analyzer + +import ( + "os" + "path/filepath" + "sort" + "testing" + + "github.com/randomcodespace/codeiq/internal/cache" + "github.com/randomcodespace/codeiq/internal/detector" + + // Register a couple of phase-1 detectors so the Analyzer Registry + // doesn't sit empty (Diff doesn't actually run detectors but the + // Analyzer requires a non-nil Registry, satisfied here via blank imports). + _ "github.com/randomcodespace/codeiq/internal/detector/jvm/java" + _ "github.com/randomcodespace/codeiq/internal/detector/python" +) + +func mustOpenCache(t *testing.T) (*cache.Cache, string) { + t.Helper() + dir := t.TempDir() + c, err := cache.Open(filepath.Join(dir, "c.sqlite")) + if err != nil { + t.Fatalf("cache: %v", err) + } + return c, dir +} + +func writeDiffFile(t *testing.T, root, rel, content string) { + t.Helper() + full := filepath.Join(root, rel) + if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(full, []byte(content), 0o644); err != nil { + t.Fatal(err) + } +} + +func TestDiffEmptyCacheAndEmptyRoot(t *testing.T) { + c, root := mustOpenCache(t) + defer c.Close() + a := NewAnalyzer(Options{Cache: c, Registry: detector.Default}) + d, err := a.Diff(root) + if err != nil { + t.Fatalf("diff: %v", err) + } + if len(d.Added)+len(d.Modified)+len(d.Deleted)+len(d.Unchanged) != 0 { + t.Fatalf("empty root + empty cache: got %+v", d) + } +} + +func TestDiffDetectsAddedFile(t *testing.T) { + c, root := mustOpenCache(t) + defer c.Close() + writeDiffFile(t, root, "X.java", "public class X {}") + a := NewAnalyzer(Options{Cache: c, Registry: detector.Default}) + d, err := a.Diff(root) + if err != nil { + t.Fatal(err) + } + if len(d.Added) != 1 || d.Added[0] != "X.java" { + t.Fatalf("Added = %v, want [X.java]", d.Added) + } + if len(d.Modified)+len(d.Deleted)+len(d.Unchanged) != 0 { + t.Fatalf("other buckets non-empty: %+v", d) + } +} + +func TestDiffDetectsUnchangedFile(t *testing.T) { + c, root := mustOpenCache(t) + defer c.Close() + src := "public class X {}" + writeDiffFile(t, root, "X.java", src) + _ = c.Put(&cache.Entry{ContentHash: cache.HashString(src), Path: "X.java", Language: "java", ParsedAt: "t"}) + a := NewAnalyzer(Options{Cache: c, Registry: detector.Default}) + d, err := a.Diff(root) + if err != nil { + t.Fatal(err) + } + if len(d.Unchanged) != 1 || d.Unchanged[0] != "X.java" { + t.Fatalf("Unchanged = %v, want [X.java]", d.Unchanged) + } + if len(d.Added)+len(d.Modified)+len(d.Deleted) != 0 { + t.Fatalf("other buckets non-empty: %+v", d) + } +} + +func TestDiffDetectsModifiedFile(t *testing.T) { + c, root := mustOpenCache(t) + defer c.Close() + writeDiffFile(t, root, "X.java", "public class X { int v = 2; }") + _ = c.Put(&cache.Entry{ + ContentHash: cache.HashString("public class X {}"), + Path: "X.java", + Language: "java", + ParsedAt: "t", + }) + a := NewAnalyzer(Options{Cache: c, Registry: detector.Default}) + d, err := a.Diff(root) + if err != nil { + t.Fatal(err) + } + if len(d.Modified) != 1 || d.Modified[0] != "X.java" { + t.Fatalf("Modified = %v, want [X.java]", d.Modified) + } +} + +func TestDiffDetectsDeletedFile(t *testing.T) { + c, root := mustOpenCache(t) + defer c.Close() + _ = c.Put(&cache.Entry{ContentHash: "h", Path: "Y.java", Language: "java", ParsedAt: "t"}) + a := NewAnalyzer(Options{Cache: c, Registry: detector.Default}) + d, err := a.Diff(root) + if err != nil { + t.Fatal(err) + } + if len(d.Deleted) != 1 || d.Deleted[0] != "Y.java" { + t.Fatalf("Deleted = %v, want [Y.java]", d.Deleted) + } +} + +func TestDiffMixedScenario(t *testing.T) { + c, root := mustOpenCache(t) + defer c.Close() + writeDiffFile(t, root, "A.java", "class A {}") + writeDiffFile(t, root, "B.java", "class B {}") + writeDiffFile(t, root, "C.java", "class C v2 {}") + _ = c.Put(&cache.Entry{ContentHash: cache.HashString("class A {}"), Path: "A.java", Language: "java", ParsedAt: "t"}) + _ = c.Put(&cache.Entry{ContentHash: cache.HashString("class C {}"), Path: "C.java", Language: "java", ParsedAt: "t"}) + _ = c.Put(&cache.Entry{ContentHash: "h-d", Path: "D.java", Language: "java", ParsedAt: "t"}) + a := NewAnalyzer(Options{Cache: c, Registry: detector.Default}) + d, err := a.Diff(root) + if err != nil { + t.Fatal(err) + } + check := func(label string, got, want []string) { + t.Helper() + sort.Strings(got) + sort.Strings(want) + if len(got) != len(want) { + t.Errorf("%s: got %v, want %v", label, got, want) + return + } + for i := range got { + if got[i] != want[i] { + t.Errorf("%s[%d]: got %q, want %q", label, i, got[i], want[i]) + } + } + } + check("Added", d.Added, []string{"B.java"}) + check("Modified", d.Modified, []string{"C.java"}) + check("Deleted", d.Deleted, []string{"D.java"}) + check("Unchanged", d.Unchanged, []string{"A.java"}) +} From d467b7ecefb03815827e1fb1608d6e469da78443 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 15 May 2026 07:03:59 +0000 Subject: [PATCH 06/12] feat(analyzer): cache-hit early-exit + Diff/purge integration in Run --- internal/analyzer/analyzer.go | 61 +++++++++++++- internal/analyzer/analyzer_test.go | 128 +++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+), 3 deletions(-) diff --git a/internal/analyzer/analyzer.go b/internal/analyzer/analyzer.go index 4a70b496..ffb184d5 100644 --- a/internal/analyzer/analyzer.go +++ b/internal/analyzer/analyzer.go @@ -5,6 +5,7 @@ import ( "os" "runtime" "sync" + "sync/atomic" "time" "github.com/randomcodespace/codeiq/internal/cache" @@ -19,13 +20,19 @@ const DefaultBatchSize = 500 type Options struct { Cache *cache.Cache Registry *detector.Registry - BatchSize int // defaults to DefaultBatchSize - Workers int // defaults to 2 * GOMAXPROCS + BatchSize int // defaults to DefaultBatchSize + Workers int // defaults to 2 * GOMAXPROCS + Force bool // bypass cache early-exit; re-parse every file } // Analyzer orchestrates the index pipeline. type Analyzer struct { - opts Options + opts Options + counter runCounter +} + +type runCounter struct { + cacheHits atomic.Int64 } // NewAnalyzer returns an analyzer wired to opts. @@ -47,6 +54,9 @@ func NewAnalyzer(opts Options) *Analyzer { // Plan §1.5 — DedupedNodes/DedupedEdges/DroppedEdges expose dedup activity // so operators can see "graph collapsed 312 duplicate nodes, dropped 14 // phantom edges" — the visibility is what makes "meaningful" diagnosable. +// +// Added/Modified/Deleted/Unchanged/CacheHits are incremental counters, +// zero on full `--force` runs. type Stats struct { Files int Nodes int @@ -54,13 +64,38 @@ type Stats struct { DedupedNodes int DedupedEdges int DroppedEdges int + Added int + Modified int + Deleted int + Unchanged int + CacheHits int } // Run executes FileDiscovery → parse → detectors → GraphBuilder → cache writes // and returns aggregate stats. Errors from individual file processing are // logged to stderr but do not stop the run — partial output is better than no // output (matches Java's per-file try/catch behaviour). +// +// On non-Force runs with a cache present, Run first runs Diff() to classify +// files, purges cache rows for deleted files, then proceeds. processFile +// skips parse+detect for UNCHANGED files (content_hash hit in cache). func (a *Analyzer) Run(root string) (Stats, error) { + a.counter.cacheHits.Store(0) + + var d Delta + if a.opts.Cache != nil && !a.opts.Force { + var err error + d, err = a.Diff(root) + if err != nil { + return Stats{}, err + } + for _, path := range d.Deleted { + if err := a.opts.Cache.PurgeByPath(path); err != nil { + fmt.Fprintf(os.Stderr, "codeiq: purge %s: %v\n", path, err) + } + } + } + disc := NewFileDiscovery() files, err := disc.Discover(root) if err != nil { @@ -99,6 +134,11 @@ func (a *Analyzer) Run(root string) (Stats, error) { DedupedNodes: snap.DedupedNodes, DedupedEdges: snap.DedupedEdges, DroppedEdges: snap.DroppedEdges, + Added: len(d.Added), + Modified: len(d.Modified), + Deleted: len(d.Deleted), + Unchanged: len(d.Unchanged), + CacheHits: int(a.counter.cacheHits.Load()), }, nil } @@ -108,6 +148,18 @@ func (a *Analyzer) processFile(f DiscoveredFile, gb *GraphBuilder) error { return err } hash := cache.HashString(string(content)) + + // Fast path: cache hit. Reuse the previous emissions; skip parse+detect. + if a.opts.Cache != nil && !a.opts.Force && a.opts.Cache.Has(hash) { + entry, gerr := a.opts.Cache.Get(hash) + if gerr == nil && entry != nil { + gb.Add(&detector.Result{Nodes: entry.Nodes, Edges: entry.Edges}) + a.counter.cacheHits.Add(1) + return nil + } + // Has() true but Get() failed — pathological. Fall through to re-parse. + } + tree, err := parser.Parse(f.Language, content) if err != nil { // Continue with regex-only detectors when the parser bails — matches @@ -142,6 +194,9 @@ func (a *Analyzer) processFile(f DiscoveredFile, gb *GraphBuilder) error { entry.Edges = append(entry.Edges, r.Edges...) } if a.opts.Cache != nil { + // MODIFIED files: purge prior (path, old_hash) row so a single path + // never has two cache entries. + _ = a.opts.Cache.PurgeByPath(f.RelPath) if err := a.opts.Cache.Put(entry); err != nil { return fmt.Errorf("cache put: %w", err) } diff --git a/internal/analyzer/analyzer_test.go b/internal/analyzer/analyzer_test.go index d7555506..85d886f7 100644 --- a/internal/analyzer/analyzer_test.go +++ b/internal/analyzer/analyzer_test.go @@ -66,6 +66,134 @@ func TestAnalyzerEndToEnd(t *testing.T) { } } +func TestStatsHasIncrementalCounters(t *testing.T) { + var s Stats + // Compile-time check that the new fields exist with the expected names. + _ = s.Added + _ = s.Modified + _ = s.Deleted + _ = s.Unchanged + _ = s.CacheHits +} + +func TestProcessFileSkipsOnCacheHit(t *testing.T) { + root := t.TempDir() + cachePath := filepath.Join(root, ".codeiq", "cache.sqlite") + if err := os.MkdirAll(filepath.Dir(cachePath), 0o755); err != nil { + t.Fatal(err) + } + src := "public class A {}" + if err := os.WriteFile(filepath.Join(root, "A.java"), []byte(src), 0o644); err != nil { + t.Fatal(err) + } + + c, err := cache.Open(cachePath) + if err != nil { + t.Fatalf("cache: %v", err) + } + defer c.Close() + + // Seed the cache with a row for this content hash. processFile MUST + // not re-parse the file when its hash already lives in the cache. + if err := c.Put(&cache.Entry{ + ContentHash: cache.HashString(src), + Path: "A.java", + Language: "java", + ParsedAt: "2026-01-01T00:00:00Z", + }); err != nil { + t.Fatal(err) + } + + a := NewAnalyzer(Options{Cache: c, Registry: detector.Default, Workers: 1}) + stats, err := a.Run(root) + if err != nil { + t.Fatalf("run: %v", err) + } + if stats.CacheHits != 1 { + t.Fatalf("CacheHits = %d, want 1", stats.CacheHits) + } + if stats.Files != 1 { + t.Fatalf("Files = %d, want 1", stats.Files) + } + if stats.Unchanged != 1 { + t.Fatalf("Unchanged = %d, want 1", stats.Unchanged) + } +} + +func TestForceBypassesCacheHit(t *testing.T) { + root := t.TempDir() + cachePath := filepath.Join(root, ".codeiq", "cache.sqlite") + if err := os.MkdirAll(filepath.Dir(cachePath), 0o755); err != nil { + t.Fatal(err) + } + src := "public class A {}" + if err := os.WriteFile(filepath.Join(root, "A.java"), []byte(src), 0o644); err != nil { + t.Fatal(err) + } + + c, err := cache.Open(cachePath) + if err != nil { + t.Fatal(err) + } + defer c.Close() + _ = c.Put(&cache.Entry{ + ContentHash: cache.HashString(src), + Path: "A.java", + Language: "java", + ParsedAt: "t", + }) + + a := NewAnalyzer(Options{Cache: c, Registry: detector.Default, Workers: 1, Force: true}) + stats, err := a.Run(root) + if err != nil { + t.Fatal(err) + } + if stats.CacheHits != 0 { + t.Fatalf("Force=true should bypass cache; CacheHits = %d", stats.CacheHits) + } +} + +func TestRunPurgesDeletedFiles(t *testing.T) { + root := t.TempDir() + cachePath := filepath.Join(root, ".codeiq", "cache.sqlite") + if err := os.MkdirAll(filepath.Dir(cachePath), 0o755); err != nil { + t.Fatal(err) + } + c, err := cache.Open(cachePath) + if err != nil { + t.Fatal(err) + } + defer c.Close() + + // Seed a phantom file that's gone from disk. + if err := c.Put(&cache.Entry{ + ContentHash: "ghost-hash", + Path: "deleted.java", + Language: "java", + ParsedAt: "t", + }); err != nil { + t.Fatal(err) + } + if !c.Has("ghost-hash") { + t.Fatal("seed didn't take") + } + if err := os.WriteFile(filepath.Join(root, "real.java"), []byte("class R {}"), 0o644); err != nil { + t.Fatal(err) + } + + a := NewAnalyzer(Options{Cache: c, Registry: detector.Default, Workers: 1}) + stats, err := a.Run(root) + if err != nil { + t.Fatal(err) + } + if c.Has("ghost-hash") { + t.Fatal("deleted file's cache row not purged") + } + if stats.Deleted != 1 { + t.Fatalf("Deleted = %d, want 1", stats.Deleted) + } +} + func TestAnalyzerDeterminism(t *testing.T) { dir := t.TempDir() if err := os.WriteFile(filepath.Join(dir, "UserController.java"), []byte(fixtureJava), 0644); err != nil { From 85a829639224215f147614cbb3cc6a8be7f63023 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 15 May 2026 07:06:16 +0000 Subject: [PATCH 07/12] feat(graph): RemoveFile/InsertFile/ReplaceFile per-file mutation APIs --- internal/graph/refresh.go | 66 +++++++++++++ internal/graph/refresh_test.go | 165 +++++++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+) create mode 100644 internal/graph/refresh.go create mode 100644 internal/graph/refresh_test.go diff --git a/internal/graph/refresh.go b/internal/graph/refresh.go new file mode 100644 index 00000000..855efc3d --- /dev/null +++ b/internal/graph/refresh.go @@ -0,0 +1,66 @@ +package graph + +import ( + "fmt" + + "github.com/randomcodespace/codeiq/internal/model" +) + +// RemoveFile deletes every CodeNode whose file_path matches path along with +// every incident relationship across all rel tables. Idempotent — calling +// with a path that has no matching nodes is a no-op that returns nil. +// +// Implementation note: we iterate per rel table because Kuzu (0.11.3) does +// not yet support a heterogeneous "match any rel" DELETE across rel tables. +// The DETACH DELETE on CodeNode then handles whatever remains. +func (s *Store) RemoveFile(path string) error { + // First, drop every incident rel by walking each declared rel table. + // DETACH DELETE on CodeNode would handle this, but being explicit per + // table keeps the delete plan simple and predictable on Kuzu 0.11.3. + for _, kind := range model.AllEdgeKinds() { + q := fmt.Sprintf( + `MATCH (n:CodeNode)-[r:%s]->(m:CodeNode) + WHERE n.file_path = $p OR m.file_path = $p + DELETE r`, + relTableName(kind)) + if _, err := s.Cypher(q, map[string]any{"p": path}); err != nil { + return fmt.Errorf("graph: remove edges for %s: %w", path, err) + } + } + // Now drop the nodes themselves. + if _, err := s.Cypher( + `MATCH (n:CodeNode) WHERE n.file_path = $p DELETE n`, + map[string]any{"p": path}, + ); err != nil { + return fmt.Errorf("graph: remove nodes for %s: %w", path, err) + } + return nil +} + +// InsertFile bulk-loads nodes + edges for a single file. Equivalent to +// BulkLoadNodes + BulkLoadEdges; the path parameter is for API symmetry +// (file_path is on each node). +func (s *Store) InsertFile(path string, nodes []*model.CodeNode, edges []*model.CodeEdge) error { + if len(nodes) == 0 && len(edges) == 0 { + return nil + } + if err := s.BulkLoadNodes(nodes); err != nil { + return fmt.Errorf("graph: insert file %s nodes: %w", path, err) + } + if err := s.BulkLoadEdges(edges); err != nil { + return fmt.Errorf("graph: insert file %s edges: %w", path, err) + } + return nil +} + +// ReplaceFile is the MODIFIED-file path: RemoveFile followed by InsertFile. +// There is a brief window between the two calls where the file's nodes are +// absent from the graph; concurrent readers see either pre-state or +// post-state but may briefly observe the file as missing. The window is +// fine for incremental enrich since enrich is the single writer. +func (s *Store) ReplaceFile(path string, nodes []*model.CodeNode, edges []*model.CodeEdge) error { + if err := s.RemoveFile(path); err != nil { + return err + } + return s.InsertFile(path, nodes, edges) +} diff --git a/internal/graph/refresh_test.go b/internal/graph/refresh_test.go new file mode 100644 index 00000000..6016e86f --- /dev/null +++ b/internal/graph/refresh_test.go @@ -0,0 +1,165 @@ +package graph_test + +import ( + "path/filepath" + "testing" + + "github.com/randomcodespace/codeiq/internal/graph" + "github.com/randomcodespace/codeiq/internal/model" +) + +func openSchemaStore(t *testing.T) *graph.Store { + t.Helper() + s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu")) + if err != nil { + t.Fatal(err) + } + if err := s.ApplySchema(); err != nil { + s.Close() + t.Fatal(err) + } + return s +} + +func countCodeNodes(t *testing.T, s *graph.Store) int64 { + t.Helper() + rows, err := s.Cypher("MATCH (n:CodeNode) RETURN count(n) AS c") + if err != nil { + t.Fatal(err) + } + return rows[0]["c"].(int64) +} + +func TestRemoveFileDeletesAllNodesForPath(t *testing.T) { + s := openSchemaStore(t) + defer s.Close() + + nodes := []*model.CodeNode{ + {ID: "n1", Kind: model.NodeClass, Label: "A", FilePath: "A.java", Layer: model.LayerBackend}, + {ID: "n2", Kind: model.NodeMethod, Label: "foo", FilePath: "A.java", Layer: model.LayerBackend}, + {ID: "n3", Kind: model.NodeClass, Label: "B", FilePath: "B.java", Layer: model.LayerBackend}, + } + if err := s.BulkLoadNodes(nodes); err != nil { + t.Fatal(err) + } + if err := s.RemoveFile("A.java"); err != nil { + t.Fatalf("RemoveFile: %v", err) + } + rows, err := s.Cypher("MATCH (n:CodeNode) RETURN n.id AS id ORDER BY id") + if err != nil { + t.Fatal(err) + } + if len(rows) != 1 { + t.Fatalf("want 1 remaining (n3), got %d: %v", len(rows), rows) + } + if rows[0]["id"] != "n3" { + t.Fatalf("wrong survivor: %v", rows[0]) + } +} + +func TestRemoveFileIsIdempotent(t *testing.T) { + s := openSchemaStore(t) + defer s.Close() + if err := s.RemoveFile("never-existed.java"); err != nil { + t.Fatalf("RemoveFile on missing: %v", err) + } +} + +func TestRemoveFileDeletesIncidentEdges(t *testing.T) { + s := openSchemaStore(t) + defer s.Close() + nodes := []*model.CodeNode{ + {ID: "n1", Kind: model.NodeClass, Label: "A", FilePath: "A.java"}, + {ID: "n2", Kind: model.NodeClass, Label: "B", FilePath: "B.java"}, + } + if err := s.BulkLoadNodes(nodes); err != nil { + t.Fatal(err) + } + edges := []*model.CodeEdge{ + {ID: "n1->n2", Kind: model.EdgeCalls, SourceID: "n1", TargetID: "n2", + Confidence: model.ConfidenceSyntactic}, + } + if err := s.BulkLoadEdges(edges); err != nil { + t.Fatal(err) + } + if err := s.RemoveFile("A.java"); err != nil { + t.Fatalf("RemoveFile: %v", err) + } + rows, err := s.Cypher("MATCH ()-[r:CALLS]->() RETURN count(r) AS c") + if err != nil { + t.Fatal(err) + } + if rows[0]["c"].(int64) != 0 { + t.Fatalf("edge to deleted node survived: %v", rows[0]) + } +} + +func TestInsertFileAddsNodesAndEdges(t *testing.T) { + s := openSchemaStore(t) + defer s.Close() + nodes := []*model.CodeNode{ + {ID: "j:file:C.java", Kind: model.NodeModule, Label: "C.java", FilePath: "C.java"}, + {ID: "j:C.java:class:C", Kind: model.NodeClass, Label: "C", FilePath: "C.java"}, + } + edges := []*model.CodeEdge{{ + ID: "j:file:C.java->j:C.java:class:C", Kind: model.EdgeContains, + SourceID: "j:file:C.java", TargetID: "j:C.java:class:C", + Confidence: model.ConfidenceSyntactic, Source: "test", + }} + if err := s.InsertFile("C.java", nodes, edges); err != nil { + t.Fatalf("InsertFile: %v", err) + } + rows, err := s.Cypher("MATCH (n:CodeNode) WHERE n.file_path = 'C.java' RETURN count(n) AS c") + if err != nil { + t.Fatal(err) + } + if got := rows[0]["c"].(int64); got != 2 { + t.Fatalf("want 2 nodes, got %v", rows[0]["c"]) + } + rels, err := s.Cypher("MATCH ()-[r:CONTAINS]->() RETURN count(r) AS c") + if err != nil { + t.Fatal(err) + } + if got := rels[0]["c"].(int64); got != 1 { + t.Fatalf("want 1 CONTAINS edge, got %v", rels[0]["c"]) + } +} + +func TestInsertFileEmptyIsNoop(t *testing.T) { + s := openSchemaStore(t) + defer s.Close() + if err := s.InsertFile("empty.java", nil, nil); err != nil { + t.Fatalf("InsertFile with empty input should be no-op, got: %v", err) + } + if countCodeNodes(t, s) != 0 { + t.Fatal("empty insert created phantom nodes") + } +} + +func TestReplaceFileSwapsContent(t *testing.T) { + s := openSchemaStore(t) + defer s.Close() + + if err := s.InsertFile("D.java", []*model.CodeNode{ + {ID: "old-d", Kind: model.NodeClass, Label: "OldD", FilePath: "D.java"}, + }, nil); err != nil { + t.Fatal(err) + } + if err := s.ReplaceFile("D.java", + []*model.CodeNode{ + {ID: "new-d", Kind: model.NodeClass, Label: "NewD", FilePath: "D.java"}, + {ID: "new-d-m", Kind: model.NodeMethod, Label: "m", FilePath: "D.java"}, + }, nil); err != nil { + t.Fatalf("ReplaceFile: %v", err) + } + rows, err := s.Cypher("MATCH (n:CodeNode) WHERE n.file_path = 'D.java' RETURN n.id AS id ORDER BY n.id") + if err != nil { + t.Fatal(err) + } + if len(rows) != 2 { + t.Fatalf("want 2 nodes after replace, got %d: %v", len(rows), rows) + } + if rows[0]["id"] != "new-d" || rows[1]["id"] != "new-d-m" { + t.Fatalf("nodes after replace: %v", rows) + } +} From 1702f176e09c3ce39d3b4c16d7d96fcc543e4011 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 15 May 2026 07:07:50 +0000 Subject: [PATCH 08/12] feat(graph): GraphMeta table + ReadManifest/WriteManifest --- internal/graph/manifest.go | 40 +++++++++++++++ internal/graph/manifest_test.go | 90 +++++++++++++++++++++++++++++++++ internal/graph/schema.go | 10 ++++ internal/graph/schema_test.go | 23 ++++++--- 4 files changed, 156 insertions(+), 7 deletions(-) create mode 100644 internal/graph/manifest.go create mode 100644 internal/graph/manifest_test.go diff --git a/internal/graph/manifest.go b/internal/graph/manifest.go new file mode 100644 index 00000000..e70ad15e --- /dev/null +++ b/internal/graph/manifest.go @@ -0,0 +1,40 @@ +package graph + +import "fmt" + +const metaKeyManifest = "manifest_hash" + +// ReadManifest returns the manifest_hash stored by the last successful +// enrich, or "" if none exists (fresh graph). Used by enrich to short- +// circuit when the cache and graph are already in sync. +func (s *Store) ReadManifest() (string, error) { + rows, err := s.Cypher( + `MATCH (m:GraphMeta) WHERE m.meta_key = $k RETURN m.value AS v`, + map[string]any{"k": metaKeyManifest}) + if err != nil { + return "", fmt.Errorf("graph: read manifest: %w", err) + } + if len(rows) == 0 { + return "", nil + } + v, _ := rows[0]["v"].(string) + return v, nil +} + +// WriteManifest stores hash as the manifest_hash, replacing any existing +// value. Called at the tail of every successful enrich. +func (s *Store) WriteManifest(hash string) error { + if _, err := s.Cypher( + `MATCH (m:GraphMeta) WHERE m.meta_key = $k DELETE m`, + map[string]any{"k": metaKeyManifest}, + ); err != nil { + return fmt.Errorf("graph: clear manifest: %w", err) + } + if _, err := s.Cypher( + `CREATE (:GraphMeta {meta_key: $k, value: $v})`, + map[string]any{"k": metaKeyManifest, "v": hash}, + ); err != nil { + return fmt.Errorf("graph: write manifest: %w", err) + } + return nil +} diff --git a/internal/graph/manifest_test.go b/internal/graph/manifest_test.go new file mode 100644 index 00000000..9a1dbd21 --- /dev/null +++ b/internal/graph/manifest_test.go @@ -0,0 +1,90 @@ +package graph_test + +import ( + "path/filepath" + "testing" + + "github.com/randomcodespace/codeiq/internal/graph" +) + +func TestReadManifestEmptyOnFreshGraph(t *testing.T) { + s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu")) + if err != nil { + t.Fatal(err) + } + defer s.Close() + if err := s.ApplySchema(); err != nil { + t.Fatal(err) + } + got, err := s.ReadManifest() + if err != nil { + t.Fatalf("ReadManifest: %v", err) + } + if got != "" { + t.Fatalf("fresh manifest = %q, want \"\"", got) + } +} + +func TestWriteThenReadManifestRoundTrip(t *testing.T) { + s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu")) + if err != nil { + t.Fatal(err) + } + defer s.Close() + if err := s.ApplySchema(); err != nil { + t.Fatal(err) + } + want := "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + if err := s.WriteManifest(want); err != nil { + t.Fatalf("WriteManifest: %v", err) + } + got, err := s.ReadManifest() + if err != nil { + t.Fatalf("ReadManifest: %v", err) + } + if got != want { + t.Fatalf("ReadManifest = %q, want %q", got, want) + } +} + +func TestWriteManifestOverwrites(t *testing.T) { + s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu")) + if err != nil { + t.Fatal(err) + } + defer s.Close() + if err := s.ApplySchema(); err != nil { + t.Fatal(err) + } + if err := s.WriteManifest("h1"); err != nil { + t.Fatal(err) + } + if err := s.WriteManifest("h2"); err != nil { + t.Fatal(err) + } + got, _ := s.ReadManifest() + if got != "h2" { + t.Fatalf("got %q after overwrite, want h2", got) + } +} + +func TestManifestRejectedOnReadOnly(t *testing.T) { + path := filepath.Join(t.TempDir(), "g.kuzu") + w, err := graph.Open(path) + if err != nil { + t.Fatal(err) + } + if err := w.ApplySchema(); err != nil { + t.Fatal(err) + } + w.Close() + + ro, err := graph.OpenReadOnly(path, 0) + if err != nil { + t.Fatal(err) + } + defer ro.Close() + if err := ro.WriteManifest("blocked"); err == nil { + t.Fatal("WriteManifest should fail on read-only store") + } +} diff --git a/internal/graph/schema.go b/internal/graph/schema.go index a4011a99..65e78598 100644 --- a/internal/graph/schema.go +++ b/internal/graph/schema.go @@ -16,6 +16,16 @@ import ( // a label. Properties round-trip through a JSON-serialised `props` column // plus a small set of first-class columns we want to index / project on. func (s *Store) ApplySchema() error { + // GraphMeta stores small key→value strings (e.g., manifest hash for the + // incremental enrich short-circuit). One row per key; PK enforces uniqueness. + metaDDL := `CREATE NODE TABLE IF NOT EXISTS GraphMeta( + meta_key STRING, + value STRING, + PRIMARY KEY(meta_key))` + if _, err := s.Cypher(metaDDL); err != nil { + return fmt.Errorf("graph: create GraphMeta: %w", err) + } + nodeDDL := `CREATE NODE TABLE IF NOT EXISTS CodeNode( id STRING, kind STRING, diff --git a/internal/graph/schema_test.go b/internal/graph/schema_test.go index 4ace1c5f..bcd47f59 100644 --- a/internal/graph/schema_test.go +++ b/internal/graph/schema_test.go @@ -8,9 +8,10 @@ import ( "github.com/randomcodespace/codeiq/internal/model" ) -// TestApplySchemaCreatesAllTables asserts ApplySchema produces exactly one -// CodeNode node table and one rel table per EdgeKind. The Java side mirrors -// this implicitly through SDN's label-driven schema; on Kuzu we declare it. +// TestApplySchemaCreatesAllTables asserts ApplySchema produces the expected +// node tables (CodeNode + GraphMeta) and one rel table per EdgeKind. The +// Java side mirrors this implicitly through SDN's label-driven schema; on +// Kuzu we declare it. func TestApplySchemaCreatesAllTables(t *testing.T) { s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu")) if err != nil { @@ -26,17 +27,25 @@ func TestApplySchemaCreatesAllTables(t *testing.T) { if err != nil { t.Fatalf("show tables: %v", err) } - var nodeTables, relTables int + nodeTables := map[string]bool{} + relTables := 0 for _, r := range rows { switch r["type"] { case "NODE": - nodeTables++ + name, _ := r["name"].(string) + nodeTables[name] = true case "REL": relTables++ } } - if nodeTables != 1 { - t.Errorf("want 1 node table, got %d", nodeTables) + if !nodeTables["CodeNode"] { + t.Error("CodeNode node table missing") + } + if !nodeTables["GraphMeta"] { + t.Error("GraphMeta node table missing") + } + if len(nodeTables) != 2 { + t.Errorf("want 2 node tables (CodeNode, GraphMeta), got %d: %v", len(nodeTables), nodeTables) } if relTables != len(model.AllEdgeKinds()) { t.Errorf("want %d rel tables, got %d", len(model.AllEdgeKinds()), relTables) From f9074d88b8a3d36d36a7912ccb5a24b449524180 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 15 May 2026 07:10:00 +0000 Subject: [PATCH 09/12] feat(linker+graph): source-tag linker emissions + WipeLinkerEdges API --- internal/analyzer/linker/entity_linker.go | 1 + .../linker/module_containment_linker.go | 2 + internal/analyzer/linker/source_tag.go | 20 +++++ internal/analyzer/linker/source_tag_test.go | 85 +++++++++++++++++++ internal/analyzer/linker/topic_linker.go | 1 + internal/graph/refresh.go | 33 +++++++ internal/graph/refresh_test.go | 62 ++++++++++++++ 7 files changed, 204 insertions(+) create mode 100644 internal/analyzer/linker/source_tag.go create mode 100644 internal/analyzer/linker/source_tag_test.go diff --git a/internal/analyzer/linker/entity_linker.go b/internal/analyzer/linker/entity_linker.go index 57ca72a4..a4894728 100644 --- a/internal/analyzer/linker/entity_linker.go +++ b/internal/analyzer/linker/entity_linker.go @@ -86,6 +86,7 @@ func (l *EntityLinker) Link(nodes []*model.CodeNode, edges []*model.CodeEdge) Re Kind: model.EdgeQueries, SourceID: repo.ID, TargetID: ent.ID, + Source: SrcEntityLinker, Properties: map[string]any{"inferred": true}, }) break diff --git a/internal/analyzer/linker/module_containment_linker.go b/internal/analyzer/linker/module_containment_linker.go index c33830b3..87b8fa35 100644 --- a/internal/analyzer/linker/module_containment_linker.go +++ b/internal/analyzer/linker/module_containment_linker.go @@ -66,6 +66,7 @@ func (l *ModuleContainmentLinker) Link(nodes []*model.CodeNode, edges []*model.C Label: m, FQN: m, Module: m, + Source: SrcModuleContainmentLinker, }) existingModules[moduleID] = struct{}{} } @@ -81,6 +82,7 @@ func (l *ModuleContainmentLinker) Link(nodes []*model.CodeNode, edges []*model.C Kind: model.EdgeContains, SourceID: moduleID, TargetID: mem.ID, + Source: SrcModuleContainmentLinker, Properties: map[string]any{"inferred": true}, }) existingContains[key] = struct{}{} diff --git a/internal/analyzer/linker/source_tag.go b/internal/analyzer/linker/source_tag.go new file mode 100644 index 00000000..1d9861af --- /dev/null +++ b/internal/analyzer/linker/source_tag.go @@ -0,0 +1,20 @@ +package linker + +// Source-tag constants stamped onto every edge a linker emits. They give +// the incremental enrich pipeline a way to clear previous linker output +// (via Store.WipeLinkerEdges) before re-running linkers, without touching +// detector-emitted edges. +const ( + SrcTopicLinker = "linker:topic" + SrcEntityLinker = "linker:entity" + SrcModuleContainmentLinker = "linker:module_containment" +) + +// AllSources lists every linker source tag. Store.WipeLinkerEdges takes +// this slice (or a subset) when clearing linker output. Add new entries +// here when adding a new linker. +var AllSources = []string{ + SrcTopicLinker, + SrcEntityLinker, + SrcModuleContainmentLinker, +} diff --git a/internal/analyzer/linker/source_tag_test.go b/internal/analyzer/linker/source_tag_test.go new file mode 100644 index 00000000..c4d3e633 --- /dev/null +++ b/internal/analyzer/linker/source_tag_test.go @@ -0,0 +1,85 @@ +package linker + +import ( + "testing" + + "github.com/randomcodespace/codeiq/internal/model" +) + +func TestTopicLinkerStampsSource(t *testing.T) { + nodes := []*model.CodeNode{ + {ID: "svc:p", Kind: model.NodeService, Label: "p"}, + {ID: "svc:c", Kind: model.NodeService, Label: "c"}, + {ID: "topic:t", Kind: model.NodeTopic, Label: "t"}, + } + edges := []*model.CodeEdge{ + {ID: "p->t", Kind: model.EdgeProduces, SourceID: "svc:p", TargetID: "topic:t"}, + {ID: "c->t", Kind: model.EdgeConsumes, SourceID: "svc:c", TargetID: "topic:t"}, + } + r := NewTopicLinker().Link(nodes, edges) + if len(r.Edges) == 0 { + t.Fatal("TopicLinker emitted no edges") + } + for _, e := range r.Edges { + if e.Source != SrcTopicLinker { + t.Errorf("edge %s: Source = %q, want %q", e.ID, e.Source, SrcTopicLinker) + } + } +} + +func TestEntityLinkerStampsSource(t *testing.T) { + nodes := []*model.CodeNode{ + {ID: "ent:User", Kind: model.NodeEntity, Label: "User"}, + {ID: "repo:UserRepo", Kind: model.NodeRepository, Label: "UserRepository"}, + } + r := NewEntityLinker().Link(nodes, nil) + if len(r.Edges) == 0 { + t.Fatal("EntityLinker emitted no edges") + } + for _, e := range r.Edges { + if e.Source != SrcEntityLinker { + t.Errorf("edge %s: Source = %q, want %q", e.ID, e.Source, SrcEntityLinker) + } + } +} + +func TestModuleContainmentLinkerStampsSource(t *testing.T) { + nodes := []*model.CodeNode{ + {ID: "n1", Kind: model.NodeClass, Label: "A", Module: "auth"}, + {ID: "n2", Kind: model.NodeClass, Label: "B", Module: "auth"}, + } + r := NewModuleContainmentLinker().Link(nodes, nil) + if len(r.Edges) == 0 { + t.Fatal("ModuleContainmentLinker emitted no edges") + } + for _, e := range r.Edges { + if e.Source != SrcModuleContainmentLinker { + t.Errorf("edge %s: Source = %q, want %q", e.ID, e.Source, SrcModuleContainmentLinker) + } + } + for _, n := range r.Nodes { + if n.Source != SrcModuleContainmentLinker { + t.Errorf("node %s: Source = %q, want %q", n.ID, n.Source, SrcModuleContainmentLinker) + } + } +} + +func TestAllSourcesIncludesEveryLinker(t *testing.T) { + want := map[string]bool{ + SrcTopicLinker: false, + SrcEntityLinker: false, + SrcModuleContainmentLinker: false, + } + for _, s := range AllSources { + if _, ok := want[s]; !ok { + t.Errorf("AllSources contains unknown tag %q", s) + continue + } + want[s] = true + } + for tag, found := range want { + if !found { + t.Errorf("AllSources missing %q", tag) + } + } +} diff --git a/internal/analyzer/linker/topic_linker.go b/internal/analyzer/linker/topic_linker.go index f620c8ac..29e33c17 100644 --- a/internal/analyzer/linker/topic_linker.go +++ b/internal/analyzer/linker/topic_linker.go @@ -93,6 +93,7 @@ func (l *TopicLinker) Link(nodes []*model.CodeNode, edges []*model.CodeEdge) Res Kind: model.EdgeCalls, SourceID: p, TargetID: c, + Source: SrcTopicLinker, Properties: map[string]any{ "inferred": true, "topic": label, diff --git a/internal/graph/refresh.go b/internal/graph/refresh.go index 855efc3d..be58155c 100644 --- a/internal/graph/refresh.go +++ b/internal/graph/refresh.go @@ -53,6 +53,39 @@ func (s *Store) InsertFile(path string, nodes []*model.CodeNode, edges []*model. return nil } +// WipeLinkerEdges deletes every relationship whose source property is in +// the given sources set, across every declared rel table. Used by +// incremental enrich to clear previous linker emissions before re-running +// linker passes. +// +// Iterates per rel-type because Kuzu (0.11.3) doesn't support a +// heterogeneous "match any rel" DELETE across rel tables. +// +// Linker-emitted nodes (e.g., MODULE nodes from ModuleContainmentLinker) +// are also wiped when their source tag is in the set — caller can pass +// only edge-relevant tags if they want to preserve nodes. +func (s *Store) WipeLinkerEdges(sources []string) error { + if len(sources) == 0 { + return nil + } + for _, kind := range model.AllEdgeKinds() { + q := fmt.Sprintf( + `MATCH ()-[r:%s]->() WHERE r.source IN $sources DELETE r`, + relTableName(kind)) + if _, err := s.Cypher(q, map[string]any{"sources": sources}); err != nil { + return fmt.Errorf("graph: wipe %s edges: %w", relTableName(kind), err) + } + } + // Linker-emitted nodes (module nodes) — drop any CodeNode whose source + // tag is in the set. These re-emit on the next linker pass. + if _, err := s.Cypher( + `MATCH (n:CodeNode) WHERE n.source IN $sources DELETE n`, + map[string]any{"sources": sources}); err != nil { + return fmt.Errorf("graph: wipe linker nodes: %w", err) + } + return nil +} + // ReplaceFile is the MODIFIED-file path: RemoveFile followed by InsertFile. // There is a brief window between the two calls where the file's nodes are // absent from the graph; concurrent readers see either pre-state or diff --git a/internal/graph/refresh_test.go b/internal/graph/refresh_test.go index 6016e86f..34058057 100644 --- a/internal/graph/refresh_test.go +++ b/internal/graph/refresh_test.go @@ -136,6 +136,68 @@ func TestInsertFileEmptyIsNoop(t *testing.T) { } } +func TestWipeLinkerEdgesByTag(t *testing.T) { + s := openSchemaStore(t) + defer s.Close() + if err := s.BulkLoadNodes([]*model.CodeNode{ + {ID: "a", Kind: model.NodeService, Label: "A"}, + {ID: "b", Kind: model.NodeService, Label: "B"}, + }); err != nil { + t.Fatal(err) + } + if err := s.BulkLoadEdges([]*model.CodeEdge{ + {ID: "e-det", Kind: model.EdgeDependsOn, SourceID: "a", TargetID: "b", + Source: "SomeDetector", Confidence: model.ConfidenceSyntactic}, + {ID: "e-lnk", Kind: model.EdgeDependsOn, SourceID: "a", TargetID: "b", + Source: "linker:topic", Confidence: model.ConfidenceLexical}, + }); err != nil { + t.Fatal(err) + } + if err := s.WipeLinkerEdges([]string{"linker:topic", "linker:entity", "linker:module_containment"}); err != nil { + t.Fatalf("WipeLinkerEdges: %v", err) + } + rows, err := s.Cypher("MATCH ()-[r:DEPENDS_ON]->() RETURN r.id AS id ORDER BY r.id") + if err != nil { + t.Fatal(err) + } + if len(rows) != 1 { + t.Fatalf("want 1 surviving (detector), got %d: %v", len(rows), rows) + } + if rows[0]["id"] != "e-det" { + t.Fatalf("wrong edge survived: %v", rows[0]) + } +} + +func TestWipeLinkerEdgesEmptySources(t *testing.T) { + s := openSchemaStore(t) + defer s.Close() + // Empty sources is a no-op, not an error. + if err := s.WipeLinkerEdges(nil); err != nil { + t.Fatalf("WipeLinkerEdges(nil): %v", err) + } +} + +func TestWipeLinkerEdgesAlsoDropsLinkerNodes(t *testing.T) { + s := openSchemaStore(t) + defer s.Close() + if err := s.BulkLoadNodes([]*model.CodeNode{ + {ID: "m:auth", Kind: model.NodeModule, Label: "auth", Source: "linker:module_containment"}, + {ID: "c:Foo", Kind: model.NodeClass, Label: "Foo", Source: "SomeDetector"}, + }); err != nil { + t.Fatal(err) + } + if err := s.WipeLinkerEdges([]string{"linker:module_containment"}); err != nil { + t.Fatal(err) + } + rows, _ := s.Cypher("MATCH (n:CodeNode) RETURN n.id AS id ORDER BY n.id") + if len(rows) != 1 { + t.Fatalf("want 1 surviving (detector node), got %d: %v", len(rows), rows) + } + if rows[0]["id"] != "c:Foo" { + t.Fatalf("wrong node survived: %v", rows[0]) + } +} + func TestReplaceFileSwapsContent(t *testing.T) { s := openSchemaStore(t) defer s.Close() From da323e387f5ae3d31f18473b211115a6877c3644 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 15 May 2026 07:14:09 +0000 Subject: [PATCH 10/12] feat(enrich): manifest short-circuit + Reset for re-runnable enrich --- internal/analyzer/enrich.go | 89 ++++++++++++++++++----- internal/analyzer/enrich_test.go | 120 +++++++++++++++++++++++++++++++ internal/graph/refresh.go | 16 +++++ internal/graph/refresh_test.go | 44 ++++++++++++ 4 files changed, 252 insertions(+), 17 deletions(-) diff --git a/internal/analyzer/enrich.go b/internal/analyzer/enrich.go index 2b0b7555..6e0f6ae4 100644 --- a/internal/analyzer/enrich.go +++ b/internal/analyzer/enrich.go @@ -29,13 +29,28 @@ type EnrichOptions struct { // StoreCopyThreads caps Kuzu COPY FROM parallelism. Zero -> graph // package default (min(4, GOMAXPROCS)). StoreCopyThreads uint64 + // Force bypasses the incremental short-circuit. With Force=false (the + // default), Enrich short-circuits when the cache and graph manifest + // hashes match. + Force bool } // EnrichSummary reports per-run counters from a successful Enrich. +// +// ShortCircuited is true when the cache and graph were already in sync +// and Enrich did no rebuild work. Nodes/Edges/Services report 0 in that +// case — callers should check ShortCircuited before treating zero as +// "empty graph". +// +// Mode is one of: +// - "short-circuit" — cache + graph manifests matched; no work done. +// - "full" — fresh graph or non-matching manifest; full rebuild. type EnrichSummary struct { - Nodes int - Edges int - Services int + Nodes int + Edges int + Services int + ShortCircuited bool + Mode string } // Enrich loads the SQLite cache for `root`, runs the linker / classifier / @@ -44,6 +59,16 @@ type EnrichSummary struct { // returned summary reports total nodes / edges / service nodes after every // pass has run. // +// Incremental short-circuit: when a prior enrich wrote a manifest_hash to +// the graph and that hash matches the cache's current manifest, Enrich +// returns immediately with ShortCircuited=true. To force a full rebuild, +// pass Force=true. +// +// Re-runnable: when the graph already holds prior data (non-empty +// CodeNode table), Enrich calls Store.Reset() first so the subsequent +// BulkLoad doesn't collide on primary keys. This makes `enrich` safe to +// re-run after `index` picks up changes. +// // Mirrors the `enrich` pipeline in Java (Analyzer.java + GraphStore.java). // The pipeline order matches the Java side exactly: // @@ -53,6 +78,7 @@ type EnrichSummary struct { // 4. LanguageEnricher (Java, TypeScript, Python, Go extractors) // 5. ServiceDetector (filesystem walk for build files) // 6. graph.Store.BulkLoadNodes / BulkLoadEdges / CreateIndexes +// 7. graph.Store.WriteManifest(cache.ManifestHash()) // // All steps are deterministic — repeated calls against the same cache + root // produce identical Kuzu output. @@ -61,10 +87,43 @@ func Enrich(root string, c *cache.Cache, opts EnrichOptions) (EnrichSummary, err opts.GraphDir = filepath.Join(root, ".codeiq", "graph", "codeiq.kuzu") } + store, err := graph.OpenWithOptions(opts.GraphDir, graph.OpenOptions{ + BufferPoolBytes: opts.StoreBufferPoolBytes, + MaxThreads: opts.StoreCopyThreads, + }) + if err != nil { + return EnrichSummary{}, fmt.Errorf("enrich: open graph: %w", err) + } + defer store.Close() + if err := store.ApplySchema(); err != nil { + return EnrichSummary{}, fmt.Errorf("enrich: apply schema: %w", err) + } + + // Short-circuit when the cache and graph are already in sync. + if !opts.Force { + cacheManifest, mErr := c.ManifestHash() + if mErr != nil { + return EnrichSummary{}, fmt.Errorf("enrich: cache manifest: %w", mErr) + } + graphManifest, gErr := store.ReadManifest() + if gErr != nil { + return EnrichSummary{}, fmt.Errorf("enrich: graph manifest: %w", gErr) + } + if graphManifest != "" && cacheManifest == graphManifest { + return EnrichSummary{ShortCircuited: true, Mode: "short-circuit"}, nil + } + } + + // Reset existing graph data so BulkLoad doesn't collide on PKs from a + // prior run. Reset on a fresh graph is a cheap no-op. + if err := store.Reset(); err != nil { + return EnrichSummary{}, fmt.Errorf("enrich: reset: %w", err) + } + // Re-hydrate the graph from cache. GraphBuilder dedupes by node/edge ID and // produces a deterministic snapshot with dangling edges dropped. builder := NewGraphBuilder() - err := c.IterateAll(func(r *cache.Entry) error { + err = c.IterateAll(func(r *cache.Entry) error { builder.Add(&detector.Result{Nodes: r.Nodes, Edges: r.Edges}) return nil }) @@ -113,19 +172,7 @@ func Enrich(root string, c *cache.Cache, opts EnrichOptions) (EnrichSummary, err nodes = append(nodes, sres.Nodes...) edges = append(edges, sres.Edges...) - // 6. Bulk-load Kuzu — schema + nodes + edges + indexes. The store is - // closed when this function returns; read-side commands re-open it. - store, err := graph.OpenWithOptions(opts.GraphDir, graph.OpenOptions{ - BufferPoolBytes: opts.StoreBufferPoolBytes, - MaxThreads: opts.StoreCopyThreads, - }) - if err != nil { - return EnrichSummary{}, fmt.Errorf("enrich: open graph: %w", err) - } - defer store.Close() - if err := store.ApplySchema(); err != nil { - return EnrichSummary{}, fmt.Errorf("enrich: apply schema: %w", err) - } + // 6. Bulk-load Kuzu — nodes + edges + indexes. if err := store.BulkLoadNodes(nodes); err != nil { return EnrichSummary{}, fmt.Errorf("enrich: bulk load nodes: %w", err) } @@ -136,10 +183,18 @@ func Enrich(root string, c *cache.Cache, opts EnrichOptions) (EnrichSummary, err return EnrichSummary{}, fmt.Errorf("enrich: create indexes: %w", err) } + // 7. Write the manifest hash so the next run can short-circuit. Best-effort + // — a failed manifest write only forces the next run to re-do the work, + // which is annoying but not incorrect. + if mh, mErr := c.ManifestHash(); mErr == nil { + _ = store.WriteManifest(mh) + } + return EnrichSummary{ Nodes: len(nodes), Edges: len(edges), Services: len(sres.Nodes), + Mode: "full", }, nil } diff --git a/internal/analyzer/enrich_test.go b/internal/analyzer/enrich_test.go index 851d6fe0..791c9eec 100644 --- a/internal/analyzer/enrich_test.go +++ b/internal/analyzer/enrich_test.go @@ -76,6 +76,126 @@ func TestEnrichEmptyCacheIsNoop(t *testing.T) { } } +// TestEnrichShortCircuitsWhenManifestMatches verifies the incremental +// short-circuit: a second enrich against an unchanged cache returns +// ShortCircuited=true without rebuilding the graph. +func TestEnrichShortCircuitsWhenManifestMatches(t *testing.T) { + dir := t.TempDir() + if err := os.WriteFile(filepath.Join(dir, "X.java"), []byte("class X {}"), 0o644); err != nil { + t.Fatal(err) + } + c, err := cache.Open(filepath.Join(dir, "cache.sqlite")) + if err != nil { + t.Fatal(err) + } + defer c.Close() + a := analyzer.NewAnalyzer(analyzer.Options{Cache: c}) + if _, err := a.Run(dir); err != nil { + t.Fatal(err) + } + graphDir := filepath.Join(dir, "graph.kuzu") + + // First run: full enrich, writes manifest. + first, err := analyzer.Enrich(dir, c, analyzer.EnrichOptions{GraphDir: graphDir}) + if err != nil { + t.Fatalf("first enrich: %v", err) + } + if first.ShortCircuited { + t.Fatal("first enrich short-circuited; want full") + } + if first.Mode != "full" { + t.Fatalf("first Mode = %q, want full", first.Mode) + } + if first.Nodes == 0 { + t.Fatal("first enrich produced 0 nodes") + } + + // Second run: no cache changes → must short-circuit. + second, err := analyzer.Enrich(dir, c, analyzer.EnrichOptions{GraphDir: graphDir}) + if err != nil { + t.Fatalf("second enrich: %v", err) + } + if !second.ShortCircuited { + t.Fatalf("second enrich did NOT short-circuit: %+v", second) + } + if second.Mode != "short-circuit" { + t.Fatalf("second Mode = %q, want short-circuit", second.Mode) + } +} + +// TestEnrichForceBypassesShortCircuit verifies Force=true re-runs the +// full pipeline even when manifests match. +func TestEnrichForceBypassesShortCircuit(t *testing.T) { + dir := t.TempDir() + if err := os.WriteFile(filepath.Join(dir, "Y.java"), []byte("class Y {}"), 0o644); err != nil { + t.Fatal(err) + } + c, err := cache.Open(filepath.Join(dir, "cache.sqlite")) + if err != nil { + t.Fatal(err) + } + defer c.Close() + a := analyzer.NewAnalyzer(analyzer.Options{Cache: c}) + if _, err := a.Run(dir); err != nil { + t.Fatal(err) + } + graphDir := filepath.Join(dir, "graph.kuzu") + if _, err := analyzer.Enrich(dir, c, analyzer.EnrichOptions{GraphDir: graphDir}); err != nil { + t.Fatal(err) + } + forced, err := analyzer.Enrich(dir, c, analyzer.EnrichOptions{GraphDir: graphDir, Force: true}) + if err != nil { + t.Fatalf("forced enrich: %v", err) + } + if forced.ShortCircuited { + t.Fatal("Force=true should bypass short-circuit") + } + if forced.Mode != "full" { + t.Fatalf("forced Mode = %q, want full", forced.Mode) + } +} + +// TestEnrichRerunAfterFileChange verifies a re-run after a file change +// produces the right graph (no PK collisions, manifest updated). +func TestEnrichRerunAfterFileChange(t *testing.T) { + dir := t.TempDir() + if err := os.WriteFile(filepath.Join(dir, "A.java"), []byte("class A {}"), 0o644); err != nil { + t.Fatal(err) + } + c, err := cache.Open(filepath.Join(dir, "cache.sqlite")) + if err != nil { + t.Fatal(err) + } + defer c.Close() + a := analyzer.NewAnalyzer(analyzer.Options{Cache: c}) + if _, err := a.Run(dir); err != nil { + t.Fatal(err) + } + graphDir := filepath.Join(dir, "graph.kuzu") + first, err := analyzer.Enrich(dir, c, analyzer.EnrichOptions{GraphDir: graphDir}) + if err != nil { + t.Fatal(err) + } + + // Add a second file, re-index, re-enrich. + if err := os.WriteFile(filepath.Join(dir, "B.java"), []byte("class B {}"), 0o644); err != nil { + t.Fatal(err) + } + if _, err := a.Run(dir); err != nil { + t.Fatal(err) + } + second, err := analyzer.Enrich(dir, c, analyzer.EnrichOptions{GraphDir: graphDir}) + if err != nil { + t.Fatalf("rerun after change: %v", err) + } + if second.ShortCircuited { + t.Fatal("rerun after file add must NOT short-circuit") + } + if second.Nodes < first.Nodes { + t.Fatalf("rerun produced fewer nodes (%d) than first (%d)", second.Nodes, first.Nodes) + } +} + // TestEnrichFixtureMinimalProducesGraph runs the full index → enrich pipeline // against the fixture-minimal corpus and asserts the resulting graph has at // least the entity / endpoint / service nodes the fixture is expected to diff --git a/internal/graph/refresh.go b/internal/graph/refresh.go index be58155c..ee9392b7 100644 --- a/internal/graph/refresh.go +++ b/internal/graph/refresh.go @@ -53,6 +53,22 @@ func (s *Store) InsertFile(path string, nodes []*model.CodeNode, edges []*model. return nil } +// Reset clears every CodeNode (and its incident edges via DETACH DELETE) +// plus every GraphMeta row, while preserving the schema. Used before a +// full re-enrich on a graph that already holds prior state, so the +// subsequent BulkLoad doesn't collide with stale primary keys. +// +// Calling Reset on a fresh (already empty) graph is a no-op. +func (s *Store) Reset() error { + if _, err := s.Cypher(`MATCH (n:CodeNode) DETACH DELETE n`); err != nil { + return fmt.Errorf("graph: reset CodeNode: %w", err) + } + if _, err := s.Cypher(`MATCH (m:GraphMeta) DELETE m`); err != nil { + return fmt.Errorf("graph: reset GraphMeta: %w", err) + } + return nil +} + // WipeLinkerEdges deletes every relationship whose source property is in // the given sources set, across every declared rel table. Used by // incremental enrich to clear previous linker emissions before re-running diff --git a/internal/graph/refresh_test.go b/internal/graph/refresh_test.go index 34058057..046a663d 100644 --- a/internal/graph/refresh_test.go +++ b/internal/graph/refresh_test.go @@ -198,6 +198,50 @@ func TestWipeLinkerEdgesAlsoDropsLinkerNodes(t *testing.T) { } } +func TestResetClearsAllData(t *testing.T) { + s := openSchemaStore(t) + defer s.Close() + if err := s.BulkLoadNodes([]*model.CodeNode{ + {ID: "n1", Kind: model.NodeClass, Label: "A", FilePath: "A.java"}, + {ID: "n2", Kind: model.NodeClass, Label: "B", FilePath: "B.java"}, + }); err != nil { + t.Fatal(err) + } + if err := s.WriteManifest("h1"); err != nil { + t.Fatal(err) + } + if countCodeNodes(t, s) != 2 { + t.Fatal("seed didn't take") + } + if err := s.Reset(); err != nil { + t.Fatalf("Reset: %v", err) + } + if countCodeNodes(t, s) != 0 { + t.Fatal("Reset left CodeNodes") + } + got, _ := s.ReadManifest() + if got != "" { + t.Fatalf("Reset left manifest %q", got) + } + // Schema must still be usable: we can bulk-load again without re-ApplySchema. + if err := s.BulkLoadNodes([]*model.CodeNode{ + {ID: "x", Kind: model.NodeClass, Label: "X", FilePath: "X.java"}, + }); err != nil { + t.Fatalf("BulkLoadNodes after Reset: %v", err) + } + if countCodeNodes(t, s) != 1 { + t.Fatal("post-Reset BulkLoad didn't take") + } +} + +func TestResetOnFreshGraphIsNoop(t *testing.T) { + s := openSchemaStore(t) + defer s.Close() + if err := s.Reset(); err != nil { + t.Fatalf("Reset on fresh graph: %v", err) + } +} + func TestReplaceFileSwapsContent(t *testing.T) { s := openSchemaStore(t) defer s.Close() From 9f8f88988fbecdcb6cf1bdb9e712e336ba1db0a2 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 15 May 2026 07:17:42 +0000 Subject: [PATCH 11/12] feat(cli): index/enrich --force, enrich --diff, codeiq diff subcommand --- internal/cli/diff.go | 86 ++++++++++++++++++++++++++++++++++++++++++ internal/cli/enrich.go | 56 +++++++++++++++++++++++---- internal/cli/index.go | 15 ++++++++ 3 files changed, 149 insertions(+), 8 deletions(-) create mode 100644 internal/cli/diff.go diff --git a/internal/cli/diff.go b/internal/cli/diff.go new file mode 100644 index 00000000..b7463242 --- /dev/null +++ b/internal/cli/diff.go @@ -0,0 +1,86 @@ +package cli + +import ( + "encoding/json" + "fmt" + "path/filepath" + + "github.com/randomcodespace/codeiq/internal/analyzer" + "github.com/randomcodespace/codeiq/internal/cache" + "github.com/randomcodespace/codeiq/internal/detector" + "github.com/spf13/cobra" +) + +func init() { + registerSubcommand(func() *cobra.Command { + var cachePath string + cmd := &cobra.Command{ + Use: "diff [path]", + Short: "Show the cache vs disk delta without touching the graph.", + Long: `Walk the project at [path] and classify each file against the +SQLite analysis cache: + + - Added -- on disk, not in cache + - Modified -- path in cache but content hash differs from disk + - Deleted -- in cache, missing from disk + - Unchanged -- path + content hash match cache exactly + +Useful for previewing what an incremental ` + "`codeiq index`" + ` / +` + "`codeiq enrich`" + ` run would do. The cache is not modified.`, + Example: ` codeiq diff . + codeiq diff /path/to/repo + codeiq diff /path/to/repo --cache-path /tmp/scratch.sqlite`, + Args: cobra.MaximumNArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + root, err := resolvePath(args) + if err != nil { + return err + } + cp := cachePath + if cp == "" { + cp = filepath.Join(root, ".codeiq", "cache", "codeiq.sqlite") + } + c, err := cache.Open(cp) + if err != nil { + return fmt.Errorf("open cache %s: %w", cp, err) + } + defer c.Close() + a := analyzer.NewAnalyzer(analyzer.Options{ + Cache: c, + Registry: detector.Default, + }) + d, err := a.Diff(root) + if err != nil { + return err + } + out := map[string]any{ + "added": stringsOrEmpty(d.Added), + "modified": stringsOrEmpty(d.Modified), + "deleted": stringsOrEmpty(d.Deleted), + "unchanged": stringsOrEmpty(d.Unchanged), + "counts": map[string]int{ + "added": len(d.Added), + "modified": len(d.Modified), + "deleted": len(d.Deleted), + "unchanged": len(d.Unchanged), + }, + } + enc := json.NewEncoder(cmd.OutOrStdout()) + enc.SetIndent("", " ") + return enc.Encode(out) + }, + } + cmd.Flags().StringVar(&cachePath, "cache-path", "", + "Path to the cache file (default: /.codeiq/cache/codeiq.sqlite).") + return cmd + }) +} + +// stringsOrEmpty replaces a nil slice with an empty one so JSON output is +// `[]` instead of `null` for empty buckets. +func stringsOrEmpty(s []string) []string { + if s == nil { + return []string{} + } + return s +} diff --git a/internal/cli/enrich.go b/internal/cli/enrich.go index 67dbffff..e592c866 100644 --- a/internal/cli/enrich.go +++ b/internal/cli/enrich.go @@ -1,6 +1,7 @@ package cli import ( + "encoding/json" "fmt" "os" "path/filepath" @@ -9,16 +10,19 @@ import ( "github.com/randomcodespace/codeiq/internal/analyzer" "github.com/randomcodespace/codeiq/internal/cache" + "github.com/randomcodespace/codeiq/internal/detector" "github.com/spf13/cobra" ) func init() { registerSubcommand(func() *cobra.Command { var ( - graphDir string - memProfile string - maxBufferPool int64 - copyThreads int + graphDir string + memProfile string + maxBufferPool int64 + copyThreads int + force bool + diffOnly bool ) cmd := &cobra.Command{ Use: "enrich [path]", @@ -56,7 +60,34 @@ become available and the stdio MCP server can serve clients.`, return fmt.Errorf("open cache %s: %w", cachePath, err) } defer c.Close() - opts := analyzer.EnrichOptions{GraphDir: graphDir} + + // --diff: print Diff against the cache as JSON and exit. + // Does not touch the graph. Useful for previewing what an + // incremental enrich would do. + if diffOnly { + a := analyzer.NewAnalyzer(analyzer.Options{Cache: c, Registry: detector.Default}) + d, dErr := a.Diff(root) + if dErr != nil { + return dErr + } + out := map[string]any{ + "added": d.Added, + "modified": d.Modified, + "deleted": d.Deleted, + "unchanged": d.Unchanged, + "counts": map[string]int{ + "added": len(d.Added), + "modified": len(d.Modified), + "deleted": len(d.Deleted), + "unchanged": len(d.Unchanged), + }, + } + enc := json.NewEncoder(cmd.OutOrStdout()) + enc.SetIndent("", " ") + return enc.Encode(out) + } + + opts := analyzer.EnrichOptions{GraphDir: graphDir, Force: force} if maxBufferPool > 0 { opts.StoreBufferPoolBytes = uint64(maxBufferPool) } @@ -79,9 +110,14 @@ become available and the stdio MCP server can serve clients.`, } fmt.Fprintf(cmd.ErrOrStderr(), "heap profile written to %s\n", memProfile) } - fmt.Fprintf(cmd.OutOrStdout(), - "enrich complete: %d nodes, %d edges, %d services\n", - summary.Nodes, summary.Edges, summary.Services) + if summary.ShortCircuited { + fmt.Fprintln(cmd.OutOrStdout(), + "enrich short-circuited: graph already matches cache manifest") + } else { + fmt.Fprintf(cmd.OutOrStdout(), + "enrich complete: %d nodes, %d edges, %d services\n", + summary.Nodes, summary.Edges, summary.Services) + } return nil }, } @@ -93,6 +129,10 @@ become available and the stdio MCP server can serve clients.`, "Cap Kuzu BufferPoolSize in bytes (default: 2 GiB; 0 means default).") cmd.Flags().IntVar(©Threads, "copy-threads", 0, "Cap Kuzu COPY FROM parallelism (default: min(4, GOMAXPROCS); 0 means default).") + cmd.Flags().BoolVar(&force, "force", false, + "Bypass the incremental short-circuit; rebuild the graph from scratch.") + cmd.Flags().BoolVar(&diffOnly, "diff", false, + "Print the cache vs disk delta as JSON and exit without touching the graph.") return cmd }) } diff --git a/internal/cli/index.go b/internal/cli/index.go index 2505990b..197e499f 100644 --- a/internal/cli/index.go +++ b/internal/cli/index.go @@ -22,6 +22,7 @@ func init() { var ( batchSize int workers int + force bool ) cmd := &cobra.Command{ Use: "index [path]", @@ -68,6 +69,7 @@ Java and Python.`, Registry: detector.Default, BatchSize: batchSize, Workers: workers, + Force: force, }) stats, err := a.Run(abs) if err != nil { @@ -81,6 +83,17 @@ Java and Python.`, "Deduped: %d nodes, %d edges Dropped: %d phantom edges\n", stats.DedupedNodes, stats.DedupedEdges, stats.DroppedEdges) } + // Incremental counters are only meaningful when the cache was + // consulted (i.e. not --force). Print them when any of them is + // non-zero so unchanged re-runs see "Unchanged: N (100%)". + if !force && (stats.Added+stats.Modified+stats.Deleted+stats.Unchanged) > 0 { + line := fmt.Sprintf("Added: %d Modified: %d Deleted: %d Unchanged: %d Cache hits: %d", + stats.Added, stats.Modified, stats.Deleted, stats.Unchanged, stats.CacheHits) + if stats.Files > 0 { + line += fmt.Sprintf(" (%.1f%%)", 100.0*float64(stats.CacheHits)/float64(stats.Files)) + } + fmt.Fprintln(cmd.OutOrStdout(), line) + } return nil }, } @@ -88,6 +101,8 @@ Java and Python.`, "Number of files processed per batch (default: 500).") cmd.Flags().IntVarP(&workers, "workers", "w", 0, "Worker goroutine count (default: 2 * GOMAXPROCS).") + cmd.Flags().BoolVar(&force, "force", false, + "Bypass the incremental cache; re-parse every file even when the content hash hasn't changed.") return cmd }) } From 8c39591ba1da9e044a6c416c86fa52b507f1ffcb Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 15 May 2026 07:21:01 +0000 Subject: [PATCH 12/12] test(integration): incremental==full determinism + idempotence + delete-then-add --- .../analyzer/incremental_integration_test.go | 265 ++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 internal/analyzer/incremental_integration_test.go diff --git a/internal/analyzer/incremental_integration_test.go b/internal/analyzer/incremental_integration_test.go new file mode 100644 index 00000000..0e045b26 --- /dev/null +++ b/internal/analyzer/incremental_integration_test.go @@ -0,0 +1,265 @@ +package analyzer_test + +import ( + "os" + "path/filepath" + "sort" + "testing" + + "github.com/randomcodespace/codeiq/internal/analyzer" + "github.com/randomcodespace/codeiq/internal/cache" + "github.com/randomcodespace/codeiq/internal/graph" +) + +// graphSnapshot returns a stable, comparable representation of the graph: +// every node by id+kind+label and every edge by id+kind+source+target, +// sorted. Excludes anything timestamped or otherwise legitimately variable. +func graphSnapshot(t *testing.T, graphDir string) (nodes []string, edges []string) { + t.Helper() + s, err := graph.OpenReadOnly(graphDir, 0) + if err != nil { + t.Fatal(err) + } + defer s.Close() + nodeRows, err := s.Cypher( + `MATCH (n:CodeNode) RETURN n.id AS id, n.kind AS kind, n.label AS label ORDER BY n.id`) + if err != nil { + t.Fatal(err) + } + for _, r := range nodeRows { + nodes = append(nodes, + asString(r["id"])+"|"+asString(r["kind"])+"|"+asString(r["label"])) + } + edgeRows, err := s.Cypher( + `MATCH (a:CodeNode)-[r]->(b:CodeNode) + RETURN r.id AS id, a.id AS src, b.id AS tgt ORDER BY r.id, a.id, b.id`) + if err != nil { + t.Fatal(err) + } + for _, r := range edgeRows { + edges = append(edges, asString(r["id"])+"|"+asString(r["src"])+"|"+asString(r["tgt"])) + } + sort.Strings(nodes) + sort.Strings(edges) + return nodes, edges +} + +func asString(v any) string { + if s, ok := v.(string); ok { + return s + } + return "" +} + +// TestIncrementalEqualsFull is the core determinism gate: a sequence of +// incremental runs (file add → modify → delete) must produce a graph +// identical (modulo metadata) to a clean full rebuild on the final state. +// +// Scenario: +// Stage 1: write A.java + B.java + C.java, full index + enrich. +// Stage 2: modify B.java, delete C.java, add D.java, index + enrich. +// Stage 3: blow away .codeiq/, run index + enrich --force on the final tree. +// Assert: snapshots from Stage 2 and Stage 3 are identical. +func TestIncrementalEqualsFull(t *testing.T) { + if testing.Short() { + t.Skip("incremental integration test; -short") + } + src := t.TempDir() + mustWrite := func(rel, content string) { + t.Helper() + full := filepath.Join(src, rel) + if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(full, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + } + + cachePath := filepath.Join(src, ".codeiq", "cache.sqlite") + graphDir := filepath.Join(src, ".codeiq", "graph", "codeiq.kuzu") + if err := os.MkdirAll(filepath.Dir(cachePath), 0o755); err != nil { + t.Fatal(err) + } + + indexAndEnrich := func() { + t.Helper() + c, err := cache.Open(cachePath) + if err != nil { + t.Fatal(err) + } + defer c.Close() + a := analyzer.NewAnalyzer(analyzer.Options{Cache: c, Workers: 1}) + if _, err := a.Run(src); err != nil { + t.Fatal(err) + } + if _, err := analyzer.Enrich(src, c, analyzer.EnrichOptions{GraphDir: graphDir}); err != nil { + t.Fatal(err) + } + } + + // Stage 1: initial corpus. + mustWrite("A.java", "public class A {}") + mustWrite("B.java", "public class B {}") + mustWrite("C.java", "public class C {}") + indexAndEnrich() + + // Stage 2: modify B, delete C, add D — incremental on top of Stage 1. + if err := os.WriteFile(filepath.Join(src, "B.java"), []byte("public class B { int v = 2; }"), 0o644); err != nil { + t.Fatal(err) + } + if err := os.Remove(filepath.Join(src, "C.java")); err != nil { + t.Fatal(err) + } + mustWrite("D.java", "public class D {}") + indexAndEnrich() + incNodes, incEdges := graphSnapshot(t, graphDir) + + // Stage 3: blow away .codeiq/ entirely, run again from scratch on the + // same final filesystem. Use --force on enrich for belt-and-braces. + if err := os.RemoveAll(filepath.Join(src, ".codeiq")); err != nil { + t.Fatal(err) + } + if err := os.MkdirAll(filepath.Dir(cachePath), 0o755); err != nil { + t.Fatal(err) + } + c, err := cache.Open(cachePath) + if err != nil { + t.Fatal(err) + } + a := analyzer.NewAnalyzer(analyzer.Options{Cache: c, Workers: 1, Force: true}) + if _, err := a.Run(src); err != nil { + t.Fatal(err) + } + if _, err := analyzer.Enrich(src, c, analyzer.EnrichOptions{GraphDir: graphDir, Force: true}); err != nil { + t.Fatal(err) + } + c.Close() + fullNodes, fullEdges := graphSnapshot(t, graphDir) + + compareSnap := func(label string, got, want []string) { + t.Helper() + if len(got) != len(want) { + t.Errorf("%s: len mismatch incremental=%d full=%d", label, len(got), len(want)) + } + // Identify the symmetric difference. + gotSet := make(map[string]struct{}, len(got)) + for _, v := range got { + gotSet[v] = struct{}{} + } + wantSet := make(map[string]struct{}, len(want)) + for _, v := range want { + wantSet[v] = struct{}{} + } + for v := range gotSet { + if _, ok := wantSet[v]; !ok { + t.Errorf("%s only in incremental: %q", label, v) + } + } + for v := range wantSet { + if _, ok := gotSet[v]; !ok { + t.Errorf("%s only in full: %q", label, v) + } + } + } + compareSnap("nodes", incNodes, fullNodes) + compareSnap("edges", incEdges, fullEdges) +} + +// TestIncrementalRerunIsIdempotent — three successive `index → enrich` +// runs against an unchanged tree produce identical graphs and the 2nd/3rd +// enrichments short-circuit. +func TestIncrementalRerunIsIdempotent(t *testing.T) { + src := t.TempDir() + if err := os.WriteFile(filepath.Join(src, "X.java"), []byte("class X {}"), 0o644); err != nil { + t.Fatal(err) + } + cachePath := filepath.Join(src, ".codeiq", "cache.sqlite") + graphDir := filepath.Join(src, ".codeiq", "graph", "codeiq.kuzu") + if err := os.MkdirAll(filepath.Dir(cachePath), 0o755); err != nil { + t.Fatal(err) + } + + var summaries []analyzer.EnrichSummary + for i := 0; i < 3; i++ { + c, err := cache.Open(cachePath) + if err != nil { + t.Fatal(err) + } + a := analyzer.NewAnalyzer(analyzer.Options{Cache: c, Workers: 1}) + if _, err := a.Run(src); err != nil { + t.Fatal(err) + } + summary, err := analyzer.Enrich(src, c, analyzer.EnrichOptions{GraphDir: graphDir}) + if err != nil { + t.Fatalf("enrich #%d: %v", i, err) + } + c.Close() + summaries = append(summaries, summary) + } + + if summaries[0].ShortCircuited { + t.Fatal("first enrich short-circuited; want full") + } + if !summaries[1].ShortCircuited { + t.Fatal("second enrich did NOT short-circuit") + } + if !summaries[2].ShortCircuited { + t.Fatal("third enrich did NOT short-circuit") + } +} + +// TestIncrementalAcrossDeleteThenAdd — delete a file, re-index, add it +// back with the same content, re-index. The cache should reflect the +// transition correctly (path purged on delete, re-added with same hash). +func TestIncrementalAcrossDeleteThenAdd(t *testing.T) { + src := t.TempDir() + cachePath := filepath.Join(src, ".codeiq", "cache.sqlite") + if err := os.MkdirAll(filepath.Dir(cachePath), 0o755); err != nil { + t.Fatal(err) + } + + mustWrite := func(rel, content string) { + full := filepath.Join(src, rel) + if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(full, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + } + + runIndex := func() *cache.Cache { + c, err := cache.Open(cachePath) + if err != nil { + t.Fatal(err) + } + a := analyzer.NewAnalyzer(analyzer.Options{Cache: c, Workers: 1}) + if _, err := a.Run(src); err != nil { + t.Fatal(err) + } + return c + } + + mustWrite("A.java", "class A {}") + c := runIndex() + c.Close() + + // Delete the file. Next index should purge it from the cache. + if err := os.Remove(filepath.Join(src, "A.java")); err != nil { + t.Fatal(err) + } + c = runIndex() + if _, _, ok := c.GetFileByPath("A.java"); ok { + t.Fatal("deleted file still in cache after re-index") + } + c.Close() + + // Recreate the same file. Next index should re-add it. + mustWrite("A.java", "class A {}") + c = runIndex() + if _, _, ok := c.GetFileByPath("A.java"); !ok { + t.Fatal("re-added file missing from cache") + } + c.Close() +}