RandomCodeSpace · aksOps · May 15, 2026 · May 15, 2026 · May 15, 2026 · May 15, 2026
diff --git a/internal/analyzer/analyzer.go b/internal/analyzer/analyzer.go
@@ -5,6 +5,7 @@ import (
 	"os"
 	"runtime"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/randomcodespace/codeiq/internal/cache"
@@ -19,13 +20,19 @@ const DefaultBatchSize = 500
 type Options struct {
 	Cache     *cache.Cache
 	Registry  *detector.Registry
-	BatchSize int // defaults to DefaultBatchSize
-	Workers   int // defaults to 2 * GOMAXPROCS
+	BatchSize int  // defaults to DefaultBatchSize
+	Workers   int  // defaults to 2 * GOMAXPROCS
+	Force     bool // skip the Diff/purge pre-pass and the cache-hit fast path so every file is re-parsed
 }
 
 // Analyzer orchestrates the index pipeline.
 type Analyzer struct {
-	opts Options
+	opts    Options
+	counter runCounter // per-run counters; Run zeroes cacheHits before doing any work
+}
+
+// runCounter holds the counters an Analyzer accumulates during a single Run.
+type runCounter struct {
+	cacheHits atomic.Int64 // files whose cached emissions were reused instead of re-parsing
 }
 
 // NewAnalyzer returns an analyzer wired to opts.
@@ -47,20 +54,48 @@ func NewAnalyzer(opts Options) *Analyzer {
 // Plan §1.5 — DedupedNodes/DedupedEdges/DroppedEdges expose dedup activity
 // so operators can see "graph collapsed 312 duplicate nodes, dropped 14
 // phantom edges" — the visibility is what makes "meaningful" diagnosable.
+//
+// Added/Modified/Deleted/Unchanged/CacheHits are incremental counters;
+// they are all zero on `--force` runs and when no cache is configured.
 type Stats struct {
 	Files        int
 	Nodes        int
 	Edges        int
 	DedupedNodes int
 	DedupedEdges int
 	DroppedEdges int
+	Added        int // len(Delta.Added): files on disk with no cache row
+	Modified     int // len(Delta.Modified): cached paths whose content hash changed
+	Deleted      int // len(Delta.Deleted): cached paths missing from disk
+	Unchanged    int // len(Delta.Unchanged): path and content hash both match the cache
+	CacheHits    int // files whose emissions were reused from the cache instead of re-parsed
 }
 
 // Run executes FileDiscovery → parse → detectors → GraphBuilder → cache writes
 // and returns aggregate stats. Errors from individual file processing are
 // logged to stderr but do not stop the run — partial output is better than no
 // output (matches Java's per-file try/catch behaviour).
+//
+// On non-Force runs with a cache present, Run first runs Diff() to classify
+// files, purges cache rows for deleted files, then proceeds. processFile
+// skips parse+detect for UNCHANGED files (content_hash hit in cache).
 func (a *Analyzer) Run(root string) (Stats, error) {
+	a.counter.cacheHits.Store(0)
+
+	var d Delta
+	if a.opts.Cache != nil && !a.opts.Force {
+		var err error
+		d, err = a.Diff(root)
+		if err != nil {
+			return Stats{}, err
+		}
+		for _, path := range d.Deleted {
+			if err := a.opts.Cache.PurgeByPath(path); err != nil {
+				fmt.Fprintf(os.Stderr, "codeiq: purge %s: %v\n", path, err)
+			}
+		}
+	}
+
 	disc := NewFileDiscovery()
 	files, err := disc.Discover(root)
 	if err != nil {
@@ -99,6 +134,11 @@ func (a *Analyzer) Run(root string) (Stats, error) {
 		DedupedNodes: snap.DedupedNodes,
 		DedupedEdges: snap.DedupedEdges,
 		DroppedEdges: snap.DroppedEdges,
+		Added:        len(d.Added),
+		Modified:     len(d.Modified),
+		Deleted:      len(d.Deleted),
+		Unchanged:    len(d.Unchanged),
+		CacheHits:    int(a.counter.cacheHits.Load()),
 	}, nil
 }
 
@@ -108,6 +148,18 @@ func (a *Analyzer) processFile(f DiscoveredFile, gb *GraphBuilder) error {
 		return err
 	}
 	hash := cache.HashString(string(content))
+
+	// Fast path: cache hit. Reuse the previous emissions; skip parse+detect.
+	if a.opts.Cache != nil && !a.opts.Force && a.opts.Cache.Has(hash) {
+		entry, gerr := a.opts.Cache.Get(hash)
+		if gerr == nil && entry != nil {
+			gb.Add(&detector.Result{Nodes: entry.Nodes, Edges: entry.Edges})
+			a.counter.cacheHits.Add(1)
+			return nil
+		}
+		// Has() true but Get() failed — pathological. Fall through to re-parse.
+	}
+
 	tree, err := parser.Parse(f.Language, content)
 	if err != nil {
 		// Continue with regex-only detectors when the parser bails — matches
@@ -142,6 +194,9 @@ func (a *Analyzer) processFile(f DiscoveredFile, gb *GraphBuilder) error {
 		entry.Edges = append(entry.Edges, r.Edges...)
 	}
 	if a.opts.Cache != nil {
+		// MODIFIED files: purge prior (path, old_hash) row so a single path
+		// never has two cache entries.
+		_ = a.opts.Cache.PurgeByPath(f.RelPath)
 		if err := a.opts.Cache.Put(entry); err != nil {
 			return fmt.Errorf("cache put: %w", err)
 		}

diff --git a/internal/analyzer/analyzer_test.go b/internal/analyzer/analyzer_test.go
@@ -66,6 +66,134 @@ func TestAnalyzerEndToEnd(t *testing.T) {
 	}
 }
 
+func TestStatsHasIncrementalCounters(t *testing.T) {
+	var s Stats
+	// Compile-time check that the new fields exist with the expected names.
+	_ = s.Added
+	_ = s.Modified
+	_ = s.Deleted
+	_ = s.Unchanged
+	_ = s.CacheHits
+}
+
+func TestProcessFileSkipsOnCacheHit(t *testing.T) {
+	root := t.TempDir()
+	cachePath := filepath.Join(root, ".codeiq", "cache.sqlite")
+	if err := os.MkdirAll(filepath.Dir(cachePath), 0o755); err != nil {
+		t.Fatal(err)
+	}
+	src := "public class A {}"
+	if err := os.WriteFile(filepath.Join(root, "A.java"), []byte(src), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	c, err := cache.Open(cachePath)
+	if err != nil {
+		t.Fatalf("cache: %v", err)
+	}
+	defer c.Close()
+
+	// Seed the cache with a row for this content hash. processFile MUST
+	// not re-parse the file when its hash already lives in the cache.
+	if err := c.Put(&cache.Entry{
+		ContentHash: cache.HashString(src),
+		Path:        "A.java",
+		Language:    "java",
+		ParsedAt:    "2026-01-01T00:00:00Z",
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	a := NewAnalyzer(Options{Cache: c, Registry: detector.Default, Workers: 1})
+	stats, err := a.Run(root)
+	if err != nil {
+		t.Fatalf("run: %v", err)
+	}
+	if stats.CacheHits != 1 {
+		t.Fatalf("CacheHits = %d, want 1", stats.CacheHits)
+	}
+	if stats.Files != 1 {
+		t.Fatalf("Files = %d, want 1", stats.Files)
+	}
+	if stats.Unchanged != 1 {
+		t.Fatalf("Unchanged = %d, want 1", stats.Unchanged)
+	}
+}
+
+// TestForceBypassesCacheHit seeds the cache with an entry matching A.java's
+// content hash and then runs the analyzer with Force set. Even though a
+// matching row exists, the run must report zero cache hits.
+func TestForceBypassesCacheHit(t *testing.T) {
+	root := t.TempDir()
+	cachePath := filepath.Join(root, ".codeiq", "cache.sqlite")
+	if err := os.MkdirAll(filepath.Dir(cachePath), 0o755); err != nil {
+		t.Fatal(err)
+	}
+	src := "public class A {}"
+	if err := os.WriteFile(filepath.Join(root, "A.java"), []byte(src), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	c, err := cache.Open(cachePath)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer c.Close()
+
+	// The seed has to succeed for this test to mean anything: with no row in
+	// the cache, CacheHits == 0 would hold whether or not Force is honoured.
+	hash := cache.HashString(src)
+	if err := c.Put(&cache.Entry{
+		ContentHash: hash,
+		Path:        "A.java",
+		Language:    "java",
+		ParsedAt:    "t",
+	}); err != nil {
+		t.Fatalf("seed cache: %v", err)
+	}
+	if !c.Has(hash) {
+		t.Fatal("seed didn't take")
+	}
+
+	a := NewAnalyzer(Options{Cache: c, Registry: detector.Default, Workers: 1, Force: true})
+	stats, err := a.Run(root)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if stats.CacheHits != 0 {
+		t.Fatalf("Force=true should bypass cache; CacheHits = %d", stats.CacheHits)
+	}
+}
+
+func TestRunPurgesDeletedFiles(t *testing.T) {
+	root := t.TempDir()
+	cachePath := filepath.Join(root, ".codeiq", "cache.sqlite")
+	if err := os.MkdirAll(filepath.Dir(cachePath), 0o755); err != nil {
+		t.Fatal(err)
+	}
+	c, err := cache.Open(cachePath)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer c.Close()
+
+	// Seed a phantom file that's gone from disk.
+	if err := c.Put(&cache.Entry{
+		ContentHash: "ghost-hash",
+		Path:        "deleted.java",
+		Language:    "java",
+		ParsedAt:    "t",
+	}); err != nil {
+		t.Fatal(err)
+	}
+	if !c.Has("ghost-hash") {
+		t.Fatal("seed didn't take")
+	}
+	if err := os.WriteFile(filepath.Join(root, "real.java"), []byte("class R {}"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	a := NewAnalyzer(Options{Cache: c, Registry: detector.Default, Workers: 1})
+	stats, err := a.Run(root)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if c.Has("ghost-hash") {
+		t.Fatal("deleted file's cache row not purged")
+	}
+	if stats.Deleted != 1 {
+		t.Fatalf("Deleted = %d, want 1", stats.Deleted)
+	}
+}
+
 func TestAnalyzerDeterminism(t *testing.T) {
 	dir := t.TempDir()
 	if err := os.WriteFile(filepath.Join(dir, "UserController.java"), []byte(fixtureJava), 0644); err != nil {

diff --git a/internal/analyzer/diff.go b/internal/analyzer/diff.go
@@ -0,0 +1,66 @@
+package analyzer
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/randomcodespace/codeiq/internal/cache"
+)
+
+// Delta is the result of comparing the on-disk file set to the cache state.
+// All slices are sorted by path (FileDiscovery sorts; AllFiles iterates in
+// path order) so callers can rely on stable order.
+//
+// Added, Modified and Unchanged hold the RelPath of discovered files;
+// Deleted holds paths as they are stored in the cache. A discovered file
+// that Diff cannot read is logged to stderr and placed in none of the
+// buckets; it still counts as present on disk, so it is not reported as
+// Deleted.
+type Delta struct {
+	Added     []string // on disk, not in cache
+	Modified  []string // path in cache but content_hash differs from disk
+	Deleted   []string // in cache, missing from disk
+	Unchanged []string // path + content_hash match cache exactly
+}
+
+// Diff walks the project root via FileDiscovery and classifies each file
+// against the cache by comparing the hash of its current content with the
+// hash the cache holds for the same relative path. Each readable file costs
+// one read and one hash; nothing is parsed and no detector runs.
+//
+// Files that cannot be read are logged to stderr and left out of every
+// bucket. They still count as present on disk, so they are never reported
+// as Deleted.
+//
+// On success every bucket of the returned Delta is a non-nil slice, empty
+// when there is no work in it. On error the zero Delta is returned; this
+// happens when no cache is configured, when file discovery fails, or when
+// listing the cached paths fails.
+func (a *Analyzer) Diff(root string) (Delta, error) {
+	if a.opts.Cache == nil {
+		return Delta{}, fmt.Errorf("diff: cache is required")
+	}
+	disc := NewFileDiscovery()
+	files, err := disc.Discover(root)
+	if err != nil {
+		return Delta{}, fmt.Errorf("file discovery: %w", err)
+	}
+
+	// Start from empty, non-nil slices so the documented "empty, not nil"
+	// contract holds for buckets that never receive an entry.
+	d := Delta{
+		Added:     []string{},
+		Modified:  []string{},
+		Deleted:   []string{},
+		Unchanged: []string{},
+	}
+
+	seen := make(map[string]bool, len(files))
+	for _, f := range files {
+		// Mark the path before reading: an unreadable file still exists on
+		// disk and must not be classified as Deleted below.
+		seen[f.RelPath] = true
+		content, err := os.ReadFile(f.AbsPath)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "codeiq: diff: %s: %v\n", f.RelPath, err)
+			continue
+		}
+		curHash := cache.HashString(string(content))
+		cachedHash, _, ok := a.opts.Cache.GetFileByPath(f.RelPath)
+		switch {
+		case !ok:
+			d.Added = append(d.Added, f.RelPath)
+		case cachedHash == curHash:
+			d.Unchanged = append(d.Unchanged, f.RelPath)
+		default:
+			d.Modified = append(d.Modified, f.RelPath)
+		}
+	}
+
+	// Any cached path that discovery did not produce is gone from disk.
+	if err := a.opts.Cache.AllFiles(func(path, _ string) error {
+		if !seen[path] {
+			d.Deleted = append(d.Deleted, path)
+		}
+		return nil
+	}); err != nil {
+		return Delta{}, fmt.Errorf("diff: list cached files: %w", err)
+	}
+	return d, nil
+}