Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 58 additions & 3 deletions internal/analyzer/analyzer.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"os"
"runtime"
"sync"
"sync/atomic"
"time"

"github.com/randomcodespace/codeiq/internal/cache"
Expand All @@ -19,13 +20,19 @@ const DefaultBatchSize = 500
type Options struct {
	// Cache is the optional analysis cache. When nil, Run skips the
	// incremental Diff step and processFile neither reads nor writes
	// cache rows.
	Cache *cache.Cache
	// Registry is the detector registry handed to the analyzer.
	// NOTE(review): where it is consulted is outside the visible hunks — confirm.
	Registry *detector.Registry

	BatchSize int  // defaults to DefaultBatchSize
	Workers   int  // defaults to 2 * GOMAXPROCS
	Force     bool // bypass cache early-exit; re-parse every file
}

// Analyzer orchestrates the index pipeline.
type Analyzer struct {
	// opts is the configuration the analyzer was constructed with.
	opts Options
	// counter holds per-run counters; Run resets cacheHits on entry and
	// reports its final value in Stats.CacheHits.
	counter runCounter
}

// runCounter groups the counters an Analyzer accumulates during one Run.
type runCounter struct {
	// cacheHits counts files whose cached emissions were reused by
	// processFile instead of being re-parsed. Run stores 0 into it on entry
	// and loads it into Stats.CacheHits when building the result. It is an
	// atomic so it can be bumped without a lock.
	cacheHits atomic.Int64
}

// NewAnalyzer returns an analyzer wired to opts.
Expand All @@ -47,20 +54,48 @@ func NewAnalyzer(opts Options) *Analyzer {
// Plan §1.5 — DedupedNodes/DedupedEdges/DroppedEdges expose dedup activity
// so operators can see "graph collapsed 312 duplicate nodes, dropped 14
// phantom edges" — the visibility is what makes "meaningful" diagnosable.
//
// Added/Modified/Deleted/Unchanged/CacheHits are incremental counters;
// they stay zero on full `--force` runs and when no cache is configured.
type Stats struct {
	// NOTE(review): Files/Nodes/Edges are assigned in a part of Run that is
	// outside the visible hunk — presumably discovered-file, graph-node and
	// graph-edge totals; confirm against Run.
	Files int
	Nodes int
	Edges int

	// Dedup activity, copied by Run from snap.DedupedNodes,
	// snap.DedupedEdges and snap.DroppedEdges respectively.
	DedupedNodes int
	DedupedEdges int
	DroppedEdges int

	// Sizes of the matching Delta buckets produced by the Diff call at the
	// start of Run; all zero when that Diff step is skipped.
	Added     int
	Modified  int
	Deleted   int
	Unchanged int

	// CacheHits is the number of files for which processFile reused cached
	// nodes and edges instead of parsing and running detectors.
	CacheHits int
}

// Run executes FileDiscovery → parse → detectors → GraphBuilder → cache writes
// and returns aggregate stats. Errors from individual file processing are
// logged to stderr but do not stop the run — partial output is better than no
// output (matches Java's per-file try/catch behaviour).
//
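// On non-Force runs with a cache present, Run first runs Diff() to classify
// files, purges cache rows for deleted files (purge failures are logged to
// stderr, not returned), then proceeds. processFile skips parse+detect for
// any file whose content_hash is already in the cache; the lookup is by
// hash only, so it is not limited to files Diff classified as UNCHANGED.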
func (a *Analyzer) Run(root string) (Stats, error) {
a.counter.cacheHits.Store(0)

var d Delta
if a.opts.Cache != nil && !a.opts.Force {
var err error
d, err = a.Diff(root)
if err != nil {
return Stats{}, err
}
for _, path := range d.Deleted {
if err := a.opts.Cache.PurgeByPath(path); err != nil {
fmt.Fprintf(os.Stderr, "codeiq: purge %s: %v\n", path, err)
}
}
}

disc := NewFileDiscovery()
files, err := disc.Discover(root)
if err != nil {
Expand Down Expand Up @@ -99,6 +134,11 @@ func (a *Analyzer) Run(root string) (Stats, error) {
DedupedNodes: snap.DedupedNodes,
DedupedEdges: snap.DedupedEdges,
DroppedEdges: snap.DroppedEdges,
Added: len(d.Added),
Modified: len(d.Modified),
Deleted: len(d.Deleted),
Unchanged: len(d.Unchanged),
CacheHits: int(a.counter.cacheHits.Load()),
}, nil
}

Expand All @@ -108,6 +148,18 @@ func (a *Analyzer) processFile(f DiscoveredFile, gb *GraphBuilder) error {
return err
}
hash := cache.HashString(string(content))

// Fast path: cache hit. Reuse the previous emissions; skip parse+detect.
if a.opts.Cache != nil && !a.opts.Force && a.opts.Cache.Has(hash) {
entry, gerr := a.opts.Cache.Get(hash)
if gerr == nil && entry != nil {
gb.Add(&detector.Result{Nodes: entry.Nodes, Edges: entry.Edges})
a.counter.cacheHits.Add(1)
return nil
}
// Has() true but Get() failed — pathological. Fall through to re-parse.
}

tree, err := parser.Parse(f.Language, content)
if err != nil {
// Continue with regex-only detectors when the parser bails — matches
Expand Down Expand Up @@ -142,6 +194,9 @@ func (a *Analyzer) processFile(f DiscoveredFile, gb *GraphBuilder) error {
entry.Edges = append(entry.Edges, r.Edges...)
}
if a.opts.Cache != nil {
// MODIFIED files: purge prior (path, old_hash) row so a single path
// never has two cache entries.
_ = a.opts.Cache.PurgeByPath(f.RelPath)
if err := a.opts.Cache.Put(entry); err != nil {
return fmt.Errorf("cache put: %w", err)
}
Expand Down
128 changes: 128 additions & 0 deletions internal/analyzer/analyzer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,134 @@ func TestAnalyzerEndToEnd(t *testing.T) {
}
}

// TestStatsHasIncrementalCounters is a compile-time guard for the
// incremental fields on Stats. It asserts nothing at run time; it only has
// to build.
func TestStatsHasIncrementalCounters(t *testing.T) {
	// A keyed literal naming each field stops compiling if any of them is
	// renamed or removed.
	_ = Stats{
		Added:     0,
		Modified:  0,
		Deleted:   0,
		Unchanged: 0,
		CacheHits: 0,
	}
}

// TestProcessFileSkipsOnCacheHit pre-populates the cache with a row keyed
// by the exact content hash of the only source file, then checks that a
// non-Force Run reports that file as one cache hit and as Unchanged.
func TestProcessFileSkipsOnCacheHit(t *testing.T) {
	const javaSrc = "public class A {}"

	projectDir := t.TempDir()
	cacheDir := filepath.Join(projectDir, ".codeiq")
	if mkErr := os.MkdirAll(cacheDir, 0o755); mkErr != nil {
		t.Fatal(mkErr)
	}
	srcPath := filepath.Join(projectDir, "A.java")
	if wrErr := os.WriteFile(srcPath, []byte(javaSrc), 0o644); wrErr != nil {
		t.Fatal(wrErr)
	}

	store, openErr := cache.Open(filepath.Join(cacheDir, "cache.sqlite"))
	if openErr != nil {
		t.Fatalf("cache: %v", openErr)
	}
	defer store.Close()

	// The seeded row carries the same content hash the analyzer will
	// compute for A.java, so the run must take the cache fast path rather
	// than re-parse.
	seed := &cache.Entry{
		ContentHash: cache.HashString(javaSrc),
		Path:        "A.java",
		Language:    "java",
		ParsedAt:    "2026-01-01T00:00:00Z",
	}
	if putErr := store.Put(seed); putErr != nil {
		t.Fatal(putErr)
	}

	analyzer := NewAnalyzer(Options{Cache: store, Registry: detector.Default, Workers: 1})
	got, runErr := analyzer.Run(projectDir)
	if runErr != nil {
		t.Fatalf("run: %v", runErr)
	}

	if got.CacheHits != 1 {
		t.Fatalf("CacheHits = %d, want 1", got.CacheHits)
	}
	if got.Files != 1 {
		t.Fatalf("Files = %d, want 1", got.Files)
	}
	if got.Unchanged != 1 {
		t.Fatalf("Unchanged = %d, want 1", got.Unchanged)
	}
}

// TestForceBypassesCacheHit seeds the cache with a row matching A.java's
// content hash and verifies that a Run with Force set does not count a
// cache hit, i.e. the fast path is not taken.
func TestForceBypassesCacheHit(t *testing.T) {
	root := t.TempDir()
	cachePath := filepath.Join(root, ".codeiq", "cache.sqlite")
	if err := os.MkdirAll(filepath.Dir(cachePath), 0o755); err != nil {
		t.Fatal(err)
	}
	src := "public class A {}"
	if err := os.WriteFile(filepath.Join(root, "A.java"), []byte(src), 0o644); err != nil {
		t.Fatal(err)
	}

	c, err := cache.Open(cachePath)
	if err != nil {
		t.Fatal(err)
	}
	defer c.Close()

	// The seed must succeed: without the row in place CacheHits would be 0
	// whether or not Force is honoured, and the test would pass vacuously.
	if err := c.Put(&cache.Entry{
		ContentHash: cache.HashString(src),
		Path:        "A.java",
		Language:    "java",
		ParsedAt:    "t",
	}); err != nil {
		t.Fatal(err)
	}

	a := NewAnalyzer(Options{Cache: c, Registry: detector.Default, Workers: 1, Force: true})
	stats, err := a.Run(root)
	if err != nil {
		t.Fatal(err)
	}
	if stats.CacheHits != 0 {
		t.Fatalf("Force=true should bypass cache; CacheHits = %d", stats.CacheHits)
	}
}

// TestRunPurgesDeletedFiles seeds a cache row for a path that does not
// exist on disk, runs the analyzer over a tree containing one unrelated
// file, and expects the stale row to be gone and Stats.Deleted to be 1.
func TestRunPurgesDeletedFiles(t *testing.T) {
	const ghostHash = "ghost-hash"

	projectDir := t.TempDir()
	cacheDir := filepath.Join(projectDir, ".codeiq")
	if mkErr := os.MkdirAll(cacheDir, 0o755); mkErr != nil {
		t.Fatal(mkErr)
	}
	store, openErr := cache.Open(filepath.Join(cacheDir, "cache.sqlite"))
	if openErr != nil {
		t.Fatal(openErr)
	}
	defer store.Close()

	// Cache row for a file that is no longer present in the project tree.
	ghost := &cache.Entry{
		ContentHash: ghostHash,
		Path:        "deleted.java",
		Language:    "java",
		ParsedAt:    "t",
	}
	if putErr := store.Put(ghost); putErr != nil {
		t.Fatal(putErr)
	}
	if seeded := store.Has(ghostHash); !seeded {
		t.Fatal("seed didn't take")
	}

	// One real file so the run has something to discover.
	realPath := filepath.Join(projectDir, "real.java")
	if wrErr := os.WriteFile(realPath, []byte("class R {}"), 0o644); wrErr != nil {
		t.Fatal(wrErr)
	}

	analyzer := NewAnalyzer(Options{Cache: store, Registry: detector.Default, Workers: 1})
	got, runErr := analyzer.Run(projectDir)
	if runErr != nil {
		t.Fatal(runErr)
	}

	if stillCached := store.Has(ghostHash); stillCached {
		t.Fatal("deleted file's cache row not purged")
	}
	if got.Deleted != 1 {
		t.Fatalf("Deleted = %d, want 1", got.Deleted)
	}
}

func TestAnalyzerDeterminism(t *testing.T) {
dir := t.TempDir()
if err := os.WriteFile(filepath.Join(dir, "UserController.java"), []byte(fixtureJava), 0644); err != nil {
Expand Down
66 changes: 66 additions & 0 deletions internal/analyzer/diff.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package analyzer

import (
"fmt"
"os"

"github.com/randomcodespace/codeiq/internal/cache"
)

// Delta is the result of comparing the on-disk file set to the cache state.
// All slices are sorted by path (FileDiscovery sorts; AllFiles iterates in
// path order) so callers can rely on stable order.
// NOTE(review): that ordering depends on Discover and AllFiles, neither of
// which is visible here — confirm before relying on it.
//
// Entries are project-relative paths. Diff puts each discovered file it can
// read into exactly one of Added, Modified or Unchanged; Deleted holds
// cached paths that discovery did not return. A discovered file that cannot
// be read appears in no bucket.
type Delta struct {
	Added     []string // on disk, not in cache
	Modified  []string // path in cache but content_hash differs from disk
	Deleted   []string // in cache, missing from disk
	Unchanged []string // path + content_hash match cache exactly
}

// Diff walks the project root via FileDiscovery and classifies every
// discovered file against the cache:
//
//   - Added:     the path has no cache row.
//   - Unchanged: the cached content hash equals the hash of the bytes on disk.
//   - Modified:  the path is cached under a different content hash.
//   - Deleted:   the path is in the cache but was not discovered on disk.
//
// Each discovered file is read and hashed once; nothing is parsed or
// detected. A file that cannot be read is reported on stderr and left out of
// Added/Modified/Unchanged; because it was still discovered, it is not
// reported as Deleted either.
//
// Every bucket of the returned Delta is a non-nil slice, empty when there is
// no work in it. Diff returns an error when no cache is configured, when
// file discovery fails, or when enumerating the cached files fails.
func (a *Analyzer) Diff(root string) (Delta, error) {
	// Start from empty (non-nil) buckets so the result honours the
	// documented "empty, not nil" contract even when nothing is appended.
	d := Delta{
		Added:     []string{},
		Modified:  []string{},
		Deleted:   []string{},
		Unchanged: []string{},
	}
	if a.opts.Cache == nil {
		return d, fmt.Errorf("diff: cache is required")
	}
	disc := NewFileDiscovery()
	files, err := disc.Discover(root)
	if err != nil {
		return d, fmt.Errorf("file discovery: %w", err)
	}

	// seen records every discovered path, readable or not, so the second
	// pass can tell which cached paths no longer exist on disk.
	seen := make(map[string]struct{}, len(files))
	for _, f := range files {
		seen[f.RelPath] = struct{}{}
		content, err := os.ReadFile(f.AbsPath)
		if err != nil {
			// Best effort: one unreadable file must not abort the diff.
			fmt.Fprintf(os.Stderr, "codeiq: diff: %s: %v\n", f.RelPath, err)
			continue
		}
		curHash := cache.HashString(string(content))
		cachedHash, _, ok := a.opts.Cache.GetFileByPath(f.RelPath)
		switch {
		case !ok:
			d.Added = append(d.Added, f.RelPath)
		case cachedHash == curHash:
			d.Unchanged = append(d.Unchanged, f.RelPath)
		default:
			d.Modified = append(d.Modified, f.RelPath)
		}
	}

	// Any cached path that discovery did not return has been removed from
	// the project since the cache was written.
	if err := a.opts.Cache.AllFiles(func(path, _ string) error {
		if _, onDisk := seen[path]; !onDisk {
			d.Deleted = append(d.Deleted, path)
		}
		return nil
	}); err != nil {
		return d, err
	}
	return d, nil
}
Loading