From 7f502340fb45943322ad79fd95cd8598baccd6e6 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Wed, 13 May 2026 09:25:14 +0000 Subject: [PATCH] fix(detector): anchor nodes for bicep/dockerfile/shell/proto imports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five remaining detectors emitted file-path → external-name edges without anchor CodeNodes, so GraphBuilder.Snapshot's phantom-drop filter dropped every such edge. Apply base.EnsureFileAnchor + base.EnsureExternalAnchor (the pattern from Phase 4 TS/Python/Rust/C++). Detectors updated: - iac/bicep.go — 1 anchor pair (module depends_on) - iac/dockerfile.go — 1 anchor pair (FROM depends_on) - script/shell/bash.go — 2 anchor pairs (source imports + tool calls) - script/shell/powershell.go — 1 anchor pair, emitted at two call sites (Import-Module + dot-source) - proto/structure.go — 1 anchor pair (proto import) Each gets a survives-snapshot test asserting that MODULE + EXTERNAL anchor nodes and at least one edge of the relevant kind are present in the detector result. Smoke: fixture-multi-lang phantom drop count = 4 (unchanged; fixture contains no bicep/dockerfile/shell/proto files, so these detectors don't fire on it — drop count reflects pre-existing Java/Python/TS phantom edges not in scope for this fix).
Co-Authored-By: Claude Sonnet 4.6 --- go/internal/detector/iac/bicep.go | 8 +++- go/internal/detector/iac/bicep_test.go | 34 ++++++++++++++ go/internal/detector/iac/dockerfile.go | 8 +++- go/internal/detector/iac/dockerfile_test.go | 34 ++++++++++++++ go/internal/detector/proto/structure.go | 9 +++- go/internal/detector/proto/structure_test.go | 34 ++++++++++++++ go/internal/detector/script/shell/bash.go | 11 ++++- .../detector/script/shell/bash_test.go | 46 +++++++++++++++++-- .../detector/script/shell/powershell.go | 14 ++++-- .../detector/script/shell/powershell_test.go | 35 ++++++++++++++ 10 files changed, 220 insertions(+), 13 deletions(-) diff --git a/go/internal/detector/iac/bicep.go b/go/internal/detector/iac/bicep.go index d17af155..39550df8 100644 --- a/go/internal/detector/iac/bicep.go +++ b/go/internal/detector/iac/bicep.go @@ -36,6 +36,7 @@ func (d BicepDetector) Detect(ctx *detector.Context) *detector.Result { var edges []*model.CodeEdge fp := ctx.FilePath lines := strings.Split(text, "\n") + seen := map[string]bool{} for i, line := range lines { if m := bicepResourceRE.FindStringSubmatch(line); len(m) >= 3 { @@ -84,7 +85,12 @@ func (d BicepDetector) Detect(ctx *detector.Context) *detector.Result { n.Properties["module_path"] = modPath nodes = append(nodes, n) - e := model.NewCodeEdge(fp+":depends_on:"+modPath, model.EdgeDependsOn, fp, modPath) + // Emit anchor nodes so the depends_on edge survives GraphBuilder's + // phantom-drop filter. Without anchors, fp and modPath are free-form + // strings that don't match any CodeNode. 
+ srcID := base.EnsureFileAnchor(ctx, "bicep", "BicepDetector", model.ConfidenceLexical, &nodes, seen) + tgtID := base.EnsureExternalAnchor(modPath, "bicep:external", "BicepDetector", model.ConfidenceLexical, &nodes, seen) + e := model.NewCodeEdge(srcID+":depends_on:"+tgtID, model.EdgeDependsOn, srcID, tgtID) e.Source = "BicepDetector" e.Properties["module_name"] = name edges = append(edges, e) diff --git a/go/internal/detector/iac/bicep_test.go b/go/internal/detector/iac/bicep_test.go index 855a4750..6ce31972 100644 --- a/go/internal/detector/iac/bicep_test.go +++ b/go/internal/detector/iac/bicep_test.go @@ -71,6 +71,40 @@ func TestBicepNegative(t *testing.T) { } } +// TestBicepImports_EdgeSurvivesSnapshot verifies that the anchor nodes emitted +// alongside module depends_on edges are present in the detector result, so +// GraphBuilder.Snapshot's phantom-drop filter does not discard them. +func TestBicepImports_EdgeSurvivesSnapshot(t *testing.T) { + d := NewBicepDetector() + r := d.Detect(&detector.Context{FilePath: "main.bicep", Language: "bicep", Content: bicepSource}) + + var moduleNodes, externalNodes int + for _, n := range r.Nodes { + switch n.Kind { + case model.NodeModule: + moduleNodes++ + case model.NodeExternal: + externalNodes++ + } + } + if moduleNodes == 0 { + t.Fatal("expected at least one MODULE anchor node for the file endpoint") + } + if externalNodes == 0 { + t.Fatal("expected at least one EXTERNAL anchor node for the module target") + } + + dependsEdges := 0 + for _, e := range r.Edges { + if e.Kind == model.EdgeDependsOn { + dependsEdges++ + } + } + if dependsEdges == 0 { + t.Fatal("expected at least one surviving depends_on edge, got 0") + } +} + func TestBicepDeterminism(t *testing.T) { d := NewBicepDetector() ctx := &detector.Context{FilePath: "main.bicep", Language: "bicep", Content: bicepSource} diff --git a/go/internal/detector/iac/dockerfile.go b/go/internal/detector/iac/dockerfile.go index 7db26591..baa03519 100644 --- 
a/go/internal/detector/iac/dockerfile.go +++ b/go/internal/detector/iac/dockerfile.go @@ -43,6 +43,7 @@ func (d DockerfileDetector) Detect(ctx *detector.Context) *detector.Result { var nodes []*model.CodeNode var edges []*model.CodeEdge fp := ctx.FilePath + seen := map[string]bool{} // Stage tracking — alias → node id, plus offsets so we can resolve which // FROM is the *current* stage at any byte offset later in the file. @@ -86,7 +87,12 @@ func (d DockerfileDetector) Detect(ctx *detector.Context) *detector.Result { fromOffsets = append(fromOffsets, fromOffset{offset: m[0], nodeIndex: len(nodes)}) nodes = append(nodes, n) - e := model.NewCodeEdge(fp+":depends_on:"+image, model.EdgeDependsOn, fp, image) + // Emit anchor nodes so the depends_on edge survives GraphBuilder's + // phantom-drop filter. Without anchors, fp and image are free-form + // strings that don't match any CodeNode. + srcID := base.EnsureFileAnchor(ctx, "dockerfile", "DockerfileDetector", model.ConfidenceLexical, &nodes, seen) + tgtID := base.EnsureExternalAnchor(image, "docker:image", "DockerfileDetector", model.ConfidenceLexical, &nodes, seen) + e := model.NewCodeEdge(srcID+":depends_on:"+tgtID, model.EdgeDependsOn, srcID, tgtID) e.Source = "DockerfileDetector" edges = append(edges, e) } diff --git a/go/internal/detector/iac/dockerfile_test.go b/go/internal/detector/iac/dockerfile_test.go index cca571db..85bd4c04 100644 --- a/go/internal/detector/iac/dockerfile_test.go +++ b/go/internal/detector/iac/dockerfile_test.go @@ -78,6 +78,40 @@ func TestDockerfileNegative(t *testing.T) { } } +// TestDockerfileImports_EdgeSurvivesSnapshot verifies that the anchor nodes +// emitted alongside FROM depends_on edges are present in the detector result, +// so GraphBuilder.Snapshot's phantom-drop filter does not discard them. 
+func TestDockerfileImports_EdgeSurvivesSnapshot(t *testing.T) { + d := NewDockerfileDetector() + r := d.Detect(&detector.Context{FilePath: "Dockerfile", Language: "dockerfile", Content: dockerfileSource}) + + var moduleNodes, externalNodes int + for _, n := range r.Nodes { + switch n.Kind { + case model.NodeModule: + moduleNodes++ + case model.NodeExternal: + externalNodes++ + } + } + if moduleNodes == 0 { + t.Fatal("expected at least one MODULE anchor node for the file endpoint") + } + if externalNodes == 0 { + t.Fatal("expected at least one EXTERNAL anchor node for the image target") + } + + dependsEdges := 0 + for _, e := range r.Edges { + if e.Kind == model.EdgeDependsOn { + dependsEdges++ + } + } + if dependsEdges == 0 { + t.Fatal("expected at least one surviving depends_on edge, got 0") + } +} + func TestDockerfileDeterminism(t *testing.T) { d := NewDockerfileDetector() ctx := &detector.Context{FilePath: "Dockerfile", Language: "dockerfile", Content: dockerfileSource} diff --git a/go/internal/detector/proto/structure.go b/go/internal/detector/proto/structure.go index a65c5022..334c00fb 100644 --- a/go/internal/detector/proto/structure.go +++ b/go/internal/detector/proto/structure.go @@ -39,6 +39,7 @@ func (d StructureDetector) Detect(ctx *detector.Context) *detector.Result { var edges []*model.CodeEdge fp := ctx.FilePath lines := strings.Split(text, "\n") + seen := map[string]bool{} // Package (first match only) for i, line := range lines { @@ -55,11 +56,15 @@ func (d StructureDetector) Detect(ctx *detector.Context) *detector.Result { } } - // Imports + // Imports — emit anchor nodes so the imports edge survives GraphBuilder's + // phantom-drop filter. Without anchors, fp and imp are free-form strings + // that don't match any CodeNode. 
for _, line := range lines { if m := protoImportRE.FindStringSubmatch(line); len(m) >= 2 { imp := m[1] - e := model.NewCodeEdge(fp+":imports:"+imp, model.EdgeImports, fp, imp) + srcID := base.EnsureFileAnchor(ctx, "proto", "ProtoStructureDetector", model.ConfidenceLexical, &nodes, seen) + tgtID := base.EnsureExternalAnchor(imp, "proto:external", "ProtoStructureDetector", model.ConfidenceLexical, &nodes, seen) + e := model.NewCodeEdge(srcID+":imports:"+tgtID, model.EdgeImports, srcID, tgtID) e.Source = "ProtoStructureDetector" edges = append(edges, e) } diff --git a/go/internal/detector/proto/structure_test.go b/go/internal/detector/proto/structure_test.go index 5328b107..bac1193d 100644 --- a/go/internal/detector/proto/structure_test.go +++ b/go/internal/detector/proto/structure_test.go @@ -80,6 +80,40 @@ func TestProtoNegative(t *testing.T) { } } +// TestProtoImports_EdgeSurvivesSnapshot verifies that the anchor nodes emitted +// alongside proto import edges are present in the detector result, so +// GraphBuilder.Snapshot's phantom-drop filter does not discard them. 
+func TestProtoImports_EdgeSurvivesSnapshot(t *testing.T) { + d := NewStructureDetector() + r := d.Detect(&detector.Context{FilePath: "api.proto", Language: "proto", Content: protoSource}) + + var moduleNodes, externalNodes int + for _, n := range r.Nodes { + switch n.Kind { + case model.NodeModule: + moduleNodes++ + case model.NodeExternal: + externalNodes++ + } + } + if moduleNodes == 0 { + t.Fatal("expected at least one MODULE anchor node for the file endpoint") + } + if externalNodes == 0 { + t.Fatal("expected at least one EXTERNAL anchor node for the import target") + } + + importEdges := 0 + for _, e := range r.Edges { + if e.Kind == model.EdgeImports { + importEdges++ + } + } + if importEdges == 0 { + t.Fatal("expected at least one surviving imports edge, got 0") + } +} + func TestProtoDeterminism(t *testing.T) { d := NewStructureDetector() ctx := &detector.Context{FilePath: "api.proto", Language: "proto", Content: protoSource} diff --git a/go/internal/detector/script/shell/bash.go b/go/internal/detector/script/shell/bash.go index a8076e3f..48d92db3 100644 --- a/go/internal/detector/script/shell/bash.go +++ b/go/internal/detector/script/shell/bash.go @@ -39,6 +39,7 @@ func (d BashDetector) Detect(ctx *detector.Context) *detector.Result { var edges []*model.CodeEdge fp := ctx.FilePath lines := strings.Split(text, "\n") + seen := map[string]bool{} // Shebang → MODULE node for the script if len(lines) > 0 { @@ -69,9 +70,12 @@ func (d BashDetector) Detect(ctx *detector.Context) *detector.Result { } // source ./lib.sh / . helpers.sh + // Emit anchor nodes so the imports edge survives GraphBuilder's phantom-drop. 
if m := bashSourceRE.FindStringSubmatch(line); len(m) >= 2 { src := m[1] - e := model.NewCodeEdge(fp+":sources:"+src, model.EdgeImports, fp, src) + srcID := base.EnsureFileAnchor(ctx, "bash", "BashDetector", model.ConfidenceLexical, &nodes, seen) + tgtID := base.EnsureExternalAnchor(src, "bash:external", "BashDetector", model.ConfidenceLexical, &nodes, seen) + e := model.NewCodeEdge(srcID+":sources:"+tgtID, model.EdgeImports, srcID, tgtID) e.Source = "BashDetector" edges = append(edges, e) } @@ -89,6 +93,7 @@ func (d BashDetector) Detect(ctx *detector.Context) *detector.Result { } // Tool calls — dedup across the whole file, skip comments + // Emit anchor nodes so the calls edges survive GraphBuilder's phantom-drop. toolsSeen := map[string]bool{} for _, line := range lines { stripped := strings.TrimLeft(line, " \t") @@ -101,7 +106,9 @@ func (d BashDetector) Detect(ctx *detector.Context) *detector.Result { continue } toolsSeen[tool] = true - e := model.NewCodeEdge(fp+":calls:"+tool, model.EdgeCalls, fp, tool) + srcID := base.EnsureFileAnchor(ctx, "bash", "BashDetector", model.ConfidenceLexical, &nodes, seen) + tgtID := base.EnsureExternalAnchor(tool, "bash:tool", "BashDetector", model.ConfidenceLexical, &nodes, seen) + e := model.NewCodeEdge(srcID+":calls:"+tgtID, model.EdgeCalls, srcID, tgtID) e.Source = "BashDetector" e.Properties["tool"] = tool edges = append(edges, e) diff --git a/go/internal/detector/script/shell/bash_test.go b/go/internal/detector/script/shell/bash_test.go index a1a4a466..aeb379cb 100644 --- a/go/internal/detector/script/shell/bash_test.go +++ b/go/internal/detector/script/shell/bash_test.go @@ -34,9 +34,9 @@ func TestBashPositive(t *testing.T) { for _, n := range r.Nodes { kinds[n.Kind]++ } - // 1 shebang module - if kinds[model.NodeModule] != 1 { - t.Errorf("expected 1 MODULE (shebang), got %d", kinds[model.NodeModule]) + // 1 shebang module + 1 file-anchor module (emitted by import/calls anchor helpers) + if kinds[model.NodeModule] != 2 { + 
t.Errorf("expected 2 MODULE (shebang + file anchor), got %d", kinds[model.NodeModule]) } // 2 functions (deploy, cleanup) if kinds[model.NodeMethod] != 2 { @@ -75,6 +75,46 @@ func TestBashNegative(t *testing.T) { } } +// TestBashImports_EdgeSurvivesSnapshot verifies that the anchor nodes emitted +// alongside source/calls edges are present in the detector result, so +// GraphBuilder.Snapshot's phantom-drop filter does not discard them. +func TestBashImports_EdgeSurvivesSnapshot(t *testing.T) { + d := NewBashDetector() + r := d.Detect(&detector.Context{FilePath: "deploy.sh", Language: "bash", Content: bashSource}) + + var moduleNodes, externalNodes int + for _, n := range r.Nodes { + switch n.Kind { + case model.NodeModule: + moduleNodes++ + case model.NodeExternal: + externalNodes++ + } + } + if moduleNodes == 0 { + t.Fatal("expected at least one MODULE anchor node for the file endpoint") + } + if externalNodes == 0 { + t.Fatal("expected at least one EXTERNAL anchor node for imported/called targets") + } + + var importEdges, callEdges int + for _, e := range r.Edges { + switch e.Kind { + case model.EdgeImports: + importEdges++ + case model.EdgeCalls: + callEdges++ + } + } + if importEdges == 0 { + t.Fatal("expected at least one surviving imports edge, got 0") + } + if callEdges == 0 { + t.Fatal("expected at least one surviving calls edge, got 0") + } +} + func TestBashDeterminism(t *testing.T) { d := NewBashDetector() ctx := &detector.Context{FilePath: "deploy.sh", Language: "bash", Content: bashSource} diff --git a/go/internal/detector/script/shell/powershell.go b/go/internal/detector/script/shell/powershell.go index 05015353..6c256065 100644 --- a/go/internal/detector/script/shell/powershell.go +++ b/go/internal/detector/script/shell/powershell.go @@ -39,6 +39,7 @@ func (d PowerShellDetector) Detect(ctx *detector.Context) *detector.Result { var edges []*model.CodeEdge fp := ctx.FilePath lines := strings.Split(text, "\n") + seen := map[string]bool{} for i, line := 
range lines { // Functions @@ -66,18 +67,23 @@ func (d PowerShellDetector) Detect(ctx *detector.Context) *detector.Result { nodes = append(nodes, n) } - // Import-Module + // Import-Module — emit anchor nodes so the imports edge survives + // GraphBuilder's phantom-drop filter. if m := psImportRE.FindStringSubmatch(line); len(m) >= 2 { imp := m[1] - e := model.NewCodeEdge(fp+":imports:"+imp, model.EdgeImports, fp, imp) + srcID := base.EnsureFileAnchor(ctx, "powershell", "PowerShellDetector", model.ConfidenceLexical, &nodes, seen) + tgtID := base.EnsureExternalAnchor(imp, "powershell:external", "PowerShellDetector", model.ConfidenceLexical, &nodes, seen) + e := model.NewCodeEdge(srcID+":imports:"+tgtID, model.EdgeImports, srcID, tgtID) e.Source = "PowerShellDetector" edges = append(edges, e) } - // . path\to\file.ps1 + // . path\to\file.ps1 — emit anchor nodes so the imports edge survives. if m := psDotSourceRE.FindStringSubmatch(line); len(m) >= 2 { src := m[1] - e := model.NewCodeEdge(fp+":dotsource:"+src, model.EdgeImports, fp, src) + srcID := base.EnsureFileAnchor(ctx, "powershell", "PowerShellDetector", model.ConfidenceLexical, &nodes, seen) + tgtID := base.EnsureExternalAnchor(src, "powershell:external", "PowerShellDetector", model.ConfidenceLexical, &nodes, seen) + e := model.NewCodeEdge(srcID+":dotsource:"+tgtID, model.EdgeImports, srcID, tgtID) e.Source = "PowerShellDetector" edges = append(edges, e) } diff --git a/go/internal/detector/script/shell/powershell_test.go b/go/internal/detector/script/shell/powershell_test.go index eb64efb1..aba90ab4 100644 --- a/go/internal/detector/script/shell/powershell_test.go +++ b/go/internal/detector/script/shell/powershell_test.go @@ -76,6 +76,41 @@ func TestPowerShellNegative(t *testing.T) { } } +// TestPowerShellImports_EdgeSurvivesSnapshot verifies that the anchor nodes +// emitted alongside Import-Module/dot-source imports edges are present in the +// detector result, so GraphBuilder.Snapshot's phantom-drop filter 
does not +// discard them. +func TestPowerShellImports_EdgeSurvivesSnapshot(t *testing.T) { + d := NewPowerShellDetector() + r := d.Detect(&detector.Context{FilePath: "Deploy.ps1", Language: "powershell", Content: psSource}) + + var moduleNodes, externalNodes int + for _, n := range r.Nodes { + switch n.Kind { + case model.NodeModule: + moduleNodes++ + case model.NodeExternal: + externalNodes++ + } + } + if moduleNodes == 0 { + t.Fatal("expected at least one MODULE anchor node for the file endpoint") + } + if externalNodes == 0 { + t.Fatal("expected at least one EXTERNAL anchor node for import targets") + } + + importEdges := 0 + for _, e := range r.Edges { + if e.Kind == model.EdgeImports { + importEdges++ + } + } + if importEdges == 0 { + t.Fatal("expected at least one surviving imports edge, got 0") + } +} + func TestPowerShellDeterminism(t *testing.T) { d := NewPowerShellDetector() ctx := &detector.Context{FilePath: "Deploy.ps1", Language: "powershell", Content: psSource}