diff --git a/go/internal/graph/bulk.go b/go/internal/graph/bulk.go index 7de54114..a5ee8427 100644 --- a/go/internal/graph/bulk.go +++ b/go/internal/graph/bulk.go @@ -101,9 +101,17 @@ func (s *Store) copyNodeBatch(batch []*model.CodeNode) error { // Kuzu COPY FROM with explicit column list. ToSlash for Windows path // portability — Kuzu's parser accepts forward slashes on all platforms. - // DELIM='|' matches the pipe-separated staging file written above. + // + // DELIM='|' matches the pipe-separated staging file written above. The + // explicit QUOTE/ESCAPE pair overrides Kuzu's default backslash-escape + // behaviour with RFC-4180 (doubled-quote) escaping so that Go's + // encoding/csv writer (which emits "field""with""quotes" form) round- + // trips correctly. Fields containing the delimiter (e.g. Istio service + // names like "inbound|7070|tcplocal|s1tcp.none") are wrapped by the Go + // writer; Kuzu then dequotes them only when the matching escape rule is + // set. q := fmt.Sprintf( - "COPY CodeNode(%s) FROM '%s' (header=false, DELIM='|')", + `COPY CodeNode(%s) FROM '%s' (header=false, DELIM='|', QUOTE='"', ESCAPE='"')`, strings.Join(nodeColumns, ", "), filepath.ToSlash(tmp.Name()), ) @@ -263,9 +271,10 @@ func (s *Store) copyEdgeBatch(kind model.EdgeKind, batch []*model.CodeEdge) erro return fmt.Errorf("graph: csv close: %w", err) } - // DELIM='|' matches the pipe-separated staging file written above. + // DELIM/QUOTE/ESCAPE — see copyNodeBatch for the rationale (RFC-4180 + // round-trip with Go's encoding/csv). 
q := fmt.Sprintf( - "COPY %s FROM '%s' (header=false, DELIM='|')", + `COPY %s FROM '%s' (header=false, DELIM='|', QUOTE='"', ESCAPE='"')`, relTableName(kind), filepath.ToSlash(tmp.Name()), ) diff --git a/go/internal/graph/bulk_test.go b/go/internal/graph/bulk_test.go index 2ae4f7cc..47bf6302 100644 --- a/go/internal/graph/bulk_test.go +++ b/go/internal/graph/bulk_test.go @@ -223,6 +223,52 @@ func TestBulkLoadNodesCommaInProperties(t *testing.T) { } } +// TestBulkLoadEdgesPipeInTargetID is a regression test for Istio-style IDs +// that contain the field delimiter '|' literally (e.g. EDS cluster names +// "inbound|7070|tcplocal|s1tcp.none" parsed from JSON config). Go's csv.Writer +// RFC-4180-wraps such fields in '"', but Kuzu's default ESCAPE is backslash, +// not doubled-quote — so without explicit QUOTE='"', ESCAPE='"' the COPY +// FROM splits the wrapped field on each interior '|' and aborts with +// "expected N values per row, but got more". Fix: explicit QUOTE/ESCAPE in +// the COPY FROM clause. +func TestBulkLoadEdgesPipeInTargetID(t *testing.T) { + s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu")) + if err != nil { + t.Fatal(err) + } + defer s.Close() + if err := s.ApplySchema(); err != nil { + t.Fatal(err) + } + // Istio-flavoured target ID with literal pipes. 
+ target := "json:istio/none_cds.json:inbound|7070|tcplocal|s1tcp.none" + nodes := []*model.CodeNode{ + {ID: "json:istio/none_cds.json", Kind: model.NodeModule, Label: "none_cds.json"}, + {ID: target, Kind: model.NodeConfigKey, Label: "inbound|7070|tcplocal|s1tcp.none"}, + } + if err := s.BulkLoadNodes(nodes); err != nil { + t.Fatalf("BulkLoadNodes with pipe-bearing ID: %v", err) + } + edges := []*model.CodeEdge{{ + ID: "json:istio/none_cds.json->" + target, + Kind: model.EdgeContains, + SourceID: "json:istio/none_cds.json", + TargetID: target, + Confidence: model.ConfidenceSyntactic, + Source: "JsonStructureDetector", + }} + if err := s.BulkLoadEdges(edges); err != nil { + t.Fatalf("BulkLoadEdges with pipe-bearing target ID: %v", err) + } + rows, err := s.Cypher("MATCH ()-[r:CONTAINS]->() RETURN r.id AS id") + if err != nil { + t.Fatal(err) + } + if len(rows) != 1 { + t.Fatalf("want 1 CONTAINS row, got %d: %v", len(rows), rows) + } +} + // TestBulkLoadEdgesEmpty — zero edges is a no-op like the node path. func TestBulkLoadEdgesEmpty(t *testing.T) { s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu"))