From d8fd9f9e753c8b57add2de65cc37556fdab01879 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Wed, 13 May 2026 15:26:51 +0000 Subject: [PATCH] fix(parser): unquote TOML keys and section headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apache Airflow's `.cherry_picker.toml` uses TOML's quoted-key form: "check_sha" = "..." `parseTOML` was reading the LHS as raw text, including the literal quotes. The TomlStructureDetector then emitted node IDs like `toml:.cherry_picker.toml:"check_sha"` while the CONTAINS edges (and any downstream lookup) referenced primary keys in a different shape, which the bulk load could not resolve — Kuzu's BulkLoad aborted with: Copy exception: Unable to find primary key value "toml:.cherry_picker.toml:""check_sha""" The same bug affected quoted `["quoted-section"]` section headers. Fix both: call the existing `unquote` helper on the key/section before storing. Regression tests added in structured_test.go (new file). End-to-end: `codeiq enrich ~/projects/polyglot-bench/airflow` now exits 0 (was exit 2): 95k nodes, 246k edges, 165 services loaded. Co-Authored-By: Claude Opus 4.7 --- go/internal/parser/structured.go | 4 +- go/internal/parser/structured_test.go | 60 +++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 go/internal/parser/structured_test.go diff --git a/go/internal/parser/structured.go b/go/internal/parser/structured.go index 968d50bf..d76c8aa1 100644 --- a/go/internal/parser/structured.go +++ b/go/internal/parser/structured.go @@ -162,7 +162,7 @@ func parseTOML(source []byte) ParsedEnvelope { continue } if strings.HasPrefix(raw, "[") && strings.HasSuffix(raw, "]") { - section := strings.TrimSpace(raw[1 : len(raw)-1]) + section := unquote(strings.TrimSpace(raw[1 : len(raw)-1])) currentSection = section // Walk into a nested map; only create the top-level section in // data — nested namespacing is preserved by the dotted key. 
@@ -176,7 +176,7 @@ func parseTOML(source []byte) ParsedEnvelope { if eq <= 0 { continue } - key := strings.TrimSpace(raw[:eq]) + key := unquote(strings.TrimSpace(raw[:eq])) val := strings.TrimSpace(raw[eq+1:]) val = unquote(val) if currentSection == "" { diff --git a/go/internal/parser/structured_test.go b/go/internal/parser/structured_test.go new file mode 100644 index 00000000..ac7975a6 --- /dev/null +++ b/go/internal/parser/structured_test.go @@ -0,0 +1,60 @@ +package parser + +import ( + "testing" +) + +func TestParseTOMLUnquotesKeys(t *testing.T) { + // `.cherry_picker.toml` in apache/airflow has `"check_sha" = "..."` — + // a quoted top-level key. Pre-fix the key string included the literal + // quotes which propagated into node IDs like + // `toml:.cherry_picker.toml:"check_sha"`, and CONTAINS edges then + // referenced PKs that the bulk-load couldn't resolve. + src := []byte(`team = "apache" +repo = "airflow" +"check_sha" = "abc123" +'literal_key' = "single-quoted" +`) + env := parseTOML(src) + data, ok := env["data"].(map[string]any) + if !ok { + t.Fatalf("envelope missing data map: %#v", env) + } + for k, want := range map[string]string{ + "team": "apache", + "repo": "airflow", + "check_sha": "abc123", + "literal_key": "single-quoted", + } { + got, ok := data[k].(string) + if !ok { + t.Errorf("key %q missing or non-string: %#v", k, data[k]) + continue + } + if got != want { + t.Errorf("data[%q] = %q, want %q", k, got, want) + } + } + // Negative: a quoted form must NOT appear as its own key. + for _, badKey := range []string{`"check_sha"`, `'literal_key'`} { + if _, exists := data[badKey]; exists { + t.Errorf("data still has quote-bearing key %q — unquote not applied", badKey) + } + } +} + +func TestParseTOMLUnquotesSectionHeaders(t *testing.T) { + // Less common in practice, but TOML spec allows `["foo.bar"]` quoted + // section headers. Same fix applies — unquote before using as map key. 
+ src := []byte(`["quoted-section"] +inner = "v" +`) + env := parseTOML(src) + data := env["data"].(map[string]any) + if _, ok := data["quoted-section"]; !ok { + t.Errorf("missing top-level section 'quoted-section': %#v", data) + } + if _, ok := data[`"quoted-section"`]; ok { + t.Errorf("section header retained literal quotes — unquote not applied") + } +}