From d3863721b69bec0ed60e72ebc1430142de4a442b Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 19 May 2026 05:08:02 +0000
Subject: [PATCH] Revert "Merge pull request #160 from
 AdaWorldAPI/claude/lance-surrealdb-analysis-LXmug"

This reverts commit 697fb96364b62e6ce7e9370b17640c4deb89e60c, reversing
changes made to e63158ed77e44fe33d61dcff7810d999b2fbf01b.
---
 .claude/plans/integration-plan.md | 325 -----------
 docs/hpc-api-inventory.md         | 363 ------------
 docs/hpc-stability.md             | 914 ------------------------------
 src/hpc/heel_f64x8.rs             | 155 -----
 4 files changed, 1757 deletions(-)
 delete mode 100644 .claude/plans/integration-plan.md
 delete mode 100644 docs/hpc-api-inventory.md
 delete mode 100644 docs/hpc-stability.md

diff --git a/.claude/plans/integration-plan.md b/.claude/plans/integration-plan.md
deleted file mode 100644
index b94e65d3..00000000
--- a/.claude/plans/integration-plan.md
+++ /dev/null
@@ -1,325 +0,0 @@
-# Integration Plan: ndarray's role in the four-repo convergence
-
-**This repo**: `AdaWorldAPI/ndarray` — SIMD distance kernels + tensor primitives, shared across the stack.
-
-**Status**: planning document. Companion plans at the same path in the other repos:
-- `AdaWorldAPI/lance-graph:.claude/plans/integration-plan.md`
-- `AdaWorldAPI/surrealdb:.claude/plans/integration-plan.md`
-- `AdaWorldAPI/sea-orm:.claude/plans/integration-plan.md`
-
----
-
-## 1. The convergence target
-
-Across all four repos:
-
-> *Foundry-style ontology + BEAM-style supervision + ClickHouse-style analytic + Postgres-style ACID + cognitive primitives — all on one Arrow substrate, surfaced to consumers as a typed sea-orm API.*
-
-Four glue crates close the gap:
-
-| # | Glue crate | Owner repo | Bridges |
-|---|---|---|---|
-| 1 | `surrealdb-ractor` | surrealdb | `cf` / live queries → ractor mailboxes |
-| 2 | `lance-graph-tikv-provider` | lance-graph | TiKV ranges → Arrow `TableProvider` |
-| 3 | `sea-orm-ractor` | sea-orm | `Entity::PK` → ractor process registry |
-| 4 | `cognitive-shader-actor` | lance-graph | cognitive shaders → `ractor::Actor` adapter |
-
-**This repo owns no glue crate.** It owns the **shared low-level numeric substrate** that the other three depend on — SIMD distance kernels (cosine, L1, L2, Linf), `F64x8` polyfills, `heel_f64x8` helpers, `hpc-extras` feature.
-
-### Integration principle: additive contract shape (this repo IS the canonical case)
-
-**This repo is the load-bearing example of the contract-shape discipline.** Every symbol this repo exposes is consumed by surrealdb-core (`idx/trees/vector.rs`) and lance-graph cognitive crates (`bgz-tensor`, `holograph`, `deepnsm`, `causal-edge`). One signature change breaks the entire stack. The discipline:
-
-1. **Existing stable APIs never change signature.** Period. If a hypothetical improvement requires a different signature, the new signature ships as a new function next to the old one. The old function stays forever or for a 5+-version deprecation runway, whichever is longer.
-2. **New kernels are added as new functions in new or existing modules.** Adding `F32x16` doesn't touch `F64x8`. Adding `hamming_u8_simd` doesn't touch `cosine_f64_simd`.
-3. **Internal SIMD backends (AVX2/AVX-512/NEON paths) are not public surface.** They can change without notice. Only the public entry points are load-bearing.
-4. **The `[patch.crates-io]` block in surrealdb's root Cargo.toml is the diamond-dep guard.** This repo's existence + that patch line is what makes downstream `ort` (ONNX runtime) link the same `ndarray` as surrealdb-core. Breaking the patch contract breaks ONNX interop.
-
-**Per-repo enforcement**: every Sprint item below is read as "add this; don't change what's there."
-
-### Contracts (existing + new)
-
-| Contract | Owner repo | Status today | This plan adds |
-|---|---|---|---|
-| `ndarray::hpc::F64x8` + `heel_f64x8::*` | **this repo** | 0.17 fork, stable per §5 below | **unchanged — only new kernels (e.g. `F32x16`, int8, Hamming) added in new symbols** |
-| `[patch.crates-io] ndarray = ...` in surrealdb root Cargo.toml | surrealdb | active (diamond-dep guard) | not touched |
-| `lance-graph-contract` (for cognitive shader / IR vocabulary) | lance-graph | 0.1.x → 0.2.0 additive | not touched by us |
-| surrealdb `MvccSource` / `CfStream` | surrealdb | new additive traits | not touched by us |
-| sea-orm `EntityActor` / `SelectArrowExt` | sea-orm | new additive trait/derive | not touched by us |
-
----
-
-## 2. Architecture diagram
-
-```
-                ┌──────────────────────────────────────────┐
-                │              consumer crate              │
-                └──────────────────┬───────────────────────┘
-                                   │ typed entities
-                                   ▼
-                ┌──────────────────────────────────────────┐
-                │            sea-orm-arrow 2.0             │
-                └────┬─────────────────┬───────────────┬───┘
-                     │                 │               │
-                     ▼                 ▼               ▼
-              ┌───────────┐     ┌───────────┐    ┌───────────┐
-              │  ractor   │◄────│ surrealdb │    │lance-graph│
-              │ (actors,  │ #1  │  (cf +    │    │ (Cypher,  │
-              │ mailboxes,│     │   live    │    │ ontology, │
-              │ supervis.)│     │  queries) │    │cognitive) │
-              └─────┬─────┘     └─────┬─────┘    └─────┬─────┘
-                    │ #3              │                │ #2,#4
-                    ▼                 ▼                ▼
-              ┌─────────────────────────────────────────────┐
-              │       TiKV substrate (Raft + Percolator)    │
-              └─────────────────────────────────────────────┘
-                                  │
-                                  ▼
-                    ┌────────────────────────────┐
-                    │      THIS REPO (ndarray)   │
-                    │  - hpc-extras feature      │
-                    │  - F64x8 polyfill          │
-                    │  - heel_f64x8 distances    │
-                    │  - diamond-dep guard       │
-                    └────────────────────────────┘
-```
-
----
-
-## 3. Role of ndarray in the integration
-
-This is the **shared low-level numeric substrate**. The AdaWorldAPI fork of ndarray 0.17 with `hpc-extras` lives at the bottom of the stack. Two direct consumers:
-
-1. **surrealdb-core**
-   - `core/Cargo.toml:71-77` — `vector-hpc` feature flips on cfg-gated dispatch in `idx/trees/vector.rs`
-   - `core/src/idx/trees/vector.rs` — distance helpers (l1/l2/linf) inlined here, using this repo's SIMD kernels
-   - Comment from surrealdb's root `Cargo.toml:88-93`:
-     > *Always the AdaWorldAPI fork — never crates.io. Direct git dep at the workspace level. Distance helpers (l1/l2/linf) are inlined in surrealdb/core/src/idx/trees/vector.rs.*
-
-2. **lance-graph cognitive crates**
-   - `crates/bgz-tensor/` — element-wise ops use ndarray's `Zip` + `F64x8` chunks
-   - `crates/holograph/` — holographic distance metrics
-   - `crates/deepnsm/` — neural state machine distance kernels
-   - `crates/causal-edge/` — causality scoring uses cosine over embedding vectors
-
-Indirectly via sea-orm and the planner, every vector / distance / similarity operation in the stack lands here.
-
----
-
-## 4. Current state — what makes this fork special
-
-### `F64x8` polyfill
-
-`hpc-extras` feature exposes an 8-wide `f64` SIMD vector type that works on:
-- **x86_64 AVX-512** — native 8-wide
-- **x86_64 AVX2** — two 4-wide ops, software-packed
-- **aarch64 NEON** — four 2-wide ops on 128-bit NEON registers (2 × `f64` each), software-packed
-- **other archs** — scalar fallback
-
-This is the kernel both surrealdb's `idx/trees/vector.rs` and lance-graph's cognitive shaders rely on.
-
-### `heel_f64x8` distance kernels
-
-Functions composing `F64x8` chunks into a distance:
-
-```
-heel_f64x8::cosine_f64_simd(a: &[f64], b: &[f64]) -> f64
-heel_f64x8::l1_f64_simd    (a: &[f64], b: &[f64]) -> f64
-heel_f64x8::l2_f64_simd    (a: &[f64], b: &[f64]) -> f64
-heel_f64x8::linf_f64_simd  (a: &[f64], b: &[f64]) -> f64
-```
-
-### Diamond-dep guard
-
-The `[patch.crates-io]` block at the bottom of surrealdb's root `Cargo.toml`:
-
-```toml
-[patch.crates-io]
-ndarray = { git = "https://github.com/AdaWorldAPI/ndarray.git" }
-```
-
-ensures any transitive consumer of `ndarray = "0.17.x"` from crates.io lands on this fork. Without the patch, `ort` (ONNX runtime, optional `ml` feature in surrealdb) would link a separate `ndarray` and surrealdb-core would link this one — two distinct `TypeId`s, no interop.
-
-**This repo's existence is what makes the patch work.** Without it, the diamond-dep workaround has no target to redirect to.
-
-### The `lance-index` 0.16 gap (known)
-
-From surrealdb root `Cargo.toml:100-101`:
-
-> *Scope: 0.17 line only. `lance-index 4.0` depends on `ndarray = "0.16"`, a separate major version that this patch does not affect; eliminating that crates.io 0.16 entry requires upstream `lance-index` to bump.*
-
-**Plan**: watch upstream `lance-index` for the 0.17 bump (see §6 Sprint 2). When it lands, the diamond-dep guard becomes single-version-clean.
-
----
-
-## 5. API stability commitment (this repo's contract)
-
-This repo doesn't own a glue *crate* — it owns the **API contract that the SIMD layer of three downstream repos depends on**. The commitment is absolute:
-
-### Stable public surface (no break without major bump, none planned)
-
-| Symbol | Kind |
-|---|---|
-| `ndarray::hpc::F64x8` | type — layout, lane count (8) frozen |
-| `ndarray::hpc::heel_f64x8::cosine_f64_simd(a, b) -> f64` | signature frozen |
-| `ndarray::hpc::heel_f64x8::l1_f64_simd(a, b) -> f64` | signature frozen |
-| `ndarray::hpc::heel_f64x8::l2_f64_simd(a, b) -> f64` | signature frozen |
-| `ndarray::hpc::heel_f64x8::linf_f64_simd(a, b) -> f64` | signature frozen |
-| feature `hpc-extras` | name + what it enables frozen |
-
-**"Frozen" means**: no signature change, no rename, no semantic drift. If we want to refine — e.g., a fused multiply-add variant of cosine — we add `cosine_f64_simd_fma(a, b) -> f64` as a NEW function. Both coexist forever (or 5+ versions, whichever is longer).
-
-### Internal / unstable
-
-- Polyfill backends (AVX2/AVX-512/NEON paths) — implementation detail
-- Auto-dispatch heuristics — can change without notice
-- Numeric tolerance in non-cancellation-prone paths — within `f64::EPSILON * len` of scalar reference
-
-### Doc commitment
-
-- Each stable function gets a doc-test
-- Cross-arch behaviour documented in `docs/hpc-stability.md` (Sprint 0)
-- A CI matrix runs the doc-tests on x86_64-AVX2, x86_64-AVX-512, aarch64-NEON, and scalar-fallback
-
----
-
-## 6. Sprint sequence (this repo)
-
-All work is **additive** — new symbols in new or existing modules; no existing symbol changes signature.
-
-### Sprint 0 — API freeze + doc (1 week)
-- Mark stable APIs with `#[stable]`-style doc tag (custom attribute or doc-comment convention)
-- Write `docs/hpc-stability.md` listing the commitment from §5
-- Add CI cross-arch doc-test matrix
-- Cross-link from this plan
-
-### Sprint 1 — `bgz-tensor` direct coupling (1 week)
-- `bgz-tensor` (lance-graph crate) takes a direct dep on this fork (additive: new dep line, no existing dep changes)
-- Ensures `bgz-tensor` users always get the SIMD kernels regardless of feature-flag composition
-- Coordinate with lance-graph plan §4
-
-### Sprint 2 — `lance-index` 0.17 readiness (timing depends on upstream)
-- Watch upstream `lance-index` for the 0.17 bump
-- Have a forked `lance-index` 0.17 ready to slot in if upstream delays
-- Once available, extend the surrealdb `[patch.crates-io]` block to cover both 0.16 (if still needed) and 0.17
-- This is purely additive on this repo's side (we add no symbols; we are the target of the patch)
-
-### Sprint 3 — additional kernels as needed (ad-hoc; all additive)
-- Add `F32x16` polyfill if cognitive shaders migrate to f32 (NEW type, F64x8 unchanged)
-- Add quantised int8 distance kernels for embedding compression (NEW module `heel_i8x32::*`)
-- Add Hamming distance kernel for binary embeddings (NEW function `heel_u8x32::hamming_u8_simd`)
-
----
-
-## 7. Examples
-
-### Example 1 — surrealdb using the fork's SIMD
-
-```rust
-// surrealdb/core/src/idx/trees/vector.rs — sketch of what's already wired
-use ndarray::hpc::heel_f64x8;
-
-pub fn cosine_distance(a: &[f64], b: &[f64]) -> f64 {
-    debug_assert_eq!(a.len(), b.len());
-    #[cfg(feature = "vector-hpc")]
-    { 1.0 - heel_f64x8::cosine_f64_simd(a, b) }
-    #[cfg(not(feature = "vector-hpc"))]
-    { scalar_cosine(a, b) }
-}
-```
-
-### Example 2 — lance-graph cognitive shader using the fork
-
-```rust
-// lance-graph/crates/holograph/src/distance.rs
-use ndarray::hpc::heel_f64x8;
-use crate::HolographEmbedding;
-
-impl HolographEmbedding {
-    pub fn similarity(&self, other: &Self) -> f64 {
-        heel_f64x8::cosine_f64_simd(self.as_slice(), other.as_slice())
-    }
-}
-```
-
-### Example 3 — `bgz-tensor` element-wise ops via the fork
-
-```rust
-// lance-graph/crates/bgz-tensor/src/ops.rs
-use ndarray::hpc::F64x8;
-use ndarray::Zip;
-
-impl BgzTensor<f64> {
-    pub fn elementwise_mul(&self, other: &Self) -> Self {
-        let mut out = self.clone();
-        Zip::from(&mut out.data)
-            .and(&other.data)
-            .for_each(|a, &b| *a *= b);
-        // F64x8-chunked path handled by ndarray's Zip internals for large tensors.
-        out
-    }
-}
-```
-
-### Example 4 — The diamond-dep guard (replicated for cross-reference)
-
-```toml
-# surrealdb root Cargo.toml (already in place; documented here so the
-# fork knows what surfaces are load-bearing).
-[patch.crates-io]
-ndarray = { git = "https://github.com/AdaWorldAPI/ndarray.git" }
-```
-
-Without this patch:
-- `ort` pulls `ndarray = "0.17.2"` from crates.io
-- `surrealdb-core` pulls this fork
-- They have distinct `TypeId`s → no interop between ONNX outputs and surrealdb's index code
-
-With this patch, both link the same crate. **This fork's stability is the diamond-dep fix.**
-
-### Example 5 — New kernel landing as a new symbol (additive)
-
-Hypothetical: a fused multiply-add cosine variant lands. Old + new coexist:
-
-```rust
-// crates/ndarray/src/hpc/heel_f64x8.rs — new function, existing unchanged
-pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64 { /* existing */ }
-
-/// FMA variant. Lower latency on AVX-512 + AVX2-FMA hosts.
-/// Numerically identical within f64::EPSILON * len.
-pub fn cosine_f64_simd_fma(a: &[f64], b: &[f64]) -> f64 { /* new */ }
-```
-
-Consumers pick. Nothing breaks.
-
----
-
-## 8. What this plan asks of the other repos
-
-Nothing structural — only that consumers stay on the stable surface (§5) and report breakage promptly. Specifically:
-
-- **surrealdb**: `idx/trees/vector.rs` should only use `ndarray::hpc::*` items listed in §5. Anything else is a non-stable detail and may break without notice.
-- **lance-graph**: cognitive crates should use `heel_f64x8` distance kernels; if a kernel is missing (e.g. Hamming), file an issue here rather than implementing locally.
-- **sea-orm**: no direct dep on this fork; touches it only transitively if a consumer uses sea-orm-arrow with `f64` Arrow columns.
-
----
-
-## 9. Open questions
-
-1. **`F32x16` priority** — is a cognitive shader consumer planning to move to f32? If yes, Sprint 3 fast-track. If no, defer.
-2. **Quantised int8 distance kernels** — trigger Sprint 3 item when a concrete consumer surfaces.
-3. **WASM target** — surrealdb has a WASM build path. Does it need `vector-hpc`? Today the scalar fallback covers it. Confirm with surrealdb plan.
-4. **Numeric tolerance documentation** — currently "within `f64::EPSILON * len`"; doc-test it in Sprint 0.
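-5. **`#[stable]` attribute convention** — use rustc's `#[stable]` attribute (nightly-only, gated behind the internal `staged_api` feature that exists for the standard library and is not on a stabilisation track) or a doc-comment convention? Probably the latter for portability; revisit only if `staged_api` is ever opened up to third-party crates.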
-
----
-
-## 10. Cross-references
-
-- **Glue #1** (surrealdb-ractor): `AdaWorldAPI/surrealdb:.claude/plans/integration-plan.md` §5
-- **Glue #2** (TiKV TableProvider): `AdaWorldAPI/lance-graph:.claude/plans/integration-plan.md` §5
-- **Glue #3** (sea-orm-ractor): `AdaWorldAPI/sea-orm:.claude/plans/integration-plan.md` §5
-- **Glue #4** (cognitive-shader-actor): `AdaWorldAPI/lance-graph:.claude/plans/integration-plan.md` §6
-- **Cognitive crate consumers** (the load-bearing reason this fork exists): `AdaWorldAPI/lance-graph:.claude/plans/integration-plan.md` §3 + §4
-- **surrealdb's `vector-hpc` feature**: `AdaWorldAPI/surrealdb:.claude/plans/integration-plan.md` §4 (`core/Cargo.toml:71-77`)
-- **`lance-projection` sibling** (analytic view of cognitive crate outputs): `AdaWorldAPI/surrealdb:.claude/plans/integration-plan.md` §6
diff --git a/docs/hpc-api-inventory.md b/docs/hpc-api-inventory.md
deleted file mode 100644
index 2bb10e38..00000000
--- a/docs/hpc-api-inventory.md
+++ /dev/null
@@ -1,363 +0,0 @@
-# HPC API Inventory — AdaWorldAPI/ndarray fork
-
-**Generated**: 2026-05-18
-**Branch**: `claude/lance-surrealdb-analysis-LXmug`
-**Purpose**: Catalogue of the existing public HPC surface relevant to the
-lance-graph ↔ surrealdb ↔ ndarray integration plan §1 stable-surface commitment.
-
----
-
-## 1. Discovered HPC Modules
-
-The `src/hpc/` directory contains **~100 Rust source files** across flat and
-nested layouts. Below are the modules relevant to distance computation and the
-stable surface claimed by the integration plan.
-
-| Module | File | Notes |
-|---|---|---|
-| `hpc::heel_f64x8` | `src/hpc/heel_f64x8.rs` | **Primary distance surface** — SIMD cosine + HEEL plane Hamming |
-| `hpc::distance` | `src/hpc/distance.rs` | Spatial k-NN, squared L2, radius filter (f32 AVX2 + f64 scalar) |
-| `hpc::bitwise` | `src/hpc/bitwise.rs` | `hamming_distance_raw`, `popcount_raw`, batch Hamming + top-k |
-| `hpc::palette_distance` | `src/hpc/palette_distance.rs` | Palette/SPO distance matrices (`SpoDistanceMatrices`) |
-| `hpc::layered_distance` | `src/hpc/layered_distance.rs` | Lance-graph container layout (`[u64; 256]`), `palette_distance()` |
-| `hpc::parallel_search` | `src/hpc/parallel_search.rs` | `parallel_search`, `lfd_from_palette`, `PaletteScope` |
-| `hpc::cam_pq` | `src/hpc/cam_pq.rs` | `squared_l2`, `kmeans`, `CamCodebook`, `DistanceTables` |
-| `hpc::blas_level1` | `src/hpc/blas_level1.rs` | `dot_f32/f64`, `nrm2_f32/f64`, `axpy_f32/f64`, `blas_rotg` |
-| `hpc::vml` | `src/hpc/vml.rs` | `vsexp`, `vdexp`, `vsln`, `vdln`, `vssqrt`, `vdsqrt`, `vsabs`, `vdabs`, etc. |
-| `hpc::reductions` | `src/hpc/reductions.rs` | `sum_f32/f64`, `mean_f32/f64`, `max/min_f32`, `argmax/argmin_f32`, `nrm2_f32` |
-| `hpc::simd_caps` | `src/hpc/simd_caps.rs` | Runtime SIMD capability singleton |
-| `hpc::simd_dispatch` | `src/hpc/simd_dispatch.rs` | `LazyLock`-frozen SIMD dispatch function pointers |
-| `hpc::fingerprint` | `src/hpc/fingerprint.rs` | `Fingerprint<N>`, `Fingerprint1K/2K/64K`, `VectorConfig` |
-| `hpc::clam` | `src/hpc/clam.rs` | `knn_brute`, `ClamTree::build` |
-| `hpc::prefilter` | `src/hpc/prefilter.rs` | `approx_hamming_candidates` |
-| `hpc::cyclic_bundle` | `src/hpc/cyclic_bundle.rs` | `hamming_128`, `cyclic_shift`, `bundle_spo` |
-| `hpc::zeck` | `src/hpc/zeck.rs` | ZeckF64 progressive edge encoding, `hamming_distance_raw` consumer |
-| `hpc::holo` | `src/hpc/holo.rs` | Phase-space holographic ops: `focus_hamming`, `focus_l1`, `wasserstein_sorted_i8` |
-
-Additionally gated behind `feature = "hpc-extras"`:
-
-| Module | File |
-|---|---|
-| `hpc::spo_bundle` | `src/hpc/spo_bundle.rs` |
-| `hpc::deepnsm` | `src/hpc/deepnsm.rs` |
-| `hpc::compression_curves` | `src/hpc/compression_curves.rs` |
-| `hpc::crystal_encoder` | `src/hpc/crystal_encoder.rs` |
-| `hpc::p64_bridge` | `src/hpc/p64_bridge.rs` |
-
----
-
-## 2. F64x8 Type — Actual Definition
-
-### AVX-512 path (canonical production backend)
-
-**File**: `src/simd_avx512.rs`
-**Line**: 304 (struct definition) / 314 (LANES constant)
-
-```rust
-// src/simd_avx512.rs:302–304
-#[derive(Copy, Clone)]
-#[repr(transparent)]
-pub struct F64x8(pub __m512d);
-
-// src/simd_avx512.rs:314
-pub const LANES: usize = 8;
-```
-
-Repr: `__m512d` — a native 512-bit AVX-512 register holding 8 × `f64`.
-Lane count: **8**.
-Backing: `_mm512_loadu_pd` (unaligned load), `_mm512_storeu_pd` (unaligned store).
-
-Key methods available on `F64x8` (`src/simd_avx512.rs`):
-`splat(v: f64)`, `from_slice(&[f64])`, `from_array([f64; 8])`, `to_array()`,
-`copy_to_slice(&mut [f64])`, `reduce_sum()`, `reduce_min()`, `reduce_max()`,
-`abs()`, `sqrt()`, `round()`, `floor()`, `mul_add(b, c)`, `simd_min/max/clamp`,
-`simd_lt/le/gt/ge/eq/ne`, `to_bits()`, `from_bits()`.
-
-### AVX2 fallback (non-AVX-512 x86_64)
-
-**File**: `src/simd_avx2.rs`
-The AVX2 path supplies `F64x8` as a polyfill backed by two `__m256d` (2 × 4
-lanes). Same public API surface as the AVX-512 variant; `impl_float_type!` macro
-used at line ~820 of `simd_avx2.rs`.
-
-### Scalar fallback (non-x86 targets)
-
-**File**: `src/simd.rs`, scalar module (not-x86 cfg block, line ~789)
-```rust
-impl_float_type!(F64x8, f64, 8, F64Mask8, u8);
-```
-Backed by `[f64; 8]`. Same API.
-
-### Re-export path
-
-`src/simd.rs:244` (AVX-512 path) / `src/simd.rs:280` (AVX2 fallback) / `src/simd.rs:1573` (NEON)
-→ `pub use crate::simd::F64x8;` is the canonical consumer entry point.
-
----
-
-## 3. `heel_f64x8` Functions — Signatures and File:Line
-
-**File**: `src/hpc/heel_f64x8.rs`
-
-| Function | Signature | Line | Description |
-|---|---|---|---|
-| `heel_weighted_distance` | `(distances: &[f64; 8], weights: &[f64; 8]) -> f64` | 23 | Weighted dot via F64x8 FMA; a single `vmulpd` plus a horizontal-sum reduction on AVX-512 |
-| `heel_plane_distances` | `(a: &[u64; 8], b: &[u64; 8]) -> [f64; 8]` | 34 | Hamming (popcount of XOR) per plane → 8 f64 distances |
-| `heel_weighted_hamming` | `(a_planes: &[u64; 8], b_planes: &[u64; 8], weights: &[f64; 8]) -> f64` | 44 | Full pipeline: planes → per-plane Hamming → weighted dot |
-| `dot_f64_simd` | `(a: &[f64], b: &[f64]) -> f64` | 64 | SIMD dot product; 8 f64 per iteration with FMA accumulation |
-| `sum_sq_f64_simd` | `(a: &[f64]) -> f64` | 86 | Sum of squares via F64x8 FMA |
-| `cosine_f64_simd` | `(a: &[f64], b: &[f64]) -> f64` | 109 | SIMD cosine similarity, single-pass dot+norms |
-| `cosine_f32_to_f64_simd` | `(a: &[f32], b: &[f32]) -> f64` | 149 | f32 inputs, f64 precision cosine via scalar widening + F64x8 FMA |
-
-**Constants also defined**:
-- `UNIFORM_WEIGHTS: [f64; 8] = [1.0; 8]` — line 50
-- `HEEL_7PLUS1_WEIGHTS: [f64; 8] = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5]` — line 54
-
-### Integration plan claim vs reality
-
-The integration plan (`lance-graph/.claude/plans/integration-plan.md:32`) states:
-> `hpc-extras` feature, `heel_f64x8::cosine_f64_simd` etc.
-
-And the contracts table (line 58) states:
-> `ndarray::hpc::F64x8` + `heel_f64x8::*` — ndarray 0.17 fork, stable
-
-**Verdict: PRESENT and matches.** `cosine_f64_simd` exists at
-`src/hpc/heel_f64x8.rs:109` with signature `(a: &[f64], b: &[f64]) -> f64`.
-
-**Additional functions the plan's "etc." implies but does not name explicitly
-are also present**: `heel_weighted_hamming`, `heel_plane_distances`,
-`heel_weighted_distance`, `dot_f64_simd`, `sum_sq_f64_simd`, `cosine_f32_to_f64_simd`.
-
----
-
-## 4. Other Distance Kernels Found
-
-### 4a. Hamming (binary Hamming distance)
-
-| Function | File | Line | Signature |
-|---|---|---|---|
-| `hamming_distance_raw` | `src/hpc/bitwise.rs` | 180 | `(a: &[u8], b: &[u8]) -> u64` |
-| `popcount_raw` | `src/hpc/bitwise.rs` | 185 | `(a: &[u8]) -> u64` |
-| `hamming_batch_raw` | `src/hpc/bitwise.rs` | 193 | `(query, database, num_rows, row_bytes) -> Vec<u64>` |
-| `hamming_top_k_raw` | `src/hpc/bitwise.rs` | 201 | `(query, database, num_rows, row_bytes, k) -> Vec<(usize,u64)>` |
-| `hamming_distance` | `src/simd_avx2.rs` | 276 | `(a: &[u8], b: &[u8]) -> u64` (AVX2-specific) |
-| `hamming_batch` | `src/simd_avx2.rs` | 316 | `(query, database, num_rows, row_bytes) -> Vec<u64>` |
-| `hamming_top_k` | `src/simd_avx2.rs` | 338 | `(query, database, num_rows, row_bytes, k) -> Vec<(usize,u64)>` |
-| `hamming_128` | `src/hpc/cyclic_bundle.rs` | 153 | `(a: &[u64; N], b: &[u64; N]) -> u32` (128×64-bit) |
-| `hamming_u8x16` | `src/simd_neon.rs` | 74 | `unsafe (a: &[u8; 16], b: &[u8; 16]) -> u32` (NEON) |
-| `focus_hamming` | `src/hpc/holo.rs` | ~1951 | `(a: &[u8], b: &[u8], mask_x, mask_y, mask_z) -> (u64, u32)` |
-| `approx_hamming_candidates` | `src/hpc/prefilter.rs` | 252 | `(query, db, bytes_per_vec, n_vectors, k) -> Vec<(usize,u32)>` |
-
-Re-exported to `ndarray::simd` namespace:
-- `src/simd.rs:1714`: `pub use crate::hpc::bitwise::{hamming_distance_raw, popcount_raw};`
-
-### 4b. Cosine Similarity
-
-| Function | File | Line | Signature |
-|---|---|---|---|
-| `cosine_f64_simd` | `src/hpc/heel_f64x8.rs` | 109 | `(a: &[f64], b: &[f64]) -> f64` |
-| `cosine_f32_to_f64_simd` | `src/hpc/heel_f64x8.rs` | 149 | `(a: &[f32], b: &[f32]) -> f64` |
-
-Re-exported to `ndarray::simd` namespace:
-- `src/simd.rs:1751`: `pub use crate::hpc::heel_f64x8::cosine_f32_to_f64_simd;`
-- `cosine_f64_simd` is **NOT** re-exported at the `ndarray::simd` level (only `cosine_f32_to_f64_simd` is). Consumers must import directly from `ndarray::hpc::heel_f64x8::cosine_f64_simd`.
-
-### 4c. L2 / Squared L2
-
-| Function | File | Line | Signature |
-|---|---|---|---|
-| `squared_l2` | `src/hpc/cam_pq.rs` | 473 | `(a: &[f32], b: &[f32]) -> f32` |
-| `squared_distances_f32` | `src/hpc/distance.rs` | 98 | `(query: [f32;3], points: &[[f32;3]]) -> Vec<f32>` |
-| `squared_distances_f64` | `src/hpc/distance.rs` | 142 | `(query: [f64;3], points: &[[f64;3]]) -> Vec<f64>` |
-| `knn_f32` | `src/hpc/distance.rs` | 124 | `(query: [f32;3], points: &[[f32;3]], k) -> (Vec<usize>, Vec<f32>)` |
-| `knn_f64` | `src/hpc/distance.rs` | 158 | `(query: [f64;3], points: &[[f64;3]], k) -> (Vec<usize>, Vec<f64>)` |
-| `filter_by_radius_sq` | `src/hpc/distance.rs` | 113 | `(query: [f32;3], points, radius_sq) -> Vec<usize>` |
-| `filter_by_radius_sq_f64` | `src/hpc/distance.rs` | 147 | `(query: [f64;3], points, radius_sq) -> Vec<usize>` |
-
-Re-exported to `ndarray::simd` namespace:
-- `src/simd.rs:1747`: `pub use crate::hpc::cam_pq::{kmeans, squared_l2};`
-
-### 4d. L1 (Holographic / phase-space variants)
-
-**No generic `l1_f64_simd` or `l1_f64` free function exists at the top-level HPC surface.**
-
-L1-style distance found only in specialized contexts:
-
-| Function | File | Signature | Context |
-|---|---|---|---|
-| `focus_l1` | `src/hpc/holo.rs` | `(a: &[u8], b: &[u8], mask_x, mask_y, mask_z) -> (u64, u32)` | Holographic phase-masked L1 |
-| `wasserstein_sorted_i8` | `src/hpc/holo.rs` | `(a: &[u8], b: &[u8]) -> u64` | Wasserstein-style L1 distance |
-| `carrier_distance_l1` | `src/hpc/holo.rs` | `(a: &[i8], b: &[i8]) -> u64` | Carrier-wave L1 distance |
-| `histogram_l1_distance` | `src/hpc/holo.rs` | `(a: &[u16;16], b: &[u16;16]) -> u32` | Histogram L1 |
-| `asum_f32` / `asum_f64` | `src/simd_avx2.rs` | `(x: &[f32]) -> f32` | L1 norm (sum of absolutes), not pairwise distance |
-
-### 4e. Linf (Chebyshev) Distance
-
-**No `linf_f64_simd` or generic Linf pairwise distance function exists in the HPC surface.** Not found anywhere in `src/`. `reduce_max()` on `F64x8` provides max-element reduction as a building block, but no composed Linf kernel is exported.
-
-### 4f. Dot Products (BLAS L1 level)
-
-| Function | File | Line | Signature |
-|---|---|---|---|
-| `dot_f32` | `src/simd_avx2.rs` | 56 | `(a: &[f32], b: &[f32]) -> f32` |
-| `dot_f64` | `src/simd_avx2.rs` | 88 | `(a: &[f64], b: &[f64]) -> f64` |
-| `dot_f64_simd` | `src/hpc/heel_f64x8.rs` | 64 | `(a: &[f64], b: &[f64]) -> f64` (F64x8 FMA path) |
-| `dot_i8` | `src/simd_avx2.rs` | 406 | `(a: &[u8], b: &[u8]) -> i64` |
-
-### 4g. Palette / SPO Distance
-
-| Function | File | Signature |
-|---|---|---|
-| `palette_distance` | `src/hpc/layered_distance.rs:62` | `(dm: &SpoDistanceMatrices, a: &[u64;256], b: &[u64;256]) -> u32` |
-| `SpoDistanceMatrices::spo_distance` | `src/hpc/palette_distance.rs:345` | `(&self, a_s, a_p, a_o, b_s, b_p, b_o) -> u32` |
-| `DistanceTables::distance` | `src/hpc/cam_pq.rs:189` | `(&self, cam: &CamFingerprint) -> f32` |
-| `parallel_search` | `src/hpc/parallel_search.rs:229` | `(scope: &PaletteScope, query: &PaletteEdge, k, gate: &TruthGate) -> Vec<SearchResult>` |
-
----
-
-## 5. Feature Flags
-
-### `hpc-extras` (defined `Cargo.toml:207`)
-
-```toml
-hpc-extras = ["std", "dep:p64", "dep:fractal", "fractal/std"]
-```
-
-Pulls in: `p64` (Palette64/3D attention NARS bridge) and `fractal` (manifold math).
-
-Modules **gated** behind `hpc-extras` (from `src/hpc/mod.rs`):
-- `hpc::spo_bundle` (line 121)
-- `hpc::deepnsm` (line 124)
-- `hpc::compression_curves` (line 131)
-- `hpc::crystal_encoder` (line 134)
-- `hpc::p64_bridge` (line 141)
-- `jitson_cranelift` sub-module (gated separately on `jit-native`)
-- `splat3d` (gated separately on `splat3d`)
-- The `e2e_tests` integration test block (line 252)
-
-**Default**: `hpc-extras` IS included in the crate default features (`Cargo.toml:174`):
-```toml
-default = ["std", "hpc-extras"]
-```
-
-Modules **not** gated behind `hpc-extras` (unconditionally compiled when `std` is on):
-All of `heel_f64x8`, `distance`, `bitwise`, `blas_level1/2/3`, `cam_pq`, `fingerprint`,
-`clam`, `prefilter`, `palette_distance`, `layered_distance`, `parallel_search`,
-`holo`, `cyclic_bundle`, `vml`, `reductions`, etc.
-
-### Other relevant feature flags
-
-| Flag | Defined at | Purpose |
-|---|---|---|
-| `std` | `Cargo.toml:182` | Enables `hpc` module + blake3 for cognitive substrate |
-| `native` | `Cargo.toml:219` | HPC backend: pure Rust + SIMD |
-| `intel-mkl` | `Cargo.toml:220` | HPC backend: Intel MKL FFI (mutually exclusive with openblas) |
-| `openblas` | `Cargo.toml:221` | HPC backend: OpenBLAS FFI (mutually exclusive with intel-mkl) |
-| `jit-native` | `Cargo.toml:215` | Cranelift JIT backend |
-| `splat3d` | `Cargo.toml:231` | CPU-SIMD 3D Gaussian Splatting |
-| `nightly-simd` | `Cargo.toml:197` | Portable-SIMD miri-compatible backend (nightly only) |
-
----
-
-## 6. Cross-References to Consumers
-
-### lance-graph (`/home/user/lance-graph`)
-
-The integration plan (`integration-plan.md:58`) explicitly contracts:
-> `ndarray::hpc::F64x8` + `heel_f64x8::*` — stable, unchanged
-
-Consuming crates in lance-graph that reference the ndarray HPC surface:
-
-| Consumer file | ndarray function used | Reference |
-|---|---|---|
-| `crates/lance-graph/src/graph/blasgraph/ndarray_bridge.rs` | `hamming_distance_raw`, `U8x64::nibble_popcount_lut` | knowledge doc W1b row |
-| `crates/lance-graph/src/graph/neighborhood/zeckf64.rs` | ZeckF64 (ndarray canonical copy) | `hpc::zeck` |
-| `crates/lance-graph-contract/src/mul.rs` | `I8x16::from_i4_packed_u64`, `batch_packed_i4_16` | W1a W1b plan |
-| `crates/holograph/hamming.rs` | `hamming_distance_raw`, `U64x8::popcnt` (W1a planned) | knowledge doc W1b row |
-| `crates/bgz17/src/simd.rs` | `U16x8::gather_u16` (W1a planned), `hamming_distance_raw` | knowledge doc W1b row |
-| `crates/thinking-engine/src/engine.rs` | `BF16x16`, `simd_amx::*`, `Fingerprint<256>` | W1b VNNI route |
-
-Note: the **lance-graph knowledge doc** (`lance-graph/.claude/knowledge/ndarray-vertical-simd-alien-magic.md`)
-specifies that `cosine_f64_simd` is part of the stable surface ("etc." in the plan) and that
-no raw intrinsics should be used in consumer crates — all SIMD must flow through `ndarray::simd::*`.
-
-### surrealdb (`AdaWorldAPI/surrealdb`)
-
-Referenced via the integration plan; consuming `lance-graph-contract` which depends on ndarray.
-The path is indirect: surrealdb → `lance-graph-contract` → ndarray.
-
-The surrealdb vector distance machinery lives at (plan reference, not audited locally):
-`surrealdb/core/src/idx/trees/vector.rs`
-
-Plan claims this will consume `ndarray::hpc::heel_f64x8::cosine_f64_simd` via the
-lance-graph-contract trait when wired. Not yet wired (integration §5 is a new crate,
-`lance-graph-tikv-provider`, not vector indexing).
-
----
-
-## 7. Gap Analysis — Plan §1 Stable Surface vs Current Reality
-
-The integration plan's stable-surface table (`integration-plan.md:53–58`) claims:
-> `ndarray::hpc::F64x8` + `heel_f64x8::*` — ndarray 0.17 fork, stable — unchanged: only new kernels added
-
-### Present and confirmed
-
-| Claimed API | Actual location | Status |
-|---|---|---|
-| `ndarray::hpc::F64x8` | `src/simd_avx512.rs:304` (AVX-512), polyfill at `simd_avx2.rs`, scalar in `simd.rs` | PRESENT |
-| `F64x8::LANES = 8` | `src/simd_avx512.rs:314` | PRESENT |
-| `F64x8::splat`, `from_slice`, `from_array`, `to_array`, `reduce_sum`, `mul_add`, `sqrt`, `abs` | `src/simd_avx512.rs:316–434` | PRESENT |
-| `heel_f64x8::cosine_f64_simd` | `src/hpc/heel_f64x8.rs:109` | PRESENT — signature `(a: &[f64], b: &[f64]) -> f64` |
-| `heel_f64x8::heel_weighted_hamming` | `src/hpc/heel_f64x8.rs:44` | PRESENT |
-| `heel_f64x8::heel_plane_distances` | `src/hpc/heel_f64x8.rs:34` | PRESENT |
-| `heel_f64x8::heel_weighted_distance` | `src/hpc/heel_f64x8.rs:23` | PRESENT |
-| `heel_f64x8::dot_f64_simd` | `src/hpc/heel_f64x8.rs:64` | PRESENT |
-| `heel_f64x8::cosine_f32_to_f64_simd` | `src/hpc/heel_f64x8.rs:149` | PRESENT; also re-exported at `ndarray::simd` level |
-| `hamming_distance_raw` | `src/hpc/bitwise.rs:180`; re-exported `simd.rs:1714` | PRESENT |
-| `squared_l2` | `src/hpc/cam_pq.rs:473`; re-exported `simd.rs:1747` | PRESENT |
-
-### Missing from the plan's implied surface
-
-| Claimed / implied API | Status | Notes |
-|---|---|---|
-| `l1_f64_simd` or generic pairwise L1 | **ABSENT** | Only L1-norm variants (`asum_f32/f64`) and specialized `focus_l1` / `carrier_distance_l1` / `wasserstein_sorted_i8` exist. No generic `l1_f64_simd(a: &[f64], b: &[f64]) -> f64`. |
-| `l2_f64_simd` as free function | **ABSENT** | `squared_l2` exists for f32; no `l2_f64_simd(a: &[f64], b: &[f64]) -> f64` free function. The L2 distance on 3D points exists in `distance.rs` but is not a general-purpose slice kernel. |
-| `linf_f64_simd` | **ABSENT** | No Linf / Chebyshev distance function at any level. |
-| `cosine_f64_simd` re-export in `ndarray::simd` | **ABSENT** | `cosine_f32_to_f64_simd` IS re-exported at `simd.rs:1751`. `cosine_f64_simd` is **not** — consumers must import from `ndarray::hpc::heel_f64x8`. |
-| `hamming_distance_raw` gating on `hpc-extras` | **NOT REQUIRED** — present unconditionally | `hamming_distance_raw` lives in `hpc::bitwise` which is not behind `hpc-extras`; always available with `std`. |
-
-### W1a primitives claimed by the knowledge doc — current status
-
-From `lance-graph/.claude/knowledge/ndarray-vertical-simd-alien-magic.md` §W1a table:
-
-| W1a primitive | Status in ndarray today |
-|---|---|
-| `I8x16::from_i4_packed_u64` | **ABSENT** — not in `simd_avx512.rs` or `simd_avx2.rs`; W1a PR pending |
-| `I8x16::lane_i8::<N>` | **ABSENT** — generic lane extractor not present |
-| `I8x16::saturating_abs` | **ABSENT** — neither a free function nor method |
-| `batch_packed_i4_16<E, F>` | **ABSENT** — closure-batch entry point not present |
-| `U64x8::xor_popcount` / `U64x8::popcnt` | **ABSENT** — `U64x8` type exists (`simd_avx512.rs:1964`, LANES=8) but `popcnt`/`xor_popcount` methods are not present |
-| `U16x8::gather_u16` | **ABSENT** — `U16x32` exists; `U16x8` does not |
-| `prefetch_read_t0/t1/t2` | **ABSENT** — no prefetch hint wrappers |
-| `U8x32::nibble_popcount_lut` | **ABSENT** — `U8x64::nibble_popcount_lut` exists (`simd_avx512.rs` AVX-512 BITALG path); 32-byte parity is not implemented |
-
-All W1a items are **planned additions** (not yet committed), which is consistent with the
-plan's statement that `heel_f64x8::*` is stable and "only new kernels added."
-
----
-
-## Summary
-
-The integration plan's §1 stable-surface commitment for ndarray resolves to:
-
-- **PRESENT and stable**: `F64x8` type (8-lane f64 SIMD), all `heel_f64x8::*` functions,
-  `hamming_distance_raw`, `squared_l2`, `Fingerprint<N>`, `CamCodebook` / `DistanceTables`.
-- **ABSENT (not yet added, plan-deferred)**: generic `l1_f64_simd`, `l2_f64_simd`,
-  `linf_f64_simd` free-function kernels; all W1a primitives
-  (`I8x16::from_i4_packed_u64`, `U64x8::popcnt`, `U16x8::gather_u16`,
-  `prefetch_read_t0`, `I8x16::saturating_abs`, `batch_packed_i4_16`).
-- **PARTIAL re-export**: `cosine_f32_to_f64_simd` is re-exported at `ndarray::simd`;
-  `cosine_f64_simd` is **not** and requires a direct `ndarray::hpc::heel_f64x8` import.
-- **`hpc-extras` scope**: The core distance surface (`heel_f64x8::*`, `bitwise::*`,
-  `cam_pq::*`, `distance::*`) does **not** require `hpc-extras`; only the p64/fractal
-  convergence modules do.
diff --git a/docs/hpc-stability.md b/docs/hpc-stability.md
deleted file mode 100644
index 4660a20f..00000000
--- a/docs/hpc-stability.md
+++ /dev/null
@@ -1,914 +0,0 @@
-# HPC API Stability Commitment — AdaWorldAPI/ndarray Fork
-
-> **2026-05-18 wave-2 update**: `l1_f64_simd`, `l2_f64_simd`, and
-> `linf_f64_simd` were initially marked as "aspirational reserved names"
-> in this doc because they were absent from the source at wave-1 time
-> (per `docs/hpc-api-inventory.md`). Wave-2 commit `71cdbd4`
-> ("feat(hpc): materialise l1_f64_simd, l2_f64_simd, linf_f64_simd in
-> heel_f64x8") materialised all three with the exact signatures this doc
-> promised, matching `cosine_f64_simd`'s F64x8-chunk + scalar-tail
-> pattern. 15/15 `heel_f64x8` tests pass. The "Stable public surface"
-> table below now describes REAL APIs — not promises. Reading order:
-> the freeze commitment is now load-bearing, not aspirational.
-
-
-**Status:** DRAFT — first published 2026-05-18
-**Branch at time of publication:** `claude/lance-surrealdb-analysis-LXmug`
-**Applies to crate:** `ndarray` v0.17.x (AdaWorldAPI fork of rust-ndarray/ndarray)
-**Rust edition:** 2021 / MSRV 1.95 stable
-
----
-
-## Table of Contents
-
-1. [Scope](#1-scope)
-2. [Stable Public Surface](#2-stable-public-surface)
-3. [Internal / Unstable Surface](#3-internal--unstable-surface)
-4. [What "Frozen" Means](#4-what-frozen-means)
-5. [Adding New Kernels — The Additive Pattern](#5-adding-new-kernels--the-additive-pattern)
-6. [Diamond-Dependency Guard](#6-diamond-dependency-guard)
-7. [CI Commitment](#7-ci-commitment)
-8. [Cross-References to Integration Plans](#8-cross-references-to-integration-plans)
-9. [Appendix: Numeric Tolerance Derivation](#9-appendix-numeric-tolerance-derivation)
-
----
-
-## 1. Scope
-
-### Why This Document Exists
-
-This fork of ndarray (`github.com/AdaWorldAPI/ndarray`) diverges from
-upstream `rust-ndarray/ndarray` by adding a significant HPC layer under
-`src/hpc/` (175+ Rust source files at time of writing) and a portable SIMD
-abstraction layer in `src/simd.rs`, `src/simd_avx512.rs`, `src/simd_avx2.rs`,
-and `src/simd_neon.rs`.
-
-The fork occupies a **load-bearing position in two dependency chains**:
-
-```
-surrealdb-core
-  └── [patch.crates-io] ndarray = { git = "...", branch = "..." }
-        └── ndarray::hpc::heel_f64x8  (vector index distance kernels)
-        └── ndarray::simd::F64x8      (SIMD register type)
-
-lance-graph cognitive crates
-  └── ndarray::hpc::heel_f64x8       (cosine/L1/L2/Linf distance)
-  └── ndarray::simd::F64x8            (F64x8 polyfill — AVX-512/AVX2/NEON)
-  └── ndarray::hpc::bitwise           (Hamming / DataFusion UDFs)
-  └── ndarray::hpc::udf_kernels       (DataFusion-registered UDFs)
-```
-
-**Signature breaks in this fork cascade silently into compilation failures in
-surrealdb-core and lance-graph**, often manifesting only when the downstream
-workspace links the ONNX runtime crate (`ort`) against a surrealdb-core that
-now has a different ndarray ABI than `ort` expects. This is the classic Rust
-diamond-dependency problem: two crates in the same build graph that each
-depend on `ndarray` but at different versions or from different sources get
-separate incompatible types even though they share a name.
-
-This document codifies which parts of the public surface are **frozen**,
-which are explicitly **unstable**, and the rules that govern the addition of
-new functionality without breaking existing consumers.
-
----
-
-## 2. Stable Public Surface
-
-### 2.1 Overview
-
-The following items constitute the **frozen, stable public API** of this fork.
-Changes to any item in this table require a coordinated bump of `Cargo.toml`
-`version` plus deprecation notices in all known downstreams before the old
-signature is removed.
-
-| Symbol | Module path | Kind |
-|--------|-------------|------|
-| `F64x8` | `ndarray::simd::F64x8` (backend-specific definitions unified by the dispatch in `src/simd.rs`; see §2.2) | `pub struct` |
-| `cosine_f64_simd` | `ndarray::hpc::heel_f64x8::cosine_f64_simd` | `pub fn` |
-| `l1_f64_simd` | `ndarray::hpc::heel_f64x8` (aspirational; see §2.4) | `pub fn` |
-| `l2_f64_simd` | `ndarray::hpc::heel_f64x8` (aspirational; see §2.5) | `pub fn` |
-| `linf_f64_simd` | `ndarray::hpc::heel_f64x8` (aspirational; see §2.6) | `pub fn` |
-| `hpc-extras` | `Cargo.toml [features]` | Cargo feature |
-
-### 2.2 `ndarray::simd::F64x8`
-
-**File:** `src/simd_avx512.rs:304` (AVX-512 backend),
-`src/simd_avx2.rs:815` (AVX2 backend),
-`src/simd_neon.rs:819` (NEON backend),
-unified dispatch in `src/simd.rs:244` / `src/simd.rs:280`.
-
-**Definition (canonical, backend-neutral):**
-
-```rust
-// AVX-512 (src/simd_avx512.rs:304):
-pub struct F64x8(pub __m512d);
-
-// AVX2 (src/simd_avx2.rs:815):
-pub struct F64x8(pub f64x4, pub f64x4);   // 2 × __m256d
-
-// NEON (src/simd_neon.rs:819):
-pub struct F64x8(pub [float64x2_t; 4]);   // 4 × 128-bit NEON lanes
-
-// Scalar fallback (simd.rs dispatch, target = other):
-pub struct F64x8([f64; 8]);
-```
-
-**Stable constructor and accessor methods** (identical signature on all four
-backends — this uniformity IS the contract):
-
-```rust
-impl F64x8 {
-    pub fn splat(v: f64) -> Self;
-    pub fn from_slice(s: &[f64]) -> Self;   // reads first 8 elements
-    pub fn from_array(arr: [f64; 8]) -> Self;
-    pub fn to_array(self) -> [f64; 8];
-    pub fn reduce_sum(self) -> f64;
-    pub fn mul_add(self, b: Self, c: Self) -> Self;  // FMA: self*b + c
-}
-
-// Arithmetic traits (all backends):
-impl Add<F64x8> for F64x8 { type Output = F64x8; }
-impl Sub<F64x8> for F64x8 { type Output = F64x8; }
-impl Mul<F64x8> for F64x8 { type Output = F64x8; }
-impl Div<F64x8> for F64x8 { type Output = F64x8; }
-impl AddAssign<F64x8> for F64x8 { }
-impl SubAssign<F64x8> for F64x8 { }
-impl MulAssign<F64x8> for F64x8 { }
-impl DivAssign<F64x8> for F64x8 { }
-impl Neg for F64x8 { type Output = F64x8; }
-impl PartialEq for F64x8 { }
-impl Default for F64x8 { }
-impl fmt::Debug for F64x8 { }
-```
-
-**Semantics:** `F64x8` is an 8-wide lane of `f64` values. All arithmetic
-operations are element-wise. `mul_add(b, c)` computes `self * b + c` with
-FMA semantics where the hardware supports it; on backends lacking FMA the
-result may differ by up to one ULP from a strict fused multiply-add.
-`reduce_sum` returns the horizontal sum of all 8 lanes.
-
-**Consumer code pattern** (what downstream crates MUST write — the
-polyfill handles backend selection):
-
-```rust
-use ndarray::simd::F64x8;
-
-let va = F64x8::from_slice(&a[i*8..]);
-let vb = F64x8::from_slice(&b[i*8..]);
-let acc = va.mul_add(vb, acc);  // acc = va * vb + acc
-```
-
-Consumers MUST NOT import from `ndarray::simd_avx512`, `ndarray::simd_avx2`,
-or `ndarray::simd_neon` directly. Those modules are internal dispatch
-backends (see §3).
-
-**Numeric tolerance:** `F64x8` arithmetic results agree with IEEE 754
-double-precision arithmetic to within the rounding error introduced by the
-FMA instruction: at most 0.5 ULP per operation. For a dot product of length
-`n` computed via `mul_add` + `reduce_sum`, the accumulated error is bounded
-by `f64::EPSILON * n` relative to the scalar reference value computed with
-the same operands in the same order.
-
-### 2.3 `heel_f64x8::cosine_f64_simd`
-
-**File:** `src/hpc/heel_f64x8.rs:109`
-
-**Signature:**
-
-```rust
-pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64
-```
-
-**Semantics:**
-
-Computes the cosine similarity between vectors `a` and `b`:
-
-```
-cosine(a, b) = dot(a, b) / (||a||₂ × ||b||₂)
-```
-
-Returns a value in `[-1.0, 1.0]`. Returns `0.0` when either input is the
-zero vector (denominator < `1e-12`).
-
-The implementation processes 8 elements per SIMD iteration using `F64x8`
-FMA, then handles the scalar remainder. A single pass accumulates `dot`,
-`norm_a`, and `norm_b` simultaneously — no second pass over the data.
-
-**Numeric tolerance contract:**
-
-The SIMD result agrees with the scalar reference (naive `f64` loop over
-`dot`, `na`, `nb`, then `dot / (na * nb).sqrt()`) to within:
-
-```
-|cosine_simd(a,b) - cosine_scalar(a,b)| < f64::EPSILON * len
-```
-
-where `len = a.len().min(b.len())`. This contract is validated by the
-regression test `cosine_matches_scalar` in `src/hpc/heel_f64x8.rs:278`.
-
-**Invariants that must not change:**
-
-- The return type is always `f64`.
-- Slices of unequal length: only the `min(a.len(), b.len())` prefix is used.
-- Empty slices: both `a` and `b` of length 0 return `0.0` (zero-vector guard).
-- NaN propagation: if either input contains `NaN`, the result is `NaN`
-  (IEEE 754 semantics propagate through `F64x8` arithmetic).
-
-### 2.4 `heel_f64x8::l1_f64_simd` (aspirational frozen)
-
-**Status:** This function name is reserved in the stability commitment but
-does not yet exist as a standalone `pub fn` in `heel_f64x8.rs` at time of
-publication. The L1 norm capability exists in the codebase under different
-names (`hpc/bgz17_bridge.rs:419` as `Base17::l1`, `hpc/holo.rs:1981` as
-`focus_l1`), but a unified, slice-oriented `l1_f64_simd` kernel for the
-`ndarray::hpc::heel_f64x8` module is called for by the integration plan.
-
-**Intended signature when implemented:**
-
-```rust
-pub fn l1_f64_simd(a: &[f64], b: &[f64]) -> f64
-```
-
-**Intended semantics:**
-
-Computes the L1 (Manhattan) distance:
-
-```
-L1(a, b) = Σᵢ |aᵢ - bᵢ|
-```
-
-SIMD implementation using `F64x8`: compute element-wise absolute
-differences in 8-wide chunks, accumulate via `reduce_sum`.
-
-**Numeric tolerance contract (when implemented):**
-
-```
-|l1_simd(a,b) - l1_scalar(a,b)| ≤ f64::EPSILON * len
-```
-
-### 2.5 `heel_f64x8::l2_f64_simd` (aspirational frozen)
-
-**Status:** Reserved. Not yet implemented as a standalone function in
-`heel_f64x8.rs`. Existing L2/Euclidean distance kernels live in
-`src/hpc/distance.rs` (squared L2 for spatial point sets) and
-`src/hpc/cam_pq.rs` (as `squared_l2`, re-exported through `ndarray::simd`).
-
-**Intended signature when implemented:**
-
-```rust
-pub fn l2_f64_simd(a: &[f64], b: &[f64]) -> f64
-```
-
-**Intended semantics:**
-
-Computes the L2 (Euclidean) distance:
-
-```
-L2(a, b) = sqrt(Σᵢ (aᵢ - bᵢ)²)
-```
-
-SIMD implementation: accumulate squared differences via `F64x8::mul_add`,
-then `reduce_sum`, then scalar `sqrt`. Note: the sqrt is applied once after
-the vector accumulation — not inside the SIMD loop.
-
-**Numeric tolerance contract (when implemented):**
-
-```
-|l2_simd(a,b) - l2_scalar(a,b)| ≤ f64::EPSILON * len
-```
-
-### 2.6 `heel_f64x8::linf_f64_simd` (aspirational frozen)
-
-**Status:** Reserved. Not yet implemented as a standalone function.
-
-**Intended signature when implemented:**
-
-```rust
-pub fn linf_f64_simd(a: &[f64], b: &[f64]) -> f64
-```
-
-**Intended semantics:**
-
-Computes the L-infinity (Chebyshev) distance:
-
-```
-L∞(a, b) = max_i |aᵢ - bᵢ|
-```
-
-SIMD implementation: compute element-wise absolute differences via `F64x8`,
-reduce via element-wise max, then final horizontal max over 8 lanes.
-
-**Numeric tolerance contract (when implemented):**
-
-```
-|linf_simd(a,b) - linf_scalar(a,b)| = 0.0
-```
-
-The L-infinity distance is a pure selection (max of absolute values), which
-is exact under IEEE 754. No accumulation error is introduced.
-
-### 2.7 `hpc-extras` Cargo Feature
-
-**File:** `Cargo.toml:207`
-
-```toml
-hpc-extras = ["std", "dep:p64", "dep:fractal", "fractal/std"]
-```
-
-**Semantic contract:** Enabling `hpc-extras` (which is part of the `default`
-feature set — see `Cargo.toml:174`) activates the p64 palette / NARS bridge
-and the fractal manifold crates. The following modules become available:
-
-- `ndarray::hpc::spo_bundle`
-- `ndarray::hpc::deepnsm`
-- `ndarray::hpc::compression_curves`
-- `ndarray::hpc::crystal_encoder`
-- `ndarray::hpc::p64_bridge`
-
-**Stability contract for `hpc-extras`:**
-
-1. The feature name `hpc-extras` is frozen. It will not be renamed.
-2. Its implied set of features (`std`, `dep:p64`, `dep:fractal`) is frozen
-   in the sense that it will never be made _smaller_ without a semver bump
-   and deprecation period. The set may grow (new optional deps are additive).
-3. Consumers that build with `default-features = false` and do not re-enable
-   `hpc-extras` will continue to have a working build. The `hpc` module is
-   always available with `std`; only the `hpc-extras`-gated submodules
-   (listed above) disappear. The stable surface (`F64x8`, `cosine_f64_simd`,
-   etc.) is in the `std`-gated core and is NOT gated on `hpc-extras`.
-
----
-
-## 3. Internal / Unstable Surface
-
-The following items are **NOT part of the stable API**. They may change
-without notice between versions, including patch releases during active
-development. Downstream crates that depend on them are responsible for
-tracking changes.
-
-### 3.1 Backend Dispatch Modules
-
-```
-src/simd_avx512.rs    — AVX-512F + AVX-512VBMI intrinsics
-src/simd_avx2.rs      — AVX2 + fallback intrinsics
-src/simd_neon.rs      — ARM NEON paired-load implementation
-src/simd.rs           — compile-time + runtime dispatch glue
-```
-
-The internal layout of these modules — which intrinsic calls are used,
-which `#[target_feature]` guards appear, which helper types (`f64x4`,
-`float64x2_t`) are used to build `F64x8` — can change without notice.
-
-In particular, the VBMI dispatch path introduced in the SIMD review of
-2026-05-13 (see `.claude/board/SIMD_REVIEW_FIXES_2026_05_13.md`) added
-`avx512vbmi: bool` to `SimdCaps` and a runtime branch in
-`U8x64::permute_bytes`. Similar runtime dispatch adjustments within the
-polyfill internals are expected and explicitly not subject to stability
-guarantees.
-
-### 3.2 Auto-Dispatch Heuristics
-
-The `src/hpc/simd_caps.rs` singleton (`SimdCaps`) and the
-`src/hpc/simd_dispatch.rs` frozen function-pointer table are internal
-implementation details. They detect the host CPU at startup and route
-all SIMD operations to the best available backend.
-
-The exact detection logic (`is_x86_feature_detected!`, `cpuid` calls,
-`avx512vbmi` / `avx512f` branching) may change as new ISA extensions are
-added to the dispatch table. The contract for consumers is: write code
-using `ndarray::simd::F64x8` and the stable free functions; the dispatch
-layer guarantees correctness on all supported targets.
-
-### 3.3 Internal Scratch Buffers
-
-Several `heel_f64x8` helper functions (`cosine_f32_to_f64_simd` at
-`src/hpc/heel_f64x8.rs:149`) use stack-allocated scratch buffers of type
-`[f64; 8]` for widening conversions. The size, lifetime, and placement of
-these buffers are implementation details and may be refactored (e.g., moved
-into callers, replaced with SIMD widening intrinsics) without notice.
-
-### 3.4 The `hpc/` Submodule Inventory
-
-The following modules under `src/hpc/` are explicitly unstable:
-
-```
-src/hpc/ocr_simd.rs
-src/hpc/clam_compress.rs
-src/hpc/holo.rs           (carrier_distance_l1, focus_l1, focus_hamming — not the stable heel_f64x8 variants)
-src/hpc/packed.rs
-src/hpc/crystal_encoder.rs
-src/hpc/byte_scan.rs
-src/hpc/activations.rs
-src/hpc/framebuffer.rs
-src/hpc/cyclic_bundle.rs
-src/hpc/causality.rs
-src/hpc/nibble.rs
-src/hpc/arrow_bridge.rs
-src/hpc/vml.rs
-src/hpc/layered_distance.rs
-src/hpc/prefilter.rs
-src/hpc/surround_metadata.rs
-src/hpc/reductions.rs
-src/hpc/lapack.rs
-src/hpc/projection.rs
-src/hpc/compression_curves.rs
-src/hpc/simd_caps.rs
-src/hpc/simd_dispatch.rs
-src/hpc/gpt2/
-src/hpc/jina/
-src/hpc/stream/
-src/hpc/stable_diffusion/
-src/hpc/styles/
-```
-
-These modules are present for internal and research purposes. They do not
-participate in the stability commitment. Their interfaces may change, be
-removed, or be refactored into new modules at any time.
-
-### 3.5 `.cargo/config.toml` CPU Targeting
-
-The repository ships with `.cargo/config.toml` setting
-`target-cpu=x86-64-v4` (AVX-512 mandatory for x86_64 development builds).
-This is a developer convenience. Downstream consumers building on earlier
-microarchitectures must override this via their own `.cargo/config.toml` or
-`RUSTFLAGS`. The runtime dispatch in `simd_caps.rs` correctly falls back to
-AVX2, NEON, or scalar regardless of the compile-time `target-cpu` setting
-when the `#[target_feature]` guards are respected.
-
----
-
-## 4. What "Frozen" Means
-
-A symbol listed in §2 as stable has the following properties permanently
-guaranteed:
-
-### 4.1 No Signature Change
-
-The Rust function signature — including parameter types, return type,
-generic bounds, and `where` clauses — will not change without a semver major
-version bump. For `F64x8` methods, "signature" includes the `Self` type and
-all associated types.
-
-Examples of what is **not allowed** without a semver bump:
-
-```rust
-// FORBIDDEN — changing parameter type:
-// Was: pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64
-// Now: pub fn cosine_f64_simd(a: &[f64], b: &[f64], len: usize) -> f64
-
-// FORBIDDEN — changing return type:
-// Was: pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64
-// Now: pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f32
-
-// FORBIDDEN — adding generic parameters:
-// Was: pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64
-// Now: pub fn cosine_f64_simd<T: Float>(a: &[T], b: &[T]) -> T
-```
-
-### 4.2 No Rename
-
-The symbol name at the module path level will not change. If
-`ndarray::hpc::heel_f64x8::cosine_f64_simd` is the stable name, it stays at
-that path. Re-exporting it at a new path is additive and allowed; removing
-the original path is not.
-
-### 4.3 No Semantic Drift
-
-The mathematical semantics of a stable function will not change. In
-particular:
-
-- `cosine_f64_simd` will always return cosine similarity, never cosine
-  distance (`1.0 - cosine`), and will never change the zero-vector guard
-  threshold without a deprecation cycle.
-- `F64x8::reduce_sum` will always return the sum of all 8 lanes, not a
-  partial sum or a dot product.
-- `F64x8::mul_add(b, c)` will always compute `self * b + c`, not
-  `self + b * c`.
-
-### 4.4 New Variants Ship Next to Existing Ones
-
-When a capability needs to be extended or a performance-improved variant is
-introduced, the new symbol ships as an **additional** function with a new
-name, leaving the original untouched. The original is never removed or
-silently replaced.
-
-**Example — hypothetical FMA-specialized cosine:**
-
-```rust
-// Original (frozen, untouched):
-pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64 { /* ... */ }
-
-// New variant ships NEXT to original, never replaces it:
-pub fn cosine_f64_simd_fma(a: &[f64], b: &[f64]) -> f64 { /* fma-specialized */ }
-```
-
-Consumers can opt into the new variant at their own pace. Nothing breaks.
-
-### 4.5 Deprecation Timeline
-
-When a stable symbol must eventually be superseded, the procedure is:
-
-1. Add the replacement with a new name (additive).
-2. Mark the original `#[deprecated(since = "...", note = "use new_name")]`.
-3. Keep both symbols for at least two minor releases (or 90 calendar days,
-   whichever is longer).
-4. Only then may the deprecated symbol be moved to an internal module or
-   removed.
-
-No stable symbol has been deprecated as of 2026-05-18.
-
----
-
-## 5. Adding New Kernels — The Additive Pattern
-
-All growth of the HPC surface happens additively. The patterns below are
-the only approved ways to add new distance and SIMD kernels.
-
-### 5.1 New f32 Kernels — `F32x16` Pattern
-
-If an f32-width variant of the cosine / L1 / L2 / Linf kernels is needed,
-it ships in a new function (or in an extended `heel_f64x8.rs` section) using
-`ndarray::simd::F32x16` as the SIMD register type:
-
-```rust
-// In src/hpc/heel_f64x8.rs (additive — new function, old function untouched):
-pub fn cosine_f32_simd(a: &[f32], b: &[f32]) -> f32 { /* uses F32x16 */ }
-```
-
-Note that `cosine_f32_to_f64_simd` (which converts f32 inputs to f64
-internally) already exists at `src/hpc/heel_f64x8.rs:149` and is
-re-exported via `ndarray::simd::cosine_f32_to_f64_simd`
-(`src/simd.rs:1751`). A native f32-output variant would be a distinct,
-additional function.
-
-### 5.2 New Int8 Kernels — `heel_i8x32` Pattern
-
-Int8 distance metrics (for quantized embedding spaces) would ship in a new
-module:
-
-```
-src/hpc/heel_i8x32.rs      (new file — does not touch heel_f64x8.rs)
-```
-
-With a new Cargo feature gate if the dependency weight warrants it. The
-naming convention follows the existing heel prefix: `heel_i8x32`.
-
-**Expected public surface:**
-
-```rust
-// src/hpc/heel_i8x32.rs
-pub fn l1_i8_simd(a: &[i8], b: &[i8]) -> i64;
-pub fn dot_i8_simd(a: &[i8], b: &[i8]) -> i64;
-```
-
-The existing `ndarray::hpc::quantized` module (`src/hpc/quantized.rs`)
-provides `Int8Gemm` infrastructure that `heel_i8x32` would build on.
-
-### 5.3 Hamming on Binary Vectors — `heel_u8x32` Pattern
-
-Bit-level Hamming distance for dense binary vectors (e.g., binary
-quantized embeddings, CLAM binary tree codes) would ship in:
-
-```
-src/hpc/heel_u8x32.rs      (new file — additive)
-```
-
-**Expected public surface:**
-
-```rust
-// src/hpc/heel_u8x32.rs
-pub fn hamming_u8_simd(a: &[u8], b: &[u8]) -> u64;
-```
-
-Note: a scalar `hamming_distance_raw` already exists at
-`src/hpc/bitwise.rs:180`, and a DataFusion UDF wrapper at
-`src/hpc/udf_kernels.rs:49`. The `heel_u8x32::hamming_u8_simd` variant
-would be a new SIMD-accelerated standalone kernel using `ndarray::simd::U8x64`.
-
-### 5.4 Submodule Naming Convention
-
-All new heel-family kernels follow the convention:
-
-```
-heel_{type}x{lane_count}
-```
-
-| Submodule | Element type | Lane count | Register |
-|-----------|-------------|-----------|---------|
-| `heel_f64x8` (existing) | `f64` | 8 | `F64x8` |
-| `heel_f32x16` (planned) | `f32` | 16 | `F32x16` |
-| `heel_i8x32` (planned) | `i8` | 32 | (sub-byte SIMD) |
-| `heel_u8x32` (planned) | `u8` | 32 | `U8x64` (2-chunk) |
-
-### 5.5 Additive Rule Summary
-
-> **New capability = new symbol at new path. Never a signature change to
-> an existing stable symbol.**
-
-This rule applies to:
-- New functions in existing modules (added, not changed)
-- New modules alongside existing modules (added, not changed)
-- New Cargo features alongside existing features (added, not changed)
-- New type parameters on existing types (forbidden for stable types)
-
----
-
-## 6. Diamond-Dependency Guard
-
-### 6.1 The Problem
-
-Cargo unifies a crate to a single copy only when every dependent requests a
-semver-compatible version from the same source. When `surrealdb-core`
-and the ONNX runtime crate `ort` both depend on `ndarray`, they must agree
-on exactly which `ndarray` they are using — otherwise Rust generates two
-incompatible types both named `ndarray::Array2<f64>`, and the build fails
-at the type-system level when code tries to pass one to a function expecting
-the other.
-
-This fork exists precisely to solve that problem: by placing the
-AdaWorldAPI-extended ndarray at a pinned git revision under
-`[patch.crates-io]`, all crates in the workspace see the same ndarray.
-
-### 6.2 The Patch Contract
-
-In the surrealdb-core workspace `Cargo.toml` (and in any consumer that
-assembles surrealdb-core + lance-graph cognitive crates):
-
-```toml
-# In the root Cargo.toml of the consumer workspace:
-[patch.crates-io]
-ndarray = { git = "https://github.com/AdaWorldAPI/ndarray.git", branch = "main" }
-```
-
-This entry is the **contract**. Its presence makes the fork's stable API
-available to every crate in the build graph. Its **absence** or **change**
-breaks the fork.
-
-**What the patch replaces:** The upstream `ndarray` crate from crates.io
-(currently 0.16.x stable, later 0.17.x). Any workspace crate that specifies
-`ndarray = "0.16"` or `ndarray = "0.17"` in its own `[dependencies]` will
-silently receive this fork instead, because `[patch.crates-io]` overrides
-all version-matched dependencies.
-
-**What breaks if the patch is removed or points to the wrong commit:**
-
-1. surrealdb-core's vector index distance kernels lose access to
-   `ndarray::hpc::heel_f64x8::cosine_f64_simd` — linker error or type
-   mismatch.
-2. `ort` (the ONNX runtime Rust crate) may resolve to the upstream ndarray,
-   creating a second ndarray in the build graph. Downstream code that passes
-   `ndarray::Array` values between `ort` and surrealdb-core fails with
-   cryptic type errors like `expected ndarray::Array2<f64>, found
-   ndarray::Array2<f64>` (same name, different crate instance).
-3. The lance-graph cognitive crates lose access to `ndarray::simd::F64x8`
-   and all `hpc::` distance kernels.
-
-### 6.3 Version Pinning Strategy
-
-The `[patch.crates-io]` stanza should pin to a specific **tag** (not a
-floating branch name) in production deployments:
-
-```toml
-# Preferred for production:
-[patch.crates-io]
-ndarray = { git = "https://github.com/AdaWorldAPI/ndarray.git", tag = "v0.17.2-hpc-1" }
-
-# Acceptable for CI on main branch:
-[patch.crates-io]
-ndarray = { git = "https://github.com/AdaWorldAPI/ndarray.git", branch = "main" }
-```
-
-Floating branch pins (`branch = "main"`) are acceptable in CI but must not
-be used in published releases of surrealdb-core or lance-graph, as they
-make the build non-reproducible.
-
-### 6.4 ort Interop Invariant
-
-The ONNX runtime crate (`ort`, wrapped from the C++ ORT library) has its
-own optional ndarray integration. The fork's crate root (`src/lib.rs:313`)
-exposes `pub mod hpc` only under `#[cfg(feature = "std")]`.
-This means:
-
-- `ort` configurations that only need the core ndarray array types (no HPC)
-  continue to work: they depend on `ndarray::Array`, `ndarray::ArrayView`,
-  etc., which are unchanged from upstream.
-- `ort` configurations that use ndarray as a tensor interchange format
-  with surrealdb-core benefit from the fork's presence because all three
-  crates now share the same ndarray type identity.
-
-The fork adds ONLY new modules and features. It does not modify the core
-array types, layout types, or BLAS backends that `ort` depends on.
-
----
-
-## 7. CI Commitment
-
-### 7.1 Target Architecture Matrix
-
-The following cross-architecture matrix is the aspirational CI target. It
-documents the intended coverage; implementation of the full matrix in CI
-infrastructure is work in progress as of 2026-05-18.
-
-| Target triple | SIMD tier | `F64x8` backend | Status |
-|--------------|-----------|-----------------|--------|
-| `x86_64-unknown-linux-gnu` + AVX-512F | AVX-512 | `simd_avx512::F64x8` | Intended |
-| `x86_64-unknown-linux-gnu` + AVX2 (no AVX-512) | AVX2 | `simd_avx2::F64x8` | Intended |
-| `aarch64-unknown-linux-gnu` + NEON | NEON | `simd_neon::F64x8` | Intended |
-| `x86_64-unknown-linux-gnu` (scalar only) | Scalar | fallback `[f64; 8]` | Intended |
-| `thumbv6m-none-eabi` (no-std) | None (`hpc` disabled) | N/A | Intended |
-
-### 7.2 Doctest Coverage
-
-All stable public functions in §2 must have at least one doctest that
-compiles and runs correctly under `cargo test --doc`. For
-`cosine_f64_simd` this is currently covered only by unit tests, not by a
-doctest: the suite in `src/hpc/heel_f64x8.rs` has 8 unit tests, including
-`cosine_matches_scalar` at line 278, which verifies the numeric tolerance
-contract against a scalar reference.
-
-The aspirational goal is for each stable function to have a doctest visible
-in the rendered docs (i.e., in the `///` doc comment rather than only in
-`#[test]`). This requires that `cargo test --doc --features std` passes on
-all four SIMD tiers listed in §7.1.
-
-### 7.3 Test Command
-
-The current passing test invocation (1786 passing as of the SIMD review
-on 2026-05-13):
-
-```sh
-cargo test --features rayon --lib
-```
-
-The clippy clean invocation:
-
-```sh
-cargo clippy --features rayon -- -D warnings
-```
-
-Both must pass on every commit to the `main` branch that touches any file
-in the stable surface (§2). Changes to explicitly unstable modules (§3)
-are encouraged to pass both commands but are not gating.
-
-### 7.4 Numeric Regression Guard
-
-The tolerance assertions in `src/hpc/heel_f64x8.rs` (tests `cosine_matches_scalar`,
-`cosine_identical`, `cosine_opposite`, `cosine_orthogonal`) form the
-numeric regression guard for the stable API. These tests must not be
-weakened (loosened tolerance) or removed without a corresponding update to
-this document.
-
-The current observed error for `cosine_f64_simd` vs scalar on x86_64
-(tested at len=333 with trigonometric inputs) is less than `1e-10`. Note that
-`1e-10` is larger than the committed `f64::EPSILON * len` bound (`2.22e-16 * 333 = 7.4e-14`), so this figure alone does not confirm the bound.
-
----
-
-## 8. Cross-References to Integration Plans
-
-This stability commitment is informed by and consistent with four
-integration planning documents in the repository:
-
-### Plan 1: Lance-Graph DataFusion Integration
-
-**File:** `.claude/prompts/04_lance_graph_integration.md`
-
-This plan defines the DataFusion UDF layer that uses ndarray HPC kernels:
-
-| UDF Name | Underlying ndarray kernel |
-|----------|--------------------------|
-| `hamming` | `hpc::bitwise::hamming_distance_raw` (`src/hpc/bitwise.rs:180`) |
-| `spo_distance` | `hpc::node::Node::distance` |
-| `nars_revision` | `hpc::causality::NarsTruthValue::revision` |
-| `sigma_classify` | `hpc::cascade::Cascade::expose` |
-| `bf16_hamming` | `hpc::bf16_truth::bf16_hamming_scalar` |
-
-The document notes that ndarray provides the kernels; lance-graph provides
-the DataFusion UDF wrappers. This separation is architecturally correct and
-preserved: stable kernels in ndarray, UDF registration in lance-graph.
-
-The lance-graph repo's phase completion status (as of 2026-03-22):
-- Phase 1 (blasgraph CSC/Planner): DONE
-- Phase 2 (bgz17 container/semiring): DONE
-- Phase 3 (dual-path): NOT STARTED — depends on `heel_f64x8` stable surface
-- Phase 4 (FalkorDB retrofit): NOT STARTED
-
-The frozen `cosine_f64_simd` and the aspirational `l1_f64_simd`,
-`l2_f64_simd`, `linf_f64_simd` functions are the kernel requirements for
-Phase 3 to proceed.
-
-### Plan 2: SIMD Review and Soundness Fixes (2026-05-13)
-
-**File:** `.claude/board/SIMD_REVIEW_FIXES_2026_05_13.md`
-
-The 15-agent CCA2A review fleet identified three soundness/correctness
-issues and deferred a broader "cosmetic SIMD" sweep. The P0 SIGILL fix
-for `U8x64::permute_bytes` on AVX-512F-without-VBMI machines is directly
-relevant to the stability commitment: it demonstrates the mechanism by which
-the polyfill internals (AVX2/AVX-512/NEON dispatch paths) CAN change
-without the stable consumer API changing.
-
-The P0 fix added `avx512vbmi: bool` to `SimdCaps` and a runtime branch in
-`U8x64::permute_bytes`. The consumer API (`ndarray::simd::U8x64`) was
-unchanged. This is the correct pattern for all future backend changes.
-
-The deferred "cosmetic SIMD" item (scalar function bodies wearing
-`#[target_feature]` decorations) will be cleaned up when the polyfill
-is completed — `U8x64` / `F32x8` / etc. will have full method parity
-across AVX-512, AVX2, NEON, and scalar. Until then, those files remain
-in the explicitly-unstable category (§3.4).
-
-### Plan 3: SPO Bundle Simulation Findings
-
-**File:** `.claude/SPO_BUNDLE_FINDINGS_v2.md`
-
-This empirical study confirmed that majority-vote bundling at 8K and 16K
-bits is in the "dead zone" for ranking tasks (Spearman ρ ≈ 0.001 at 8K,
-ρ ≈ 0.417 at 16K). The ZeckF64 band encoding at 64 bits dominates both.
-
-This finding is relevant to stability because it validates that the
-distance kernels in `heel_f64x8` (cosine, and the aspirational L1/L2/Linf)
-are the correct abstraction boundary: they operate on f64 vectors, not on
-fixed-width binary bundles. The `heel_f64x8` module design is not expected
-to need binary-bundle variants (those live in `hpc::spo_bundle`,
-`hpc::cyclic_bundle`, and related unstable modules).
-
-### Plan 4: Architecture Rule (from CLAUDE.md)
-
-**File:** `CLAUDE.md` (repository root, referenced in agent instructions)
-
-The architecture rule is:
-
-```
-ndarray = hardware (SIMD, Palette, Base17, SpoDistanceMatrices, read_bgz7_file)
-lance-graph = thinking (NarsTruth, NarsEngine, TripleModel, AutocompleteCache)
-causal-edge = protocol (CausalEdge64, NarsTables, forward/learn)
-p64 = convergence highway (both repos meet here)
-```
-
-The stable API in §2 maps directly to the "hardware" layer: `F64x8` is raw
-SIMD register abstraction, `cosine_f64_simd` is a distance kernel. Both
-are pure compute with no reasoning logic embedded. This architectural
-separation is explicitly preserved by the stability commitment: stable symbols
-in `ndarray::hpc::heel_f64x8` and `ndarray::simd` will not acquire
-reasoning semantics (NarsTruth weighting, cascade band classification, etc.).
-Those belong in lance-graph.
-
----
-
-## 9. Appendix: Numeric Tolerance Derivation
-
-### 9.1 IEEE 754 Error Accumulation
-
-For a dot product computed via FMA `mul_add` over `n` elements:
-
-```
-acc_0 = 0
-acc_i = acc_{i-1} + a_{chunk} * b_{chunk}    (FMA in each SIMD lane)
-```
-
-Each `mul_add` introduces at most 0.5 ULP error relative to the exact
-result of `a * b + acc`. After `n/8` iterations (one per 8-wide chunk),
-the accumulated error is bounded by:
-
-```
-|sum_SIMD - sum_exact| ≤ (n/8) × 0.5 × ε_mach × |exact_sum|
-```
-
-where `ε_mach = f64::EPSILON = 2.220446049250313e-16`.
-
-For the cosine similarity specifically, three accumulators (dot, na, nb)
-each accumulate independently, then the error in the final result
-`dot / sqrt(na * nb)` is bounded (by first-order error analysis) by
-approximately `3 × (n/8) × 0.5 × ε_mach`, which for large `n` is still
-well within the committed `ε_mach × n` bound.
-
-### 9.2 Observed vs Committed Tolerance
-
-| Function | Vector length tested | Observed max error | Committed bound |
-|----------|---------------------|--------------------|-----------------|
-| `cosine_f64_simd` | 333 | `< 1e-10` | `ε × 333 ≈ 7.4e-14` |
-| `cosine_f64_simd` | 1024 | `< 1e-10` (self-cosine = 1.0) | `ε × 1024 ≈ 2.3e-13` |
-| `cosine_f64_simd` | 256 | `< 1e-10` (orthogonal = 0.0) | `ε × 256 ≈ 5.7e-14` |
-
-The observed figure of `< 1e-10` is only an upper limit on the error, and it
-is roughly 3 orders of magnitude larger than the committed bound (`ε × len`),
-so it does not by itself show that the bound is met. The committed bound is meant
-to cover worst-case inputs (e.g., catastrophic cancellation) and is expected to be met with headroom for typical embedding inputs.
-
-### 9.3 Zero-Vector Guard Threshold
-
-The zero-vector guard (`denom < 1e-12`) is part of the semantic contract
-for `cosine_f64_simd`. The threshold `1e-12` was chosen to be:
-- Above the rounding noise for zero vectors computed via FMA
-  (`n` multiplications of `0.0`, resulting in exactly `0.0`)
-- Below the smallest meaningful norm of a non-zero embedding vector
-  used in practice (`min_norm ≫ 1e-6` for normalized unit vectors,
-  `min_norm ≫ 1e-3` for un-normalized language model embeddings)
-
-This threshold is frozen and will not change without a deprecation notice.
-
----
-
-*End of document. Maintained by the AdaWorldAPI/ndarray HPC team.*
-*For questions: open an issue at https://github.com/AdaWorldAPI/ndarray*
diff --git a/src/hpc/heel_f64x8.rs b/src/hpc/heel_f64x8.rs
index 1b3160ec..87ff42bb 100644
--- a/src/hpc/heel_f64x8.rs
+++ b/src/hpc/heel_f64x8.rs
@@ -192,161 +192,6 @@ pub fn cosine_f32_to_f64_simd(a: &[f32], b: &[f32]) -> f64 {
     }
 }
 
-// ═══════════════════════════════════════════════════════════════════════════
-// Stable distance kernels — L1, L2, L∞ — PP-15 / integration plan §5
-// ═══════════════════════════════════════════════════════════════════════════
-
-/// L1 (Manhattan) distance between two equal-length f64 slices.
-///
-/// Returns Σ |a[i] - b[i]| over the first `min(a.len(), b.len())` elements:
-/// full 8-wide chunks go through `F64x8`, the `n % 8` tail through a scalar loop.
-///
-/// # Stability
-///
-/// Public stable surface per `docs/hpc-stability.md`. Signature is
-/// frozen: no rename, no semantic drift.
-///
-/// # Panics (debug only)
-///
-/// Panics in debug builds if `a.len() != b.len()`; without debug assertions the surplus elements of the longer slice are ignored.
-pub fn l1_f64_simd(a: &[f64], b: &[f64]) -> f64 {
-    debug_assert_eq!(a.len(), b.len());
-    let n = a.len().min(b.len()); // shorter length keeps all indexing in bounds if lengths differ
-    let chunks = n / 8;
-    let remainder = n % 8;
-
-    let mut acc = F64x8::splat(0.0);
-    for i in 0..chunks {
-        let va = F64x8::from_slice(&a[i * 8..]); // slice runs to the end of `a`; presumably only 8 values are loaded — confirm in F64x8
-        let vb = F64x8::from_slice(&b[i * 8..]);
-        let diff = va - vb;
-        acc = acc + diff.abs(); // acc += |a - b| lane-wise
-    }
-    let mut sum = acc.reduce_sum(); // collapse the lane accumulators into one scalar
-
-    // Scalar tail: the last `n % 8` elements that do not fill a whole 8-wide chunk
-    let offset = chunks * 8;
-    for i in 0..remainder {
-        sum += (a[offset + i] - b[offset + i]).abs();
-    }
-    sum
-}
-
-/// L2 (Euclidean) distance between two equal-length f64 slices.
-///
-/// Returns sqrt(Σ (a[i] - b[i])^2) over the first `min(a.len(), b.len())` elements; full 8-wide chunks use `F64x8::mul_add`, the `n % 8` tail a scalar loop.
-///
-/// # Stability
-///
-/// Public stable surface per `docs/hpc-stability.md`. Signature is
-/// frozen: no rename, no semantic drift.
-///
-/// # Panics (debug only)
-///
-/// Panics in debug builds if `a.len() != b.len()`; without debug assertions the surplus elements of the longer slice are ignored.
-pub fn l2_f64_simd(a: &[f64], b: &[f64]) -> f64 {
-    debug_assert_eq!(a.len(), b.len());
-    let n = a.len().min(b.len()); // shorter length keeps all indexing in bounds if lengths differ
-    let chunks = n / 8;
-    let remainder = n % 8;
-
-    let mut acc = F64x8::splat(0.0);
-    for i in 0..chunks {
-        let va = F64x8::from_slice(&a[i * 8..]); // slice runs to the end of `a`; presumably only 8 values are loaded — confirm in F64x8
-        let vb = F64x8::from_slice(&b[i * 8..]);
-        let diff = va - vb;
-        acc = diff.mul_add(diff, acc); // acc = diff*diff + acc (FMA)
-    }
-    let mut sum = acc.reduce_sum(); // collapse the lane accumulators into one scalar
-
-    // Scalar tail: squared differences of the last `n % 8` elements
-    let offset = chunks * 8;
-    for i in 0..remainder {
-        let d = a[offset + i] - b[offset + i];
-        sum += d * d;
-    }
-    sum.sqrt() // square root is taken once, after SIMD and tail contributions are combined
-}
-
-/// L_infinity (Chebyshev) distance between two equal-length f64 slices.
-///
-/// Returns max_i |a[i] - b[i]| over the first `min(a.len(), b.len())` elements; full 8-wide chunks use a lane-wise `F64x8` max, the `n % 8` tail a scalar loop.
-///
-/// # Stability
-///
-/// Public stable surface per `docs/hpc-stability.md`. Signature is
-/// frozen: no rename, no semantic drift.
-///
-/// # Panics (debug only)
-///
-/// Panics in debug builds if `a.len() != b.len()`; without debug assertions the surplus elements of the longer slice are ignored.
-pub fn linf_f64_simd(a: &[f64], b: &[f64]) -> f64 {
-    debug_assert_eq!(a.len(), b.len());
-    let n = a.len().min(b.len()); // shorter length keeps all indexing in bounds if lengths differ
-    let chunks = n / 8;
-    let remainder = n % 8;
-
-    let mut max_acc = F64x8::splat(0.0); // 0.0 is the starting floor for the running max of absolute differences
-    for i in 0..chunks {
-        let va = F64x8::from_slice(&a[i * 8..]); // slice runs to the end of `a`; presumably only 8 values are loaded — confirm in F64x8
-        let vb = F64x8::from_slice(&b[i * 8..]);
-        let diff = (va - vb).abs(); // |a - b| lane-wise
-        max_acc = max_acc.simd_max(diff); // running lane-wise max
-    }
-    let mut max_val = max_acc.reduce_max(); // collapse the per-lane maxima into one scalar
-
-    // Scalar tail: fold the last `n % 8` absolute differences into the running max
-    let offset = chunks * 8;
-    for i in 0..remainder {
-        let d = (a[offset + i] - b[offset + i]).abs();
-        if d > max_val { // a NaN `d` compares false here and is skipped
-            max_val = d;
-        }
-    }
-    max_val
-}
-
-#[cfg(test)]
-mod l1_l2_linf_tests {
-    use super::*;
-
-    #[test]
-    fn l1_zero_for_equal_inputs() {
-        let a = vec![1.0f64; 8]; // exactly one full 8-wide chunk, no scalar tail
-        let b = vec![1.0f64; 8];
-        let result = l1_f64_simd(&a, &b);
-        assert_eq!(result, 0.0, "L1 of identical vectors must be 0.0, got {}", result);
-    }
-
-    #[test]
-    fn l2_matches_scalar_reference() {
-        let a: Vec<f64> = (0..100).map(|i| (i as f64 * 0.1).sin()).collect(); // 100 = 12 full chunks + a tail of 4
-        let b: Vec<f64> = (0..100).map(|i| (i as f64 * 0.1).cos()).collect();
-
-        let simd_l2 = l2_f64_simd(&a, &b);
-
-        // Scalar reference: sequential sum of squared differences, then one sqrt
-        let scalar_sum: f64 = a.iter().zip(&b).map(|(x, y)| (x - y) * (x - y)).sum();
-        let scalar_l2 = scalar_sum.sqrt();
-
-        assert!(
-            (simd_l2 - scalar_l2).abs() < 100.0 * f64::EPSILON, // absolute tolerance of 100 × f64::EPSILON
-            "SIMD L2 {:.15} vs scalar L2 {:.15}, diff = {:.3e}",
-            simd_l2,
-            scalar_l2,
-            (simd_l2 - scalar_l2).abs()
-        );
-    }
-
-    #[test]
-    fn linf_picks_the_largest_gap() {
-        let a = vec![0.0f64, 0.0, 5.0, 0.0]; // 4 elements < 8: exercises only the scalar tail, not the chunk loop
-        let b = vec![0.0f64, 0.0, 0.0, 0.0];
-        let result = linf_f64_simd(&a, &b);
-        assert!((result - 5.0).abs() < f64::EPSILON, "L∞ should be 5.0, got {}", result);
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;