From d3863721b69bec0ed60e72ebc1430142de4a442b Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 19 May 2026 05:08:02 +0000 Subject: [PATCH] Revert "Merge pull request #160 from AdaWorldAPI/claude/lance-surrealdb-analysis-LXmug" This reverts commit 697fb96364b62e6ce7e9370b17640c4deb89e60c, reversing changes made to e63158ed77e44fe33d61dcff7810d999b2fbf01b. --- .claude/plans/integration-plan.md | 325 ----------- docs/hpc-api-inventory.md | 363 ------------ docs/hpc-stability.md | 914 ------------------------------ src/hpc/heel_f64x8.rs | 155 ----- 4 files changed, 1757 deletions(-) delete mode 100644 .claude/plans/integration-plan.md delete mode 100644 docs/hpc-api-inventory.md delete mode 100644 docs/hpc-stability.md diff --git a/.claude/plans/integration-plan.md b/.claude/plans/integration-plan.md deleted file mode 100644 index b94e65d3..00000000 --- a/.claude/plans/integration-plan.md +++ /dev/null @@ -1,325 +0,0 @@ -# Integration Plan: ndarray's role in the four-repo convergence - -**This repo**: `AdaWorldAPI/ndarray` — SIMD distance kernels + tensor primitives, shared across the stack. - -**Status**: planning document. Companion plans at the same path in the other repos: -- `AdaWorldAPI/lance-graph:.claude/plans/integration-plan.md` -- `AdaWorldAPI/surrealdb:.claude/plans/integration-plan.md` -- `AdaWorldAPI/sea-orm:.claude/plans/integration-plan.md` - ---- - -## 1. The convergence target - -Across all four repos: - -> *Foundry-style ontology + BEAM-style supervision + ClickHouse-style analytic + Postgres-style ACID + cognitive primitives — all on one Arrow substrate, surfaced to consumers as a typed sea-orm API.* - -Four glue crates close the gap: - -| # | Glue crate | Owner repo | Bridges | -|---|---|---|---| -| 1 | `surrealdb-ractor` | surrealdb | `cf` / live queries → ractor mailboxes | -| 2 | `lance-graph-tikv-provider` | lance-graph | TiKV ranges → Arrow `TableProvider` | -| 3 | `sea-orm-ractor` | sea-orm | `Entity::PK` → ractor process registry | -| 4 | `cognitive-shader-actor` | lance-graph | cognitive shaders → `ractor::Actor` adapter | - -**This repo owns no glue crate.** It owns the **shared low-level numeric substrate** that the other three depend on — SIMD distance kernels (cosine, L1, L2, Linf), `F64x8` polyfills, `heel_f64x8` helpers, `hpc-extras` feature. - -### Integration principle: additive contract shape (this repo IS the canonical case) - -**This repo is the load-bearing example of the contract-shape discipline.** Every symbol this repo exposes is consumed by surrealdb-core (`idx/trees/vector.rs`) and lance-graph cognitive crates (`bgz-tensor`, `holograph`, `deepnsm`, `causal-edge`). One signature change breaks the entire stack. The discipline: - -1. **Existing stable APIs never change signature.** Period. If a hypothetical improvement requires a different signature, the new signature ships as a new function next to the old one. The old function stays forever or for a 5+-version deprecation runway, whichever is longer. -2. **New kernels are added as new functions in new or existing modules.** Adding `F32x16` doesn't touch `F64x8`. Adding `hamming_u8_simd` doesn't touch `cosine_f64_simd`. -3. **Internal SIMD backends (AVX2/AVX-512/NEON paths) are not public surface.** They can change without notice. Only the public entry points are load-bearing. -4. **The `[patch.crates-io]` block in surrealdb's root Cargo.toml is the diamond-dep guard.** This repo's existence + that patch line is what makes downstream `ort` (ONNX runtime) link the same `ndarray` as surrealdb-core. Breaking the patch contract breaks ONNX interop. - -**Per-repo enforcement**: every Sprint item below is read as "add this; don't change what's there." - -### Contracts (existing + new) - -| Contract | Owner repo | Status today | This plan adds | -|---|---|---|---| -| `ndarray::hpc::F64x8` + `heel_f64x8::*` | **this repo** | 0.17 fork, stable per §5 below | **unchanged — only new kernels (e.g. `F32x16`, int8, Hamming) added in new symbols** | -| `[patch.crates-io] ndarray = ...` in surrealdb root Cargo.toml | surrealdb | active (diamond-dep guard) | not touched | -| `lance-graph-contract` (for cognitive shader / IR vocabulary) | lance-graph | 0.1.x → 0.2.0 additive | not touched by us | -| surrealdb `MvccSource` / `CfStream` | surrealdb | new additive traits | not touched by us | -| sea-orm `EntityActor` / `SelectArrowExt` | sea-orm | new additive trait/derive | not touched by us | - ---- - -## 2. Architecture diagram - -``` - ┌──────────────────────────────────────────┐ - │ consumer crate │ - └──────────────────┬───────────────────────┘ - │ typed entities - ▼ - ┌──────────────────────────────────────────┐ - │ sea-orm-arrow 2.0 │ - └────┬─────────────────┬───────────────┬───┘ - │ │ │ - ▼ ▼ ▼ - ┌───────────┐ ┌───────────┐ ┌───────────┐ - │ ractor │◄────│ surrealdb │ │lance-graph│ - │ (actors, │ #1 │ (cf + │ │ (Cypher, │ - │ mailboxes,│ │ live │ │ ontology, │ - │ supervis.)│ │ queries) │ │cognitive) │ - └─────┬─────┘ └─────┬─────┘ └─────┬─────┘ - │ #3 │ │ #2,#4 - ▼ ▼ ▼ - ┌─────────────────────────────────────────────┐ - │ TiKV substrate (Raft + Percolator) │ - └─────────────────────────────────────────────┘ - │ - ▼ - ┌────────────────────────────┐ - │ THIS REPO (ndarray) │ - │ - hpc-extras feature │ - │ - F64x8 polyfill │ - │ - heel_f64x8 distances │ - │ - diamond-dep guard │ - └────────────────────────────┘ -``` - ---- - -## 3. Role of ndarray in the integration - -This is the **shared low-level numeric substrate**. The AdaWorldAPI fork of ndarray 0.17 with `hpc-extras` lives at the bottom of the stack. Two direct consumers: - -1. **surrealdb-core** - - `core/Cargo.toml:71-77` — `vector-hpc` feature flips on cfg-gated dispatch in `idx/trees/vector.rs` - - `core/src/idx/trees/vector.rs` — distance helpers (l1/l2/linf) inlined here, using this repo's SIMD kernels - - Comment from surrealdb's root `Cargo.toml:88-93`: - > *Always the AdaWorldAPI fork — never crates.io. Direct git dep at the workspace level. Distance helpers (l1/l2/linf) are inlined in surrealdb/core/src/idx/trees/vector.rs.* - -2. **lance-graph cognitive crates** - - `crates/bgz-tensor/` — element-wise ops use ndarray's `Zip` + `F64x8` chunks - - `crates/holograph/` — holographic distance metrics - - `crates/deepnsm/` — neural state machine distance kernels - - `crates/causal-edge/` — causality scoring uses cosine over embedding vectors - -Indirectly via sea-orm and the planner, every vector / distance / similarity operation in the stack lands here. - ---- - -## 4. Current state — what makes this fork special - -### `F64x8` polyfill - -`hpc-extras` feature exposes an 8-wide `f64` SIMD vector type that works on: -- **x86_64 AVX-512** — native 8-wide -- **x86_64 AVX2** — two 4-wide ops, software-packed -- **aarch64 NEON** — two 4-wide via NEON 128-bit, software-packed -- **other archs** — scalar fallback - -This is the kernel both surrealdb's `idx/trees/vector.rs` and lance-graph's cognitive shaders rely on. - -### `heel_f64x8` distance kernels - -Functions composing `F64x8` chunks into a distance: - -``` -heel_f64x8::cosine_f64_simd(a: &[f64], b: &[f64]) -> f64 -heel_f64x8::l1_f64_simd (a: &[f64], b: &[f64]) -> f64 -heel_f64x8::l2_f64_simd (a: &[f64], b: &[f64]) -> f64 -heel_f64x8::linf_f64_simd (a: &[f64], b: &[f64]) -> f64 -``` - -### Diamond-dep guard - -The `[patch.crates-io]` block at the bottom of surrealdb's root `Cargo.toml`: - -```toml -[patch.crates-io] -ndarray = { git = "https://github.com/AdaWorldAPI/ndarray.git" } -``` - -ensures any transitive consumer of `ndarray = "0.17.x"` from crates.io lands on this fork. Without the patch, `ort` (ONNX runtime, optional `ml` feature in surrealdb) would link a separate `ndarray` and surrealdb-core would link this one — two distinct `TypeId`s, no interop. - -**This repo's existence is what makes the patch work.** Without it, the diamond-dep workaround has no target to redirect to. - -### The `lance-index` 0.16 gap (known) - -From surrealdb root `Cargo.toml:100-101`: - -> *Scope: 0.17 line only. `lance-index 4.0` depends on `ndarray = "0.16"`, a separate major version that this patch does not affect; eliminating that crates.io 0.16 entry requires upstream `lance-index` to bump.* - -**Plan**: watch upstream `lance-index` for the 0.17 bump (see §6 Sprint 2). When it lands, the diamond-dep guard becomes single-version-clean. - ---- - -## 5. API stability commitment (this repo's contract) - -This repo doesn't own a glue *crate* — it owns the **API contract that the SIMD layer of three downstream repos depends on**. The commitment is absolute: - -### Stable public surface (no break without major bump, none planned) - -| Symbol | Kind | -|---|---| -| `ndarray::hpc::F64x8` | type — layout, lane count (8) frozen | -| `ndarray::hpc::heel_f64x8::cosine_f64_simd(a, b) -> f64` | signature frozen | -| `ndarray::hpc::heel_f64x8::l1_f64_simd(a, b) -> f64` | signature frozen | -| `ndarray::hpc::heel_f64x8::l2_f64_simd(a, b) -> f64` | signature frozen | -| `ndarray::hpc::heel_f64x8::linf_f64_simd(a, b) -> f64` | signature frozen | -| feature `hpc-extras` | name + what it enables frozen | - -**"Frozen" means**: no signature change, no rename, no semantic drift. If we want to refine — e.g., a fused multiply-add variant of cosine — we add `cosine_f64_simd_fma(a, b) -> f64` as a NEW function. Both coexist forever (or 5+ versions, whichever is longer). - -### Internal / unstable - -- Polyfill backends (AVX2/AVX-512/NEON paths) — implementation detail -- Auto-dispatch heuristics — can change without notice -- Numeric tolerance in non-cancellation-prone paths — within `f64::EPSILON * len` of scalar reference - -### Doc commitment - -- Each stable function gets a doc-test -- Cross-arch behaviour documented in `docs/hpc-stability.md` (Sprint 0) -- A CI matrix runs the doc-tests on x86_64-AVX2, x86_64-AVX-512, aarch64-NEON, and scalar-fallback - ---- - -## 6. Sprint sequence (this repo) - -All work is **additive** — new symbols in new or existing modules; no existing symbol changes signature. - -### Sprint 0 — API freeze + doc (1 week) -- Mark stable APIs with `#[stable]`-style doc tag (custom attribute or doc-comment convention) -- Write `docs/hpc-stability.md` listing the commitment from §5 -- Add CI cross-arch doc-test matrix -- Cross-link from this plan - -### Sprint 1 — `bgz-tensor` direct coupling (1 week) -- `bgz-tensor` (lance-graph crate) takes a direct dep on this fork (additive: new dep line, no existing dep changes) -- Ensures `bgz-tensor` users always get the SIMD kernels regardless of feature-flag composition -- Coordinate with lance-graph plan §4 - -### Sprint 2 — `lance-index` 0.17 readiness (timing depends on upstream) -- Watch upstream `lance-index` for the 0.17 bump -- Have a forked `lance-index` 0.17 ready to slot in if upstream delays -- Once available, extend the surrealdb `[patch.crates-io]` block to cover both 0.16 (if still needed) and 0.17 -- This is purely additive on this repo's side (we add no symbols; we are the target of the patch) - -### Sprint 3 — additional kernels as needed (ad-hoc; all additive) -- Add `F32x16` polyfill if cognitive shaders migrate to f32 (NEW type, F64x8 unchanged) -- Add quantised int8 distance kernels for embedding compression (NEW module `heel_i8x32::*`) -- Add Hamming distance kernel for binary embeddings (NEW function `heel_u8x32::hamming_u8_simd`) - ---- - -## 7. Examples - -### Example 1 — surrealdb using the fork's SIMD - -```rust -// surrealdb/core/src/idx/trees/vector.rs — sketch of what's already wired -use ndarray::hpc::heel_f64x8; - -pub fn cosine_distance(a: &[f64], b: &[f64]) -> f64 { - debug_assert_eq!(a.len(), b.len()); - #[cfg(feature = "vector-hpc")] - { 1.0 - heel_f64x8::cosine_f64_simd(a, b) } - #[cfg(not(feature = "vector-hpc"))] - { scalar_cosine(a, b) } -} -``` - -### Example 2 — lance-graph cognitive shader using the fork - -```rust -// lance-graph/crates/holograph/src/distance.rs -use ndarray::hpc::heel_f64x8; -use crate::HolographEmbedding; - -impl HolographEmbedding { - pub fn similarity(&self, other: &Self) -> f64 { - heel_f64x8::cosine_f64_simd(self.as_slice(), other.as_slice()) - } -} -``` - -### Example 3 — `bgz-tensor` element-wise ops via the fork - -```rust -// lance-graph/crates/bgz-tensor/src/ops.rs -use ndarray::hpc::F64x8; -use ndarray::Zip; - -impl BgzTensor { - pub fn elementwise_mul(&self, other: &Self) -> Self { - let mut out = self.clone(); - Zip::from(&mut out.data) - .and(&other.data) - .for_each(|a, &b| *a *= b); - // F64x8-chunked path handled by ndarray's Zip internals for large tensors. - out - } -} -``` - -### Example 4 — The diamond-dep guard (replicated for cross-reference) - -```toml -# surrealdb root Cargo.toml (already in place; documented here so the -# fork knows what surfaces are load-bearing). -[patch.crates-io] -ndarray = { git = "https://github.com/AdaWorldAPI/ndarray.git" } -``` - -Without this patch: -- `ort` pulls `ndarray = "0.17.2"` from crates.io -- `surrealdb-core` pulls this fork -- They have distinct `TypeId`s → no interop between ONNX outputs and surrealdb's index code - -With this patch, both link the same crate. **This fork's stability is the diamond-dep fix.** - -### Example 5 — New kernel landing as a new symbol (additive) - -Hypothetical: a fused multiply-add cosine variant lands. Old + new coexist: - -```rust -// crates/ndarray/src/hpc/heel_f64x8.rs — new function, existing unchanged -pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64 { /* existing */ } - -/// FMA variant. Lower latency on AVX-512 + AVX2-FMA hosts. -/// Numerically identical within f64::EPSILON * len. -pub fn cosine_f64_simd_fma(a: &[f64], b: &[f64]) -> f64 { /* new */ } -``` - -Consumers pick. Nothing breaks. - ---- - -## 8. What this plan asks of the other repos - -Nothing structural — only that consumers stay on the stable surface (§5) and report breakage promptly. Specifically: - -- **surrealdb**: `idx/trees/vector.rs` should only use `ndarray::hpc::*` items listed in §5. Anything else is a non-stable detail and may break without notice. -- **lance-graph**: cognitive crates should use `heel_f64x8` distance kernels; if a kernel is missing (e.g. Hamming), file an issue here rather than implementing locally. -- **sea-orm**: no direct dep on this fork; touches it only transitively if a consumer uses sea-orm-arrow with `f64` Arrow columns. - ---- - -## 9. Open questions - -1. **`F32x16` priority** — is a cognitive shader consumer planning to move to f32? If yes, Sprint 3 fast-track. If no, defer. -2. **Quantised int8 distance kernels** — trigger Sprint 3 item when a concrete consumer surfaces. -3. **WASM target** — surrealdb has a WASM build path. Does it need `vector-hpc`? Today the scalar fallback covers it. Confirm with surrealdb plan. -4. **Numeric tolerance documentation** — currently "within `f64::EPSILON * len`"; doc-test it in Sprint 0. -5. **`#[stable]` attribute convention** — use Rust nightly `#[stable]` (not available on stable) or a doc-comment convention? Probably the latter for portability; revisit when nightly `#[stable]` stabilises. - ---- - -## 10. Cross-references - -- **Glue #1** (surrealdb-ractor): `AdaWorldAPI/surrealdb:.claude/plans/integration-plan.md` §5 -- **Glue #2** (TiKV TableProvider): `AdaWorldAPI/lance-graph:.claude/plans/integration-plan.md` §5 -- **Glue #3** (sea-orm-ractor): `AdaWorldAPI/sea-orm:.claude/plans/integration-plan.md` §5 -- **Glue #4** (cognitive-shader-actor): `AdaWorldAPI/lance-graph:.claude/plans/integration-plan.md` §6 -- **Cognitive crate consumers** (the load-bearing reason this fork exists): `AdaWorldAPI/lance-graph:.claude/plans/integration-plan.md` §3 + §4 -- **surrealdb's `vector-hpc` feature**: `AdaWorldAPI/surrealdb:.claude/plans/integration-plan.md` §4 (`core/Cargo.toml:71-77`) -- **`lance-projection` sibling** (analytic view of cognitive crate outputs): `AdaWorldAPI/surrealdb:.claude/plans/integration-plan.md` §6 diff --git a/docs/hpc-api-inventory.md b/docs/hpc-api-inventory.md deleted file mode 100644 index 2bb10e38..00000000 --- a/docs/hpc-api-inventory.md +++ /dev/null @@ -1,363 +0,0 @@ -# HPC API Inventory — AdaWorldAPI/ndarray fork - -**Generated**: 2026-05-18 -**Branch**: `claude/lance-surrealdb-analysis-LXmug` -**Purpose**: Catalogue of the existing public HPC surface relevant to the -lance-graph ↔ surrealdb ↔ ndarray integration plan §1 stable-surface commitment. - ---- - -## 1. Discovered HPC Modules - -The `src/hpc/` directory contains **~100 Rust source files** across flat and -nested layouts. Below are the modules relevant to distance computation and the -stable surface claimed by the integration plan. - -| Module | File | Notes | -|---|---|---| -| `hpc::heel_f64x8` | `src/hpc/heel_f64x8.rs` | **Primary distance surface** — SIMD cosine + HEEL plane Hamming | -| `hpc::distance` | `src/hpc/distance.rs` | Spatial k-NN, squared L2, radius filter (f32 AVX2 + f64 scalar) | -| `hpc::bitwise` | `src/hpc/bitwise.rs` | `hamming_distance_raw`, `popcount_raw`, batch Hamming + top-k | -| `hpc::palette_distance` | `src/hpc/palette_distance.rs` | Palette/SPO distance matrices (`SpoDistanceMatrices`) | -| `hpc::layered_distance` | `src/hpc/layered_distance.rs` | Lance-graph container layout (`[u64; 256]`), `palette_distance()` | -| `hpc::parallel_search` | `src/hpc/parallel_search.rs` | `parallel_search`, `lfd_from_palette`, `PaletteScope` | -| `hpc::cam_pq` | `src/hpc/cam_pq.rs` | `squared_l2`, `kmeans`, `CamCodebook`, `DistanceTables` | -| `hpc::blas_level1` | `src/hpc/blas_level1.rs` | `dot_f32/f64`, `nrm2_f32/f64`, `axpy_f32/f64`, `blas_rotg` | -| `hpc::vml` | `src/hpc/vml.rs` | `vsexp`, `vdexp`, `vsln`, `vdln`, `vssqrt`, `vdsqrt`, `vsabs`, `vdabs`, etc. | -| `hpc::reductions` | `src/hpc/reductions.rs` | `sum_f32/f64`, `mean_f32/f64`, `max/min_f32`, `argmax/argmin_f32`, `nrm2_f32` | -| `hpc::simd_caps` | `src/hpc/simd_caps.rs` | Runtime SIMD capability singleton | -| `hpc::simd_dispatch` | `src/hpc/simd_dispatch.rs` | `LazyLock`-frozen SIMD dispatch function pointers | -| `hpc::fingerprint` | `src/hpc/fingerprint.rs` | `Fingerprint`, `Fingerprint1K/2K/64K`, `VectorConfig` | -| `hpc::clam` | `src/hpc/clam.rs` | `knn_brute`, `ClamTree::build` | -| `hpc::prefilter` | `src/hpc/prefilter.rs` | `approx_hamming_candidates` | -| `hpc::cyclic_bundle` | `src/hpc/cyclic_bundle.rs` | `hamming_128`, `cyclic_shift`, `bundle_spo` | -| `hpc::zeck` | `src/hpc/zeck.rs` | ZeckF64 progressive edge encoding, `hamming_distance_raw` consumer | -| `hpc::holo` | `src/hpc/holo.rs` | Phase-space holographic ops: `focus_hamming`, `focus_l1`, `wasserstein_sorted_i8` | - -Additionally gated behind `feature = "hpc-extras"`: - -| Module | File | -|---|---| -| `hpc::spo_bundle` | `src/hpc/spo_bundle.rs` | -| `hpc::deepnsm` | `src/hpc/deepnsm.rs` | -| `hpc::compression_curves` | `src/hpc/compression_curves.rs` | -| `hpc::crystal_encoder` | `src/hpc/crystal_encoder.rs` | -| `hpc::p64_bridge` | `src/hpc/p64_bridge.rs` | - ---- - -## 2. F64x8 Type — Actual Definition - -### AVX-512 path (canonical production backend) - -**File**: `src/simd_avx512.rs` -**Line**: 304 (struct definition) / 314 (LANES constant) - -```rust -// src/simd_avx512.rs:302–304 -#[derive(Copy, Clone)] -#[repr(transparent)] -pub struct F64x8(pub __m512d); - -// src/simd_avx512.rs:314 -pub const LANES: usize = 8; -``` - -Repr: `__m512d` — a native 512-bit AVX-512 register holding 8 × `f64`. -Lane count: **8**. -Backing: `_mm512_loadu_pd` (unaligned load), `_mm512_storeu_pd` (unaligned store). - -Key methods available on `F64x8` (`src/simd_avx512.rs`): -`splat(v: f64)`, `from_slice(&[f64])`, `from_array([f64; 8])`, `to_array()`, -`copy_to_slice(&mut [f64])`, `reduce_sum()`, `reduce_min()`, `reduce_max()`, -`abs()`, `sqrt()`, `round()`, `floor()`, `mul_add(b, c)`, `simd_min/max/clamp`, -`simd_lt/le/gt/ge/eq/ne`, `to_bits()`, `from_bits()`. - -### AVX2 fallback (non-AVX-512 x86_64) - -**File**: `src/simd_avx2.rs` -The AVX2 path supplies `F64x8` as a polyfill backed by two `__m256d` (2 × 4 -lanes). Same public API surface as the AVX-512 variant; `impl_float_type!` macro -used at line ~820 of `simd_avx2.rs`. - -### Scalar fallback (non-x86 targets) - -**File**: `src/simd.rs`, scalar module (not-x86 cfg block, line ~789) -```rust -impl_float_type!(F64x8, f64, 8, F64Mask8, u8); -``` -Backed by `[f64; 8]`. Same API. - -### Re-export path - -`src/simd.rs:244` (AVX-512 path) / `src/simd.rs:280` (AVX2 fallback) / `src/simd.rs:1573` (NEON) -→ `pub use crate::simd::F64x8;` is the canonical consumer entry point. - ---- - -## 3. `heel_f64x8` Functions — Signatures and File:Line - -**File**: `src/hpc/heel_f64x8.rs` - -| Function | Signature | Line | Description | -|---|---|---|---| -| `heel_weighted_distance` | `(distances: &[f64; 8], weights: &[f64; 8]) -> f64` | 23 | Weighted dot via F64x8 FMA; single vmulpd+vreducepd on AVX-512 | -| `heel_plane_distances` | `(a: &[u64; 8], b: &[u64; 8]) -> [f64; 8]` | 34 | Hamming (popcount of XOR) per plane → 8 f64 distances | -| `heel_weighted_hamming` | `(a_planes: &[u64; 8], b_planes: &[u64; 8], weights: &[f64; 8]) -> f64` | 44 | Full pipeline: planes → per-plane Hamming → weighted dot | -| `dot_f64_simd` | `(a: &[f64], b: &[f64]) -> f64` | 64 | SIMD dot product; 8 f64 per iteration with FMA accumulation | -| `sum_sq_f64_simd` | `(a: &[f64]) -> f64` | 86 | Sum of squares via F64x8 FMA | -| `cosine_f64_simd` | `(a: &[f64], b: &[f64]) -> f64` | 109 | SIMD cosine similarity, single-pass dot+norms | -| `cosine_f32_to_f64_simd` | `(a: &[f32], b: &[f32]) -> f64` | 149 | f32 inputs, f64 precision cosine via scalar widening + F64x8 FMA | - -**Constants also defined**: -- `UNIFORM_WEIGHTS: [f64; 8] = [1.0; 8]` — line 50 -- `HEEL_7PLUS1_WEIGHTS: [f64; 8] = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5]` — line 54 - -### Integration plan claim vs reality - -The integration plan (`lance-graph/.claude/plans/integration-plan.md:32`) states: -> `hpc-extras` feature, `heel_f64x8::cosine_f64_simd` etc. - -And the contracts table (line 58) states: -> `ndarray::hpc::F64x8` + `heel_f64x8::*` — ndarray 0.17 fork, stable - -**Verdict: PRESENT and matches.** `cosine_f64_simd` exists at -`src/hpc/heel_f64x8.rs:109` with signature `(a: &[f64], b: &[f64]) -> f64`. - -**Additional functions the plan's "etc." implies but does not name explicitly -are also present**: `heel_weighted_hamming`, `heel_plane_distances`, -`heel_weighted_distance`, `dot_f64_simd`, `sum_sq_f64_simd`, `cosine_f32_to_f64_simd`. - ---- - -## 4. Other Distance Kernels Found - -### 4a. Hamming (binary Hamming distance) - -| Function | File | Line | Signature | -|---|---|---|---| -| `hamming_distance_raw` | `src/hpc/bitwise.rs` | 180 | `(a: &[u8], b: &[u8]) -> u64` | -| `popcount_raw` | `src/hpc/bitwise.rs` | 185 | `(a: &[u8]) -> u64` | -| `hamming_batch_raw` | `src/hpc/bitwise.rs` | 193 | `(query, database, num_rows, row_bytes) -> Vec` | -| `hamming_top_k_raw` | `src/hpc/bitwise.rs` | 201 | `(query, database, num_rows, row_bytes, k) -> Vec<(usize,u64)>` | -| `hamming_distance` | `src/simd_avx2.rs` | 276 | `(a: &[u8], b: &[u8]) -> u64` (AVX2-specific) | -| `hamming_batch` | `src/simd_avx2.rs` | 316 | `(query, database, num_rows, row_bytes) -> Vec` | -| `hamming_top_k` | `src/simd_avx2.rs` | 338 | `(query, database, num_rows, row_bytes, k) -> Vec<(usize,u64)>` | -| `hamming_128` | `src/hpc/cyclic_bundle.rs` | 153 | `(a: &[u64; N], b: &[u64; N]) -> u32` (128×64-bit) | -| `hamming_u8x16` | `src/simd_neon.rs` | 74 | `unsafe (a: &[u8; 16], b: &[u8; 16]) -> u32` (NEON) | -| `focus_hamming` | `src/hpc/holo.rs` | ~1951 | `(a: &[u8], b: &[u8], mask_x, mask_y, mask_z) -> (u64, u32)` | -| `approx_hamming_candidates` | `src/hpc/prefilter.rs` | 252 | `(query, db, bytes_per_vec, n_vectors, k) -> Vec<(usize,u32)>` | - -Re-exported to `ndarray::simd` namespace: -- `src/simd.rs:1714`: `pub use crate::hpc::bitwise::{hamming_distance_raw, popcount_raw};` - -### 4b. Cosine Similarity - -| Function | File | Line | Signature | -|---|---|---|---| -| `cosine_f64_simd` | `src/hpc/heel_f64x8.rs` | 109 | `(a: &[f64], b: &[f64]) -> f64` | -| `cosine_f32_to_f64_simd` | `src/hpc/heel_f64x8.rs` | 149 | `(a: &[f32], b: &[f32]) -> f64` | - -Re-exported to `ndarray::simd` namespace: -- `src/simd.rs:1751`: `pub use crate::hpc::heel_f64x8::cosine_f32_to_f64_simd;` -- `cosine_f64_simd` is **NOT** re-exported at the `ndarray::simd` level (only `cosine_f32_to_f64_simd` is). Consumers must import directly from `ndarray::hpc::heel_f64x8::cosine_f64_simd`. - -### 4c. L2 / Squared L2 - -| Function | File | Line | Signature | -|---|---|---|---| -| `squared_l2` | `src/hpc/cam_pq.rs` | 473 | `(a: &[f32], b: &[f32]) -> f32` | -| `squared_distances_f32` | `src/hpc/distance.rs` | 98 | `(query: [f32;3], points: &[[f32;3]]) -> Vec` | -| `squared_distances_f64` | `src/hpc/distance.rs` | 142 | `(query: [f64;3], points: &[[f64;3]]) -> Vec` | -| `knn_f32` | `src/hpc/distance.rs` | 124 | `(query: [f32;3], points: &[[f32;3]], k) -> (Vec, Vec)` | -| `knn_f64` | `src/hpc/distance.rs` | 158 | `(query: [f64;3], points: &[[f64;3]], k) -> (Vec, Vec)` | -| `filter_by_radius_sq` | `src/hpc/distance.rs` | 113 | `(query: [f32;3], points, radius_sq) -> Vec` | -| `filter_by_radius_sq_f64` | `src/hpc/distance.rs` | 147 | `(query: [f64;3], points, radius_sq) -> Vec` | - -Re-exported to `ndarray::simd` namespace: -- `src/simd.rs:1747`: `pub use crate::hpc::cam_pq::{kmeans, squared_l2};` - -### 4d. L1 (Holographic / phase-space variants) - -**No generic `l1_f64_simd` or `l1_f64` free function exists at the top-level HPC surface.** - -L1-style distance found only in specialized contexts: - -| Function | File | Signature | Context | -|---|---|---|---| -| `focus_l1` | `src/hpc/holo.rs` | `(a: &[u8], b: &[u8], mask_x, mask_y, mask_z) -> (u64, u32)` | Holographic phase-masked L1 | -| `wasserstein_sorted_i8` | `src/hpc/holo.rs` | `(a: &[u8], b: &[u8]) -> u64` | Wasserstein-style L1 distance | -| `carrier_distance_l1` | `src/hpc/holo.rs` | `(a: &[i8], b: &[i8]) -> u64` | Carrier-wave L1 distance | -| `histogram_l1_distance` | `src/hpc/holo.rs` | `(a: &[u16;16], b: &[u16;16]) -> u32` | Histogram L1 | -| `asum_f32` / `asum_f64` | `src/simd_avx2.rs` | `(x: &[f32]) -> f32` | L1 norm (sum of absolutes), not pairwise distance | - -### 4e. Linf (Chebyshev) Distance - -**No `linf_f64_simd` or generic Linf pairwise distance function exists in the HPC surface.** Not found anywhere in `src/`. `reduce_max()` on `F64x8` provides max-element reduction as a building block, but no composed Linf kernel is exported. - -### 4f. Dot Products (BLAS L1 level) - -| Function | File | Line | Signature | -|---|---|---|---| -| `dot_f32` | `src/simd_avx2.rs` | 56 | `(a: &[f32], b: &[f32]) -> f32` | -| `dot_f64` | `src/simd_avx2.rs` | 88 | `(a: &[f64], b: &[f64]) -> f64` | -| `dot_f64_simd` | `src/hpc/heel_f64x8.rs` | 64 | `(a: &[f64], b: &[f64]) -> f64` (F64x8 FMA path) | -| `dot_i8` | `src/simd_avx2.rs` | 406 | `(a: &[u8], b: &[u8]) -> i64` | - -### 4g. Palette / SPO Distance - -| Function | File | Signature | -|---|---|---| -| `palette_distance` | `src/hpc/layered_distance.rs:62` | `(dm: &SpoDistanceMatrices, a: &[u64;256], b: &[u64;256]) -> u32` | -| `SpoDistanceMatrices::spo_distance` | `src/hpc/palette_distance.rs:345` | `(&self, a_s, a_p, a_o, b_s, b_p, b_o) -> u32` | -| `DistanceTables::distance` | `src/hpc/cam_pq.rs:189` | `(&self, cam: &CamFingerprint) -> f32` | -| `parallel_search` | `src/hpc/parallel_search.rs:229` | `(scope: &PaletteScope, query: &PaletteEdge, k, gate: &TruthGate) -> Vec` | - ---- - -## 5. Feature Flags - -### `hpc-extras` (defined `Cargo.toml:207`) - -```toml -hpc-extras = ["std", "dep:p64", "dep:fractal", "fractal/std"] -``` - -Pulls in: `p64` (Palette64/3D attention NARS bridge) and `fractal` (manifold math). - -Modules **gated** behind `hpc-extras` (from `src/hpc/mod.rs`): -- `hpc::spo_bundle` (line 121) -- `hpc::deepnsm` (line 124) -- `hpc::compression_curves` (line 131) -- `hpc::crystal_encoder` (line 134) -- `hpc::p64_bridge` (line 141) -- `jitson_cranelift` sub-module (gated separately on `jit-native`) -- `splat3d` (gated separately on `splat3d`) -- The `e2e_tests` integration test block (line 252) - -**Default**: `hpc-extras` IS included in the crate default features (`Cargo.toml:174`): -```toml -default = ["std", "hpc-extras"] -``` - -Modules **not** gated behind `hpc-extras` (unconditionally compiled when `std` is on): -All of `heel_f64x8`, `distance`, `bitwise`, `blas_level1/2/3`, `cam_pq`, `fingerprint`, -`clam`, `prefilter`, `palette_distance`, `layered_distance`, `parallel_search`, -`holo`, `cyclic_bundle`, `vml`, `reductions`, etc. - -### Other relevant feature flags - -| Flag | Defined at | Purpose | -|---|---|---| -| `std` | `Cargo.toml:182` | Enables `hpc` module + blake3 for cognitive substrate | -| `native` | `Cargo.toml:219` | HPC backend: pure Rust + SIMD | -| `intel-mkl` | `Cargo.toml:220` | HPC backend: Intel MKL FFI (mutually exclusive with openblas) | -| `openblas` | `Cargo.toml:221` | HPC backend: OpenBLAS FFI (mutually exclusive with intel-mkl) | -| `jit-native` | `Cargo.toml:215` | Cranelift JIT backend | -| `splat3d` | `Cargo.toml:231` | CPU-SIMD 3D Gaussian Splatting | -| `nightly-simd` | `Cargo.toml:197` | Portable-SIMD miri-compatible backend (nightly only) | - ---- - -## 6. Cross-References to Consumers - -### lance-graph (`/home/user/lance-graph`) - -The integration plan (`integration-plan.md:58`) explicitly contracts: -> `ndarray::hpc::F64x8` + `heel_f64x8::*` — stable, unchanged - -Consuming crates in lance-graph that reference the ndarray HPC surface: - -| Consumer file | ndarray function used | Reference | -|---|---|---| -| `crates/lance-graph/src/graph/blasgraph/ndarray_bridge.rs` | `hamming_distance_raw`, `U8x64::nibble_popcount_lut` | knowledge doc W1b row | -| `crates/lance-graph/src/graph/neighborhood/zeckf64.rs` | ZeckF64 (ndarray canonical copy) | `hpc::zeck` | -| `crates/lance-graph-contract/src/mul.rs` | `I8x16::from_i4_packed_u64`, `batch_packed_i4_16` | W1a W1b plan | -| `crates/holograph/hamming.rs` | `hamming_distance_raw`, `U64x8::popcnt` (W1a planned) | knowledge doc W1b row | -| `crates/bgz17/src/simd.rs` | `U16x8::gather_u16` (W1a planned), `hamming_distance_raw` | knowledge doc W1b row | -| `crates/thinking-engine/src/engine.rs` | `BF16x16`, `simd_amx::*`, `Fingerprint<256>` | W1b VNNI route | - -Note: the **lance-graph knowledge doc** (`lance-graph/.claude/knowledge/ndarray-vertical-simd-alien-magic.md`) -specifies that `cosine_f64_simd` is part of the stable surface ("etc." in the plan) and that -no raw intrinsics should be used in consumer crates — all SIMD must flow through `ndarray::simd::*`. - -### surrealdb (`AdaWorldAPI/surrealdb`) - -Referenced via the integration plan; consuming `lance-graph-contract` which depends on ndarray. -The path is indirect: surrealdb → `lance-graph-contract` → ndarray. - -The surrealdb vector distance machinery lives at (plan reference, not audited locally): -`surrealdb/core/src/idx/trees/vector.rs` - -Plan claims this will consume `ndarray::hpc::heel_f64x8::cosine_f64_simd` via the -lance-graph-contract trait when wired. Not yet wired (integration §5 is a new crate, -`lance-graph-tikv-provider`, not vector indexing). - ---- - -## 7. Gap Analysis — Plan §1 Stable Surface vs Current Reality - -The integration plan's stable-surface table (`integration-plan.md:53–58`) claims: -> `ndarray::hpc::F64x8` + `heel_f64x8::*` — ndarray 0.17 fork, stable — unchanged: only new kernels added - -### Present and confirmed - -| Claimed API | Actual location | Status | -|---|---|---| -| `ndarray::hpc::F64x8` | `src/simd_avx512.rs:304` (AVX-512), polyfill at `simd_avx2.rs`, scalar in `simd.rs` | PRESENT | -| `F64x8::LANES = 8` | `src/simd_avx512.rs:314` | PRESENT | -| `F64x8::splat`, `from_slice`, `from_array`, `to_array`, `reduce_sum`, `mul_add`, `sqrt`, `abs` | `src/simd_avx512.rs:316–434` | PRESENT | -| `heel_f64x8::cosine_f64_simd` | `src/hpc/heel_f64x8.rs:109` | PRESENT — signature `(a: &[f64], b: &[f64]) -> f64` | -| `heel_f64x8::heel_weighted_hamming` | `src/hpc/heel_f64x8.rs:44` | PRESENT | -| `heel_f64x8::heel_plane_distances` | `src/hpc/heel_f64x8.rs:34` | PRESENT | -| `heel_f64x8::heel_weighted_distance` | `src/hpc/heel_f64x8.rs:23` | PRESENT | -| `heel_f64x8::dot_f64_simd` | `src/hpc/heel_f64x8.rs:64` | PRESENT | -| `heel_f64x8::cosine_f32_to_f64_simd` | `src/hpc/heel_f64x8.rs:149` | PRESENT; also re-exported at `ndarray::simd` level | -| `hamming_distance_raw` | `src/hpc/bitwise.rs:180`; re-exported `simd.rs:1714` | PRESENT | -| `squared_l2` | `src/hpc/cam_pq.rs:473`; re-exported `simd.rs:1747` | PRESENT | - -### Missing from the plan's implied surface - -| Claimed / implied API | Status | Notes | -|---|---|---| -| `l1_f64_simd` or generic pairwise L1 | **ABSENT** | Only L1-norm variants (`asum_f32/f64`) and specialized `focus_l1` / `carrier_distance_l1` / `wasserstein_sorted_i8` exist. No generic `l1_f64_simd(a: &[f64], b: &[f64]) -> f64`. | -| `l2_f64_simd` as free function | **ABSENT** | `squared_l2` exists for f32; no `l2_f64_simd(a: &[f64], b: &[f64]) -> f64` free function. The L2 distance on 3D points exists in `distance.rs` but is not a general-purpose slice kernel. | -| `linf_f64_simd` | **ABSENT** | No Linf / Chebyshev distance function at any level. | -| `cosine_f64_simd` re-export in `ndarray::simd` | **ABSENT** | `cosine_f32_to_f64_simd` IS re-exported at `simd.rs:1751`. `cosine_f64_simd` is **not** — consumers must import from `ndarray::hpc::heel_f64x8`. | -| `hamming_distance_raw` gating on `hpc-extras` | **NOT REQUIRED** — present unconditionally | `hamming_distance_raw` lives in `hpc::bitwise` which is not behind `hpc-extras`; always available with `std`. | - -### W1a primitives claimed by the knowledge doc — current status - -From `lance-graph/.claude/knowledge/ndarray-vertical-simd-alien-magic.md` §W1a table: - -| W1a primitive | Status in ndarray today | -|---|---| -| `I8x16::from_i4_packed_u64` | **ABSENT** — not in `simd_avx512.rs` or `simd_avx2.rs`; W1a PR pending | -| `I8x16::lane_i8::` | **ABSENT** — generic lane extractor not present | -| `I8x16::saturating_abs` | **ABSENT** — neither a free function nor method | -| `batch_packed_i4_16` | **ABSENT** — closure-batch entry point not present | -| `U64x8::xor_popcount` / `U64x8::popcnt` | **ABSENT** — `U64x8` type exists (`simd_avx512.rs:1964`, LANES=8) but `popcnt`/`xor_popcount` methods are not present | -| `U16x8::gather_u16` | **ABSENT** — `U16x32` exists; `U16x8` does not | -| `prefetch_read_t0/t1/t2` | **ABSENT** — no prefetch hint wrappers | -| `U8x32::nibble_popcount_lut` | **ABSENT** — `U8x64::nibble_popcount_lut` exists (`simd_avx512.rs` AVX-512 BITALG path); 32-byte parity is not implemented | - -All W1a items are **planned additions** (not yet committed), which is consistent with the -plan's statement that `heel_f64x8::*` is stable and "only new kernels added." - ---- - -## Summary - -The integration plan's §1 stable-surface commitment for ndarray resolves to: - -- **PRESENT and stable**: `F64x8` type (8-lane f64 SIMD), all `heel_f64x8::*` functions, - `hamming_distance_raw`, `squared_l2`, `Fingerprint`, `CamCodebook` / `DistanceTables`. -- **ABSENT (not yet added, plan-deferred)**: generic `l1_f64_simd`, `l2_f64_simd`, - `linf_f64_simd` free-function kernels; all W1a primitives - (`I8x16::from_i4_packed_u64`, `U64x8::popcnt`, `U16x8::gather_u16`, - `prefetch_read_t0`, `I8x16::saturating_abs`, `batch_packed_i4_16`). -- **PARTIAL re-export**: `cosine_f32_to_f64_simd` is re-exported at `ndarray::simd`; - `cosine_f64_simd` is **not** and requires a direct `ndarray::hpc::heel_f64x8` import. -- **`hpc-extras` scope**: The core distance surface (`heel_f64x8::*`, `bitwise::*`, - `cam_pq::*`, `distance::*`) does **not** require `hpc-extras`; only the p64/fractal - convergence modules do. diff --git a/docs/hpc-stability.md b/docs/hpc-stability.md deleted file mode 100644 index 4660a20f..00000000 --- a/docs/hpc-stability.md +++ /dev/null @@ -1,914 +0,0 @@ -# HPC API Stability Commitment — AdaWorldAPI/ndarray Fork - -> **2026-05-18 wave-2 update**: `l1_f64_simd`, `l2_f64_simd`, and -> `linf_f64_simd` were initially marked as "aspirational reserved names" -> in this doc because they were absent from the source at wave-1 time -> (per `docs/hpc-api-inventory.md`). Wave-2 commit `71cdbd4` -> ("feat(hpc): materialise l1_f64_simd, l2_f64_simd, linf_f64_simd in -> heel_f64x8") materialised all three with the exact signatures this doc -> promised, matching `cosine_f64_simd`'s F64x8-chunk + scalar-tail -> pattern. 15/15 `heel_f64x8` tests pass. The "Stable public surface" -> table below now describes REAL APIs — not promises. Reading order: -> the freeze commitment is now load-bearing, not aspirational. - - -**Status:** DRAFT — first published 2026-05-18 -**Branch at time of publication:** `claude/lance-surrealdb-analysis-LXmug` -**Applies to crate:** `ndarray` v0.17.x (AdaWorldAPI fork of rust-ndarray/ndarray) -**Rust edition:** 2021 / MSRV 1.95 stable - ---- - -## Table of Contents - -1. [Scope](#1-scope) -2. [Stable Public Surface](#2-stable-public-surface) -3. [Internal / Unstable Surface](#3-internal--unstable-surface) -4. [What "Frozen" Means](#4-what-frozen-means) -5. [Adding New Kernels — The Additive Pattern](#5-adding-new-kernels--the-additive-pattern) -6. [Diamond-Dependency Guard](#6-diamond-dependency-guard) -7. [CI Commitment](#7-ci-commitment) -8. [Cross-References to Integration Plans](#8-cross-references-to-integration-plans) -9. [Appendix: Numeric Tolerance Derivation](#9-appendix-numeric-tolerance-derivation) - ---- - -## 1. Scope - -### Why This Document Exists - -This fork of ndarray (`github.com/AdaWorldAPI/ndarray`) diverges from -upstream `rust-ndarray/ndarray` by adding a significant HPC layer under -`src/hpc/` (175+ Rust source files at time of writing) and a portable SIMD -abstraction layer in `src/simd.rs`, `src/simd_avx512.rs`, `src/simd_avx2.rs`, -and `src/simd_neon.rs`. - -The fork occupies a **load-bearing position in two dependency chains**: - -``` -surrealdb-core - └── [patch.crates-io] ndarray = { git = "...", branch = "..." } - └── ndarray::hpc::heel_f64x8 (vector index distance kernels) - └── ndarray::simd::F64x8 (SIMD register type) - -lance-graph cognitive crates - └── ndarray::hpc::heel_f64x8 (cosine/L1/L2/Linf distance) - └── ndarray::simd::F64x8 (F64x8 polyfill — AVX-512/AVX2/NEON) - └── ndarray::hpc::bitwise (Hamming / DataFusion UDFs) - └── ndarray::hpc::udf_kernels (DataFusion-registered UDFs) -``` - -**Signature breaks in this fork cascade silently into compilation failures in -surrealdb-core and lance-graph**, often manifesting only when the downstream -workspace links the ONNX runtime crate (`ort`) against a surrealdb-core that -now has a different ndarray ABI than `ort` expects. This is the classic Rust -diamond-dependency problem: two crates in the same build graph that each -depend on `ndarray` but at different versions or from different sources get -separate incompatible types even though they share a name. - -This document codifies which parts of the public surface are **frozen**, -which are explicitly **unstable**, and the rules that govern the addition of -new functionality without breaking existing consumers. - ---- - -## 2. Stable Public Surface - -### 2.1 Overview - -The following items constitute the **frozen, stable public API** of this fork. -Changes to any item in this table require a coordinated bump of `Cargo.toml` -`version` plus deprecation notices in all known downstreams before the old -signature is removed. - -| Symbol | Module path | Kind | -|--------|-------------|------| -| `F64x8` | `ndarray::simd::F64x8` (re-exported from `ndarray::hpc::heel_f64x8` via `ndarray::simd`) | `pub struct` | -| `cosine_f64_simd` | `ndarray::hpc::heel_f64x8::cosine_f64_simd` | `pub fn` | -| `l1_f64_simd` | `ndarray::hpc::heel_f64x8` (aspirational; see §2.3) | `pub fn` | -| `l2_f64_simd` | `ndarray::hpc::heel_f64x8` (aspirational; see §2.3) | `pub fn` | -| `linf_f64_simd` | `ndarray::hpc::heel_f64x8` (aspirational; see §2.3) | `pub fn` | -| `hpc-extras` | `Cargo.toml [features]` | Cargo feature | - -### 2.2 `ndarray::simd::F64x8` - -**File:** `src/simd_avx512.rs:304` (AVX-512 backend), -`src/simd_avx2.rs:815` (AVX2 backend), -`src/simd_neon.rs:819` (NEON backend), -unified dispatch in `src/simd.rs:244` / `src/simd.rs:280`. - -**Definition (canonical, backend-neutral):** - -```rust -// AVX-512 (src/simd_avx512.rs:304): -pub struct F64x8(pub __m512d); - -// AVX2 (src/simd_avx2.rs:815): -pub struct F64x8(pub f64x4, pub f64x4); // 2 × __m256d - -// NEON (src/simd_neon.rs:819): -pub struct F64x8(pub [float64x2_t; 4]); // 4 × 128-bit NEON lanes - -// Scalar fallback (simd.rs dispatch, target = other): -pub struct F64x8([f64; 8]); -``` - -**Stable constructor and accessor methods** (identical signature on all four -backends — this uniformity IS the contract): - -```rust -impl F64x8 { - pub fn splat(v: f64) -> Self; - pub fn from_slice(s: &[f64]) -> Self; // reads first 8 elements - pub fn from_array(arr: [f64; 8]) -> Self; - pub fn to_array(self) -> [f64; 8]; - pub fn reduce_sum(self) -> f64; - pub fn mul_add(self, b: Self, c: Self) -> Self; // FMA: self*b + c -} - -// Arithmetic traits (all backends): -impl Add for F64x8 { type Output = F64x8; } -impl Sub for F64x8 { type Output = F64x8; } -impl Mul for F64x8 { type Output = F64x8; } -impl Div for F64x8 { type Output = F64x8; } -impl AddAssign for F64x8 { } -impl SubAssign for F64x8 { } -impl MulAssign for F64x8 { } -impl DivAssign for F64x8 { } -impl Neg for F64x8 { type Output = F64x8; } -impl PartialEq for F64x8 { } -impl Default for F64x8 { } -impl fmt::Debug for F64x8 { } -``` - -**Semantics:** `F64x8` is an 8-wide lane of `f64` values. All arithmetic -operations are element-wise. `mul_add(b, c)` computes `self * b + c` with -FMA semantics where the hardware supports it; on backends lacking FMA the -result may differ by up to one ULP from a strict fused multiply-add. -`reduce_sum` returns the horizontal sum of all 8 lanes. - -**Consumer code pattern** (what downstream crates MUST write — the -polyfill handles backend selection): - -```rust -use ndarray::simd::F64x8; - -let va = F64x8::from_slice(&a[i*8..]); -let vb = F64x8::from_slice(&b[i*8..]); -let acc = va.mul_add(vb, acc); // acc = va * vb + acc -``` - -Consumers MUST NOT import from `ndarray::simd_avx512`, `ndarray::simd_avx2`, -or `ndarray::simd_neon` directly. Those modules are internal dispatch -backends (see §3). - -**Numeric tolerance:** `F64x8` arithmetic results agree with IEEE 754 -double-precision arithmetic to within the rounding error introduced by the -FMA instruction: at most 0.5 ULP per operation. For a dot product of length -`n` computed via `mul_add` + `reduce_sum`, the accumulated error is bounded -by `f64::EPSILON * n` relative to the scalar reference value computed with -the same operands in the same order. - -### 2.3 `heel_f64x8::cosine_f64_simd` - -**File:** `src/hpc/heel_f64x8.rs:109` - -**Signature:** - -```rust -pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64 -``` - -**Semantics:** - -Computes the cosine similarity between vectors `a` and `b`: - -``` -cosine(a, b) = dot(a, b) / (||a||₂ × ||b||₂) -``` - -Returns a value in `[-1.0, 1.0]`. Returns `0.0` when either input is the -zero vector (denominator < `1e-12`). - -The implementation processes 8 elements per SIMD iteration using `F64x8` -FMA, then handles the scalar remainder. A single pass accumulates `dot`, -`norm_a`, and `norm_b` simultaneously — no second pass over the data. - -**Numeric tolerance contract:** - -The SIMD result agrees with the scalar reference (naive `f64` loop over -`dot`, `na`, `nb`, then `dot / (na * nb).sqrt()`) to within: - -``` -|cosine_simd(a,b) - cosine_scalar(a,b)| < f64::EPSILON * len -``` - -where `len = a.len().min(b.len())`. This contract is validated by the -regression test `cosine_matches_scalar` in `src/hpc/heel_f64x8.rs:278`. - -**Invariants that must not change:** - -- The return type is always `f64`. -- Slices of unequal length: only the `min(a.len(), b.len())` prefix is used. -- Empty slices: both `a` and `b` of length 0 return `0.0` (zero-vector guard). -- NaN propagation: if either input contains `NaN`, the result is `NaN` - (IEEE 754 semantics propagate through `F64x8` arithmetic). - -### 2.4 `heel_f64x8::l1_f64_simd` (aspirational frozen) - -**Status:** This function name is reserved in the stability commitment but -does not yet exist as a standalone `pub fn` in `heel_f64x8.rs` at time of -publication. The L1 norm capability exists in the codebase under different -names (`hpc/bgz17_bridge.rs:419` as `Base17::l1`, `hpc/holo.rs:1981` as -`focus_l1`), but a unified, slice-oriented `l1_f64_simd` kernel for the -`ndarray::hpc::heel_f64x8` module is called for by the integration plan. - -**Intended signature when implemented:** - -```rust -pub fn l1_f64_simd(a: &[f64], b: &[f64]) -> f64 -``` - -**Intended semantics:** - -Computes the L1 (Manhattan) distance: - -``` -L1(a, b) = Σᵢ |aᵢ - bᵢ| -``` - -SIMD implementation using `F64x8`: compute element-wise absolute -differences in 8-wide chunks, accumulate via `reduce_sum`. - -**Numeric tolerance contract (when implemented):** - -``` -|l1_simd(a,b) - l1_scalar(a,b)| ≤ f64::EPSILON * len -``` - -### 2.5 `heel_f64x8::l2_f64_simd` (aspirational frozen) - -**Status:** Reserved. Not yet implemented as a standalone function in -`heel_f64x8.rs`. Existing L2/Euclidean distance kernels live in -`src/hpc/distance.rs` (squared L2 for spatial point sets) and -`src/hpc/cam_pq.rs` (as `squared_l2`, re-exported through `ndarray::simd`). - -**Intended signature when implemented:** - -```rust -pub fn l2_f64_simd(a: &[f64], b: &[f64]) -> f64 -``` - -**Intended semantics:** - -Computes the L2 (Euclidean) distance: - -``` -L2(a, b) = sqrt(Σᵢ (aᵢ - bᵢ)²) -``` - -SIMD implementation: accumulate squared differences via `F64x8::mul_add`, -then `reduce_sum`, then scalar `sqrt`. Note: the sqrt is applied once after -the vector accumulation — not inside the SIMD loop. - -**Numeric tolerance contract (when implemented):** - -``` -|l2_simd(a,b) - l2_scalar(a,b)| ≤ f64::EPSILON * len -``` - -### 2.6 `heel_f64x8::linf_f64_simd` (aspirational frozen) - -**Status:** Reserved. Not yet implemented as a standalone function. - -**Intended signature when implemented:** - -```rust -pub fn linf_f64_simd(a: &[f64], b: &[f64]) -> f64 -``` - -**Intended semantics:** - -Computes the L-infinity (Chebyshev) distance: - -``` -L∞(a, b) = max_i |aᵢ - bᵢ| -``` - -SIMD implementation: compute element-wise absolute differences via `F64x8`, -reduce via element-wise max, then final horizontal max over 8 lanes. - -**Numeric tolerance contract (when implemented):** - -``` -|linf_simd(a,b) - linf_scalar(a,b)| = 0.0 -``` - -The L-infinity distance is a pure selection (max of absolute values), which -is exact under IEEE 754. No accumulation error is introduced. - -### 2.7 `hpc-extras` Cargo Feature - -**File:** `Cargo.toml:207` - -```toml -hpc-extras = ["std", "dep:p64", "dep:fractal", "fractal/std"] -``` - -**Semantic contract:** Enabling `hpc-extras` (which is part of the `default` -feature set — see `Cargo.toml:174`) activates the p64 palette / NARS bridge -and the fractal manifold crates. The following modules become available: - -- `ndarray::hpc::spo_bundle` -- `ndarray::hpc::deepnsm` -- `ndarray::hpc::compression_curves` -- `ndarray::hpc::crystal_encoder` -- `ndarray::hpc::p64_bridge` - -**Stability contract for `hpc-extras`:** - -1. The feature name `hpc-extras` is frozen. It will not be renamed. -2. Its implied set of features (`std`, `dep:p64`, `dep:fractal`) is frozen - in the sense that it will never be made _smaller_ without a semver bump - and deprecation period. The set may grow (new optional deps are additive). -3. Consumers that build with `default-features = false` and do not re-enable - `hpc-extras` will continue to have a working build. The `hpc` module is - always available with `std`; only the `hpc-extras`-gated submodules - (listed above) disappear. The stable surface (`F64x8`, `cosine_f64_simd`, - etc.) is in the `std`-gated core and is NOT gated on `hpc-extras`. - ---- - -## 3. Internal / Unstable Surface - -The following items are **NOT part of the stable API**. They may change -without notice between versions, including patch releases during active -development. Downstream crates that depend on them are responsible for -tracking changes. - -### 3.1 Backend Dispatch Modules - -``` -src/simd_avx512.rs — AVX-512F + AVX-512VBMI intrinsics -src/simd_avx2.rs — AVX2 + fallback intrinsics -src/simd_neon.rs — ARM NEON paired-load implementation -src/simd.rs — compile-time + runtime dispatch glue -``` - -The internal layout of these modules — which intrinsic calls are used, -which `#[target_feature]` guards appear, which helper types (`f64x4`, -`float64x2_t`) are used to build `F64x8` — can change without notice. - -In particular, the VBMI dispatch path introduced in the SIMD review of -2026-05-13 (see `.claude/board/SIMD_REVIEW_FIXES_2026_05_13.md`) added -`avx512vbmi: bool` to `SimdCaps` and a runtime branch in -`U8x64::permute_bytes`. Similar runtime dispatch adjustments within the -polyfill internals are expected and explicitly not subject to stability -guarantees. - -### 3.2 Auto-Dispatch Heuristics - -The `src/hpc/simd_caps.rs` singleton (`SimdCaps`) and the -`src/hpc/simd_dispatch.rs` frozen function-pointer table are internal -implementation details. They detect the host CPU at startup and route -all SIMD operations to the best available backend. - -The exact detection logic (`is_x86_feature_detected!`, `cpuid` calls, -`avx512vbmi` / `avx512f` branching) may change as new ISA extensions are -added to the dispatch table. The contract for consumers is: write code -using `ndarray::simd::F64x8` and the stable free functions; the dispatch -layer guarantees correctness on all supported targets. - -### 3.3 Internal Scratch Buffers - -Several `heel_f64x8` helper functions (`cosine_f32_to_f64_simd` at -`src/hpc/heel_f64x8.rs:149`) use stack-allocated scratch buffers of type -`[f64; 8]` for widening conversions. The size, lifetime, and placement of -these buffers are implementation details and may be refactored (e.g., moved -into callers, replaced with SIMD widening intrinsics) without notice. - -### 3.4 The `hpc/` Submodule Inventory - -The following modules under `src/hpc/` are explicitly unstable: - -``` -src/hpc/ocr_simd.rs -src/hpc/clam_compress.rs -src/hpc/holo.rs (carrier_distance_l1, focus_l1, focus_hamming — not the stable heel_f64x8 variants) -src/hpc/packed.rs -src/hpc/crystal_encoder.rs -src/hpc/byte_scan.rs -src/hpc/activations.rs -src/hpc/framebuffer.rs -src/hpc/cyclic_bundle.rs -src/hpc/causality.rs -src/hpc/nibble.rs -src/hpc/arrow_bridge.rs -src/hpc/vml.rs -src/hpc/layered_distance.rs -src/hpc/prefilter.rs -src/hpc/surround_metadata.rs -src/hpc/reductions.rs -src/hpc/lapack.rs -src/hpc/projection.rs -src/hpc/compression_curves.rs -src/hpc/simd_caps.rs -src/hpc/simd_dispatch.rs -src/hpc/gpt2/ -src/hpc/jina/ -src/hpc/stream/ -src/hpc/stable_diffusion/ -src/hpc/styles/ -``` - -These modules are present for internal and research purposes. They do not -participate in the stability commitment. Their interfaces may change, be -removed, or be refactored into new modules at any time. - -### 3.5 `.cargo/config.toml` CPU Targeting - -The repository ships with `.cargo/config.toml` setting -`target-cpu=x86-64-v4` (AVX-512 mandatory for x86_64 development builds). -This is a developer convenience. Downstream consumers building on earlier -microarchitectures must override this via their own `.cargo/config.toml` or -`RUSTFLAGS`. The runtime dispatch in `simd_caps.rs` correctly falls back to -AVX2, NEON, or scalar regardless of the compile-time `target-cpu` setting -when the `#[target_feature]` guards are respected. - ---- - -## 4. What "Frozen" Means - -A symbol listed in §2 as stable has the following properties permanently -guaranteed: - -### 4.1 No Signature Change - -The Rust function signature — including parameter types, return type, -generic bounds, and `where` clauses — will not change without a semver major -version bump. For `F64x8` methods, "signature" includes the `Self` type and -all associated types. - -Examples of what is **not allowed** without a semver bump: - -```rust -// FORBIDDEN — changing parameter type: -// Was: pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64 -// Now: pub fn cosine_f64_simd(a: &[f64], b: &[f64], len: usize) -> f64 - -// FORBIDDEN — changing return type: -// Was: pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64 -// Now: pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f32 - -// FORBIDDEN — adding generic parameters: -// Was: pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64 -// Now: pub fn cosine_f64_simd(a: &[T], b: &[T]) -> T -``` - -### 4.2 No Rename - -The symbol name at the module path level will not change. If -`ndarray::hpc::heel_f64x8::cosine_f64_simd` is the stable name, it stays at -that path. Re-exporting it at a new path is additive and allowed; removing -the original re-export is not. - -### 4.3 No Semantic Drift - -The mathematical semantics of a stable function will not change. In -particular: - -- `cosine_f64_simd` will always return cosine similarity, never cosine - distance (`1.0 - cosine`), and will never change the zero-vector guard - threshold without a deprecation cycle. -- `F64x8::reduce_sum` will always return the sum of all 8 lanes, not a - partial sum or a dot product. -- `F64x8::mul_add(b, c)` will always compute `self * b + c`, not - `self + b * c`. - -### 4.4 New Variants Ship Next to Existing Ones - -When a capability needs to be extended or a performance-improved variant is -introduced, the new symbol ships as an **additional** function with a new -name, leaving the original untouched. The original is never removed or -silently replaced. - -**Example — hypothetical FMA-specialized cosine:** - -```rust -// Original (frozen, untouched): -pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64 { /* ... */ } - -// New variant ships NEXT to original, never replaces it: -pub fn cosine_f64_simd_fma(a: &[f64], b: &[f64]) -> f64 { /* fma-specialized */ } -``` - -Consumers can opt into the new variant at their own pace. Nothing breaks. - -### 4.5 Deprecation Timeline - -When a stable symbol must eventually be superseded, the procedure is: - -1. Add the replacement with a new name (additive). -2. Mark the original `#[deprecated(since = "...", note = "use new_name")]`. -3. Keep both symbols for at least two minor releases (or 90 calendar days, - whichever is longer). -4. Only then may the deprecated symbol be moved to an internal module or - removed. - -No stable symbol has been deprecated as of 2026-05-18. - ---- - -## 5. Adding New Kernels — The Additive Pattern - -All growth of the HPC surface happens additively. The patterns below are -the only approved ways to add new distance and SIMD kernels. - -### 5.1 New f32 Kernels — `F32x16` Pattern - -If an f32-width variant of the cosine / L1 / L2 / Linf kernels is needed, -it ships in a new function (or in an extended `heel_f64x8.rs` section) using -`ndarray::simd::F32x16` as the SIMD register type: - -```rust -// In src/hpc/heel_f64x8.rs (additive — new function, old function untouched): -pub fn cosine_f32_simd(a: &[f32], b: &[f32]) -> f32 { /* uses F32x16 */ } -``` - -Note that `cosine_f32_to_f64_simd` (which converts f32 inputs to f64 -internally) already exists at `src/hpc/heel_f64x8.rs:149` and is -re-exported via `ndarray::simd::cosine_f32_to_f64_simd` -(`src/simd.rs:1751`). A native f32-output variant would be a distinct, -additional function. - -### 5.2 New Int8 Kernels — `heel_i8x32` Pattern - -Int8 distance metrics (for quantized embedding spaces) would ship in a new -module: - -``` -src/hpc/heel_i8x32.rs (new file — does not touch heel_f64x8.rs) -``` - -With a new Cargo feature gate if the dependency weight warrants it. The -naming convention follows the existing heel prefix: `heel_i8x32`. - -**Expected public surface:** - -```rust -// src/hpc/heel_i8x32.rs -pub fn l1_i8_simd(a: &[i8], b: &[i8]) -> i64; -pub fn dot_i8_simd(a: &[i8], b: &[i8]) -> i64; -``` - -The existing `ndarray::hpc::hpc::quantized` module (`src/hpc/quantized.rs`) -provides `Int8Gemm` infrastructure that `heel_i8x32` would build on. - -### 5.3 Hamming on Binary Vectors — `heel_u8x32` Pattern - -Bit-level Hamming distance for dense binary vectors (e.g., binary -quantized embeddings, CLAM binary tree codes) would ship in: - -``` -src/hpc/heel_u8x32.rs (new file — additive) -``` - -**Expected public surface:** - -```rust -// src/hpc/heel_u8x32.rs -pub fn hamming_u8_simd(a: &[u8], b: &[u8]) -> u64; -``` - -Note: a scalar `hamming_distance_raw` already exists at -`src/hpc/bitwise.rs:180`, and a DataFusion UDF wrapper at -`src/hpc/udf_kernels.rs:49`. The `heel_u8x32::hamming_u8_simd` variant -would be a new SIMD-accelerated standalone kernel using `ndarray::simd::U8x64`. - -### 5.4 Submodule Naming Convention - -All new heel-family kernels follow the convention: - -``` -heel_{type}x{lane_count} -``` - -| Submodule | Element type | Lane count | Register | -|-----------|-------------|-----------|---------| -| `heel_f64x8` (existing) | `f64` | 8 | `F64x8` | -| `heel_f32x16` (planned) | `f32` | 16 | `F32x16` | -| `heel_i8x32` (planned) | `i8` | 32 | (sub-byte SIMD) | -| `heel_u8x32` (planned) | `u8` | 32 | `U8x64` (2-chunk) | - -### 5.5 Additive Rule Summary - -> **New capability = new symbol at new path. Never a signature change to -> an existing stable symbol.** - -This rule applies to: -- New functions in existing modules (added, not changed) -- New modules alongside existing modules (added, not changed) -- New Cargo features alongside existing features (added, not changed) -- New type parameters on existing types (forbidden for stable types) - ---- - -## 6. Diamond-Dependency Guard - -### 6.1 The Problem - -Rust's dependency resolution allows at most one version of a crate per -build graph when that crate is shared (not renamed). When `surrealdb-core` -and the ONNX runtime crate `ort` both depend on `ndarray`, they must agree -on exactly which `ndarray` they are using — otherwise Rust generates two -incompatible types both named `ndarray::Array2`, and the build fails -at the type-system level when code tries to pass one to a function expecting -the other. - -This fork exists precisely to solve that problem: by placing the -AdaWorldAPI-extended ndarray at a pinned git revision under -`[patch.crates-io]`, all crates in the workspace see the same ndarray. - -### 6.2 The Patch Contract - -In the surrealdb-core workspace `Cargo.toml` (and in any consumer that -assembles surrealdb-core + lance-graph cognitive crates): - -```toml -# In the root Cargo.toml of the consumer workspace: -[patch.crates-io] -ndarray = { git = "https://github.com/AdaWorldAPI/ndarray.git", branch = "main" } -``` - -This entry is the **contract**. Its presence makes the fork's stable API -available to every crate in the build graph. Its **absence** or **change** -breaks the fork. - -**What the patch replaces:** The upstream `ndarray` crate from crates.io -(currently 0.16.x stable, later 0.17.x). Any workspace crate that specifies -`ndarray = "0.16"` or `ndarray = "0.17"` in its own `[dependencies]` will -silently receive this fork instead, because `[patch.crates-io]` overrides -all version-matched dependencies. - -**What breaks if the patch is removed or points to the wrong commit:** - -1. surrealdb-core's vector index distance kernels lose access to - `ndarray::hpc::heel_f64x8::cosine_f64_simd` — linker error or type - mismatch. -2. `ort` (the ONNX runtime Rust crate) may resolve to the upstream ndarray, - creating a second ndarray in the build graph. Downstream code that passes - `ndarray::Array` values between `ort` and surrealdb-core fails with - cryptic type errors like `expected ndarray::Array2, found - ndarray::Array2` (same name, different crate instance). -3. The lance-graph cognitive crates lose access to `ndarray::simd::F64x8` - and all `hpc::` distance kernels. - -### 6.3 Version Pinning Strategy - -The `[patch.crates-io]` stanza should pin to a specific **tag** (not a -floating branch name) in production deployments: - -```toml -# Preferred for production: -[patch.crates-io] -ndarray = { git = "https://github.com/AdaWorldAPI/ndarray.git", tag = "v0.17.2-hpc-1" } - -# Acceptable for CI on main branch: -[patch.crates-io] -ndarray = { git = "https://github.com/AdaWorldAPI/ndarray.git", branch = "main" } -``` - -Floating branch pins (`branch = "main"`) are acceptable in CI but must not -be used in published releases of surrealdb-core or lance-graph, as they -make the build non-reproducible. - -### 6.4 ort Interop Invariant - -The ONNX runtime crate (`ort`, wrapped from the C++ ORT library) has its -own optional ndarray integration. The fork's `Cargo.toml` at -`src/lib.rs:313` exposes `pub mod hpc` only under `#[cfg(feature = "std")]`. -This means: - -- `ort` configurations that only need the core ndarray array types (no HPC) - continue to work: they depend on `ndarray::Array`, `ndarray::ArrayView`, - etc., which are unchanged from upstream. -- `ort` configurations that use ndarray as a tensor interchange format - with surrealdb-core benefit from the fork's presence because all three - crates now share the same ndarray type identity. - -The fork adds ONLY new modules and features. It does not modify the core -array types, layout types, or BLAS backends that `ort` depends on. - ---- - -## 7. CI Commitment - -### 7.1 Target Architecture Matrix - -The following cross-architecture matrix is the aspirational CI target. It -documents the intended coverage; implementation of the full matrix in CI -infrastructure is work in progress as of 2026-05-18. - -| Target triple | SIMD tier | `F64x8` backend | Status | -|--------------|-----------|-----------------|--------| -| `x86_64-unknown-linux-gnu` + AVX-512F | AVX-512 | `simd_avx512::F64x8` | Intended | -| `x86_64-unknown-linux-gnu` + AVX2 (no AVX-512) | AVX2 | `simd_avx2::F64x8` | Intended | -| `aarch64-unknown-linux-gnu` + NEON | NEON | `simd_neon::F64x8` | Intended | -| `x86_64-unknown-linux-gnu` (scalar only) | Scalar | fallback `[f64; 8]` | Intended | -| `thumbv6m-none-eabi` (no-std) | None (`hpc` disabled) | N/A | Intended | - -### 7.2 Doctest Coverage - -All stable public functions in §2 must have at least one doctest that -compiles and runs correctly under `cargo test --doc`. The current status -for `cosine_f64_simd` is satisfied via the test suite in -`src/hpc/heel_f64x8.rs` (8 unit tests, including `cosine_matches_scalar` -at line 278 which verifies the numeric tolerance contract against a scalar -reference). - -The aspirational goal is for each stable function to have a doctest visible -in the rendered docs (i.e., in the `///` doc comment rather than only in -`#[test]`). This requires that `cargo test --doc --features std` passes on -all four SIMD tiers listed in §7.1. - -### 7.3 Test Command - -The current passing test invocation (1786 passing as of the SIMD review -on 2026-05-13): - -```sh -cargo test --features rayon --lib -``` - -The clippy clean invocation: - -```sh -cargo clippy --features rayon -- -D warnings -``` - -Both must pass on every commit to the `main` branch that touches any file -in the stable surface (§2). Changes to explicitly unstable modules (§3) -are encouraged to pass both commands but are not gating. - -### 7.4 Numeric Regression Guard - -The tolerance assertions in `src/hpc/heel_f64x8.rs` (tests `cosine_matches_scalar`, -`cosine_identical`, `cosine_opposite`, `cosine_orthogonal`) form the -numeric regression guard for the stable API. These tests must not be -weakened (loosened tolerance) or removed without a corresponding update to -this document. - -The current observed tolerance for `cosine_f64_simd` vs scalar on x86_64 -(tested at len=333 with trigonometric inputs) is less than `1e-10`, well -within the committed `f64::EPSILON * len` bound (`2.22e-16 * 333 = 7.4e-14`). - ---- - -## 8. Cross-References to Integration Plans - -This stability commitment is informed by and consistent with four -integration planning documents in the repository: - -### Plan 1: Lance-Graph DataFusion Integration - -**File:** `.claude/prompts/04_lance_graph_integration.md` - -This plan defines the DataFusion UDF layer that uses ndarray HPC kernels: - -| UDF Name | Underlying ndarray kernel | -|----------|--------------------------| -| `hamming` | `hpc::bitwise::hamming_distance_raw` (`src/hpc/bitwise.rs:180`) | -| `spo_distance` | `hpc::node::Node::distance` | -| `nars_revision` | `hpc::causality::NarsTruthValue::revision` | -| `sigma_classify` | `hpc::cascade::Cascade::expose` | -| `bf16_hamming` | `hpc::bf16_truth::bf16_hamming_scalar` | - -The document notes that ndarray provides the kernels; lance-graph provides -the DataFusion UDF wrappers. This separation is architecturally correct and -preserved: stable kernels in ndarray, UDF registration in lance-graph. - -The lance-graph repo's phase completion status (as of 2026-03-22): -- Phase 1 (blasgraph CSC/Planner): DONE -- Phase 2 (bgz17 container/semiring): DONE -- Phase 3 (dual-path): NOT STARTED — depends on `heel_f64x8` stable surface -- Phase 4 (FalkorDB retrofit): NOT STARTED - -The frozen `cosine_f64_simd` and the aspirational `l1_f64_simd`, -`l2_f64_simd`, `linf_f64_simd` functions are the kernel requirements for -Phase 3 to proceed. - -### Plan 2: SIMD Review and Soundness Fixes (2026-05-13) - -**File:** `.claude/board/SIMD_REVIEW_FIXES_2026_05_13.md` - -The 15-agent CCA2A review fleet identified three soundness/correctness -issues and deferred a broader "cosmetic SIMD" sweep. The P0 SIGILL fix -for `U8x64::permute_bytes` on AVX-512F-without-VBMI machines is directly -relevant to the stability commitment: it demonstrates the mechanism by which -the polyfill internals (AVX2/AVX-512/NEON dispatch paths) CAN change -without the stable consumer API changing. - -The P0 fix added `avx512vbmi: bool` to `SimdCaps` and a runtime branch in -`U8x64::permute_bytes`. The consumer API (`ndarray::simd::U8x64`) was -unchanged. This is the correct pattern for all future backend changes. - -The deferred "cosmetic SIMD" item (scalar function bodies wearing -`#[target_feature]` decorations) will be cleaned up when the polyfill -is completed — `U8x64` / `F32x8` / etc. will have full method parity -across AVX-512, AVX2, NEON, and scalar. Until then, those files remain -in the explicitly-unstable category (§3.4). - -### Plan 3: SPO Bundle Simulation Findings - -**File:** `.claude/SPO_BUNDLE_FINDINGS_v2.md` - -This empirical study confirmed that majority-vote bundling at 8K and 16K -bits is in the "dead zone" for ranking tasks (Spearman ρ ≈ 0.001 at 8K, -ρ ≈ 0.417 at 16K). The ZeckF64 band encoding at 64 bits dominates both. - -This finding is relevant to stability because it validates that the -distance kernels in `heel_f64x8` (cosine, and the aspirational L1/L2/Linf) -are the correct abstraction boundary: they operate on f64 vectors, not on -fixed-width binary bundles. The `heel_f64x8` module design is not expected -to need binary-bundle variants (those live in `hpc::spo_bundle`, -`hpc::cyclic_bundle`, and related unstable modules). - -### Plan 4: Architecture Rule (from CLAUDE.md) - -**File:** `CLAUDE.md` (repository root, referenced in agent instructions) - -The architecture rule is: - -``` -ndarray = hardware (SIMD, Palette, Base17, SpoDistanceMatrices, read_bgz7_file) -lance-graph = thinking (NarsTruth, NarsEngine, TripleModel, AutocompleteCache) -causal-edge = protocol (CausalEdge64, NarsTables, forward/learn) -p64 = convergence highway (both repos meet here) -``` - -The stable API in §2 maps directly to the "hardware" layer: `F64x8` is raw -SIMD register abstraction, `cosine_f64_simd` is a distance kernel. Both -are pure compute with no reasoning logic embedded. This architectural -separation is explicitly preserved by the stability commitment: stable symbols -in `ndarray::hpc::heel_f64x8` and `ndarray::simd` will not acquire -reasoning semantics (NarsTruth weighting, cascade band classification, etc.). -Those belong in lance-graph. - ---- - -## 9. Appendix: Numeric Tolerance Derivation - -### 9.1 IEEE 754 Error Accumulation - -For a dot product computed via FMA `mul_add` over `n` elements: - -``` -acc_0 = 0 -acc_i = acc_{i-1} + a_{chunk} * b_{chunk} (FMA in each SIMD lane) -``` - -Each `mul_add` introduces at most 0.5 ULP error relative to the exact -result of `a * b + acc`. After `n/8` iterations (one per 8-wide chunk), -the accumulated error is bounded by: - -``` -|sum_SIMD - sum_exact| ≤ (n/8) × 0.5 × ε_mach × |exact_sum| -``` - -where `ε_mach = f64::EPSILON = 2.220446049250313e-16`. - -For the cosine similarity specifically, three accumulators (dot, na, nb) -each accumulate independently, then the error in the final result -`dot / sqrt(na * nb)` is bounded (by first-order error analysis) by -approximately `3 × (n/8) × 0.5 × ε_mach`, which for large `n` is still -well within the committed `ε_mach × n` bound. - -### 9.2 Observed vs Committed Tolerance - -| Function | Vector length tested | Observed max error | Committed bound | -|----------|---------------------|--------------------|-----------------| -| `cosine_f64_simd` | 333 | `< 1e-10` | `ε × 333 ≈ 7.4e-14` | -| `cosine_f64_simd` | 1024 | `< 1e-10` (self-cosine = 1.0) | `ε × 1024 ≈ 2.3e-13` | -| `cosine_f64_simd` | 256 | `< 1e-10` (orthogonal = 0.0) | `ε × 256 ≈ 5.7e-14` | - -The observed error of `< 1e-10` is approximately 6 orders of magnitude -below the committed bound. The generous committed bound (`ε × len`) allows -for worst-case inputs (e.g., catastrophic cancellation) while being met -with significant headroom for typical embedding inputs. - -### 9.3 Zero-Vector Guard Threshold - -The zero-vector guard (`denom < 1e-12`) is part of the semantic contract -for `cosine_f64_simd`. The threshold `1e-12` was chosen to be: -- Above the rounding noise for zero vectors computed via FMA - (`n` multiplications of `0.0`, resulting in exactly `0.0`) -- Below the smallest meaningful norm of a non-zero embedding vector - used in practice (`min_norm ≫ 1e-6` for normalized unit vectors, - `min_norm ≫ 1e-3` for un-normalized language model embeddings) - -This threshold is frozen and will not change without a deprecation notice. - ---- - -*End of document. Maintained by the AdaWorldAPI/ndarray HPC team.* -*For questions: open an issue at https://github.com/AdaWorldAPI/ndarray* diff --git a/src/hpc/heel_f64x8.rs b/src/hpc/heel_f64x8.rs index 1b3160ec..87ff42bb 100644 --- a/src/hpc/heel_f64x8.rs +++ b/src/hpc/heel_f64x8.rs @@ -192,161 +192,6 @@ pub fn cosine_f32_to_f64_simd(a: &[f32], b: &[f32]) -> f64 { } } -// ═══════════════════════════════════════════════════════════════════════════ -// Stable distance kernels — L1, L2, L∞ — PP-15 / integration plan §5 -// ═══════════════════════════════════════════════════════════════════════════ - -/// L1 (Manhattan) distance between two equal-length f64 slices. -/// -/// Returns Σ |a[i] - b[i]|. Numerically: scalar reduction, no -/// catastrophic cancellation. SIMD-accelerated via F64x8 chunks. -/// -/// # Stability -/// -/// Public stable surface per `docs/hpc-stability.md`. Signature is -/// frozen: no rename, no semantic drift. -/// -/// # Panics (debug only) -/// -/// Panics in debug builds if `a.len() != b.len()`. -pub fn l1_f64_simd(a: &[f64], b: &[f64]) -> f64 { - debug_assert_eq!(a.len(), b.len()); - let n = a.len().min(b.len()); - let chunks = n / 8; - let remainder = n % 8; - - let mut acc = F64x8::splat(0.0); - for i in 0..chunks { - let va = F64x8::from_slice(&a[i * 8..]); - let vb = F64x8::from_slice(&b[i * 8..]); - let diff = va - vb; - acc = acc + diff.abs(); // acc += |a - b| lane-wise - } - let mut sum = acc.reduce_sum(); - - // Scalar remainder - let offset = chunks * 8; - for i in 0..remainder { - sum += (a[offset + i] - b[offset + i]).abs(); - } - sum -} - -/// L2 (Euclidean) distance between two equal-length f64 slices. -/// -/// Returns sqrt(Σ (a[i] - b[i])^2). SIMD-accelerated via F64x8 FMA chunks. -/// -/// # Stability -/// -/// Public stable surface per `docs/hpc-stability.md`. Signature is -/// frozen: no rename, no semantic drift. -/// -/// # Panics (debug only) -/// -/// Panics in debug builds if `a.len() != b.len()`. -pub fn l2_f64_simd(a: &[f64], b: &[f64]) -> f64 { - debug_assert_eq!(a.len(), b.len()); - let n = a.len().min(b.len()); - let chunks = n / 8; - let remainder = n % 8; - - let mut acc = F64x8::splat(0.0); - for i in 0..chunks { - let va = F64x8::from_slice(&a[i * 8..]); - let vb = F64x8::from_slice(&b[i * 8..]); - let diff = va - vb; - acc = diff.mul_add(diff, acc); // acc = diff*diff + acc (FMA) - } - let mut sum = acc.reduce_sum(); - - // Scalar remainder - let offset = chunks * 8; - for i in 0..remainder { - let d = a[offset + i] - b[offset + i]; - sum += d * d; - } - sum.sqrt() -} - -/// L_infinity (Chebyshev) distance between two equal-length f64 slices. -/// -/// Returns max_i |a[i] - b[i]|. SIMD-accelerated via F64x8 lane-wise max. -/// -/// # Stability -/// -/// Public stable surface per `docs/hpc-stability.md`. Signature is -/// frozen: no rename, no semantic drift. -/// -/// # Panics (debug only) -/// -/// Panics in debug builds if `a.len() != b.len()`. -pub fn linf_f64_simd(a: &[f64], b: &[f64]) -> f64 { - debug_assert_eq!(a.len(), b.len()); - let n = a.len().min(b.len()); - let chunks = n / 8; - let remainder = n % 8; - - let mut max_acc = F64x8::splat(0.0); - for i in 0..chunks { - let va = F64x8::from_slice(&a[i * 8..]); - let vb = F64x8::from_slice(&b[i * 8..]); - let diff = (va - vb).abs(); // |a - b| lane-wise - max_acc = max_acc.simd_max(diff); // running lane-wise max - } - let mut max_val = max_acc.reduce_max(); - - // Scalar remainder - let offset = chunks * 8; - for i in 0..remainder { - let d = (a[offset + i] - b[offset + i]).abs(); - if d > max_val { - max_val = d; - } - } - max_val -} - -#[cfg(test)] -mod l1_l2_linf_tests { - use super::*; - - #[test] - fn l1_zero_for_equal_inputs() { - let a = vec![1.0f64; 8]; - let b = vec![1.0f64; 8]; - let result = l1_f64_simd(&a, &b); - assert_eq!(result, 0.0, "L1 of identical vectors must be 0.0, got {}", result); - } - - #[test] - fn l2_matches_scalar_reference() { - let a: Vec = (0..100).map(|i| (i as f64 * 0.1).sin()).collect(); - let b: Vec = (0..100).map(|i| (i as f64 * 0.1).cos()).collect(); - - let simd_l2 = l2_f64_simd(&a, &b); - - // Scalar reference - let scalar_sum: f64 = a.iter().zip(&b).map(|(x, y)| (x - y) * (x - y)).sum(); - let scalar_l2 = scalar_sum.sqrt(); - - assert!( - (simd_l2 - scalar_l2).abs() < 100.0 * f64::EPSILON, - "SIMD L2 {:.15} vs scalar L2 {:.15}, diff = {:.3e}", - simd_l2, - scalar_l2, - (simd_l2 - scalar_l2).abs() - ); - } - - #[test] - fn linf_picks_the_largest_gap() { - let a = vec![0.0f64, 0.0, 5.0, 0.0]; - let b = vec![0.0f64, 0.0, 0.0, 0.0]; - let result = linf_f64_simd(&a, &b); - assert!((result - 5.0).abs() < f64::EPSILON, "L∞ should be 5.0, got {}", result); - } -} - #[cfg(test)] mod tests { use super::*;