From 8afa8acd43bead4e27000f1eaebb29b8dd386761 Mon Sep 17 00:00:00 2001 From: Jascha Date: Thu, 14 May 2026 22:30:00 -0700 Subject: [PATCH] Add Zensical documentation site Mirrors the structure of the AgentPin docs: index, getting-started, pin-protocol, CLI guide, adapters, detectors, deployment, security, troubleshooting. Wired up via zensical.toml at the repo root. The existing docs/spec.md remains the normative protocol reference (the new pin-protocol.md is the readable user-facing walkthrough). Fixes vs. the working draft: - Repository URL casing aligned to github.com/ThirdKeyAI/VectorPin. - pin-protocol.md cross-language anchor link points at testvectors/v2.json rather than v1.json (matches current main). Site rendering is handled by the Zensical static site generator; this patch only adds the source content and config. --- docs/adapters.md | 257 +++++++++++++++++++++++++++++++ docs/cli-guide.md | 286 ++++++++++++++++++++++++++++++++++ docs/deployment.md | 244 +++++++++++++++++++++++++++++ docs/detectors.md | 162 +++++++++++++++++++ docs/getting-started.md | 333 ++++++++++++++++++++++++++++++++++++++++ docs/index.md | 79 ++++++++++ docs/pin-protocol.md | 238 ++++++++++++++++++++++++++++ docs/security.md | 203 ++++++++++++++++++++++++ docs/troubleshooting.md | 244 +++++++++++++++++++++++++++++ zensical.toml | 44 ++++++ 10 files changed, 2090 insertions(+) create mode 100644 docs/adapters.md create mode 100644 docs/cli-guide.md create mode 100644 docs/deployment.md create mode 100644 docs/detectors.md create mode 100644 docs/getting-started.md create mode 100644 docs/index.md create mode 100644 docs/pin-protocol.md create mode 100644 docs/security.md create mode 100644 docs/troubleshooting.md create mode 100644 zensical.toml diff --git a/docs/adapters.md b/docs/adapters.md new file mode 100644 index 0000000..8270e1e --- /dev/null +++ b/docs/adapters.md @@ -0,0 +1,257 @@ +# Vector Store Adapters + +VectorPin ships thin adapters for the major vector databases. 
Adapters do two things: + +1. **Walk records** — Iterate the collection yielding `(id, vector, metadata, pin)` tuples for verification. +2. **Attach pins** — Write a pin into the record's metadata in whichever shape the backend prefers. + +The adapter protocol lives at [`src/vectorpin/adapters/base.py`](https://github.com/ThirdKeyAI/VectorPin/blob/main/src/vectorpin/adapters/base.py) and is intentionally thin. Community contributions for new backends are welcome. + +--- + +## Status + +| Backend | Status | Install | Notes | +|---|---|---|---| +| LanceDB *(default)* | Alpha | `pip install 'vectorpin[default]'` | Embedded, file-based, no daemon. Recommended. | +| Chroma | Alpha | `pip install 'vectorpin[chroma]'` | Both persistent and HTTP modes. | +| Qdrant | Alpha | `pip install 'vectorpin[qdrant]'` | Server-side payload filtering. | +| Pinecone | Alpha | `pip install 'vectorpin[pinecone]'` | Hosted only. | +| pgvector | Planned | — | | +| FAISS | Planned | Use `LanceDBAdapter` (embedded, has metadata column natively). | | + +All adapters present the same `iter_records()` / `attach_pin()` interface. The backend differences are limited to where the pin physically lives in the underlying record. + +--- + +## Storage Convention + +By convention, pins are stored under the metadata key `vectorpin`. Specifically: + +| Backend | Pin lives at | +|---|---| +| LanceDB | A typed schema column literally named `vectorpin` (string-valued, holding the pin JSON). | +| Chroma | The `metadata` dict, under key `vectorpin`. | +| Qdrant | The `payload` dict, under key `vectorpin`. | +| Pinecone | The `metadata` dict, under key `vectorpin`. | + +Backends without free-form metadata fields are out of scope — provenance must travel with the data, not in a sidecar. + +--- + +## LanceDB (default) + +LanceDB is the recommended default: embedded, file-based, no daemon, with a typed schema column that holds the Pin natively. 
It matches the [Symbiont runtime's](https://github.com/ThirdKeyAI/Symbiont) default vector backend.
+ +--- + +## Chroma + +Chroma offers both an embedded persistent client and a remote HTTP client. The adapter supports both. + +### Persistent (embedded) + +```python +from vectorpin.adapters import ChromaAdapter + +adapter = ChromaAdapter.connect_persistent("./chroma_db", "my-rag") +``` + +### HTTP + +```python +adapter = ChromaAdapter.connect_http( + host="chroma.internal", + port=8000, + collection_name="my-rag", + ssl=False, +) +``` + +### Pinning + +```python +for record in adapter.iter_records(): + pin = signer.pin( + source=record.metadata["text"], + model="text-embedding-3-large", + vector=record.vector, + ) + adapter.attach_pin(record.id, pin) +``` + +The pin is stored as a JSON string under `metadata["vectorpin"]`. Chroma metadata is `dict[str, str | int | float | bool]`, so the pin survives the JSON-string round trip without loss. + +--- + +## Qdrant + +Qdrant supports both local and Qdrant Cloud deployments. Pins are written into the `payload` dict. + +```python +from vectorpin.adapters import QdrantAdapter + +adapter = QdrantAdapter.connect( + url="http://localhost:6333", + collection_name="my-rag", + api_key=None, # set for Qdrant Cloud +) + +for record in adapter.iter_records(batch_size=256): + pin = signer.pin( + source=record.metadata["text"], + model="text-embedding-3-large", + vector=record.vector, + ) + adapter.attach_pin(record.id, pin) +``` + +Qdrant's payload filtering means you can query for unpinned records server-side: + +```python +# Pseudo — exact API depends on qdrant-client version +unpinned = client.scroll( + collection_name="my-rag", + scroll_filter={"must_not": [{"key": "vectorpin", "match": {"any": ["*"]}}]}, +) +``` + +--- + +## Pinecone + +Pinecone is hosted-only. Pins are stored under `metadata["vectorpin"]` as a JSON string. 
+ +```python +from vectorpin.adapters import PineconeAdapter + +adapter = PineconeAdapter.connect( + api_key="...", + index_name="my-rag", +) + +for record in adapter.iter_records(): + pin = signer.pin( + source=record.metadata["text"], + model="text-embedding-3-large", + vector=record.vector, + ) + adapter.attach_pin(record.id, pin) +``` + +Pinecone metadata values are size-limited (40 KiB per record). VectorPin pins are well under 1 KiB at typical sizes, so you'll never hit the limit — but if you stuff large `extra` payloads in, double-check. + +--- + +## Choosing a Backend + +| If you... | Use | +|---|---| +| Just want pinning without standing up a server | **LanceDB** (default) | +| Already run Chroma | Chroma | +| Need server-side payload filtering | Qdrant | +| Are on Pinecone today | Pinecone | +| Run Symbiont | LanceDB (matches Symbiont's default backend) | + +LanceDB also gives you a typed `vectorpin` column, which is more grep-able than a JSON blob in a metadata dict — useful when reasoning about partial backfills. + +--- + +## Writing a New Adapter + +The adapter protocol is two methods plus a record dataclass. Sketch: + +```python +from dataclasses import dataclass +from typing import Iterator +import numpy as np +from vectorpin import Pin + +@dataclass +class PinnedRecord: + id: str + vector: np.ndarray + metadata: dict + pin: Pin | None + +class MyBackendAdapter: + @classmethod + def connect(cls, ...) -> "MyBackendAdapter": + ... + + def iter_records(self, batch_size: int = 256) -> Iterator[PinnedRecord]: + ... + + def attach_pin(self, record_id: str, pin: Pin) -> None: + ... +``` + +See [`src/vectorpin/adapters/base.py`](https://github.com/ThirdKeyAI/VectorPin/blob/main/src/vectorpin/adapters/base.py) for the canonical protocol and the existing adapters for working examples. 
+ +--- + +## See Also + +- [CLI Guide](cli-guide.md#audit-commands) — Command-line equivalents to programmatic auditing +- [Getting Started](getting-started.md) — End-to-end pinning + verification walkthrough +- [Pin Protocol](pin-protocol.md) — Wire format and verification order diff --git a/docs/cli-guide.md b/docs/cli-guide.md new file mode 100644 index 0000000..1badf9f --- /dev/null +++ b/docs/cli-guide.md @@ -0,0 +1,286 @@ +# VectorPin CLI Guide + +The VectorPin CLI (`vectorpin`) provides commands for key generation, single-pin demos, and bulk auditing of vector-store collections. It ships with the Python package. + +--- + +## Installation + +```bash +pip install vectorpin + +# Or, if you'll audit a specific backend, pull its driver too: +pip install 'vectorpin[default]' # LanceDB +pip install 'vectorpin[chroma]' +pip install 'vectorpin[qdrant]' +pip install 'vectorpin[pinecone]' +``` + +Confirm the install: + +```bash +vectorpin --help +``` + +--- + +## Commands + +| Command | Purpose | +|---|---| +| `vectorpin keygen` | Generate an Ed25519 signing key pair. | +| `vectorpin pin` | Sign a single `(text, vector)` pair (debug / demo). | +| `vectorpin verify-pin` | Verify a single Pin against ground-truth source / vector. | +| `vectorpin audit-lancedb` | Walk a LanceDB table and verify every pinned record. | +| `vectorpin audit-chroma` | Walk a Chroma collection and verify every pinned record. | +| `vectorpin audit-qdrant` | Walk a Qdrant collection and verify every pinned record. | + +--- + +### `vectorpin keygen` — Generate Key Pair + +Generate an Ed25519 key pair for signing pins. + +```bash +vectorpin keygen --key-id prod-2026-05 --output ./keys +``` + +**Options:** + +| Flag | Required | Default | Description | +|------|----------|---------|-------------| +| `--key-id` | Yes | — | Identifier for the new key (travels in the pin's `kid` field). | +| `--output` | No | `keys` | Output directory. Created if missing. 
| + +**Output files:** + +``` +keys/ +├── prod-2026-05.priv # Ed25519 private key, 32 bytes — KEEP SECRET +└── prod-2026-05.pub # Ed25519 public key, 32 bytes +``` + +Set restrictive permissions on the private key immediately: + +```bash +chmod 600 ./keys/prod-2026-05.priv +``` + +--- + +### `vectorpin pin` — Sign a Single Pair + +Sign one `(source, vector)` pair and print the pin JSON to stdout. Intended for demos and one-off integrations; production ingestion should use a [vector store adapter](adapters.md) so the pin is written next to the embedding atomically. + +```bash +vectorpin pin \ + --private-key ./keys/prod-2026-05.priv \ + --key-id prod-2026-05 \ + --model text-embedding-3-large \ + --source ./doc.txt \ + --vector ./embedding.npy +``` + +**Options:** + +| Flag | Required | Description | +|------|----------|-------------| +| `--private-key` | Yes | Path to `.priv` key file. | +| `--key-id` | Yes | Key identifier (written into the pin's `kid` field). | +| `--model` | Yes | Embedding model identifier (e.g., `text-embedding-3-large`). | +| `--source` | Yes | Path to UTF-8 source-text file. | +| `--vector` | Yes | Path to vector file: `.npy` (NumPy) or `.json` (array of floats). | + +**Output:** Compact pin JSON to stdout; pipe to a file or directly into your DB write path. + +--- + +### `vectorpin verify-pin` — Verify a Single Pin + +Verify one pin against ground-truth source and/or vector. Both ground-truth inputs are optional — omit them to check only the signature (i.e., that the pin was produced by the registered key). + +```bash +vectorpin verify-pin \ + --public-key ./keys/prod-2026-05.pub \ + --key-id prod-2026-05 \ + --pin ./pin.json \ + --source ./doc.txt \ + --vector ./embedding.npy +``` + +**Options:** + +| Flag | Required | Description | +|------|----------|-------------| +| `--public-key` | Yes | Path to `.pub` key file. | +| `--key-id` | Yes | Key identifier (must match the pin's `kid`). | +| `--pin` | Yes | Path to pin JSON. 
| +| `--source` | No | Path to source text. If set, source is verified against `source_hash`. | +| `--vector` | No | Path to vector. If set, vector is verified against `vec_hash`. | + +**Exit codes:** + +| Code | Meaning | +|---|---| +| `0` | `OK` — pin verified. | +| `2` | Failure. Reason printed to stderr in the form `FAIL [] `. | + +--- + +## Audit Commands + +Audit commands walk an entire collection, verify every pinned record, and print a JSON summary to stdout. They exit non-zero if any pinned record fails verification — so they compose cleanly into CI or cron. + +Common audit summary shape: + +```json +{ + "table": "rag-corpus", + "total": 12453, + "pinned": 12450, + "verified_ok": 12450, + "verification_failed": 0, + "unpinned": 3 +} +``` + +`unpinned` records are reported but do **not** by themselves fail the run — operators who want stricter behavior can `grep` `unpinned` from the JSON summary in CI. + +### `vectorpin audit-lancedb` + +Audit a LanceDB table. The recommended default backend. + +```bash +vectorpin audit-lancedb \ + --uri ./data/vector_db \ + --table rag-corpus \ + --public-key ./keys/prod-2026-05.pub \ + --key-id prod-2026-05 \ + --source-column text +``` + +**Options:** + +| Flag | Required | Default | Description | +|------|----------|---------|-------------| +| `--uri` | Yes | — | LanceDB URI: a directory, `s3://`, `gs://`, or LanceDB Cloud connection string. | +| `--table` | Yes | — | Table name. | +| `--public-key` | Yes | — | Path to `.pub` key. | +| `--key-id` | Yes | — | Key identifier. | +| `--id-column` | No | `id` | Column holding the record id. | +| `--vector-column` | No | `vector` | Column holding the embedding. | +| `--source-column` | No | — | Optional column holding the source text; if set, source is verified too. For Symbiont's default schema, use `--source-column content`. | +| `--batch-size` | No | `256` | Records per batch. | + +### `vectorpin audit-chroma` + +Audit a Chroma collection (persistent or HTTP). 
+ +```bash +# Persistent (file-based) +vectorpin audit-chroma \ + --path ./chroma_db \ + --collection my-rag \ + --public-key ./keys/prod-2026-05.pub \ + --key-id prod-2026-05 \ + --source-metadata-key text + +# HTTP server +vectorpin audit-chroma \ + --host chroma.internal \ + --port 8000 \ + --collection my-rag \ + --public-key ./keys/prod-2026-05.pub \ + --key-id prod-2026-05 +``` + +**Options:** + +| Flag | Required | Default | Description | +|------|----------|---------|-------------| +| `--collection` | Yes | — | Collection name. | +| `--public-key` | Yes | — | Path to `.pub` key. | +| `--key-id` | Yes | — | Key identifier. | +| `--path` | One of | — | Path for a `PersistentClient`. | +| `--host` | One of | — | Host for an `HttpClient`. (Either `--path` or `--host` is required.) | +| `--port` | No | `8000` | HTTP port. | +| `--ssl` | No | off | Enable TLS for HTTP client. | +| `--source-metadata-key` | No | — | Optional metadata key holding the source text. | +| `--batch-size` | No | `256` | Records per batch. | + +### `vectorpin audit-qdrant` + +Audit a Qdrant collection. + +```bash +vectorpin audit-qdrant \ + --url http://localhost:6333 \ + --collection my-rag \ + --public-key ./keys/prod-2026-05.pub \ + --key-id prod-2026-05 +``` + +**Options:** + +| Flag | Required | Default | Description | +|------|----------|---------|-------------| +| `--url` | Yes | — | Qdrant URL, e.g. `http://localhost:6333`. | +| `--collection` | Yes | — | Collection name. | +| `--public-key` | Yes | — | Path to `.pub` key. | +| `--key-id` | Yes | — | Key identifier. | +| `--api-key` | No | — | API key for Qdrant Cloud. | +| `--source-payload-key` | No | — | Optional payload key holding source text; if set, source is verified too. | +| `--batch-size` | No | `256` | Records per batch. 
| + +--- + +## CI Recipes + +### Drop-in cron audit + +```bash +#!/usr/bin/env bash +set -euo pipefail + +vectorpin audit-lancedb \ + --uri "$VECTOR_DB_URI" \ + --table "$VECTOR_TABLE" \ + --public-key "$VECTORPIN_PUB" \ + --key-id "$VECTORPIN_KID" \ + --source-column text \ + > /var/log/vectorpin/$(date +%Y-%m-%dT%H:%M).json +``` + +Non-zero exit triggers your alerting; the JSON summary is the audit log. + +### GitHub Actions + +```yaml +- name: VectorPin audit + run: | + pip install 'vectorpin[default]' + vectorpin audit-lancedb \ + --uri "${{ secrets.VECTOR_DB_URI }}" \ + --table rag-corpus \ + --public-key ./keys/prod-2026-05.pub \ + --key-id prod-2026-05 \ + --source-column text +``` + +### Treat unpinned records as failures + +The audit commands tolerate unpinned records by design (migrations, partial backfills). To fail closed: + +```bash +summary=$(vectorpin audit-lancedb --uri ./db --table t --public-key ./k.pub --key-id k) +echo "$summary" +echo "$summary" | python -c "import json,sys; s=json.load(sys.stdin); sys.exit(1 if s['unpinned'] else 0)" +``` + +--- + +## See Also + +- [Getting Started](getting-started.md) — Library usage in Python, Rust, and TypeScript +- [Vector Store Adapters](adapters.md) — Programmatic equivalents to the `audit-*` commands +- [Deployment](deployment.md) — Where to keep keys, how to rotate them diff --git a/docs/deployment.md b/docs/deployment.md new file mode 100644 index 0000000..142adc6 --- /dev/null +++ b/docs/deployment.md @@ -0,0 +1,244 @@ +# Deployment Guide + +This guide covers how to deploy VectorPin in production: key custody, rotation, CI integration, and the operational patterns that distinguish a real deployment from a demo. + +--- + +## Architecture Overview + +``` +Ingestion pipeline Vector store Audit / verify +────────────────── ──────────── ────────────── + +text + model ─┐ ┌─ verifier + │ │ (with key + signer ───┼──> (vector, pin) ──> LanceDB / Chroma / ... 
─┤ registry) + (private) │ │ + │ └─ result + KMS / file ┘ (per record) +``` + +The pin is produced once at ingestion (where the private key lives) and verified continuously at read or audit time (where only the public key lives). This asymmetry is the whole point — verifiers need no secret material. + +--- + +## Key Custody + +VectorPin private keys are the only things an attacker needs in order to forge valid pins. Treat them accordingly. + +### Filesystem keys (development / small deployments) + +The CLI writes raw Ed25519 bytes: + +```bash +vectorpin keygen --key-id prod-2026-05 --output ./keys +chmod 600 ./keys/prod-2026-05.priv +``` + +| Property | Recommendation | +|---|---| +| File permissions | `0600`, owner = signing process user | +| Storage | Encrypted disk; never plaintext in version control | +| Backup | Encrypted, offline, in escrow at your org's secrets-of-record location | +| Distribution | Out of band — never email / chat / paste | + +### KMS / HSM (recommended for production) + +Production deployments SHOULD use a KMS or hardware-backed signer rather than file-system keys. The signing API can be wrapped — see [`Signer.from_pem`](https://github.com/ThirdKeyAI/VectorPin/blob/main/src/vectorpin/signer.py) for the surface to integrate against. + +Typical wrappers: + +- AWS KMS asymmetric keys (Ed25519 supported as of 2024). +- GCP Cloud KMS asymmetric keys. +- Azure Key Vault managed keys. +- YubiHSM / Nitrokey HSM for on-prem. + +Whatever you use, the only material that should ever leave the boundary is the public key bytes and the 64-byte signature on a specific `signed_bytes`. + +### Per-environment separation + +Use separate signing keys for separate environments. A staging key compromise must not invalidate production pins. 
+ +``` +prod-2026-05 # production +staging-2026-05 # staging +dev-2026-05 # local dev +``` + +Verifier registries SHOULD include only the keys appropriate to that environment — a production verifier should not honor staging-signed pins. + +### Per-tenant separation + +Multi-tenant deployments SHOULD issue separate `kid`s per tenant rather than share a single producer key. This way, compromise of one tenant's environment cannot forge pins for another tenant. + +--- + +## Verifier Key Registries + +A verifier holds a mapping from `kid` to `(public_key, valid_from, valid_until)`. How that registry is populated is out of scope of the protocol, but the following SHOULD hold: + +- **Fingerprint format**: Operators identifying a key out of band SHOULD use `SHA-256(pubkey_bytes)` truncated to the first 16 hex digits, formatted as four colon-separated quads — e.g. `1f3a:7b22:9e0d:c4f1`. +- **Transparency log**: Production registries SHOULD reference a transparency-log entry (e.g., sigstore Rekor) for each `kid` registration and revocation. This lets downstream verifiers detect a malicious registry rollback. +- **TOFU is not recommended**: A verifier that auto-registers any `kid` it encounters provides no integrity guarantee — it's a checksum, not a signature. + +### Python registry shape + +```python +from vectorpin import Verifier, KeyEntry +from datetime import datetime, timezone + +verifier = Verifier({ + "prod-2026-05": KeyEntry( + public_key=load_public_bytes("./keys/prod-2026-05.pub"), + valid_from=datetime(2026, 5, 1, tzinfo=timezone.utc), + valid_until=None, # still current + ), + "prod-2025-11": KeyEntry( + public_key=load_public_bytes("./keys/prod-2025-11.pub"), + valid_from=datetime(2025, 11, 1, tzinfo=timezone.utc), + valid_until=datetime(2026, 5, 1, tzinfo=timezone.utc), # rotated out + ), +}) +``` + +Pins signed by `prod-2025-11` continue to verify against the registry as long as their `ts` falls in `[2025-11-01, 2026-05-01)`. 
+ +--- + +## Key Rotation + +Rotate regularly, even without a known compromise. Suggested cadence: + +| Key | Rotation period | +|---|---| +| Production signing keys | Every 6–12 months | +| Staging / dev keys | Every 3 months | +| Keys after suspected compromise | Immediately (see [Revocation](#revocation)) | + +### Rotation procedure + +1. Generate a new keypair with a fresh `kid` (e.g., `prod-2026-11`). +2. Add the new public key to all verifier registries with `valid_from` no earlier than when the new private key becomes operational. Leave the old key in place. +3. Switch production signing to the new private key. +4. Set `valid_until` on the old key entry to the cutover instant. **Do not delete the old entry** — historical pins must continue to verify against it. +5. Optionally re-pin the corpus over time with the new key. This is not required — old pins remain valid forever within their key's window. + +### Revocation + +If a private key is compromised, set `valid_until` on the `kid` entry to the latest moment the key is believed to have been uncompromised. Pins with `ts` after that instant return `KEY_EXPIRED`; pins with `ts` before it continue to verify. + +The protocol does not specify a revocation file format in v2 — this is intentional, so deployments can integrate with existing PKI / sigstore infrastructure. The minimum requirement on a verifier is to honor the `(valid_from, valid_until)` window, however it is delivered. + +Operators SHOULD pair revocation with a transparency-log entry for the revocation event itself, so that downstream verifiers can detect a malicious registry rollback. + +--- + +## CI Integration + +The audit commands are designed to drop into CI unchanged. They print a JSON summary on stdout and exit non-zero on any verification failure. 
+ +### Cron audit + +```bash +#!/usr/bin/env bash +set -euo pipefail + +vectorpin audit-lancedb \ + --uri "$VECTOR_DB_URI" \ + --table "$VECTOR_TABLE" \ + --public-key /etc/vectorpin/prod-2026-05.pub \ + --key-id prod-2026-05 \ + --source-column text \ + > /var/log/vectorpin/$(date -u +%Y-%m-%dT%H:%M:%SZ).json +``` + +A `cron` entry every 15 minutes plus alerting on non-zero exit gives you per-quarter-hour coverage of the corpus with no further integration work. + +### GitHub Actions + +```yaml +name: VectorPin audit +on: + schedule: + - cron: "0 * * * *" + workflow_dispatch: + +jobs: + audit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - run: pip install 'vectorpin[default]' + - run: | + vectorpin audit-lancedb \ + --uri "${{ secrets.VECTOR_DB_URI }}" \ + --table rag-corpus \ + --public-key ./keys/prod-2026-05.pub \ + --key-id prod-2026-05 \ + --source-column text +``` + +### Treating unpinned records as failures + +By default, `unpinned` records do not fail the run (they're useful during partial backfills). To fail closed: + +```bash +summary=$(vectorpin audit-lancedb --uri ... --table ... --public-key ... --key-id ...) +echo "$summary" +echo "$summary" | python -c "import json,sys; sys.exit(1 if json.load(sys.stdin)['unpinned'] else 0)" +``` + +--- + +## Inline vs Batch Verification + +| Pattern | Pin at | Verify at | Pro | Con | +|---|---|---|---|---| +| **Inline verify** | Ingestion | Every read | Strongest guarantee — no tampered vector ever reaches a model. | Adds ~50–100 µs of CPU per retrieval. | +| **Batch audit** | Ingestion | CI / cron | Zero read-path overhead. | Tampering only detected at next audit. | +| **Hybrid** | Ingestion | Sample reads + CI | Cheap probabilistic coverage. | Tuning required. | + +Inline verify on every retrieval is the right default for security-critical workloads (medical RAG, agent runtimes like Symbiont). 
Batch audit alone is acceptable for lower-stakes RAG when audit cadence is tight (every few minutes). Hybrid suits cost-sensitive deployments that want some inline protection without paying for it on every read. + +Sub-millisecond verification latency means inline is rarely the bottleneck. + +--- + +## Performance Budget + +Indicative numbers on a modern x86_64 laptop, 3072-dim vectors (`text-embedding-3-large`): + +| Operation | Rust (µs) | Python (µs) | +|---|---|---| +| `hash_vector` | 6.4 | 5.8 | +| `sign` (pin) | 35 | 35 | +| `verify_full` | 42 | 79 | +| `verify_signature_only` | 22 | 75 | + +Re-run on your own hardware before quoting numbers — see [`scripts/bench_python.py`](https://github.com/ThirdKeyAI/VectorPin/blob/main/scripts/bench_python.py) and [`rust/vectorpin/benches/perf.rs`](https://github.com/ThirdKeyAI/VectorPin/blob/main/rust/vectorpin/benches/perf.rs). + +For a 1 M record corpus, a full Python audit completes in roughly 80 seconds of pure VectorPin work (verification only; iterating the backend is separate). + +--- + +## Symbiont Integration + +For [Symbiont](https://github.com/ThirdKeyAI/Symbiont) deployments: + +- Symbiont's default vector backend is LanceDB; use `LanceDBAdapter` directly. +- Symbiont's source text is in the `content` column. Symbiont's column literally named `source` is upstream provenance like a URL, **not** VectorPin's `source` argument. +- Pass `source=record.metadata["content"]` when calling `signer.pin`. +- See [`tests/test_adapter_lancedb_symbiont.py`](https://github.com/ThirdKeyAI/VectorPin/blob/main/tests/test_adapter_lancedb_symbiont.py) for an end-to-end example against the Symbiont schema. + +The Symbiont runtime consumes VectorPin attestations to enforce policy along the lines of "agents may only retrieve from verified vector stores." 
+ +--- + +## See Also + +- [Security](security.md) — Threat model and broader best practices +- [CLI Guide](cli-guide.md) — `vectorpin audit-*` reference +- [Pin Protocol](pin-protocol.md#key-rotation-and-revocation) — Protocol-level rotation semantics diff --git a/docs/detectors.md b/docs/detectors.md new file mode 100644 index 0000000..acab0ac --- /dev/null +++ b/docs/detectors.md @@ -0,0 +1,162 @@ +# Statistical Detectors + +Cryptographic pinning catches **modifications** to vectors after they are produced. Statistical detectors catch a different class of attack: **ingestion-time poisoning**, where a compromised pipeline writes legitimately-signed-but-malicious vectors into the store. + +The two are complementary. Pinning is the durable layer; statistical detection is defense-in-depth. + +--- + +## What Pinning Doesn't Catch + +A pin attests: + +> *"This vector is what the model produced from this source text."* + +It does **not** attest: + +> *"This vector came from a benign source text."* + +If an attacker compromises the ingestion pipeline (or the upstream document corpus) and feeds in: + +- Vectors derived from sensitive content the attacker controls +- Vectors crafted to skew RAG retrieval toward attacker-chosen passages +- Vectors that encode exfiltration payloads in the embedding's noise floor before pinning + +…then every pin is technically valid. The signature is correct, the hash matches, the vector is intact since pinning. The integrity layer cannot tell that the vector is *suspicious*. + +Statistical detectors look at the distribution of vectors over the store and flag anomalies that have no business being there. 
+ +--- + +## Quick Start + +Requires the `detectors` extra: + +```bash +pip install 'vectorpin[detectors]' +``` + +```python +from vectorpin.detectors.isolation_forest import IsolationForestDetector + +# Fit on a corpus you trust (e.g., before exposing the ingestion pipeline) +detector = IsolationForestDetector().fit(clean_embeddings) + +# Score new arrivals +flagged = detector.decide(suspect_embeddings) +# `flagged` is a boolean array — True means "anomalous, hold for review" +``` + +In the VectorSmuggle empirical study, this single line of defense flagged every operating point of every distribution-shifting steganographic technique that hides a non-trivial amount of data. + +--- + +## What Detectors Catch + +| Attack | Pinning catches | Detectors catch | +|---|---|---| +| Modify a vector in the DB | ✅ (`VECTOR_TAMPERED`) | maybe | +| Re-sign with a stolen key | ❌ | maybe | +| Inject new vectors derived from exfil text | ❌ | ✅ | +| Inject vectors with steganographic noise (high payload) | ❌ | ✅ | +| Inject vectors with steganographic noise (low payload) | ❌ | maybe | +| Orthogonal rotation of all vectors | ✅ (`VECTOR_TAMPERED`) | ❌ | +| Replace source documents before embedding | ❌ | maybe | + +Detectors and pins overlap on the "modify a vector in the DB" attack — pins always catch it cleanly. But for the four attacks where pinning has a blind spot, detectors fill in some of the gap. + +**Detectors are brittle against an adaptive attacker** who knows the detector and can evade it (e.g., projecting their payload onto the in-distribution subspace). Use them as defense-in-depth, not as a primary control. + +--- + +## Available Detectors + +### IsolationForestDetector + +The default. Wraps `sklearn.ensemble.IsolationForest` with a VectorPin-shaped API. 
+ +```python +from vectorpin.detectors.isolation_forest import IsolationForestDetector + +detector = IsolationForestDetector( + n_estimators=100, + contamination=0.05, # expected anomaly rate during fit +) +detector.fit(clean_embeddings) + +# Bool decision (True = anomaly) +flagged = detector.decide(new_embeddings) + +# Continuous anomaly score (higher = more anomalous) +scores = detector.score(new_embeddings) +``` + +**Strengths:** + +- Cheap to fit and score. +- Catches distribution shift in noise injection, scaling, offset perturbation, cross-model fragmentation. + +**Weaknesses:** + +- Cannot catch orthogonal rotation (preserves every density feature the detector fits on). +- Cannot catch payload that is projected onto the in-distribution subspace. +- Performance depends on the fit corpus being representative of legitimate traffic. + +--- + +## Operational Patterns + +### Ingestion-time gate + +The strictest pattern: fit a detector once on a trusted corpus, and reject any new embedding it flags before pinning. This means rejected payloads never get a valid pin. + +```python +from vectorpin import Signer +from vectorpin.detectors.isolation_forest import IsolationForestDetector + +signer = Signer.generate(key_id="prod-2026-05") +detector = IsolationForestDetector().fit(trusted_corpus_embeddings) + +def ingest(source: str, vector): + if detector.decide([vector])[0]: + raise IngestionRejected(f"vector flagged by detector") + return signer.pin(source=source, model="text-embedding-3-large", vector=vector) +``` + +### Periodic re-scoring + +Re-run the detector across the live corpus on a cadence (nightly, weekly) and surface newly-flagged records for review. Use this when ingestion is high-volume and the detector is too expensive to run inline. + +```python +flagged_ids = [] +for record in adapter.iter_records(): + if detector.decide([record.vector])[0]: + flagged_ids.append(record.id) +``` + +### Combined with pin audit + +The natural CI shape is: + +1. 
Run `vectorpin audit-*` — fails if any pin is broken. +2. Re-score the corpus with a detector fitted on a held-out trusted subset — fails if the anomaly rate jumps. + +Step 1 catches modifications; step 2 catches poisoning campaigns. + +--- + +## What Detectors Don't Promise + +- **No formal guarantee.** A false-negative is a single rounding decision; an adaptive attacker can target it. +- **No model-specific tuning out of the box.** The default contamination parameter is a starting point — tune against your model and corpus. +- **No multi-tenant separation.** If you fit one detector on a mixed-tenant corpus, you'll see tenant-shift false positives. Fit per-tenant. + +For the empirical study underlying the design choices here, see the [VectorSmuggle preprint](https://doi.org/10.5281/zenodo.20058256). + +--- + +## See Also + +- [Pin Protocol](pin-protocol.md) — What pinning does guarantee +- [Security](security.md) — Threat model +- [VectorSmuggle](https://github.com/jaschadub/VectorSmuggle) — Companion threat-research project diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 0000000..7400f68 --- /dev/null +++ b/docs/getting-started.md @@ -0,0 +1,333 @@ +# Getting Started with VectorPin + +This guide walks you through installing VectorPin, generating a signing key, pinning your first embedding, and verifying it — in Python, Rust, and TypeScript. + +--- + +## Installation + +### Python + +Requires Python >= 3.11. 
+ +```bash +# Core library only (no vector DB driver) +pip install vectorpin + +# With the default LanceDB adapter (recommended) +pip install 'vectorpin[default]' + +# Other adapters +pip install 'vectorpin[chroma]' +pip install 'vectorpin[qdrant]' +pip install 'vectorpin[pinecone]' + +# Statistical detectors +pip install 'vectorpin[detectors]' + +# Everything +pip install 'vectorpin[all]' +``` + +### Rust + +Add `vectorpin` to your `Cargo.toml`: + +```toml +[dependencies] +vectorpin = "0.1" +``` + +Build and test: + +```bash +cd rust && cargo build && cargo test +``` + +### TypeScript / JavaScript + +Requires Node.js >= 20. Pure JavaScript dependencies (`@noble/ed25519`, `@noble/hashes`) — also runs on Deno, Bun, and edge runtimes. + +```bash +npm install vectorpin +``` + +### CLI + +The CLI ships with the Python package: + +```bash +pip install vectorpin +vectorpin --help +``` + +--- + +## Step 1: Generate a Signing Key + +VectorPin uses **Ed25519** exclusively. Generate a keypair in any language: + +### Python + +```python +from vectorpin import Signer + +signer = Signer.generate(key_id="prod-2026-05") +private_bytes = signer.private_key_bytes() # KEEP SECRET +public_bytes = signer.public_key_bytes() +``` + +### Rust + +```rust +use vectorpin::Signer; + +let signer = Signer::generate("prod-2026-05".to_string()); +let private_bytes = signer.private_key_bytes(); // KEEP SECRET +let public_bytes = signer.public_key_bytes(); +``` + +### TypeScript + +```ts +import { Signer } from 'vectorpin'; + +const signer = Signer.generate('prod-2026-05'); +const privateBytes = signer.privateKeyBytes(); // KEEP SECRET +const publicBytes = signer.publicKeyBytes(); +``` + +### CLI + +```bash +vectorpin keygen --key-id prod-2026-05 --output ./keys +``` + +This writes two files: + +- `keys/prod-2026-05.priv` — Ed25519 private key (32 bytes, `0600` permissions recommended) +- `keys/prod-2026-05.pub` — Ed25519 public key (32 bytes) + +The `kid` is the only label that travels with the 
pin — pick something stable and rotatable (see the [naming convention](security.md#key-id-naming-convention)). + +--- + +## Step 2: Pin an Embedding + +A Pin is a compact JSON attestation that commits to the source text, the producing model, the vector itself, and the timestamp. It is Ed25519-signed and travels alongside the embedding in your vector DB. + +### Python + +```python +import numpy as np +from vectorpin import Signer + +signer = Signer.generate(key_id="prod-2026-05") +embedding = my_model.embed("The quick brown fox.") # np.ndarray[f32] + +pin = signer.pin( + source="The quick brown fox.", + model="text-embedding-3-large", + vector=embedding, +) + +print(pin.to_json()) +# Store this string in your vector DB metadata, keyed as "vectorpin". +``` + +### Rust + +```rust +use vectorpin::Signer; + +let signer = Signer::generate("prod-2026-05".to_string()); +let embedding: Vec<f32> = my_model_embed("The quick brown fox."); + +let pin = signer.pin( + "The quick brown fox.", + "text-embedding-3-large", + embedding.as_slice(), +)?; + +let json = pin.to_json()?; +``` + +### TypeScript + +```ts +import { Signer } from 'vectorpin'; + +const signer = Signer.generate('prod-2026-05'); +const embedding = new Float32Array(/* ... 3072 floats ... */); + +const pin = signer.pin({ + source: 'The quick brown fox.', + model: 'text-embedding-3-large', + vector: embedding, +}); + +const json = pin.toJSON(); +``` + +### CLI + +```bash +vectorpin pin \ + --private-key ./keys/prod-2026-05.priv \ + --key-id prod-2026-05 \ + --model text-embedding-3-large \ + --source ./doc.txt \ + --vector ./embedding.npy +``` + +The pin JSON is printed to stdout. Pipe to a file or directly into your DB write path. 
+ +### Pin Structure + +```json +{ + "v": 2, + "kid": "prod-2026-05", + "model": "text-embedding-3-large", + "source_hash": "sha256:9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08", + "vec_hash": "sha256:0123abcd...", + "vec_dtype": "f32", + "vec_dim": 3072, + "ts": "2026-05-15T12:00:00Z", + "sig": "MEUCIQD..." +} +``` + +See [Pin Protocol](pin-protocol.md) for the full wire-format specification. + +--- + +## Step 3: Verify a Pin + +Verification distinguishes failure modes so callers can route them differently — a `VECTOR_TAMPERED` is a security incident; an `UNKNOWN_KEY` is usually a misconfiguration. + +### Python + +```python +from vectorpin import Pin, Verifier + +verifier = Verifier({"prod-2026-05": signer.public_key_bytes()}) + +result = verifier.verify( + pin, + source="The quick brown fox.", # ground-truth source + vector=embedding, # ground-truth vector +) + +if result.ok: + print("Pin verified.") +else: + print(f"FAIL [{result.error.value}] {result.detail}") +``` + +### Rust + +```rust +use vectorpin::Verifier; + +let mut verifier = Verifier::new(); +verifier.add_key(signer.key_id(), signer.public_key_bytes()); + +let result = verifier.verify_full::<&[f32]>( + &pin, + Some("The quick brown fox."), + Some(embedding.as_slice()), + None, +); +assert!(result.is_ok()); +``` + +### TypeScript + +```ts +import { Verifier } from 'vectorpin'; + +const verifier = new Verifier({ [signer.keyId]: signer.publicKeyBytes() }); + +const result = verifier.verify(pin, { + source: 'The quick brown fox.', + vector: embedding, +}); + +if (!result.ok) throw new Error(`integrity failure: ${result.error}`); +``` + +### CLI + +```bash +vectorpin verify-pin \ + --public-key ./keys/prod-2026-05.pub \ + --key-id prod-2026-05 \ + --pin ./pin.json \ + --source ./doc.txt \ + --vector ./embedding.npy +``` + +Exit code 0 on success, 2 on failure. The failure reason is printed to stderr. 
+ +### Verification Outcomes + +| Outcome | Meaning | +|---|---| +| `OK` | Signature valid, vector intact, source matches. | +| `SIGNATURE_INVALID` | Pin was forged or re-signed by an attacker. | +| `VECTOR_TAMPERED` | Embedding modified after pinning. **This is the steganography kill shot.** | +| `SOURCE_MISMATCH` | Source text differs from what was pinned. | +| `MODEL_MISMATCH` | Pin was produced by a different embedding model than expected. | +| `UNKNOWN_KEY` | Pin signed by a key not in the verifier's registry. | +| `KEY_EXPIRED` | `ts` falls outside the registered key's `(valid_from, valid_until)` window. | +| `SHAPE_MISMATCH` | Supplied vector dimensionality does not match `vec_dim`. | +| `PARSE_ERROR` | Pin JSON is malformed, oversized, or contains unknown top-level fields. | +| `RECORD_MISMATCH` / `COLLECTION_MISMATCH` / `TENANT_MISMATCH` | Replay-protection mismatch (see [Pin Protocol §8](pin-protocol.md#replay-protection)). | + +--- + +## Step 4: Pin a Whole Corpus + +Most users don't pin one vector at a time — they pin during ingestion and verify during audit. Use a [vector store adapter](adapters.md) to do this without writing schema code: + +```python +from vectorpin import Signer, Verifier +from vectorpin.adapters import LanceDBAdapter + +adapter = LanceDBAdapter.connect("./data/vector_db", "rag-corpus") +signer = Signer.generate(key_id="prod-2026-05") + +# Pin during ingestion +for record in adapter.iter_records(): + pin = signer.pin( + source=record.metadata["text"], # column holding source text + model="text-embedding-3-large", + vector=record.vector, + ) + adapter.attach_pin(record.id, pin) +``` + +Then audit from the command line as often as you like: + +```bash +vectorpin audit-lancedb \ + --uri ./data/vector_db \ + --table rag-corpus \ + --public-key ./keys/prod-2026-05.pub \ + --key-id prod-2026-05 \ + --source-column text +``` + +JSON summary on stdout, non-zero exit on any failure — drops into CI or cron unchanged. 
See [CLI Guide](cli-guide.md#audit-commands) for the audit commands across all backends. + +--- + +## Next Steps + +- [Pin Protocol](pin-protocol.md) — Wire format and verification order +- [CLI Guide](cli-guide.md) — Full CLI reference +- [Vector Store Adapters](adapters.md) — LanceDB, Chroma, Qdrant, Pinecone +- [Statistical Detectors](detectors.md) — Defense-in-depth against ingestion-time poisoning +- [Deployment](deployment.md) — Key custody, rotation, CI integration +- [Security](security.md) — Threat model and best practices diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..7850d60 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,79 @@ +# VectorPin + +**Verifiable integrity for AI embedding stores.** + +VectorPin is the provenance layer of the [ThirdKey](https://thirdkey.ai) trust stack: [SchemaPin](https://schemapin.org) (tool integrity) → [AgentPin](https://agentpin.org) (agent identity) → **VectorPin** (vector store integrity) → [Symbiont](https://symbiont.dev) (runtime). + +--- + +## What VectorPin Does + +Vector databases sit underneath every modern RAG system, but most are written and read with zero integrity checking. VectorPin binds each embedding to its source content and the model that produced it, then verifies that nothing has changed — including covert steganographic modifications invisible to traditional DLP. + +- **Pinning** — Sign a compact attestation that commits to the source text (SHA-256 of NFC-normalized UTF-8), the model identifier, the vector itself (SHA-256 of canonical little-endian bytes), the producer (Ed25519 key), and an RFC 3339 timestamp. +- **Verification** — Reject any embedding whose hash, source, model, signature, or `kid` does not match. Distinguish `VECTOR_TAMPERED` from `SOURCE_MISMATCH` from `UNKNOWN_KEY` so callers can route them. +- **Auditing** — Walk a whole LanceDB / Chroma / Qdrant / Pinecone collection and report on every record. 
JSON summary on stdout, non-zero exit on any failure — drops into CI or cron unchanged. +- **Key rotation** — Verifier registries hold multiple `kid → public_key` mappings, each with a `(valid_from, valid_until)` window. Old pins keep verifying; compromised keys produce `KEY_EXPIRED` for anything signed after the compromise instant. +- **Cross-language** — Python, Rust, and TypeScript implementations are byte-for-byte compatible, locked by shared test vectors in CI. + +## Quick Example + +```python +import numpy as np +from vectorpin import Signer, Verifier + +# At ingestion time +signer = Signer.generate(key_id="prod-2026-05") +embedding = my_model.embed("The quick brown fox.") +pin = signer.pin( + source="The quick brown fox.", + model="text-embedding-3-large", + vector=embedding, +) +# Store pin.to_json() alongside the embedding in your vector DB metadata. + +# At read/audit time +verifier = Verifier({"prod-2026-05": signer.public_key_bytes()}) +result = verifier.verify(pin, source="The quick brown fox.", vector=embedding) +if not result.ok: + print(f"INTEGRITY FAILURE: {result.error.value} — {result.detail}") +``` + +## Implementations + +| Language | Package | Install | +|----------|---------|---------| +| **Python** | `vectorpin` | `pip install vectorpin` | +| **Rust** | `vectorpin` | `cargo add vectorpin` | +| **TypeScript** | `vectorpin` | `npm install vectorpin` | + +All three are byte-for-byte compatible — a pin produced by any implementation verifies on the other two. The TS port is pure JavaScript via `@noble/ed25519` and `@noble/hashes`, so it also runs in Deno, Bun, and edge runtimes. + +## Why this matters + +Modern RAG systems convert sensitive content into high-dimensional vectors and store them in databases that don't inspect what gets written, don't verify integrity on read, and treat embeddings as opaque numerical artifacts. That's a giant attack surface. 
+ +The companion [VectorSmuggle](https://github.com/jaschadub/VectorSmuggle) research project demonstrates that an attacker with write access to a vector pipeline can hide arbitrary data inside embeddings using noise injection, rotation, scaling, offset perturbations, cross-model fragmentation, and steganographic encoding that survives quantization. + +Cryptographic pinning is the kill shot. Every steganographic technique requires modifying the vector after the model produces it. If each vector ships with a signed attestation binding it to source text and producing model, any modification breaks the signature. + +## Documentation + +| Guide | Description | +|-------|-------------| +| [Getting Started](getting-started.md) | Install, generate keys, pin and verify embeddings | +| [Pin Protocol](pin-protocol.md) | Wire format, canonicalization, and verification order | +| [CLI Guide](cli-guide.md) | `vectorpin keygen`, `pin`, `verify-pin`, and `audit-*` commands | +| [Vector Store Adapters](adapters.md) | LanceDB, Chroma, Qdrant, Pinecone integrations | +| [Statistical Detectors](detectors.md) | Defense-in-depth against ingestion-time poisoning | +| [Deployment](deployment.md) | Key custody, rotation, and CI integration | +| [Security](security.md) | Threat model and best practices | +| [Troubleshooting](troubleshooting.md) | Common errors and solutions | +| [Specification](spec.md) | Protocol v2 wire-format specification | + +## Links + +- [GitHub](https://github.com/ThirdKeyAI/VectorPin) +- [VectorSmuggle preprint](https://doi.org/10.5281/zenodo.20058256) +- [Symbiont runtime](https://github.com/ThirdKeyAI/Symbiont) +- [ThirdKey](https://thirdkey.ai) diff --git a/docs/pin-protocol.md b/docs/pin-protocol.md new file mode 100644 index 0000000..bf9f9fa --- /dev/null +++ b/docs/pin-protocol.md @@ -0,0 +1,238 @@ +# Pin Protocol + +This page walks through the VectorPin v2 wire format and verification order at a level useful to library users. 
For the normative specification, see [spec.md](spec.md). + +--- + +## Overview + +A **Pin** is a compact JSON attestation that commits to five things: + +| Commitment | How | +|---|---| +| Source text | SHA-256 of UTF-8 NFC-normalized bytes | +| Embedding model | Identifier string (and optional `model_hash` over weight shards) | +| Vector | SHA-256 of canonical little-endian dtype bytes | +| Producer | Ed25519 signature over `domain_tag \|\| canonical_json(header)` | +| Time | RFC 3339 timestamp `YYYY-MM-DDTHH:MM:SSZ` | + +The pin travels with the embedding through the vector DB, stored under metadata key `vectorpin`. Verification recomputes the hashes, checks the signature against a registered public key, and reports a distinct outcome for each failure mode. + +--- + +## Wire Format + +A v2 Pin is a JSON object with **exactly** these top-level fields: + +| Field | Type | Required | Description | +|---|---|---|---| +| `v` | integer | yes | Protocol version. Must equal `2`. | +| `kid` | string | yes | Identifier of the signing key. | +| `model` | string | yes | Embedding model identifier. | +| `model_hash` | string | no | Optional `sha256:` hash over concatenated weight shards. | +| `source_hash` | string | yes | `sha256:` hex of NFC-normalized source text. | +| `vec_hash` | string | yes | `sha256:` hex of canonical little-endian vector bytes. | +| `vec_dtype` | string | yes | `"f32"` or `"f64"`. | +| `vec_dim` | integer | yes | Embedding dimensionality, `1 ≤ vec_dim ≤ 2^20`. | +| `ts` | string | yes | UTC timestamp matching `^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z$`. | +| `extra` | object | no | `map<string, string>`. Reserved keys: see [Reserved Keys](#replay-protection). | +| `sig` | string | yes | Ed25519 signature, URL-safe base64, no padding, 64 bytes decoded. | + +**Unknown top-level fields cause `PARSE_ERROR`.** This is a verifier MUST — it defeats downgrade attacks where an attacker strips new fields and presents the remainder to an older verifier. 
+ +### Example + +```json +{ + "v": 2, + "kid": "prod-2026-05", + "model": "text-embedding-3-large", + "source_hash": "sha256:9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08", + "vec_hash": "sha256:0123abcd...", + "vec_dtype": "f32", + "vec_dim": 3072, + "ts": "2026-05-15T12:00:00Z", + "sig": "MEUCIQD..." +} +``` + +### Size Limits + +| Limit | Maximum | +|---|---| +| Total pin JSON, UTF-8 byte length | 64 KiB | +| `extra` entry count | 32 | +| Any `extra` key, UTF-8 byte length | 128 bytes | +| Any `extra` value, UTF-8 byte length | 1 KiB | +| `vec_dim` | 1,048,576 (2^20) | +| `sig`, decoded byte length | exactly 64 | + +Verifiers reject oversized pins **before** parsing the signature, to bound resource use under hostile input. + +--- + +## Canonicalization + +The signed byte sequence is: + +``` +signed_bytes := b"vectorpin/v2\x00" || canonical_json(header) +``` + +The 13-byte domain tag prevents cross-protocol signature reuse — a VectorPin signature cannot validate against a sister Trust-Stack protocol (SchemaPin, AgentPin) even if the same Ed25519 key is reused. + +`canonical_json` has fixed rules: + +- All keys sorted lexicographically by Unicode code point. +- No whitespace between tokens (separators are `,` and `:`, no surrounding spaces). +- UTF-8 encoding, NFC-normalized strings. +- `extra` omitted if empty; otherwise keys sorted by the same rule. +- `model_hash` omitted entirely if not set. +- Integers in minimal JSON form (no leading zeros, no exponent). +- Strings use JSON-standard escapes (`\"`, `\\`, `\b`, `\f`, `\n`, `\r`, `\t`, `\uXXXX` for `U+0000`–`U+001F` and `U+007F`). All other characters are raw UTF-8. + +The `sig` field is excluded from `signed_bytes`. **Every other field — including `v` and `kid` — is included.** This defeats: + +- **Version downgrade** — Cannot flip `v: 2` to `v: 1` to fool a legacy verifier. +- **Key swap** — Cannot re-attribute `(pin, sig)` to a different producer by editing `kid`. 
+ +### String Hygiene + +All string fields (`model`, `kid`, `ts`, `extra` keys and values, and the source text) MUST be NFC-normalized and MUST NOT contain: + +- Control characters in `U+0000`–`U+001F`. +- Bidirectional overrides `U+202A`–`U+202E`, `U+2066`–`U+2069`. + +Implementations reject both at sign time and at parse time. + +### Vector Hygiene + +Vectors MUST be free of `NaN`, `+Inf`, `-Inf` at sign time. `-0.0` and `+0.0` are distinct values and both valid; FTZ/DAZ floating-point modes must be disabled or vectors normalized before hashing. + +--- + +## Verification Order + +A conforming verifier MUST execute these steps in order. Short-circuit on the first failure and return the distinct outcome. + +### 0. Size check + +If the serialized JSON exceeds any size limit above, return `PARSE_ERROR` without parsing. + +### 1. Version check + +If `v` is not `2` (and, when legacy mode is explicitly enabled, not `1` either), return `UNSUPPORTED_VERSION`. + +### 2. Key registry lookup + +If `kid` is not in the verifier's registry, return `UNKNOWN_KEY`. + +If `kid` is registered but `ts` falls outside the entry's `(valid_from, valid_until)` window, return `KEY_EXPIRED`. This is how revocation works in v2 — see [Key Rotation](#key-rotation-and-revocation). + +### 3. Structural validation + +If the pin contains unknown top-level fields, non-string `extra` values, malformed `ts`, or any string field not in NFC form, return `PARSE_ERROR`. + +### 4. Signature + +Reconstruct `signed_bytes = "vectorpin/v2\x00" || canonical_json(header)` and verify `sig` against the registered public key for `kid`. On failure, return `SIGNATURE_INVALID`. + +### 5. Source check + +If the caller supplied a ground-truth source string, recompute `hash_text(source)` and compare to `source_hash`. On mismatch, return `SOURCE_MISMATCH`. + +### 6. Vector check + +If the caller supplied a ground-truth vector: + +1. Compare its shape to `vec_dim`. On mismatch, return `SHAPE_MISMATCH`. +2. 
Reject if the vector contains `NaN`/`Inf`, returning `PARSE_ERROR`. +3. Recompute `hash_vector(vector, vec_dtype)` and compare to `vec_hash`. On mismatch, return `VECTOR_TAMPERED`. + +### 7. Model check + +If the caller supplied an expected model identifier, compare to `model`. On mismatch, return `MODEL_MISMATCH`. + +### 8. Replay-protection check + +If the caller supplied an expected `vectorpin.record_id`, `vectorpin.collection_id`, or `vectorpin.tenant_id`, the verifier MUST compare against the value in `extra` and return `RECORD_MISMATCH` / `COLLECTION_MISMATCH` / `TENANT_MISMATCH` on mismatch. See [Replay Protection](#replay-protection). + +If every applicable step passes, return `OK`. + +--- + +## Key Rotation and Revocation + +Verifier registries hold multiple `kid → (public_key, valid_from, valid_until)` entries simultaneously. This is what makes both rotation and revocation work cleanly. + +### Rotation (hygiene) + +1. Generate a new keypair with a fresh `kid`. +2. Add the new public key to all verifier registries, with `valid_from` no earlier than when the new private key becomes operational. +3. Switch production signing to the new private key. +4. Optionally re-pin the corpus over time. +5. Set `valid_until` on the old key entry to the rotation cutover instant. **Do not remove the entry** — historical pins must continue to verify against it. + +Old pins continue to verify against the old key as long as their `ts` falls within the old key's `(valid_from, valid_until)` window. + +### Revocation (compromise) + +If a private key is compromised — as opposed to merely rotated — set `valid_until` on the `kid` entry to the latest moment the key is believed to have been uncompromised. Pins with `ts` after that instant return `KEY_EXPIRED`; pins with `ts` before it continue to verify. Historical pins stay valid; anything an attacker could forge post-compromise is rejected. 
+ +Pair this with a transparency-log entry (e.g., sigstore Rekor) for the revocation event itself, so downstream verifiers can detect a malicious registry rollback. + +--- + +## Replay Protection + +Pins are not bound to a specific record id at the wire format level. An attacker who copies a pin from record A to record B can pass verification only if the vector and source they paste alongside also match — but in a corpus full of near-duplicates, this is a real concern. + +The `extra` map carries reserved keys for this: + +| Reserved Key | Type | Meaning | +|---|---|---| +| `vectorpin.collection_id` | string | Identifier of the vector-store collection / index. | +| `vectorpin.record_id` | string | Identifier of the specific record this pin attests. | +| `vectorpin.tenant_id` | string | Identifier of the multi-tenant logical namespace. | + +Every `extra` entry is signed, so the values are tamper-evident. Implementations that need stronger replay protection SHOULD set these at pin time and verifiers MUST enforce them when the caller supplies an expected value. + +The `vectorpin.` prefix is reserved for this specification — implementations MUST NOT define their own keys under it. + +--- + +## Cross-Language Compatibility + +The Python, Rust, and TypeScript implementations produce byte-for-byte identical pins, locked together by [shared test vectors](https://github.com/ThirdKeyAI/VectorPin/blob/main/testvectors/v2.json) consumed in all three test suites. A pin produced in any one of them verifies in the other two. + +Concretely, this means: + +- Pinning can happen in your Rust ingestion pipeline; auditing can happen in a Python CI job. +- A TypeScript edge function can verify pins produced by a Python batch processor. +- Backups can be re-verified years later from any implementation that conforms to the v2 spec. + +--- + +## Failure-Mode Reference + +| Outcome | Cause | Typical action | +|---|---|---| +| `OK` | Everything checks out. | Proceed. 
| +| `UNSUPPORTED_VERSION` | `v` is not in this verifier's set. | Upgrade verifier, or re-pin under v2. | +| `UNKNOWN_KEY` | `kid` is not in the registry. | Misconfiguration — add the key. | +| `KEY_EXPIRED` | `ts` is outside the registered `(valid_from, valid_until)`. | Rotation / revocation working as intended. | +| `PARSE_ERROR` | Oversized pin, unknown top-level field, non-string `extra` value, malformed timestamp, non-NFC string, `NaN`/`Inf` in vector. | Reject the pin; likely hostile or buggy producer. | +| `SIGNATURE_INVALID` | Pin forged, signed with the wrong key, or canonicalization differs. | **Security incident.** | +| `VECTOR_TAMPERED` | Stored vector differs from the one originally pinned. | **Security incident.** Likely steganography or DB compromise. | +| `SOURCE_MISMATCH` | Source text differs from what was pinned. | Source-side drift — investigate. | +| `MODEL_MISMATCH` | Pin was produced by a different model than expected. | Ingestion pipeline using wrong model — investigate. | +| `SHAPE_MISMATCH` | Caller's vector has the wrong dimensionality. | Misconfiguration. | +| `RECORD_MISMATCH` / `COLLECTION_MISMATCH` / `TENANT_MISMATCH` | Replay-protection mismatch. | Likely pin-shuffle attack — investigate. | + +--- + +## See Also + +- [Specification](spec.md) — Normative v2 wire format +- [Getting Started](getting-started.md) — Pinning and verifying in code +- [Security](security.md) — Threat model and best practices diff --git a/docs/security.md b/docs/security.md new file mode 100644 index 0000000..8cb5214 --- /dev/null +++ b/docs/security.md @@ -0,0 +1,203 @@ +# Security Best Practices + +This guide covers the security model, threat mitigations, and operational best practices for VectorPin deployments. + +--- + +## Cryptographic Foundation + +VectorPin uses **Ed25519** signatures and **SHA-256** hashing exclusively. All other algorithms are rejected. 
+ +| Property | Value | +|----------|-------| +| Signature algorithm | Ed25519 (RFC 8032) | +| Hash algorithm | SHA-256 | +| Domain separator | exact ASCII bytes `vectorpin/v2\x00` (13 bytes) | +| Signature encoding | URL-safe base64, no padding (64 decoded bytes) | +| Pin format | JSON, canonicalized per spec §4.2 | + +### Why a single algorithm + +Single-algorithm enforcement prevents: + +- **Algorithm confusion attacks** — Attacker substitutes `none` or a weak HMAC variant to bypass verification. (VectorPin signatures are raw Ed25519, not JWT-shaped — but the principle still applies: there is no algorithm field to confuse.) +- **Downgrade attacks** — Attacker forces a weaker algorithm. The protocol version `v` is part of the signed payload (§4.2 of the spec), so a v2 verifier cannot be tricked into validating a v1-style signature on a v2 pin. +- **Implementation complexity** — Fewer code paths means fewer bugs. + +All three implementations (Python, Rust, TypeScript) bind the algorithm and the protocol version into the canonical bytes via the `vectorpin/v2\x00` domain tag before any signature operation. + +--- + +## Threat Model + +VectorPin is designed against an attacker who can: + +- ✅ **Modify vectors after they are produced** — via a poisoned ingestion pipeline, a compromised vector DB, or backup-level access. This is the steganography attack class from [VectorSmuggle](https://github.com/jaschadub/VectorSmuggle). VectorPin catches this with `VECTOR_TAMPERED`. +- ✅ **See the public verification key**, but not the private signing key. Recovering an Ed25519 private key from its public key is computationally infeasible. +- ✅ **Replay or selectively delete pins** within a corpus — partly mitigated via `vectorpin.record_id` / `vectorpin.collection_id` / `vectorpin.tenant_id` in `extra` (see [Replay Protection](#replay-protection)). 
+- ✅ **Strip new fields and present a downgraded pin** — defeated by `v` and `kid` being in the signed canonical bytes, plus strict rejection of unknown top-level fields. +- ✅ **Reuse a signature against a sister Trust-Stack protocol** — defeated by the 13-byte `vectorpin/v2\x00` domain separator. + +VectorPin does **not** defend against: + +- ❌ **An attacker with the private signing key.** Out of scope; key custody is the user's responsibility. See [Key Custody](deployment.md#key-custody). +- ❌ **An attacker who modifies source documents *before* embedding.** Pair with upstream content integrity controls (signed ingestion logs, document provenance). +- ❌ **An attacker who uses a legitimate signing key to attest a malicious vector at ingestion time.** Pair with upstream input validation; see [Statistical Detectors](detectors.md) for defense-in-depth. +- ❌ **Cross-record replay within a corpus** unless the caller uses `vectorpin.record_id` and verifies it. + +--- + +## Key Management + +### Private Key Security + +- **Never commit private keys** to version control. +- **Never embed private keys** in application code or environment variables in plaintext. +- Store private keys in secure storage: file system with restricted permissions, a KMS, or an HSM. +- Use separate keys for separate environments (dev, staging, production). +- Use separate keys for separate tenants in multi-tenant deployments. + +```bash +# Set restrictive permissions on private key files +chmod 600 ./keys/*.priv + +# Verify +ls -la ./keys/*.priv +# -rw------- 1 vectorpin vectorpin 32 May 15 2026 prod-2026-05.priv +``` + +Production deployments SHOULD use a KMS or hardware-backed signer. See [Deployment > Key Custody](deployment.md#key-custody). 
+ +### Key Rotation + +Rotate keys on a schedule, even without a known compromise: + +| Key Type | Rotation Period | +|----------|----------------| +| Production signing keys | Every 6–12 months | +| Development / testing keys | Every 3 months | +| Keys after suspected compromise | Immediately | + +The rotation procedure leaves historical pins valid forever — see [Deployment > Key Rotation](deployment.md#key-rotation) for the step-by-step. + +### Key ID Naming Convention + +Use descriptive, time-stamped key IDs for traceability: + +``` +{purpose}-{year}-{month} + +Examples: + prod-2026-05 # production, May 2026 + prod-2026-11 # production, November 2026 (after rotation) + staging-2026-05 # staging environment + tenant-acme-2026-05 # per-tenant key +``` + +### Key Fingerprints + +When identifying a key out of band (Slack, email, ticket), use a 16-hex-digit fingerprint of the public key: + +``` +fingerprint(pub) := SHA-256(pub)[:8] formatted as four colon-separated quads + +Example: 1f3a:7b22:9e0d:c4f1 +``` + +This is short enough to read over a phone, long enough to make collisions infeasible. + +--- + +## Operational Practices + +### Verifier Hygiene + +- **Always pin verifiers to a fixed key registry.** Do not enable TOFU for new pins unless you've explicitly weighed the tradeoff — auto-registering any `kid` you see makes the system a checksum, not a signature. +- **Honor the `(valid_from, valid_until)` window strictly.** This is how both rotation and revocation work. +- **Reject unknown top-level fields.** This is a verifier MUST in the v2 spec; it defeats downgrade attacks. +- **Enforce size limits before parsing the signature.** A single hostile pin without the §4.3 limits can exhaust verifier resources. + +### Storage Hygiene + +- Store pins under the metadata key `vectorpin` (the protocol convention). +- Back up your pins along with your vectors — a pin without its vector is useless, and vice versa. 
+- For cold backups, also store a copy of the public-key registry and `(valid_from, valid_until)` windows in effect at backup time, so you can verify the backup years later. + +### Replay Protection + +Pins are not bound to a specific record id at the wire-format level. An attacker who copies a pin from record A to record B can pass verification only if the vector and source they paste alongside also match. In a corpus of near-duplicates, that's a real concern. + +Set the reserved `extra` keys at pin time: + +```python +pin = signer.pin( + source=record_text, + model="text-embedding-3-large", + vector=record_vector, + extra={ + "vectorpin.collection_id": "rag-corpus", + "vectorpin.record_id": record_id, + "vectorpin.tenant_id": tenant_id, + }, +) +``` + +And verify them: + +```python +result = verifier.verify( + pin, + source=record_text, + vector=record_vector, + expected_record_id=record_id, + expected_collection_id="rag-corpus", + expected_tenant_id=tenant_id, +) +``` + +Every `extra` entry is signed, so the values are tamper-evident. Verifiers MUST enforce them when the caller supplies an expected value. + +The `vectorpin.` prefix is reserved for this specification — do not use it for your own keys. + +### Source-Time Integrity + +VectorPin attests to the relationship between source and vector **at pin time**. It does not attest that the source itself was authentic at ingestion. If the upstream document corpus is mutable, an attacker who controls it can alter a document before it is embedded; the resulting pin is legitimately signed, yet it attests to content you never intended to trust. + +Pair VectorPin with source-side controls where this matters: + +- Signed ingestion logs (e.g., a sigstore Rekor entry per ingested document). +- Document provenance metadata captured at the moment of ingestion. +- Source hashing on the upstream document, separate from the embedding. + +--- + +## Defense in Depth + +VectorPin alone is the cryptographic backstop for vector integrity. 
For a hardened deployment, layer it: + +| Layer | What it catches | +|---|---| +| **Source integrity** (upstream) | Tampering with documents *before* embedding. | +| **Input validation** (ingestion) | Out-of-policy content that would otherwise be embedded and pinned. | +| **Statistical detectors** ([details](detectors.md)) | Ingestion-time poisoning that passes input validation. | +| **VectorPin** | Any modification of a vector after it is pinned. | +| **Inline verification at retrieval** | Catches DB-level tampering before any model consumes a vector. | +| **Periodic audit** ([details](deployment.md#ci-integration)) | Backstop for the read-path verifier. | +| **Transparency log** (sigstore Rekor) | Detects registry rollback or undisclosed key revocation. | + +The cryptographic guarantee from VectorPin survives every layer above it being compromised, as long as the private key is safe. + +--- + +## Reporting Vulnerabilities + +For security-sensitive findings, email `security@thirdkey.ai` rather than filing public issues. Do not disclose details in GitHub issues, social media, or talks until a fix has shipped to all three implementations. + +--- + +## See Also + +- [Pin Protocol](pin-protocol.md) — How verification is structured +- [Deployment](deployment.md) — Key custody and rotation in practice +- [Detectors](detectors.md) — Statistical defense-in-depth +- [Specification §9](spec.md#9-security-considerations) — Normative security considerations diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md new file mode 100644 index 0000000..81d316f --- /dev/null +++ b/docs/troubleshooting.md @@ -0,0 +1,244 @@ +# Troubleshooting + +Common issues and solutions when working with VectorPin. + +--- + +## Verification Errors + +### `VECTOR_TAMPERED` + +**Problem:** The vector hash does not match `vec_hash`. + +``` +FAIL [vector_tampered] vec_hash mismatch +``` + +**Cause:** The stored vector differs from the one originally pinned. 
This is the steganography kill shot — it is what VectorPin exists to catch. Treat it as a security incident until proven otherwise. + +**Triage steps:** + +1. Confirm you're passing the **stored** vector (not a re-computed one) to `verify()`. +2. Check for `f32` vs `f64` dtype mismatch between pin time and verify time. The pin commits to a specific `vec_dtype`; converting before hashing breaks the hash. +3. Check for FTZ/DAZ (flush-to-zero / denormals-are-zero) floating-point modes. Disable them, or normalize subnormals to zero before hashing. +4. If 1–3 are clean, treat as a real integrity failure. Pull backups, investigate write paths, and rotate any keys whose `(valid_from, valid_until)` window covers the affected pins. + +--- + +### `SOURCE_MISMATCH` + +**Problem:** The source-text hash does not match `source_hash`. + +``` +FAIL [source_mismatch] source_hash mismatch +``` + +**Common causes:** + +1. **Encoding drift** — Source text was UTF-16 / Latin-1 at one site, UTF-8 at the other. +2. **Whitespace drift** — Trailing newline added or stripped; line endings normalized. +3. **Unicode normalization** — Text was not NFC-normalized before pinning. (VectorPin always NFC-normalizes; the failure means the input was different.) +4. **Wrong source column** — On Symbiont, `content` holds the text the embedding was produced from; `source` holds a URL. Using the wrong column breaks the hash. See [Adapters > Symbiont schema](adapters.md#symbiont-schema). + +**Solution:** Match the exact bytes the embedding was produced from. The pin commits to NFC-normalized UTF-8 of the source string passed to `signer.pin`. + +--- + +### `MODEL_MISMATCH` + +**Problem:** The pin's `model` field differs from the caller's expected model. + +``` +FAIL [model_mismatch] model "text-embedding-3-small" != expected "text-embedding-3-large" +``` + +**Cause:** The pin was produced with a different embedding model than the verifier expects. 
+ +**Solution:** Either the ingestion pipeline used the wrong model, or the verifier was configured with the wrong expectation. Confirm which is correct, then either re-pin or update the verifier config. + +This check only fires when the caller passes an `expected_model` argument. If you don't supply one, model mismatch is silently accepted (the source/vector hashes still have to match, so skipping the check only leaves the pin's `model` claim unverified against your expectations — it is not an integrity failure). + +--- + +### `UNKNOWN_KEY` + +**Problem:** The pin's `kid` is not in the verifier's registry. + +``` +FAIL [unknown_key] kid "prod-2025-11" not in registry +``` + +**Common causes:** + +1. **Misconfiguration** — Verifier wasn't given the public key for this `kid`. +2. **Cross-environment leak** — A pin from staging reached production (or vice versa). +3. **Forged `kid`** — Less likely, since `kid` is in the signed canonical bytes (§4.2). If `kid` was tampered with, you'd see `SIGNATURE_INVALID` first. + +**Solution:** Add the missing public key to the registry, **after** confirming the key fingerprint out of band: + +```python +verifier = Verifier({ + "prod-2026-05": load_public_bytes("./keys/prod-2026-05.pub"), + # add the missing one: + "prod-2025-11": load_public_bytes("./keys/prod-2025-11.pub"), +}) +``` + +**Do not** silently auto-register unknown keys (TOFU). That makes the system a checksum, not a signature. + +--- + +### `KEY_EXPIRED` + +**Problem:** The pin's `ts` falls outside the registered key's `(valid_from, valid_until)` window. + +``` +FAIL [key_expired] ts 2026-06-01T00:00:00Z is after valid_until 2026-05-15T12:00:00Z +``` + +**Cause:** Either the key was rotated (and the new key is what should have signed this pin) or the key was revoked due to compromise. + +**Solution:** + +- If the pin's `ts` is recent and the producer should still be signing with this key — the registry's `valid_until` is wrong. Fix the window. 
+- If `valid_until` is correct and the pin is supposed to be there — the pin is suspect. It was signed after the key was retired, which means either clock skew on the producer or a real compromise. Investigate. +- If the key was deliberately revoked due to compromise, this is the system working as designed. + +See [Pin Protocol > Key Rotation and Revocation](pin-protocol.md#key-rotation-and-revocation). + +--- + +### `SIGNATURE_INVALID` + +**Problem:** The Ed25519 signature does not validate against the registered public key. + +``` +FAIL [signature_invalid] signature verification failed +``` + +**Triage steps:** + +1. Confirm the public key bytes match the private key that produced the pin. (Public-key fingerprint mismatch = wrong key in the registry.) +2. Confirm both implementations are on the same protocol version. A v1 pin presented to a strict v2 verifier produces `UNSUPPORTED_VERSION`, not `SIGNATURE_INVALID` — but a v2 pin whose canonical bytes were reconstructed wrongly will fail signature. +3. Check for double-decoding of the pin JSON (e.g., the JSON was JSON-decoded twice and is now a string-of-a-string). + +If 1–3 are clean and the failure is consistent, treat as a security incident — someone produced a pin with a key that isn't yours. + +--- + +### `SHAPE_MISMATCH` + +**Problem:** The caller's vector has a different dimensionality than `vec_dim`. + +``` +FAIL [shape_mismatch] vector has dim 1536, pin has vec_dim 3072 +``` + +**Cause:** Wrong embedding model on either side, or wrong vector field plucked from the DB record. + +**Solution:** Confirm the vector being passed to `verify()` is the one from the same record as the pin, and that both sides agree on the embedding model. + +--- + +### `UNSUPPORTED_VERSION` + +**Problem:** The pin's `v` is not in the verifier's supported set. + +``` +FAIL [unsupported_version] v=1, only v=2 supported +``` + +**Cause:** A v1 pin from before the May 2026 wire-format break. v1 is not backwards-compatible with v2. 
+ +**Solution:** Re-pin the affected records using a v2 signer. If you need to verify v1 pins for a migration, use `LegacyV1Verifier` (opt-in only). See [Specification §12](spec.md#12-changes-from-v1) for the v1 → v2 change list. + +--- + +### `PARSE_ERROR` + +**Problem:** The pin JSON is malformed, oversized, or contains disallowed content. + +``` +FAIL [parse_error] unknown top-level field: "foo" +FAIL [parse_error] string field "model" is not NFC-normalized +FAIL [parse_error] pin exceeds 64 KiB size limit +``` + +**Common causes:** + +1. **Unknown top-level fields** — v2 verifiers MUST reject them. This is a downgrade-attack defense. +2. **Non-string `extra` values** — `extra` is strictly `map<string, string>`. +3. **Non-NFC strings** — every string field must be NFC-normalized. +4. **Control characters or bidi overrides** in strings — always rejected. +5. **Oversized pin** — exceeds the 64 KiB limit, or `extra` exceeds 32 entries. +6. **Malformed timestamp** — `ts` must match exactly `^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z$`. No fractional seconds, no timezone offsets, no lowercase `t`/`z`. + +**Solution:** Inspect the failing pin. Most `PARSE_ERROR` cases are caused by buggy producers — fix the producer, then re-pin. Hostile pins go to your security log. + +--- + +### `RECORD_MISMATCH` / `COLLECTION_MISMATCH` / `TENANT_MISMATCH` + +**Problem:** Replay-protection check failed. + +``` +FAIL [record_mismatch] expected record_id "doc-123", pin attests "doc-456" +``` + +**Cause:** A pin from record A was attached to record B, then audited with the expected record id for B. This is exactly what the reserved `vectorpin.record_id` (etc.) keys are designed to catch — see [Security > Replay Protection](security.md#replay-protection). + +**Solution:** Investigate as a possible pin-shuffle attack. If it's a legitimate copy / re-keying operation, re-pin under the new record id. 
+ +--- + +## Pinning Errors + +### `vector contains NaN` / `vector contains Inf` + +**Problem:** The vector contains `NaN`, `+Inf`, or `-Inf`. + +**Cause:** Numerical issue upstream of pinning — e.g., a model that produced NaN for an empty or degenerate input. + +**Solution:** Reject the embedding upstream; don't try to pin it. Pinning NaN/Inf is forbidden by the protocol because their canonical byte representation is not guaranteed to be unique across implementations. + +--- + +### `string field is not NFC-normalized` + +**Problem:** A string field (source, model, kid, or an `extra` key/value) is not NFC-normalized. + +**Solution:** Normalize before passing to `signer.pin`. The signer also normalizes, but it rejects strings containing control characters or bidi overrides outright. + +--- + +### Vector dtype confusion + +If you sign with `f32` but verify with the same array cast to `f64`, the hashes differ. The pin's `vec_dtype` is authoritative — verifiers cast the supplied vector to that dtype before hashing. Make sure your storage and your signing path agree on the dtype. + +--- + +## Adapter Errors + +### LanceDB: `column 'text' not found` + +**Problem:** `--source-column text` (or the programmatic equivalent) names a column that doesn't exist on the table. + +**Solution:** Check the table schema. On Symbiont, the source text lives in `content`, not `text`. Pass `--source-column content`. + +### Chroma: `audit-chroma requires either --path or --host` + +**Solution:** Pick one. `--path /path/to/db` for `PersistentClient`, `--host chroma.host --port 8000` for HTTP. + +### Qdrant: pin not visible in payload + +**Problem:** Pins were attached, but `audit-qdrant` reports them as `unpinned`. + +**Cause:** The pin is stored under `payload["vectorpin"]`. If your write path uses `set_payload` with a partial dict, ensure the dict actually includes that key. 
+ +--- + +## See Also + +- [Pin Protocol > Failure-Mode Reference](pin-protocol.md#failure-mode-reference) — One-line summary of every error +- [Security](security.md) — When to escalate, when to fix config +- [Getting Started](getting-started.md) — Sanity-check end-to-end walkthrough diff --git a/zensical.toml b/zensical.toml new file mode 100644 index 0000000..786482e --- /dev/null +++ b/zensical.toml @@ -0,0 +1,44 @@ +[project] +site_name = "VectorPin Documentation" +site_url = "https://docs.vectorpin.org/" +site_description = "Verifiable integrity for AI embedding stores" +repo_url = "https://github.com/ThirdKeyAI/VectorPin" +repo_name = "ThirdKeyAI/VectorPin" +nav = [ + {"Home" = "index.md"}, + {"Getting Started" = "getting-started.md"}, + {"Pin Protocol" = "pin-protocol.md"}, + {"CLI Guide" = "cli-guide.md"}, + {"Vector Store Adapters" = "adapters.md"}, + {"Statistical Detectors" = "detectors.md"}, + {"Deployment" = "deployment.md"}, + {"Security" = "security.md"}, + {"Troubleshooting" = "troubleshooting.md"}, + {"Specification" = "spec.md"}, + {"thirdkey.ai" = "https://thirdkey.ai"}, +] + +[[project.theme.palette]] +scheme = "default" +toggle.icon = "lucide/sun" +toggle.name = "Switch to dark mode" + +[[project.theme.palette]] +scheme = "slate" +toggle.icon = "lucide/moon" +toggle.name = "Switch to light mode" + +[[project.extra.social]] +icon = "fontawesome/brands/github" +link = "https://github.com/ThirdKeyAI/VectorPin" +name = "VectorPin on GitHub" + +[[project.extra.social]] +icon = "fontawesome/solid/globe" +link = "https://thirdkey.ai" +name = "ThirdKey" + +[[project.extra.social]] +icon = "fontawesome/solid/shield-halved" +link = "https://doi.org/10.5281/zenodo.20058256" +name = "VectorSmuggle preprint"