From b6b4489a38c40a8d02aec7a7a3e34372984c261a Mon Sep 17 00:00:00 2001
From: vchamarthi <venkata.sai.gireesh.chamarthi@intel.com>
Date: Mon, 4 May 2026 08:08:15 -0500
Subject: [PATCH 1/7] initial commit

---
 benchmarks/asv.conf.json                      |  13 ++
 benchmarks/benchmarks/__init__.py             |   4 +
 benchmarks/benchmarks/_patch_setup.py         |  74 ++++++++++
 benchmarks/benchmarks/micro/__init__.py       |   0
 benchmarks/benchmarks/micro/bench_exp_log.py  |  98 +++++++++++++
 .../benchmarks/micro/bench_sqrt_misc.py       |  84 +++++++++++
 benchmarks/benchmarks/micro/bench_trig.py     | 134 ++++++++++++++++++
 benchmarks/benchmarks/npbench/__init__.py     |   0
 .../benchmarks/npbench/bench_arc_distance.py  |  50 +++++++
 .../benchmarks/npbench/bench_cholesky2.py     |  54 +++++++
 .../benchmarks/npbench/bench_correlation.py   |  56 ++++++++
 .../benchmarks/npbench/bench_covariance.py    |  53 +++++++
 .../benchmarks/npbench/bench_deriche.py       | 111 +++++++++++++++
 .../benchmarks/npbench/bench_doitgen.py       |  52 +++++++
 benchmarks/benchmarks/npbench/bench_gemm.py   |  53 +++++++
 benchmarks/benchmarks/npbench/bench_gemver.py |  71 ++++++++++
 .../benchmarks/npbench/bench_gesummv.py       |  45 ++++++
 .../benchmarks/npbench/bench_go_fast.py       |  69 +++++++++
 benchmarks/benchmarks/npbench/bench_k2mm.py   |  55 +++++++
 benchmarks/benchmarks/npbench/bench_k3mm.py   |  44 ++++++
 .../benchmarks/npbench/bench_mandelbrot.py    |  93 ++++++++++++
 .../benchmarks/npbench/bench_softmax.py       |  48 +++++++
 benchmarks/bootstrap-dashboard-branch.sh      |  40 ++++++
 23 files changed, 1301 insertions(+)
 create mode 100644 benchmarks/asv.conf.json
 create mode 100644 benchmarks/benchmarks/__init__.py
 create mode 100644 benchmarks/benchmarks/_patch_setup.py
 create mode 100644 benchmarks/benchmarks/micro/__init__.py
 create mode 100644 benchmarks/benchmarks/micro/bench_exp_log.py
 create mode 100644 benchmarks/benchmarks/micro/bench_sqrt_misc.py
 create mode 100644 benchmarks/benchmarks/micro/bench_trig.py
 create mode 100644 benchmarks/benchmarks/npbench/__init__.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_arc_distance.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_cholesky2.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_correlation.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_covariance.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_deriche.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_doitgen.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_gemm.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_gemver.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_gesummv.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_go_fast.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_k2mm.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_k3mm.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_mandelbrot.py
 create mode 100644 benchmarks/benchmarks/npbench/bench_softmax.py
 create mode 100644 benchmarks/bootstrap-dashboard-branch.sh

diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
new file mode 100644
index 00000000..facb1284
--- /dev/null
+++ b/benchmarks/asv.conf.json
@@ -0,0 +1,13 @@
+{
+    "version": 1,
+    "project": "mkl_umath",
+    "project_url": "https://github.com/IntelPython/mkl_umath",
+    "repo": "..",
+    "branches": ["main"],
+    "environment_type": "existing",
+    "benchmark_dir": "benchmarks",
+    "env_dir": ".asv/env",
+    "results_dir": ".asv/results",
+    "html_dir": ".asv/html",
+    "show_commit_url": "https://github.com/IntelPython/mkl_umath/commit/"
+}
diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
new file mode 100644
index 00000000..bcf027e4
--- /dev/null
+++ b/benchmarks/benchmarks/__init__.py
@@ -0,0 +1,4 @@
+# Trigger MKL patching once per ASV worker process.
+# ASV uses --launch-method spawn in CI, so each worker is a fresh process
+# and this runs exactly once before any benchmark is collected or timed.
+from . import _patch_setup  # noqa: F401
diff --git a/benchmarks/benchmarks/_patch_setup.py b/benchmarks/benchmarks/_patch_setup.py
new file mode 100644
index 00000000..f7435c61
--- /dev/null
+++ b/benchmarks/benchmarks/_patch_setup.py
@@ -0,0 +1,74 @@
+"""MKL patch setup — executed once per ASV worker process at import time.
+
+Patches NumPy with Intel MKL implementations for fft, random, and umath.
+Raises RuntimeError if a package or its patch hook is missing, the hook fails,
+or is_patched() (if exposed) is False, so benchmarks never run on stock NumPy.
+
+Visible output goes to stderr; pass --show-stderr to ``asv run`` to see it.
+"""
+
+import sys
+
+_PATCH_MAP = [
+    ("mkl_fft",    "patch_numpy_fft"),
+    ("mkl_random", "patch_numpy_random"),
+    ("mkl_umath",  "patch_numpy_umath"),
+]
+
+
+def _apply_patches():  # patch NumPy per _PATCH_MAP; RuntimeError on failure
+    patched = {}  # module name -> imported module, added once its patch passed
+
+    for mod_name, patch_fn_name in _PATCH_MAP:
+        try:
+            mod = __import__(mod_name)
+        except ImportError as exc:  # package not importable in this environment
+            raise RuntimeError(
+                f"[mkl-patch] Cannot import {mod_name}: {exc}\n"
+                f"  Ensure the conda env contains {mod_name} from the Intel channel.\n"
+                f"  Required channels: https://software.repos.intel.com/python/conda"
+            ) from exc
+
+        patch_fn = getattr(mod, patch_fn_name, None)  # None: hook not exposed
+        if patch_fn is None:
+            raise RuntimeError(
+                f"[mkl-patch] {mod_name} has no {patch_fn_name}(). "
+                f"Upgrade {mod_name} to a version that exposes the stock-numpy patch API."
+            )
+
+        try:
+            patch_fn()
+        except Exception as exc:  # any hook failure is re-raised with context
+            raise RuntimeError(
+                f"[mkl-patch] {mod_name}.{patch_fn_name}() raised: {exc!r}"
+            ) from exc
+
+        is_patched_fn = getattr(mod, "is_patched", None)  # optional self-check
+        if callable(is_patched_fn) and not is_patched_fn():  # skipped if absent
+            raise RuntimeError(
+                f"[mkl-patch] {mod_name}.is_patched() returned False after patching. "
+                f"NumPy may have been imported before patching in a conflicting state."
+            )
+
+        patched[mod_name] = mod
+
+    # Verbose attribution — report only: logs __module__ per entry point, never fails
+    import numpy as np  # local import: only needed for the report below
+
+    _attr_checks = {
+        "mkl_fft":    lambda: np.fft.fft.__module__,
+        "mkl_random": lambda: np.random.random.__module__,
+        "mkl_umath":  lambda: np.exp.__module__,
+    }
+    for mod_name in patched:
+        try:
+            attr = _attr_checks[mod_name]()
+        except Exception:  # diagnostics only: a failed lookup must not abort
+            attr = "unknown"
+        sys.stderr.write(f"[mkl-patch] {mod_name}: numpy dispatch → {attr}\n")
+
+    sys.stderr.write("[mkl-patch] ALL OK — mkl_fft, mkl_random, mkl_umath active\n")
+    sys.stderr.flush()
+
+
+_apply_patches()
diff --git a/benchmarks/benchmarks/micro/__init__.py b/benchmarks/benchmarks/micro/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/benchmarks/benchmarks/micro/bench_exp_log.py b/benchmarks/benchmarks/micro/bench_exp_log.py
new file mode 100644
index 00000000..216fc740
--- /dev/null
+++ b/benchmarks/benchmarks/micro/bench_exp_log.py
@@ -0,0 +1,98 @@
+"""Micro-benchmarks for mkl_umath exponential and logarithm ufuncs.
+
+Each class times a single ufunc over a Cartesian product of
+  dtype  ∈ [float32, float64]
+  size   ∈ [10_000, 100_000, 1_000_000]
+
+Arrays are pre-allocated in setup() and reused across timing calls.
+Patching is applied once at package import via benchmarks._patch_setup.
+"""
+
+import numpy as np
+
+
+class BenchExp:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        # float32 overflows exp around 88.7; use [-10, 10] safe for both dtypes
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-10.0, 10.0, size).astype(dtype)
+
+    def time_exp(self, dtype, size):
+        np.exp(self.x)
+
+
+class BenchExp2:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        # float32 overflows exp2 around 127
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-10.0, 10.0, size).astype(dtype)
+
+    def time_exp2(self, dtype, size):
+        np.exp2(self.x)
+
+
+class BenchExpm1:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-10.0, 10.0, size).astype(dtype)
+
+    def time_expm1(self, dtype, size):
+        np.expm1(self.x)
+
+
+class BenchLog:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(1e-3, 1e3, size).astype(dtype)
+
+    def time_log(self, dtype, size):
+        np.log(self.x)
+
+
+class BenchLog2:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(1e-3, 1e3, size).astype(dtype)
+
+    def time_log2(self, dtype, size):
+        np.log2(self.x)
+
+
+class BenchLog10:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(1e-3, 1e3, size).astype(dtype)
+
+    def time_log10(self, dtype, size):
+        np.log10(self.x)
+
+
+class BenchLog1p:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        # log1p(x) is defined for x > -1; use [0, 10] which is always safe
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(0.0, 10.0, size).astype(dtype)
+
+    def time_log1p(self, dtype, size):
+        np.log1p(self.x)
diff --git a/benchmarks/benchmarks/micro/bench_sqrt_misc.py b/benchmarks/benchmarks/micro/bench_sqrt_misc.py
new file mode 100644
index 00000000..b1170639
--- /dev/null
+++ b/benchmarks/benchmarks/micro/bench_sqrt_misc.py
@@ -0,0 +1,84 @@
+"""Micro-benchmarks for mkl_umath sqrt, cbrt, and miscellaneous ufuncs.
+
+Each class times a single ufunc over a Cartesian product of
+  dtype  ∈ [float32, float64]
+  size   ∈ [10_000, 100_000, 1_000_000]
+
+Arrays are pre-allocated in setup() and reused across timing calls.
+Patching is applied once at package import via benchmarks._patch_setup.
+"""
+
+import numpy as np
+
+
+class BenchSqrt:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(0.0, 100.0, size).astype(dtype)
+
+    def time_sqrt(self, dtype, size):
+        np.sqrt(self.x)
+
+
+class BenchCbrt:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-100.0, 100.0, size).astype(dtype)
+
+    def time_cbrt(self, dtype, size):
+        np.cbrt(self.x)
+
+
+class BenchSquare:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-10.0, 10.0, size).astype(dtype)
+
+    def time_square(self, dtype, size):
+        np.square(self.x)
+
+
+class BenchFabs:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-100.0, 100.0, size).astype(dtype)
+
+    def time_fabs(self, dtype, size):
+        np.fabs(self.x)
+
+
+class BenchAbsolute:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-100.0, 100.0, size).astype(dtype)
+
+    def time_absolute(self, dtype, size):
+        np.absolute(self.x)
+
+
+class BenchReciprocal:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        # Avoid values near zero to prevent inf results dominating timing
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(0.01, 100.0, size).astype(dtype)
+
+    def time_reciprocal(self, dtype, size):
+        np.reciprocal(self.x)
diff --git a/benchmarks/benchmarks/micro/bench_trig.py b/benchmarks/benchmarks/micro/bench_trig.py
new file mode 100644
index 00000000..eb09b9c6
--- /dev/null
+++ b/benchmarks/benchmarks/micro/bench_trig.py
@@ -0,0 +1,134 @@
+"""Micro-benchmarks for mkl_umath trigonometric ufuncs.
+
+Each class times a single ufunc over a Cartesian product of
+  dtype  ∈ [float32, float64]
+  size   ∈ [10_000, 100_000, 1_000_000]
+
+Arrays are pre-allocated in setup() and reused across timing calls.
+Patching is applied once at package import via benchmarks._patch_setup.
+"""
+
+import numpy as np
+
+
+class BenchSin:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-np.pi, np.pi, size).astype(dtype)
+
+    def time_sin(self, dtype, size):
+        np.sin(self.x)
+
+
+class BenchCos:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-np.pi, np.pi, size).astype(dtype)
+
+    def time_cos(self, dtype, size):
+        np.cos(self.x)
+
+
+class BenchTan:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        # Avoid values near ±π/2 where tan diverges
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-1.4, 1.4, size).astype(dtype)
+
+    def time_tan(self, dtype, size):
+        np.tan(self.x)
+
+
+class BenchArcsin:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-1.0, 1.0, size).astype(dtype)
+
+    def time_arcsin(self, dtype, size):
+        np.arcsin(self.x)
+
+
+class BenchArccos:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-1.0, 1.0, size).astype(dtype)
+
+    def time_arccos(self, dtype, size):
+        np.arccos(self.x)
+
+
+class BenchArctan:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-10.0, 10.0, size).astype(dtype)
+
+    def time_arctan(self, dtype, size):
+        np.arctan(self.x)
+
+
+class BenchArctan2:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.y = rng.uniform(-1.0, 1.0, size).astype(dtype)
+        self.x = rng.uniform(-1.0, 1.0, size).astype(dtype)
+
+    def time_arctan2(self, dtype, size):
+        np.arctan2(self.y, self.x)
+
+
+class BenchSinh:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        # float32 overflows sinh around ±89; keep well inside that
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-5.0, 5.0, size).astype(dtype)
+
+    def time_sinh(self, dtype, size):
+        np.sinh(self.x)
+
+
+class BenchCosh:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-5.0, 5.0, size).astype(dtype)
+
+    def time_cosh(self, dtype, size):
+        np.cosh(self.x)
+
+
+class BenchTanh:
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(-5.0, 5.0, size).astype(dtype)
+
+    def time_tanh(self, dtype, size):
+        np.tanh(self.x)
diff --git a/benchmarks/benchmarks/npbench/__init__.py b/benchmarks/benchmarks/npbench/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/benchmarks/benchmarks/npbench/bench_arc_distance.py b/benchmarks/benchmarks/npbench/bench_arc_distance.py
new file mode 100644
index 00000000..d8039649
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_arc_distance.py
@@ -0,0 +1,50 @@
+"""npbench wrapper: Arc Distance — mkl_umath ops: sin, cos, arctan2, sqrt.
+
+Preset sizes from npbench bench_info/arc_distance.json:
+  S: N=100_000
+  L: N=10_000_000
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance.py
+def _initialize(N):
+    from numpy.random import default_rng
+    rng = default_rng(42)
+    t0 = rng.random((N,))
+    p0 = rng.random((N,))
+    t1 = rng.random((N,))
+    p1 = rng.random((N,))
+    return t0, p0, t1, p1
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance_numpy.py
+def _arc_distance(theta_1, phi_1, theta_2, phi_2):
+    temp = (
+        np.sin((theta_2 - theta_1) / 2) ** 2
+        + np.cos(theta_1) * np.cos(theta_2) * np.sin((phi_2 - phi_1) / 2) ** 2
+    )
+    return 2 * np.arctan2(np.sqrt(temp), np.sqrt(1 - temp))
+
+
+_PRESETS = {
+    "S": {"N": 100_000},
+    "L": {"N": 10_000_000},
+}
+
+
+class BenchArcDistance:
+    params = (["S", "L"],)
+    param_names = ["preset"]
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        self.theta_1, self.phi_1, self.theta_2, self.phi_2 = cache[preset]
+
+    def time_arc_distance(self, cache, preset):
+        _arc_distance(self.theta_1, self.phi_1, self.theta_2, self.phi_2)
diff --git a/benchmarks/benchmarks/npbench/bench_cholesky2.py b/benchmarks/benchmarks/npbench/bench_cholesky2.py
new file mode 100644
index 00000000..ea095122
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_cholesky2.py
@@ -0,0 +1,54 @@
+"""npbench wrapper: Cholesky decomposition v2 — mkl_umath ops: linalg.cholesky.
+
+Preset sizes from npbench bench_info/cholesky2.json:
+  S: N=1000
+  L: N=8000
+
+The kernel mutates A in-place (A[:] = cholesky(A) + triu(A, k=1)), so
+setup() copies A from cache before each timing round.
+
+The initialization constructs a symmetric positive-definite matrix via A @ A^T,
+which is expensive at N=8000.  setup_cache() runs this once per commit.
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/cholesky2/cholesky2.py
+def _initialize(N, datatype=np.float64):
+    A = np.empty((N, N), dtype=datatype)
+    for i in range(N):
+        A[i, :i + 1] = np.fromfunction(
+            lambda j: (-j % N) / N + 1, (i + 1,), dtype=datatype
+        )
+        A[i, i + 1:] = 0.0
+        A[i, i] = 1.0
+    A[:] = A @ np.transpose(A)
+    return A
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/cholesky2/cholesky2_numpy.py
+def _kernel(A):
+    A[:] = np.linalg.cholesky(A) + np.triu(A, k=1)
+
+
+_PRESETS = {"S": {"N": 1000}, "L": {"N": 8000}}  # preset -> _initialize kwargs
+
+
+class BenchCholesky2:
+    params = (["S", "L"],)
+    param_names = ["preset"]
+    # ASV runs setup() once per sample, then time_*() `number` times; pin it
+    # to 1 so every timed call factorises a fresh copy, not its own output.
+    number = 1
+
+    def setup_cache(self):  # built once; ASV passes the result as `cache`
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        self.A = cache[preset].copy()  # kernel mutates A in-place
+
+    def time_cholesky2(self, cache, preset):
+        _kernel(self.A)
diff --git a/benchmarks/benchmarks/npbench/bench_correlation.py b/benchmarks/benchmarks/npbench/bench_correlation.py
new file mode 100644
index 00000000..c5c7471d
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_correlation.py
@@ -0,0 +1,56 @@
+"""npbench wrapper: Correlation — mkl_umath ops: sqrt, std, mean.
+
+Preset sizes from npbench bench_info/correlation.json:
+  S: M=500,  N=600
+  L: M=3200, N=4000
+
+The kernel mutates ``data`` in-place (data -= mean; data /= ...), so
+setup() copies from the cache before each timing round.
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/correlation/correlation.py
+def _initialize(M, N, datatype=np.float64):
+    float_n = datatype(N)
+    data = np.fromfunction(lambda i, j: (i * j) / M + i, (N, M), dtype=datatype)
+    return float_n, data
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/correlation/correlation_numpy.py
+def _kernel(M, float_n, data):
+    mean = np.mean(data, axis=0)
+    stddev = np.std(data, axis=0)
+    stddev[stddev <= 0.1] = 1.0
+    data -= mean
+    data /= np.sqrt(float_n) * stddev
+    corr = np.eye(M, dtype=data.dtype)
+    for i in range(M - 1):
+        corr[i + 1:M, i] = corr[i, i + 1:M] = data[:, i] @ data[:, i + 1:M]
+    return corr
+
+
+_PRESETS = {"S": {"M": 500, "N": 600}, "L": {"M": 3200, "N": 4000}}
+
+
+class BenchCorrelation:
+    params = (["S", "L"],)
+    param_names = ["preset"]
+    # ASV runs setup() once per sample, then time_*() `number` times; pin it
+    # to 1 so every timed call sees un-normalised data, not its own output.
+    number = 1
+
+    def setup_cache(self):  # built once; ASV passes the result as `cache`
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        float_n, data = cache[preset]
+        self.M = _PRESETS[preset]["M"]
+        self.float_n = float_n
+        self.data = data.copy()  # kernel mutates data in-place
+
+    def time_correlation(self, cache, preset):
+        _kernel(self.M, self.float_n, self.data)
diff --git a/benchmarks/benchmarks/npbench/bench_covariance.py b/benchmarks/benchmarks/npbench/bench_covariance.py
new file mode 100644
index 00000000..bc541c5b
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_covariance.py
@@ -0,0 +1,53 @@
+"""npbench wrapper: Covariance — mkl_umath ops: mean.
+
+Preset sizes from npbench bench_info/covariance.json:
+  S: M=500,  N=600
+  L: M=3200, N=4000
+
+The kernel mutates ``data`` in-place (data -= mean), so setup() copies
+from the cache before each timing round.
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/covariance/covariance.py
+def _initialize(M, N, datatype=np.float64):
+    float_n = datatype(N)
+    data = np.fromfunction(lambda i, j: (i * j) / M, (N, M), dtype=datatype)
+    return float_n, data
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/covariance/covariance_numpy.py
+def _kernel(M, float_n, data):
+    mean = np.mean(data, axis=0)
+    data -= mean
+    cov = np.zeros((M, M), dtype=data.dtype)
+    for i in range(M):
+        cov[i:M, i] = cov[i, i:M] = data[:, i] @ data[:, i:M] / (float_n - 1.0)
+    return cov
+
+
+_PRESETS = {"S": {"M": 500, "N": 600}, "L": {"M": 3200, "N": 4000}}
+
+
+class BenchCovariance:
+    params = (["S", "L"],)
+    param_names = ["preset"]
+    # ASV runs setup() once per sample, then time_*() `number` times; pin it
+    # to 1 so every timed call sees the original data, not centred output.
+    number = 1
+
+    def setup_cache(self):  # built once; ASV passes the result as `cache`
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        float_n, data = cache[preset]
+        self.M = _PRESETS[preset]["M"]
+        self.float_n = float_n
+        self.data = data.copy()  # kernel mutates data in-place
+
+    def time_covariance(self, cache, preset):
+        _kernel(self.M, self.float_n, self.data)
diff --git a/benchmarks/benchmarks/npbench/bench_deriche.py b/benchmarks/benchmarks/npbench/bench_deriche.py
new file mode 100644
index 00000000..4cb93f1e
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_deriche.py
@@ -0,0 +1,111 @@
+"""npbench wrapper: Deriche Edge Detector — mkl_umath ops: exp.
+
+Preset sizes from npbench bench_info/deriche.json:
+  S: W=400,  H=200
+  L: W=6000, H=3000
+
+Warning: this kernel contains Python for-loops over rows/columns.
+At the L preset the Python loops dominate runtime; exp() calls on scalar
+floats are measured, not vectorised MKL VM throughput.  The L preset is
+included for historical comparability with npbench runs.
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/deriche/deriche.py
+def _initialize(W, H, datatype=np.float64):
+    alpha = datatype(0.25)
+    imgIn = np.fromfunction(
+        lambda i, j: ((313 * i + 991 * j) % 65536) / 65535.0,
+        (W, H),
+        dtype=datatype,
+    )
+    return alpha, imgIn
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/deriche/deriche_numpy.py
+def _kernel(alpha, imgIn):
+    k = (
+        (1.0 - np.exp(-alpha))
+        * (1.0 - np.exp(-alpha))
+        / (1.0 + alpha * np.exp(-alpha) - np.exp(2.0 * alpha))
+    )
+    a1 = a5 = k
+    a2 = a6 = k * np.exp(-alpha) * (alpha - 1.0)
+    a3 = a7 = k * np.exp(-alpha) * (alpha + 1.0)
+    a4 = a8 = -k * np.exp(-2.0 * alpha)
+    b1 = 2.0 ** (-alpha)
+    b2 = -np.exp(-2.0 * alpha)
+    c1 = c2 = 1
+
+    y1 = np.empty_like(imgIn)
+    y1[:, 0] = a1 * imgIn[:, 0]
+    y1[:, 1] = a1 * imgIn[:, 1] + a2 * imgIn[:, 0] + b1 * y1[:, 0]
+    for j in range(2, imgIn.shape[1]):
+        y1[:, j] = (
+            a1 * imgIn[:, j]
+            + a2 * imgIn[:, j - 1]
+            + b1 * y1[:, j - 1]
+            + b2 * y1[:, j - 2]
+        )
+
+    y2 = np.empty_like(imgIn)
+    y2[:, -1] = 0.0
+    y2[:, -2] = a3 * imgIn[:, -1]
+    for j in range(imgIn.shape[1] - 3, -1, -1):
+        y2[:, j] = (
+            a3 * imgIn[:, j + 1]
+            + a4 * imgIn[:, j + 2]
+            + b1 * y2[:, j + 1]
+            + b2 * y2[:, j + 2]
+        )
+
+    imgOut = c1 * (y1 + y2)
+
+    y1[0, :] = a5 * imgOut[0, :]
+    y1[1, :] = a5 * imgOut[1, :] + a6 * imgOut[0, :] + b1 * y1[0, :]
+    for i in range(2, imgIn.shape[0]):
+        y1[i, :] = (
+            a5 * imgOut[i, :]
+            + a6 * imgOut[i - 1, :]
+            + b1 * y1[i - 1, :]
+            + b2 * y1[i - 2, :]
+        )
+
+    y2[-1, :] = 0.0
+    y2[-2, :] = a7 * imgOut[-1, :]
+    for i in range(imgIn.shape[0] - 3, -1, -1):
+        y2[i, :] = (
+            a7 * imgOut[i + 1, :]
+            + a8 * imgOut[i + 2, :]
+            + b1 * y2[i + 1, :]
+            + b2 * y2[i + 2, :]
+        )
+
+    return c2 * (y1 + y2)
+
+
+_PRESETS = {
+    "S": {"W": 400,  "H": 200},
+    "L": {"W": 6000, "H": 3000},
+}
+
+
+class BenchDeriche:
+    # L preset has Python loops over 6000 rows — allow extra time
+    timeout = 600
+
+    params = (["S", "L"],)
+    param_names = ["preset"]
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        self.alpha, self.imgIn = cache[preset]
+
+    def time_deriche(self, cache, preset):
+        _kernel(self.alpha, self.imgIn)
diff --git a/benchmarks/benchmarks/npbench/bench_doitgen.py b/benchmarks/benchmarks/npbench/bench_doitgen.py
new file mode 100644
index 00000000..86467424
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_doitgen.py
@@ -0,0 +1,52 @@
+"""npbench wrapper: Doitgen (multiresolution analysis) — mkl_umath ops: matmul.
+
+Preset sizes from npbench bench_info/doitgen.json:
+  S: NR=60,  NQ=60,  NP=128
+  L: NR=220, NQ=250, NP=512
+
+The kernel mutates ``A`` in-place (A[:] = ...), so setup() copies from cache.
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/doitgen/doitgen.py
+def _initialize(NR, NQ, NP, datatype=np.float64):
+    A = np.fromfunction(
+        lambda i, j, k: ((i * j + k) % NP) / NP, (NR, NQ, NP), dtype=datatype
+    )
+    C4 = np.fromfunction(
+        lambda i, j: (i * j % NP) / NP, (NP, NP), dtype=datatype
+    )
+    return A, C4
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/doitgen/doitgen_numpy.py
+def _kernel(NR, NQ, NP, A, C4):
+    A[:] = np.reshape(np.reshape(A, (NR, NQ, 1, NP)) @ C4, (NR, NQ, NP))
+
+
+_PRESETS = {"S": {"NR": 60, "NQ": 60, "NP": 128},
+            "L": {"NR": 220, "NQ": 250, "NP": 512}}
+
+
+class BenchDoitgen:
+    params = (["S", "L"],)
+    param_names = ["preset"]
+    # setup() is not rerun between calls of one sample; time one call each.
+    number = 1
+
+    def setup_cache(self):  # built once; ASV passes the result as `cache`
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        A, C4 = cache[preset]
+        p = _PRESETS[preset]
+        self.NR, self.NQ, self.NP = p["NR"], p["NQ"], p["NP"]
+        self.A = A.copy()  # kernel mutates A in-place
+        self.C4 = C4
+
+    def time_doitgen(self, cache, preset):
+        _kernel(self.NR, self.NQ, self.NP, self.A, self.C4)
diff --git a/benchmarks/benchmarks/npbench/bench_gemm.py b/benchmarks/benchmarks/npbench/bench_gemm.py
new file mode 100644
index 00000000..15b29ed6
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_gemm.py
@@ -0,0 +1,53 @@
+"""npbench wrapper: GEMM (general matrix-matrix multiply) — mkl_umath ops: matmul.
+
+Preset sizes from npbench bench_info/gemm.json:
+  S: NI=1000, NJ=1100, NK=1200
+  L: NI=7000, NJ=7500, NK=8000
+
+The kernel mutates C in-place (C[:] = alpha * A @ B + beta * C), so
+setup() copies C from cache before each timing round.
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemm/gemm.py
+def _initialize(NI, NJ, NK, datatype=np.float64):
+    alpha = datatype(1.5)
+    beta  = datatype(1.2)
+    C = np.fromfunction(lambda i, j: ((i * j + 1) % NI) / NI, (NI, NJ), dtype=datatype)
+    A = np.fromfunction(lambda i, k: (i * (k + 1) % NK) / NK, (NI, NK), dtype=datatype)
+    B = np.fromfunction(lambda k, j: (k * (j + 2) % NJ) / NJ, (NK, NJ), dtype=datatype)
+    return alpha, beta, C, A, B
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemm/gemm_numpy.py
+def _kernel(alpha, beta, C, A, B):
+    C[:] = alpha * A @ B + beta * C
+
+
+_PRESETS = {"S": {"NI": 1000, "NJ": 1100, "NK": 1200},
+            "L": {"NI": 7000, "NJ": 7500, "NK": 8000}}
+
+
+class BenchGemm:
+    params = (["S", "L"],)
+    param_names = ["preset"]
+    # setup() is not rerun between calls of one sample; time one call each.
+    number = 1
+
+    def setup_cache(self):  # built once; ASV passes the result as `cache`
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        alpha, beta, C, A, B = cache[preset]
+        self.alpha = alpha
+        self.beta  = beta
+        self.C = C.copy()  # mutated in-place
+        self.A = A
+        self.B = B
+
+    def time_gemm(self, cache, preset):
+        _kernel(self.alpha, self.beta, self.C, self.A, self.B)
diff --git a/benchmarks/benchmarks/npbench/bench_gemver.py b/benchmarks/benchmarks/npbench/bench_gemver.py
new file mode 100644
index 00000000..a04726e9
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_gemver.py
@@ -0,0 +1,71 @@
+"""npbench wrapper: GEMVER (vector multiplication and matrix addition) — mkl_umath ops: outer.
+
+Preset sizes from npbench bench_info/gemver.json:
+  S: N=1_000
+  L: N=10_000
+
+The kernel mutates A, x, and w in-place, so setup() copies those from cache.
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemver/gemver.py
+def _initialize(N, datatype=np.float64):
+    alpha = datatype(1.5)
+    beta = datatype(1.2)
+    fn = datatype(N)
+    A  = np.fromfunction(lambda i, j: (i * j % N) / N, (N, N), dtype=datatype)
+    u1 = np.fromfunction(lambda i: i, (N,), dtype=datatype)
+    u2 = np.fromfunction(lambda i: ((i + 1) / fn) / 2.0, (N,), dtype=datatype)
+    v1 = np.fromfunction(lambda i: ((i + 1) / fn) / 4.0, (N,), dtype=datatype)
+    v2 = np.fromfunction(lambda i: ((i + 1) / fn) / 6.0, (N,), dtype=datatype)
+    w  = np.zeros((N,), dtype=datatype)
+    x  = np.zeros((N,), dtype=datatype)
+    y  = np.fromfunction(lambda i: ((i + 1) / fn) / 8.0, (N,), dtype=datatype)
+    z  = np.fromfunction(lambda i: ((i + 1) / fn) / 9.0, (N,), dtype=datatype)
+    return alpha, beta, A, u1, v1, u2, v2, w, x, y, z
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemver/gemver_numpy.py
+def _kernel(alpha, beta, A, u1, v1, u2, v2, w, x, y, z):
+    A += np.outer(u1, v1) + np.outer(u2, v2)
+    x += beta * y @ A + z
+    w += alpha * A @ x
+
+
+_PRESETS = {
+    "S": {"N": 1_000},
+    "L": {"N": 10_000},
+}
+
+
+class BenchGemver:
+    params = (["S", "L"],)
+    param_names = ["preset"]
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        alpha, beta, A, u1, v1, u2, v2, w, x, y, z = cache[preset]
+        self.alpha = alpha
+        self.beta  = beta
+        self.A  = A.copy()   # mutated: A += outer(u1,v1) + outer(u2,v2)
+        self.u1 = u1
+        self.v1 = v1
+        self.u2 = u2
+        self.v2 = v2
+        self.w  = w.copy()   # mutated: w += alpha * A @ x
+        self.x  = x.copy()   # mutated: x += beta * y @ A + z
+        self.y  = y
+        self.z  = z
+
+    def time_gemver(self, cache, preset):
+        _kernel(
+            self.alpha, self.beta,
+            self.A, self.u1, self.v1, self.u2, self.v2,
+            self.w, self.x, self.y, self.z,
+        )
diff --git a/benchmarks/benchmarks/npbench/bench_gesummv.py b/benchmarks/benchmarks/npbench/bench_gesummv.py
new file mode 100644
index 00000000..b2f54ea8
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_gesummv.py
@@ -0,0 +1,45 @@
+"""npbench wrapper: GESUMMV (scalar, vector and matrix multiplication) — mkl_umath ops: matmul.
+
+Preset sizes from npbench bench_info/gesummv.json:
+  S: N=2_000
+  L: N=14_000
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gesummv/gesummv.py
+def _initialize(N, datatype=np.float64):
+    alpha = datatype(1.5)
+    beta = datatype(1.2)
+    A = np.fromfunction(lambda i, j: ((i * j + 1) % N) / N, (N, N), dtype=datatype)
+    B = np.fromfunction(lambda i, j: ((i * j + 2) % N) / N, (N, N), dtype=datatype)
+    x = np.fromfunction(lambda i: (i % N) / N, (N,), dtype=datatype)
+    return alpha, beta, A, B, x
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gesummv/gesummv_numpy.py
+def _kernel(alpha, beta, A, B, x):
+    return alpha * A @ x + beta * B @ x
+
+
+_PRESETS = {
+    "S": {"N": 2_000},
+    "L": {"N": 14_000},
+}
+
+
+class BenchGesummv:
+    params = (["S", "L"],)
+    param_names = ["preset"]
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        self.alpha, self.beta, self.A, self.B, self.x = cache[preset]
+
+    def time_gesummv(self, cache, preset):
+        _kernel(self.alpha, self.beta, self.A, self.B, self.x)
diff --git a/benchmarks/benchmarks/npbench/bench_go_fast.py b/benchmarks/benchmarks/npbench/bench_go_fast.py
new file mode 100644
index 00000000..83636bf9
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_go_fast.py
@@ -0,0 +1,69 @@
+"""npbench wrapper: GoFast — mkl_umath ops: tanh.
+
+Preset sizes from npbench bench_info/go_fast.json:
+  S: N=2_000
+  L: N=20_000
+
+Note: the npbench ``go_fast`` kernel iterates diagonals in a Python loop
+(time_go_fast_loop).  A vectorized variant (time_go_fast_vec) times only np.tanh
+on a copy of the full diagonal, for direct MKL VM throughput measurement.
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast.py
+def _initialize(N):
+    from numpy.random import default_rng
+    rng = default_rng(42)
+    a = rng.random((N, N))
+    return (a,)
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast_numpy.py
+def _go_fast(a):
+    trace = 0.0
+    for i in range(a.shape[0]):
+        trace += np.tanh(a[i, i])
+    return a + trace
+
+
+_PRESETS = {
+    "S": {"N": 2_000},
+    "L": {"N": 20_000},
+}
+
+
+class BenchGoFastLoop:
+    """Original npbench kernel — diagonal Python loop calling np.tanh per element."""
+
+    params = (["S", "L"],)
+    param_names = ["preset"]
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        (self.a,) = cache[preset]
+
+    def time_go_fast_loop(self, cache, preset):
+        _go_fast(self.a)
+
+
+class BenchGoFastVec:
+    """Vectorized variant — np.tanh on the full diagonal array at once."""
+
+    params = (["S", "L"],)
+    param_names = ["preset"]
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        (self.a,) = cache[preset]
+        self.diag = np.copy(np.diag(self.a))
+
+    def time_go_fast_vec(self, cache, preset):
+        np.tanh(self.diag)
diff --git a/benchmarks/benchmarks/npbench/bench_k2mm.py b/benchmarks/benchmarks/npbench/bench_k2mm.py
new file mode 100644
index 00000000..11342d57
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_k2mm.py
@@ -0,0 +1,55 @@
+"""npbench wrapper: 2MM (two matrix multiplications) — mkl_umath ops: matmul.
+
+Preset sizes from npbench bench_info/k2mm.json:
+  S: NI=800,  NJ=850,  NK=900,  NL=950
+  L: NI=6000, NJ=6500, NK=7000, NL=7500
+
+The kernel mutates D in-place (D[:] = alpha * A @ B @ C + beta * D), so
+setup() copies D from cache before each timing round.
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k2mm/k2mm.py
+def _initialize(NI, NJ, NK, NL, datatype=np.float64):
+    alpha = datatype(1.5)
+    beta  = datatype(1.2)
+    A = np.fromfunction(lambda i, j: ((i * j + 1) % NI) / NI, (NI, NK), dtype=datatype)
+    B = np.fromfunction(lambda i, j: (i * (j + 1) % NJ) / NJ, (NK, NJ), dtype=datatype)
+    C = np.fromfunction(lambda i, j: ((i * (j + 3) + 1) % NL) / NL, (NJ, NL), dtype=datatype)
+    D = np.fromfunction(lambda i, j: (i * (j + 2) % NK) / NK, (NI, NL), dtype=datatype)
+    return alpha, beta, A, B, C, D
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k2mm/k2mm_numpy.py
+def _kernel(alpha, beta, A, B, C, D):
+    D[:] = alpha * A @ B @ C + beta * D
+
+
+_PRESETS = {
+    "S": {"NI": 800,  "NJ": 850,  "NK": 900,  "NL": 950},
+    "L": {"NI": 6000, "NJ": 6500, "NK": 7000, "NL": 7500},
+}
+
+
+class BenchK2mm:
+    params = (["S", "L"],)
+    param_names = ["preset"]
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        alpha, beta, A, B, C, D = cache[preset]
+        self.alpha = alpha
+        self.beta  = beta
+        self.A = A
+        self.B = B
+        self.C = C
+        self.D = D.copy()  # mutated in-place
+
+    def time_k2mm(self, cache, preset):
+        _kernel(self.alpha, self.beta, self.A, self.B, self.C, self.D)
diff --git a/benchmarks/benchmarks/npbench/bench_k3mm.py b/benchmarks/benchmarks/npbench/bench_k3mm.py
new file mode 100644
index 00000000..86f9efe4
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_k3mm.py
@@ -0,0 +1,44 @@
+"""npbench wrapper: 3MM (three matrix multiplications) — mkl_umath ops: matmul.
+
+Preset sizes from npbench bench_info/k3mm.json:
+  S: NI=800,  NJ=850,  NK=900,  NL=950,  NM=1000
+  L: NI=5500, NJ=6000, NK=6500, NL=7000, NM=7500
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k3mm/k3mm.py
+def _initialize(NI, NJ, NK, NL, NM, datatype=np.float64):
+    A = np.fromfunction(lambda i, j: ((i * j + 1) % NI) / (5 * NI), (NI, NK), dtype=datatype)
+    B = np.fromfunction(lambda i, j: ((i * (j + 1) + 2) % NJ) / (5 * NJ), (NK, NJ), dtype=datatype)
+    C = np.fromfunction(lambda i, j: (i * (j + 3) % NL) / (5 * NL), (NJ, NM), dtype=datatype)
+    D = np.fromfunction(lambda i, j: ((i * (j + 2) + 2) % NK) / (5 * NK), (NM, NL), dtype=datatype)
+    return A, B, C, D
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k3mm/k3mm_numpy.py
+def _kernel(A, B, C, D):
+    return A @ B @ C @ D
+
+
+_PRESETS = {
+    "S": {"NI": 800,  "NJ": 850,  "NK": 900,  "NL": 950,  "NM": 1000},
+    "L": {"NI": 5500, "NJ": 6000, "NK": 6500, "NL": 7000, "NM": 7500},
+}
+
+
+class BenchK3mm:
+    params = (["S", "L"],)
+    param_names = ["preset"]
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        self.A, self.B, self.C, self.D = cache[preset]
+
+    def time_k3mm(self, cache, preset):
+        _kernel(self.A, self.B, self.C, self.D)
diff --git a/benchmarks/benchmarks/npbench/bench_mandelbrot.py b/benchmarks/benchmarks/npbench/bench_mandelbrot.py
new file mode 100644
index 00000000..6284bcf3
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_mandelbrot.py
@@ -0,0 +1,93 @@
+"""npbench wrapper: Mandelbrot set (two variants) — mkl_umath ops: abs, multiply, add.
+
+Preset sizes from npbench bench_info/mandelbrot1.json and mandelbrot2.json:
+  S: XN=YN=125/200, maxiter=60/40
+  L: XN=YN=833/1000, maxiter=200/100
+
+mandelbrot1 (slow): uses np.less mask + index-based update loop.
+mandelbrot2 (fast): uses dynamic array compaction; more cache-friendly.
+
+Both kernels operate on complex128 arrays.  The dominant mkl_umath op is
+np.abs() on complex arrays at each iteration step.
+"""
+
+import numpy as np
+
+
+# --- mandelbrot1 ---
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/mandelbrot1/mandelbrot1_numpy.py
+def _mandelbrot1(xmin, xmax, ymin, ymax, xn, yn, maxiter, horizon=2.0):
+    X = np.linspace(xmin, xmax, xn, dtype=np.float64)
+    Y = np.linspace(ymin, ymax, yn, dtype=np.float64)
+    C = X + Y[:, None] * 1j
+    N = np.zeros(C.shape, dtype=np.int64)
+    Z = np.zeros(C.shape, dtype=np.complex128)
+    for n in range(maxiter):
+        I = np.less(abs(Z), horizon)
+        N[I] = n
+        Z[I] = Z[I] ** 2 + C[I]
+    N[N == maxiter - 1] = 0
+    return Z, N
+
+
+# --- mandelbrot2 ---
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/mandelbrot2/mandelbrot2_numpy.py
+def _mandelbrot2(xmin, xmax, ymin, ymax, xn, yn, itermax, horizon=2.0):
+    Xi, Yi = np.mgrid[0:xn, 0:yn]
+    X = np.linspace(xmin, xmax, xn, dtype=np.float64)[Xi]
+    Y = np.linspace(ymin, ymax, yn, dtype=np.float64)[Yi]
+    C = X + Y * 1j
+    N_ = np.zeros(C.shape, dtype=np.int64)
+    Z_ = np.zeros(C.shape, dtype=np.complex128)
+    Xi.shape = Yi.shape = C.shape = xn * yn
+
+    Z = np.zeros(C.shape, np.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+        np.multiply(Z, Z, Z)
+        np.add(Z, C, Z)
+        rem = np.abs(Z) > horizon
+        Z_[Xi[rem], Yi[rem]] = Z[rem]
+        N_[Xi[rem], Yi[rem]] = i + 1
+        ind = ~rem
+        Z = Z[ind]
+        C = C[ind]
+        Xi = Xi[ind]
+        Yi = Yi[ind]
+    return Z_, N_
+
+
+_PRESETS_M1 = {
+    "S": {"xmin": -1.75, "xmax": 0.25, "ymin": -1.00, "ymax": 1.00,
+          "xn": 125, "yn": 125, "maxiter": 60,  "horizon": 2.0},
+    "L": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25,
+          "xn": 833, "yn": 833, "maxiter": 200, "horizon": 2.0},
+}
+
+_PRESETS_M2 = {
+    "S": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25,
+          "xn": 200, "yn": 200, "itermax": 40,  "horizon": 2.0},
+    "L": {"xmin": -2.25, "xmax": 0.75, "ymin": -1.50, "ymax": 1.50,
+          "xn": 1000, "yn": 1000, "itermax": 100, "horizon": 2.0},
+}
+
+
+class BenchMandelbrot1:
+    params = (["S", "L"],)
+    param_names = ["preset"]
+
+    def time_mandelbrot1(self, preset):
+        _mandelbrot1(**_PRESETS_M1[preset])
+
+
+class BenchMandelbrot2:
+    params = (["S", "L"],)
+    param_names = ["preset"]
+
+    def time_mandelbrot2(self, preset):
+        _mandelbrot2(**_PRESETS_M2[preset])
diff --git a/benchmarks/benchmarks/npbench/bench_softmax.py b/benchmarks/benchmarks/npbench/bench_softmax.py
new file mode 100644
index 00000000..cd976f14
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_softmax.py
@@ -0,0 +1,48 @@
+"""npbench wrapper: Softmax — mkl_umath ops: exp, max, sum.
+
+Preset sizes from npbench bench_info/softmax.json:
+  S: N=16,  H=16,  SM=128   (float32)
+  L: N=64,  H=16,  SM=448   (float32)
+
+npbench initializes this benchmark with float32 explicitly.
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/deep_learning/softmax/softmax.py
+def _initialize(N, H, SM):
+    from numpy.random import default_rng
+    rng = default_rng(42)
+    x = rng.random((N, H, SM, SM), dtype=np.float32)
+    return (x,)
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/deep_learning/softmax/softmax_numpy.py
+def _softmax(x):
+    tmp_max = np.max(x, axis=-1, keepdims=True)
+    tmp_out = np.exp(x - tmp_max)
+    tmp_sum = np.sum(tmp_out, axis=-1, keepdims=True)
+    return tmp_out / tmp_sum
+
+
+_PRESETS = {
+    "S": {"N": 16,  "H": 16, "SM": 128},
+    "L": {"N": 64,  "H": 16, "SM": 448},
+}
+
+
+class BenchSoftmax:
+    params = (["S", "L"],)
+    param_names = ["preset"]
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        (self.x,) = cache[preset]
+
+    def time_softmax(self, cache, preset):
+        _softmax(self.x)
diff --git a/benchmarks/bootstrap-dashboard-branch.sh b/benchmarks/bootstrap-dashboard-branch.sh
new file mode 100644
index 00000000..f8fd7cf4
--- /dev/null
+++ b/benchmarks/bootstrap-dashboard-branch.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# bootstrap-dashboard-branch.sh
+#
+# One-time setup: creates the mkl-umath-results branch that ASV uses to
+# store benchmark results.  Run this once against the first commit you want
+# to anchor results to.
+#
+# Usage:
+#   SEED_SHA=<first-benchmarked-git-sha>  bash bootstrap-dashboard-branch.sh
+#
+# The script must be run from inside benchmarks/ (where asv.conf.json lives).
+# The conda env with asv installed must already be active.
+
+set -euo pipefail
+
+RESULTS_BRANCH="mkl-umath-results"
+SEED_SHA="${SEED_SHA:?ERROR: set SEED_SHA=<commit-sha> before running this script}"
+
+echo "[bootstrap] Seeding results branch: ${RESULTS_BRANCH}"
+echo "[bootstrap] Anchored to commit:     ${SEED_SHA}"
+
+# Run a single quick pass to generate the first results JSON
+asv run \
+    --python=same \
+    --quick \
+    --show-stderr \
+    --set-commit-hash "${SEED_SHA}" \
+    HEAD
+
+# Publish results to HTML (creates .asv/html/)
+asv publish
+
+# Push results to the dedicated branch
+asv gh-pages \
+    --rewrite \
+    --no-push \
+    --html-dir .asv/html
+
+echo "[bootstrap] Done.  Push .asv/results to ${RESULTS_BRANCH} manually or"
+echo "            configure asv gh-pages --push to automate."

From 428e440af44c30134f21d299554c81a4cf1141ae Mon Sep 17 00:00:00 2001
From: vchamarthi <venkata.sai.gireesh.chamarthi@intel.com>
Date: Mon, 11 May 2026 10:58:35 -0500
Subject: [PATCH 2/7] benchmarks: replace S presets with M, pin number/repeat, extend asv config, drop bootstrap script

---
 benchmarks/asv.conf.json                      | 11 ++++-
 .../benchmarks/npbench/bench_arc_distance.py  |  8 ++--
 .../benchmarks/npbench/bench_cholesky2.py     |  8 ++--
 .../benchmarks/npbench/bench_correlation.py   |  8 ++--
 .../benchmarks/npbench/bench_covariance.py    |  8 ++--
 .../benchmarks/npbench/bench_deriche.py       |  8 ++--
 .../benchmarks/npbench/bench_doitgen.py       |  8 ++--
 benchmarks/benchmarks/npbench/bench_gemm.py   |  8 ++--
 benchmarks/benchmarks/npbench/bench_gemver.py |  8 ++--
 .../benchmarks/npbench/bench_gesummv.py       |  8 ++--
 .../benchmarks/npbench/bench_go_fast.py       | 12 ++++--
 benchmarks/benchmarks/npbench/bench_k2mm.py   |  8 ++--
 benchmarks/benchmarks/npbench/bench_k3mm.py   |  8 ++--
 .../benchmarks/npbench/bench_mandelbrot.py    | 18 +++++----
 .../benchmarks/npbench/bench_softmax.py       | 10 +++--
 benchmarks/bootstrap-dashboard-branch.sh      | 40 -------------------
 16 files changed, 89 insertions(+), 90 deletions(-)
 delete mode 100644 benchmarks/bootstrap-dashboard-branch.sh

diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
index facb1284..78482758 100644
--- a/benchmarks/asv.conf.json
+++ b/benchmarks/asv.conf.json
@@ -3,11 +3,18 @@
     "project": "mkl_umath",
     "project_url": "https://github.com/IntelPython/mkl_umath",
     "repo": "..",
-    "branches": ["main"],
+    "branches": [
+        "main"
+    ],
     "environment_type": "existing",
     "benchmark_dir": "benchmarks",
     "env_dir": ".asv/env",
     "results_dir": ".asv/results",
     "html_dir": ".asv/html",
-    "show_commit_url": "https://github.com/IntelPython/mkl_umath/commit/"
+    "show_commit_url": "https://github.com/IntelPython/mkl_umath/commit/",
+    "build_cache_size": 2,
+    "default_benchmark_timeout": 1500,
+    "regressions_thresholds": {
+        ".*": 0.2
+    }
 }
diff --git a/benchmarks/benchmarks/npbench/bench_arc_distance.py b/benchmarks/benchmarks/npbench/bench_arc_distance.py
index d8039649..c17f5775 100644
--- a/benchmarks/benchmarks/npbench/bench_arc_distance.py
+++ b/benchmarks/benchmarks/npbench/bench_arc_distance.py
@@ -1,7 +1,7 @@
 """npbench wrapper: Arc Distance — mkl_umath ops: sin, cos, arctan2, sqrt.
 
 Preset sizes from npbench bench_info/arc_distance.json:
-  S: N=100_000
+  M: N=1_000_000
   L: N=10_000_000
 """
 
@@ -31,14 +31,16 @@ def _arc_distance(theta_1, phi_1, theta_2, phi_2):
 
 
 _PRESETS = {
-    "S": {"N": 100_000},
+    "M": {"N": 1_000_000},
     "L": {"N": 10_000_000},
 }
 
 
 class BenchArcDistance:
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
diff --git a/benchmarks/benchmarks/npbench/bench_cholesky2.py b/benchmarks/benchmarks/npbench/bench_cholesky2.py
index ea095122..347aaca8 100644
--- a/benchmarks/benchmarks/npbench/bench_cholesky2.py
+++ b/benchmarks/benchmarks/npbench/bench_cholesky2.py
@@ -1,7 +1,7 @@
 """npbench wrapper: Cholesky decomposition v2 — mkl_umath ops: linalg.cholesky.
 
 Preset sizes from npbench bench_info/cholesky2.json:
-  S: N=1000
+  M: N=2200
   L: N=8000
 
 The kernel mutates A in-place (A[:] = cholesky(A) + triu(A, k=1)), so
@@ -35,14 +35,16 @@ def _kernel(A):
 
 
 _PRESETS = {
-    "S": {"N": 1000},
+    "M": {"N": 2200},
     "L": {"N": 8000},
 }
 
 
 class BenchCholesky2:
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
diff --git a/benchmarks/benchmarks/npbench/bench_correlation.py b/benchmarks/benchmarks/npbench/bench_correlation.py
index c5c7471d..194fcc04 100644
--- a/benchmarks/benchmarks/npbench/bench_correlation.py
+++ b/benchmarks/benchmarks/npbench/bench_correlation.py
@@ -1,7 +1,7 @@
 """npbench wrapper: Correlation — mkl_umath ops: sqrt, std, mean.
 
 Preset sizes from npbench bench_info/correlation.json:
-  S: M=500,  N=600
+  M: M=1400, N=1800
   L: M=3200, N=4000
 
 The kernel mutates ``data`` in-place (data -= mean; data /= ...), so
@@ -34,14 +34,16 @@ def _kernel(M, float_n, data):
 
 
 _PRESETS = {
-    "S": {"M": 500,  "N": 600},
+    "M": {"M": 1400, "N": 1800},
     "L": {"M": 3200, "N": 4000},
 }
 
 
 class BenchCorrelation:
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
diff --git a/benchmarks/benchmarks/npbench/bench_covariance.py b/benchmarks/benchmarks/npbench/bench_covariance.py
index bc541c5b..b85b3191 100644
--- a/benchmarks/benchmarks/npbench/bench_covariance.py
+++ b/benchmarks/benchmarks/npbench/bench_covariance.py
@@ -1,7 +1,7 @@
 """npbench wrapper: Covariance — mkl_umath ops: mean.
 
 Preset sizes from npbench bench_info/covariance.json:
-  S: M=500,  N=600
+  M: M=1400, N=1800
   L: M=3200, N=4000
 
 The kernel mutates ``data`` in-place (data -= mean), so setup() copies
@@ -31,14 +31,16 @@ def _kernel(M, float_n, data):
 
 
 _PRESETS = {
-    "S": {"M": 500,  "N": 600},
+    "M": {"M": 1400, "N": 1800},
     "L": {"M": 3200, "N": 4000},
 }
 
 
 class BenchCovariance:
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
diff --git a/benchmarks/benchmarks/npbench/bench_deriche.py b/benchmarks/benchmarks/npbench/bench_deriche.py
index 4cb93f1e..4539053d 100644
--- a/benchmarks/benchmarks/npbench/bench_deriche.py
+++ b/benchmarks/benchmarks/npbench/bench_deriche.py
@@ -1,7 +1,7 @@
 """npbench wrapper: Deriche Edge Detector — mkl_umath ops: exp.
 
 Preset sizes from npbench bench_info/deriche.json:
-  S: W=400,  H=200
+  M: W=1500, H=1000
   L: W=6000, H=3000
 
 Warning: this kernel contains Python for-loops over rows/columns.
@@ -89,7 +89,7 @@ def _kernel(alpha, imgIn):
 
 
 _PRESETS = {
-    "S": {"W": 400,  "H": 200},
+    "M": {"W": 1500, "H": 1000},
     "L": {"W": 6000, "H": 3000},
 }
 
@@ -98,8 +98,10 @@ class BenchDeriche:
     # L preset has Python loops over 6000 rows — allow extra time
     timeout = 600
 
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
diff --git a/benchmarks/benchmarks/npbench/bench_doitgen.py b/benchmarks/benchmarks/npbench/bench_doitgen.py
index 86467424..eb255bae 100644
--- a/benchmarks/benchmarks/npbench/bench_doitgen.py
+++ b/benchmarks/benchmarks/npbench/bench_doitgen.py
@@ -1,7 +1,7 @@
 """npbench wrapper: Doitgen (multiresolution analysis) — mkl_umath ops: matmul.
 
 Preset sizes from npbench bench_info/doitgen.json:
-  S: NR=60,  NQ=60,  NP=128
+  M: NR=110, NQ=125, NP=256
   L: NR=220, NQ=250, NP=512
 
 The kernel mutates ``A`` in-place (A[:] = ...), so setup() copies from cache.
@@ -29,14 +29,16 @@ def _kernel(NR, NQ, NP, A, C4):
 
 
 _PRESETS = {
-    "S": {"NR": 60,  "NQ": 60,  "NP": 128},
+    "M": {"NR": 110, "NQ": 125, "NP": 256},
     "L": {"NR": 220, "NQ": 250, "NP": 512},
 }
 
 
 class BenchDoitgen:
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
diff --git a/benchmarks/benchmarks/npbench/bench_gemm.py b/benchmarks/benchmarks/npbench/bench_gemm.py
index 15b29ed6..f7d43f7d 100644
--- a/benchmarks/benchmarks/npbench/bench_gemm.py
+++ b/benchmarks/benchmarks/npbench/bench_gemm.py
@@ -1,7 +1,7 @@
 """npbench wrapper: GEMM (general matrix-matrix multiply) — mkl_umath ops: matmul.
 
 Preset sizes from npbench bench_info/gemm.json:
-  S: NI=1000, NJ=1100, NK=1200
+  M: NI=2500, NJ=2750, NK=3000
   L: NI=7000, NJ=7500, NK=8000
 
 The kernel mutates C in-place (C[:] = alpha * A @ B + beta * C), so
@@ -29,14 +29,16 @@ def _kernel(alpha, beta, C, A, B):
 
 
 _PRESETS = {
-    "S": {"NI": 1000, "NJ": 1100, "NK": 1200},
+    "M": {"NI": 2500, "NJ": 2750, "NK": 3000},
     "L": {"NI": 7000, "NJ": 7500, "NK": 8000},
 }
 
 
 class BenchGemm:
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
diff --git a/benchmarks/benchmarks/npbench/bench_gemver.py b/benchmarks/benchmarks/npbench/bench_gemver.py
index a04726e9..d0ea44a6 100644
--- a/benchmarks/benchmarks/npbench/bench_gemver.py
+++ b/benchmarks/benchmarks/npbench/bench_gemver.py
@@ -1,7 +1,7 @@
 """npbench wrapper: GEMVER (vector multiplication and matrix addition) — mkl_umath ops: outer.
 
 Preset sizes from npbench bench_info/gemver.json:
-  S: N=1_000
+  M: N=3_000
   L: N=10_000
 
 The kernel mutates A, x, and w in-place, so setup() copies those from cache.
@@ -37,14 +37,16 @@ def _kernel(alpha, beta, A, u1, v1, u2, v2, w, x, y, z):
 
 
 _PRESETS = {
-    "S": {"N": 1_000},
+    "M": {"N": 3_000},
     "L": {"N": 10_000},
 }
 
 
 class BenchGemver:
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
diff --git a/benchmarks/benchmarks/npbench/bench_gesummv.py b/benchmarks/benchmarks/npbench/bench_gesummv.py
index b2f54ea8..13bb773e 100644
--- a/benchmarks/benchmarks/npbench/bench_gesummv.py
+++ b/benchmarks/benchmarks/npbench/bench_gesummv.py
@@ -1,7 +1,7 @@
 """npbench wrapper: GESUMMV (scalar, vector and matrix multiplication) — mkl_umath ops: matmul.
 
 Preset sizes from npbench bench_info/gesummv.json:
-  S: N=2_000
+  M: N=4_000
   L: N=14_000
 """
 
@@ -26,14 +26,16 @@ def _kernel(alpha, beta, A, B, x):
 
 
 _PRESETS = {
-    "S": {"N": 2_000},
+    "M": {"N": 4_000},
     "L": {"N": 14_000},
 }
 
 
 class BenchGesummv:
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
diff --git a/benchmarks/benchmarks/npbench/bench_go_fast.py b/benchmarks/benchmarks/npbench/bench_go_fast.py
index 83636bf9..d197c540 100644
--- a/benchmarks/benchmarks/npbench/bench_go_fast.py
+++ b/benchmarks/benchmarks/npbench/bench_go_fast.py
@@ -1,7 +1,7 @@
 """npbench wrapper: GoFast — mkl_umath ops: tanh.
 
 Preset sizes from npbench bench_info/go_fast.json:
-  S: N=2_000
+  M: N=6_000
   L: N=20_000
 
 Note: the npbench ``go_fast`` kernel iterates diagonals in a Python loop
@@ -31,7 +31,7 @@ def _go_fast(a):
 
 
 _PRESETS = {
-    "S": {"N": 2_000},
+    "M": {"N": 6_000},
     "L": {"N": 20_000},
 }
 
@@ -39,8 +39,10 @@ def _go_fast(a):
 class BenchGoFastLoop:
     """Original npbench kernel — diagonal Python loop calling np.tanh per element."""
 
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
@@ -55,8 +57,10 @@ def time_go_fast_loop(self, cache, preset):
 class BenchGoFastVec:
     """Vectorized variant — np.tanh on the full diagonal array at once."""
 
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
diff --git a/benchmarks/benchmarks/npbench/bench_k2mm.py b/benchmarks/benchmarks/npbench/bench_k2mm.py
index 11342d57..6ff8cf6f 100644
--- a/benchmarks/benchmarks/npbench/bench_k2mm.py
+++ b/benchmarks/benchmarks/npbench/bench_k2mm.py
@@ -1,7 +1,7 @@
 """npbench wrapper: 2MM (two matrix multiplications) — mkl_umath ops: matmul.
 
 Preset sizes from npbench bench_info/k2mm.json:
-  S: NI=800,  NJ=850,  NK=900,  NL=950
+  M: NI=2000, NJ=2250, NK=2500, NL=2750
   L: NI=6000, NJ=6500, NK=7000, NL=7500
 
 The kernel mutates D in-place (D[:] = alpha * A @ B @ C + beta * D), so
@@ -30,14 +30,16 @@ def _kernel(alpha, beta, A, B, C, D):
 
 
 _PRESETS = {
-    "S": {"NI": 800,  "NJ": 850,  "NK": 900,  "NL": 950},
+    "M": {"NI": 2000, "NJ": 2250, "NK": 2500, "NL": 2750},
     "L": {"NI": 6000, "NJ": 6500, "NK": 7000, "NL": 7500},
 }
 
 
 class BenchK2mm:
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
diff --git a/benchmarks/benchmarks/npbench/bench_k3mm.py b/benchmarks/benchmarks/npbench/bench_k3mm.py
index 86f9efe4..15e73d60 100644
--- a/benchmarks/benchmarks/npbench/bench_k3mm.py
+++ b/benchmarks/benchmarks/npbench/bench_k3mm.py
@@ -1,7 +1,7 @@
 """npbench wrapper: 3MM (three matrix multiplications) — mkl_umath ops: matmul.
 
 Preset sizes from npbench bench_info/k3mm.json:
-  S: NI=800,  NJ=850,  NK=900,  NL=950,  NM=1000
+  M: NI=2000, NJ=2200, NK=2400, NL=2600, NM=2800
   L: NI=5500, NJ=6000, NK=6500, NL=7000, NM=7500
 """
 
@@ -25,14 +25,16 @@ def _kernel(A, B, C, D):
 
 
 _PRESETS = {
-    "S": {"NI": 800,  "NJ": 850,  "NK": 900,  "NL": 950,  "NM": 1000},
+    "M": {"NI": 2000, "NJ": 2200, "NK": 2400, "NL": 2600, "NM": 2800},
     "L": {"NI": 5500, "NJ": 6000, "NK": 6500, "NL": 7000, "NM": 7500},
 }
 
 
 class BenchK3mm:
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
diff --git a/benchmarks/benchmarks/npbench/bench_mandelbrot.py b/benchmarks/benchmarks/npbench/bench_mandelbrot.py
index 6284bcf3..090fcd0e 100644
--- a/benchmarks/benchmarks/npbench/bench_mandelbrot.py
+++ b/benchmarks/benchmarks/npbench/bench_mandelbrot.py
@@ -1,7 +1,7 @@
 """npbench wrapper: Mandelbrot set (two variants) — mkl_umath ops: abs, multiply, add.
 
 Preset sizes from npbench bench_info/mandelbrot1.json and mandelbrot2.json:
-  S: XN=YN=125/200, maxiter=60/40
+  M: XN=YN=250/500, maxiter=150/80
   L: XN=YN=833/1000, maxiter=200/100
 
 mandelbrot1 (slow): uses np.less mask + index-based update loop.
@@ -63,31 +63,35 @@ def _mandelbrot2(xmin, xmax, ymin, ymax, xn, yn, itermax, horizon=2.0):
 
 
 _PRESETS_M1 = {
-    "S": {"xmin": -1.75, "xmax": 0.25, "ymin": -1.00, "ymax": 1.00,
-          "xn": 125, "yn": 125, "maxiter": 60,  "horizon": 2.0},
+    "M": {"xmin": -1.75, "xmax": 0.25, "ymin": -1.00, "ymax": 1.00,
+          "xn": 250, "yn": 250, "maxiter": 150, "horizon": 2.0},
     "L": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25,
           "xn": 833, "yn": 833, "maxiter": 200, "horizon": 2.0},
 }
 
 _PRESETS_M2 = {
-    "S": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25,
-          "xn": 200, "yn": 200, "itermax": 40,  "horizon": 2.0},
+    "M": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25,
+          "xn": 500, "yn": 500, "itermax": 80,  "horizon": 2.0},
     "L": {"xmin": -2.25, "xmax": 0.75, "ymin": -1.50, "ymax": 1.50,
           "xn": 1000, "yn": 1000, "itermax": 100, "horizon": 2.0},
 }
 
 
 class BenchMandelbrot1:
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def time_mandelbrot1(self, preset):
         _mandelbrot1(**_PRESETS_M1[preset])
 
 
 class BenchMandelbrot2:
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def time_mandelbrot2(self, preset):
         _mandelbrot2(**_PRESETS_M2[preset])
diff --git a/benchmarks/benchmarks/npbench/bench_softmax.py b/benchmarks/benchmarks/npbench/bench_softmax.py
index cd976f14..29a77252 100644
--- a/benchmarks/benchmarks/npbench/bench_softmax.py
+++ b/benchmarks/benchmarks/npbench/bench_softmax.py
@@ -1,7 +1,7 @@
 """npbench wrapper: Softmax — mkl_umath ops: exp, max, sum.
 
 Preset sizes from npbench bench_info/softmax.json:
-  S: N=16,  H=16,  SM=128   (float32)
+  M: N=32,  H=8,   SM=256   (float32)
   L: N=64,  H=16,  SM=448   (float32)
 
 npbench initializes this benchmark with float32 explicitly.
@@ -29,14 +29,16 @@ def _softmax(x):
 
 
 _PRESETS = {
-    "S": {"N": 16,  "H": 16, "SM": 128},
-    "L": {"N": 64,  "H": 16, "SM": 448},
+    "M": {"N": 32, "H": 8,  "SM": 256},
+    "L": {"N": 64, "H": 16, "SM": 448},
 }
 
 
 class BenchSoftmax:
-    params = (["S", "L"],)
+    params = (["M", "L"],)
     param_names = ["preset"]
+    number = 1
+    repeat = 20
 
     def setup_cache(self):
         return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
diff --git a/benchmarks/bootstrap-dashboard-branch.sh b/benchmarks/bootstrap-dashboard-branch.sh
deleted file mode 100644
index f8fd7cf4..00000000
--- a/benchmarks/bootstrap-dashboard-branch.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env bash
-# bootstrap-dashboard-branch.sh
-#
-# One-time setup: creates the mkl-umath-results branch that ASV uses to
-# store benchmark results.  Run this once against the first commit you want
-# to anchor results to.
-#
-# Usage:
-#   SEED_SHA=<first-benchmarked-git-sha>  bash bootstrap-dashboard-branch.sh
-#
-# The script must be run from inside benchmarks/ (where asv.conf.json lives).
-# The conda env with asv installed must already be active.
-
-set -euo pipefail
-
-RESULTS_BRANCH="mkl-umath-results"
-SEED_SHA="${SEED_SHA:?ERROR: set SEED_SHA=<commit-sha> before running this script}"
-
-echo "[bootstrap] Seeding results branch: ${RESULTS_BRANCH}"
-echo "[bootstrap] Anchored to commit:     ${SEED_SHA}"
-
-# Run a single quick pass to generate the first results JSON
-asv run \
-    --python=same \
-    --quick \
-    --show-stderr \
-    --set-commit-hash "${SEED_SHA}" \
-    HEAD
-
-# Publish results to HTML (creates .asv/html/)
-asv publish
-
-# Push results to the dedicated branch
-asv gh-pages \
-    --rewrite \
-    --no-push \
-    --html-dir .asv/html
-
-echo "[bootstrap] Done.  Push .asv/results to ${RESULTS_BRANCH} manually or"
-echo "            configure asv gh-pages --push to automate."

From b27dc03db2dee2ebb5575b7ded4366151dead962 Mon Sep 17 00:00:00 2001
From: vchamarthi <venkata.sai.gireesh.chamarthi@intel.com>
Date: Mon, 18 May 2026 08:43:28 -0500
Subject: [PATCH 3/7] Apply pre-commit fixes: reformat, wrap long lines, rename I to mask

---
 benchmarks/benchmarks/_patch_setup.py         | 26 +++++---
 .../benchmarks/npbench/bench_arc_distance.py  |  1 +
 .../benchmarks/npbench/bench_cholesky2.py     |  4 +-
 .../benchmarks/npbench/bench_correlation.py   |  4 +-
 benchmarks/benchmarks/npbench/bench_gemm.py   | 20 ++++--
 benchmarks/benchmarks/npbench/bench_gemver.py | 40 +++++++-----
 .../benchmarks/npbench/bench_gesummv.py       | 12 +++-
 .../benchmarks/npbench/bench_go_fast.py       |  3 +-
 benchmarks/benchmarks/npbench/bench_k2mm.py   | 20 ++++--
 benchmarks/benchmarks/npbench/bench_k3mm.py   | 20 ++++--
 .../benchmarks/npbench/bench_mandelbrot.py    | 61 +++++++++++++++----
 .../benchmarks/npbench/bench_softmax.py       |  3 +-
 12 files changed, 152 insertions(+), 62 deletions(-)

diff --git a/benchmarks/benchmarks/_patch_setup.py b/benchmarks/benchmarks/_patch_setup.py
index f7435c61..9aea6062 100644
--- a/benchmarks/benchmarks/_patch_setup.py
+++ b/benchmarks/benchmarks/_patch_setup.py
@@ -10,9 +10,9 @@
 import sys
 
 _PATCH_MAP = [
-    ("mkl_fft",    "patch_numpy_fft"),
+    ("mkl_fft", "patch_numpy_fft"),
     ("mkl_random", "patch_numpy_random"),
-    ("mkl_umath",  "patch_numpy_umath"),
+    ("mkl_umath", "patch_numpy_umath"),
 ]
 
 
@@ -25,15 +25,18 @@ def _apply_patches():
         except ImportError as exc:
             raise RuntimeError(
                 f"[mkl-patch] Cannot import {mod_name}: {exc}\n"
-                f"  Ensure the conda env contains {mod_name} from the Intel channel.\n"
-                f"  Required channels: https://software.repos.intel.com/python/conda"
+                f"  Ensure the conda env contains {mod_name} "
+                f"from the Intel channel.\n"
+                "  Required channels: "
+                "https://software.repos.intel.com/python/conda"
             ) from exc
 
         patch_fn = getattr(mod, patch_fn_name, None)
         if patch_fn is None:
             raise RuntimeError(
                 f"[mkl-patch] {mod_name} has no {patch_fn_name}(). "
-                f"Upgrade {mod_name} to a version that exposes the stock-numpy patch API."
+                f"Upgrade {mod_name} to a version that exposes "
+                "the stock-numpy patch API."
             )
 
         try:
@@ -46,8 +49,9 @@ def _apply_patches():
         is_patched_fn = getattr(mod, "is_patched", None)
         if callable(is_patched_fn) and not is_patched_fn():
             raise RuntimeError(
-                f"[mkl-patch] {mod_name}.is_patched() returned False after patching. "
-                f"NumPy may have been imported before patching in a conflicting state."
+                f"[mkl-patch] {mod_name}.is_patched() returned False "
+                "after patching. NumPy may have been imported before "
+                "patching in a conflicting state."
             )
 
         patched[mod_name] = mod
@@ -56,9 +60,9 @@ def _apply_patches():
     import numpy as np
 
     _attr_checks = {
-        "mkl_fft":    lambda: np.fft.fft.__module__,
+        "mkl_fft": lambda: np.fft.fft.__module__,
         "mkl_random": lambda: np.random.random.__module__,
-        "mkl_umath":  lambda: np.exp.__module__,
+        "mkl_umath": lambda: np.exp.__module__,
     }
     for mod_name in patched:
         try:
@@ -67,7 +71,9 @@ def _apply_patches():
             attr = "unknown"
         sys.stderr.write(f"[mkl-patch] {mod_name}: numpy dispatch → {attr}\n")
 
-    sys.stderr.write("[mkl-patch] ALL OK — mkl_fft, mkl_random, mkl_umath active\n")
+    sys.stderr.write(
+        "[mkl-patch] ALL OK — mkl_fft, mkl_random, mkl_umath active\n"
+    )
     sys.stderr.flush()
 
 
diff --git a/benchmarks/benchmarks/npbench/bench_arc_distance.py b/benchmarks/benchmarks/npbench/bench_arc_distance.py
index c17f5775..a57a3fa2 100644
--- a/benchmarks/benchmarks/npbench/bench_arc_distance.py
+++ b/benchmarks/benchmarks/npbench/bench_arc_distance.py
@@ -12,6 +12,7 @@
 # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance.py
 def _initialize(N):
     from numpy.random import default_rng
+
     rng = default_rng(42)
     t0 = rng.random((N,))
     p0 = rng.random((N,))
diff --git a/benchmarks/benchmarks/npbench/bench_cholesky2.py b/benchmarks/benchmarks/npbench/bench_cholesky2.py
index 347aaca8..ae19443f 100644
--- a/benchmarks/benchmarks/npbench/bench_cholesky2.py
+++ b/benchmarks/benchmarks/npbench/bench_cholesky2.py
@@ -19,10 +19,10 @@
 def _initialize(N, datatype=np.float64):
     A = np.empty((N, N), dtype=datatype)
     for i in range(N):
-        A[i, :i + 1] = np.fromfunction(
+        A[i, : i + 1] = np.fromfunction(
             lambda j: (-j % N) / N + 1, (i + 1,), dtype=datatype
         )
-        A[i, i + 1:] = 0.0
+        A[i, i + 1 :] = 0.0
         A[i, i] = 1.0
     A[:] = A @ np.transpose(A)
     return A
diff --git a/benchmarks/benchmarks/npbench/bench_correlation.py b/benchmarks/benchmarks/npbench/bench_correlation.py
index 194fcc04..ca941443 100644
--- a/benchmarks/benchmarks/npbench/bench_correlation.py
+++ b/benchmarks/benchmarks/npbench/bench_correlation.py
@@ -29,7 +29,9 @@ def _kernel(M, float_n, data):
     data /= np.sqrt(float_n) * stddev
     corr = np.eye(M, dtype=data.dtype)
     for i in range(M - 1):
-        corr[i + 1:M, i] = corr[i, i + 1:M] = data[:, i] @ data[:, i + 1:M]
+        corr[i + 1 : M, i] = corr[i, i + 1 : M] = (
+            data[:, i] @ data[:, i + 1 : M]
+        )
     return corr
 
 
diff --git a/benchmarks/benchmarks/npbench/bench_gemm.py b/benchmarks/benchmarks/npbench/bench_gemm.py
index f7d43f7d..c6a36fd5 100644
--- a/benchmarks/benchmarks/npbench/bench_gemm.py
+++ b/benchmarks/benchmarks/npbench/bench_gemm.py
@@ -1,4 +1,6 @@
-"""npbench wrapper: GEMM (general matrix-matrix multiply) — mkl_umath ops: matmul.
+"""npbench wrapper: GEMM (general matrix-matrix multiply).
+
+mkl_umath ops: matmul.
 
 Preset sizes from npbench bench_info/gemm.json:
   M: NI=2500, NJ=2750, NK=3000
@@ -15,10 +17,16 @@
 # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemm/gemm.py
 def _initialize(NI, NJ, NK, datatype=np.float64):
     alpha = datatype(1.5)
-    beta  = datatype(1.2)
-    C = np.fromfunction(lambda i, j: ((i * j + 1) % NI) / NI, (NI, NJ), dtype=datatype)
-    A = np.fromfunction(lambda i, k: (i * (k + 1) % NK) / NK, (NI, NK), dtype=datatype)
-    B = np.fromfunction(lambda k, j: (k * (j + 2) % NJ) / NJ, (NK, NJ), dtype=datatype)
+    beta = datatype(1.2)
+    C = np.fromfunction(
+        lambda i, j: ((i * j + 1) % NI) / NI, (NI, NJ), dtype=datatype
+    )
+    A = np.fromfunction(
+        lambda i, k: (i * (k + 1) % NK) / NK, (NI, NK), dtype=datatype
+    )
+    B = np.fromfunction(
+        lambda k, j: (k * (j + 2) % NJ) / NJ, (NK, NJ), dtype=datatype
+    )
     return alpha, beta, C, A, B
 
 
@@ -46,7 +54,7 @@ def setup_cache(self):
     def setup(self, cache, preset):
         alpha, beta, C, A, B = cache[preset]
         self.alpha = alpha
-        self.beta  = beta
+        self.beta = beta
         self.C = C.copy()  # mutated in-place
         self.A = A
         self.B = B
diff --git a/benchmarks/benchmarks/npbench/bench_gemver.py b/benchmarks/benchmarks/npbench/bench_gemver.py
index d0ea44a6..c85313ed 100644
--- a/benchmarks/benchmarks/npbench/bench_gemver.py
+++ b/benchmarks/benchmarks/npbench/bench_gemver.py
@@ -1,4 +1,6 @@
-"""npbench wrapper: GEMVER (vector multiplication and matrix addition) — mkl_umath ops: outer.
+"""npbench wrapper: GEMVER (vector multiplication and matrix addition).
+
+mkl_umath ops: outer.
 
 Preset sizes from npbench bench_info/gemver.json:
   M: N=3_000
@@ -16,15 +18,15 @@ def _initialize(N, datatype=np.float64):
     alpha = datatype(1.5)
     beta = datatype(1.2)
     fn = datatype(N)
-    A  = np.fromfunction(lambda i, j: (i * j % N) / N, (N, N), dtype=datatype)
+    A = np.fromfunction(lambda i, j: (i * j % N) / N, (N, N), dtype=datatype)
     u1 = np.fromfunction(lambda i: i, (N,), dtype=datatype)
     u2 = np.fromfunction(lambda i: ((i + 1) / fn) / 2.0, (N,), dtype=datatype)
     v1 = np.fromfunction(lambda i: ((i + 1) / fn) / 4.0, (N,), dtype=datatype)
     v2 = np.fromfunction(lambda i: ((i + 1) / fn) / 6.0, (N,), dtype=datatype)
-    w  = np.zeros((N,), dtype=datatype)
-    x  = np.zeros((N,), dtype=datatype)
-    y  = np.fromfunction(lambda i: ((i + 1) / fn) / 8.0, (N,), dtype=datatype)
-    z  = np.fromfunction(lambda i: ((i + 1) / fn) / 9.0, (N,), dtype=datatype)
+    w = np.zeros((N,), dtype=datatype)
+    x = np.zeros((N,), dtype=datatype)
+    y = np.fromfunction(lambda i: ((i + 1) / fn) / 8.0, (N,), dtype=datatype)
+    z = np.fromfunction(lambda i: ((i + 1) / fn) / 9.0, (N,), dtype=datatype)
     return alpha, beta, A, u1, v1, u2, v2, w, x, y, z
 
 
@@ -54,20 +56,28 @@ def setup_cache(self):
     def setup(self, cache, preset):
         alpha, beta, A, u1, v1, u2, v2, w, x, y, z = cache[preset]
         self.alpha = alpha
-        self.beta  = beta
-        self.A  = A.copy()   # mutated: A += outer(u1,v1) + outer(u2,v2)
+        self.beta = beta
+        self.A = A.copy()  # mutated: A += outer(u1,v1) + outer(u2,v2)
         self.u1 = u1
         self.v1 = v1
         self.u2 = u2
         self.v2 = v2
-        self.w  = w.copy()   # mutated: w += alpha * A @ x
-        self.x  = x.copy()   # mutated: x += beta * y @ A + z
-        self.y  = y
-        self.z  = z
+        self.w = w.copy()  # mutated: w += alpha * A @ x
+        self.x = x.copy()  # mutated: x += beta * y @ A + z
+        self.y = y
+        self.z = z
 
     def time_gemver(self, cache, preset):
         _kernel(
-            self.alpha, self.beta,
-            self.A, self.u1, self.v1, self.u2, self.v2,
-            self.w, self.x, self.y, self.z,
+            self.alpha,
+            self.beta,
+            self.A,
+            self.u1,
+            self.v1,
+            self.u2,
+            self.v2,
+            self.w,
+            self.x,
+            self.y,
+            self.z,
         )
diff --git a/benchmarks/benchmarks/npbench/bench_gesummv.py b/benchmarks/benchmarks/npbench/bench_gesummv.py
index 13bb773e..b3c02cd5 100644
--- a/benchmarks/benchmarks/npbench/bench_gesummv.py
+++ b/benchmarks/benchmarks/npbench/bench_gesummv.py
@@ -1,4 +1,6 @@
-"""npbench wrapper: GESUMMV (scalar, vector and matrix multiplication) — mkl_umath ops: matmul.
+"""npbench wrapper: GESUMMV (scalar, vector and matrix multiplication).
+
+mkl_umath ops: matmul.
 
 Preset sizes from npbench bench_info/gesummv.json:
   M: N=4_000
@@ -13,8 +15,12 @@
 def _initialize(N, datatype=np.float64):
     alpha = datatype(1.5)
     beta = datatype(1.2)
-    A = np.fromfunction(lambda i, j: ((i * j + 1) % N) / N, (N, N), dtype=datatype)
-    B = np.fromfunction(lambda i, j: ((i * j + 2) % N) / N, (N, N), dtype=datatype)
+    A = np.fromfunction(
+        lambda i, j: ((i * j + 1) % N) / N, (N, N), dtype=datatype
+    )
+    B = np.fromfunction(
+        lambda i, j: ((i * j + 2) % N) / N, (N, N), dtype=datatype
+    )
     x = np.fromfunction(lambda i: (i % N) / N, (N,), dtype=datatype)
     return alpha, beta, A, B, x
 
diff --git a/benchmarks/benchmarks/npbench/bench_go_fast.py b/benchmarks/benchmarks/npbench/bench_go_fast.py
index d197c540..f4dca7ef 100644
--- a/benchmarks/benchmarks/npbench/bench_go_fast.py
+++ b/benchmarks/benchmarks/npbench/bench_go_fast.py
@@ -16,6 +16,7 @@
 # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast.py
 def _initialize(N):
     from numpy.random import default_rng
+
     rng = default_rng(42)
     a = rng.random((N, N))
     return (a,)
@@ -37,7 +38,7 @@ def _go_fast(a):
 
 
 class BenchGoFastLoop:
-    """Original npbench kernel — diagonal Python loop calling np.tanh per element."""
+    """Original npbench kernel — Python loop calling np.tanh per element."""
 
     params = (["M", "L"],)
     param_names = ["preset"]
diff --git a/benchmarks/benchmarks/npbench/bench_k2mm.py b/benchmarks/benchmarks/npbench/bench_k2mm.py
index 6ff8cf6f..68143c6a 100644
--- a/benchmarks/benchmarks/npbench/bench_k2mm.py
+++ b/benchmarks/benchmarks/npbench/bench_k2mm.py
@@ -15,11 +15,19 @@
 # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k2mm/k2mm.py
 def _initialize(NI, NJ, NK, NL, datatype=np.float64):
     alpha = datatype(1.5)
-    beta  = datatype(1.2)
-    A = np.fromfunction(lambda i, j: ((i * j + 1) % NI) / NI, (NI, NK), dtype=datatype)
-    B = np.fromfunction(lambda i, j: (i * (j + 1) % NJ) / NJ, (NK, NJ), dtype=datatype)
-    C = np.fromfunction(lambda i, j: ((i * (j + 3) + 1) % NL) / NL, (NJ, NL), dtype=datatype)
-    D = np.fromfunction(lambda i, j: (i * (j + 2) % NK) / NK, (NI, NL), dtype=datatype)
+    beta = datatype(1.2)
+    A = np.fromfunction(
+        lambda i, j: ((i * j + 1) % NI) / NI, (NI, NK), dtype=datatype
+    )
+    B = np.fromfunction(
+        lambda i, j: (i * (j + 1) % NJ) / NJ, (NK, NJ), dtype=datatype
+    )
+    C = np.fromfunction(
+        lambda i, j: ((i * (j + 3) + 1) % NL) / NL, (NJ, NL), dtype=datatype
+    )
+    D = np.fromfunction(
+        lambda i, j: (i * (j + 2) % NK) / NK, (NI, NL), dtype=datatype
+    )
     return alpha, beta, A, B, C, D
 
 
@@ -47,7 +55,7 @@ def setup_cache(self):
     def setup(self, cache, preset):
         alpha, beta, A, B, C, D = cache[preset]
         self.alpha = alpha
-        self.beta  = beta
+        self.beta = beta
         self.A = A
         self.B = B
         self.C = C
diff --git a/benchmarks/benchmarks/npbench/bench_k3mm.py b/benchmarks/benchmarks/npbench/bench_k3mm.py
index 15e73d60..5211fdaf 100644
--- a/benchmarks/benchmarks/npbench/bench_k3mm.py
+++ b/benchmarks/benchmarks/npbench/bench_k3mm.py
@@ -11,10 +11,22 @@
 # Inlined from spcl/npbench @ main
 # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k3mm/k3mm.py
 def _initialize(NI, NJ, NK, NL, NM, datatype=np.float64):
-    A = np.fromfunction(lambda i, j: ((i * j + 1) % NI) / (5 * NI), (NI, NK), dtype=datatype)
-    B = np.fromfunction(lambda i, j: ((i * (j + 1) + 2) % NJ) / (5 * NJ), (NK, NJ), dtype=datatype)
-    C = np.fromfunction(lambda i, j: (i * (j + 3) % NL) / (5 * NL), (NJ, NM), dtype=datatype)
-    D = np.fromfunction(lambda i, j: ((i * (j + 2) + 2) % NK) / (5 * NK), (NM, NL), dtype=datatype)
+    A = np.fromfunction(
+        lambda i, j: ((i * j + 1) % NI) / (5 * NI), (NI, NK), dtype=datatype
+    )
+    B = np.fromfunction(
+        lambda i, j: ((i * (j + 1) + 2) % NJ) / (5 * NJ),
+        (NK, NJ),
+        dtype=datatype,
+    )
+    C = np.fromfunction(
+        lambda i, j: (i * (j + 3) % NL) / (5 * NL), (NJ, NM), dtype=datatype
+    )
+    D = np.fromfunction(
+        lambda i, j: ((i * (j + 2) + 2) % NK) / (5 * NK),
+        (NM, NL),
+        dtype=datatype,
+    )
     return A, B, C, D
 
 
diff --git a/benchmarks/benchmarks/npbench/bench_mandelbrot.py b/benchmarks/benchmarks/npbench/bench_mandelbrot.py
index 090fcd0e..47f3f14f 100644
--- a/benchmarks/benchmarks/npbench/bench_mandelbrot.py
+++ b/benchmarks/benchmarks/npbench/bench_mandelbrot.py
@@ -1,4 +1,6 @@
-"""npbench wrapper: Mandelbrot set (two variants) — mkl_umath ops: abs, multiply, add.
+"""npbench wrapper: Mandelbrot set (two variants).
+
+mkl_umath ops: abs, multiply, add.
 
 Preset sizes from npbench bench_info/mandelbrot1.json and mandelbrot2.json:
   M: XN=YN=250/500, maxiter=150/80
@@ -13,9 +15,9 @@
 
 import numpy as np
 
-
 # --- mandelbrot1 ---
 
+
 # Inlined from spcl/npbench @ main
 # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/mandelbrot1/mandelbrot1_numpy.py
 def _mandelbrot1(xmin, xmax, ymin, ymax, xn, yn, maxiter, horizon=2.0):
@@ -25,15 +27,16 @@ def _mandelbrot1(xmin, xmax, ymin, ymax, xn, yn, maxiter, horizon=2.0):
     N = np.zeros(C.shape, dtype=np.int64)
     Z = np.zeros(C.shape, dtype=np.complex128)
     for n in range(maxiter):
-        I = np.less(abs(Z), horizon)
-        N[I] = n
-        Z[I] = Z[I] ** 2 + C[I]
+        mask = np.less(abs(Z), horizon)
+        N[mask] = n
+        Z[mask] = Z[mask] ** 2 + C[mask]
     N[N == maxiter - 1] = 0
     return Z, N
 
 
 # --- mandelbrot2 ---
 
+
 # Inlined from spcl/npbench @ main
 # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/mandelbrot2/mandelbrot2_numpy.py
 def _mandelbrot2(xmin, xmax, ymin, ymax, xn, yn, itermax, horizon=2.0):
@@ -63,17 +66,49 @@ def _mandelbrot2(xmin, xmax, ymin, ymax, xn, yn, itermax, horizon=2.0):
 
 
 _PRESETS_M1 = {
-    "M": {"xmin": -1.75, "xmax": 0.25, "ymin": -1.00, "ymax": 1.00,
-          "xn": 250, "yn": 250, "maxiter": 150, "horizon": 2.0},
-    "L": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25,
-          "xn": 833, "yn": 833, "maxiter": 200, "horizon": 2.0},
+    "M": {
+        "xmin": -1.75,
+        "xmax": 0.25,
+        "ymin": -1.00,
+        "ymax": 1.00,
+        "xn": 250,
+        "yn": 250,
+        "maxiter": 150,
+        "horizon": 2.0,
+    },
+    "L": {
+        "xmin": -2.00,
+        "xmax": 0.50,
+        "ymin": -1.25,
+        "ymax": 1.25,
+        "xn": 833,
+        "yn": 833,
+        "maxiter": 200,
+        "horizon": 2.0,
+    },
 }
 
 _PRESETS_M2 = {
-    "M": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25,
-          "xn": 500, "yn": 500, "itermax": 80,  "horizon": 2.0},
-    "L": {"xmin": -2.25, "xmax": 0.75, "ymin": -1.50, "ymax": 1.50,
-          "xn": 1000, "yn": 1000, "itermax": 100, "horizon": 2.0},
+    "M": {
+        "xmin": -2.00,
+        "xmax": 0.50,
+        "ymin": -1.25,
+        "ymax": 1.25,
+        "xn": 500,
+        "yn": 500,
+        "itermax": 80,
+        "horizon": 2.0,
+    },
+    "L": {
+        "xmin": -2.25,
+        "xmax": 0.75,
+        "ymin": -1.50,
+        "ymax": 1.50,
+        "xn": 1000,
+        "yn": 1000,
+        "itermax": 100,
+        "horizon": 2.0,
+    },
 }
 
 
diff --git a/benchmarks/benchmarks/npbench/bench_softmax.py b/benchmarks/benchmarks/npbench/bench_softmax.py
index 29a77252..5fdfe321 100644
--- a/benchmarks/benchmarks/npbench/bench_softmax.py
+++ b/benchmarks/benchmarks/npbench/bench_softmax.py
@@ -14,6 +14,7 @@
 # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/deep_learning/softmax/softmax.py
 def _initialize(N, H, SM):
     from numpy.random import default_rng
+
     rng = default_rng(42)
     x = rng.random((N, H, SM, SM), dtype=np.float32)
     return (x,)
@@ -29,7 +30,7 @@ def _softmax(x):
 
 
 _PRESETS = {
-    "M": {"N": 32, "H": 8,  "SM": 256},
+    "M": {"N": 32, "H": 8, "SM": 256},
     "L": {"N": 64, "H": 16, "SM": 448},
 }
 

From 5da98d4d2c2f9c7432b6b5483d5a26e915865afe Mon Sep 17 00:00:00 2001
From: vchamarthi <venkata.sai.gireesh.chamarthi@intel.com>
Date: Tue, 19 May 2026 10:10:51 -0500
Subject: [PATCH 4/7] Address PR review: consolidate micro benchmarks, remove 10 npbench wrappers

---
 benchmarks/benchmarks/__init__.py             |   8 +-
 benchmarks/benchmarks/_patch_setup.py         |  19 +--
 benchmarks/benchmarks/micro/bench_exp_log.py  |  98 -------------
 benchmarks/benchmarks/micro/bench_micro.py    |  88 ++++++++++++
 .../benchmarks/micro/bench_sqrt_misc.py       |  84 -----------
 benchmarks/benchmarks/micro/bench_trig.py     | 134 ------------------
 .../benchmarks/npbench/bench_cholesky2.py     |  56 --------
 .../benchmarks/npbench/bench_correlation.py   |  60 --------
 .../benchmarks/npbench/bench_covariance.py    |  55 -------
 .../benchmarks/npbench/bench_deriche.py       | 113 ---------------
 .../benchmarks/npbench/bench_doitgen.py       |  54 -------
 benchmarks/benchmarks/npbench/bench_gemm.py   |  63 --------
 benchmarks/benchmarks/npbench/bench_gemver.py |  83 -----------
 .../benchmarks/npbench/bench_gesummv.py       |  53 -------
 benchmarks/benchmarks/npbench/bench_k2mm.py   |  65 ---------
 benchmarks/benchmarks/npbench/bench_k3mm.py   |  58 --------
 16 files changed, 96 insertions(+), 995 deletions(-)
 delete mode 100644 benchmarks/benchmarks/micro/bench_exp_log.py
 create mode 100644 benchmarks/benchmarks/micro/bench_micro.py
 delete mode 100644 benchmarks/benchmarks/micro/bench_sqrt_misc.py
 delete mode 100644 benchmarks/benchmarks/micro/bench_trig.py
 delete mode 100644 benchmarks/benchmarks/npbench/bench_cholesky2.py
 delete mode 100644 benchmarks/benchmarks/npbench/bench_correlation.py
 delete mode 100644 benchmarks/benchmarks/npbench/bench_covariance.py
 delete mode 100644 benchmarks/benchmarks/npbench/bench_deriche.py
 delete mode 100644 benchmarks/benchmarks/npbench/bench_doitgen.py
 delete mode 100644 benchmarks/benchmarks/npbench/bench_gemm.py
 delete mode 100644 benchmarks/benchmarks/npbench/bench_gemver.py
 delete mode 100644 benchmarks/benchmarks/npbench/bench_gesummv.py
 delete mode 100644 benchmarks/benchmarks/npbench/bench_k2mm.py
 delete mode 100644 benchmarks/benchmarks/npbench/bench_k3mm.py

diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
index bcf027e4..f7739a93 100644
--- a/benchmarks/benchmarks/__init__.py
+++ b/benchmarks/benchmarks/__init__.py
@@ -1,4 +1,4 @@
-# Trigger MKL patching once per ASV worker process.
-# ASV uses --launch-method spawn in CI, so each worker is a fresh process
-# and this runs exactly once before any benchmark is collected or timed.
-from . import _patch_setup  # noqa: F401
+from ._patch_setup import _apply_patches
+
+_apply_patches()
+del _apply_patches
\ No newline at end of file
diff --git a/benchmarks/benchmarks/_patch_setup.py b/benchmarks/benchmarks/_patch_setup.py
index 9aea6062..9383b1c8 100644
--- a/benchmarks/benchmarks/_patch_setup.py
+++ b/benchmarks/benchmarks/_patch_setup.py
@@ -3,12 +3,8 @@
 Patches NumPy with Intel MKL implementations for fft, random, and umath.
 Hard-fails with a descriptive RuntimeError if any package is missing or the
 patch does not take effect, so benchmarks never silently run on stock NumPy.
-
-Visible output goes to stderr; pass --show-stderr to ``asv run`` to see it.
 """
 
-import sys
-
 _PATCH_MAP = [
     ("mkl_fft", "patch_numpy_fft"),
     ("mkl_random", "patch_numpy_random"),
@@ -17,6 +13,8 @@
 
 
 def _apply_patches():
+    import numpy as np
+
     patched = {}
 
     for mod_name, patch_fn_name in _PATCH_MAP:
@@ -56,9 +54,6 @@ def _apply_patches():
 
         patched[mod_name] = mod
 
-    # Verbose attribution — verify numpy-level dispatch changed hands
-    import numpy as np
-
     _attr_checks = {
         "mkl_fft": lambda: np.fft.fft.__module__,
         "mkl_random": lambda: np.random.random.__module__,
@@ -69,12 +64,6 @@ def _apply_patches():
             attr = _attr_checks[mod_name]()
         except Exception:
             attr = "unknown"
-        sys.stderr.write(f"[mkl-patch] {mod_name}: numpy dispatch → {attr}\n")
-
-    sys.stderr.write(
-        "[mkl-patch] ALL OK — mkl_fft, mkl_random, mkl_umath active\n"
-    )
-    sys.stderr.flush()
-
+        print(f"[mkl-patch] {mod_name}: numpy dispatch -> {attr}")
 
-_apply_patches()
+    print("[mkl-patch] ALL OK -- mkl_fft, mkl_random, mkl_umath active")
diff --git a/benchmarks/benchmarks/micro/bench_exp_log.py b/benchmarks/benchmarks/micro/bench_exp_log.py
deleted file mode 100644
index 216fc740..00000000
--- a/benchmarks/benchmarks/micro/bench_exp_log.py
+++ /dev/null
@@ -1,98 +0,0 @@
-"""Micro-benchmarks for mkl_umath exponential and logarithm ufuncs.
-
-Each class times a single ufunc over a Cartesian product of
-  dtype  ∈ [float32, float64]
-  size   ∈ [10_000, 100_000, 1_000_000]
-
-Arrays are pre-allocated in setup() and reused across timing calls.
-Patching is applied once at package import via benchmarks._patch_setup.
-"""
-
-import numpy as np
-
-
-class BenchExp:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        # float32 overflows exp around 88.7; use [-10, 10] safe for both dtypes
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-10.0, 10.0, size).astype(dtype)
-
-    def time_exp(self, dtype, size):
-        np.exp(self.x)
-
-
-class BenchExp2:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        # float32 overflows exp2 around 127
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-10.0, 10.0, size).astype(dtype)
-
-    def time_exp2(self, dtype, size):
-        np.exp2(self.x)
-
-
-class BenchExpm1:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-10.0, 10.0, size).astype(dtype)
-
-    def time_expm1(self, dtype, size):
-        np.expm1(self.x)
-
-
-class BenchLog:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(1e-3, 1e3, size).astype(dtype)
-
-    def time_log(self, dtype, size):
-        np.log(self.x)
-
-
-class BenchLog2:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(1e-3, 1e3, size).astype(dtype)
-
-    def time_log2(self, dtype, size):
-        np.log2(self.x)
-
-
-class BenchLog10:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(1e-3, 1e3, size).astype(dtype)
-
-    def time_log10(self, dtype, size):
-        np.log10(self.x)
-
-
-class BenchLog1p:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        # log1p(x) is defined for x > -1; use [0, 10] which is always safe
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(0.0, 10.0, size).astype(dtype)
-
-    def time_log1p(self, dtype, size):
-        np.log1p(self.x)
diff --git a/benchmarks/benchmarks/micro/bench_micro.py b/benchmarks/benchmarks/micro/bench_micro.py
new file mode 100644
index 00000000..381c823c
--- /dev/null
+++ b/benchmarks/benchmarks/micro/bench_micro.py
@@ -0,0 +1,88 @@
+"""Micro-benchmarks for mkl_umath unary and binary ufuncs.
+
+Times each ufunc over a Cartesian product of
+  dtype  in [float32, float64]
+  size   in [10_000, 100_000, 1_000_000]
+
+Arrays are pre-allocated in setup() and reused across timing calls.
+Patching is applied once at package import via benchmarks._patch_setup.
+"""
+
+import numpy as np
+
+
+_UFUNC_CONFIGS = {
+    "exp": {"func": np.exp, "low": -10.0, "high": 10.0},
+    "exp2": {"func": np.exp2, "low": -10.0, "high": 10.0},
+    "expm1": {"func": np.expm1, "low": -10.0, "high": 10.0},
+    "log": {"func": np.log, "low": 1e-3, "high": 1e3},
+    "log2": {"func": np.log2, "low": 1e-3, "high": 1e3},
+    "log10": {"func": np.log10, "low": 1e-3, "high": 1e3},
+    "log1p": {"func": np.log1p, "low": 0.0, "high": 10.0},
+    "sin": {"func": np.sin, "low": -np.pi, "high": np.pi},
+    "cos": {"func": np.cos, "low": -np.pi, "high": np.pi},
+    "tan": {"func": np.tan, "low": -1.4, "high": 1.4},
+    "arcsin": {"func": np.arcsin, "low": -1.0, "high": 1.0},
+    "arccos": {"func": np.arccos, "low": -1.0, "high": 1.0},
+    "arctan": {"func": np.arctan, "low": -10.0, "high": 10.0},
+    "sinh": {"func": np.sinh, "low": -5.0, "high": 5.0},
+    "cosh": {"func": np.cosh, "low": -5.0, "high": 5.0},
+    "tanh": {"func": np.tanh, "low": -5.0, "high": 5.0},
+    "arcsinh": {"func": np.arcsinh, "low": -10.0, "high": 10.0},
+    "arccosh": {"func": np.arccosh, "low": 1.0, "high": 100.0},
+    "arctanh": {"func": np.arctanh, "low": -0.99, "high": 0.99},
+    "sqrt": {"func": np.sqrt, "low": 0.0, "high": 100.0},
+    "cbrt": {"func": np.cbrt, "low": -100.0, "high": 100.0},
+    "square": {"func": np.square, "low": -10.0, "high": 10.0},
+    "fabs": {"func": np.fabs, "low": -100.0, "high": 100.0},
+    "absolute": {"func": np.absolute, "low": -100.0, "high": 100.0},
+    "reciprocal": {"func": np.reciprocal, "low": 0.01, "high": 100.0},
+}
+
+
+class BenchMicro:
+    params = (
+        sorted(_UFUNC_CONFIGS.keys()),
+        ["float32", "float64"],
+        [10_000, 100_000, 1_000_000],
+    )
+    param_names = ["ufunc", "dtype", "size"]
+
+    def setup(self, ufunc, dtype, size):
+        cfg = _UFUNC_CONFIGS[ufunc]
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(cfg["low"], cfg["high"], size).astype(dtype)
+        self._func = cfg["func"]
+
+    def time_micro(self, ufunc, dtype, size):
+        self._func(self.x)
+
+
+class BenchArctan2:
+    """Binary ufunc arctan2"""
+
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.y = rng.uniform(-1.0, 1.0, size).astype(dtype)
+        self.x = rng.uniform(-1.0, 1.0, size).astype(dtype)
+
+    def time_arctan2(self, dtype, size):
+        np.arctan2(self.y, self.x)
+
+
+class BenchPower:
+    """Binary ufunc power (arbitrary exponent via MKL vdPow)"""
+
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.base = rng.uniform(0.1, 10.0, size).astype(dtype)
+        self.exp = rng.uniform(0.5, 3.0, size).astype(dtype)
+
+    def time_power(self, dtype, size):
+        np.power(self.base, self.exp)
diff --git a/benchmarks/benchmarks/micro/bench_sqrt_misc.py b/benchmarks/benchmarks/micro/bench_sqrt_misc.py
deleted file mode 100644
index b1170639..00000000
--- a/benchmarks/benchmarks/micro/bench_sqrt_misc.py
+++ /dev/null
@@ -1,84 +0,0 @@
-"""Micro-benchmarks for mkl_umath sqrt, cbrt, and miscellaneous ufuncs.
-
-Each class times a single ufunc over a Cartesian product of
-  dtype  ∈ [float32, float64]
-  size   ∈ [10_000, 100_000, 1_000_000]
-
-Arrays are pre-allocated in setup() and reused across timing calls.
-Patching is applied once at package import via benchmarks._patch_setup.
-"""
-
-import numpy as np
-
-
-class BenchSqrt:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(0.0, 100.0, size).astype(dtype)
-
-    def time_sqrt(self, dtype, size):
-        np.sqrt(self.x)
-
-
-class BenchCbrt:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-100.0, 100.0, size).astype(dtype)
-
-    def time_cbrt(self, dtype, size):
-        np.cbrt(self.x)
-
-
-class BenchSquare:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-10.0, 10.0, size).astype(dtype)
-
-    def time_square(self, dtype, size):
-        np.square(self.x)
-
-
-class BenchFabs:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-100.0, 100.0, size).astype(dtype)
-
-    def time_fabs(self, dtype, size):
-        np.fabs(self.x)
-
-
-class BenchAbsolute:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-100.0, 100.0, size).astype(dtype)
-
-    def time_absolute(self, dtype, size):
-        np.absolute(self.x)
-
-
-class BenchReciprocal:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        # Avoid values near zero to prevent inf results dominating timing
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(0.01, 100.0, size).astype(dtype)
-
-    def time_reciprocal(self, dtype, size):
-        np.reciprocal(self.x)
diff --git a/benchmarks/benchmarks/micro/bench_trig.py b/benchmarks/benchmarks/micro/bench_trig.py
deleted file mode 100644
index eb09b9c6..00000000
--- a/benchmarks/benchmarks/micro/bench_trig.py
+++ /dev/null
@@ -1,134 +0,0 @@
-"""Micro-benchmarks for mkl_umath trigonometric ufuncs.
-
-Each class times a single ufunc over a Cartesian product of
-  dtype  ∈ [float32, float64]
-  size   ∈ [10_000, 100_000, 1_000_000]
-
-Arrays are pre-allocated in setup() and reused across timing calls.
-Patching is applied once at package import via benchmarks._patch_setup.
-"""
-
-import numpy as np
-
-
-class BenchSin:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-np.pi, np.pi, size).astype(dtype)
-
-    def time_sin(self, dtype, size):
-        np.sin(self.x)
-
-
-class BenchCos:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-np.pi, np.pi, size).astype(dtype)
-
-    def time_cos(self, dtype, size):
-        np.cos(self.x)
-
-
-class BenchTan:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        # Avoid values near ±π/2 where tan diverges
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-1.4, 1.4, size).astype(dtype)
-
-    def time_tan(self, dtype, size):
-        np.tan(self.x)
-
-
-class BenchArcsin:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-1.0, 1.0, size).astype(dtype)
-
-    def time_arcsin(self, dtype, size):
-        np.arcsin(self.x)
-
-
-class BenchArccos:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-1.0, 1.0, size).astype(dtype)
-
-    def time_arccos(self, dtype, size):
-        np.arccos(self.x)
-
-
-class BenchArctan:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-10.0, 10.0, size).astype(dtype)
-
-    def time_arctan(self, dtype, size):
-        np.arctan(self.x)
-
-
-class BenchArctan2:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.y = rng.uniform(-1.0, 1.0, size).astype(dtype)
-        self.x = rng.uniform(-1.0, 1.0, size).astype(dtype)
-
-    def time_arctan2(self, dtype, size):
-        np.arctan2(self.y, self.x)
-
-
-class BenchSinh:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        # float32 overflows sinh around ±89; keep well inside that
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-5.0, 5.0, size).astype(dtype)
-
-    def time_sinh(self, dtype, size):
-        np.sinh(self.x)
-
-
-class BenchCosh:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-5.0, 5.0, size).astype(dtype)
-
-    def time_cosh(self, dtype, size):
-        np.cosh(self.x)
-
-
-class BenchTanh:
-    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
-    param_names = ["dtype", "size"]
-
-    def setup(self, dtype, size):
-        rng = np.random.default_rng(42)
-        self.x = rng.uniform(-5.0, 5.0, size).astype(dtype)
-
-    def time_tanh(self, dtype, size):
-        np.tanh(self.x)
diff --git a/benchmarks/benchmarks/npbench/bench_cholesky2.py b/benchmarks/benchmarks/npbench/bench_cholesky2.py
deleted file mode 100644
index ae19443f..00000000
--- a/benchmarks/benchmarks/npbench/bench_cholesky2.py
+++ /dev/null
@@ -1,56 +0,0 @@
-"""npbench wrapper: Cholesky decomposition v2 — mkl_umath ops: linalg.cholesky.
-
-Preset sizes from npbench bench_info/cholesky2.json:
-  M: N=2200
-  L: N=8000
-
-The kernel mutates A in-place (A[:] = cholesky(A) + triu(A, k=1)), so
-setup() copies A from cache before each timing round.
-
-The initialization constructs a symmetric positive-definite matrix via A @ A^T,
-which is expensive at N=8000.  setup_cache() runs this once per commit.
-"""
-
-import numpy as np
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/cholesky2/cholesky2.py
-def _initialize(N, datatype=np.float64):
-    A = np.empty((N, N), dtype=datatype)
-    for i in range(N):
-        A[i, : i + 1] = np.fromfunction(
-            lambda j: (-j % N) / N + 1, (i + 1,), dtype=datatype
-        )
-        A[i, i + 1 :] = 0.0
-        A[i, i] = 1.0
-    A[:] = A @ np.transpose(A)
-    return A
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/cholesky2/cholesky2_numpy.py
-def _kernel(A):
-    A[:] = np.linalg.cholesky(A) + np.triu(A, k=1)
-
-
-_PRESETS = {
-    "M": {"N": 2200},
-    "L": {"N": 8000},
-}
-
-
-class BenchCholesky2:
-    params = (["M", "L"],)
-    param_names = ["preset"]
-    number = 1
-    repeat = 20
-
-    def setup_cache(self):
-        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
-
-    def setup(self, cache, preset):
-        self.A = cache[preset].copy()  # kernel mutates A in-place
-
-    def time_cholesky2(self, cache, preset):
-        _kernel(self.A)
diff --git a/benchmarks/benchmarks/npbench/bench_correlation.py b/benchmarks/benchmarks/npbench/bench_correlation.py
deleted file mode 100644
index ca941443..00000000
--- a/benchmarks/benchmarks/npbench/bench_correlation.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""npbench wrapper: Correlation — mkl_umath ops: sqrt, std, mean.
-
-Preset sizes from npbench bench_info/correlation.json:
-  M: M=1400, N=1800
-  L: M=3200, N=4000
-
-The kernel mutates ``data`` in-place (data -= mean; data /= ...), so
-setup() copies from the cache before each timing round.
-"""
-
-import numpy as np
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/correlation/correlation.py
-def _initialize(M, N, datatype=np.float64):
-    float_n = datatype(N)
-    data = np.fromfunction(lambda i, j: (i * j) / M + i, (N, M), dtype=datatype)
-    return float_n, data
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/correlation/correlation_numpy.py
-def _kernel(M, float_n, data):
-    mean = np.mean(data, axis=0)
-    stddev = np.std(data, axis=0)
-    stddev[stddev <= 0.1] = 1.0
-    data -= mean
-    data /= np.sqrt(float_n) * stddev
-    corr = np.eye(M, dtype=data.dtype)
-    for i in range(M - 1):
-        corr[i + 1 : M, i] = corr[i, i + 1 : M] = (
-            data[:, i] @ data[:, i + 1 : M]
-        )
-    return corr
-
-
-_PRESETS = {
-    "M": {"M": 1400, "N": 1800},
-    "L": {"M": 3200, "N": 4000},
-}
-
-
-class BenchCorrelation:
-    params = (["M", "L"],)
-    param_names = ["preset"]
-    number = 1
-    repeat = 20
-
-    def setup_cache(self):
-        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
-
-    def setup(self, cache, preset):
-        float_n, data = cache[preset]
-        self.M = _PRESETS[preset]["M"]
-        self.float_n = float_n
-        self.data = data.copy()  # kernel mutates data in-place
-
-    def time_correlation(self, cache, preset):
-        _kernel(self.M, self.float_n, self.data)
diff --git a/benchmarks/benchmarks/npbench/bench_covariance.py b/benchmarks/benchmarks/npbench/bench_covariance.py
deleted file mode 100644
index b85b3191..00000000
--- a/benchmarks/benchmarks/npbench/bench_covariance.py
+++ /dev/null
@@ -1,55 +0,0 @@
-"""npbench wrapper: Covariance — mkl_umath ops: mean.
-
-Preset sizes from npbench bench_info/covariance.json:
-  M: M=1400, N=1800
-  L: M=3200, N=4000
-
-The kernel mutates ``data`` in-place (data -= mean), so setup() copies
-from the cache before each timing round.
-"""
-
-import numpy as np
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/covariance/covariance.py
-def _initialize(M, N, datatype=np.float64):
-    float_n = datatype(N)
-    data = np.fromfunction(lambda i, j: (i * j) / M, (N, M), dtype=datatype)
-    return float_n, data
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/covariance/covariance_numpy.py
-def _kernel(M, float_n, data):
-    mean = np.mean(data, axis=0)
-    data -= mean
-    cov = np.zeros((M, M), dtype=data.dtype)
-    for i in range(M):
-        cov[i:M, i] = cov[i, i:M] = data[:, i] @ data[:, i:M] / (float_n - 1.0)
-    return cov
-
-
-_PRESETS = {
-    "M": {"M": 1400, "N": 1800},
-    "L": {"M": 3200, "N": 4000},
-}
-
-
-class BenchCovariance:
-    params = (["M", "L"],)
-    param_names = ["preset"]
-    number = 1
-    repeat = 20
-
-    def setup_cache(self):
-        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
-
-    def setup(self, cache, preset):
-        float_n, data = cache[preset]
-        self.M = _PRESETS[preset]["M"]
-        self.float_n = float_n
-        self.data = data.copy()  # kernel mutates data in-place
-
-    def time_covariance(self, cache, preset):
-        _kernel(self.M, self.float_n, self.data)
diff --git a/benchmarks/benchmarks/npbench/bench_deriche.py b/benchmarks/benchmarks/npbench/bench_deriche.py
deleted file mode 100644
index 4539053d..00000000
--- a/benchmarks/benchmarks/npbench/bench_deriche.py
+++ /dev/null
@@ -1,113 +0,0 @@
-"""npbench wrapper: Deriche Edge Detector — mkl_umath ops: exp.
-
-Preset sizes from npbench bench_info/deriche.json:
-  M: W=1500, H=1000
-  L: W=6000, H=3000
-
-Warning: this kernel contains Python for-loops over rows/columns.
-At the L preset the Python loops dominate runtime; exp() calls on scalar
-floats are measured, not vectorised MKL VM throughput.  The L preset is
-included for historical comparability with npbench runs.
-"""
-
-import numpy as np
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/deriche/deriche.py
-def _initialize(W, H, datatype=np.float64):
-    alpha = datatype(0.25)
-    imgIn = np.fromfunction(
-        lambda i, j: ((313 * i + 991 * j) % 65536) / 65535.0,
-        (W, H),
-        dtype=datatype,
-    )
-    return alpha, imgIn
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/deriche/deriche_numpy.py
-def _kernel(alpha, imgIn):
-    k = (
-        (1.0 - np.exp(-alpha))
-        * (1.0 - np.exp(-alpha))
-        / (1.0 + alpha * np.exp(-alpha) - np.exp(2.0 * alpha))
-    )
-    a1 = a5 = k
-    a2 = a6 = k * np.exp(-alpha) * (alpha - 1.0)
-    a3 = a7 = k * np.exp(-alpha) * (alpha + 1.0)
-    a4 = a8 = -k * np.exp(-2.0 * alpha)
-    b1 = 2.0 ** (-alpha)
-    b2 = -np.exp(-2.0 * alpha)
-    c1 = c2 = 1
-
-    y1 = np.empty_like(imgIn)
-    y1[:, 0] = a1 * imgIn[:, 0]
-    y1[:, 1] = a1 * imgIn[:, 1] + a2 * imgIn[:, 0] + b1 * y1[:, 0]
-    for j in range(2, imgIn.shape[1]):
-        y1[:, j] = (
-            a1 * imgIn[:, j]
-            + a2 * imgIn[:, j - 1]
-            + b1 * y1[:, j - 1]
-            + b2 * y1[:, j - 2]
-        )
-
-    y2 = np.empty_like(imgIn)
-    y2[:, -1] = 0.0
-    y2[:, -2] = a3 * imgIn[:, -1]
-    for j in range(imgIn.shape[1] - 3, -1, -1):
-        y2[:, j] = (
-            a3 * imgIn[:, j + 1]
-            + a4 * imgIn[:, j + 2]
-            + b1 * y2[:, j + 1]
-            + b2 * y2[:, j + 2]
-        )
-
-    imgOut = c1 * (y1 + y2)
-
-    y1[0, :] = a5 * imgOut[0, :]
-    y1[1, :] = a5 * imgOut[1, :] + a6 * imgOut[0, :] + b1 * y1[0, :]
-    for i in range(2, imgIn.shape[0]):
-        y1[i, :] = (
-            a5 * imgOut[i, :]
-            + a6 * imgOut[i - 1, :]
-            + b1 * y1[i - 1, :]
-            + b2 * y1[i - 2, :]
-        )
-
-    y2[-1, :] = 0.0
-    y2[-2, :] = a7 * imgOut[-1, :]
-    for i in range(imgIn.shape[0] - 3, -1, -1):
-        y2[i, :] = (
-            a7 * imgOut[i + 1, :]
-            + a8 * imgOut[i + 2, :]
-            + b1 * y2[i + 1, :]
-            + b2 * y2[i + 2, :]
-        )
-
-    return c2 * (y1 + y2)
-
-
-_PRESETS = {
-    "M": {"W": 1500, "H": 1000},
-    "L": {"W": 6000, "H": 3000},
-}
-
-
-class BenchDeriche:
-    # L preset has Python loops over 6000 rows — allow extra time
-    timeout = 600
-
-    params = (["M", "L"],)
-    param_names = ["preset"]
-    number = 1
-    repeat = 20
-
-    def setup_cache(self):
-        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
-
-    def setup(self, cache, preset):
-        self.alpha, self.imgIn = cache[preset]
-
-    def time_deriche(self, cache, preset):
-        _kernel(self.alpha, self.imgIn)
diff --git a/benchmarks/benchmarks/npbench/bench_doitgen.py b/benchmarks/benchmarks/npbench/bench_doitgen.py
deleted file mode 100644
index eb255bae..00000000
--- a/benchmarks/benchmarks/npbench/bench_doitgen.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"""npbench wrapper: Doitgen (multiresolution analysis) — mkl_umath ops: matmul.
-
-Preset sizes from npbench bench_info/doitgen.json:
-  M: NR=110, NQ=125, NP=256
-  L: NR=220, NQ=250, NP=512
-
-The kernel mutates ``A`` in-place (A[:] = ...), so setup() copies from cache.
-"""
-
-import numpy as np
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/doitgen/doitgen.py
-def _initialize(NR, NQ, NP, datatype=np.float64):
-    A = np.fromfunction(
-        lambda i, j, k: ((i * j + k) % NP) / NP, (NR, NQ, NP), dtype=datatype
-    )
-    C4 = np.fromfunction(
-        lambda i, j: (i * j % NP) / NP, (NP, NP), dtype=datatype
-    )
-    return A, C4
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/doitgen/doitgen_numpy.py
-def _kernel(NR, NQ, NP, A, C4):
-    A[:] = np.reshape(np.reshape(A, (NR, NQ, 1, NP)) @ C4, (NR, NQ, NP))
-
-
-_PRESETS = {
-    "M": {"NR": 110, "NQ": 125, "NP": 256},
-    "L": {"NR": 220, "NQ": 250, "NP": 512},
-}
-
-
-class BenchDoitgen:
-    params = (["M", "L"],)
-    param_names = ["preset"]
-    number = 1
-    repeat = 20
-
-    def setup_cache(self):
-        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
-
-    def setup(self, cache, preset):
-        A, C4 = cache[preset]
-        p = _PRESETS[preset]
-        self.NR, self.NQ, self.NP = p["NR"], p["NQ"], p["NP"]
-        self.A = A.copy()  # kernel mutates A in-place
-        self.C4 = C4
-
-    def time_doitgen(self, cache, preset):
-        _kernel(self.NR, self.NQ, self.NP, self.A, self.C4)
diff --git a/benchmarks/benchmarks/npbench/bench_gemm.py b/benchmarks/benchmarks/npbench/bench_gemm.py
deleted file mode 100644
index c6a36fd5..00000000
--- a/benchmarks/benchmarks/npbench/bench_gemm.py
+++ /dev/null
@@ -1,63 +0,0 @@
-"""npbench wrapper: GEMM (general matrix-matrix multiply).
-
-mkl_umath ops: matmul.
-
-Preset sizes from npbench bench_info/gemm.json:
-  M: NI=2500, NJ=2750, NK=3000
-  L: NI=7000, NJ=7500, NK=8000
-
-The kernel mutates C in-place (C[:] = alpha * A @ B + beta * C), so
-setup() copies C from cache before each timing round.
-"""
-
-import numpy as np
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemm/gemm.py
-def _initialize(NI, NJ, NK, datatype=np.float64):
-    alpha = datatype(1.5)
-    beta = datatype(1.2)
-    C = np.fromfunction(
-        lambda i, j: ((i * j + 1) % NI) / NI, (NI, NJ), dtype=datatype
-    )
-    A = np.fromfunction(
-        lambda i, k: (i * (k + 1) % NK) / NK, (NI, NK), dtype=datatype
-    )
-    B = np.fromfunction(
-        lambda k, j: (k * (j + 2) % NJ) / NJ, (NK, NJ), dtype=datatype
-    )
-    return alpha, beta, C, A, B
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemm/gemm_numpy.py
-def _kernel(alpha, beta, C, A, B):
-    C[:] = alpha * A @ B + beta * C
-
-
-_PRESETS = {
-    "M": {"NI": 2500, "NJ": 2750, "NK": 3000},
-    "L": {"NI": 7000, "NJ": 7500, "NK": 8000},
-}
-
-
-class BenchGemm:
-    params = (["M", "L"],)
-    param_names = ["preset"]
-    number = 1
-    repeat = 20
-
-    def setup_cache(self):
-        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
-
-    def setup(self, cache, preset):
-        alpha, beta, C, A, B = cache[preset]
-        self.alpha = alpha
-        self.beta = beta
-        self.C = C.copy()  # mutated in-place
-        self.A = A
-        self.B = B
-
-    def time_gemm(self, cache, preset):
-        _kernel(self.alpha, self.beta, self.C, self.A, self.B)
diff --git a/benchmarks/benchmarks/npbench/bench_gemver.py b/benchmarks/benchmarks/npbench/bench_gemver.py
deleted file mode 100644
index c85313ed..00000000
--- a/benchmarks/benchmarks/npbench/bench_gemver.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""npbench wrapper: GEMVER (vector multiplication and matrix addition).
-
-mkl_umath ops: outer.
-
-Preset sizes from npbench bench_info/gemver.json:
-  M: N=3_000
-  L: N=10_000
-
-The kernel mutates A, x, and w in-place, so setup() copies those from cache.
-"""
-
-import numpy as np
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemver/gemver.py
-def _initialize(N, datatype=np.float64):
-    alpha = datatype(1.5)
-    beta = datatype(1.2)
-    fn = datatype(N)
-    A = np.fromfunction(lambda i, j: (i * j % N) / N, (N, N), dtype=datatype)
-    u1 = np.fromfunction(lambda i: i, (N,), dtype=datatype)
-    u2 = np.fromfunction(lambda i: ((i + 1) / fn) / 2.0, (N,), dtype=datatype)
-    v1 = np.fromfunction(lambda i: ((i + 1) / fn) / 4.0, (N,), dtype=datatype)
-    v2 = np.fromfunction(lambda i: ((i + 1) / fn) / 6.0, (N,), dtype=datatype)
-    w = np.zeros((N,), dtype=datatype)
-    x = np.zeros((N,), dtype=datatype)
-    y = np.fromfunction(lambda i: ((i + 1) / fn) / 8.0, (N,), dtype=datatype)
-    z = np.fromfunction(lambda i: ((i + 1) / fn) / 9.0, (N,), dtype=datatype)
-    return alpha, beta, A, u1, v1, u2, v2, w, x, y, z
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemver/gemver_numpy.py
-def _kernel(alpha, beta, A, u1, v1, u2, v2, w, x, y, z):
-    A += np.outer(u1, v1) + np.outer(u2, v2)
-    x += beta * y @ A + z
-    w += alpha * A @ x
-
-
-_PRESETS = {
-    "M": {"N": 3_000},
-    "L": {"N": 10_000},
-}
-
-
-class BenchGemver:
-    params = (["M", "L"],)
-    param_names = ["preset"]
-    number = 1
-    repeat = 20
-
-    def setup_cache(self):
-        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
-
-    def setup(self, cache, preset):
-        alpha, beta, A, u1, v1, u2, v2, w, x, y, z = cache[preset]
-        self.alpha = alpha
-        self.beta = beta
-        self.A = A.copy()  # mutated: A += outer(u1,v1) + outer(u2,v2)
-        self.u1 = u1
-        self.v1 = v1
-        self.u2 = u2
-        self.v2 = v2
-        self.w = w.copy()  # mutated: w += alpha * A @ x
-        self.x = x.copy()  # mutated: x += beta * y @ A + z
-        self.y = y
-        self.z = z
-
-    def time_gemver(self, cache, preset):
-        _kernel(
-            self.alpha,
-            self.beta,
-            self.A,
-            self.u1,
-            self.v1,
-            self.u2,
-            self.v2,
-            self.w,
-            self.x,
-            self.y,
-            self.z,
-        )
diff --git a/benchmarks/benchmarks/npbench/bench_gesummv.py b/benchmarks/benchmarks/npbench/bench_gesummv.py
deleted file mode 100644
index b3c02cd5..00000000
--- a/benchmarks/benchmarks/npbench/bench_gesummv.py
+++ /dev/null
@@ -1,53 +0,0 @@
-"""npbench wrapper: GESUMMV (scalar, vector and matrix multiplication).
-
-mkl_umath ops: matmul.
-
-Preset sizes from npbench bench_info/gesummv.json:
-  M: N=4_000
-  L: N=14_000
-"""
-
-import numpy as np
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gesummv/gesummv.py
-def _initialize(N, datatype=np.float64):
-    alpha = datatype(1.5)
-    beta = datatype(1.2)
-    A = np.fromfunction(
-        lambda i, j: ((i * j + 1) % N) / N, (N, N), dtype=datatype
-    )
-    B = np.fromfunction(
-        lambda i, j: ((i * j + 2) % N) / N, (N, N), dtype=datatype
-    )
-    x = np.fromfunction(lambda i: (i % N) / N, (N,), dtype=datatype)
-    return alpha, beta, A, B, x
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gesummv/gesummv_numpy.py
-def _kernel(alpha, beta, A, B, x):
-    return alpha * A @ x + beta * B @ x
-
-
-_PRESETS = {
-    "M": {"N": 4_000},
-    "L": {"N": 14_000},
-}
-
-
-class BenchGesummv:
-    params = (["M", "L"],)
-    param_names = ["preset"]
-    number = 1
-    repeat = 20
-
-    def setup_cache(self):
-        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
-
-    def setup(self, cache, preset):
-        self.alpha, self.beta, self.A, self.B, self.x = cache[preset]
-
-    def time_gesummv(self, cache, preset):
-        _kernel(self.alpha, self.beta, self.A, self.B, self.x)
diff --git a/benchmarks/benchmarks/npbench/bench_k2mm.py b/benchmarks/benchmarks/npbench/bench_k2mm.py
deleted file mode 100644
index 68143c6a..00000000
--- a/benchmarks/benchmarks/npbench/bench_k2mm.py
+++ /dev/null
@@ -1,65 +0,0 @@
-"""npbench wrapper: 2MM (two matrix multiplications) — mkl_umath ops: matmul.
-
-Preset sizes from npbench bench_info/k2mm.json:
-  M: NI=2000, NJ=2250, NK=2500, NL=2750
-  L: NI=6000, NJ=6500, NK=7000, NL=7500
-
-The kernel mutates D in-place (D[:] = alpha * A @ B @ C + beta * D), so
-setup() copies D from cache before each timing round.
-"""
-
-import numpy as np
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k2mm/k2mm.py
-def _initialize(NI, NJ, NK, NL, datatype=np.float64):
-    alpha = datatype(1.5)
-    beta = datatype(1.2)
-    A = np.fromfunction(
-        lambda i, j: ((i * j + 1) % NI) / NI, (NI, NK), dtype=datatype
-    )
-    B = np.fromfunction(
-        lambda i, j: (i * (j + 1) % NJ) / NJ, (NK, NJ), dtype=datatype
-    )
-    C = np.fromfunction(
-        lambda i, j: ((i * (j + 3) + 1) % NL) / NL, (NJ, NL), dtype=datatype
-    )
-    D = np.fromfunction(
-        lambda i, j: (i * (j + 2) % NK) / NK, (NI, NL), dtype=datatype
-    )
-    return alpha, beta, A, B, C, D
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k2mm/k2mm_numpy.py
-def _kernel(alpha, beta, A, B, C, D):
-    D[:] = alpha * A @ B @ C + beta * D
-
-
-_PRESETS = {
-    "M": {"NI": 2000, "NJ": 2250, "NK": 2500, "NL": 2750},
-    "L": {"NI": 6000, "NJ": 6500, "NK": 7000, "NL": 7500},
-}
-
-
-class BenchK2mm:
-    params = (["M", "L"],)
-    param_names = ["preset"]
-    number = 1
-    repeat = 20
-
-    def setup_cache(self):
-        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
-
-    def setup(self, cache, preset):
-        alpha, beta, A, B, C, D = cache[preset]
-        self.alpha = alpha
-        self.beta = beta
-        self.A = A
-        self.B = B
-        self.C = C
-        self.D = D.copy()  # mutated in-place
-
-    def time_k2mm(self, cache, preset):
-        _kernel(self.alpha, self.beta, self.A, self.B, self.C, self.D)
diff --git a/benchmarks/benchmarks/npbench/bench_k3mm.py b/benchmarks/benchmarks/npbench/bench_k3mm.py
deleted file mode 100644
index 5211fdaf..00000000
--- a/benchmarks/benchmarks/npbench/bench_k3mm.py
+++ /dev/null
@@ -1,58 +0,0 @@
-"""npbench wrapper: 3MM (three matrix multiplications) — mkl_umath ops: matmul.
-
-Preset sizes from npbench bench_info/k3mm.json:
-  M: NI=2000, NJ=2200, NK=2400, NL=2600, NM=2800
-  L: NI=5500, NJ=6000, NK=6500, NL=7000, NM=7500
-"""
-
-import numpy as np
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k3mm/k3mm.py
-def _initialize(NI, NJ, NK, NL, NM, datatype=np.float64):
-    A = np.fromfunction(
-        lambda i, j: ((i * j + 1) % NI) / (5 * NI), (NI, NK), dtype=datatype
-    )
-    B = np.fromfunction(
-        lambda i, j: ((i * (j + 1) + 2) % NJ) / (5 * NJ),
-        (NK, NJ),
-        dtype=datatype,
-    )
-    C = np.fromfunction(
-        lambda i, j: (i * (j + 3) % NL) / (5 * NL), (NJ, NM), dtype=datatype
-    )
-    D = np.fromfunction(
-        lambda i, j: ((i * (j + 2) + 2) % NK) / (5 * NK),
-        (NM, NL),
-        dtype=datatype,
-    )
-    return A, B, C, D
-
-
-# Inlined from spcl/npbench @ main
-# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k3mm/k3mm_numpy.py
-def _kernel(A, B, C, D):
-    return A @ B @ C @ D
-
-
-_PRESETS = {
-    "M": {"NI": 2000, "NJ": 2200, "NK": 2400, "NL": 2600, "NM": 2800},
-    "L": {"NI": 5500, "NJ": 6000, "NK": 6500, "NL": 7000, "NM": 7500},
-}
-
-
-class BenchK3mm:
-    params = (["M", "L"],)
-    param_names = ["preset"]
-    number = 1
-    repeat = 20
-
-    def setup_cache(self):
-        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
-
-    def setup(self, cache, preset):
-        self.A, self.B, self.C, self.D = cache[preset]
-
-    def time_k3mm(self, cache, preset):
-        _kernel(self.A, self.B, self.C, self.D)

From 8555438c08e0739509ed535490170c2acdf286f0 Mon Sep 17 00:00:00 2001
From: vchamarthi <venkata.sai.gireesh.chamarthi@intel.com>
Date: Wed, 20 May 2026 13:40:39 -0500
Subject: [PATCH 5/7] PR fixes

---
 benchmarks/README.md                       | 34 +++++++++++++++++
 benchmarks/benchmarks/__init__.py          | 43 +++++++++++++++++++++-
 benchmarks/benchmarks/micro/bench_micro.py |  1 -
 3 files changed, 76 insertions(+), 2 deletions(-)
 create mode 100644 benchmarks/README.md

diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 00000000..2016345a
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,34 @@
+# mkl_umath ASV Benchmarks
+
+Performance benchmarks for [mkl_umath](https://github.com/IntelPython/mkl_umath) using [Airspeed Velocity (ASV)](https://asv.readthedocs.io/en/stable/).
+
+The `npbench/` suite uses kernels from [npbench](https://github.com/spcl/npbench) to measure the end-to-end impact of MKL ufunc acceleration in realistic workloads.
+
+## Coverage
+
+| File | Ufuncs | Dtypes | Sizes/Presets |
+|------|--------|--------|---------------|
+| `micro/bench_micro.py` | 24 unary (`exp`, `log`, `sin`, `cos`, `sqrt`, `cbrt`, etc.) + `arctan2`, `power` | float32, float64 | 10k, 100k, 1M |
+| `npbench/bench_softmax.py` | `exp`, `max`, `sum` | float32 | M (32x8x256x256), L (64x16x448x448) |
+| `npbench/bench_arc_distance.py` | `sin`, `cos`, `arctan2`, `sqrt` | float64 | M (1M), L (10M) |
+| `npbench/bench_go_fast.py` | `tanh` | float64 | M (6k x 6k), L (20k x 20k) |
+| `npbench/bench_mandelbrot.py` | `abs`, `multiply`, `add` | complex128 | M (250/500), L (833/1000) |
+
+## Threading
+
+Set `MKL_NUM_THREADS` in the environment before running ASV to control the thread count used by MKL:
+
+```bash
+MKL_NUM_THREADS=8 asv run --python=same --quick HEAD^!
+```
+
+If `MKL_NUM_THREADS` is not set, `__init__.py` applies a default: **4** threads when the machine has 4 or more physical cores, or **1** (single-threaded) otherwise. This keeps results comparable across CI machines in the shared pool regardless of their total core count. Physical cores are detected via `psutil.cpu_count(logical=False)` (hyperthreads excluded per MKL recommendation).
+
+## Quick Start
+
+```bash
+cd benchmarks
+asv run --python=same --quick HEAD^!   # time the current commit
+asv compare main HEAD                  # compare against main
+asv publish && asv preview             # view HTML report locally
+```
diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
index f7739a93..dd9cbfb1 100644
--- a/benchmarks/benchmarks/__init__.py
+++ b/benchmarks/benchmarks/__init__.py
@@ -1,4 +1,45 @@
+"""ASV benchmarks for mkl_umath.
+
+Thread control — design rationale
+----------------------------------
+Since we do not have a dedicated CI benchmark machine, benchmarks run
+on a shared CI pool whose machines vary in core count over time.
+Using the full physical core count of each machine would make results
+incomparable across runs on different machines.
+
+Strategy:
+  - Physical cores >= 4  →  fix MKL_NUM_THREADS = 4
+      4 is the lowest common denominator that guarantees multi-threaded MKL
+      behavior and is achievable on any modern CI machine.  Results from
+      different machines in the pool are therefore directly comparable.
+  - Physical cores < 4   →  fall back to MKL_NUM_THREADS = 1 (single-threaded)
+      Prevents over-subscription on under-resourced machines and avoids
+      misleading comparisons against 4-thread baselines.
+
+MKL recommendation: use physical cores, not logical (hyperthreaded) CPUs.
+"""
+
+import os
+
+import psutil
+
 from ._patch_setup import _apply_patches
 
+_MIN_THREADS = 4  # minimum physical cores required for multi-threaded mode
+
+
+def _physical_cores():
+    """Return physical core count; fall back to 1 (conservative)."""
+    return psutil.cpu_count(logical=False) or 1
+
+
+def _thread_count():
+    physical = _physical_cores()
+    return str(_MIN_THREADS) if physical >= _MIN_THREADS else "1"
+
+
+_THREADS = os.environ.get("MKL_NUM_THREADS", _thread_count())
+os.environ["MKL_NUM_THREADS"] = _THREADS
+
 _apply_patches()
-del _apply_patches
\ No newline at end of file
+del _apply_patches
diff --git a/benchmarks/benchmarks/micro/bench_micro.py b/benchmarks/benchmarks/micro/bench_micro.py
index 381c823c..1d6e4bb8 100644
--- a/benchmarks/benchmarks/micro/bench_micro.py
+++ b/benchmarks/benchmarks/micro/bench_micro.py
@@ -10,7 +10,6 @@
 
 import numpy as np
 
-
 _UFUNC_CONFIGS = {
     "exp": {"func": np.exp, "low": -10.0, "high": 10.0},
     "exp2": {"func": np.exp2, "low": -10.0, "high": 10.0},

From aaf93842f27e07658ed9762acdbd95b978eb5349 Mon Sep 17 00:00:00 2001
From: vchamarthi <venkata.sai.gireesh.chamarthi@intel.com>
Date: Wed, 20 May 2026 13:43:21 -0500
Subject: [PATCH 6/7] PR suggestions

---
 benchmarks/benchmarks/__init__.py | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
index dd9cbfb1..665b2e16 100644
--- a/benchmarks/benchmarks/__init__.py
+++ b/benchmarks/benchmarks/__init__.py
@@ -1,23 +1,4 @@
-"""ASV benchmarks for mkl_umath.
-
-Thread control — design rationale
-----------------------------------
-Since we do not have a dedicated CI benchmark machine, benchmarks run
-on a shared CI pool whose machines vary in core count over time.
-Using the full physical core count of each machine would make results
-incomparable across runs on different machines.
-
-Strategy:
-  - Physical cores >= 4  →  fix MKL_NUM_THREADS = 4
-      4 is the lowest common denominator that guarantees multi-threaded MKL
-      behavior and is achievable on any modern CI machine.  Results from
-      different machines in the pool are therefore directly comparable.
-  - Physical cores < 4   →  fall back to MKL_NUM_THREADS = 1 (single-threaded)
-      Prevents over-subscription on under-resourced machines and avoids
-      misleading comparisons against 4-thread baselines.
-
-MKL recommendation: use physical cores, not logical (hyperthreaded) CPUs.
-"""
+"""ASV benchmarks for mkl_umath"""
 
 import os
 

From f07adb36892eb00a254de3023278e9df8130adc8 Mon Sep 17 00:00:00 2001
From: vchamarthi <venkata.sai.gireesh.chamarthi@intel.com>
Date: Wed, 20 May 2026 15:23:56 -0500
Subject: [PATCH 7/7] Improve readme

---
 benchmarks/README.md | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 2016345a..2da9bf01 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -14,21 +14,39 @@ The `npbench/` suite uses kernels from [npbench](https://github.com/spcl/npbench
 | `npbench/bench_go_fast.py` | `tanh` | float64 | M (6k x 6k), L (20k x 20k) |
 | `npbench/bench_mandelbrot.py` | `abs`, `multiply`, `add` | complex128 | M (250/500), L (833/1000) |
 
-## Threading
+## Running Benchmarks
 
-Set `MKL_NUM_THREADS` in the environment before running ASV to control the thread count used by MKL:
+Prerequisites:
 
 ```bash
-MKL_NUM_THREADS=8 asv run --python=same --quick HEAD^!
+pip install asv psutil
 ```
 
-If `MKL_NUM_THREADS` is not set, `__init__.py` applies a default: **4** threads when the machine has 4 or more physical cores, or **1** (single-threaded) otherwise. This keeps results comparable across CI machines in the shared pool regardless of their total core count. Physical cores are detected via `psutil.cpu_count(logical=False)` (hyperthreads excluded per MKL recommendation).
+Run benchmarks against the current commit (run all `asv` commands from the `benchmarks/` directory, where `asv.conf.json` lives):
+
+```bash
+asv run --python=same --quick HEAD^!
+```
+
+Compare two commits:
+
+```bash
+asv continuous --python=same HEAD~1 HEAD
+```
 
-## Quick Start
+View results in a browser:
 
 ```bash
-cd benchmarks
-asv run --python=same --quick HEAD^!   # time the current commit
-asv compare main HEAD                  # compare against main
-asv publish && asv preview             # view HTML report locally
+asv publish
+asv preview
 ```
+
+## Threading
+
+Set `MKL_NUM_THREADS` to control the thread count used by MKL:
+
+```bash
+MKL_NUM_THREADS=8 asv run --python=same --quick HEAD^!
+```
+
+If `MKL_NUM_THREADS` is not set, `benchmarks/__init__.py` applies a default: **4** threads when the machine has 4 or more physical cores, or **1** (single-threaded) otherwise. This keeps results comparable across CI machines in the shared pool regardless of their total core count. Physical cores are detected via `psutil.cpu_count(logical=False)` (hyperthreads excluded per MKL recommendation); if the physical core count cannot be determined, it is treated as 1.