From b6b4489a38c40a8d02aec7a7a3e34372984c261a Mon Sep 17 00:00:00 2001 From: vchamarthi Date: Mon, 4 May 2026 08:08:15 -0500 Subject: [PATCH 1/7] initial commit --- benchmarks/asv.conf.json | 13 ++ benchmarks/benchmarks/__init__.py | 4 + benchmarks/benchmarks/_patch_setup.py | 74 ++++++++++ benchmarks/benchmarks/micro/__init__.py | 0 benchmarks/benchmarks/micro/bench_exp_log.py | 98 +++++++++++++ .../benchmarks/micro/bench_sqrt_misc.py | 84 +++++++++++ benchmarks/benchmarks/micro/bench_trig.py | 134 ++++++++++++++++++ benchmarks/benchmarks/npbench/__init__.py | 0 .../benchmarks/npbench/bench_arc_distance.py | 50 +++++++ .../benchmarks/npbench/bench_cholesky2.py | 54 +++++++ .../benchmarks/npbench/bench_correlation.py | 56 ++++++++ .../benchmarks/npbench/bench_covariance.py | 53 +++++++ .../benchmarks/npbench/bench_deriche.py | 111 +++++++++++++++ .../benchmarks/npbench/bench_doitgen.py | 52 +++++++ benchmarks/benchmarks/npbench/bench_gemm.py | 53 +++++++ benchmarks/benchmarks/npbench/bench_gemver.py | 71 ++++++++++ .../benchmarks/npbench/bench_gesummv.py | 45 ++++++ .../benchmarks/npbench/bench_go_fast.py | 69 +++++++++ benchmarks/benchmarks/npbench/bench_k2mm.py | 55 +++++++ benchmarks/benchmarks/npbench/bench_k3mm.py | 44 ++++++ .../benchmarks/npbench/bench_mandelbrot.py | 93 ++++++++++++ .../benchmarks/npbench/bench_softmax.py | 48 +++++++ benchmarks/bootstrap-dashboard-branch.sh | 40 ++++++ 23 files changed, 1301 insertions(+) create mode 100644 benchmarks/asv.conf.json create mode 100644 benchmarks/benchmarks/__init__.py create mode 100644 benchmarks/benchmarks/_patch_setup.py create mode 100644 benchmarks/benchmarks/micro/__init__.py create mode 100644 benchmarks/benchmarks/micro/bench_exp_log.py create mode 100644 benchmarks/benchmarks/micro/bench_sqrt_misc.py create mode 100644 benchmarks/benchmarks/micro/bench_trig.py create mode 100644 benchmarks/benchmarks/npbench/__init__.py create mode 100644 benchmarks/benchmarks/npbench/bench_arc_distance.py create 
mode 100644 benchmarks/benchmarks/npbench/bench_cholesky2.py create mode 100644 benchmarks/benchmarks/npbench/bench_correlation.py create mode 100644 benchmarks/benchmarks/npbench/bench_covariance.py create mode 100644 benchmarks/benchmarks/npbench/bench_deriche.py create mode 100644 benchmarks/benchmarks/npbench/bench_doitgen.py create mode 100644 benchmarks/benchmarks/npbench/bench_gemm.py create mode 100644 benchmarks/benchmarks/npbench/bench_gemver.py create mode 100644 benchmarks/benchmarks/npbench/bench_gesummv.py create mode 100644 benchmarks/benchmarks/npbench/bench_go_fast.py create mode 100644 benchmarks/benchmarks/npbench/bench_k2mm.py create mode 100644 benchmarks/benchmarks/npbench/bench_k3mm.py create mode 100644 benchmarks/benchmarks/npbench/bench_mandelbrot.py create mode 100644 benchmarks/benchmarks/npbench/bench_softmax.py create mode 100644 benchmarks/bootstrap-dashboard-branch.sh diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json new file mode 100644 index 00000000..facb1284 --- /dev/null +++ b/benchmarks/asv.conf.json @@ -0,0 +1,13 @@ +{ + "version": 1, + "project": "mkl_umath", + "project_url": "https://github.com/IntelPython/mkl_umath", + "repo": "..", + "branches": ["main"], + "environment_type": "existing", + "benchmark_dir": "benchmarks", + "env_dir": ".asv/env", + "results_dir": ".asv/results", + "html_dir": ".asv/html", + "show_commit_url": "https://github.com/IntelPython/mkl_umath/commit/" +} diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py new file mode 100644 index 00000000..bcf027e4 --- /dev/null +++ b/benchmarks/benchmarks/__init__.py @@ -0,0 +1,4 @@ +# Trigger MKL patching once per ASV worker process. +# ASV uses --launch-method spawn in CI, so each worker is a fresh process +# and this runs exactly once before any benchmark is collected or timed. +from . 
import _patch_setup # noqa: F401 diff --git a/benchmarks/benchmarks/_patch_setup.py b/benchmarks/benchmarks/_patch_setup.py new file mode 100644 index 00000000..f7435c61 --- /dev/null +++ b/benchmarks/benchmarks/_patch_setup.py @@ -0,0 +1,74 @@ +"""MKL patch setup — executed once per ASV worker process at import time. + +Patches NumPy with Intel MKL implementations for fft, random, and umath. +Hard-fails with a descriptive RuntimeError if any package is missing or the +patch does not take effect, so benchmarks never silently run on stock NumPy. + +Visible output goes to stderr; pass --show-stderr to ``asv run`` to see it. +""" + +import sys + +_PATCH_MAP = [ + ("mkl_fft", "patch_numpy_fft"), + ("mkl_random", "patch_numpy_random"), + ("mkl_umath", "patch_numpy_umath"), +] + + +def _apply_patches(): + patched = {} + + for mod_name, patch_fn_name in _PATCH_MAP: + try: + mod = __import__(mod_name) + except ImportError as exc: + raise RuntimeError( + f"[mkl-patch] Cannot import {mod_name}: {exc}\n" + f" Ensure the conda env contains {mod_name} from the Intel channel.\n" + f" Required channels: https://software.repos.intel.com/python/conda" + ) from exc + + patch_fn = getattr(mod, patch_fn_name, None) + if patch_fn is None: + raise RuntimeError( + f"[mkl-patch] {mod_name} has no {patch_fn_name}(). " + f"Upgrade {mod_name} to a version that exposes the stock-numpy patch API." + ) + + try: + patch_fn() + except Exception as exc: + raise RuntimeError( + f"[mkl-patch] {mod_name}.{patch_fn_name}() raised: {exc!r}" + ) from exc + + is_patched_fn = getattr(mod, "is_patched", None) + if callable(is_patched_fn) and not is_patched_fn(): + raise RuntimeError( + f"[mkl-patch] {mod_name}.is_patched() returned False after patching. " + f"NumPy may have been imported before patching in a conflicting state." 
+ ) + + patched[mod_name] = mod + + # Verbose attribution — verify numpy-level dispatch changed hands + import numpy as np + + _attr_checks = { + "mkl_fft": lambda: np.fft.fft.__module__, + "mkl_random": lambda: np.random.random.__module__, + "mkl_umath": lambda: np.exp.__module__, + } + for mod_name in patched: + try: + attr = _attr_checks[mod_name]() + except Exception: + attr = "unknown" + sys.stderr.write(f"[mkl-patch] {mod_name}: numpy dispatch → {attr}\n") + + sys.stderr.write("[mkl-patch] ALL OK — mkl_fft, mkl_random, mkl_umath active\n") + sys.stderr.flush() + + +_apply_patches() diff --git a/benchmarks/benchmarks/micro/__init__.py b/benchmarks/benchmarks/micro/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/benchmarks/micro/bench_exp_log.py b/benchmarks/benchmarks/micro/bench_exp_log.py new file mode 100644 index 00000000..216fc740 --- /dev/null +++ b/benchmarks/benchmarks/micro/bench_exp_log.py @@ -0,0 +1,98 @@ +"""Micro-benchmarks for mkl_umath exponential and logarithm ufuncs. + +Each class times a single ufunc over a Cartesian product of + dtype ∈ [float32, float64] + size ∈ [10_000, 100_000, 1_000_000] + +Arrays are pre-allocated in setup() and reused across timing calls. +Patching is applied once at package import via benchmarks._patch_setup. 
+""" + +import numpy as np + + +class BenchExp: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + # float32 overflows exp around 88.7; use [-10, 10] safe for both dtypes + rng = np.random.default_rng(42) + self.x = rng.uniform(-10.0, 10.0, size).astype(dtype) + + def time_exp(self, dtype, size): + np.exp(self.x) + + +class BenchExp2: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + # float32 overflows exp2 around 127 + rng = np.random.default_rng(42) + self.x = rng.uniform(-10.0, 10.0, size).astype(dtype) + + def time_exp2(self, dtype, size): + np.exp2(self.x) + + +class BenchExpm1: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(-10.0, 10.0, size).astype(dtype) + + def time_expm1(self, dtype, size): + np.expm1(self.x) + + +class BenchLog: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(1e-3, 1e3, size).astype(dtype) + + def time_log(self, dtype, size): + np.log(self.x) + + +class BenchLog2: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(1e-3, 1e3, size).astype(dtype) + + def time_log2(self, dtype, size): + np.log2(self.x) + + +class BenchLog10: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(1e-3, 1e3, size).astype(dtype) + + def time_log10(self, dtype, size): + np.log10(self.x) + + +class BenchLog1p: + params = (["float32", 
"float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + # log1p(x) is defined for x > -1; use [0, 10] which is always safe + rng = np.random.default_rng(42) + self.x = rng.uniform(0.0, 10.0, size).astype(dtype) + + def time_log1p(self, dtype, size): + np.log1p(self.x) diff --git a/benchmarks/benchmarks/micro/bench_sqrt_misc.py b/benchmarks/benchmarks/micro/bench_sqrt_misc.py new file mode 100644 index 00000000..b1170639 --- /dev/null +++ b/benchmarks/benchmarks/micro/bench_sqrt_misc.py @@ -0,0 +1,84 @@ +"""Micro-benchmarks for mkl_umath sqrt, cbrt, and miscellaneous ufuncs. + +Each class times a single ufunc over a Cartesian product of + dtype ∈ [float32, float64] + size ∈ [10_000, 100_000, 1_000_000] + +Arrays are pre-allocated in setup() and reused across timing calls. +Patching is applied once at package import via benchmarks._patch_setup. +""" + +import numpy as np + + +class BenchSqrt: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(0.0, 100.0, size).astype(dtype) + + def time_sqrt(self, dtype, size): + np.sqrt(self.x) + + +class BenchCbrt: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(-100.0, 100.0, size).astype(dtype) + + def time_cbrt(self, dtype, size): + np.cbrt(self.x) + + +class BenchSquare: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(-10.0, 10.0, size).astype(dtype) + + def time_square(self, dtype, size): + np.square(self.x) + + +class BenchFabs: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, 
size): + rng = np.random.default_rng(42) + self.x = rng.uniform(-100.0, 100.0, size).astype(dtype) + + def time_fabs(self, dtype, size): + np.fabs(self.x) + + +class BenchAbsolute: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(-100.0, 100.0, size).astype(dtype) + + def time_absolute(self, dtype, size): + np.absolute(self.x) + + +class BenchReciprocal: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + # Avoid values near zero to prevent inf results dominating timing + rng = np.random.default_rng(42) + self.x = rng.uniform(0.01, 100.0, size).astype(dtype) + + def time_reciprocal(self, dtype, size): + np.reciprocal(self.x) diff --git a/benchmarks/benchmarks/micro/bench_trig.py b/benchmarks/benchmarks/micro/bench_trig.py new file mode 100644 index 00000000..eb09b9c6 --- /dev/null +++ b/benchmarks/benchmarks/micro/bench_trig.py @@ -0,0 +1,134 @@ +"""Micro-benchmarks for mkl_umath trigonometric ufuncs. + +Each class times a single ufunc over a Cartesian product of + dtype ∈ [float32, float64] + size ∈ [10_000, 100_000, 1_000_000] + +Arrays are pre-allocated in setup() and reused across timing calls. +Patching is applied once at package import via benchmarks._patch_setup. 
+""" + +import numpy as np + + +class BenchSin: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(-np.pi, np.pi, size).astype(dtype) + + def time_sin(self, dtype, size): + np.sin(self.x) + + +class BenchCos: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(-np.pi, np.pi, size).astype(dtype) + + def time_cos(self, dtype, size): + np.cos(self.x) + + +class BenchTan: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + # Avoid values near ±π/2 where tan diverges + rng = np.random.default_rng(42) + self.x = rng.uniform(-1.4, 1.4, size).astype(dtype) + + def time_tan(self, dtype, size): + np.tan(self.x) + + +class BenchArcsin: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(-1.0, 1.0, size).astype(dtype) + + def time_arcsin(self, dtype, size): + np.arcsin(self.x) + + +class BenchArccos: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(-1.0, 1.0, size).astype(dtype) + + def time_arccos(self, dtype, size): + np.arccos(self.x) + + +class BenchArctan: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(-10.0, 10.0, size).astype(dtype) + + def time_arctan(self, dtype, size): + np.arctan(self.x) + + +class BenchArctan2: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = 
["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.y = rng.uniform(-1.0, 1.0, size).astype(dtype) + self.x = rng.uniform(-1.0, 1.0, size).astype(dtype) + + def time_arctan2(self, dtype, size): + np.arctan2(self.y, self.x) + + +class BenchSinh: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + # float32 overflows sinh around ±89; keep well inside that + rng = np.random.default_rng(42) + self.x = rng.uniform(-5.0, 5.0, size).astype(dtype) + + def time_sinh(self, dtype, size): + np.sinh(self.x) + + +class BenchCosh: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(-5.0, 5.0, size).astype(dtype) + + def time_cosh(self, dtype, size): + np.cosh(self.x) + + +class BenchTanh: + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.x = rng.uniform(-5.0, 5.0, size).astype(dtype) + + def time_tanh(self, dtype, size): + np.tanh(self.x) diff --git a/benchmarks/benchmarks/npbench/__init__.py b/benchmarks/benchmarks/npbench/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/benchmarks/npbench/bench_arc_distance.py b/benchmarks/benchmarks/npbench/bench_arc_distance.py new file mode 100644 index 00000000..d8039649 --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_arc_distance.py @@ -0,0 +1,50 @@ +"""npbench wrapper: Arc Distance — mkl_umath ops: sin, cos, arctan2, sqrt. 
+ +Preset sizes from npbench bench_info/arc_distance.json: + S: N=100_000 + L: N=10_000_000 +""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance.py +def _initialize(N): + from numpy.random import default_rng + rng = default_rng(42) + t0 = rng.random((N,)) + p0 = rng.random((N,)) + t1 = rng.random((N,)) + p1 = rng.random((N,)) + return t0, p0, t1, p1 + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance_numpy.py +def _arc_distance(theta_1, phi_1, theta_2, phi_2): + temp = ( + np.sin((theta_2 - theta_1) / 2) ** 2 + + np.cos(theta_1) * np.cos(theta_2) * np.sin((phi_2 - phi_1) / 2) ** 2 + ) + return 2 * np.arctan2(np.sqrt(temp), np.sqrt(1 - temp)) + + +_PRESETS = { + "S": {"N": 100_000}, + "L": {"N": 10_000_000}, +} + + +class BenchArcDistance: + params = (["S", "L"],) + param_names = ["preset"] + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + self.theta_1, self.phi_1, self.theta_2, self.phi_2 = cache[preset] + + def time_arc_distance(self, cache, preset): + _arc_distance(self.theta_1, self.phi_1, self.theta_2, self.phi_2) diff --git a/benchmarks/benchmarks/npbench/bench_cholesky2.py b/benchmarks/benchmarks/npbench/bench_cholesky2.py new file mode 100644 index 00000000..ea095122 --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_cholesky2.py @@ -0,0 +1,57 @@ +"""npbench wrapper: Cholesky decomposition v2 — mkl_umath ops: linalg.cholesky. + +Preset sizes from npbench bench_info/cholesky2.json: + S: N=1000 + L: N=8000 + +The kernel mutates A in-place (A[:] = cholesky(A) + triu(A, k=1)), so +setup() copies A from cache before each timing round. + +The initialization constructs a symmetric positive-definite matrix via A @ A^T, +which is expensive at N=8000. setup_cache() runs this once per commit.
+""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/cholesky2/cholesky2.py +def _initialize(N, datatype=np.float64): + A = np.empty((N, N), dtype=datatype) + for i in range(N): + A[i, :i + 1] = np.fromfunction( + lambda j: (-j % N) / N + 1, (i + 1,), dtype=datatype + ) + A[i, i + 1:] = 0.0 + A[i, i] = 1.0 + A[:] = A @ np.transpose(A) + return A + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/cholesky2/cholesky2_numpy.py +def _kernel(A): + A[:] = np.linalg.cholesky(A) + np.triu(A, k=1) + + +_PRESETS = { + "S": {"N": 1000}, + "L": {"N": 8000}, +} + + +class BenchCholesky2: + params = (["S", "L"],) + param_names = ["preset"] + # ASV re-runs setup() once per sample, not before each iteration within + # it, so pin one iteration per sample to time only a freshly copied input. + number = 1 + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + self.A = cache[preset].copy() # kernel mutates A in-place + + def time_cholesky2(self, cache, preset): + _kernel(self.A) diff --git a/benchmarks/benchmarks/npbench/bench_correlation.py b/benchmarks/benchmarks/npbench/bench_correlation.py new file mode 100644 index 00000000..c5c7471d --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_correlation.py @@ -0,0 +1,59 @@ +"""npbench wrapper: Correlation — mkl_umath ops: sqrt, std, mean. + +Preset sizes from npbench bench_info/correlation.json: + S: M=500, N=600 + L: M=3200, N=4000 + +The kernel mutates ``data`` in-place (data -= mean; data /= ...), so +setup() copies from the cache before each timing round.
+""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/correlation/correlation.py +def _initialize(M, N, datatype=np.float64): + float_n = datatype(N) + data = np.fromfunction(lambda i, j: (i * j) / M + i, (N, M), dtype=datatype) + return float_n, data + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/correlation/correlation_numpy.py +def _kernel(M, float_n, data): + mean = np.mean(data, axis=0) + stddev = np.std(data, axis=0) + stddev[stddev <= 0.1] = 1.0 + data -= mean + data /= np.sqrt(float_n) * stddev + corr = np.eye(M, dtype=data.dtype) + for i in range(M - 1): + corr[i + 1:M, i] = corr[i, i + 1:M] = data[:, i] @ data[:, i + 1:M] + return corr + + +_PRESETS = { + "S": {"M": 500, "N": 600}, + "L": {"M": 3200, "N": 4000}, +} + + +class BenchCorrelation: + params = (["S", "L"],) + param_names = ["preset"] + # ASV re-runs setup() once per sample, not before each iteration within + # it, so pin one iteration per sample to time only a freshly copied input. + number = 1 + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + float_n, data = cache[preset] + self.M = _PRESETS[preset]["M"] + self.float_n = float_n + self.data = data.copy() # kernel mutates data in-place + + def time_correlation(self, cache, preset): + _kernel(self.M, self.float_n, self.data) diff --git a/benchmarks/benchmarks/npbench/bench_covariance.py b/benchmarks/benchmarks/npbench/bench_covariance.py new file mode 100644 index 00000000..bc541c5b --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_covariance.py @@ -0,0 +1,56 @@ +"""npbench wrapper: Covariance — mkl_umath ops: mean. + +Preset sizes from npbench bench_info/covariance.json: + S: M=500, N=600 + L: M=3200, N=4000 + +The kernel mutates ``data`` in-place (data -= mean), so setup() copies +from the cache before each timing round.
+""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/covariance/covariance.py +def _initialize(M, N, datatype=np.float64): + float_n = datatype(N) + data = np.fromfunction(lambda i, j: (i * j) / M, (N, M), dtype=datatype) + return float_n, data + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/covariance/covariance_numpy.py +def _kernel(M, float_n, data): + mean = np.mean(data, axis=0) + data -= mean + cov = np.zeros((M, M), dtype=data.dtype) + for i in range(M): + cov[i:M, i] = cov[i, i:M] = data[:, i] @ data[:, i:M] / (float_n - 1.0) + return cov + + +_PRESETS = { + "S": {"M": 500, "N": 600}, + "L": {"M": 3200, "N": 4000}, +} + + +class BenchCovariance: + params = (["S", "L"],) + param_names = ["preset"] + # ASV re-runs setup() once per sample, not before each iteration within + # it, so pin one iteration per sample to time only a freshly copied input. + number = 1 + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + float_n, data = cache[preset] + self.M = _PRESETS[preset]["M"] + self.float_n = float_n + self.data = data.copy() # kernel mutates data in-place + + def time_covariance(self, cache, preset): + _kernel(self.M, self.float_n, self.data) diff --git a/benchmarks/benchmarks/npbench/bench_deriche.py b/benchmarks/benchmarks/npbench/bench_deriche.py new file mode 100644 index 00000000..4cb93f1e --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_deriche.py @@ -0,0 +1,111 @@ +"""npbench wrapper: Deriche Edge Detector — mkl_umath ops: exp. + +Preset sizes from npbench bench_info/deriche.json: + S: W=400, H=200 + L: W=6000, H=3000 + +Warning: this kernel contains Python for-loops over rows/columns. +At the L preset the Python loops dominate runtime; exp() calls on scalar +floats are measured, not vectorised MKL VM throughput. The L preset is +included for historical comparability with npbench runs.
+""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/deriche/deriche.py +def _initialize(W, H, datatype=np.float64): + alpha = datatype(0.25) + imgIn = np.fromfunction( + lambda i, j: ((313 * i + 991 * j) % 65536) / 65535.0, + (W, H), + dtype=datatype, + ) + return alpha, imgIn + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/deriche/deriche_numpy.py +def _kernel(alpha, imgIn): + k = ( + (1.0 - np.exp(-alpha)) + * (1.0 - np.exp(-alpha)) + / (1.0 + alpha * np.exp(-alpha) - np.exp(2.0 * alpha)) + ) + a1 = a5 = k + a2 = a6 = k * np.exp(-alpha) * (alpha - 1.0) + a3 = a7 = k * np.exp(-alpha) * (alpha + 1.0) + a4 = a8 = -k * np.exp(-2.0 * alpha) + b1 = 2.0 ** (-alpha) + b2 = -np.exp(-2.0 * alpha) + c1 = c2 = 1 + + y1 = np.empty_like(imgIn) + y1[:, 0] = a1 * imgIn[:, 0] + y1[:, 1] = a1 * imgIn[:, 1] + a2 * imgIn[:, 0] + b1 * y1[:, 0] + for j in range(2, imgIn.shape[1]): + y1[:, j] = ( + a1 * imgIn[:, j] + + a2 * imgIn[:, j - 1] + + b1 * y1[:, j - 1] + + b2 * y1[:, j - 2] + ) + + y2 = np.empty_like(imgIn) + y2[:, -1] = 0.0 + y2[:, -2] = a3 * imgIn[:, -1] + for j in range(imgIn.shape[1] - 3, -1, -1): + y2[:, j] = ( + a3 * imgIn[:, j + 1] + + a4 * imgIn[:, j + 2] + + b1 * y2[:, j + 1] + + b2 * y2[:, j + 2] + ) + + imgOut = c1 * (y1 + y2) + + y1[0, :] = a5 * imgOut[0, :] + y1[1, :] = a5 * imgOut[1, :] + a6 * imgOut[0, :] + b1 * y1[0, :] + for i in range(2, imgIn.shape[0]): + y1[i, :] = ( + a5 * imgOut[i, :] + + a6 * imgOut[i - 1, :] + + b1 * y1[i - 1, :] + + b2 * y1[i - 2, :] + ) + + y2[-1, :] = 0.0 + y2[-2, :] = a7 * imgOut[-1, :] + for i in range(imgIn.shape[0] - 3, -1, -1): + y2[i, :] = ( + a7 * imgOut[i + 1, :] + + a8 * imgOut[i + 2, :] + + b1 * y2[i + 1, :] + + b2 * y2[i + 2, :] + ) + + return c2 * (y1 + y2) + + +_PRESETS = { + "S": {"W": 400, "H": 200}, + "L": {"W": 6000, "H": 3000}, +} + + +class BenchDeriche: + # L 
preset has Python loops over 6000 rows — allow extra time + timeout = 600 + + params = (["S", "L"],) + param_names = ["preset"] + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + self.alpha, self.imgIn = cache[preset] + + def time_deriche(self, cache, preset): + _kernel(self.alpha, self.imgIn) diff --git a/benchmarks/benchmarks/npbench/bench_doitgen.py b/benchmarks/benchmarks/npbench/bench_doitgen.py new file mode 100644 index 00000000..86467424 --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_doitgen.py @@ -0,0 +1,55 @@ +"""npbench wrapper: Doitgen (multiresolution analysis) — mkl_umath ops: matmul. + +Preset sizes from npbench bench_info/doitgen.json: + S: NR=60, NQ=60, NP=128 + L: NR=220, NQ=250, NP=512 + +The kernel mutates ``A`` in-place (A[:] = ...), so setup() copies from cache. +""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/doitgen/doitgen.py +def _initialize(NR, NQ, NP, datatype=np.float64): + A = np.fromfunction( + lambda i, j, k: ((i * j + k) % NP) / NP, (NR, NQ, NP), dtype=datatype + ) + C4 = np.fromfunction( + lambda i, j: (i * j % NP) / NP, (NP, NP), dtype=datatype + ) + return A, C4 + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/doitgen/doitgen_numpy.py +def _kernel(NR, NQ, NP, A, C4): + A[:] = np.reshape(np.reshape(A, (NR, NQ, 1, NP)) @ C4, (NR, NQ, NP)) + + +_PRESETS = { + "S": {"NR": 60, "NQ": 60, "NP": 128}, + "L": {"NR": 220, "NQ": 250, "NP": 512}, +} + + +class BenchDoitgen: + params = (["S", "L"],) + param_names = ["preset"] + # ASV re-runs setup() once per sample, not before each iteration within + # it, so pin one iteration per sample to time only a freshly copied input. + number = 1 + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + A, C4 = cache[preset] + p = _PRESETS[preset] + self.NR, self.NQ, self.NP = p["NR"], p["NQ"], p["NP"] + self.A = A.copy() # kernel mutates A in-place +
self.C4 = C4 + + def time_doitgen(self, cache, preset): + _kernel(self.NR, self.NQ, self.NP, self.A, self.C4) diff --git a/benchmarks/benchmarks/npbench/bench_gemm.py b/benchmarks/benchmarks/npbench/bench_gemm.py new file mode 100644 index 00000000..15b29ed6 --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_gemm.py @@ -0,0 +1,56 @@ +"""npbench wrapper: GEMM (general matrix-matrix multiply) — mkl_umath ops: matmul. + +Preset sizes from npbench bench_info/gemm.json: + S: NI=1000, NJ=1100, NK=1200 + L: NI=7000, NJ=7500, NK=8000 + +The kernel mutates C in-place (C[:] = alpha * A @ B + beta * C), so +setup() copies C from cache before each timing round. +""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemm/gemm.py +def _initialize(NI, NJ, NK, datatype=np.float64): + alpha = datatype(1.5) + beta = datatype(1.2) + C = np.fromfunction(lambda i, j: ((i * j + 1) % NI) / NI, (NI, NJ), dtype=datatype) + A = np.fromfunction(lambda i, k: (i * (k + 1) % NK) / NK, (NI, NK), dtype=datatype) + B = np.fromfunction(lambda k, j: (k * (j + 2) % NJ) / NJ, (NK, NJ), dtype=datatype) + return alpha, beta, C, A, B + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemm/gemm_numpy.py +def _kernel(alpha, beta, C, A, B): + C[:] = alpha * A @ B + beta * C + + +_PRESETS = { + "S": {"NI": 1000, "NJ": 1100, "NK": 1200}, + "L": {"NI": 7000, "NJ": 7500, "NK": 8000}, +} + + +class BenchGemm: + params = (["S", "L"],) + param_names = ["preset"] + # ASV re-runs setup() once per sample, not before each iteration within + # it, so pin one iteration per sample to time only a freshly copied input. + number = 1 + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + alpha, beta, C, A, B = cache[preset] + self.alpha = alpha + self.beta = beta + self.C = C.copy() # mutated in-place + self.A = A + self.B = B + + def time_gemm(self, cache, preset): + _kernel(self.alpha, self.beta, self.C, self.A, self.B) diff --git
a/benchmarks/benchmarks/npbench/bench_gemver.py b/benchmarks/benchmarks/npbench/bench_gemver.py new file mode 100644 index 00000000..a04726e9 --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_gemver.py @@ -0,0 +1,74 @@ +"""npbench wrapper: GEMVER (vector multiplication and matrix addition) — mkl_umath ops: outer. + +Preset sizes from npbench bench_info/gemver.json: + S: N=1_000 + L: N=10_000 + +The kernel mutates A, x, and w in-place, so setup() copies those from cache. +""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemver/gemver.py +def _initialize(N, datatype=np.float64): + alpha = datatype(1.5) + beta = datatype(1.2) + fn = datatype(N) + A = np.fromfunction(lambda i, j: (i * j % N) / N, (N, N), dtype=datatype) + u1 = np.fromfunction(lambda i: i, (N,), dtype=datatype) + u2 = np.fromfunction(lambda i: ((i + 1) / fn) / 2.0, (N,), dtype=datatype) + v1 = np.fromfunction(lambda i: ((i + 1) / fn) / 4.0, (N,), dtype=datatype) + v2 = np.fromfunction(lambda i: ((i + 1) / fn) / 6.0, (N,), dtype=datatype) + w = np.zeros((N,), dtype=datatype) + x = np.zeros((N,), dtype=datatype) + y = np.fromfunction(lambda i: ((i + 1) / fn) / 8.0, (N,), dtype=datatype) + z = np.fromfunction(lambda i: ((i + 1) / fn) / 9.0, (N,), dtype=datatype) + return alpha, beta, A, u1, v1, u2, v2, w, x, y, z + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemver/gemver_numpy.py +def _kernel(alpha, beta, A, u1, v1, u2, v2, w, x, y, z): + A += np.outer(u1, v1) + np.outer(u2, v2) + x += beta * y @ A + z + w += alpha * A @ x + + +_PRESETS = { + "S": {"N": 1_000}, + "L": {"N": 10_000}, +} + + +class BenchGemver: + params = (["S", "L"],) + param_names = ["preset"] + # ASV re-runs setup() once per sample, not before each iteration within + # it, so pin one iteration per sample to time only a freshly copied input. + number = 1 + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + alpha, beta, A, u1, v1, u2, v2, w, x, y, z = cache[preset]
+ self.alpha = alpha + self.beta = beta + self.A = A.copy() # mutated: A += outer(u1,v1) + outer(u2,v2) + self.u1 = u1 + self.v1 = v1 + self.u2 = u2 + self.v2 = v2 + self.w = w.copy() # mutated: w += alpha * A @ x + self.x = x.copy() # mutated: x += beta * y @ A + z + self.y = y + self.z = z + + def time_gemver(self, cache, preset): + _kernel( + self.alpha, self.beta, + self.A, self.u1, self.v1, self.u2, self.v2, + self.w, self.x, self.y, self.z, + ) diff --git a/benchmarks/benchmarks/npbench/bench_gesummv.py b/benchmarks/benchmarks/npbench/bench_gesummv.py new file mode 100644 index 00000000..b2f54ea8 --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_gesummv.py @@ -0,0 +1,45 @@ +"""npbench wrapper: GESUMMV (scalar, vector and matrix multiplication) — mkl_umath ops: matmul. + +Preset sizes from npbench bench_info/gesummv.json: + S: N=2_000 + L: N=14_000 +""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gesummv/gesummv.py +def _initialize(N, datatype=np.float64): + alpha = datatype(1.5) + beta = datatype(1.2) + A = np.fromfunction(lambda i, j: ((i * j + 1) % N) / N, (N, N), dtype=datatype) + B = np.fromfunction(lambda i, j: ((i * j + 2) % N) / N, (N, N), dtype=datatype) + x = np.fromfunction(lambda i: (i % N) / N, (N,), dtype=datatype) + return alpha, beta, A, B, x + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gesummv/gesummv_numpy.py +def _kernel(alpha, beta, A, B, x): + return alpha * A @ x + beta * B @ x + + +_PRESETS = { + "S": {"N": 2_000}, + "L": {"N": 14_000}, +} + + +class BenchGesummv: + params = (["S", "L"],) + param_names = ["preset"] + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + self.alpha, self.beta, self.A, self.B, self.x = cache[preset] + + def time_gesummv(self, cache, preset): + _kernel(self.alpha, 
self.beta, self.A, self.B, self.x) diff --git a/benchmarks/benchmarks/npbench/bench_go_fast.py b/benchmarks/benchmarks/npbench/bench_go_fast.py new file mode 100644 index 00000000..83636bf9 --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_go_fast.py @@ -0,0 +1,69 @@ +"""npbench wrapper: GoFast — mkl_umath ops: tanh. + +Preset sizes from npbench bench_info/go_fast.json: + S: N=2_000 + L: N=20_000 + +Note: the npbench ``go_fast`` kernel iterates diagonals in a Python loop +(go_fast_loop). A vectorized variant (go_fast_vec) using np.tanh on the +full diagonal is included for direct MKL VM throughput measurement. +""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast.py +def _initialize(N): + from numpy.random import default_rng + rng = default_rng(42) + a = rng.random((N, N)) + return (a,) + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast_numpy.py +def _go_fast(a): + trace = 0.0 + for i in range(a.shape[0]): + trace += np.tanh(a[i, i]) + return a + trace + + +_PRESETS = { + "S": {"N": 2_000}, + "L": {"N": 20_000}, +} + + +class BenchGoFastLoop: + """Original npbench kernel — diagonal Python loop calling np.tanh per element.""" + + params = (["S", "L"],) + param_names = ["preset"] + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + (self.a,) = cache[preset] + + def time_go_fast_loop(self, cache, preset): + _go_fast(self.a) + + +class BenchGoFastVec: + """Vectorized variant — np.tanh on the full diagonal array at once.""" + + params = (["S", "L"],) + param_names = ["preset"] + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + (self.a,) = cache[preset] + self.diag = np.copy(np.diag(self.a)) + + def time_go_fast_vec(self, cache, preset): + np.tanh(self.diag) 
diff --git a/benchmarks/benchmarks/npbench/bench_k2mm.py b/benchmarks/benchmarks/npbench/bench_k2mm.py new file mode 100644 index 00000000..11342d57 --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_k2mm.py @@ -0,0 +1,55 @@ +"""npbench wrapper: 2MM (two matrix multiplications) — mkl_umath ops: matmul. + +Preset sizes from npbench bench_info/k2mm.json: + S: NI=800, NJ=850, NK=900, NL=950 + L: NI=6000, NJ=6500, NK=7000, NL=7500 + +The kernel mutates D in-place (D[:] = alpha * A @ B @ C + beta * D), so +setup() copies D from cache before each timing round. +""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k2mm/k2mm.py +def _initialize(NI, NJ, NK, NL, datatype=np.float64): + alpha = datatype(1.5) + beta = datatype(1.2) + A = np.fromfunction(lambda i, j: ((i * j + 1) % NI) / NI, (NI, NK), dtype=datatype) + B = np.fromfunction(lambda i, j: (i * (j + 1) % NJ) / NJ, (NK, NJ), dtype=datatype) + C = np.fromfunction(lambda i, j: ((i * (j + 3) + 1) % NL) / NL, (NJ, NL), dtype=datatype) + D = np.fromfunction(lambda i, j: (i * (j + 2) % NK) / NK, (NI, NL), dtype=datatype) + return alpha, beta, A, B, C, D + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k2mm/k2mm_numpy.py +def _kernel(alpha, beta, A, B, C, D): + D[:] = alpha * A @ B @ C + beta * D + + +_PRESETS = { + "S": {"NI": 800, "NJ": 850, "NK": 900, "NL": 950}, + "L": {"NI": 6000, "NJ": 6500, "NK": 7000, "NL": 7500}, +} + + +class BenchK2mm: + params = (["S", "L"],) + param_names = ["preset"] + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + alpha, beta, A, B, C, D = cache[preset] + self.alpha = alpha + self.beta = beta + self.A = A + self.B = B + self.C = C + self.D = D.copy() # mutated in-place + + def time_k2mm(self, cache, preset): + _kernel(self.alpha, self.beta, self.A, self.B, 
self.C, self.D) diff --git a/benchmarks/benchmarks/npbench/bench_k3mm.py b/benchmarks/benchmarks/npbench/bench_k3mm.py new file mode 100644 index 00000000..86f9efe4 --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_k3mm.py @@ -0,0 +1,44 @@ +"""npbench wrapper: 3MM (three matrix multiplications) — mkl_umath ops: matmul. + +Preset sizes from npbench bench_info/k3mm.json: + S: NI=800, NJ=850, NK=900, NL=950, NM=1000 + L: NI=5500, NJ=6000, NK=6500, NL=7000, NM=7500 +""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k3mm/k3mm.py +def _initialize(NI, NJ, NK, NL, NM, datatype=np.float64): + A = np.fromfunction(lambda i, j: ((i * j + 1) % NI) / (5 * NI), (NI, NK), dtype=datatype) + B = np.fromfunction(lambda i, j: ((i * (j + 1) + 2) % NJ) / (5 * NJ), (NK, NJ), dtype=datatype) + C = np.fromfunction(lambda i, j: (i * (j + 3) % NL) / (5 * NL), (NJ, NM), dtype=datatype) + D = np.fromfunction(lambda i, j: ((i * (j + 2) + 2) % NK) / (5 * NK), (NM, NL), dtype=datatype) + return A, B, C, D + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k3mm/k3mm_numpy.py +def _kernel(A, B, C, D): + return A @ B @ C @ D + + +_PRESETS = { + "S": {"NI": 800, "NJ": 850, "NK": 900, "NL": 950, "NM": 1000}, + "L": {"NI": 5500, "NJ": 6000, "NK": 6500, "NL": 7000, "NM": 7500}, +} + + +class BenchK3mm: + params = (["S", "L"],) + param_names = ["preset"] + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + self.A, self.B, self.C, self.D = cache[preset] + + def time_k3mm(self, cache, preset): + _kernel(self.A, self.B, self.C, self.D) diff --git a/benchmarks/benchmarks/npbench/bench_mandelbrot.py b/benchmarks/benchmarks/npbench/bench_mandelbrot.py new file mode 100644 index 00000000..6284bcf3 --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_mandelbrot.py @@ -0,0 +1,93 @@ 
+"""npbench wrapper: Mandelbrot set (two variants) — mkl_umath ops: abs, multiply, add. + +Preset sizes from npbench bench_info/mandelbrot1.json and mandelbrot2.json: + S: XN=YN=125/200, maxiter=60/40 + L: XN=YN=833/1000, maxiter=200/100 + +mandelbrot1 (slow): uses np.less mask + index-based update loop. +mandelbrot2 (fast): uses dynamic array compaction; more cache-friendly. + +Both kernels operate on complex128 arrays. The dominant mkl_umath op is +np.abs() on complex arrays at each iteration step. +""" + +import numpy as np + + +# --- mandelbrot1 --- + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/mandelbrot1/mandelbrot1_numpy.py +def _mandelbrot1(xmin, xmax, ymin, ymax, xn, yn, maxiter, horizon=2.0): + X = np.linspace(xmin, xmax, xn, dtype=np.float64) + Y = np.linspace(ymin, ymax, yn, dtype=np.float64) + C = X + Y[:, None] * 1j + N = np.zeros(C.shape, dtype=np.int64) + Z = np.zeros(C.shape, dtype=np.complex128) + for n in range(maxiter): + I = np.less(abs(Z), horizon) + N[I] = n + Z[I] = Z[I] ** 2 + C[I] + N[N == maxiter - 1] = 0 + return Z, N + + +# --- mandelbrot2 --- + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/mandelbrot2/mandelbrot2_numpy.py +def _mandelbrot2(xmin, xmax, ymin, ymax, xn, yn, itermax, horizon=2.0): + Xi, Yi = np.mgrid[0:xn, 0:yn] + X = np.linspace(xmin, xmax, xn, dtype=np.float64)[Xi] + Y = np.linspace(ymin, ymax, yn, dtype=np.float64)[Yi] + C = X + Y * 1j + N_ = np.zeros(C.shape, dtype=np.int64) + Z_ = np.zeros(C.shape, dtype=np.complex128) + Xi.shape = Yi.shape = C.shape = xn * yn + + Z = np.zeros(C.shape, np.complex128) + for i in range(itermax): + if not len(Z): + break + np.multiply(Z, Z, Z) + np.add(Z, C, Z) + rem = np.abs(Z) > horizon + Z_[Xi[rem], Yi[rem]] = Z[rem] + N_[Xi[rem], Yi[rem]] = i + 1 + ind = ~rem + Z = Z[ind] + C = C[ind] + Xi = Xi[ind] + Yi = Yi[ind] + return Z_, N_ + + +_PRESETS_M1 = { + "S": {"xmin": -1.75, 
"xmax": 0.25, "ymin": -1.00, "ymax": 1.00, + "xn": 125, "yn": 125, "maxiter": 60, "horizon": 2.0}, + "L": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25, + "xn": 833, "yn": 833, "maxiter": 200, "horizon": 2.0}, +} + +_PRESETS_M2 = { + "S": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25, + "xn": 200, "yn": 200, "itermax": 40, "horizon": 2.0}, + "L": {"xmin": -2.25, "xmax": 0.75, "ymin": -1.50, "ymax": 1.50, + "xn": 1000, "yn": 1000, "itermax": 100, "horizon": 2.0}, +} + + +class BenchMandelbrot1: + params = (["S", "L"],) + param_names = ["preset"] + + def time_mandelbrot1(self, preset): + _mandelbrot1(**_PRESETS_M1[preset]) + + +class BenchMandelbrot2: + params = (["S", "L"],) + param_names = ["preset"] + + def time_mandelbrot2(self, preset): + _mandelbrot2(**_PRESETS_M2[preset]) diff --git a/benchmarks/benchmarks/npbench/bench_softmax.py b/benchmarks/benchmarks/npbench/bench_softmax.py new file mode 100644 index 00000000..cd976f14 --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_softmax.py @@ -0,0 +1,48 @@ +"""npbench wrapper: Softmax — mkl_umath ops: exp, max, sum. + +Preset sizes from npbench bench_info/softmax.json: + S: N=16, H=16, SM=128 (float32) + L: N=64, H=16, SM=448 (float32) + +npbench initializes this benchmark with float32 explicitly. 
+""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/deep_learning/softmax/softmax.py +def _initialize(N, H, SM): + from numpy.random import default_rng + rng = default_rng(42) + x = rng.random((N, H, SM, SM), dtype=np.float32) + return (x,) + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/deep_learning/softmax/softmax_numpy.py +def _softmax(x): + tmp_max = np.max(x, axis=-1, keepdims=True) + tmp_out = np.exp(x - tmp_max) + tmp_sum = np.sum(tmp_out, axis=-1, keepdims=True) + return tmp_out / tmp_sum + + +_PRESETS = { + "S": {"N": 16, "H": 16, "SM": 128}, + "L": {"N": 64, "H": 16, "SM": 448}, +} + + +class BenchSoftmax: + params = (["S", "L"],) + param_names = ["preset"] + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + (self.x,) = cache[preset] + + def time_softmax(self, cache, preset): + _softmax(self.x) diff --git a/benchmarks/bootstrap-dashboard-branch.sh b/benchmarks/bootstrap-dashboard-branch.sh new file mode 100644 index 00000000..f8fd7cf4 --- /dev/null +++ b/benchmarks/bootstrap-dashboard-branch.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# bootstrap-dashboard-branch.sh +# +# One-time setup: creates the mkl-umath-results branch that ASV uses to +# store benchmark results. Run this once against the first commit you want +# to anchor results to. +# +# Usage: +# SEED_SHA= bash bootstrap-dashboard-branch.sh +# +# The script must be run from inside benchmarks/ (where asv.conf.json lives). +# The conda env with asv installed must already be active. 
+ +set -euo pipefail + +RESULTS_BRANCH="mkl-umath-results" +SEED_SHA="${SEED_SHA:?ERROR: set SEED_SHA= before running this script}" + +echo "[bootstrap] Seeding results branch: ${RESULTS_BRANCH}" +echo "[bootstrap] Anchored to commit: ${SEED_SHA}" + +# Run a single quick pass to generate the first results JSON +asv run \ + --python=same \ + --quick \ + --show-stderr \ + --set-commit-hash "${SEED_SHA}" \ + HEAD + +# Publish results to HTML (creates .asv/html/) +asv publish + +# Push results to the dedicated branch +asv gh-pages \ + --rewrite \ + --no-push \ + --html-dir .asv/html + +echo "[bootstrap] Done. Push .asv/results to ${RESULTS_BRANCH} manually or" +echo " configure asv gh-pages --push to automate." From 428e440af44c30134f21d299554c81a4cf1141ae Mon Sep 17 00:00:00 2001 From: vchamarthi Date: Mon, 11 May 2026 10:58:35 -0500 Subject: [PATCH 2/7] update benchmarks with M preset, configurations --- benchmarks/asv.conf.json | 11 ++++- .../benchmarks/npbench/bench_arc_distance.py | 8 ++-- .../benchmarks/npbench/bench_cholesky2.py | 8 ++-- .../benchmarks/npbench/bench_correlation.py | 8 ++-- .../benchmarks/npbench/bench_covariance.py | 8 ++-- .../benchmarks/npbench/bench_deriche.py | 8 ++-- .../benchmarks/npbench/bench_doitgen.py | 8 ++-- benchmarks/benchmarks/npbench/bench_gemm.py | 8 ++-- benchmarks/benchmarks/npbench/bench_gemver.py | 8 ++-- .../benchmarks/npbench/bench_gesummv.py | 8 ++-- .../benchmarks/npbench/bench_go_fast.py | 12 ++++-- benchmarks/benchmarks/npbench/bench_k2mm.py | 8 ++-- benchmarks/benchmarks/npbench/bench_k3mm.py | 8 ++-- .../benchmarks/npbench/bench_mandelbrot.py | 18 +++++---- .../benchmarks/npbench/bench_softmax.py | 10 +++-- benchmarks/bootstrap-dashboard-branch.sh | 40 ------------------- 16 files changed, 89 insertions(+), 90 deletions(-) delete mode 100644 benchmarks/bootstrap-dashboard-branch.sh diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json index facb1284..78482758 100644 --- a/benchmarks/asv.conf.json +++ 
b/benchmarks/asv.conf.json @@ -3,11 +3,18 @@ "project": "mkl_umath", "project_url": "https://github.com/IntelPython/mkl_umath", "repo": "..", - "branches": ["main"], + "branches": [ + "main" + ], "environment_type": "existing", "benchmark_dir": "benchmarks", "env_dir": ".asv/env", "results_dir": ".asv/results", "html_dir": ".asv/html", - "show_commit_url": "https://github.com/IntelPython/mkl_umath/commit/" + "show_commit_url": "https://github.com/IntelPython/mkl_umath/commit/", + "build_cache_size": 2, + "default_benchmark_timeout": 1500, + "regressions_thresholds": { + ".*": 0.2 + } } diff --git a/benchmarks/benchmarks/npbench/bench_arc_distance.py b/benchmarks/benchmarks/npbench/bench_arc_distance.py index d8039649..c17f5775 100644 --- a/benchmarks/benchmarks/npbench/bench_arc_distance.py +++ b/benchmarks/benchmarks/npbench/bench_arc_distance.py @@ -1,7 +1,7 @@ """npbench wrapper: Arc Distance — mkl_umath ops: sin, cos, arctan2, sqrt. Preset sizes from npbench bench_info/arc_distance.json: - S: N=100_000 + M: N=1_000_000 L: N=10_000_000 """ @@ -31,14 +31,16 @@ def _arc_distance(theta_1, phi_1, theta_2, phi_2): _PRESETS = { - "S": {"N": 100_000}, + "M": {"N": 1_000_000}, "L": {"N": 10_000_000}, } class BenchArcDistance: - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} diff --git a/benchmarks/benchmarks/npbench/bench_cholesky2.py b/benchmarks/benchmarks/npbench/bench_cholesky2.py index ea095122..347aaca8 100644 --- a/benchmarks/benchmarks/npbench/bench_cholesky2.py +++ b/benchmarks/benchmarks/npbench/bench_cholesky2.py @@ -1,7 +1,7 @@ """npbench wrapper: Cholesky decomposition v2 — mkl_umath ops: linalg.cholesky. 
Preset sizes from npbench bench_info/cholesky2.json: - S: N=1000 + M: N=2200 L: N=8000 The kernel mutates A in-place (A[:] = cholesky(A) + triu(A, k=1)), so @@ -35,14 +35,16 @@ def _kernel(A): _PRESETS = { - "S": {"N": 1000}, + "M": {"N": 2200}, "L": {"N": 8000}, } class BenchCholesky2: - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} diff --git a/benchmarks/benchmarks/npbench/bench_correlation.py b/benchmarks/benchmarks/npbench/bench_correlation.py index c5c7471d..194fcc04 100644 --- a/benchmarks/benchmarks/npbench/bench_correlation.py +++ b/benchmarks/benchmarks/npbench/bench_correlation.py @@ -1,7 +1,7 @@ """npbench wrapper: Correlation — mkl_umath ops: sqrt, std, mean. Preset sizes from npbench bench_info/correlation.json: - S: M=500, N=600 + M: M=1400, N=1800 L: M=3200, N=4000 The kernel mutates ``data`` in-place (data -= mean; data /= ...), so @@ -34,14 +34,16 @@ def _kernel(M, float_n, data): _PRESETS = { - "S": {"M": 500, "N": 600}, + "M": {"M": 1400, "N": 1800}, "L": {"M": 3200, "N": 4000}, } class BenchCorrelation: - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} diff --git a/benchmarks/benchmarks/npbench/bench_covariance.py b/benchmarks/benchmarks/npbench/bench_covariance.py index bc541c5b..b85b3191 100644 --- a/benchmarks/benchmarks/npbench/bench_covariance.py +++ b/benchmarks/benchmarks/npbench/bench_covariance.py @@ -1,7 +1,7 @@ """npbench wrapper: Covariance — mkl_umath ops: mean. 
Preset sizes from npbench bench_info/covariance.json: - S: M=500, N=600 + M: M=1400, N=1800 L: M=3200, N=4000 The kernel mutates ``data`` in-place (data -= mean), so setup() copies @@ -31,14 +31,16 @@ def _kernel(M, float_n, data): _PRESETS = { - "S": {"M": 500, "N": 600}, + "M": {"M": 1400, "N": 1800}, "L": {"M": 3200, "N": 4000}, } class BenchCovariance: - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} diff --git a/benchmarks/benchmarks/npbench/bench_deriche.py b/benchmarks/benchmarks/npbench/bench_deriche.py index 4cb93f1e..4539053d 100644 --- a/benchmarks/benchmarks/npbench/bench_deriche.py +++ b/benchmarks/benchmarks/npbench/bench_deriche.py @@ -1,7 +1,7 @@ """npbench wrapper: Deriche Edge Detector — mkl_umath ops: exp. Preset sizes from npbench bench_info/deriche.json: - S: W=400, H=200 + M: W=1500, H=1000 L: W=6000, H=3000 Warning: this kernel contains Python for-loops over rows/columns. @@ -89,7 +89,7 @@ def _kernel(alpha, imgIn): _PRESETS = { - "S": {"W": 400, "H": 200}, + "M": {"W": 1500, "H": 1000}, "L": {"W": 6000, "H": 3000}, } @@ -98,8 +98,10 @@ class BenchDeriche: # L preset has Python loops over 6000 rows — allow extra time timeout = 600 - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} diff --git a/benchmarks/benchmarks/npbench/bench_doitgen.py b/benchmarks/benchmarks/npbench/bench_doitgen.py index 86467424..eb255bae 100644 --- a/benchmarks/benchmarks/npbench/bench_doitgen.py +++ b/benchmarks/benchmarks/npbench/bench_doitgen.py @@ -1,7 +1,7 @@ """npbench wrapper: Doitgen (multiresolution analysis) — mkl_umath ops: matmul. 
Preset sizes from npbench bench_info/doitgen.json: - S: NR=60, NQ=60, NP=128 + M: NR=110, NQ=125, NP=256 L: NR=220, NQ=250, NP=512 The kernel mutates ``A`` in-place (A[:] = ...), so setup() copies from cache. @@ -29,14 +29,16 @@ def _kernel(NR, NQ, NP, A, C4): _PRESETS = { - "S": {"NR": 60, "NQ": 60, "NP": 128}, + "M": {"NR": 110, "NQ": 125, "NP": 256}, "L": {"NR": 220, "NQ": 250, "NP": 512}, } class BenchDoitgen: - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} diff --git a/benchmarks/benchmarks/npbench/bench_gemm.py b/benchmarks/benchmarks/npbench/bench_gemm.py index 15b29ed6..f7d43f7d 100644 --- a/benchmarks/benchmarks/npbench/bench_gemm.py +++ b/benchmarks/benchmarks/npbench/bench_gemm.py @@ -1,7 +1,7 @@ """npbench wrapper: GEMM (general matrix-matrix multiply) — mkl_umath ops: matmul. Preset sizes from npbench bench_info/gemm.json: - S: NI=1000, NJ=1100, NK=1200 + M: NI=2500, NJ=2750, NK=3000 L: NI=7000, NJ=7500, NK=8000 The kernel mutates C in-place (C[:] = alpha * A @ B + beta * C), so @@ -29,14 +29,16 @@ def _kernel(alpha, beta, C, A, B): _PRESETS = { - "S": {"NI": 1000, "NJ": 1100, "NK": 1200}, + "M": {"NI": 2500, "NJ": 2750, "NK": 3000}, "L": {"NI": 7000, "NJ": 7500, "NK": 8000}, } class BenchGemm: - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} diff --git a/benchmarks/benchmarks/npbench/bench_gemver.py b/benchmarks/benchmarks/npbench/bench_gemver.py index a04726e9..d0ea44a6 100644 --- a/benchmarks/benchmarks/npbench/bench_gemver.py +++ b/benchmarks/benchmarks/npbench/bench_gemver.py @@ -1,7 +1,7 @@ """npbench wrapper: GEMVER (vector multiplication and matrix addition) — mkl_umath ops: outer. 
Preset sizes from npbench bench_info/gemver.json: - S: N=1_000 + M: N=3_000 L: N=10_000 The kernel mutates A, x, and w in-place, so setup() copies those from cache. @@ -37,14 +37,16 @@ def _kernel(alpha, beta, A, u1, v1, u2, v2, w, x, y, z): _PRESETS = { - "S": {"N": 1_000}, + "M": {"N": 3_000}, "L": {"N": 10_000}, } class BenchGemver: - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} diff --git a/benchmarks/benchmarks/npbench/bench_gesummv.py b/benchmarks/benchmarks/npbench/bench_gesummv.py index b2f54ea8..13bb773e 100644 --- a/benchmarks/benchmarks/npbench/bench_gesummv.py +++ b/benchmarks/benchmarks/npbench/bench_gesummv.py @@ -1,7 +1,7 @@ """npbench wrapper: GESUMMV (scalar, vector and matrix multiplication) — mkl_umath ops: matmul. Preset sizes from npbench bench_info/gesummv.json: - S: N=2_000 + M: N=4_000 L: N=14_000 """ @@ -26,14 +26,16 @@ def _kernel(alpha, beta, A, B, x): _PRESETS = { - "S": {"N": 2_000}, + "M": {"N": 4_000}, "L": {"N": 14_000}, } class BenchGesummv: - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} diff --git a/benchmarks/benchmarks/npbench/bench_go_fast.py b/benchmarks/benchmarks/npbench/bench_go_fast.py index 83636bf9..d197c540 100644 --- a/benchmarks/benchmarks/npbench/bench_go_fast.py +++ b/benchmarks/benchmarks/npbench/bench_go_fast.py @@ -1,7 +1,7 @@ """npbench wrapper: GoFast — mkl_umath ops: tanh. 
Preset sizes from npbench bench_info/go_fast.json: - S: N=2_000 + M: N=6_000 L: N=20_000 Note: the npbench ``go_fast`` kernel iterates diagonals in a Python loop @@ -31,7 +31,7 @@ def _go_fast(a): _PRESETS = { - "S": {"N": 2_000}, + "M": {"N": 6_000}, "L": {"N": 20_000}, } @@ -39,8 +39,10 @@ def _go_fast(a): class BenchGoFastLoop: """Original npbench kernel — diagonal Python loop calling np.tanh per element.""" - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} @@ -55,8 +57,10 @@ def time_go_fast_loop(self, cache, preset): class BenchGoFastVec: """Vectorized variant — np.tanh on the full diagonal array at once.""" - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} diff --git a/benchmarks/benchmarks/npbench/bench_k2mm.py b/benchmarks/benchmarks/npbench/bench_k2mm.py index 11342d57..6ff8cf6f 100644 --- a/benchmarks/benchmarks/npbench/bench_k2mm.py +++ b/benchmarks/benchmarks/npbench/bench_k2mm.py @@ -1,7 +1,7 @@ """npbench wrapper: 2MM (two matrix multiplications) — mkl_umath ops: matmul. 
Preset sizes from npbench bench_info/k2mm.json: - S: NI=800, NJ=850, NK=900, NL=950 + M: NI=2000, NJ=2250, NK=2500, NL=2750 L: NI=6000, NJ=6500, NK=7000, NL=7500 The kernel mutates D in-place (D[:] = alpha * A @ B @ C + beta * D), so @@ -30,14 +30,16 @@ def _kernel(alpha, beta, A, B, C, D): _PRESETS = { - "S": {"NI": 800, "NJ": 850, "NK": 900, "NL": 950}, + "M": {"NI": 2000, "NJ": 2250, "NK": 2500, "NL": 2750}, "L": {"NI": 6000, "NJ": 6500, "NK": 7000, "NL": 7500}, } class BenchK2mm: - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} diff --git a/benchmarks/benchmarks/npbench/bench_k3mm.py b/benchmarks/benchmarks/npbench/bench_k3mm.py index 86f9efe4..15e73d60 100644 --- a/benchmarks/benchmarks/npbench/bench_k3mm.py +++ b/benchmarks/benchmarks/npbench/bench_k3mm.py @@ -1,7 +1,7 @@ """npbench wrapper: 3MM (three matrix multiplications) — mkl_umath ops: matmul. 
Preset sizes from npbench bench_info/k3mm.json: - S: NI=800, NJ=850, NK=900, NL=950, NM=1000 + M: NI=2000, NJ=2200, NK=2400, NL=2600, NM=2800 L: NI=5500, NJ=6000, NK=6500, NL=7000, NM=7500 """ @@ -25,14 +25,16 @@ def _kernel(A, B, C, D): _PRESETS = { - "S": {"NI": 800, "NJ": 850, "NK": 900, "NL": 950, "NM": 1000}, + "M": {"NI": 2000, "NJ": 2200, "NK": 2400, "NL": 2600, "NM": 2800}, "L": {"NI": 5500, "NJ": 6000, "NK": 6500, "NL": 7000, "NM": 7500}, } class BenchK3mm: - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} diff --git a/benchmarks/benchmarks/npbench/bench_mandelbrot.py b/benchmarks/benchmarks/npbench/bench_mandelbrot.py index 6284bcf3..090fcd0e 100644 --- a/benchmarks/benchmarks/npbench/bench_mandelbrot.py +++ b/benchmarks/benchmarks/npbench/bench_mandelbrot.py @@ -1,7 +1,7 @@ """npbench wrapper: Mandelbrot set (two variants) — mkl_umath ops: abs, multiply, add. Preset sizes from npbench bench_info/mandelbrot1.json and mandelbrot2.json: - S: XN=YN=125/200, maxiter=60/40 + M: XN=YN=250/500, maxiter=150/80 L: XN=YN=833/1000, maxiter=200/100 mandelbrot1 (slow): uses np.less mask + index-based update loop. 
@@ -63,31 +63,35 @@ def _mandelbrot2(xmin, xmax, ymin, ymax, xn, yn, itermax, horizon=2.0): _PRESETS_M1 = { - "S": {"xmin": -1.75, "xmax": 0.25, "ymin": -1.00, "ymax": 1.00, - "xn": 125, "yn": 125, "maxiter": 60, "horizon": 2.0}, + "M": {"xmin": -1.75, "xmax": 0.25, "ymin": -1.00, "ymax": 1.00, + "xn": 250, "yn": 250, "maxiter": 150, "horizon": 2.0}, "L": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25, "xn": 833, "yn": 833, "maxiter": 200, "horizon": 2.0}, } _PRESETS_M2 = { - "S": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25, - "xn": 200, "yn": 200, "itermax": 40, "horizon": 2.0}, + "M": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25, + "xn": 500, "yn": 500, "itermax": 80, "horizon": 2.0}, "L": {"xmin": -2.25, "xmax": 0.75, "ymin": -1.50, "ymax": 1.50, "xn": 1000, "yn": 1000, "itermax": 100, "horizon": 2.0}, } class BenchMandelbrot1: - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def time_mandelbrot1(self, preset): _mandelbrot1(**_PRESETS_M1[preset]) class BenchMandelbrot2: - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def time_mandelbrot2(self, preset): _mandelbrot2(**_PRESETS_M2[preset]) diff --git a/benchmarks/benchmarks/npbench/bench_softmax.py b/benchmarks/benchmarks/npbench/bench_softmax.py index cd976f14..29a77252 100644 --- a/benchmarks/benchmarks/npbench/bench_softmax.py +++ b/benchmarks/benchmarks/npbench/bench_softmax.py @@ -1,7 +1,7 @@ """npbench wrapper: Softmax — mkl_umath ops: exp, max, sum. Preset sizes from npbench bench_info/softmax.json: - S: N=16, H=16, SM=128 (float32) + M: N=32, H=8, SM=256 (float32) L: N=64, H=16, SM=448 (float32) npbench initializes this benchmark with float32 explicitly. 
@@ -29,14 +29,16 @@ def _softmax(x): _PRESETS = { - "S": {"N": 16, "H": 16, "SM": 128}, - "L": {"N": 64, "H": 16, "SM": 448}, + "M": {"N": 32, "H": 8, "SM": 256}, + "L": {"N": 64, "H": 16, "SM": 448}, } class BenchSoftmax: - params = (["S", "L"],) + params = (["M", "L"],) param_names = ["preset"] + number = 1 + repeat = 20 def setup_cache(self): return {p: _initialize(**kw) for p, kw in _PRESETS.items()} diff --git a/benchmarks/bootstrap-dashboard-branch.sh b/benchmarks/bootstrap-dashboard-branch.sh deleted file mode 100644 index f8fd7cf4..00000000 --- a/benchmarks/bootstrap-dashboard-branch.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash -# bootstrap-dashboard-branch.sh -# -# One-time setup: creates the mkl-umath-results branch that ASV uses to -# store benchmark results. Run this once against the first commit you want -# to anchor results to. -# -# Usage: -# SEED_SHA= bash bootstrap-dashboard-branch.sh -# -# The script must be run from inside benchmarks/ (where asv.conf.json lives). -# The conda env with asv installed must already be active. - -set -euo pipefail - -RESULTS_BRANCH="mkl-umath-results" -SEED_SHA="${SEED_SHA:?ERROR: set SEED_SHA= before running this script}" - -echo "[bootstrap] Seeding results branch: ${RESULTS_BRANCH}" -echo "[bootstrap] Anchored to commit: ${SEED_SHA}" - -# Run a single quick pass to generate the first results JSON -asv run \ - --python=same \ - --quick \ - --show-stderr \ - --set-commit-hash "${SEED_SHA}" \ - HEAD - -# Publish results to HTML (creates .asv/html/) -asv publish - -# Push results to the dedicated branch -asv gh-pages \ - --rewrite \ - --no-push \ - --html-dir .asv/html - -echo "[bootstrap] Done. Push .asv/results to ${RESULTS_BRANCH} manually or" -echo " configure asv gh-pages --push to automate." 
From b27dc03db2dee2ebb5575b7ded4366151dead962 Mon Sep 17 00:00:00 2001 From: vchamarthi Date: Mon, 18 May 2026 08:43:28 -0500 Subject: [PATCH 3/7] pre-commit-fixes --- benchmarks/benchmarks/_patch_setup.py | 26 +++++--- .../benchmarks/npbench/bench_arc_distance.py | 1 + .../benchmarks/npbench/bench_cholesky2.py | 4 +- .../benchmarks/npbench/bench_correlation.py | 4 +- benchmarks/benchmarks/npbench/bench_gemm.py | 20 ++++-- benchmarks/benchmarks/npbench/bench_gemver.py | 40 +++++++----- .../benchmarks/npbench/bench_gesummv.py | 12 +++- .../benchmarks/npbench/bench_go_fast.py | 3 +- benchmarks/benchmarks/npbench/bench_k2mm.py | 20 ++++-- benchmarks/benchmarks/npbench/bench_k3mm.py | 20 ++++-- .../benchmarks/npbench/bench_mandelbrot.py | 61 +++++++++++++++---- .../benchmarks/npbench/bench_softmax.py | 3 +- 12 files changed, 152 insertions(+), 62 deletions(-) diff --git a/benchmarks/benchmarks/_patch_setup.py b/benchmarks/benchmarks/_patch_setup.py index f7435c61..9aea6062 100644 --- a/benchmarks/benchmarks/_patch_setup.py +++ b/benchmarks/benchmarks/_patch_setup.py @@ -10,9 +10,9 @@ import sys _PATCH_MAP = [ - ("mkl_fft", "patch_numpy_fft"), + ("mkl_fft", "patch_numpy_fft"), ("mkl_random", "patch_numpy_random"), - ("mkl_umath", "patch_numpy_umath"), + ("mkl_umath", "patch_numpy_umath"), ] @@ -25,15 +25,18 @@ def _apply_patches(): except ImportError as exc: raise RuntimeError( f"[mkl-patch] Cannot import {mod_name}: {exc}\n" - f" Ensure the conda env contains {mod_name} from the Intel channel.\n" - f" Required channels: https://software.repos.intel.com/python/conda" + f" Ensure the conda env contains {mod_name} " + f"from the Intel channel.\n" + " Required channels: " + "https://software.repos.intel.com/python/conda" ) from exc patch_fn = getattr(mod, patch_fn_name, None) if patch_fn is None: raise RuntimeError( f"[mkl-patch] {mod_name} has no {patch_fn_name}(). " - f"Upgrade {mod_name} to a version that exposes the stock-numpy patch API." 
+ f"Upgrade {mod_name} to a version that exposes " + "the stock-numpy patch API." ) try: @@ -46,8 +49,9 @@ def _apply_patches(): is_patched_fn = getattr(mod, "is_patched", None) if callable(is_patched_fn) and not is_patched_fn(): raise RuntimeError( - f"[mkl-patch] {mod_name}.is_patched() returned False after patching. " - f"NumPy may have been imported before patching in a conflicting state." + f"[mkl-patch] {mod_name}.is_patched() returned False " + "after patching. NumPy may have been imported before " + "patching in a conflicting state." ) patched[mod_name] = mod @@ -56,9 +60,9 @@ def _apply_patches(): import numpy as np _attr_checks = { - "mkl_fft": lambda: np.fft.fft.__module__, + "mkl_fft": lambda: np.fft.fft.__module__, "mkl_random": lambda: np.random.random.__module__, - "mkl_umath": lambda: np.exp.__module__, + "mkl_umath": lambda: np.exp.__module__, } for mod_name in patched: try: @@ -67,7 +71,9 @@ def _apply_patches(): attr = "unknown" sys.stderr.write(f"[mkl-patch] {mod_name}: numpy dispatch → {attr}\n") - sys.stderr.write("[mkl-patch] ALL OK — mkl_fft, mkl_random, mkl_umath active\n") + sys.stderr.write( + "[mkl-patch] ALL OK — mkl_fft, mkl_random, mkl_umath active\n" + ) sys.stderr.flush() diff --git a/benchmarks/benchmarks/npbench/bench_arc_distance.py b/benchmarks/benchmarks/npbench/bench_arc_distance.py index c17f5775..a57a3fa2 100644 --- a/benchmarks/benchmarks/npbench/bench_arc_distance.py +++ b/benchmarks/benchmarks/npbench/bench_arc_distance.py @@ -12,6 +12,7 @@ # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance.py def _initialize(N): from numpy.random import default_rng + rng = default_rng(42) t0 = rng.random((N,)) p0 = rng.random((N,)) diff --git a/benchmarks/benchmarks/npbench/bench_cholesky2.py b/benchmarks/benchmarks/npbench/bench_cholesky2.py index 347aaca8..ae19443f 100644 --- a/benchmarks/benchmarks/npbench/bench_cholesky2.py +++ b/benchmarks/benchmarks/npbench/bench_cholesky2.py @@ -19,10 
+19,10 @@ def _initialize(N, datatype=np.float64): A = np.empty((N, N), dtype=datatype) for i in range(N): - A[i, :i + 1] = np.fromfunction( + A[i, : i + 1] = np.fromfunction( lambda j: (-j % N) / N + 1, (i + 1,), dtype=datatype ) - A[i, i + 1:] = 0.0 + A[i, i + 1 :] = 0.0 A[i, i] = 1.0 A[:] = A @ np.transpose(A) return A diff --git a/benchmarks/benchmarks/npbench/bench_correlation.py b/benchmarks/benchmarks/npbench/bench_correlation.py index 194fcc04..ca941443 100644 --- a/benchmarks/benchmarks/npbench/bench_correlation.py +++ b/benchmarks/benchmarks/npbench/bench_correlation.py @@ -29,7 +29,9 @@ def _kernel(M, float_n, data): data /= np.sqrt(float_n) * stddev corr = np.eye(M, dtype=data.dtype) for i in range(M - 1): - corr[i + 1:M, i] = corr[i, i + 1:M] = data[:, i] @ data[:, i + 1:M] + corr[i + 1 : M, i] = corr[i, i + 1 : M] = ( + data[:, i] @ data[:, i + 1 : M] + ) return corr diff --git a/benchmarks/benchmarks/npbench/bench_gemm.py b/benchmarks/benchmarks/npbench/bench_gemm.py index f7d43f7d..c6a36fd5 100644 --- a/benchmarks/benchmarks/npbench/bench_gemm.py +++ b/benchmarks/benchmarks/npbench/bench_gemm.py @@ -1,4 +1,6 @@ -"""npbench wrapper: GEMM (general matrix-matrix multiply) — mkl_umath ops: matmul. +"""npbench wrapper: GEMM (general matrix-matrix multiply). + +mkl_umath ops: matmul. 
Preset sizes from npbench bench_info/gemm.json: M: NI=2500, NJ=2750, NK=3000 @@ -15,10 +17,16 @@ # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemm/gemm.py def _initialize(NI, NJ, NK, datatype=np.float64): alpha = datatype(1.5) - beta = datatype(1.2) - C = np.fromfunction(lambda i, j: ((i * j + 1) % NI) / NI, (NI, NJ), dtype=datatype) - A = np.fromfunction(lambda i, k: (i * (k + 1) % NK) / NK, (NI, NK), dtype=datatype) - B = np.fromfunction(lambda k, j: (k * (j + 2) % NJ) / NJ, (NK, NJ), dtype=datatype) + beta = datatype(1.2) + C = np.fromfunction( + lambda i, j: ((i * j + 1) % NI) / NI, (NI, NJ), dtype=datatype + ) + A = np.fromfunction( + lambda i, k: (i * (k + 1) % NK) / NK, (NI, NK), dtype=datatype + ) + B = np.fromfunction( + lambda k, j: (k * (j + 2) % NJ) / NJ, (NK, NJ), dtype=datatype + ) return alpha, beta, C, A, B @@ -46,7 +54,7 @@ def setup_cache(self): def setup(self, cache, preset): alpha, beta, C, A, B = cache[preset] self.alpha = alpha - self.beta = beta + self.beta = beta self.C = C.copy() # mutated in-place self.A = A self.B = B diff --git a/benchmarks/benchmarks/npbench/bench_gemver.py b/benchmarks/benchmarks/npbench/bench_gemver.py index d0ea44a6..c85313ed 100644 --- a/benchmarks/benchmarks/npbench/bench_gemver.py +++ b/benchmarks/benchmarks/npbench/bench_gemver.py @@ -1,4 +1,6 @@ -"""npbench wrapper: GEMVER (vector multiplication and matrix addition) — mkl_umath ops: outer. +"""npbench wrapper: GEMVER (vector multiplication and matrix addition). + +mkl_umath ops: outer. 
Preset sizes from npbench bench_info/gemver.json: M: N=3_000 @@ -16,15 +18,15 @@ def _initialize(N, datatype=np.float64): alpha = datatype(1.5) beta = datatype(1.2) fn = datatype(N) - A = np.fromfunction(lambda i, j: (i * j % N) / N, (N, N), dtype=datatype) + A = np.fromfunction(lambda i, j: (i * j % N) / N, (N, N), dtype=datatype) u1 = np.fromfunction(lambda i: i, (N,), dtype=datatype) u2 = np.fromfunction(lambda i: ((i + 1) / fn) / 2.0, (N,), dtype=datatype) v1 = np.fromfunction(lambda i: ((i + 1) / fn) / 4.0, (N,), dtype=datatype) v2 = np.fromfunction(lambda i: ((i + 1) / fn) / 6.0, (N,), dtype=datatype) - w = np.zeros((N,), dtype=datatype) - x = np.zeros((N,), dtype=datatype) - y = np.fromfunction(lambda i: ((i + 1) / fn) / 8.0, (N,), dtype=datatype) - z = np.fromfunction(lambda i: ((i + 1) / fn) / 9.0, (N,), dtype=datatype) + w = np.zeros((N,), dtype=datatype) + x = np.zeros((N,), dtype=datatype) + y = np.fromfunction(lambda i: ((i + 1) / fn) / 8.0, (N,), dtype=datatype) + z = np.fromfunction(lambda i: ((i + 1) / fn) / 9.0, (N,), dtype=datatype) return alpha, beta, A, u1, v1, u2, v2, w, x, y, z @@ -54,20 +56,28 @@ def setup_cache(self): def setup(self, cache, preset): alpha, beta, A, u1, v1, u2, v2, w, x, y, z = cache[preset] self.alpha = alpha - self.beta = beta - self.A = A.copy() # mutated: A += outer(u1,v1) + outer(u2,v2) + self.beta = beta + self.A = A.copy() # mutated: A += outer(u1,v1) + outer(u2,v2) self.u1 = u1 self.v1 = v1 self.u2 = u2 self.v2 = v2 - self.w = w.copy() # mutated: w += alpha * A @ x - self.x = x.copy() # mutated: x += beta * y @ A + z - self.y = y - self.z = z + self.w = w.copy() # mutated: w += alpha * A @ x + self.x = x.copy() # mutated: x += beta * y @ A + z + self.y = y + self.z = z def time_gemver(self, cache, preset): _kernel( - self.alpha, self.beta, - self.A, self.u1, self.v1, self.u2, self.v2, - self.w, self.x, self.y, self.z, + self.alpha, + self.beta, + self.A, + self.u1, + self.v1, + self.u2, + self.v2, + self.w, + self.x, 
+ self.y, + self.z, ) diff --git a/benchmarks/benchmarks/npbench/bench_gesummv.py b/benchmarks/benchmarks/npbench/bench_gesummv.py index 13bb773e..b3c02cd5 100644 --- a/benchmarks/benchmarks/npbench/bench_gesummv.py +++ b/benchmarks/benchmarks/npbench/bench_gesummv.py @@ -1,4 +1,6 @@ -"""npbench wrapper: GESUMMV (scalar, vector and matrix multiplication) — mkl_umath ops: matmul. +"""npbench wrapper: GESUMMV (scalar, vector and matrix multiplication). + +mkl_umath ops: matmul. Preset sizes from npbench bench_info/gesummv.json: M: N=4_000 @@ -13,8 +15,12 @@ def _initialize(N, datatype=np.float64): alpha = datatype(1.5) beta = datatype(1.2) - A = np.fromfunction(lambda i, j: ((i * j + 1) % N) / N, (N, N), dtype=datatype) - B = np.fromfunction(lambda i, j: ((i * j + 2) % N) / N, (N, N), dtype=datatype) + A = np.fromfunction( + lambda i, j: ((i * j + 1) % N) / N, (N, N), dtype=datatype + ) + B = np.fromfunction( + lambda i, j: ((i * j + 2) % N) / N, (N, N), dtype=datatype + ) x = np.fromfunction(lambda i: (i % N) / N, (N,), dtype=datatype) return alpha, beta, A, B, x diff --git a/benchmarks/benchmarks/npbench/bench_go_fast.py b/benchmarks/benchmarks/npbench/bench_go_fast.py index d197c540..f4dca7ef 100644 --- a/benchmarks/benchmarks/npbench/bench_go_fast.py +++ b/benchmarks/benchmarks/npbench/bench_go_fast.py @@ -16,6 +16,7 @@ # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast.py def _initialize(N): from numpy.random import default_rng + rng = default_rng(42) a = rng.random((N, N)) return (a,) @@ -37,7 +38,7 @@ def _go_fast(a): class BenchGoFastLoop: - """Original npbench kernel — diagonal Python loop calling np.tanh per element.""" + """Original npbench kernel — Python loop calling np.tanh per element.""" params = (["M", "L"],) param_names = ["preset"] diff --git a/benchmarks/benchmarks/npbench/bench_k2mm.py b/benchmarks/benchmarks/npbench/bench_k2mm.py index 6ff8cf6f..68143c6a 100644 --- a/benchmarks/benchmarks/npbench/bench_k2mm.py +++ 
b/benchmarks/benchmarks/npbench/bench_k2mm.py @@ -15,11 +15,19 @@ # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k2mm/k2mm.py def _initialize(NI, NJ, NK, NL, datatype=np.float64): alpha = datatype(1.5) - beta = datatype(1.2) - A = np.fromfunction(lambda i, j: ((i * j + 1) % NI) / NI, (NI, NK), dtype=datatype) - B = np.fromfunction(lambda i, j: (i * (j + 1) % NJ) / NJ, (NK, NJ), dtype=datatype) - C = np.fromfunction(lambda i, j: ((i * (j + 3) + 1) % NL) / NL, (NJ, NL), dtype=datatype) - D = np.fromfunction(lambda i, j: (i * (j + 2) % NK) / NK, (NI, NL), dtype=datatype) + beta = datatype(1.2) + A = np.fromfunction( + lambda i, j: ((i * j + 1) % NI) / NI, (NI, NK), dtype=datatype + ) + B = np.fromfunction( + lambda i, j: (i * (j + 1) % NJ) / NJ, (NK, NJ), dtype=datatype + ) + C = np.fromfunction( + lambda i, j: ((i * (j + 3) + 1) % NL) / NL, (NJ, NL), dtype=datatype + ) + D = np.fromfunction( + lambda i, j: (i * (j + 2) % NK) / NK, (NI, NL), dtype=datatype + ) return alpha, beta, A, B, C, D @@ -47,7 +55,7 @@ def setup_cache(self): def setup(self, cache, preset): alpha, beta, A, B, C, D = cache[preset] self.alpha = alpha - self.beta = beta + self.beta = beta self.A = A self.B = B self.C = C diff --git a/benchmarks/benchmarks/npbench/bench_k3mm.py b/benchmarks/benchmarks/npbench/bench_k3mm.py index 15e73d60..5211fdaf 100644 --- a/benchmarks/benchmarks/npbench/bench_k3mm.py +++ b/benchmarks/benchmarks/npbench/bench_k3mm.py @@ -11,10 +11,22 @@ # Inlined from spcl/npbench @ main # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k3mm/k3mm.py def _initialize(NI, NJ, NK, NL, NM, datatype=np.float64): - A = np.fromfunction(lambda i, j: ((i * j + 1) % NI) / (5 * NI), (NI, NK), dtype=datatype) - B = np.fromfunction(lambda i, j: ((i * (j + 1) + 2) % NJ) / (5 * NJ), (NK, NJ), dtype=datatype) - C = np.fromfunction(lambda i, j: (i * (j + 3) % NL) / (5 * NL), (NJ, NM), dtype=datatype) - D = np.fromfunction(lambda i, j: ((i * (j + 2) + 
2) % NK) / (5 * NK), (NM, NL), dtype=datatype) + A = np.fromfunction( + lambda i, j: ((i * j + 1) % NI) / (5 * NI), (NI, NK), dtype=datatype + ) + B = np.fromfunction( + lambda i, j: ((i * (j + 1) + 2) % NJ) / (5 * NJ), + (NK, NJ), + dtype=datatype, + ) + C = np.fromfunction( + lambda i, j: (i * (j + 3) % NL) / (5 * NL), (NJ, NM), dtype=datatype + ) + D = np.fromfunction( + lambda i, j: ((i * (j + 2) + 2) % NK) / (5 * NK), + (NM, NL), + dtype=datatype, + ) return A, B, C, D diff --git a/benchmarks/benchmarks/npbench/bench_mandelbrot.py b/benchmarks/benchmarks/npbench/bench_mandelbrot.py index 090fcd0e..47f3f14f 100644 --- a/benchmarks/benchmarks/npbench/bench_mandelbrot.py +++ b/benchmarks/benchmarks/npbench/bench_mandelbrot.py @@ -1,4 +1,6 @@ -"""npbench wrapper: Mandelbrot set (two variants) — mkl_umath ops: abs, multiply, add. +"""npbench wrapper: Mandelbrot set (two variants). + +mkl_umath ops: abs, multiply, add. Preset sizes from npbench bench_info/mandelbrot1.json and mandelbrot2.json: M: XN=YN=250/500, maxiter=150/80 @@ -13,9 +15,9 @@ import numpy as np - # --- mandelbrot1 --- + # Inlined from spcl/npbench @ main # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/mandelbrot1/mandelbrot1_numpy.py def _mandelbrot1(xmin, xmax, ymin, ymax, xn, yn, maxiter, horizon=2.0): @@ -25,15 +27,16 @@ def _mandelbrot1(xmin, xmax, ymin, ymax, xn, yn, maxiter, horizon=2.0): N = np.zeros(C.shape, dtype=np.int64) Z = np.zeros(C.shape, dtype=np.complex128) for n in range(maxiter): - I = np.less(abs(Z), horizon) - N[I] = n - Z[I] = Z[I] ** 2 + C[I] + mask = np.less(abs(Z), horizon) + N[mask] = n + Z[mask] = Z[mask] ** 2 + C[mask] N[N == maxiter - 1] = 0 return Z, N # --- mandelbrot2 --- + # Inlined from spcl/npbench @ main # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/mandelbrot2/mandelbrot2_numpy.py def _mandelbrot2(xmin, xmax, ymin, ymax, xn, yn, itermax, horizon=2.0): @@ -63,17 +66,49 @@ def _mandelbrot2(xmin, xmax, ymin, ymax, xn, yn, itermax, 
horizon=2.0): _PRESETS_M1 = { - "M": {"xmin": -1.75, "xmax": 0.25, "ymin": -1.00, "ymax": 1.00, - "xn": 250, "yn": 250, "maxiter": 150, "horizon": 2.0}, - "L": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25, - "xn": 833, "yn": 833, "maxiter": 200, "horizon": 2.0}, + "M": { + "xmin": -1.75, + "xmax": 0.25, + "ymin": -1.00, + "ymax": 1.00, + "xn": 250, + "yn": 250, + "maxiter": 150, + "horizon": 2.0, + }, + "L": { + "xmin": -2.00, + "xmax": 0.50, + "ymin": -1.25, + "ymax": 1.25, + "xn": 833, + "yn": 833, + "maxiter": 200, + "horizon": 2.0, + }, } _PRESETS_M2 = { - "M": {"xmin": -2.00, "xmax": 0.50, "ymin": -1.25, "ymax": 1.25, - "xn": 500, "yn": 500, "itermax": 80, "horizon": 2.0}, - "L": {"xmin": -2.25, "xmax": 0.75, "ymin": -1.50, "ymax": 1.50, - "xn": 1000, "yn": 1000, "itermax": 100, "horizon": 2.0}, + "M": { + "xmin": -2.00, + "xmax": 0.50, + "ymin": -1.25, + "ymax": 1.25, + "xn": 500, + "yn": 500, + "itermax": 80, + "horizon": 2.0, + }, + "L": { + "xmin": -2.25, + "xmax": 0.75, + "ymin": -1.50, + "ymax": 1.50, + "xn": 1000, + "yn": 1000, + "itermax": 100, + "horizon": 2.0, + }, } diff --git a/benchmarks/benchmarks/npbench/bench_softmax.py b/benchmarks/benchmarks/npbench/bench_softmax.py index 29a77252..5fdfe321 100644 --- a/benchmarks/benchmarks/npbench/bench_softmax.py +++ b/benchmarks/benchmarks/npbench/bench_softmax.py @@ -14,6 +14,7 @@ # https://github.com/spcl/npbench/blob/main/npbench/benchmarks/deep_learning/softmax/softmax.py def _initialize(N, H, SM): from numpy.random import default_rng + rng = default_rng(42) x = rng.random((N, H, SM, SM), dtype=np.float32) return (x,) @@ -29,7 +30,7 @@ def _softmax(x): _PRESETS = { - "M": {"N": 32, "H": 8, "SM": 256}, + "M": {"N": 32, "H": 8, "SM": 256}, "L": {"N": 64, "H": 16, "SM": 448}, } From 5da98d4d2c2f9c7432b6b5483d5a26e915865afe Mon Sep 17 00:00:00 2001 From: vchamarthi Date: Tue, 19 May 2026 10:10:51 -0500 Subject: [PATCH 4/7] Fix PR comments --- benchmarks/benchmarks/__init__.py | 8 +- 
benchmarks/benchmarks/_patch_setup.py | 19 +-- benchmarks/benchmarks/micro/bench_exp_log.py | 98 ------------- benchmarks/benchmarks/micro/bench_micro.py | 88 ++++++++++++ .../benchmarks/micro/bench_sqrt_misc.py | 84 ----------- benchmarks/benchmarks/micro/bench_trig.py | 134 ------------------ .../benchmarks/npbench/bench_cholesky2.py | 56 -------- .../benchmarks/npbench/bench_correlation.py | 60 -------- .../benchmarks/npbench/bench_covariance.py | 55 ------- .../benchmarks/npbench/bench_deriche.py | 113 --------------- .../benchmarks/npbench/bench_doitgen.py | 54 ------- benchmarks/benchmarks/npbench/bench_gemm.py | 63 -------- benchmarks/benchmarks/npbench/bench_gemver.py | 83 ----------- .../benchmarks/npbench/bench_gesummv.py | 53 ------- benchmarks/benchmarks/npbench/bench_k2mm.py | 65 --------- benchmarks/benchmarks/npbench/bench_k3mm.py | 58 -------- 16 files changed, 96 insertions(+), 995 deletions(-) delete mode 100644 benchmarks/benchmarks/micro/bench_exp_log.py create mode 100644 benchmarks/benchmarks/micro/bench_micro.py delete mode 100644 benchmarks/benchmarks/micro/bench_sqrt_misc.py delete mode 100644 benchmarks/benchmarks/micro/bench_trig.py delete mode 100644 benchmarks/benchmarks/npbench/bench_cholesky2.py delete mode 100644 benchmarks/benchmarks/npbench/bench_correlation.py delete mode 100644 benchmarks/benchmarks/npbench/bench_covariance.py delete mode 100644 benchmarks/benchmarks/npbench/bench_deriche.py delete mode 100644 benchmarks/benchmarks/npbench/bench_doitgen.py delete mode 100644 benchmarks/benchmarks/npbench/bench_gemm.py delete mode 100644 benchmarks/benchmarks/npbench/bench_gemver.py delete mode 100644 benchmarks/benchmarks/npbench/bench_gesummv.py delete mode 100644 benchmarks/benchmarks/npbench/bench_k2mm.py delete mode 100644 benchmarks/benchmarks/npbench/bench_k3mm.py diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py index bcf027e4..f7739a93 100644 --- a/benchmarks/benchmarks/__init__.py +++ 
b/benchmarks/benchmarks/__init__.py @@ -1,4 +1,4 @@ -# Trigger MKL patching once per ASV worker process. -# ASV uses --launch-method spawn in CI, so each worker is a fresh process -# and this runs exactly once before any benchmark is collected or timed. -from . import _patch_setup # noqa: F401 +from ._patch_setup import _apply_patches + +_apply_patches() +del _apply_patches \ No newline at end of file diff --git a/benchmarks/benchmarks/_patch_setup.py b/benchmarks/benchmarks/_patch_setup.py index 9aea6062..9383b1c8 100644 --- a/benchmarks/benchmarks/_patch_setup.py +++ b/benchmarks/benchmarks/_patch_setup.py @@ -3,12 +3,8 @@ Patches NumPy with Intel MKL implementations for fft, random, and umath. Hard-fails with a descriptive RuntimeError if any package is missing or the patch does not take effect, so benchmarks never silently run on stock NumPy. - -Visible output goes to stderr; pass --show-stderr to ``asv run`` to see it. """ -import sys - _PATCH_MAP = [ ("mkl_fft", "patch_numpy_fft"), ("mkl_random", "patch_numpy_random"), @@ -17,6 +13,8 @@ def _apply_patches(): + import numpy as np + patched = {} for mod_name, patch_fn_name in _PATCH_MAP: @@ -56,9 +54,6 @@ def _apply_patches(): patched[mod_name] = mod - # Verbose attribution — verify numpy-level dispatch changed hands - import numpy as np - _attr_checks = { "mkl_fft": lambda: np.fft.fft.__module__, "mkl_random": lambda: np.random.random.__module__, @@ -69,12 +64,6 @@ def _apply_patches(): attr = _attr_checks[mod_name]() except Exception: attr = "unknown" - sys.stderr.write(f"[mkl-patch] {mod_name}: numpy dispatch → {attr}\n") - - sys.stderr.write( - "[mkl-patch] ALL OK — mkl_fft, mkl_random, mkl_umath active\n" - ) - sys.stderr.flush() - + print(f"[mkl-patch] {mod_name}: numpy dispatch -> {attr}") -_apply_patches() + print("[mkl-patch] ALL OK -- mkl_fft, mkl_random, mkl_umath active") diff --git a/benchmarks/benchmarks/micro/bench_exp_log.py b/benchmarks/benchmarks/micro/bench_exp_log.py deleted file mode 100644 
index 216fc740..00000000 --- a/benchmarks/benchmarks/micro/bench_exp_log.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Micro-benchmarks for mkl_umath exponential and logarithm ufuncs. - -Each class times a single ufunc over a Cartesian product of - dtype ∈ [float32, float64] - size ∈ [10_000, 100_000, 1_000_000] - -Arrays are pre-allocated in setup() and reused across timing calls. -Patching is applied once at package import via benchmarks._patch_setup. -""" - -import numpy as np - - -class BenchExp: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - # float32 overflows exp around 88.7; use [-10, 10] safe for both dtypes - rng = np.random.default_rng(42) - self.x = rng.uniform(-10.0, 10.0, size).astype(dtype) - - def time_exp(self, dtype, size): - np.exp(self.x) - - -class BenchExp2: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - # float32 overflows exp2 around 127 - rng = np.random.default_rng(42) - self.x = rng.uniform(-10.0, 10.0, size).astype(dtype) - - def time_exp2(self, dtype, size): - np.exp2(self.x) - - -class BenchExpm1: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(-10.0, 10.0, size).astype(dtype) - - def time_expm1(self, dtype, size): - np.expm1(self.x) - - -class BenchLog: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(1e-3, 1e3, size).astype(dtype) - - def time_log(self, dtype, size): - np.log(self.x) - - -class BenchLog2: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = 
rng.uniform(1e-3, 1e3, size).astype(dtype) - - def time_log2(self, dtype, size): - np.log2(self.x) - - -class BenchLog10: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(1e-3, 1e3, size).astype(dtype) - - def time_log10(self, dtype, size): - np.log10(self.x) - - -class BenchLog1p: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - # log1p(x) is defined for x > -1; use [0, 10] which is always safe - rng = np.random.default_rng(42) - self.x = rng.uniform(0.0, 10.0, size).astype(dtype) - - def time_log1p(self, dtype, size): - np.log1p(self.x) diff --git a/benchmarks/benchmarks/micro/bench_micro.py b/benchmarks/benchmarks/micro/bench_micro.py new file mode 100644 index 00000000..381c823c --- /dev/null +++ b/benchmarks/benchmarks/micro/bench_micro.py @@ -0,0 +1,88 @@ +"""Micro-benchmarks for mkl_umath unary ufuncs. + +Times each ufunc over a Cartesian product of + dtype in [float32, float64] + size in [10_000, 100_000, 1_000_000] + +Arrays are pre-allocated in setup() and reused across timing calls. +Patching is applied once at package import via benchmarks._patch_setup. 
+""" + +import numpy as np + + +_UFUNC_CONFIGS = { + "exp": {"func": np.exp, "low": -10.0, "high": 10.0}, + "exp2": {"func": np.exp2, "low": -10.0, "high": 10.0}, + "expm1": {"func": np.expm1, "low": -10.0, "high": 10.0}, + "log": {"func": np.log, "low": 1e-3, "high": 1e3}, + "log2": {"func": np.log2, "low": 1e-3, "high": 1e3}, + "log10": {"func": np.log10, "low": 1e-3, "high": 1e3}, + "log1p": {"func": np.log1p, "low": 0.0, "high": 10.0}, + "sin": {"func": np.sin, "low": -np.pi, "high": np.pi}, + "cos": {"func": np.cos, "low": -np.pi, "high": np.pi}, + "tan": {"func": np.tan, "low": -1.4, "high": 1.4}, + "arcsin": {"func": np.arcsin, "low": -1.0, "high": 1.0}, + "arccos": {"func": np.arccos, "low": -1.0, "high": 1.0}, + "arctan": {"func": np.arctan, "low": -10.0, "high": 10.0}, + "sinh": {"func": np.sinh, "low": -5.0, "high": 5.0}, + "cosh": {"func": np.cosh, "low": -5.0, "high": 5.0}, + "tanh": {"func": np.tanh, "low": -5.0, "high": 5.0}, + "arcsinh": {"func": np.arcsinh, "low": -10.0, "high": 10.0}, + "arccosh": {"func": np.arccosh, "low": 1.0, "high": 100.0}, + "arctanh": {"func": np.arctanh, "low": -0.99, "high": 0.99}, + "sqrt": {"func": np.sqrt, "low": 0.0, "high": 100.0}, + "cbrt": {"func": np.cbrt, "low": -100.0, "high": 100.0}, + "square": {"func": np.square, "low": -10.0, "high": 10.0}, + "fabs": {"func": np.fabs, "low": -100.0, "high": 100.0}, + "absolute": {"func": np.absolute, "low": -100.0, "high": 100.0}, + "reciprocal": {"func": np.reciprocal, "low": 0.01, "high": 100.0}, +} + + +class BenchMicro: + params = ( + sorted(_UFUNC_CONFIGS.keys()), + ["float32", "float64"], + [10_000, 100_000, 1_000_000], + ) + param_names = ["ufunc", "dtype", "size"] + + def setup(self, ufunc, dtype, size): + cfg = _UFUNC_CONFIGS[ufunc] + rng = np.random.default_rng(42) + self.x = rng.uniform(cfg["low"], cfg["high"], size).astype(dtype) + self._func = cfg["func"] + + def time_micro(self, ufunc, dtype, size): + self._func(self.x) + + +class BenchArctan2: + """Binary 
ufunc arctan2""" + + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.y = rng.uniform(-1.0, 1.0, size).astype(dtype) + self.x = rng.uniform(-1.0, 1.0, size).astype(dtype) + + def time_arctan2(self, dtype, size): + np.arctan2(self.y, self.x) + + +class BenchPower: + """Binary ufunc power (arbitrary exponent via MKL vdPow)""" + + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.base = rng.uniform(0.1, 10.0, size).astype(dtype) + self.exp = rng.uniform(0.5, 3.0, size).astype(dtype) + + def time_power(self, dtype, size): + np.power(self.base, self.exp) diff --git a/benchmarks/benchmarks/micro/bench_sqrt_misc.py b/benchmarks/benchmarks/micro/bench_sqrt_misc.py deleted file mode 100644 index b1170639..00000000 --- a/benchmarks/benchmarks/micro/bench_sqrt_misc.py +++ /dev/null @@ -1,84 +0,0 @@ -"""Micro-benchmarks for mkl_umath sqrt, cbrt, and miscellaneous ufuncs. - -Each class times a single ufunc over a Cartesian product of - dtype ∈ [float32, float64] - size ∈ [10_000, 100_000, 1_000_000] - -Arrays are pre-allocated in setup() and reused across timing calls. -Patching is applied once at package import via benchmarks._patch_setup. 
-""" - -import numpy as np - - -class BenchSqrt: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(0.0, 100.0, size).astype(dtype) - - def time_sqrt(self, dtype, size): - np.sqrt(self.x) - - -class BenchCbrt: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(-100.0, 100.0, size).astype(dtype) - - def time_cbrt(self, dtype, size): - np.cbrt(self.x) - - -class BenchSquare: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(-10.0, 10.0, size).astype(dtype) - - def time_square(self, dtype, size): - np.square(self.x) - - -class BenchFabs: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(-100.0, 100.0, size).astype(dtype) - - def time_fabs(self, dtype, size): - np.fabs(self.x) - - -class BenchAbsolute: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(-100.0, 100.0, size).astype(dtype) - - def time_absolute(self, dtype, size): - np.absolute(self.x) - - -class BenchReciprocal: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - # Avoid values near zero to prevent inf results dominating timing - rng = np.random.default_rng(42) - self.x = rng.uniform(0.01, 100.0, size).astype(dtype) - - def time_reciprocal(self, dtype, size): - np.reciprocal(self.x) diff --git 
a/benchmarks/benchmarks/micro/bench_trig.py b/benchmarks/benchmarks/micro/bench_trig.py deleted file mode 100644 index eb09b9c6..00000000 --- a/benchmarks/benchmarks/micro/bench_trig.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Micro-benchmarks for mkl_umath trigonometric ufuncs. - -Each class times a single ufunc over a Cartesian product of - dtype ∈ [float32, float64] - size ∈ [10_000, 100_000, 1_000_000] - -Arrays are pre-allocated in setup() and reused across timing calls. -Patching is applied once at package import via benchmarks._patch_setup. -""" - -import numpy as np - - -class BenchSin: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(-np.pi, np.pi, size).astype(dtype) - - def time_sin(self, dtype, size): - np.sin(self.x) - - -class BenchCos: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(-np.pi, np.pi, size).astype(dtype) - - def time_cos(self, dtype, size): - np.cos(self.x) - - -class BenchTan: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - # Avoid values near ±π/2 where tan diverges - rng = np.random.default_rng(42) - self.x = rng.uniform(-1.4, 1.4, size).astype(dtype) - - def time_tan(self, dtype, size): - np.tan(self.x) - - -class BenchArcsin: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(-1.0, 1.0, size).astype(dtype) - - def time_arcsin(self, dtype, size): - np.arcsin(self.x) - - -class BenchArccos: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = 
np.random.default_rng(42) - self.x = rng.uniform(-1.0, 1.0, size).astype(dtype) - - def time_arccos(self, dtype, size): - np.arccos(self.x) - - -class BenchArctan: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(-10.0, 10.0, size).astype(dtype) - - def time_arctan(self, dtype, size): - np.arctan(self.x) - - -class BenchArctan2: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.y = rng.uniform(-1.0, 1.0, size).astype(dtype) - self.x = rng.uniform(-1.0, 1.0, size).astype(dtype) - - def time_arctan2(self, dtype, size): - np.arctan2(self.y, self.x) - - -class BenchSinh: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - # float32 overflows sinh around ±89; keep well inside that - rng = np.random.default_rng(42) - self.x = rng.uniform(-5.0, 5.0, size).astype(dtype) - - def time_sinh(self, dtype, size): - np.sinh(self.x) - - -class BenchCosh: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(-5.0, 5.0, size).astype(dtype) - - def time_cosh(self, dtype, size): - np.cosh(self.x) - - -class BenchTanh: - params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) - param_names = ["dtype", "size"] - - def setup(self, dtype, size): - rng = np.random.default_rng(42) - self.x = rng.uniform(-5.0, 5.0, size).astype(dtype) - - def time_tanh(self, dtype, size): - np.tanh(self.x) diff --git a/benchmarks/benchmarks/npbench/bench_cholesky2.py b/benchmarks/benchmarks/npbench/bench_cholesky2.py deleted file mode 100644 index ae19443f..00000000 --- a/benchmarks/benchmarks/npbench/bench_cholesky2.py 
+++ /dev/null @@ -1,56 +0,0 @@ -"""npbench wrapper: Cholesky decomposition v2 — mkl_umath ops: linalg.cholesky. - -Preset sizes from npbench bench_info/cholesky2.json: - M: N=2200 - L: N=8000 - -The kernel mutates A in-place (A[:] = cholesky(A) + triu(A, k=1)), so -setup() copies A from cache before each timing round. - -The initialization constructs a symmetric positive-definite matrix via A @ A^T, -which is expensive at N=8000. setup_cache() runs this once per commit. -""" - -import numpy as np - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/cholesky2/cholesky2.py -def _initialize(N, datatype=np.float64): - A = np.empty((N, N), dtype=datatype) - for i in range(N): - A[i, : i + 1] = np.fromfunction( - lambda j: (-j % N) / N + 1, (i + 1,), dtype=datatype - ) - A[i, i + 1 :] = 0.0 - A[i, i] = 1.0 - A[:] = A @ np.transpose(A) - return A - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/cholesky2/cholesky2_numpy.py -def _kernel(A): - A[:] = np.linalg.cholesky(A) + np.triu(A, k=1) - - -_PRESETS = { - "M": {"N": 2200}, - "L": {"N": 8000}, -} - - -class BenchCholesky2: - params = (["M", "L"],) - param_names = ["preset"] - number = 1 - repeat = 20 - - def setup_cache(self): - return {p: _initialize(**kw) for p, kw in _PRESETS.items()} - - def setup(self, cache, preset): - self.A = cache[preset].copy() # kernel mutates A in-place - - def time_cholesky2(self, cache, preset): - _kernel(self.A) diff --git a/benchmarks/benchmarks/npbench/bench_correlation.py b/benchmarks/benchmarks/npbench/bench_correlation.py deleted file mode 100644 index ca941443..00000000 --- a/benchmarks/benchmarks/npbench/bench_correlation.py +++ /dev/null @@ -1,60 +0,0 @@ -"""npbench wrapper: Correlation — mkl_umath ops: sqrt, std, mean. 
- -Preset sizes from npbench bench_info/correlation.json: - M: M=1400, N=1800 - L: M=3200, N=4000 - -The kernel mutates ``data`` in-place (data -= mean; data /= ...), so -setup() copies from the cache before each timing round. -""" - -import numpy as np - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/correlation/correlation.py -def _initialize(M, N, datatype=np.float64): - float_n = datatype(N) - data = np.fromfunction(lambda i, j: (i * j) / M + i, (N, M), dtype=datatype) - return float_n, data - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/correlation/correlation_numpy.py -def _kernel(M, float_n, data): - mean = np.mean(data, axis=0) - stddev = np.std(data, axis=0) - stddev[stddev <= 0.1] = 1.0 - data -= mean - data /= np.sqrt(float_n) * stddev - corr = np.eye(M, dtype=data.dtype) - for i in range(M - 1): - corr[i + 1 : M, i] = corr[i, i + 1 : M] = ( - data[:, i] @ data[:, i + 1 : M] - ) - return corr - - -_PRESETS = { - "M": {"M": 1400, "N": 1800}, - "L": {"M": 3200, "N": 4000}, -} - - -class BenchCorrelation: - params = (["M", "L"],) - param_names = ["preset"] - number = 1 - repeat = 20 - - def setup_cache(self): - return {p: _initialize(**kw) for p, kw in _PRESETS.items()} - - def setup(self, cache, preset): - float_n, data = cache[preset] - self.M = _PRESETS[preset]["M"] - self.float_n = float_n - self.data = data.copy() # kernel mutates data in-place - - def time_correlation(self, cache, preset): - _kernel(self.M, self.float_n, self.data) diff --git a/benchmarks/benchmarks/npbench/bench_covariance.py b/benchmarks/benchmarks/npbench/bench_covariance.py deleted file mode 100644 index b85b3191..00000000 --- a/benchmarks/benchmarks/npbench/bench_covariance.py +++ /dev/null @@ -1,55 +0,0 @@ -"""npbench wrapper: Covariance — mkl_umath ops: mean. 
- -Preset sizes from npbench bench_info/covariance.json: - M: M=1400, N=1800 - L: M=3200, N=4000 - -The kernel mutates ``data`` in-place (data -= mean), so setup() copies -from the cache before each timing round. -""" - -import numpy as np - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/covariance/covariance.py -def _initialize(M, N, datatype=np.float64): - float_n = datatype(N) - data = np.fromfunction(lambda i, j: (i * j) / M, (N, M), dtype=datatype) - return float_n, data - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/covariance/covariance_numpy.py -def _kernel(M, float_n, data): - mean = np.mean(data, axis=0) - data -= mean - cov = np.zeros((M, M), dtype=data.dtype) - for i in range(M): - cov[i:M, i] = cov[i, i:M] = data[:, i] @ data[:, i:M] / (float_n - 1.0) - return cov - - -_PRESETS = { - "M": {"M": 1400, "N": 1800}, - "L": {"M": 3200, "N": 4000}, -} - - -class BenchCovariance: - params = (["M", "L"],) - param_names = ["preset"] - number = 1 - repeat = 20 - - def setup_cache(self): - return {p: _initialize(**kw) for p, kw in _PRESETS.items()} - - def setup(self, cache, preset): - float_n, data = cache[preset] - self.M = _PRESETS[preset]["M"] - self.float_n = float_n - self.data = data.copy() # kernel mutates data in-place - - def time_covariance(self, cache, preset): - _kernel(self.M, self.float_n, self.data) diff --git a/benchmarks/benchmarks/npbench/bench_deriche.py b/benchmarks/benchmarks/npbench/bench_deriche.py deleted file mode 100644 index 4539053d..00000000 --- a/benchmarks/benchmarks/npbench/bench_deriche.py +++ /dev/null @@ -1,113 +0,0 @@ -"""npbench wrapper: Deriche Edge Detector — mkl_umath ops: exp. - -Preset sizes from npbench bench_info/deriche.json: - M: W=1500, H=1000 - L: W=6000, H=3000 - -Warning: this kernel contains Python for-loops over rows/columns. 
-At the L preset the Python loops dominate runtime; exp() calls on scalar -floats are measured, not vectorised MKL VM throughput. The L preset is -included for historical comparability with npbench runs. -""" - -import numpy as np - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/deriche/deriche.py -def _initialize(W, H, datatype=np.float64): - alpha = datatype(0.25) - imgIn = np.fromfunction( - lambda i, j: ((313 * i + 991 * j) % 65536) / 65535.0, - (W, H), - dtype=datatype, - ) - return alpha, imgIn - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/deriche/deriche_numpy.py -def _kernel(alpha, imgIn): - k = ( - (1.0 - np.exp(-alpha)) - * (1.0 - np.exp(-alpha)) - / (1.0 + alpha * np.exp(-alpha) - np.exp(2.0 * alpha)) - ) - a1 = a5 = k - a2 = a6 = k * np.exp(-alpha) * (alpha - 1.0) - a3 = a7 = k * np.exp(-alpha) * (alpha + 1.0) - a4 = a8 = -k * np.exp(-2.0 * alpha) - b1 = 2.0 ** (-alpha) - b2 = -np.exp(-2.0 * alpha) - c1 = c2 = 1 - - y1 = np.empty_like(imgIn) - y1[:, 0] = a1 * imgIn[:, 0] - y1[:, 1] = a1 * imgIn[:, 1] + a2 * imgIn[:, 0] + b1 * y1[:, 0] - for j in range(2, imgIn.shape[1]): - y1[:, j] = ( - a1 * imgIn[:, j] - + a2 * imgIn[:, j - 1] - + b1 * y1[:, j - 1] - + b2 * y1[:, j - 2] - ) - - y2 = np.empty_like(imgIn) - y2[:, -1] = 0.0 - y2[:, -2] = a3 * imgIn[:, -1] - for j in range(imgIn.shape[1] - 3, -1, -1): - y2[:, j] = ( - a3 * imgIn[:, j + 1] - + a4 * imgIn[:, j + 2] - + b1 * y2[:, j + 1] - + b2 * y2[:, j + 2] - ) - - imgOut = c1 * (y1 + y2) - - y1[0, :] = a5 * imgOut[0, :] - y1[1, :] = a5 * imgOut[1, :] + a6 * imgOut[0, :] + b1 * y1[0, :] - for i in range(2, imgIn.shape[0]): - y1[i, :] = ( - a5 * imgOut[i, :] - + a6 * imgOut[i - 1, :] - + b1 * y1[i - 1, :] - + b2 * y1[i - 2, :] - ) - - y2[-1, :] = 0.0 - y2[-2, :] = a7 * imgOut[-1, :] - for i in range(imgIn.shape[0] - 3, -1, -1): - y2[i, :] = ( - a7 * imgOut[i + 1, :] - + a8 * 
imgOut[i + 2, :] - + b1 * y2[i + 1, :] - + b2 * y2[i + 2, :] - ) - - return c2 * (y1 + y2) - - -_PRESETS = { - "M": {"W": 1500, "H": 1000}, - "L": {"W": 6000, "H": 3000}, -} - - -class BenchDeriche: - # L preset has Python loops over 6000 rows — allow extra time - timeout = 600 - - params = (["M", "L"],) - param_names = ["preset"] - number = 1 - repeat = 20 - - def setup_cache(self): - return {p: _initialize(**kw) for p, kw in _PRESETS.items()} - - def setup(self, cache, preset): - self.alpha, self.imgIn = cache[preset] - - def time_deriche(self, cache, preset): - _kernel(self.alpha, self.imgIn) diff --git a/benchmarks/benchmarks/npbench/bench_doitgen.py b/benchmarks/benchmarks/npbench/bench_doitgen.py deleted file mode 100644 index eb255bae..00000000 --- a/benchmarks/benchmarks/npbench/bench_doitgen.py +++ /dev/null @@ -1,54 +0,0 @@ -"""npbench wrapper: Doitgen (multiresolution analysis) — mkl_umath ops: matmul. - -Preset sizes from npbench bench_info/doitgen.json: - M: NR=110, NQ=125, NP=256 - L: NR=220, NQ=250, NP=512 - -The kernel mutates ``A`` in-place (A[:] = ...), so setup() copies from cache. 
-""" - -import numpy as np - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/doitgen/doitgen.py -def _initialize(NR, NQ, NP, datatype=np.float64): - A = np.fromfunction( - lambda i, j, k: ((i * j + k) % NP) / NP, (NR, NQ, NP), dtype=datatype - ) - C4 = np.fromfunction( - lambda i, j: (i * j % NP) / NP, (NP, NP), dtype=datatype - ) - return A, C4 - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/doitgen/doitgen_numpy.py -def _kernel(NR, NQ, NP, A, C4): - A[:] = np.reshape(np.reshape(A, (NR, NQ, 1, NP)) @ C4, (NR, NQ, NP)) - - -_PRESETS = { - "M": {"NR": 110, "NQ": 125, "NP": 256}, - "L": {"NR": 220, "NQ": 250, "NP": 512}, -} - - -class BenchDoitgen: - params = (["M", "L"],) - param_names = ["preset"] - number = 1 - repeat = 20 - - def setup_cache(self): - return {p: _initialize(**kw) for p, kw in _PRESETS.items()} - - def setup(self, cache, preset): - A, C4 = cache[preset] - p = _PRESETS[preset] - self.NR, self.NQ, self.NP = p["NR"], p["NQ"], p["NP"] - self.A = A.copy() # kernel mutates A in-place - self.C4 = C4 - - def time_doitgen(self, cache, preset): - _kernel(self.NR, self.NQ, self.NP, self.A, self.C4) diff --git a/benchmarks/benchmarks/npbench/bench_gemm.py b/benchmarks/benchmarks/npbench/bench_gemm.py deleted file mode 100644 index c6a36fd5..00000000 --- a/benchmarks/benchmarks/npbench/bench_gemm.py +++ /dev/null @@ -1,63 +0,0 @@ -"""npbench wrapper: GEMM (general matrix-matrix multiply). - -mkl_umath ops: matmul. - -Preset sizes from npbench bench_info/gemm.json: - M: NI=2500, NJ=2750, NK=3000 - L: NI=7000, NJ=7500, NK=8000 - -The kernel mutates C in-place (C[:] = alpha * A @ B + beta * C), so -setup() copies C from cache before each timing round. 
-""" - -import numpy as np - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemm/gemm.py -def _initialize(NI, NJ, NK, datatype=np.float64): - alpha = datatype(1.5) - beta = datatype(1.2) - C = np.fromfunction( - lambda i, j: ((i * j + 1) % NI) / NI, (NI, NJ), dtype=datatype - ) - A = np.fromfunction( - lambda i, k: (i * (k + 1) % NK) / NK, (NI, NK), dtype=datatype - ) - B = np.fromfunction( - lambda k, j: (k * (j + 2) % NJ) / NJ, (NK, NJ), dtype=datatype - ) - return alpha, beta, C, A, B - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemm/gemm_numpy.py -def _kernel(alpha, beta, C, A, B): - C[:] = alpha * A @ B + beta * C - - -_PRESETS = { - "M": {"NI": 2500, "NJ": 2750, "NK": 3000}, - "L": {"NI": 7000, "NJ": 7500, "NK": 8000}, -} - - -class BenchGemm: - params = (["M", "L"],) - param_names = ["preset"] - number = 1 - repeat = 20 - - def setup_cache(self): - return {p: _initialize(**kw) for p, kw in _PRESETS.items()} - - def setup(self, cache, preset): - alpha, beta, C, A, B = cache[preset] - self.alpha = alpha - self.beta = beta - self.C = C.copy() # mutated in-place - self.A = A - self.B = B - - def time_gemm(self, cache, preset): - _kernel(self.alpha, self.beta, self.C, self.A, self.B) diff --git a/benchmarks/benchmarks/npbench/bench_gemver.py b/benchmarks/benchmarks/npbench/bench_gemver.py deleted file mode 100644 index c85313ed..00000000 --- a/benchmarks/benchmarks/npbench/bench_gemver.py +++ /dev/null @@ -1,83 +0,0 @@ -"""npbench wrapper: GEMVER (vector multiplication and matrix addition). - -mkl_umath ops: outer. - -Preset sizes from npbench bench_info/gemver.json: - M: N=3_000 - L: N=10_000 - -The kernel mutates A, x, and w in-place, so setup() copies those from cache. 
-""" - -import numpy as np - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemver/gemver.py -def _initialize(N, datatype=np.float64): - alpha = datatype(1.5) - beta = datatype(1.2) - fn = datatype(N) - A = np.fromfunction(lambda i, j: (i * j % N) / N, (N, N), dtype=datatype) - u1 = np.fromfunction(lambda i: i, (N,), dtype=datatype) - u2 = np.fromfunction(lambda i: ((i + 1) / fn) / 2.0, (N,), dtype=datatype) - v1 = np.fromfunction(lambda i: ((i + 1) / fn) / 4.0, (N,), dtype=datatype) - v2 = np.fromfunction(lambda i: ((i + 1) / fn) / 6.0, (N,), dtype=datatype) - w = np.zeros((N,), dtype=datatype) - x = np.zeros((N,), dtype=datatype) - y = np.fromfunction(lambda i: ((i + 1) / fn) / 8.0, (N,), dtype=datatype) - z = np.fromfunction(lambda i: ((i + 1) / fn) / 9.0, (N,), dtype=datatype) - return alpha, beta, A, u1, v1, u2, v2, w, x, y, z - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gemver/gemver_numpy.py -def _kernel(alpha, beta, A, u1, v1, u2, v2, w, x, y, z): - A += np.outer(u1, v1) + np.outer(u2, v2) - x += beta * y @ A + z - w += alpha * A @ x - - -_PRESETS = { - "M": {"N": 3_000}, - "L": {"N": 10_000}, -} - - -class BenchGemver: - params = (["M", "L"],) - param_names = ["preset"] - number = 1 - repeat = 20 - - def setup_cache(self): - return {p: _initialize(**kw) for p, kw in _PRESETS.items()} - - def setup(self, cache, preset): - alpha, beta, A, u1, v1, u2, v2, w, x, y, z = cache[preset] - self.alpha = alpha - self.beta = beta - self.A = A.copy() # mutated: A += outer(u1,v1) + outer(u2,v2) - self.u1 = u1 - self.v1 = v1 - self.u2 = u2 - self.v2 = v2 - self.w = w.copy() # mutated: w += alpha * A @ x - self.x = x.copy() # mutated: x += beta * y @ A + z - self.y = y - self.z = z - - def time_gemver(self, cache, preset): - _kernel( - self.alpha, - self.beta, - self.A, - self.u1, - self.v1, - self.u2, - self.v2, - self.w, - self.x, - 
self.y, - self.z, - ) diff --git a/benchmarks/benchmarks/npbench/bench_gesummv.py b/benchmarks/benchmarks/npbench/bench_gesummv.py deleted file mode 100644 index b3c02cd5..00000000 --- a/benchmarks/benchmarks/npbench/bench_gesummv.py +++ /dev/null @@ -1,53 +0,0 @@ -"""npbench wrapper: GESUMMV (scalar, vector and matrix multiplication). - -mkl_umath ops: matmul. - -Preset sizes from npbench bench_info/gesummv.json: - M: N=4_000 - L: N=14_000 -""" - -import numpy as np - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gesummv/gesummv.py -def _initialize(N, datatype=np.float64): - alpha = datatype(1.5) - beta = datatype(1.2) - A = np.fromfunction( - lambda i, j: ((i * j + 1) % N) / N, (N, N), dtype=datatype - ) - B = np.fromfunction( - lambda i, j: ((i * j + 2) % N) / N, (N, N), dtype=datatype - ) - x = np.fromfunction(lambda i: (i % N) / N, (N,), dtype=datatype) - return alpha, beta, A, B, x - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/gesummv/gesummv_numpy.py -def _kernel(alpha, beta, A, B, x): - return alpha * A @ x + beta * B @ x - - -_PRESETS = { - "M": {"N": 4_000}, - "L": {"N": 14_000}, -} - - -class BenchGesummv: - params = (["M", "L"],) - param_names = ["preset"] - number = 1 - repeat = 20 - - def setup_cache(self): - return {p: _initialize(**kw) for p, kw in _PRESETS.items()} - - def setup(self, cache, preset): - self.alpha, self.beta, self.A, self.B, self.x = cache[preset] - - def time_gesummv(self, cache, preset): - _kernel(self.alpha, self.beta, self.A, self.B, self.x) diff --git a/benchmarks/benchmarks/npbench/bench_k2mm.py b/benchmarks/benchmarks/npbench/bench_k2mm.py deleted file mode 100644 index 68143c6a..00000000 --- a/benchmarks/benchmarks/npbench/bench_k2mm.py +++ /dev/null @@ -1,65 +0,0 @@ -"""npbench wrapper: 2MM (two matrix multiplications) — mkl_umath ops: matmul. 
- -Preset sizes from npbench bench_info/k2mm.json: - M: NI=2000, NJ=2250, NK=2500, NL=2750 - L: NI=6000, NJ=6500, NK=7000, NL=7500 - -The kernel mutates D in-place (D[:] = alpha * A @ B @ C + beta * D), so -setup() copies D from cache before each timing round. -""" - -import numpy as np - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k2mm/k2mm.py -def _initialize(NI, NJ, NK, NL, datatype=np.float64): - alpha = datatype(1.5) - beta = datatype(1.2) - A = np.fromfunction( - lambda i, j: ((i * j + 1) % NI) / NI, (NI, NK), dtype=datatype - ) - B = np.fromfunction( - lambda i, j: (i * (j + 1) % NJ) / NJ, (NK, NJ), dtype=datatype - ) - C = np.fromfunction( - lambda i, j: ((i * (j + 3) + 1) % NL) / NL, (NJ, NL), dtype=datatype - ) - D = np.fromfunction( - lambda i, j: (i * (j + 2) % NK) / NK, (NI, NL), dtype=datatype - ) - return alpha, beta, A, B, C, D - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k2mm/k2mm_numpy.py -def _kernel(alpha, beta, A, B, C, D): - D[:] = alpha * A @ B @ C + beta * D - - -_PRESETS = { - "M": {"NI": 2000, "NJ": 2250, "NK": 2500, "NL": 2750}, - "L": {"NI": 6000, "NJ": 6500, "NK": 7000, "NL": 7500}, -} - - -class BenchK2mm: - params = (["M", "L"],) - param_names = ["preset"] - number = 1 - repeat = 20 - - def setup_cache(self): - return {p: _initialize(**kw) for p, kw in _PRESETS.items()} - - def setup(self, cache, preset): - alpha, beta, A, B, C, D = cache[preset] - self.alpha = alpha - self.beta = beta - self.A = A - self.B = B - self.C = C - self.D = D.copy() # mutated in-place - - def time_k2mm(self, cache, preset): - _kernel(self.alpha, self.beta, self.A, self.B, self.C, self.D) diff --git a/benchmarks/benchmarks/npbench/bench_k3mm.py b/benchmarks/benchmarks/npbench/bench_k3mm.py deleted file mode 100644 index 5211fdaf..00000000 --- a/benchmarks/benchmarks/npbench/bench_k3mm.py +++ /dev/null @@ -1,58 +0,0 @@ 
-"""npbench wrapper: 3MM (three matrix multiplications) — mkl_umath ops: matmul. - -Preset sizes from npbench bench_info/k3mm.json: - M: NI=2000, NJ=2200, NK=2400, NL=2600, NM=2800 - L: NI=5500, NJ=6000, NK=6500, NL=7000, NM=7500 -""" - -import numpy as np - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k3mm/k3mm.py -def _initialize(NI, NJ, NK, NL, NM, datatype=np.float64): - A = np.fromfunction( - lambda i, j: ((i * j + 1) % NI) / (5 * NI), (NI, NK), dtype=datatype - ) - B = np.fromfunction( - lambda i, j: ((i * (j + 1) + 2) % NJ) / (5 * NJ), - (NK, NJ), - dtype=datatype, - ) - C = np.fromfunction( - lambda i, j: (i * (j + 3) % NL) / (5 * NL), (NJ, NM), dtype=datatype - ) - D = np.fromfunction( - lambda i, j: ((i * (j + 2) + 2) % NK) / (5 * NK), - (NM, NL), - dtype=datatype, - ) - return A, B, C, D - - -# Inlined from spcl/npbench @ main -# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/polybench/k3mm/k3mm_numpy.py -def _kernel(A, B, C, D): - return A @ B @ C @ D - - -_PRESETS = { - "M": {"NI": 2000, "NJ": 2200, "NK": 2400, "NL": 2600, "NM": 2800}, - "L": {"NI": 5500, "NJ": 6000, "NK": 6500, "NL": 7000, "NM": 7500}, -} - - -class BenchK3mm: - params = (["M", "L"],) - param_names = ["preset"] - number = 1 - repeat = 20 - - def setup_cache(self): - return {p: _initialize(**kw) for p, kw in _PRESETS.items()} - - def setup(self, cache, preset): - self.A, self.B, self.C, self.D = cache[preset] - - def time_k3mm(self, cache, preset): - _kernel(self.A, self.B, self.C, self.D) From 8555438c08e0739509ed535490170c2acdf286f0 Mon Sep 17 00:00:00 2001 From: vchamarthi Date: Wed, 20 May 2026 13:40:39 -0500 Subject: [PATCH 5/7] PR fixes --- benchmarks/README.md | 34 +++++++++++++++++ benchmarks/benchmarks/__init__.py | 43 +++++++++++++++++++++- benchmarks/benchmarks/micro/bench_micro.py | 1 - 3 files changed, 76 insertions(+), 2 deletions(-) create mode 100644 benchmarks/README.md diff --git 
a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 00000000..2016345a --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,34 @@ +# mkl_umath ASV Benchmarks + +Performance benchmarks for [mkl_umath](https://github.com/IntelPython/mkl_umath) using [Airspeed Velocity (ASV)](https://asv.readthedocs.io/en/stable/). + +The `npbench/` suite uses kernels from [npbench](https://github.com/spcl/npbench) to measure end-to-end impact of MKL ufunc acceleration in realistic workloads. + +### Coverage + +| File | Ufuncs | Dtypes | Sizes/Presets | +|------|--------|--------|---------------| +| `micro/bench_micro.py` | 24 unary (`exp`, `log`, `sin`, `cos`, `sqrt`, `cbrt`, etc.) + `arctan2`, `power` | float32, float64 | 10k, 100k, 1M | +| `npbench/bench_softmax.py` | `exp`, `max`, `sum` | float32 | M (32x8x256x256), L (64x16x448x448) | +| `npbench/bench_arc_distance.py` | `sin`, `cos`, `arctan2`, `sqrt` | float64 | M (1M), L (10M) | +| `npbench/bench_go_fast.py` | `tanh` | float64 | M (6k x 6k), L (20k x 20k) | +| `npbench/bench_mandelbrot.py` | `abs`, `multiply`, `add` | complex128 | M (250/500), L (833/1000) | + +## Threading + +Set `MKL_NUM_THREADS` in the environment before running ASV to control the thread count used by MKL: + +```bash +MKL_NUM_THREADS=8 asv run --python=same --quick HEAD^! +``` + +If `MKL_NUM_THREADS` is not set, `__init__.py` applies a default: **4** threads when the machine has 4 or more physical cores, or **1** (single-threaded) otherwise. This keeps results comparable across CI machines in the shared pool regardless of their total core count. Physical cores are detected via `psutil.cpu_count(logical=False)` (hyperthreads excluded per MKL recommendation). + +## Quick Start + +```bash +cd benchmarks +asv run --python=same --quick HEAD^! 
# time the current commit +asv compare main HEAD # compare against main +asv publish && asv preview # view HTML report locally +``` diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py index f7739a93..dd9cbfb1 100644 --- a/benchmarks/benchmarks/__init__.py +++ b/benchmarks/benchmarks/__init__.py @@ -1,4 +1,45 @@ +"""ASV benchmarks for mkl_umath. + +Thread control — design rationale +---------------------------------- +Since we do not have a dedicated CI benchmark machine, benchmarks run +on a shared CI pool whose machines vary in core count over time. +Using the full physical core count of each machine would make results +incomparable across runs on different machines. + +Strategy: + - Physical cores >= 4 → fix MKL_NUM_THREADS = 4 + 4 is the lowest common denominator that guarantees multi-threaded MKL + behavior and is achievable on any modern CI machine. Results from + different machines in the pool are therefore directly comparable. + - Physical cores < 4 → fall back to MKL_NUM_THREADS = 1 (single-threaded) + Prevents over-subscription on under-resourced machines and avoids + misleading comparisons against 4-thread baselines. + +MKL recommendation: use physical cores, not logical (hyperthreaded) CPUs. 
+""" + +import os + +import psutil + from ._patch_setup import _apply_patches +_MIN_THREADS = 4 # minimum physical cores required for multi-threaded mode + + +def _physical_cores(): + """Return physical core count; fall back to 1 (conservative).""" + return psutil.cpu_count(logical=False) or 1 + + +def _thread_count(): + physical = _physical_cores() + return str(_MIN_THREADS) if physical >= _MIN_THREADS else "1" + + +_THREADS = os.environ.get("MKL_NUM_THREADS", _thread_count()) +os.environ["MKL_NUM_THREADS"] = _THREADS + _apply_patches() -del _apply_patches \ No newline at end of file +del _apply_patches diff --git a/benchmarks/benchmarks/micro/bench_micro.py b/benchmarks/benchmarks/micro/bench_micro.py index 381c823c..1d6e4bb8 100644 --- a/benchmarks/benchmarks/micro/bench_micro.py +++ b/benchmarks/benchmarks/micro/bench_micro.py @@ -10,7 +10,6 @@ import numpy as np - _UFUNC_CONFIGS = { "exp": {"func": np.exp, "low": -10.0, "high": 10.0}, "exp2": {"func": np.exp2, "low": -10.0, "high": 10.0}, From aaf93842f27e07658ed9762acdbd95b978eb5349 Mon Sep 17 00:00:00 2001 From: vchamarthi Date: Wed, 20 May 2026 13:43:21 -0500 Subject: [PATCH 6/7] PR suggestions --- benchmarks/benchmarks/__init__.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py index dd9cbfb1..665b2e16 100644 --- a/benchmarks/benchmarks/__init__.py +++ b/benchmarks/benchmarks/__init__.py @@ -1,23 +1,4 @@ -"""ASV benchmarks for mkl_umath. - -Thread control — design rationale ----------------------------------- -Since we do not have a dedicated CI benchmark machine, benchmarks run -on a shared CI pool whose machines vary in core count over time. -Using the full physical core count of each machine would make results -incomparable across runs on different machines. 
- -Strategy: - - Physical cores >= 4 → fix MKL_NUM_THREADS = 4 - 4 is the lowest common denominator that guarantees multi-threaded MKL - behavior and is achievable on any modern CI machine. Results from - different machines in the pool are therefore directly comparable. - - Physical cores < 4 → fall back to MKL_NUM_THREADS = 1 (single-threaded) - Prevents over-subscription on under-resourced machines and avoids - misleading comparisons against 4-thread baselines. - -MKL recommendation: use physical cores, not logical (hyperthreaded) CPUs. -""" +"""ASV benchmarks for mkl_umath""" import os From f07adb36892eb00a254de3023278e9df8130adc8 Mon Sep 17 00:00:00 2001 From: vchamarthi Date: Wed, 20 May 2026 15:23:56 -0500 Subject: [PATCH 7/7] Improve readme --- benchmarks/README.md | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 2016345a..2da9bf01 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -14,21 +14,39 @@ The `npbench/` suite uses kernels from [npbench](https://github.com/spcl/npbench | `npbench/bench_go_fast.py` | `tanh` | float64 | M (6k x 6k), L (20k x 20k) | | `npbench/bench_mandelbrot.py` | `abs`, `multiply`, `add` | complex128 | M (250/500), L (833/1000) | -## Threading +## Running Benchmarks -Set `MKL_NUM_THREADS` in the environment before running ASV to control the thread count used by MKL: +Prerequisites: ```bash -MKL_NUM_THREADS=8 asv run --python=same --quick HEAD^! +pip install asv psutil ``` -If `MKL_NUM_THREADS` is not set, `__init__.py` applies a default: **4** threads when the machine has 4 or more physical cores, or **1** (single-threaded) otherwise. This keeps results comparable across CI machines in the shared pool regardless of their total core count. Physical cores are detected via `psutil.cpu_count(logical=False)` (hyperthreads excluded per MKL recommendation). 
+Run benchmarks against the current commit: + +```bash +asv run --python=same --quick HEAD^! +``` + +Compare two commits: + +```bash +asv continuous --python=same HEAD~1 HEAD +``` -## Quick Start +View results in a browser: ```bash -cd benchmarks -asv run --python=same --quick HEAD^! # time the current commit -asv compare main HEAD # compare against main -asv publish && asv preview # view HTML report locally +asv publish +asv preview ``` + +## Threading + +Set `MKL_NUM_THREADS` to control the thread count used by MKL: + +```bash +MKL_NUM_THREADS=8 asv run --python=same --quick HEAD^! +``` + +If `MKL_NUM_THREADS` is not set, `__init__.py` applies a default: **4** threads when the machine has 4 or more physical cores, or **1** (single-threaded) otherwise. This keeps results comparable across CI machines in the shared pool regardless of their total core count. Physical cores are detected via `psutil.cpu_count(logical=False)` (hyperthreads excluded per MKL recommendation).