diff --git a/CHANGELOG.md b/CHANGELOG.md index b29625d..9709820 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- **Benchmarking module** (`comprexx.benchmark`): `cx.benchmark()` measures real + inference latency with configurable warmup/iters, reporting mean, median, std, + p50/p90/p99, min/max, and throughput. `cx.compare_benchmarks()` returns a + before/after comparison with speedup and latency/throughput deltas. Quantized + models are automatically run on CPU. New `comprexx bench` CLI command. - GitHub Actions CI workflow running `pytest` on Python 3.10, 3.11, 3.12 plus a `ruff check` lint job. - `CHANGELOG.md` with history for v0.1.0 and v0.2.0. diff --git a/README.md b/README.md index 02df0c4..5bac748 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,29 @@ pipeline = cx.Pipeline([ The `perturbation` can be `"prune"` (zero the smallest weights) or `"noise"` (add Gaussian noise scaled by weight std). Each layer is snapshotted and restored in place, so no deep copies of the model are made. +### Benchmark inference latency + +Param counts and FLOPs tell you how small a model got. They don't tell you how fast it runs. `cx.benchmark` measures real latency: + +```python +result = cx.benchmark(model, input_shape=(1, 3, 224, 224), warmup=10, iters=50) +print(result.summary()) +``` + +You get mean, median, std, p50/p90/p99, min/max, and throughput in inferences per second. To see what compression actually bought you, run `compare_benchmarks` on the baseline and compressed models: + +```python +cmp = cx.compare_benchmarks( + baseline_model, compressed_model, + input_shape=(1, 3, 224, 224), + iters=50, +) +print(cmp.summary()) +print(f"{cmp.speedup:.2f}x faster") +``` + +Warmup iterations are excluded from measurements so caches and JIT settle first. Quantized models are automatically run on CPU regardless of the `device` argument. 
+ ### Export to ONNX ```python @@ -177,6 +200,9 @@ comprexx analyze model.pt --input-shape "1,3,224,224" --json comprexx compress model.pt --recipe recipe.yaml --input-shape "1,3,224,224" comprexx compress model.pt --recipe recipe.yaml --input-shape "1,3,224,224" --dry-run +# Benchmark +comprexx bench model.pt --input-shape "1,3,224,224" --iters 50 + # Export comprexx export model.pt --format onnx --input-shape "1,3,224,224" ``` diff --git a/comprexx/__init__.py b/comprexx/__init__.py index fa15f19..aaf8b68 100644 --- a/comprexx/__init__.py +++ b/comprexx/__init__.py @@ -9,6 +9,12 @@ SensitivityReport, analyze_sensitivity, ) +from comprexx.benchmark.runner import ( + BenchmarkComparison, + BenchmarkResult, + benchmark, +) +from comprexx.benchmark.runner import compare as compare_benchmarks from comprexx.core.exceptions import ( AccuracyGuardTriggered, CalibrationError, @@ -27,6 +33,8 @@ __all__ = [ "AccuracyGuard", "AccuracyGuardTriggered", + "BenchmarkComparison", + "BenchmarkResult", "CalibrationError", "CompressionReport", "ComprexxError", @@ -43,6 +51,8 @@ "UnsupportedLayerError", "analyze", "analyze_sensitivity", + "benchmark", + "compare_benchmarks", "load_recipe", "stages", ] diff --git a/comprexx/benchmark/__init__.py b/comprexx/benchmark/__init__.py index e69de29..b736ea3 100644 --- a/comprexx/benchmark/__init__.py +++ b/comprexx/benchmark/__init__.py @@ -0,0 +1,10 @@ +"""Inference benchmarking.""" + +from comprexx.benchmark.runner import ( + BenchmarkComparison, + BenchmarkResult, + benchmark, + compare, +) + +__all__ = ["BenchmarkComparison", "BenchmarkResult", "benchmark", "compare"] diff --git a/comprexx/benchmark/runner.py b/comprexx/benchmark/runner.py new file mode 100644 index 0000000..f658a9f --- /dev/null +++ b/comprexx/benchmark/runner.py @@ -0,0 +1,250 @@ +"""Latency/throughput benchmarking for PyTorch models. + +Measures real inference performance so compression reports can show actual +speedups, not just parameter-count reductions. 
+""" + +from __future__ import annotations + +import json +import statistics +import time +from dataclasses import asdict, dataclass, field +from typing import Callable, Optional + +import torch +import torch.nn as nn + + +@dataclass +class BenchmarkResult: + """Latency statistics from a benchmark run.""" + + device: str + dtype: str + batch_size: int + warmup: int + iters: int + mean_ms: float + median_ms: float + std_ms: float + min_ms: float + max_ms: float + p50_ms: float + p90_ms: float + p99_ms: float + throughput_ips: float + samples_ms: list[float] = field(default_factory=list) + + def to_dict(self) -> dict: + return asdict(self) + + def to_json(self) -> str: + return json.dumps(self.to_dict(), indent=2) + + def summary(self) -> str: + return ( + f"Benchmark ({self.device}, batch={self.batch_size}, iters={self.iters})\n" + f" Mean: {self.mean_ms:.3f} ms\n" + f" Median: {self.median_ms:.3f} ms\n" + f" Std: {self.std_ms:.3f} ms\n" + f" p50/p90/p99:{self.p50_ms:.3f} / {self.p90_ms:.3f} / {self.p99_ms:.3f} ms\n" + f" Min/Max: {self.min_ms:.3f} / {self.max_ms:.3f} ms\n" + f" Throughput: {self.throughput_ips:.1f} inferences/sec" + ) + + +@dataclass +class BenchmarkComparison: + """Before/after benchmark comparison.""" + + baseline: BenchmarkResult + compressed: BenchmarkResult + + @property + def speedup(self) -> float: + if self.compressed.mean_ms == 0: + return float("inf") + return self.baseline.mean_ms / self.compressed.mean_ms + + @property + def latency_reduction_pct(self) -> float: + if self.baseline.mean_ms == 0: + return 0.0 + return (1 - self.compressed.mean_ms / self.baseline.mean_ms) * 100 + + @property + def throughput_gain_pct(self) -> float: + if self.baseline.throughput_ips == 0: + return 0.0 + return (self.compressed.throughput_ips / self.baseline.throughput_ips - 1) * 100 + + def to_dict(self) -> dict: + return { + "baseline": self.baseline.to_dict(), + "compressed": self.compressed.to_dict(), + "speedup": self.speedup, + 
"latency_reduction_pct": self.latency_reduction_pct, + "throughput_gain_pct": self.throughput_gain_pct, + } + + def to_json(self) -> str: + return json.dumps(self.to_dict(), indent=2) + + def summary(self) -> str: + return ( + f"Benchmark Comparison ({self.baseline.device})\n" + f" Baseline: {self.baseline.mean_ms:.3f} ms " + f"({self.baseline.throughput_ips:.1f} ips)\n" + f" Compressed: {self.compressed.mean_ms:.3f} ms " + f"({self.compressed.throughput_ips:.1f} ips)\n" + f" Speedup: {self.speedup:.2f}x " + f"({self.latency_reduction_pct:+.1f}% latency, " + f"{self.throughput_gain_pct:+.1f}% throughput)" + ) + + +def _make_input( + input_shape: tuple[int, ...] | list[tuple[int, ...]], + device: torch.device, + dtype: torch.dtype, +) -> torch.Tensor | tuple[torch.Tensor, ...]: + if isinstance(input_shape, list): + return tuple(torch.randn(*s, device=device, dtype=dtype) for s in input_shape) + return torch.randn(*input_shape, device=device, dtype=dtype) + + +def _sync(device: torch.device) -> None: + if device.type == "cuda": + torch.cuda.synchronize() + + +def _percentile(values: list[float], pct: float) -> float: + if not values: + return 0.0 + s = sorted(values) + k = (len(s) - 1) * (pct / 100.0) + lo = int(k) + hi = min(lo + 1, len(s) - 1) + frac = k - lo + return s[lo] * (1 - frac) + s[hi] * frac + + +def benchmark( + model: nn.Module, + input_shape: tuple[int, ...] | list[tuple[int, ...]], + device: str = "cpu", + dtype: torch.dtype = torch.float32, + warmup: int = 10, + iters: int = 50, + input_fn: Optional[Callable[[], torch.Tensor | tuple[torch.Tensor, ...]]] = None, +) -> BenchmarkResult: + """Benchmark a model's inference latency. + + Args: + model: Model to benchmark (set to eval mode internally). + input_shape: Single tensor shape, or list of shapes for multi-input models. + device: "cpu" or "cuda". + dtype: Input tensor dtype. Quantized models ignore this. + warmup: Warmup iterations (not measured) to stabilize caches/JIT. 
+ iters: Measured iterations. + input_fn: Optional callable returning a fresh input per call. Overrides + `input_shape` when provided. + """ + if iters <= 0: + raise ValueError("iters must be positive") + + dev = torch.device(device) + model = model.eval() + + # Quantized models must run on CPU + is_quantized = any( + "quantized" in type(m).__module__ for m in model.modules() + ) + if is_quantized and dev.type != "cpu": + dev = torch.device("cpu") + + try: + model = model.to(dev) + except (RuntimeError, NotImplementedError): + # Some quantized modules refuse .to() transfers; fall through + pass + + def _gen_input(): + if input_fn is not None: + x = input_fn() + else: + x = _make_input(input_shape, dev, dtype) + return x + + sample = _gen_input() + batch_size = ( + sample[0].shape[0] if isinstance(sample, tuple) else sample.shape[0] + ) + + with torch.inference_mode(): + # Warmup + for _ in range(warmup): + x = _gen_input() + if isinstance(x, tuple): + model(*x) + else: + model(x) + _sync(dev) + + # Measure + samples_ms: list[float] = [] + for _ in range(iters): + x = _gen_input() + _sync(dev) + t0 = time.perf_counter() + if isinstance(x, tuple): + model(*x) + else: + model(x) + _sync(dev) + samples_ms.append((time.perf_counter() - t0) * 1000.0) + + mean_ms = statistics.fmean(samples_ms) + median_ms = statistics.median(samples_ms) + std_ms = statistics.pstdev(samples_ms) if len(samples_ms) > 1 else 0.0 + throughput = (batch_size * 1000.0 / mean_ms) if mean_ms > 0 else 0.0 + + return BenchmarkResult( + device=str(dev), + dtype=str(dtype).replace("torch.", ""), + batch_size=batch_size, + warmup=warmup, + iters=iters, + mean_ms=mean_ms, + median_ms=median_ms, + std_ms=std_ms, + min_ms=min(samples_ms), + max_ms=max(samples_ms), + p50_ms=_percentile(samples_ms, 50), + p90_ms=_percentile(samples_ms, 90), + p99_ms=_percentile(samples_ms, 99), + throughput_ips=throughput, + samples_ms=samples_ms, + ) + + +def compare( + baseline_model: nn.Module, + compressed_model: 
nn.Module, + input_shape: tuple[int, ...] | list[tuple[int, ...]], + device: str = "cpu", + dtype: torch.dtype = torch.float32, + warmup: int = 10, + iters: int = 50, +) -> BenchmarkComparison: + """Benchmark baseline and compressed models and return a comparison.""" + base = benchmark( + baseline_model, input_shape, device=device, dtype=dtype, + warmup=warmup, iters=iters, + ) + comp = benchmark( + compressed_model, input_shape, device=device, dtype=dtype, + warmup=warmup, iters=iters, + ) + return BenchmarkComparison(baseline=base, compressed=comp) diff --git a/comprexx/cli/main.py b/comprexx/cli/main.py index 335ecb4..055af9c 100644 --- a/comprexx/cli/main.py +++ b/comprexx/cli/main.py @@ -177,5 +177,30 @@ def export_cmd( raise typer.Exit(1) +@app.command() +def bench( + model_source: str = typer.Argument(..., help="Model path or Python module path"), + input_shape: str = typer.Option(..., "--input-shape", help="Input shape, e.g. '1,3,224,224'"), + device: str = typer.Option("cpu", help="Device (cpu or cuda)"), + warmup: int = typer.Option(10, "--warmup", help="Warmup iterations"), + iters: int = typer.Option(50, "--iters", help="Measured iterations"), + json_output: bool = typer.Option(False, "--json", help="Output as JSON"), +): + """Benchmark a model's inference latency.""" + from comprexx.benchmark.runner import benchmark + + shape = _parse_input_shape(input_shape) + model = _load_model(model_source) + + with console.status("Benchmarking..."): + result = benchmark(model, input_shape=shape, device=device, + warmup=warmup, iters=iters) + + if json_output: + console.print(result.to_json()) + else: + console.print(Panel(result.summary(), title="Comprexx Benchmark")) + + if __name__ == "__main__": app() diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py new file mode 100644 index 0000000..8055eba --- /dev/null +++ b/tests/unit/test_benchmark.py @@ -0,0 +1,93 @@ +"""Tests for the benchmark module.""" + +import pytest +import torch + +import 
comprexx as cx +from comprexx.benchmark.runner import BenchmarkComparison, BenchmarkResult, _percentile +from tests.fixtures.models import tiny_cnn, tiny_transformer + + +class TestBenchmark: + def test_basic_run(self): + model = tiny_cnn() + result = cx.benchmark(model, input_shape=(1, 3, 32, 32), warmup=2, iters=5) + assert isinstance(result, BenchmarkResult) + assert result.iters == 5 + assert result.warmup == 2 + assert result.batch_size == 1 + assert result.mean_ms > 0 + assert result.throughput_ips > 0 + assert len(result.samples_ms) == 5 + + def test_batch_size_from_shape(self): + model = tiny_cnn() + result = cx.benchmark(model, input_shape=(4, 3, 32, 32), warmup=1, iters=3) + assert result.batch_size == 4 + + def test_percentiles_ordered(self): + model = tiny_cnn() + result = cx.benchmark(model, input_shape=(1, 3, 32, 32), warmup=1, iters=10) + assert result.p50_ms <= result.p90_ms <= result.p99_ms + assert result.min_ms <= result.mean_ms <= result.max_ms + + def test_invalid_iters(self): + model = tiny_cnn() + with pytest.raises(ValueError): + cx.benchmark(model, input_shape=(1, 3, 32, 32), iters=0) + + def test_transformer_multi_input_shape(self): + model = tiny_transformer() + # (batch, seq, features) + result = cx.benchmark(model, input_shape=(2, 8, 64), warmup=1, iters=3) + assert result.batch_size == 2 + assert result.mean_ms > 0 + + def test_custom_input_fn(self): + model = tiny_cnn() + + def fn(): + return torch.randn(1, 3, 32, 32) + + result = cx.benchmark(model, input_shape=(1, 3, 32, 32), input_fn=fn, + warmup=1, iters=3) + assert result.mean_ms > 0 + + def test_summary_and_serialization(self): + model = tiny_cnn() + result = cx.benchmark(model, input_shape=(1, 3, 32, 32), warmup=1, iters=3) + s = result.summary() + assert "Mean" in s and "ms" in s + d = result.to_dict() + assert d["iters"] == 3 + j = result.to_json() + assert "mean_ms" in j + + +class TestCompare: + def test_compare_same_model(self): + m = tiny_cnn() + cmp = 
cx.compare_benchmarks(m, m, input_shape=(1, 3, 32, 32), + warmup=1, iters=5) + assert isinstance(cmp, BenchmarkComparison) + # Same model: speedup should be in a sane range + assert 0.2 < cmp.speedup < 5.0 + + def test_compare_summary(self): + m = tiny_cnn() + cmp = cx.compare_benchmarks(m, m, input_shape=(1, 3, 32, 32), + warmup=1, iters=3) + s = cmp.summary() + assert "Baseline" in s and "Compressed" in s + assert "speedup" in cmp.to_dict() + + +class TestPercentile: + def test_basic(self): + vals = [1.0, 2.0, 3.0, 4.0, 5.0] + assert _percentile(vals, 50) == 3.0 + assert _percentile(vals, 0) == 1.0 + assert _percentile(vals, 100) == 5.0 + + def test_empty(self): + assert _percentile([], 50) == 0.0