Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]

### Added
- **Benchmarking module** (`comprexx.benchmark`): `cx.benchmark()` measures real
inference latency with configurable warmup/iters, reporting mean, median, std,
p50/p90/p99, min/max, and throughput. `cx.compare_benchmarks()` returns a
before/after comparison with speedup and latency/throughput deltas. Quantized
models are automatically run on CPU. New `comprexx bench` CLI command.
- GitHub Actions CI workflow running `pytest` on Python 3.10, 3.11, 3.12 plus a
`ruff check` lint job.
- `CHANGELOG.md` with history for v0.1.0 and v0.2.0.
Expand Down
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,29 @@ pipeline = cx.Pipeline([

The `perturbation` can be `"prune"` (zero the smallest weights) or `"noise"` (add Gaussian noise scaled by weight std). Each layer is snapshotted and restored in place, so no deep copies of the model are made.

### Benchmark inference latency

Param counts and FLOPs tell you how small a model got. They don't tell you how fast it runs. `cx.benchmark` measures real latency:

```python
result = cx.benchmark(model, input_shape=(1, 3, 224, 224), warmup=10, iters=50)
print(result.summary())
```

You get mean, median, std, p50/p90/p99, min/max, and throughput in inferences per second. To see what compression actually bought you, run `compare_benchmarks` on the baseline and compressed models:

```python
cmp = cx.compare_benchmarks(
baseline_model, result.model,
input_shape=(1, 3, 224, 224),
iters=50,
)
print(cmp.summary())
print(f"{cmp.speedup:.2f}x faster")
```

Warmup iterations are excluded from measurements so caches and JIT settle first. Quantized models are automatically run on CPU regardless of the `device` argument.

### Export to ONNX

```python
Expand Down Expand Up @@ -177,6 +200,9 @@ comprexx analyze model.pt --input-shape "1,3,224,224" --json
comprexx compress model.pt --recipe recipe.yaml --input-shape "1,3,224,224"
comprexx compress model.pt --recipe recipe.yaml --input-shape "1,3,224,224" --dry-run

# Benchmark
comprexx bench model.pt --input-shape "1,3,224,224" --iters 50

# Export
comprexx export model.pt --format onnx --input-shape "1,3,224,224"
```
Expand Down
10 changes: 10 additions & 0 deletions comprexx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@
SensitivityReport,
analyze_sensitivity,
)
from comprexx.benchmark.runner import (
BenchmarkComparison,
BenchmarkResult,
benchmark,
)
from comprexx.benchmark.runner import compare as compare_benchmarks
from comprexx.core.exceptions import (
AccuracyGuardTriggered,
CalibrationError,
Expand All @@ -27,6 +33,8 @@
__all__ = [
"AccuracyGuard",
"AccuracyGuardTriggered",
"BenchmarkComparison",
"BenchmarkResult",
"CalibrationError",
"CompressionReport",
"ComprexxError",
Expand All @@ -43,6 +51,8 @@
"UnsupportedLayerError",
"analyze",
"analyze_sensitivity",
"benchmark",
"compare_benchmarks",
"load_recipe",
"stages",
]
10 changes: 10 additions & 0 deletions comprexx/benchmark/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Inference benchmarking."""

from comprexx.benchmark.runner import (
BenchmarkComparison,
BenchmarkResult,
benchmark,
compare,
)

__all__ = ["BenchmarkComparison", "BenchmarkResult", "benchmark", "compare"]
250 changes: 250 additions & 0 deletions comprexx/benchmark/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
"""Latency/throughput benchmarking for PyTorch models.

Measures real inference performance so compression reports can show actual
speedups, not just parameter-count reductions.
"""

from __future__ import annotations

import json
import statistics
import time
from dataclasses import asdict, dataclass, field
from typing import Callable, Optional

import torch
import torch.nn as nn


@dataclass
class BenchmarkResult:
    """Latency statistics collected from a single benchmark run.

    All latency fields are milliseconds; `throughput_ips` is inferences
    (samples) per second. `samples_ms` keeps the raw per-iteration timings
    so callers can recompute their own statistics.
    """

    device: str
    dtype: str
    batch_size: int
    warmup: int
    iters: int
    mean_ms: float
    median_ms: float
    std_ms: float
    min_ms: float
    max_ms: float
    p50_ms: float
    p90_ms: float
    p99_ms: float
    throughput_ips: float
    samples_ms: list[float] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return every field (including raw samples) as a plain dict."""
        return asdict(self)

    def to_json(self) -> str:
        """Serialize the result as pretty-printed JSON."""
        return json.dumps(self.to_dict(), indent=2)

    def summary(self) -> str:
        """Return a human-readable multi-line summary of the run."""
        header = f"Benchmark ({self.device}, batch={self.batch_size}, iters={self.iters})"
        rows = [
            f"  Mean: {self.mean_ms:.3f} ms",
            f"  Median: {self.median_ms:.3f} ms",
            f"  Std: {self.std_ms:.3f} ms",
            f"  p50/p90/p99:{self.p50_ms:.3f} / {self.p90_ms:.3f} / {self.p99_ms:.3f} ms",
            f"  Min/Max: {self.min_ms:.3f} / {self.max_ms:.3f} ms",
            f"  Throughput: {self.throughput_ips:.1f} inferences/sec",
        ]
        return "\n".join([header, *rows])


@dataclass
class BenchmarkComparison:
    """Before/after benchmark comparison.

    Wraps a baseline and a compressed `BenchmarkResult` and derives the
    speedup, latency reduction, and throughput gain between them.
    """

    baseline: BenchmarkResult
    compressed: BenchmarkResult

    @property
    def speedup(self) -> float:
        """Baseline mean latency divided by compressed mean latency."""
        compressed_mean = self.compressed.mean_ms
        if compressed_mean == 0:
            # Degenerate timing: treat a zero-latency compressed model as
            # infinitely faster rather than dividing by zero.
            return float("inf")
        return self.baseline.mean_ms / compressed_mean

    @property
    def latency_reduction_pct(self) -> float:
        """Percent drop in mean latency (positive means faster)."""
        baseline_mean = self.baseline.mean_ms
        if baseline_mean == 0:
            return 0.0
        return (1 - self.compressed.mean_ms / baseline_mean) * 100

    @property
    def throughput_gain_pct(self) -> float:
        """Percent gain in throughput (positive means more inferences/sec)."""
        baseline_ips = self.baseline.throughput_ips
        if baseline_ips == 0:
            return 0.0
        return (self.compressed.throughput_ips / baseline_ips - 1) * 100

    def to_dict(self) -> dict:
        """Return both results plus the derived metrics as a plain dict."""
        return {
            "baseline": self.baseline.to_dict(),
            "compressed": self.compressed.to_dict(),
            "speedup": self.speedup,
            "latency_reduction_pct": self.latency_reduction_pct,
            "throughput_gain_pct": self.throughput_gain_pct,
        }

    def to_json(self) -> str:
        """Serialize the comparison as pretty-printed JSON."""
        return json.dumps(self.to_dict(), indent=2)

    def summary(self) -> str:
        """Return a human-readable multi-line comparison summary."""
        return (
            f"Benchmark Comparison ({self.baseline.device})\n"
            f"  Baseline: {self.baseline.mean_ms:.3f} ms "
            f"({self.baseline.throughput_ips:.1f} ips)\n"
            f"  Compressed: {self.compressed.mean_ms:.3f} ms "
            f"({self.compressed.throughput_ips:.1f} ips)\n"
            f"  Speedup: {self.speedup:.2f}x "
            f"({self.latency_reduction_pct:+.1f}% latency, "
            f"{self.throughput_gain_pct:+.1f}% throughput)"
        )


def _make_input(
input_shape: tuple[int, ...] | list[tuple[int, ...]],
device: torch.device,
dtype: torch.dtype,
) -> torch.Tensor | tuple[torch.Tensor, ...]:
if isinstance(input_shape, list):
return tuple(torch.randn(*s, device=device, dtype=dtype) for s in input_shape)
return torch.randn(*input_shape, device=device, dtype=dtype)


def _sync(device: torch.device) -> None:
if device.type == "cuda":
torch.cuda.synchronize()


def _percentile(values: list[float], pct: float) -> float:
if not values:
return 0.0
s = sorted(values)
k = (len(s) - 1) * (pct / 100.0)
lo = int(k)
hi = min(lo + 1, len(s) - 1)
frac = k - lo
return s[lo] * (1 - frac) + s[hi] * frac


def benchmark(
    model: nn.Module,
    input_shape: tuple[int, ...] | list[tuple[int, ...]],
    device: str = "cpu",
    dtype: torch.dtype = torch.float32,
    warmup: int = 10,
    iters: int = 50,
    input_fn: Optional[Callable[[], torch.Tensor | tuple[torch.Tensor, ...]]] = None,
) -> BenchmarkResult:
    """Benchmark a model's inference latency.

    Args:
        model: Model to benchmark (set to eval mode internally).
        input_shape: Single tensor shape, or list of shapes for multi-input models.
        device: "cpu" or "cuda". Quantized models are forced onto CPU.
        dtype: Input tensor dtype. Quantized models ignore this.
        warmup: Warmup iterations (not measured) to stabilize caches/JIT.
        iters: Measured iterations.
        input_fn: Optional callable returning a fresh input per call. Overrides
            `input_shape` when provided.

    Returns:
        A BenchmarkResult with per-iteration samples and summary statistics.

    Raises:
        ValueError: If `iters` is not positive or `warmup` is negative.
    """
    if iters <= 0:
        raise ValueError("iters must be positive")
    # Validate warmup too; previously a negative value silently meant "no
    # warmup", inconsistent with the iters check above.
    if warmup < 0:
        raise ValueError("warmup must be non-negative")

    dev = torch.device(device)
    model = model.eval()

    # Quantized models must run on CPU; detect them by module provenance
    # and override the requested device.
    is_quantized = any(
        "quantized" in type(m).__module__ for m in model.modules()
    )
    if is_quantized and dev.type != "cpu":
        dev = torch.device("cpu")

    try:
        model = model.to(dev)
    except (RuntimeError, NotImplementedError):
        # Some quantized modules refuse .to() transfers; fall through and
        # run the model where it already lives.
        pass

    def _gen_input():
        # input_fn wins over input_shape when both are supplied.
        if input_fn is not None:
            return input_fn()
        return _make_input(input_shape, dev, dtype)

    def _forward(x):
        # Multi-input models receive a tuple of tensors; splat it.
        if isinstance(x, tuple):
            model(*x)
        else:
            model(x)

    sample = _gen_input()
    first = sample[0] if isinstance(sample, tuple) else sample
    # Batch size drives the throughput calculation. A 0-dim (scalar) input
    # has no batch axis, so count it as a single inference instead of
    # raising IndexError on shape[0].
    batch_size = first.shape[0] if first.dim() > 0 else 1

    with torch.inference_mode():
        # Warmup (not measured). Reuse the probe input for the first pass so
        # its allocation isn't wasted, then generate fresh inputs exactly
        # like the measured loop does.
        for i in range(warmup):
            _forward(sample if i == 0 else _gen_input())
        _sync(dev)

        # Measured iterations. Sync before starting and before stopping the
        # clock so asynchronous CUDA launches don't skew the timings.
        samples_ms: list[float] = []
        for _ in range(iters):
            x = _gen_input()
            _sync(dev)
            t0 = time.perf_counter()
            _forward(x)
            _sync(dev)
            samples_ms.append((time.perf_counter() - t0) * 1000.0)

    mean_ms = statistics.fmean(samples_ms)
    median_ms = statistics.median(samples_ms)
    std_ms = statistics.pstdev(samples_ms) if len(samples_ms) > 1 else 0.0
    # Inferences per second = samples per forward / seconds per forward.
    throughput = (batch_size * 1000.0 / mean_ms) if mean_ms > 0 else 0.0

    return BenchmarkResult(
        device=str(dev),
        dtype=str(dtype).replace("torch.", ""),
        batch_size=batch_size,
        warmup=warmup,
        iters=iters,
        mean_ms=mean_ms,
        median_ms=median_ms,
        std_ms=std_ms,
        min_ms=min(samples_ms),
        max_ms=max(samples_ms),
        p50_ms=_percentile(samples_ms, 50),
        p90_ms=_percentile(samples_ms, 90),
        p99_ms=_percentile(samples_ms, 99),
        throughput_ips=throughput,
        samples_ms=samples_ms,
    )


def compare(
    baseline_model: nn.Module,
    compressed_model: nn.Module,
    input_shape: tuple[int, ...] | list[tuple[int, ...]],
    device: str = "cpu",
    dtype: torch.dtype = torch.float32,
    warmup: int = 10,
    iters: int = 50,
) -> BenchmarkComparison:
    """Benchmark baseline and compressed models and return a comparison."""
    # Run both models under identical settings so the numbers are comparable.
    settings = {"device": device, "dtype": dtype, "warmup": warmup, "iters": iters}
    base_result = benchmark(baseline_model, input_shape, **settings)
    comp_result = benchmark(compressed_model, input_shape, **settings)
    return BenchmarkComparison(baseline=base_result, compressed=comp_result)
25 changes: 25 additions & 0 deletions comprexx/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,5 +177,30 @@ def export_cmd(
raise typer.Exit(1)


@app.command()
def bench(
    model_source: str = typer.Argument(..., help="Model path or Python module path"),
    input_shape: str = typer.Option(..., "--input-shape", help="Input shape, e.g. '1,3,224,224'"),
    device: str = typer.Option("cpu", help="Device (cpu or cuda)"),
    warmup: int = typer.Option(10, "--warmup", help="Warmup iterations"),
    iters: int = typer.Option(50, "--iters", help="Measured iterations"),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON"),
) -> None:
    """Benchmark a model's inference latency.

    CLI front-end for `comprexx.benchmark.runner.benchmark`: loads the model,
    parses the comma-separated shape, runs the benchmark, and prints either a
    JSON payload (--json) or a rich panel with the summary.
    """
    # Imported lazily so other subcommands don't pay the import cost.
    from comprexx.benchmark.runner import benchmark

    # e.g. "1,3,224,224" -> a shape tuple; helper defined elsewhere in this module.
    shape = _parse_input_shape(input_shape)
    model = _load_model(model_source)

    # Spinner while warmup + measured iterations run (can take a while).
    with console.status("Benchmarking..."):
        result = benchmark(model, input_shape=shape, device=device,
                           warmup=warmup, iters=iters)

    # --json is machine-readable output for scripting; default is human-readable.
    if json_output:
        console.print(result.to_json())
    else:
        console.print(Panel(result.summary(), title="Comprexx Benchmark"))


if __name__ == "__main__":
app()
Loading
Loading