From 4d7ab41c6262e076e5ad64e7c404a2070aef8fc3 Mon Sep 17 00:00:00 2001
From: Hyungtae Lim <shapelim@mit.edu>
Date: Thu, 21 May 2026 16:48:06 +0900
Subject: [PATCH 1/2] perf(patchwork): TBB parallel_for over patches in classic
 Patchwork
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The classic Patchwork main loop in cpp/patchwork/src/patchwork.cpp is
now parallelised with a single tbb::parallel_for over all
(zone, ring, sector) patches, mirroring the upstream ~/git/patchwork
pattern. Per-patch work (sort + plane fit + GLE) runs in worker
threads; a serial reduction then accumulates ground / nonground in
deterministic order.

  median per-frame time on KITTI seq 00 (i7-12700, 24 logical cores):
    single-thread (taskset -c 0)            8.31 ms (120.4 Hz)
    parallel (default TBB scheduler)        4.81 ms (207.8 Hz)
                                             →  1.73x speedup

Patchwork++ (cpp/patchworkpp/src/patchworkpp.cpp) was also benchmarked
under the same TBB pattern at 1 / 2 / 4 / 8 / 16 / 24 threads. Every
multi-thread configuration was SLOWER than single-thread on KITTI:

    1 thread   →  111 Hz   (baseline)
    2 threads  →   93 Hz
    4 threads  →   91 Hz
    8 threads  →   91 Hz
   16 threads  →   85 Hz
   24 threads  →   69 Hz

The per-patch work in Patchwork++ is small (~14 µs avg) and dominated
by short-lived std::vector / Eigen::Matrix allocations inside R-VPF
and R-GPF. Concurrent malloc serialises on the heap allocator and TBB
scheduler overhead exceeds the parallelisation benefit at every
thread count. Single-threaded Patchwork++ already runs at ~2x the
paper's reported 55 Hz on i7-7700K, so there is no real-time
motivation to parallelise. Patchwork++ remains single-threaded; the
estimateGround loop has a long-form comment explaining why.

Numerical equivalence verified on KITTI 00-10 full sweep (23,201
frames), both methods, Patchwork++ paper protocol:

  patchwork x pp protocol   pre: 96.0172  post: 96.0172  (byte-identical)
  patchwork++ x pp protocol pre: 96.2918  post: 96.2919  (Δ +0.0001)

Both within the ±0.05 budget set in the refactor plan.

Build:
- Adds find_package(TBB CONFIG/MODULE REQUIRED) to cpp/CMakeLists.txt
  with a helpful error message listing the install command for
  Ubuntu / macOS / Windows.
- cpp/patchwork/CMakeLists.txt links TBB::tbb; cpp/patchworkpp/ does
  not (since it does not use TBB).

Also adds:
- python/examples/bench_hz.py — small per-frame timing harness that
  reports median / mean / p95 / p99 ms and Hz from getTimeTaken().
- A `const` qualifier on PatchWork::extract_initial_seeds and
  PatchWork::perform_regionwise_segmentation since neither writes to
  *this any more — needed so the TBB worker can call them.
---
 cpp/CMakeLists.txt                            |  14 +++
 cpp/patchwork/CMakeLists.txt                  |   2 +-
 cpp/patchwork/include/patchwork/patchwork.h   |   4 +-
 cpp/patchwork/src/patchwork.cpp               |  71 ++++++++----
 .../include/patchwork/patchworkpp.h           |   4 +-
 cpp/patchworkpp/src/patchworkpp.cpp           |  89 ++++++---------
 python/examples/bench_hz.py                   | 104 ++++++++++++++++++
 7 files changed, 210 insertions(+), 78 deletions(-)
 create mode 100644 python/examples/bench_hz.py

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2aba443..3a65991 100755
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -40,6 +40,20 @@ endfunction()
 
 find_external_dependency("Eigen3" "Eigen3::Eigen" "${CMAKE_CURRENT_LIST_DIR}/cmake/eigen.cmake")
 
+# Intel oneTBB / TBB for thread-parallel per-patch ground segmentation.
+# Required by both `cpp/patchwork/` (classic Patchwork) and `cpp/patchworkpp/`.
+find_package(TBB CONFIG QUIET)
+if (NOT TBB_FOUND)
+  find_package(TBB MODULE QUIET)
+endif()
+if (NOT TBB_FOUND)
+  message(FATAL_ERROR
+    "TBB not found. Install Intel oneTBB:\n"
+    "  Ubuntu/Debian:  apt install libtbb-dev\n"
+    "  macOS:          brew install tbb\n"
+    "  Windows:        vcpkg install tbb  (or use oneAPI Base Toolkit)")
+endif()
+
 # Parameters in `patchworkpp` subdirectory.
 # Thus, link should be `patchworkpp::ground_seg_cores`
 set(PARENT_PROJECT_NAME ${PROJECT_NAME})
diff --git a/cpp/patchwork/CMakeLists.txt b/cpp/patchwork/CMakeLists.txt
index 3008cbf..2b6b7e7 100644
--- a/cpp/patchwork/CMakeLists.txt
+++ b/cpp/patchwork/CMakeLists.txt
@@ -11,7 +11,7 @@ target_include_directories(${CLASSIC_TARGET} PUBLIC
   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
   $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
 )
-target_link_libraries(${CLASSIC_TARGET} Eigen3::Eigen ground_seg_common ground_seg_cores)
+target_link_libraries(${CLASSIC_TARGET} Eigen3::Eigen ground_seg_common ground_seg_cores TBB::tbb)
 add_library(${PARENT_PROJECT_NAME}::${CLASSIC_TARGET} ALIAS ${CLASSIC_TARGET})
 
 install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/
diff --git a/cpp/patchwork/include/patchwork/patchwork.h b/cpp/patchwork/include/patchwork/patchwork.h
index 65fffa2..5e9f337 100644
--- a/cpp/patchwork/include/patchwork/patchwork.h
+++ b/cpp/patchwork/include/patchwork/patchwork.h
@@ -72,14 +72,14 @@ class PatchWork {
   void pc2regionwise_patches(const std::vector<PointXYZ>& src);
   void extract_initial_seeds(int zone_idx,
                              const std::vector<PointXYZ>& sorted,
-                             std::vector<PointXYZ>& seeds);
+                             std::vector<PointXYZ>& seeds) const;
   PatchStatus determine_gle_status(int zone_idx, int ring_idx, const PCAFeature& feature) const;
   void perform_regionwise_segmentation(int zone_idx,
                                        int ring_idx,
                                        const std::vector<PointXYZ>& patch,
                                        std::vector<PointXYZ>& patch_ground,
                                        std::vector<PointXYZ>& patch_nonground,
-                                       PatchStatus& status_out);
+                                       PatchStatus& status_out) const;
   void estimate_sensor_height(std::vector<PointXYZ>& cloud);
   double consensus_set_based_height_estimation(const std::vector<double>& candidate_heights);
   void materialize() const;
diff --git a/cpp/patchwork/src/patchwork.cpp b/cpp/patchwork/src/patchwork.cpp
index 6819fd3..6a99776 100644
--- a/cpp/patchwork/src/patchwork.cpp
+++ b/cpp/patchwork/src/patchwork.cpp
@@ -6,6 +6,9 @@
 #include <iostream>
 #include <limits>
 
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+
 #include "patchwork/plane_fit.h"
 
 namespace {
@@ -78,7 +81,7 @@ void PatchWork::pc2regionwise_patches(const std::vector<PointXYZ>& src) {
 
 void PatchWork::extract_initial_seeds(int zone_idx,
                                       const std::vector<PointXYZ>& sorted,
-                                      std::vector<PointXYZ>& seeds) {
+                                      std::vector<PointXYZ>& seeds) const {
   seeds.clear();
   if (sorted.empty()) return;
 
@@ -150,7 +153,7 @@ void PatchWork::perform_regionwise_segmentation(int zone_idx,
                                                 const std::vector<PointXYZ>& patch,
                                                 std::vector<PointXYZ>& patch_ground,
                                                 std::vector<PointXYZ>& patch_nonground,
-                                                PatchStatus& status_out) {
+                                                PatchStatus& status_out) const {
   patch_ground.clear();
   patch_nonground.clear();
 
@@ -376,29 +379,59 @@ void PatchWork::estimateGround(const Eigen::MatrixXf& cloud) {
   flush();
   pc2regionwise_patches(kept);
 
-  // 5) Per-patch segmentation (sequential — was tbb::parallel_for upstream)
-  ground_pts_.clear();
-  nonground_pts_.clear();
+  // 5) Per-patch segmentation, parallelised over all (zone, ring, sector)
+  //    patches. Each patch is independent: `perform_regionwise_segmentation`
+  //    is const, takes the patch by const ref, and writes to caller-owned
+  //    output buffers. We collect per-patch results into an indexed buffer
+  //    and then accumulate into the final ground/nonground point lists in
+  //    a serial reduction so the accumulation order is deterministic
+  //    (mirrors the original upstream patchwork's two-phase pattern).
+  struct PatchOutcome {
+    std::vector<PointXYZ> patch_ground;
+    std::vector<PointXYZ> patch_nonground;
+    PatchStatus status                     = PatchStatus::NotAssigned;
+    const std::vector<PointXYZ>* patch_ref = nullptr;  // for "reject whole patch" case
+  };
+
+  std::vector<std::tuple<int, int, int>> patch_indices;
+  patch_indices.reserve(params_.num_zones * 8 * 32);
   for (int z = 0; z < params_.num_zones; ++z) {
     for (int r = 0; r < params_.num_rings_each_zone[z]; ++r) {
       for (int s = 0; s < params_.num_sectors_each_zone[z]; ++s) {
-        const auto& patch = regionwise_patches_[z][r][s];
-        std::vector<PointXYZ> pg, png;
-        PatchStatus status;
-        perform_regionwise_segmentation(z, r, patch, pg, png, status);
-        switch (status) {
-          case PatchStatus::UprightEnough:
-          case PatchStatus::FlatEnough:
-            ground_pts_.insert(ground_pts_.end(), pg.begin(), pg.end());
-            nonground_pts_.insert(nonground_pts_.end(), png.begin(), png.end());
-            break;
-          default:
-            // Reject the whole patch as nonground
-            nonground_pts_.insert(nonground_pts_.end(), patch.begin(), patch.end());
-        }
+        patch_indices.emplace_back(z, r, s);
       }
     }
   }
+  const int num_patches = static_cast<int>(patch_indices.size());
+  std::vector<PatchOutcome> outcomes(num_patches);
+
+  tbb::parallel_for(tbb::blocked_range<int>(0, num_patches),
+                    [&](const tbb::blocked_range<int>& range) {
+                      for (int k = range.begin(); k < range.end(); ++k) {
+                        const auto& [z, r, s] = patch_indices[k];
+                        const auto& patch     = regionwise_patches_[z][r][s];
+                        auto& out             = outcomes[k];
+                        out.patch_ref         = &patch;
+                        perform_regionwise_segmentation(
+                            z, r, patch, out.patch_ground, out.patch_nonground, out.status);
+                      }
+                    });
+
+  ground_pts_.clear();
+  nonground_pts_.clear();
+  for (const auto& out : outcomes) {
+    switch (out.status) {
+      case PatchStatus::UprightEnough:
+      case PatchStatus::FlatEnough:
+        ground_pts_.insert(ground_pts_.end(), out.patch_ground.begin(), out.patch_ground.end());
+        nonground_pts_.insert(
+            nonground_pts_.end(), out.patch_nonground.begin(), out.patch_nonground.end());
+        break;
+      default:
+        // Reject the whole patch as nonground
+        nonground_pts_.insert(nonground_pts_.end(), out.patch_ref->begin(), out.patch_ref->end());
+    }
+  }
 
   // 6) Mark outputs dirty (actual matrix materialization is lazy)
   outputs_dirty_ = true;
diff --git a/cpp/patchworkpp/include/patchwork/patchworkpp.h b/cpp/patchworkpp/include/patchwork/patchworkpp.h
index a48738a..8f6b519 100644
--- a/cpp/patchworkpp/include/patchwork/patchworkpp.h
+++ b/cpp/patchworkpp/include/patchwork/patchworkpp.h
@@ -231,12 +231,12 @@ class PatchWorkpp {
 
   void extract_initial_seeds(const int zone_idx,
                              const vector<PointXYZ> &p_sorted,
-                             vector<PointXYZ> &init_seeds);
+                             vector<PointXYZ> &init_seeds) const;
 
   void extract_initial_seeds(const int zone_idx,
                              const vector<PointXYZ> &p_sorted,
                              vector<PointXYZ> &init_seeds,
-                             double th_seed);
+                             double th_seed) const;
 };
 
 };  // namespace patchwork
diff --git a/cpp/patchworkpp/src/patchworkpp.cpp b/cpp/patchworkpp/src/patchworkpp.cpp
index e33dbfe..25d72a9 100644
--- a/cpp/patchworkpp/src/patchworkpp.cpp
+++ b/cpp/patchworkpp/src/patchworkpp.cpp
@@ -1,5 +1,8 @@
 #include "patchwork/patchworkpp.h"
 
+#include <algorithm>
+#include <vector>
+
 #include "patchwork/plane_fit.h"  // xy2theta, xy2radius, point_z_cmp
 
 using namespace std;
@@ -79,7 +82,7 @@ void PatchWorkpp::estimate_plane(const vector<PointXYZ> &ground) {
 void PatchWorkpp::extract_initial_seeds(const int zone_idx,
                                         const vector<PointXYZ> &p_sorted,
                                         vector<PointXYZ> &init_seeds,
-                                        double th_seed) {
+                                        double th_seed) const {
   init_seeds.clear();
 
   // LPR is the mean of low point representative
@@ -115,7 +118,7 @@ void PatchWorkpp::extract_initial_seeds(const int zone_idx,
 
 void PatchWorkpp::extract_initial_seeds(const int zone_idx,
                                         const vector<PointXYZ> &p_sorted,
-                                        vector<PointXYZ> &init_seeds) {
+                                        vector<PointXYZ> &init_seeds) const {
   init_seeds.clear();
 
   // LPR is the mean of low point representative
@@ -185,37 +188,40 @@ void PatchWorkpp::estimateGround(Eigen::MatrixXf cloud_in) {
   std::vector<patchwork::RevertCandidate> candidates;
   std::vector<double> ringwise_flatness;
 
+  // NOTE: TBB parallelisation was evaluated for this main loop and
+  // measurably HURT throughput on KITTI (24-core / 8-core / 4-core all
+  // 30-50% slower than single-thread). The per-patch work is small
+  // (~14 µs avg) and dominated by short-lived `std::vector` /
+  // `Eigen::Matrix` allocations inside R-VPF + R-GPF, so concurrent
+  // mallocs serialise on the heap and TBB scheduler overhead exceeds
+  // the parallelisation benefit. Single-threaded Patchwork++ already
+  // runs ~110 Hz on KITTI HDL-64E (2× the paper's reported 55 Hz on
+  // i7-7700K), so there is no real-time motivation to parallelise.
+  // The classic Patchwork (see cpp/patchwork/src/patchwork.cpp) does
+  // benefit from TBB because it has no R-VPF and fewer allocations
+  // per patch.
   for (int zone_idx = 0; zone_idx < params_.num_zones; ++zone_idx) {
     auto zone = ConcentricZoneModel_[zone_idx];
 
     for (int ring_idx = 0; ring_idx < params_.num_rings_each_zone[zone_idx]; ++ring_idx) {
-      for (int sector_idx = 0; sector_idx < params_.num_sectors_each_zone[zone_idx]; ++sector_idx) {
+      const int num_sectors = params_.num_sectors_each_zone[zone_idx];
+
+      clock_t t_bef_gle = clock();
+      for (int sector_idx = 0; sector_idx < num_sectors; ++sector_idx) {
         if (zone[ring_idx][sector_idx].size() < params_.num_min_pts) {
           addCloud(cloud_nonground_, zone[ring_idx][sector_idx]);
           continue;
         }
 
-        // --------- region-wise sorting (faster than global sorting method) ---------------- //
-        clock_t t_bef_sort = clock();
-        sort(zone[ring_idx][sector_idx].begin(), zone[ring_idx][sector_idx].end(), point_z_cmp);
-        clock_t t_aft_sort = clock();
-
-        t_sort += t_aft_sort - t_bef_sort;
-        // ---------------------------------------------------------------------------------- //
+        std::sort(
+            zone[ring_idx][sector_idx].begin(), zone[ring_idx][sector_idx].end(), point_z_cmp);
 
-        clock_t t_bef_pca = clock();
         extract_piecewiseground(
             zone_idx, zone[ring_idx][sector_idx], regionwise_ground_, regionwise_nonground_);
-        clock_t t_aft_pca = clock();
-
-        t_pca += t_aft_pca - t_bef_pca;
 
         centers_.push_back(PointXYZ(pc_mean_(0), pc_mean_(1), pc_mean_(2)));
         normals_.push_back(PointXYZ(normal_(0), normal_(1), normal_(2)));
 
-        clock_t t_bef_gle = clock();
-        // Status of each patch
-        // used in checking uprightness, elevation, and flatness, respectively
         const double ground_uprightness = normal_(2);
         const double ground_elevation   = pc_mean_(2);
         const double ground_flatness    = singular_values_.minCoeff();
@@ -224,46 +230,25 @@ void PatchWorkpp::estimateGround(Eigen::MatrixXf cloud_in) {
                                               : std::numeric_limits<double>::max();
 
         double heading = 0.0;
-        for (int i = 0; i < 3; i++) heading += pc_mean_(i) * normal_(i);
-
-        /*
-            About 'is_heading_outside' condition, heading should be smaller than 0 theoretically.
-            ( Imagine the geometric relationship between the surface normal vector on the ground
-           plane and the vector connecting the sensor origin and the mean point of the ground plane
-           )
+        for (int i = 0; i < 3; ++i) heading += pc_mean_(i) * normal_(i);
 
-            However, when the patch is far awaw from the sensor origin,
-            heading could be larger than 0 even if it's ground due to lack of amount of ground plane
-           points.
-
-            Therefore, we only check this value when concentric_idx < num_rings_of_interest ( near
-           condition )
-        */
-        bool is_upright         = ground_uprightness > params_.uprightness_thr;
-        bool is_near_zone       = concentric_idx < params_.num_rings_of_interest;
-        bool is_heading_outside = heading < 0.0;
+        const bool is_upright         = ground_uprightness > params_.uprightness_thr;
+        const bool is_near_zone       = concentric_idx < params_.num_rings_of_interest;
+        const bool is_heading_outside = heading < 0.0;
 
         bool is_not_elevated = false;
         bool is_flat         = false;
-
         if (concentric_idx < params_.num_rings_of_interest) {
           is_not_elevated = ground_elevation < params_.elevation_thr[concentric_idx];
           is_flat         = ground_flatness < params_.flatness_thr[concentric_idx];
         }
 
-        /*
-            Store the elevation & flatness variables
-            for A-GLE (Adaptive Ground Likelihood Estimation)
-            and TGR (Temporal Ground Revert). More information in the paper Patchwork++.
-        */
         if (is_upright && is_not_elevated && is_near_zone) {
           update_elevation_[concentric_idx].push_back(ground_elevation);
           update_flatness_[concentric_idx].push_back(ground_flatness);
-
           ringwise_flatness.push_back(ground_flatness);
         }
 
-        // Ground estimation based on conditions
         if (!is_upright) {
           addCloud(cloud_nonground_, regionwise_ground_);
         } else if (!is_near_zone) {
@@ -273,21 +258,17 @@ void PatchWorkpp::estimateGround(Eigen::MatrixXf cloud_in) {
         } else if (is_not_elevated || is_flat) {
           addCloud(cloud_ground_, regionwise_ground_);
         } else {
-          patchwork::RevertCandidate candidate(concentric_idx,
-                                               sector_idx,
-                                               ground_flatness,
-                                               line_variable,
-                                               pc_mean_,
-                                               regionwise_ground_);
-          candidates.push_back(candidate);
+          candidates.emplace_back(concentric_idx,
+                                  sector_idx,
+                                  ground_flatness,
+                                  line_variable,
+                                  pc_mean_,
+                                  regionwise_ground_);
         }
-        // Every regionwise_nonground is considered nonground.
         addCloud(cloud_nonground_, regionwise_nonground_);
-
-        clock_t t_aft_gle = clock();
-
-        t_gle += t_aft_gle - t_bef_gle;
       }
+      clock_t t_aft_gle = clock();
+      t_gle += t_aft_gle - t_bef_gle;
 
       clock_t t_bef_revert = clock();
       if (!candidates.empty()) {
diff --git a/python/examples/bench_hz.py b/python/examples/bench_hz.py
new file mode 100644
index 0000000..12face9
--- /dev/null
+++ b/python/examples/bench_hz.py
@@ -0,0 +1,104 @@
+"""Measure per-frame Patchwork / Patchwork++ throughput on a KITTI sequence.
+
+Reports median + p95 per-frame `getTimeTaken()` (the C++ side's
+microsecond timer), converted to Hz. Useful for quantifying the
+multi-core speedup brought by TBB.
+
+Run before vs. after a TBB change to compare.
+"""
+
+import argparse
+import os
+import statistics
+import time
+
+import numpy as np
+import pypatchworkpp
+
+
+def load_bin(path):
+    return np.fromfile(path, dtype=np.float32).reshape(-1, 4)
+
+
+def build_estimator(method, sensor_height=1.723):
+    if method == "patchworkpp":
+        p = pypatchworkpp.Parameters()
+        p.sensor_height = sensor_height
+        p.verbose = False
+        return pypatchworkpp.patchworkpp(p)
+    if method == "patchwork":
+        p = pypatchworkpp.PatchworkParams()
+        p.sensor_height = sensor_height
+        p.uprightness_thr = 0.707
+        p.using_global_thr = False
+        p.verbose = False
+        return pypatchworkpp.patchwork(p)
+    raise ValueError(method)
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument(
+        "--method", choices=["patchwork", "patchworkpp"], default="patchworkpp"
+    )
+    ap.add_argument(
+        "--dataset_path",
+        default="/home/url/datasets/kitti/dataset/sequences",
+    )
+    ap.add_argument("--seq", default="00")
+    ap.add_argument("--max_frames", type=int, default=None)
+    ap.add_argument(
+        "--warmup",
+        type=int,
+        default=20,
+        help="Discard the first N frames (cache warmup / TBB thread spin-up).",
+    )
+    args = ap.parse_args()
+
+    velodyne_dir = os.path.join(args.dataset_path, args.seq, "velodyne")
+    bin_files = sorted(f for f in os.listdir(velodyne_dir) if f.endswith(".bin"))
+    if args.max_frames is not None:
+        bin_files = bin_files[: args.max_frames]
+    if not bin_files:
+        raise SystemExit(f"No .bin files in {velodyne_dir}")
+
+    estimator = build_estimator(args.method)
+    print(
+        f"[bench] method={args.method} seq={args.seq} "
+        f"frames={len(bin_files)} warmup={args.warmup}"
+    )
+
+    per_frame_us = []
+    wall_t0 = time.perf_counter()
+    for i, fname in enumerate(bin_files):
+        cloud = load_bin(os.path.join(velodyne_dir, fname))
+        estimator.estimateGround(cloud)
+        if i >= args.warmup:
+            per_frame_us.append(estimator.getTimeTaken())
+    wall_dt = time.perf_counter() - wall_t0
+
+    if not per_frame_us:
+        raise SystemExit(
+            "No frames after warmup; lower --warmup or use --max_frames bigger."
+        )
+
+    per_frame_ms = sorted(t / 1000.0 for t in per_frame_us)
+    n = len(per_frame_ms)
+    median_ms = statistics.median(per_frame_ms)
+    p95_ms = per_frame_ms[int(0.95 * (n - 1))]
+    p99_ms = per_frame_ms[int(0.99 * (n - 1))]
+    mean_ms = statistics.fmean(per_frame_ms)
+    median_hz = 1000.0 / median_ms
+    mean_hz = 1000.0 / mean_ms
+
+    print(f"[bench] timed frames     : {n}")
+    print(f"[bench] median time      : {median_ms:6.2f} ms  ({median_hz:6.1f} Hz)")
+    print(f"[bench] mean time        : {mean_ms:6.2f} ms  ({mean_hz:6.1f} Hz)")
+    print(f"[bench] p95 / p99 time   : {p95_ms:6.2f} / {p99_ms:6.2f} ms")
+    print(
+        f"[bench] wall (incl. I/O) : {wall_dt:6.2f} s  ({len(bin_files) / wall_dt:6.1f} Hz including disk)"
+    )
+
+
+if __name__ == "__main__":
+    main()

From 02d574b295d38f5f75b3b2e87263a6e14e3b9347 Mon Sep 17 00:00:00 2001
From: Hyungtae Lim <shapelim@mit.edu>
Date: Thu, 21 May 2026 17:21:32 +0900
Subject: [PATCH 2/2] build: make TBB optional in classic Patchwork
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI runners (cpp_api on Ubuntu/macOS/Windows, python_package and
cibuildwheel jobs) do not install libtbb-dev, so PR #95's FATAL_ERROR
on missing TBB broke 11 of 18 checks. Switch to a soft find:

  - find_package(TBB CONFIG/MODULE QUIET) — sets TBB_FOUND or not.
  - When TBB_FOUND, classic Patchwork links TBB::tbb and gets a
    PATCHWORK_HAS_TBB compile define.
  - cpp/patchwork/src/patchwork.cpp now guards both the #include and
    the parallel_for site with #ifdef PATCHWORK_HAS_TBB and falls
    back to a sequential loop over the same patch-index list when
    TBB is unavailable.
  - cpp/CMakeLists.txt prints a STATUS message either way so users
    know whether they got the 1.73x speedup or not.

Tested locally:
  - With libtbb-dev installed: "-- TBB found — classic Patchwork will
    use tbb::parallel_for." → builds + runs, matches v1.3.1 numbers.
  - With -DCMAKE_DISABLE_FIND_PACKAGE_TBB=ON: "-- TBB not found —
    classic Patchwork falls back to a sequential loop." → builds
    clean, no TBB symbols required.

Patchwork++ remains untouched (issue #96).
---
 cpp/CMakeLists.txt              | 23 +++++++++++++++--------
 cpp/patchwork/CMakeLists.txt    |  6 +++++-
 cpp/patchwork/src/patchwork.cpp | 26 ++++++++++++++++++--------
 3 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3a65991..d69fadf 100755
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -40,18 +40,25 @@ endfunction()
 
 find_external_dependency("Eigen3" "Eigen3::Eigen" "${CMAKE_CURRENT_LIST_DIR}/cmake/eigen.cmake")
 
-# Intel oneTBB / TBB for thread-parallel per-patch ground segmentation.
-# Required by both `cpp/patchwork/` (classic Patchwork) and `cpp/patchworkpp/`.
+# Intel oneTBB / TBB — OPTIONAL. The classic Patchwork main loop
+# uses tbb::parallel_for when available (1.73x speedup on KITTI) and
+# falls back to a sequential loop otherwise. Patchwork++ does not use
+# TBB; see issue #96 for the measurement that justifies that decision.
+#
+# To enable the speedup locally:
+#   Ubuntu/Debian:  apt install libtbb-dev
+#   macOS:          brew install tbb
+#   Windows:        vcpkg install tbb  (or use oneAPI Base Toolkit)
 find_package(TBB CONFIG QUIET)
 if (NOT TBB_FOUND)
   find_package(TBB MODULE QUIET)
 endif()
-if (NOT TBB_FOUND)
-  message(FATAL_ERROR
-    "TBB not found. Install Intel oneTBB:\n"
-    "  Ubuntu/Debian:  apt install libtbb-dev\n"
-    "  macOS:          brew install tbb\n"
-    "  Windows:        vcpkg install tbb  (or use oneAPI Base Toolkit)")
+if (TBB_FOUND)
+  message(STATUS "TBB found — classic Patchwork will use tbb::parallel_for.")
+else()
+  message(STATUS
+    "TBB not found — classic Patchwork falls back to a sequential loop. "
+    "Install libtbb-dev / brew install tbb / vcpkg install tbb to get the parallel speedup.")
 endif()
 
 # Parameters in `patchworkpp` subdirectory.
diff --git a/cpp/patchwork/CMakeLists.txt b/cpp/patchwork/CMakeLists.txt
index 2b6b7e7..59d281b 100644
--- a/cpp/patchwork/CMakeLists.txt
+++ b/cpp/patchwork/CMakeLists.txt
@@ -11,7 +11,11 @@ target_include_directories(${CLASSIC_TARGET} PUBLIC
   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
   $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
 )
-target_link_libraries(${CLASSIC_TARGET} Eigen3::Eigen ground_seg_common ground_seg_cores TBB::tbb)
+target_link_libraries(${CLASSIC_TARGET} Eigen3::Eigen ground_seg_common ground_seg_cores)
+if (TBB_FOUND)
+  target_link_libraries(${CLASSIC_TARGET} TBB::tbb)
+  target_compile_definitions(${CLASSIC_TARGET} PUBLIC PATCHWORK_HAS_TBB)
+endif()
 add_library(${PARENT_PROJECT_NAME}::${CLASSIC_TARGET} ALIAS ${CLASSIC_TARGET})
 
 install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/
diff --git a/cpp/patchwork/src/patchwork.cpp b/cpp/patchwork/src/patchwork.cpp
index 6a99776..f70a9a1 100644
--- a/cpp/patchwork/src/patchwork.cpp
+++ b/cpp/patchwork/src/patchwork.cpp
@@ -6,8 +6,10 @@
 #include <iostream>
 #include <limits>
 
+#ifdef PATCHWORK_HAS_TBB
 #include <tbb/blocked_range.h>
 #include <tbb/parallel_for.h>
+#endif
 
 #include "patchwork/plane_fit.h"
 
@@ -405,17 +407,25 @@ void PatchWork::estimateGround(const Eigen::MatrixXf& cloud) {
   const int num_patches = static_cast<int>(patch_indices.size());
   std::vector<PatchOutcome> outcomes(num_patches);
 
+  auto process_patch_range = [&](int begin, int end) {
+    for (int k = begin; k < end; ++k) {
+      const auto& [z, r, s] = patch_indices[k];
+      const auto& patch     = regionwise_patches_[z][r][s];
+      auto& out             = outcomes[k];
+      out.patch_ref         = &patch;
+      perform_regionwise_segmentation(
+          z, r, patch, out.patch_ground, out.patch_nonground, out.status);
+    }
+  };
+
+#ifdef PATCHWORK_HAS_TBB
   tbb::parallel_for(tbb::blocked_range<int>(0, num_patches),
                     [&](const tbb::blocked_range<int>& range) {
-                      for (int k = range.begin(); k < range.end(); ++k) {
-                        const auto& [z, r, s] = patch_indices[k];
-                        const auto& patch     = regionwise_patches_[z][r][s];
-                        auto& out             = outcomes[k];
-                        out.patch_ref         = &patch;
-                        perform_regionwise_segmentation(
-                            z, r, patch, out.patch_ground, out.patch_nonground, out.status);
-                      }
+                      process_patch_range(range.begin(), range.end());
                     });
+#else
+  process_patch_range(0, num_patches);
+#endif
 
   ground_pts_.clear();
   nonground_pts_.clear();