diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2aba443..d69fadf 100755 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -40,6 +40,27 @@ endfunction() find_external_dependency("Eigen3" "Eigen3::Eigen" "${CMAKE_CURRENT_LIST_DIR}/cmake/eigen.cmake") +# Intel oneTBB / TBB — OPTIONAL. The classic Patchwork main loop +# uses tbb::parallel_for when available (1.73x speedup on KITTI) and +# falls back to a sequential loop otherwise. Patchwork++ does not use +# TBB; see issue #96 for the measurement that justifies that decision. +# +# To enable the speedup locally: +# Ubuntu/Debian: apt install libtbb-dev +# macOS: brew install tbb +# Windows: vcpkg install tbb (or use oneAPI Base Toolkit) +find_package(TBB CONFIG QUIET) +if (NOT TBB_FOUND) + find_package(TBB MODULE QUIET) +endif() +if (TBB_FOUND) + message(STATUS "TBB found — classic Patchwork will use tbb::parallel_for.") +else() + message(STATUS + "TBB not found — classic Patchwork falls back to a sequential loop. " + "Install libtbb-dev / brew install tbb / vcpkg install tbb to get the parallel speedup.") +endif() + # Parameters in `patchworkpp` subdirectory. # Thus, link should be `patchworkpp::ground_seg_cores` set(PARENT_PROJECT_NAME ${PROJECT_NAME}) diff --git a/cpp/patchwork/CMakeLists.txt b/cpp/patchwork/CMakeLists.txt index 3008cbf..59d281b 100644 --- a/cpp/patchwork/CMakeLists.txt +++ b/cpp/patchwork/CMakeLists.txt @@ -12,6 +12,10 @@ target_include_directories(${CLASSIC_TARGET} PUBLIC $ ) target_link_libraries(${CLASSIC_TARGET} Eigen3::Eigen ground_seg_common ground_seg_cores) +if (TBB_FOUND) + target_link_libraries(${CLASSIC_TARGET} TBB::tbb) + target_compile_definitions(${CLASSIC_TARGET} PUBLIC PATCHWORK_HAS_TBB) +endif() add_library(${PARENT_PROJECT_NAME}::${CLASSIC_TARGET} ALIAS ${CLASSIC_TARGET}) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ diff --git a/cpp/patchwork/include/patchwork/patchwork.h b/cpp/patchwork/include/patchwork/patchwork.h index 65fffa2..5e9f337 100644 --- a/cpp/patchwork/include/patchwork/patchwork.h +++ b/cpp/patchwork/include/patchwork/patchwork.h @@ -72,14 +72,14 @@ class PatchWork { void pc2regionwise_patches(const std::vector& src); void extract_initial_seeds(int zone_idx, const std::vector& sorted, - std::vector& seeds); + std::vector& seeds) const; PatchStatus determine_gle_status(int zone_idx, int ring_idx, const PCAFeature& feature) const; void perform_regionwise_segmentation(int zone_idx, int ring_idx, const std::vector& patch, std::vector& patch_ground, std::vector& patch_nonground, - PatchStatus& status_out); + PatchStatus& status_out) const; void estimate_sensor_height(std::vector& cloud); double consensus_set_based_height_estimation(const std::vector& candidate_heights); void materialize() const; diff --git a/cpp/patchwork/src/patchwork.cpp b/cpp/patchwork/src/patchwork.cpp index 6819fd3..f70a9a1 100644 --- a/cpp/patchwork/src/patchwork.cpp +++ b/cpp/patchwork/src/patchwork.cpp @@ -6,6 +6,11 @@ #include #include +#ifdef PATCHWORK_HAS_TBB +#include +#include +#endif + #include "patchwork/plane_fit.h" namespace { @@ -78,7 +83,7 @@ void PatchWork::pc2regionwise_patches(const std::vector& src) { void PatchWork::extract_initial_seeds(int zone_idx, const std::vector& sorted, - std::vector& seeds) { + std::vector& seeds) const { seeds.clear(); if (sorted.empty()) return; @@ -150,7 +155,7 @@ void PatchWork::perform_regionwise_segmentation(int zone_idx, const std::vector& patch, std::vector& patch_ground, std::vector& patch_nonground, - PatchStatus& status_out) { + PatchStatus& status_out) const { patch_ground.clear(); patch_nonground.clear(); @@ -376,29 +381,67 @@ void PatchWork::estimateGround(const Eigen::MatrixXf& cloud) { flush(); pc2regionwise_patches(kept); - // 5) Per-patch segmentation (sequential — was tbb::parallel_for upstream) - ground_pts_.clear(); - nonground_pts_.clear(); + // 5) Per-patch segmentation, parallelised over all (zone, ring, sector) + // patches. Each patch is independent: `perform_regionwise_segmentation` + // is const, takes the patch by const ref, and writes to caller-owned + // output buffers. We collect per-patch results into an indexed buffer + // and then accumulate into the final ground/nonground point lists in + // a serial reduction so the accumulation order is deterministic + // (mirrors the original upstream patchwork's two-phase pattern). + struct PatchOutcome { + std::vector patch_ground; + std::vector patch_nonground; + PatchStatus status = PatchStatus::NotAssigned; + const std::vector* patch_ref = nullptr; // for "reject whole patch" case + }; + + std::vector> patch_indices; + patch_indices.reserve(params_.num_zones * 8 * 32); for (int z = 0; z < params_.num_zones; ++z) { for (int r = 0; r < params_.num_rings_each_zone[z]; ++r) { for (int s = 0; s < params_.num_sectors_each_zone[z]; ++s) { - const auto& patch = regionwise_patches_[z][r][s]; - std::vector pg, png; - PatchStatus status; - perform_regionwise_segmentation(z, r, patch, pg, png, status); - switch (status) { - case PatchStatus::UprightEnough: - case PatchStatus::FlatEnough: - ground_pts_.insert(ground_pts_.end(), pg.begin(), pg.end()); - nonground_pts_.insert(nonground_pts_.end(), png.begin(), png.end()); - break; - default: - // Reject the whole patch as nonground - nonground_pts_.insert(nonground_pts_.end(), patch.begin(), patch.end()); - } + patch_indices.emplace_back(z, r, s); } } } + const int num_patches = static_cast(patch_indices.size()); + std::vector outcomes(num_patches); + + auto process_patch_range = [&](int begin, int end) { + for (int k = begin; k < end; ++k) { + const auto& [z, r, s] = patch_indices[k]; + const auto& patch = regionwise_patches_[z][r][s]; + auto& out = outcomes[k]; + out.patch_ref = &patch; + perform_regionwise_segmentation( + z, r, patch, out.patch_ground, out.patch_nonground, out.status); + } + }; + +#ifdef PATCHWORK_HAS_TBB + tbb::parallel_for(tbb::blocked_range(0, num_patches), + [&](const tbb::blocked_range& range) { + process_patch_range(range.begin(), range.end()); + }); +#else + process_patch_range(0, num_patches); +#endif + + ground_pts_.clear(); + nonground_pts_.clear(); + for (const auto& out : outcomes) { + switch (out.status) { + case PatchStatus::UprightEnough: + case PatchStatus::FlatEnough: + ground_pts_.insert(ground_pts_.end(), out.patch_ground.begin(), out.patch_ground.end()); + nonground_pts_.insert( + nonground_pts_.end(), out.patch_nonground.begin(), out.patch_nonground.end()); + break; + default: + // Reject the whole patch as nonground + nonground_pts_.insert(nonground_pts_.end(), out.patch_ref->begin(), out.patch_ref->end()); + } + } // 6) Mark outputs dirty (actual matrix materialization is lazy) outputs_dirty_ = true; diff --git a/cpp/patchworkpp/include/patchwork/patchworkpp.h b/cpp/patchworkpp/include/patchwork/patchworkpp.h index a48738a..8f6b519 100644 --- a/cpp/patchworkpp/include/patchwork/patchworkpp.h +++ b/cpp/patchworkpp/include/patchwork/patchworkpp.h @@ -231,12 +231,12 @@ class PatchWorkpp { void extract_initial_seeds(const int zone_idx, const vector &p_sorted, - vector &init_seeds); + vector &init_seeds) const; void extract_initial_seeds(const int zone_idx, const vector &p_sorted, vector &init_seeds, - double th_seed); + double th_seed) const; }; }; // namespace patchwork diff --git a/cpp/patchworkpp/src/patchworkpp.cpp b/cpp/patchworkpp/src/patchworkpp.cpp index e33dbfe..25d72a9 100644 --- a/cpp/patchworkpp/src/patchworkpp.cpp +++ b/cpp/patchworkpp/src/patchworkpp.cpp @@ -1,5 +1,8 @@ #include "patchwork/patchworkpp.h" +#include +#include + #include "patchwork/plane_fit.h" // xy2theta, xy2radius, point_z_cmp using namespace std; @@ -79,7 +82,7 @@ void PatchWorkpp::estimate_plane(const vector &ground) { void PatchWorkpp::extract_initial_seeds(const int zone_idx, const vector &p_sorted, vector &init_seeds, - double th_seed) { + double th_seed) const { init_seeds.clear(); // LPR is the mean of low point representative @@ -115,7 +118,7 @@ void PatchWorkpp::extract_initial_seeds(const int zone_idx, void PatchWorkpp::extract_initial_seeds(const int zone_idx, const vector &p_sorted, - vector &init_seeds) { + vector &init_seeds) const { init_seeds.clear(); // LPR is the mean of low point representative @@ -185,37 +188,40 @@ void PatchWorkpp::estimateGround(Eigen::MatrixXf cloud_in) { std::vector candidates; std::vector ringwise_flatness; + // NOTE: TBB parallelisation was evaluated for this main loop and + // measurably HURT throughput on KITTI (24-core / 8-core / 4-core all + // 30-50% slower than single-thread). The per-patch work is small + // (~14 µs avg) and dominated by short-lived `std::vector` / + // `Eigen::Matrix` allocations inside R-VPF + R-GPF, so concurrent + // mallocs serialise on the heap and TBB scheduler overhead exceeds + // the parallelisation benefit. Single-threaded Patchwork++ already + // runs ~110 Hz on KITTI HDL-64E (2× the paper's reported 55 Hz on + // i7-7700K), so there is no real-time motivation to parallelise. + // The classic Patchwork (see cpp/patchwork/src/patchwork.cpp) does + // benefit from TBB because it has no R-VPF and fewer allocations + // per patch. for (int zone_idx = 0; zone_idx < params_.num_zones; ++zone_idx) { auto zone = ConcentricZoneModel_[zone_idx]; for (int ring_idx = 0; ring_idx < params_.num_rings_each_zone[zone_idx]; ++ring_idx) { - for (int sector_idx = 0; sector_idx < params_.num_sectors_each_zone[zone_idx]; ++sector_idx) { + const int num_sectors = params_.num_sectors_each_zone[zone_idx]; + + clock_t t_bef_gle = clock(); + for (int sector_idx = 0; sector_idx < num_sectors; ++sector_idx) { if (zone[ring_idx][sector_idx].size() < params_.num_min_pts) { addCloud(cloud_nonground_, zone[ring_idx][sector_idx]); continue; } - // --------- region-wise sorting (faster than global sorting method) ---------------- // - clock_t t_bef_sort = clock(); - sort(zone[ring_idx][sector_idx].begin(), zone[ring_idx][sector_idx].end(), point_z_cmp); - clock_t t_aft_sort = clock(); - - t_sort += t_aft_sort - t_bef_sort; - // ---------------------------------------------------------------------------------- // + std::sort( + zone[ring_idx][sector_idx].begin(), zone[ring_idx][sector_idx].end(), point_z_cmp); - clock_t t_bef_pca = clock(); extract_piecewiseground( zone_idx, zone[ring_idx][sector_idx], regionwise_ground_, regionwise_nonground_); - clock_t t_aft_pca = clock(); - - t_pca += t_aft_pca - t_bef_pca; centers_.push_back(PointXYZ(pc_mean_(0), pc_mean_(1), pc_mean_(2))); normals_.push_back(PointXYZ(normal_(0), normal_(1), normal_(2))); - clock_t t_bef_gle = clock(); - // Status of each patch - // used in checking uprightness, elevation, and flatness, respectively const double ground_uprightness = normal_(2); const double ground_elevation = pc_mean_(2); const double ground_flatness = singular_values_.minCoeff(); @@ -224,46 +230,25 @@ void PatchWorkpp::estimateGround(Eigen::MatrixXf cloud_in) { : std::numeric_limits::max(); double heading = 0.0; - for (int i = 0; i < 3; i++) heading += pc_mean_(i) * normal_(i); - - /* - About 'is_heading_outside' condition, heading should be smaller than 0 theoretically. - ( Imagine the geometric relationship between the surface normal vector on the ground - plane and the vector connecting the sensor origin and the mean point of the ground plane - ) + for (int i = 0; i < 3; ++i) heading += pc_mean_(i) * normal_(i); - However, when the patch is far awaw from the sensor origin, - heading could be larger than 0 even if it's ground due to lack of amount of ground plane - points. - - Therefore, we only check this value when concentric_idx < num_rings_of_interest ( near - condition ) - */ - bool is_upright = ground_uprightness > params_.uprightness_thr; - bool is_near_zone = concentric_idx < params_.num_rings_of_interest; - bool is_heading_outside = heading < 0.0; + const bool is_upright = ground_uprightness > params_.uprightness_thr; + const bool is_near_zone = concentric_idx < params_.num_rings_of_interest; + const bool is_heading_outside = heading < 0.0; bool is_not_elevated = false; bool is_flat = false; - if (concentric_idx < params_.num_rings_of_interest) { is_not_elevated = ground_elevation < params_.elevation_thr[concentric_idx]; is_flat = ground_flatness < params_.flatness_thr[concentric_idx]; } - /* - Store the elevation & flatness variables - for A-GLE (Adaptive Ground Likelihood Estimation) - and TGR (Temporal Ground Revert). More information in the paper Patchwork++. - */ if (is_upright && is_not_elevated && is_near_zone) { update_elevation_[concentric_idx].push_back(ground_elevation); update_flatness_[concentric_idx].push_back(ground_flatness); - ringwise_flatness.push_back(ground_flatness); } - // Ground estimation based on conditions if (!is_upright) { addCloud(cloud_nonground_, regionwise_ground_); } else if (!is_near_zone) { @@ -273,21 +258,17 @@ void PatchWorkpp::estimateGround(Eigen::MatrixXf cloud_in) { } else if (is_not_elevated || is_flat) { addCloud(cloud_ground_, regionwise_ground_); } else { - patchwork::RevertCandidate candidate(concentric_idx, - sector_idx, - ground_flatness, - line_variable, - pc_mean_, - regionwise_ground_); - candidates.push_back(candidate); + candidates.emplace_back(concentric_idx, + sector_idx, + ground_flatness, + line_variable, + pc_mean_, + regionwise_ground_); } - // Every regionwise_nonground is considered nonground. addCloud(cloud_nonground_, regionwise_nonground_); - - clock_t t_aft_gle = clock(); - - t_gle += t_aft_gle - t_bef_gle; } + clock_t t_aft_gle = clock(); + t_gle += t_aft_gle - t_bef_gle; clock_t t_bef_revert = clock(); if (!candidates.empty()) { diff --git a/python/examples/bench_hz.py b/python/examples/bench_hz.py new file mode 100644 index 0000000..12face9 --- /dev/null +++ b/python/examples/bench_hz.py @@ -0,0 +1,104 @@ +"""Measure per-frame Patchwork / Patchwork++ throughput on a KITTI sequence. + +Reports median + p95 per-frame `getTimeTaken()` (the C++ side's +microsecond timer), converted to Hz. Useful for quantifying the +multi-core speedup brought by TBB. + +Run before vs. after a TBB change to compare. +""" + +import argparse +import os +import statistics +import time + +import numpy as np +import pypatchworkpp + + +def load_bin(path): + return np.fromfile(path, dtype=np.float32).reshape(-1, 4) + + +def build_estimator(method, sensor_height=1.723): + if method == "patchworkpp": + p = pypatchworkpp.Parameters() + p.sensor_height = sensor_height + p.verbose = False + return pypatchworkpp.patchworkpp(p) + if method == "patchwork": + p = pypatchworkpp.PatchworkParams() + p.sensor_height = sensor_height + p.uprightness_thr = 0.707 + p.using_global_thr = False + p.verbose = False + return pypatchworkpp.patchwork(p) + raise ValueError(method) + + +def main(): + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument( + "--method", choices=["patchwork", "patchworkpp"], default="patchworkpp" + ) + ap.add_argument( + "--dataset_path", + default="/home/url/datasets/kitti/dataset/sequences", + ) + ap.add_argument("--seq", default="00") + ap.add_argument("--max_frames", type=int, default=None) + ap.add_argument( + "--warmup", + type=int, + default=20, + help="Discard the first N frames (cache warmup / TBB thread spin-up).", + ) + args = ap.parse_args() + + velodyne_dir = os.path.join(args.dataset_path, args.seq, "velodyne") + bin_files = sorted(f for f in os.listdir(velodyne_dir) if f.endswith(".bin")) + if args.max_frames is not None: + bin_files = bin_files[: args.max_frames] + if not bin_files: + raise SystemExit(f"No .bin files in {velodyne_dir}") + + estimator = build_estimator(args.method) + print( + f"[bench] method={args.method} seq={args.seq} " + f"frames={len(bin_files)} warmup={args.warmup}" + ) + + per_frame_us = [] + wall_t0 = time.perf_counter() + for i, fname in enumerate(bin_files): + cloud = load_bin(os.path.join(velodyne_dir, fname)) + estimator.estimateGround(cloud) + if i >= args.warmup: + per_frame_us.append(estimator.getTimeTaken()) + wall_dt = time.perf_counter() - wall_t0 + + if not per_frame_us: + raise SystemExit( + "No frames after warmup; lower --warmup or use --max_frames bigger." + ) + + per_frame_ms = sorted(t / 1000.0 for t in per_frame_us) + n = len(per_frame_ms) + median_ms = statistics.median(per_frame_ms) + p95_ms = per_frame_ms[int(0.95 * (n - 1))] + p99_ms = per_frame_ms[int(0.99 * (n - 1))] + mean_ms = statistics.fmean(per_frame_ms) + median_hz = 1000.0 / median_ms + mean_hz = 1000.0 / mean_ms + + print(f"[bench] timed frames : {n}") + print(f"[bench] median time : {median_ms:6.2f} ms ({median_hz:6.1f} Hz)") + print(f"[bench] mean time : {mean_ms:6.2f} ms ({mean_hz:6.1f} Hz)") + print(f"[bench] p95 / p99 time : {p95_ms:6.2f} / {p99_ms:6.2f} ms") + print( + f"[bench] wall (incl. I/O) : {wall_dt:6.2f} s ({len(bin_files) / wall_dt:6.1f} Hz including disk)" + ) + + +if __name__ == "__main__": + main()