Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 49 additions & 30 deletions cpp/src/barrier/barrier.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2707,6 +2707,38 @@ f_t barrier_solver_t<i_t, f_t>::compute_nonnegative_step_length(iteration_data_t
stream_view_);
}

/**
* @brief Copy the current device search direction into Mehrotra affine buffers.
*
* Device-to-device snapshot of (dw, dx, dy, dv, dz) into d_*_aff_. Called from
* gpu_compute_search_direction when snapshot_affine_direction is true, immediately
* after the direction is fully formed. The corrector step reuses d_dx_/d_dy_/etc.
* and must not refresh d_*_aff_.
*
* @param data Per-iteration device state (d_dw_, d_dx_, ..., d_*_aff_).
* @param stream CUDA stream for resize and copy operations.
*/
template <typename i_t, typename f_t>
void copy_affine_direction_to_device_buffers(iteration_data_t<i_t, f_t>& data,
rmm::cuda_stream_view stream)
{
raft::common::nvtx::range fun_scope("Barrier: copy_affine_direction_to_device_buffers");

auto copy_device_vec = [&](rmm::device_uvector<f_t>& dst,
const rmm::device_uvector<f_t>& src) {
cuopt_assert(dst.empty() || dst.size() == src.size(),
"Buffer size mismatch in affine snapshot");
dst.resize(src.size(), stream);
if (src.size() > 0) { raft::copy(dst.data(), src.data(), src.size(), stream); }
};

copy_device_vec(data.d_dw_aff_, data.d_dw_);
copy_device_vec(data.d_dx_aff_, data.d_dx_);
copy_device_vec(data.d_dy_aff_, data.d_dy_);
copy_device_vec(data.d_dv_aff_, data.d_dv_);
copy_device_vec(data.d_dz_aff_, data.d_dz_);
}

template <typename i_t, typename f_t>
i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_t, f_t>& data,
pinned_dense_vector_t<i_t, f_t>& dw,
Expand All @@ -2716,7 +2748,8 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
pinned_dense_vector_t<i_t, f_t>& dz,
f_t& dual_perturb,
f_t& primal_perturb,
f_t& max_residual)
f_t& max_residual,
bool snapshot_affine_direction)
{
raft::common::nvtx::range fun_scope("Barrier: compute_search_direction");

Expand Down Expand Up @@ -3017,9 +3050,6 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
raft::copy(data.d_dx_.data(), data.d_augmented_soln_.data(), lp.num_cols, stream_view_);
raft::copy(
data.d_dy_.data(), data.d_augmented_soln_.data() + lp.num_cols, lp.num_rows, stream_view_);
raft::copy(dx.data(), data.d_dx_.data(), lp.num_cols, stream_view_);
raft::copy(dy.data(), data.d_dy_.data(), lp.num_rows, stream_view_);
RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));

// TMP should only be init once
data.cusparse_dy_ = data.cusparse_view_.create_vector(data.d_dy_);
Expand All @@ -3029,9 +3059,6 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_

// Solve A D^{-1} A^T dy = h
i_t solve_status = data.gpu_solve_adat(data.d_h_, data.d_dy_);
// TODO Chris, we need to write to cpu because dx is used outside
// Can't we also GPUify what's usinng this dx?
raft::copy(dy.data(), data.d_dy_.data(), dy.size(), stream_view_);
if (solve_status == CONCURRENT_HALT_RETURN) { return CONCURRENT_HALT_RETURN; }
if (solve_status < 0) {
settings.log.printf("Linear solve failed\n");
Expand Down Expand Up @@ -3091,7 +3118,6 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
},
stream_view_.value());
RAFT_CHECK_CUDA(stream_view_);
raft::copy(dx.data(), data.d_dx_.data(), data.d_dx_.size(), stream_view_);

data.cusparse_view_.transpose_spmv(-1.0, data.cusparse_dy_, 1.0, data.cusparse_dx_residual_);
cub::DeviceTransform::Transform(
Expand Down Expand Up @@ -3277,7 +3303,6 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
raft::device_span<f_t>(data.d_dz_.data(), linear_dz_size),
raft::device_span<const i_t>(data.d_is_direct_free_linear_.data(), linear_dz_size),
stream_view_);
raft::copy(dz.data(), data.d_dz_.data(), data.d_dz_.size(), stream_view_);
}

if (debug) {
Expand Down Expand Up @@ -3332,7 +3357,6 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
},
stream_view_.value());
RAFT_CHECK_CUDA(stream_view_);
raft::copy(dv.data(), data.d_dv_.data(), data.d_dv_.size(), stream_view_);
}

if (debug) {
Expand Down Expand Up @@ -3414,7 +3438,13 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
[] HD(f_t dw, f_t gathered_dx) { return dw - gathered_dx; },
stream_view_.value());
RAFT_CHECK_CUDA(stream_view_);
raft::copy(dw.data(), data.d_dw_.data(), data.d_dw_.size(), stream_view_);

// Affine Mehrotra step: save (dw, dx, dy, dv, dz) into d_*_aff_ now that the direction is
// complete. The corrector is a separate gpu_compute_search_direction call that reuses d_dx_
// / d_dy_ for the centering direction; d_*_aff_ must be fixed before that second call.
if (snapshot_affine_direction) {
copy_affine_direction_to_device_buffers(data, stream_view_);
}

if (debug) {
// dw_residual <- dw + E'*dx - bound_rhs
Expand Down Expand Up @@ -3662,16 +3692,6 @@ void barrier_solver_t<i_t, f_t>::compute_target_mu(
const bool has_soc = data.has_cones();

f_t complementarity_aff_sum = 0.0;
// TMP no copy and data should always be on the GPU
data.d_dw_aff_.resize(data.dw_aff.size(), stream_view_);
data.d_dx_aff_.resize(data.dx_aff.size(), stream_view_);
data.d_dv_aff_.resize(data.dv_aff.size(), stream_view_);
data.d_dz_aff_.resize(data.dz_aff.size(), stream_view_);

raft::copy(data.d_dw_aff_.data(), data.dw_aff.data(), data.dw_aff.size(), stream_view_);
raft::copy(data.d_dx_aff_.data(), data.dx_aff.data(), data.dx_aff.size(), stream_view_);
raft::copy(data.d_dv_aff_.data(), data.dv_aff.data(), data.dv_aff.size(), stream_view_);
raft::copy(data.d_dz_aff_.data(), data.dz_aff.data(), data.dz_aff.size(), stream_view_);

f_t step_primal_aff = std::min(compute_nonnegative_step_length(data, data.d_w_, data.d_dw_aff_),
compute_nonnegative_step_length(data, data.d_x_, data.d_dx_aff_));
Expand Down Expand Up @@ -3799,16 +3819,18 @@ template <typename i_t, typename f_t>
void barrier_solver_t<i_t, f_t>::compute_final_direction(iteration_data_t<i_t, f_t>& data)
{
raft::common::nvtx::range fun_scope("Barrier: compute_final_direction");
data.d_dy_aff_.resize(data.dy_aff.size(), stream_view_);
raft::copy(data.d_dy_aff_.data(), data.dy_aff.data(), data.dy_aff.size(), stream_view_);

#ifdef FINITE_CHECK
for (i_t i = 0; i < (int)data.y.size(); i++) {
cuopt_assert(std::isfinite(data.y[i]), "data.d_y_[i] is not finite");
}

for (i_t i = 0; i < (int)data.dy_aff.size(); i++) {
cuopt_assert(std::isfinite(data.dy_aff[i]), "data.dy_aff_[i] is not finite");
if (data.d_dy_aff_.size() > 0) {
const auto dy_aff_host = host_copy(data.d_dy_aff_, stream_view_);
stream_view_.synchronize();
for (i_t i = 0; i < static_cast<i_t>(dy_aff_host.size()); ++i) {
cuopt_assert(std::isfinite(dy_aff_host[i]), "data.d_dy_aff_[i] is not finite");
}
}
#endif

Expand Down Expand Up @@ -4437,13 +4459,12 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time, lp_solution_t<i_t,
data.dz_aff,
dual_perturb,
primal_perturb,
max_affine_residual);
max_affine_residual,
true);
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
settings.log.printf("Barrier solver halted\n");
return lp_status_t::CONCURRENT_LIMIT;
}
// Sync to make sure all the async copies to host done inside are finished
RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));

if (status < 0) {
return check_for_suboptimal_solution(data,
Expand Down Expand Up @@ -4489,8 +4510,6 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time, lp_solution_t<i_t,
settings.log.printf("Barrier solver halted\n");
return lp_status_t::CONCURRENT_LIMIT;
}
// Sync to make sure all the async copies to host done inside are finished
RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
if (status < 0) {
return check_for_suboptimal_solution(data,
start_time,
Expand Down
12 changes: 11 additions & 1 deletion cpp/src/barrier/barrier.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,15 @@ class barrier_solver_t {
f_t compute_nonnegative_step_length(iteration_data_t<i_t, f_t>& data,
const rmm::device_uvector<f_t>& x,
const rmm::device_uvector<f_t>& dx);
/**
* @brief Solve for a Mehrotra search direction on the GPU.
*
* When snapshot_affine_direction is true (affine predictor step), copies the completed
* direction into data.d_*_aff_ for compute_target_mu and compute_final_direction.
*
* @param snapshot_affine_direction If true, snapshot (dw, dx, dy, dv, dz) into d_*_aff_.
* @return 0 on success, a negative value on failure, or CONCURRENT_HALT_RETURN if halted.
*/
i_t gpu_compute_search_direction(iteration_data_t<i_t, f_t>& data,
pinned_dense_vector_t<i_t, f_t>& dw,
pinned_dense_vector_t<i_t, f_t>& dx,
Expand All @@ -108,7 +117,8 @@ class barrier_solver_t {
pinned_dense_vector_t<i_t, f_t>& dz,
f_t& dual_perturb,
f_t& primal_perturb,
f_t& max_residual);
f_t& max_residual,
bool snapshot_affine_direction = false);

private:
lp_status_t check_for_suboptimal_solution(iteration_data_t<i_t, f_t>& data,
Expand Down