NVIDIA · rg20 · Mar 2, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH
@@ -1 +1 @@
-release/26.04
+main
@@ -264,6 +264,8 @@ class pdlp_solver_settings_t {
   bool inside_mip{false};
   // For concurrent termination
   std::atomic<int>* concurrent_halt{nullptr};
+  /** If true, solver does not set concurrent_halt; caller sets it after crossover. */
+  bool halt_set_by_caller{false};
   static constexpr f_t minimal_absolute_tolerance = 1.0e-12;
   pdlp_hyper_params::pdlp_hyper_params_t hyper_params;
   // Holds the information of new variable lower and upper bounds for each climber in the format:

@@ -235,6 +235,7 @@ class optimization_problem_solution_t : public base_solution_t {
    * @return rmm::device_uvector<f_t> The device memory container for the reduced cost.
    */
   rmm::device_uvector<f_t>& get_reduced_cost();
+  const rmm::device_uvector<f_t>& get_reduced_cost() const;
 
   /**
    * @brief Get termination reason

@@ -17,6 +17,7 @@
 
 #include <cuts/cuts.hpp>
 
+#include <dual_simplex/crossover.hpp>
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/phase2.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
@@ -33,15 +34,21 @@
 #include <omp.h>
 
 #include <atomic>
+#include <mutex>
+#include <thread>
+
 #include <functional>
 #include <future>
 #include <memory>
 #include <vector>
 
 namespace cuopt::linear_programming::detail {
+template <typename i_t, typename f_t>
+class problem_t;
+
 template <typename i_t, typename f_t>
 struct clique_table_t;
-}
+}  // namespace cuopt::linear_programming::detail
 
 namespace cuopt::linear_programming::dual_simplex {
 
@@ -74,34 +81,20 @@ struct deterministic_diving_policy_t;
 template <typename i_t, typename f_t>
 class branch_and_bound_t {
  public:
-  branch_and_bound_t(const user_problem_t<i_t, f_t>& user_problem,
-                     const simplex_solver_settings_t<i_t, f_t>& solver_settings,
-                     f_t start_time,
-                     std::shared_ptr<detail::clique_table_t<i_t, f_t>> clique_table = nullptr);
+  /** Host \p user_problem must be fully populated by the caller. When \p mip_problem_ptr is
+   *  non-null (GPU MIP / concurrent root), the caller must sync from device first, e.g.
+   *  recompute_objective_integrality(), set objective_is_integral, get_host_user_problem(). */
+  branch_and_bound_t(
+    const user_problem_t<i_t, f_t>& user_problem,
+    const simplex_solver_settings_t<i_t, f_t>& solver_settings,
+    f_t start_time,
+    cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr = nullptr,
+    i_t pdlp_root_num_gpus                                                  = 1,
+    std::shared_ptr<detail::clique_table_t<i_t, f_t>> clique_table          = nullptr);
 
   // Set an initial guess based on the user_problem. This should be called before solve.
   void set_initial_guess(const std::vector<f_t>& user_guess) { guess_ = user_guess; }
 
-  // Set the root solution found by PDLP
-  void set_root_relaxation_solution(const std::vector<f_t>& primal,
-                                    const std::vector<f_t>& dual,
-                                    const std::vector<f_t>& reduced_costs,
-                                    f_t objective,
-                                    f_t user_objective,
-                                    i_t iterations)
-  {
-    if (!is_root_solution_set) {
-      root_crossover_soln_.x              = primal;
-      root_crossover_soln_.y              = dual;
-      root_crossover_soln_.z              = reduced_costs;
-      root_objective_                     = objective;
-      root_crossover_soln_.objective      = objective;
-      root_crossover_soln_.user_objective = user_objective;
-      root_crossover_soln_.iterations     = iterations;
-      root_crossover_solution_set_.store(true, std::memory_order_release);
-    }
-  }
-
   // Set a solution based on the user problem during the course of the solve
   void set_new_solution(const std::vector<f_t>& solution);
 
@@ -122,6 +115,7 @@ class branch_and_bound_t {
                        std::vector<f_t>& repaired_solution) const;
 
   f_t get_lower_bound();
+  i_t get_num_cols() const { return original_problem_.num_cols; }
   bool enable_concurrent_lp_root_solve() const { return enable_concurrent_lp_root_solve_; }
   std::atomic<int>* get_root_concurrent_halt() { return &root_concurrent_halt_; }
   void set_root_concurrent_halt(int value) { root_concurrent_halt_ = value; }
@@ -133,6 +127,21 @@ class branch_and_bound_t {
                                     std::vector<i_t>& nonbasic_list,
                                     std::vector<f_t>& edge_norms);
 
+  /** Starts PDLP+crossover and Barrier+crossover in two threads. winner is 0=none, 1=dual, 2=PDLP,
+   * 3=Barrier; first OPTIMAL sets it. first_solver_* for diversity manager callback. */
+  void run_concurrent_pdlp_and_barrier_with_crossover(
+    const simplex_solver_settings_t<i_t, f_t>& lp_settings,
+    crossover_status_t& crossover_status_out,
+    lp_solution_t<i_t, f_t>& winner_crossover_soln_out,
+    std::vector<variable_status_t>& winner_crossover_vstatus_out,
+    f_t& winner_root_objective_out,
+    std::string& winner_solver_name_out,
+    std::atomic<int>& winner,
+    std::mutex* first_solver_mutex,
+    bool* first_solver_callback_done,
+    std::thread& pdlp_thread_out,
+    std::thread& barrier_thread_out);
+
   i_t find_reduced_cost_fixings(f_t upper_bound,
                                 std::vector<f_t>& lower_bounds,
                                 std::vector<f_t>& upper_bounds);
@@ -146,7 +155,7 @@ class branch_and_bound_t {
   producer_sync_t& get_producer_sync() { return producer_sync_; }
 
  private:
-  const user_problem_t<i_t, f_t>& original_problem_;
+  user_problem_t<i_t, f_t> original_problem_;
   const simplex_solver_settings_t<i_t, f_t> settings_;
   std::shared_ptr<detail::clique_table_t<i_t, f_t>> clique_table_;
   std::future<std::shared_ptr<detail::clique_table_t<i_t, f_t>>> clique_table_future_;
@@ -194,17 +203,16 @@ class branch_and_bound_t {
 
   // Variables for the root node in the search tree.
   std::vector<variable_status_t> root_vstatus_;
-  std::vector<variable_status_t> crossover_vstatus_;
   f_t root_objective_;
   lp_solution_t<i_t, f_t> root_relax_soln_;
-  lp_solution_t<i_t, f_t> root_crossover_soln_;
   std::vector<f_t> edge_norms_;
   std::atomic<bool> root_crossover_solution_set_{false};
   omp_atomic_t<f_t> root_lp_current_lower_bound_;
   omp_atomic_t<bool> solving_root_relaxation_{false};
   bool enable_concurrent_lp_root_solve_{false};
   std::atomic<int> root_concurrent_halt_{0};
-  bool is_root_solution_set{false};
+  cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr_{nullptr};
+  i_t pdlp_root_num_gpus_{1};
 
   // Pseudocosts
   pseudo_costs_t<i_t, f_t> pc_;

@@ -331,6 +331,7 @@ void compute_dual_solution_from_basis(const lp_problem_t<i_t, f_t>& lp,
 
 template <typename i_t, typename f_t>
 i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
+              const csr_matrix_t<i_t, f_t>& Arow,
               const simplex_solver_settings_t<i_t, f_t>& settings,
               f_t start_time,
               lp_solution_t<i_t, f_t>& solution,
@@ -387,6 +388,9 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
   std::vector<f_t>& y       = solution.y;
   const std::vector<f_t>& x = solution.x;
   i_t num_pushes            = 0;
+  std::vector<f_t> delta_zN(n - m);
+  std::vector<f_t> delta_expanded;  // workspace for sparse path (delta_y is sparse enough)
+  std::vector<f_t> delta_y_dense;   // workspace for dense path (delta_y is not sparse enough)
   while (superbasic_list.size() > 0) {
     const i_t s                   = superbasic_list.back();
     const i_t basic_leaving_index = superbasic_list_index.back();
@@ -401,11 +405,9 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
     es_sparse.x[0] = -delta_zs;
 
     // B^T delta_y = -delta_zs*es
-    std::vector<f_t> delta_y(m);
     sparse_vector_t<i_t, f_t> delta_y_sparse(m, 1);
     sparse_vector_t<i_t, f_t> UTsol_sparse(m, 1);
     ft.b_transpose_solve(es_sparse, delta_y_sparse, UTsol_sparse);
-    delta_y_sparse.scatter(delta_y);
 
     // We solved B^T delta_y = -delta_zs*es, but for the update we need
     // U^T*etilde = es.
@@ -416,16 +418,38 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
     }
 
     // delta_zN = -N^T delta_y
-    std::vector<f_t> delta_zN(n - m);
-    for (i_t k = 0; k < n - m; ++k) {
-      const i_t j         = nonbasic_list[k];
-      const i_t col_start = lp.A.col_start[j];
-      const i_t col_end   = lp.A.col_start[j + 1];
-      f_t dot             = 0.0;
-      for (i_t p = col_start; p < col_end; ++p) {
-        dot += lp.A.x[p] * delta_y[lp.A.i[p]];
+    // Choose sparse vs dense method by delta_y sparsity (match dual simplex: sparse if <= 30% nnz)
+    std::fill(delta_zN.begin(), delta_zN.end(), 0.);
+    const bool use_sparse = (delta_y_sparse.i.size() * 1.0 / m) <= 0.3;
+
+    if (use_sparse) {
+      delta_expanded.resize(n);
+      std::fill(delta_expanded.begin(), delta_expanded.end(), 0.);
+      for (i_t nnz_idx = 0; nnz_idx < static_cast<i_t>(delta_y_sparse.i.size()); ++nnz_idx) {
+        const i_t row       = delta_y_sparse.i[nnz_idx];
+        const f_t val       = delta_y_sparse.x[nnz_idx];
+        const i_t row_start = Arow.row_start[row];
+        const i_t row_end   = Arow.row_start[row + 1];
+        for (i_t p = row_start; p < row_end; ++p) {
+          const i_t col = Arow.j[p];
+          delta_expanded[col] += Arow.x[p] * val;
+        }
+      }
+      for (i_t k = 0; k < n - m; ++k) {
+        delta_zN[k] = -delta_expanded[nonbasic_list[k]];
+      }
+    } else {
+      delta_y_sparse.to_dense(delta_y_dense);
+      for (i_t k = 0; k < n - m; ++k) {
+        const i_t j       = nonbasic_list[k];
+        f_t dot           = 0.0;
+        const i_t c_start = lp.A.col_start[j];
+        const i_t c_end   = lp.A.col_start[j + 1];
+        for (i_t p = c_start; p < c_end; ++p) {
+          dot += lp.A.x[p] * delta_y_dense[lp.A.i[p]];
+        }
+        delta_zN[k] = -dot;
       }
-      delta_zN[k] = -dot;
     }
 
     i_t entering_index          = -1;
@@ -435,8 +459,10 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
     assert(step_length >= -1e-6);
 
     // y <- y + step_length * delta_y
-    for (i_t i = 0; i < m; ++i) {
-      y[i] += step_length * delta_y[i];
+    // Optimized: Only update non-zero elements from sparse representation
+    for (i_t nnz_idx = 0; nnz_idx < delta_y_sparse.i.size(); ++nnz_idx) {
+      const i_t i = delta_y_sparse.i[nnz_idx];
+      y[i] += step_length * delta_y_sparse.x[nnz_idx];
     }
 
     // z <- z + step_length * delta z
@@ -725,7 +751,6 @@ i_t primal_push(const lp_problem_t<i_t, f_t>& lp,
 {
   const i_t m = lp.num_rows;
   const i_t n = lp.num_cols;
-
   settings.log.debug("Primal push: superbasic %ld\n", superbasic_list.size());
 
   std::vector<f_t>& x = solution.x;
@@ -1002,6 +1027,7 @@ i_t primal_push(const lp_problem_t<i_t, f_t>& lp,
   }
   solution.x = x_compare;
   solution.iterations += num_pushes;
+
   return 0;
 }
 
@@ -1190,6 +1216,9 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   f_t crossover_start = tic();
   f_t work_estimate   = 0;
 
+  csr_matrix_t<i_t, f_t> Arow(m, n, 1);
+  lp.A.to_compressed_row(Arow);
+
   settings.log.printf("\n");
   settings.log.printf("Starting crossover\n");
 
@@ -1331,8 +1360,16 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   basis_update_mpf_t ft(L, U, p, settings.refactor_frequency);
   verify_basis<i_t, f_t>(m, n, vstatus);
   compare_vstatus_with_lists<i_t, f_t>(m, n, basic_list, nonbasic_list, vstatus);
-  i_t dual_push_status = dual_push(
-    lp, settings, start_time, solution, ft, basic_list, nonbasic_list, superbasic_list, vstatus);
+  i_t dual_push_status = dual_push(lp,
+                                   Arow,
+                                   settings,
+                                   start_time,
+                                   solution,
+                                   ft,
+                                   basic_list,
+                                   nonbasic_list,
+                                   superbasic_list,
+                                   vstatus);
   if (dual_push_status < 0) { return return_to_status(dual_push_status); }
   settings.log.debug("basic list size %ld m %d\n", basic_list.size(), m);
   settings.log.debug("nonbasic list size %ld n - m %d\n", nonbasic_list.size(), n - m);

@@ -3597,10 +3597,6 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                           100.0 * dense_delta_z / (sparse_delta_z + dense_delta_z));
       ft.print_stats();
     }
-    if (settings.inside_mip && settings.concurrent_halt != nullptr) {
-      settings.log.debug("Setting concurrent halt in Dual Simplex Phase 2\n");
-      *settings.concurrent_halt = 1;
-    }
   }
   return status;
 }

@@ -7,6 +7,8 @@
 
 #include <dual_simplex/presolve.hpp>
 
+#include <cuopt/linear_programming/constants.h>
+
 #include <dual_simplex/bounds_strengthening.hpp>
 #include <dual_simplex/folding.hpp>
 #include <dual_simplex/right_looking_lu.hpp>
@@ -1571,4 +1573,62 @@ template void uncrush_solution<int, double>(const presolve_info_t<int, double>&
 
 #endif
 
+#if CUOPT_INSTANTIATE_FLOAT
+
+template void convert_user_problem<int, float>(
+  const user_problem_t<int, float>& user_problem,
+  const simplex_solver_settings_t<int, float>& settings,
+  lp_problem_t<int, float>& problem,
+  std::vector<int>& new_slacks,
+  dualize_info_t<int, float>& dualize_info);
+
+template void convert_user_lp_with_guess<int, float>(
+  const user_problem_t<int, float>& user_problem,
+  const lp_solution_t<int, float>& initial_solution,
+  const std::vector<float>& initial_slack,
+  lp_problem_t<int, float>& lp,
+  lp_solution_t<int, float>& converted_solution);
+
+template int presolve<int, float>(const lp_problem_t<int, float>& original,
+                                  const simplex_solver_settings_t<int, float>& settings,
+                                  lp_problem_t<int, float>& presolved,
+                                  presolve_info_t<int, float>& presolve_info);
+
+template void crush_primal_solution<int, float>(const user_problem_t<int, float>& user_problem,
+                                                const lp_problem_t<int, float>& problem,
+                                                const std::vector<float>& user_solution,
+                                                const std::vector<int>& new_slacks,
+                                                std::vector<float>& solution);
+
+template float crush_dual_solution<int, float>(const user_problem_t<int, float>& user_problem,
+                                               const lp_problem_t<int, float>& problem,
+                                               const std::vector<int>& new_slacks,
+                                               const std::vector<float>& user_y,
+                                               const std::vector<float>& user_z,
+                                               std::vector<float>& y,
+                                               std::vector<float>& z);
+
+template void uncrush_primal_solution<int, float>(const user_problem_t<int, float>& user_problem,
+                                                  const lp_problem_t<int, float>& problem,
+                                                  const std::vector<float>& solution,
+                                                  std::vector<float>& user_solution);
+
+template void uncrush_dual_solution<int, float>(const user_problem_t<int, float>& user_problem,
+                                                const lp_problem_t<int, float>& problem,
+                                                const std::vector<float>& y,
+                                                const std::vector<float>& z,
+                                                std::vector<float>& user_y,
+                                                std::vector<float>& user_z);
+
+template void uncrush_solution<int, float>(const presolve_info_t<int, float>& presolve_info,
+                                           const simplex_solver_settings_t<int, float>& settings,
+                                           const std::vector<float>& crushed_x,
+                                           const std::vector<float>& crushed_y,
+                                           const std::vector<float>& crushed_z,
+                                           std::vector<float>& uncrushed_x,
+                                           std::vector<float>& uncrushed_y,
+                                           std::vector<float>& uncrushed_z);
+
+#endif
+
 }  // namespace cuopt::linear_programming::dual_simplex
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		release/26.04
		main
Copy link coderabbitai bot Mar 19, 2026 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. ⚠️ Potential issue | 🟠 Major Avoid pointing the release flow at `main` for this branch marker. Using `main` here can pull moving-head workflow/config changes into a release line and make builds non-reproducible. Prefer pinning to the release branch/tag used for this train (e.g., `release/26.04` or an explicit immutable ref). 🤖 Prompt for AI Agents Verify each finding against the current code and only fix it if needed. In `@RAPIDS_BRANCH` at line 1, the RAPIDS_BRANCH file currently points at the mutable "main" branch, which risks pulling moving-head changes into the release flow; update the branch marker to a stable release branch or tag (for example, "release/26.04" or an explicit immutable ref) by replacing the "main" entry in RAPIDS_BRANCH with the chosen release branch or tag so builds remain reproducible.