diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH
index d5ea6ced53..ba2906d066 100644
--- a/RAPIDS_BRANCH
+++ b/RAPIDS_BRANCH
@@ -1 +1 @@
-release/26.04
+main
diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
index d3f59144cc..611e4419c2 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
@@ -264,6 +264,8 @@ class pdlp_solver_settings_t {
   bool inside_mip{false};
   // For concurrent termination
   std::atomic<int>* concurrent_halt{nullptr};
+  /** If true, solver does not set concurrent_halt; caller sets it after crossover. */
+  bool halt_set_by_caller{false};
   static constexpr f_t minimal_absolute_tolerance = 1.0e-12;
   pdlp_hyper_params::pdlp_hyper_params_t hyper_params;
   // Holds the information of new variable lower and upper bounds for each climber in the format:
diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_solution.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_solution.hpp
index 45a47e7401..9bd5796a89 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_solution.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_solution.hpp
@@ -235,6 +235,7 @@ class optimization_problem_solution_t : public base_solution_t {
    * @return rmm::device_uvector<i_t> The device memory container for the reduced cost.
    */
   rmm::device_uvector<f_t>& get_reduced_cost();
+  const rmm::device_uvector<f_t>& get_reduced_cost() const;
 
   /**
    * @brief Get termination reason
diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index ba2b63983e..a5c3948ec9 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -9,6 +9,8 @@
 #include <branch_and_bound/mip_node.hpp>
 #include <branch_and_bound/pseudo_costs.hpp>
 
+#include <mip_heuristics/root_lp.cuh>
+
 #include <cuts/cuts.hpp>
 #include <mip_heuristics/presolve/conflict_graph/clique_table.cuh>
 
@@ -29,13 +31,18 @@
 #include <omp.h>
 
 #include <algorithm>
+#include <chrono>
 #include <cmath>
+#include <condition_variable>
 #include <cstdio>
 #include <cstdlib>
 #include <deque>
+#include <functional>
 #include <future>
 #include <limits>
 #include <map>
+#include <memory>
+#include <mutex>
 #include <optional>
 #include <string>
 #include <thread>
@@ -243,6 +250,8 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
   const user_problem_t<i_t, f_t>& user_problem,
   const simplex_solver_settings_t<i_t, f_t>& solver_settings,
   f_t start_time,
+  cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr,
+  i_t pdlp_root_num_gpus,
   std::shared_ptr<detail::clique_table_t<i_t, f_t>> clique_table)
   : original_problem_(user_problem),
     settings_(solver_settings),
@@ -251,9 +260,10 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
     Arow_(1, 1, 0),
     incumbent_(1),
     root_relax_soln_(1, 1),
-    root_crossover_soln_(1, 1),
     pc_(1),
-    solver_status_(mip_status_t::UNSET)
+    solver_status_(mip_status_t::UNSET),
+    mip_problem_ptr_(mip_problem_ptr),
+    pdlp_root_num_gpus_(pdlp_root_num_gpus)
 {
   exploration_stats_.start_time = start_time;
 #ifdef PRINT_CONSTRAINT_MATRIX
@@ -1802,6 +1812,84 @@ void branch_and_bound_t<i_t, f_t>::single_threaded_solve()
   }
 }
 
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::run_concurrent_pdlp_and_barrier_with_crossover(
+  const simplex_solver_settings_t<i_t, f_t>& lp_settings,
+  crossover_status_t& crossover_status_out,
+  lp_solution_t<i_t, f_t>& winner_crossover_soln_out,
+  std::vector<variable_status_t>& winner_crossover_vstatus_out,
+  f_t& winner_root_objective_out,
+  std::string& winner_solver_name_out,
+  std::atomic<int>& winner,
+  std::mutex* first_solver_mutex,
+  bool* first_solver_callback_done,
+  std::thread& pdlp_thread_out,
+  std::thread& barrier_thread_out)
+{
+  // PDLP+crossover and Barrier+crossover each in a thread. winner: 0=none, 1=dual, 2=PDLP,
+  // 3=Barrier.
+  struct concurrent_shared_state_t {
+    std::mutex first_result_mutex;
+  };
+  auto shared = std::make_shared<concurrent_shared_state_t>();
+
+  auto do_crush_crossover = [this,
+                             &lp_settings,
+                             &crossover_status_out,
+                             &winner_crossover_soln_out,
+                             &winner_crossover_vstatus_out,
+                             &winner_root_objective_out,
+                             &winner_solver_name_out,
+                             &winner,
+                             first_solver_mutex,
+                             first_solver_callback_done,
+                             shared](const root_relaxation_first_solution_t<i_t, f_t>& result,
+                                     const char* solver_name,
+                                     int winner_id) {
+    return cuopt::linear_programming::detail::run_crush_crossover_and_maybe_win<i_t, f_t>(
+      result,
+      original_problem_,
+      original_lp_,
+      new_slacks_,
+      settings_,
+      exploration_stats_.start_time,
+      get_root_concurrent_halt(),
+      [this]() { set_root_concurrent_halt(1); },
+      lp_settings.on_first_lp_solution_available,
+      first_solver_mutex,
+      first_solver_callback_done,
+      &shared->first_result_mutex,
+      &winner,
+      winner_id,
+      &crossover_status_out,
+      &winner_crossover_soln_out,
+      &winner_crossover_vstatus_out,
+      &winner_root_objective_out,
+      solver_name,
+      &winner_solver_name_out);
+  };
+
+  pdlp_thread_out = std::thread([this, &lp_settings, do_crush_crossover]() {
+    auto result = cuopt::linear_programming::detail::run_solver_for_root_lp<i_t, f_t>(
+      mip_problem_ptr_,
+      lp_settings.time_limit,
+      get_root_concurrent_halt(),
+      pdlp_root_num_gpus_,
+      cuopt::linear_programming::method_t::PDLP);
+    (void)do_crush_crossover(result, "PDLP", 2);
+  });
+
+  barrier_thread_out = std::thread([this, &lp_settings, do_crush_crossover]() {
+    auto result = cuopt::linear_programming::detail::run_solver_for_root_lp<i_t, f_t>(
+      mip_problem_ptr_,
+      lp_settings.time_limit,
+      get_root_concurrent_halt(),
+      pdlp_root_num_gpus_,
+      cuopt::linear_programming::method_t::Barrier);
+    (void)do_crush_crossover(result, "Barrier", 3);
+  });
+}
+
 template <typename i_t, typename f_t>
 lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
   simplex_solver_settings_t<i_t, f_t> const& lp_settings,
@@ -1817,89 +1905,157 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
   i_t iter                = 0;
   std::string solver_name = "";
 
-  // Root node path
-  lp_status_t root_status;
-  std::future<lp_status_t> root_status_future;
-  root_status_future = std::async(std::launch::async,
-                                  &solve_linear_program_with_advanced_basis<i_t, f_t>,
-                                  std::ref(original_lp_),
-                                  exploration_stats_.start_time,
-                                  std::ref(lp_settings),
-                                  std::ref(root_relax_soln),
-                                  std::ref(basis_update),
-                                  std::ref(basic_list),
-                                  std::ref(nonbasic_list),
-                                  std::ref(root_vstatus),
-                                  std::ref(edge_norms),
-                                  nullptr);
-  // Wait for the root relaxation solution to be sent by the diversity manager or dual simplex
-  // to finish
-  while (!root_crossover_solution_set_.load(std::memory_order_acquire) &&
-         *get_root_concurrent_halt() == 0) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(1));
-    continue;
-  }
-
-  if (root_crossover_solution_set_.load(std::memory_order_acquire)) {
-    // Crush the root relaxation solution on converted user problem
-    std::vector<f_t> crushed_root_x;
-    crush_primal_solution(
-      original_problem_, original_lp_, root_crossover_soln_.x, new_slacks_, crushed_root_x);
-    std::vector<f_t> crushed_root_y;
-    std::vector<f_t> crushed_root_z;
-
-    f_t dual_res_inf = crush_dual_solution(original_problem_,
-                                           original_lp_,
-                                           new_slacks_,
-                                           root_crossover_soln_.y,
-                                           root_crossover_soln_.z,
-                                           crushed_root_y,
-                                           crushed_root_z);
-
-    root_crossover_soln_.x = crushed_root_x;
-    root_crossover_soln_.y = crushed_root_y;
-    root_crossover_soln_.z = crushed_root_z;
-
-    // Call crossover on the crushed solution
-    auto root_crossover_settings            = settings_;
-    root_crossover_settings.log.log         = false;
-    root_crossover_settings.concurrent_halt = get_root_concurrent_halt();
-    crossover_status_t crossover_status     = crossover(original_lp_,
-                                                    root_crossover_settings,
-                                                    root_crossover_soln_,
-                                                    exploration_stats_.start_time,
-                                                    root_crossover_soln_,
-                                                    crossover_vstatus_);
-
-    // Check if crossover was stopped by dual simplex
+  // Dual simplex runs on the main thread when concurrent; otherwise it runs alone on main.
+  auto dual_simplex_settings = std::make_shared<simplex_solver_settings_t<i_t, f_t>>(lp_settings);
+  dual_simplex_settings->inside_mip = 1;
+
+  lp_status_t root_status = lp_status_t::UNSET;
+  lp_status_t root_result_status =
+    lp_status_t::UNSET;  // dual simplex result; set when dual returns, read in else branch
+
+  bool use_pdlp_path               = false;
+  bool dual_simplex_finished_first = false;
+
+  crossover_status_t crossover_status = crossover_status_t::NUMERICAL_ISSUES;
+  lp_solution_t<i_t, f_t> winner_crossover_soln(original_lp_.num_rows, original_lp_.num_cols);
+  std::vector<variable_status_t> winner_crossover_vstatus;
+  f_t winner_root_objective = 0;
+  std::string root_winner_solver_name;
+
+  std::thread pdlp_thread;
+  std::thread barrier_thread;
+  std::thread dual_simplex_thread;
+  std::atomic<int> winner{0};  // 0=none, 1=dual, 2=PDLP, 3=Barrier
+
+  if (enable_concurrent_lp_root_solve_ && mip_problem_ptr_ != nullptr) {
+    // All three run in threads; main only starts them and joins. First to finish with OPTIMAL sets
+    // winner and halt.
+    std::mutex first_solver_mutex;
+    bool first_solver_callback_done = false;
+    run_concurrent_pdlp_and_barrier_with_crossover(lp_settings,
+                                                   crossover_status,
+                                                   winner_crossover_soln,
+                                                   winner_crossover_vstatus,
+                                                   winner_root_objective,
+                                                   root_winner_solver_name,
+                                                   winner,
+                                                   &first_solver_mutex,
+                                                   &first_solver_callback_done,
+                                                   pdlp_thread,
+                                                   barrier_thread);
+
+    // Dual simplex does not call on_first_lp_solution: diversity manager prefers optimal first;
+    // only PDLP/Barrier feed first solution when they have one.
+    dual_simplex_thread = std::thread([this,
+                                       dual_simplex_settings,
+                                       &root_relax_soln,
+                                       &basis_update,
+                                       &basic_list,
+                                       &nonbasic_list,
+                                       &root_vstatus,
+                                       &edge_norms,
+                                       &root_result_status,
+                                       &winner]() {
+      lp_status_t status =
+        solve_linear_program_with_advanced_basis<i_t, f_t>(original_lp_,
+                                                           exploration_stats_.start_time,
+                                                           *dual_simplex_settings,
+                                                           root_relax_soln,
+                                                           basis_update,
+                                                           basic_list,
+                                                           nonbasic_list,
+                                                           root_vstatus,
+                                                           edge_norms,
+                                                           nullptr);
+      root_result_status = status;
+      int expected       = 0;
+      if (status == lp_status_t::OPTIMAL &&
+          winner.compare_exchange_strong(expected, 1, std::memory_order_acq_rel)) {
+        set_root_concurrent_halt(1);
+      }
+    });
+
+    struct join_threads_guard_t {
+      std::thread* a = nullptr;
+      std::thread* b = nullptr;
+      std::thread* c = nullptr;
+      ~join_threads_guard_t()
+      {
+        if (a && a->joinable()) { a->join(); }
+        if (b && b->joinable()) { b->join(); }
+        if (c && c->joinable()) { c->join(); }
+      }
+    } join_guard;
+    join_guard.a = &pdlp_thread;
+    join_guard.b = &barrier_thread;
+    join_guard.c = &dual_simplex_thread;
+
+    pdlp_thread.join();
+    barrier_thread.join();
+    dual_simplex_thread.join();
+    join_guard.a = nullptr;
+    join_guard.b = nullptr;
+    join_guard.c = nullptr;
+
+    // Winner may have set concurrent_halt==1 to stop peer solvers. All threads are joined; reset
+    // the flag for the rest of B&B (subsequent LP solves, etc.).
+    set_root_concurrent_halt(0);
+
+    const int w   = winner.load(std::memory_order_acquire);
+    use_pdlp_path = (w == 2 || w == 3);
+    if (w == 1) { dual_simplex_finished_first = true; }
+  } else {
+    // Non-concurrent: run dual simplex on main only.
+    root_status        = solve_linear_program_with_advanced_basis<i_t, f_t>(original_lp_,
+                                                                     exploration_stats_.start_time,
+                                                                     *dual_simplex_settings,
+                                                                     root_relax_soln,
+                                                                     basis_update,
+                                                                     basic_list,
+                                                                     nonbasic_list,
+                                                                     root_vstatus,
+                                                                     edge_norms,
+                                                                     nullptr);
+    root_result_status = root_status;
+    if (lp_settings.on_first_lp_solution_available) {
+      root_relaxation_first_solution_t<i_t, f_t> ds_result;
+      ds_result.primal         = root_relax_soln.x;
+      ds_result.dual           = root_relax_soln.y;
+      ds_result.reduced_costs  = root_relax_soln.z;
+      ds_result.objective      = root_relax_soln.objective;
+      ds_result.user_objective = root_relax_soln.user_objective;
+      ds_result.iterations     = root_relax_soln.iterations;
+      lp_settings.on_first_lp_solution_available(ds_result);
+    }
+  }
+
+  if (use_pdlp_path) {
+    root_objective_                 = winner_root_objective;
+    auto root_crossover_settings    = settings_;
+    root_crossover_settings.log.log = false;
+    // Single-threaded CPU post-processing (refactor_basis, edge norms); concurrent halt must not
+    // apply.
+    root_crossover_settings.concurrent_halt = nullptr;
     if (crossover_status == crossover_status_t::OPTIMAL) {
-      set_root_concurrent_halt(1);             // Stop dual simplex
-      root_status = root_status_future.get();  // Wait for dual simplex to finish
-      set_root_concurrent_halt(0);             // Clear the concurrent halt flag
-      // Override the root relaxation solution with the crossover solution
-      root_relax_soln = root_crossover_soln_;
-      root_vstatus    = crossover_vstatus_;
+      // Use winner's crossover solution; no wait.
+      root_relax_soln = winner_crossover_soln;
+      root_vstatus    = winner_crossover_vstatus;
       root_status     = lp_status_t::OPTIMAL;
       basic_list.clear();
       nonbasic_list.reserve(original_lp_.num_cols - original_lp_.num_rows);
       nonbasic_list.clear();
       // Get the basic list and nonbasic list from the vstatus
       for (i_t j = 0; j < original_lp_.num_cols; j++) {
-        if (crossover_vstatus_[j] == variable_status_t::BASIC) {
+        if (winner_crossover_vstatus[j] == variable_status_t::BASIC) {
           basic_list.push_back(j);
         } else {
           nonbasic_list.push_back(j);
         }
       }
       if (basic_list.size() != original_lp_.num_rows) {
-        settings_.log.printf(
-          "basic_list size %d != m %d\n", basic_list.size(), original_lp_.num_rows);
         assert(basic_list.size() == original_lp_.num_rows);
       }
       if (nonbasic_list.size() != original_lp_.num_cols - original_lp_.num_rows) {
-        settings_.log.printf("nonbasic_list size %d != n - m %d\n",
-                             nonbasic_list.size(),
-                             original_lp_.num_cols - original_lp_.num_rows);
         assert(nonbasic_list.size() == original_lp_.num_cols - original_lp_.num_rows);
       }
       // Populate the basis_update from the crossover vstatus
@@ -1910,9 +2066,8 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
                                                         exploration_stats_.start_time,
                                                         basic_list,
                                                         nonbasic_list,
-                                                        crossover_vstatus_);
+                                                        winner_crossover_vstatus);
       if (refactor_status != 0) {
-        settings_.log.printf("Failed to refactor basis. %d deficient columns.\n", refactor_status);
         assert(refactor_status == 0);
         root_status = lp_status_t::NUMERICAL_ISSUES;
       }
@@ -1920,19 +2075,32 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
       // Set the edge norms to a default value
       edge_norms.resize(original_lp_.num_cols, -1.0);
       set_uninitialized_steepest_edge_norms<i_t, f_t>(original_lp_, basic_list, edge_norms);
-      user_objective = root_crossover_soln_.user_objective;
-      iter           = root_crossover_soln_.iterations;
-      solver_name    = "Barrier/PDLP and Crossover";
+      user_objective = winner_crossover_soln.user_objective;
+      iter           = winner_crossover_soln.iterations;
+      solver_name    = root_winner_solver_name + " and Crossover";
     } else {
-      root_status    = root_status_future.get();
-      user_objective = root_relax_soln_.user_objective;
-      iter           = root_relax_soln_.iterations;
-      solver_name    = "Dual Simplex";
+      // Crossover winner path but crossover was not OPTIMAL. Map crossover outcome to lp_status_t.
+      switch (crossover_status) {
+        case crossover_status_t::TIME_LIMIT: root_status = lp_status_t::TIME_LIMIT; break;
+        case crossover_status_t::NUMERICAL_ISSUES:
+          root_status = lp_status_t::NUMERICAL_ISSUES;
+          break;
+        case crossover_status_t::CONCURRENT_LIMIT: root_status = lp_status_t::TIME_LIMIT; break;
+        case crossover_status_t::PRIMAL_FEASIBLE:
+        case crossover_status_t::DUAL_FEASIBLE: root_status = lp_status_t::NUMERICAL_ISSUES; break;
+        default: root_status = lp_status_t::NUMERICAL_ISSUES; break;
+      }
+      user_objective = winner_crossover_soln.user_objective;
+      iter           = winner_crossover_soln.iterations;
+      solver_name    = root_winner_solver_name + " and Crossover";
     }
   } else {
-    root_status    = root_status_future.get();
-    user_objective = root_relax_soln_.user_objective;
-    iter           = root_relax_soln_.iterations;
+    // Use dual simplex result (root_result_status was set when dual simplex returned).
+    root_status = root_result_status;
+    (void)dual_simplex_finished_first;  // used only to select path
+    // Diversity manager was already notified by whoever was first (dual simplex, PDLP, or Barrier).
+    user_objective = root_relax_soln.user_objective;
+    iter           = root_relax_soln.iterations;
     solver_name    = "Dual Simplex";
   }
 
@@ -1949,8 +2117,8 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
   }
 
   settings_.log.printf("\n");
-  is_root_solution_set = true;
 
+  set_root_concurrent_halt(0);
   return root_status;
 }
 
@@ -3799,4 +3967,10 @@ template class branch_and_bound_t<int, double>;
 
 #endif
 
+#ifdef MIP_INSTANTIATION_FLOAT
+
+template class branch_and_bound_t<int, float>;
+
+#endif
+
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp
index 98674b7f9e..b4c46ac8e9 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.hpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.hpp
@@ -17,6 +17,7 @@
 
 #include <cuts/cuts.hpp>
 
+#include <dual_simplex/crossover.hpp>
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/phase2.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
@@ -33,15 +34,21 @@
 #include <omp.h>
 
 #include <atomic>
+#include <mutex>
+#include <thread>
+
 #include <functional>
 #include <future>
 #include <memory>
 #include <vector>
 
 namespace cuopt::linear_programming::detail {
+template <typename i_t, typename f_t>
+class problem_t;
+
 template <typename i_t, typename f_t>
 struct clique_table_t;
-}
+}  // namespace cuopt::linear_programming::detail
 
 namespace cuopt::linear_programming::dual_simplex {
 
@@ -74,34 +81,20 @@ struct deterministic_diving_policy_t;
 template <typename i_t, typename f_t>
 class branch_and_bound_t {
  public:
-  branch_and_bound_t(const user_problem_t<i_t, f_t>& user_problem,
-                     const simplex_solver_settings_t<i_t, f_t>& solver_settings,
-                     f_t start_time,
-                     std::shared_ptr<detail::clique_table_t<i_t, f_t>> clique_table = nullptr);
+  /** Host \p user_problem must be fully populated by the caller. When \p mip_problem_ptr is
+   *  non-null (GPU MIP / concurrent root), the caller must sync from device first, e.g.
+   *  recompute_objective_integrality(), set objective_is_integral, get_host_user_problem(). */
+  branch_and_bound_t(
+    const user_problem_t<i_t, f_t>& user_problem,
+    const simplex_solver_settings_t<i_t, f_t>& solver_settings,
+    f_t start_time,
+    cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr = nullptr,
+    i_t pdlp_root_num_gpus                                                  = 1,
+    std::shared_ptr<detail::clique_table_t<i_t, f_t>> clique_table          = nullptr);
 
   // Set an initial guess based on the user_problem. This should be called before solve.
   void set_initial_guess(const std::vector<f_t>& user_guess) { guess_ = user_guess; }
 
-  // Set the root solution found by PDLP
-  void set_root_relaxation_solution(const std::vector<f_t>& primal,
-                                    const std::vector<f_t>& dual,
-                                    const std::vector<f_t>& reduced_costs,
-                                    f_t objective,
-                                    f_t user_objective,
-                                    i_t iterations)
-  {
-    if (!is_root_solution_set) {
-      root_crossover_soln_.x              = primal;
-      root_crossover_soln_.y              = dual;
-      root_crossover_soln_.z              = reduced_costs;
-      root_objective_                     = objective;
-      root_crossover_soln_.objective      = objective;
-      root_crossover_soln_.user_objective = user_objective;
-      root_crossover_soln_.iterations     = iterations;
-      root_crossover_solution_set_.store(true, std::memory_order_release);
-    }
-  }
-
   // Set a solution based on the user problem during the course of the solve
   void set_new_solution(const std::vector<f_t>& solution);
 
@@ -122,6 +115,7 @@ class branch_and_bound_t {
                        std::vector<f_t>& repaired_solution) const;
 
   f_t get_lower_bound();
+  i_t get_num_cols() const { return original_problem_.num_cols; }
   bool enable_concurrent_lp_root_solve() const { return enable_concurrent_lp_root_solve_; }
   std::atomic<int>* get_root_concurrent_halt() { return &root_concurrent_halt_; }
   void set_root_concurrent_halt(int value) { root_concurrent_halt_ = value; }
@@ -133,6 +127,21 @@ class branch_and_bound_t {
                                     std::vector<i_t>& nonbasic_list,
                                     std::vector<f_t>& edge_norms);
 
+  /** Starts PDLP+crossover and Barrier+crossover in two threads. winner is 0=none, 1=dual, 2=PDLP,
+   * 3=Barrier; first OPTIMAL sets it. first_solver_* for diversity manager callback. */
+  void run_concurrent_pdlp_and_barrier_with_crossover(
+    const simplex_solver_settings_t<i_t, f_t>& lp_settings,
+    crossover_status_t& crossover_status_out,
+    lp_solution_t<i_t, f_t>& winner_crossover_soln_out,
+    std::vector<variable_status_t>& winner_crossover_vstatus_out,
+    f_t& winner_root_objective_out,
+    std::string& winner_solver_name_out,
+    std::atomic<int>& winner,
+    std::mutex* first_solver_mutex,
+    bool* first_solver_callback_done,
+    std::thread& pdlp_thread_out,
+    std::thread& barrier_thread_out);
+
   i_t find_reduced_cost_fixings(f_t upper_bound,
                                 std::vector<f_t>& lower_bounds,
                                 std::vector<f_t>& upper_bounds);
@@ -146,7 +155,7 @@ class branch_and_bound_t {
   producer_sync_t& get_producer_sync() { return producer_sync_; }
 
  private:
-  const user_problem_t<i_t, f_t>& original_problem_;
+  user_problem_t<i_t, f_t> original_problem_;
   const simplex_solver_settings_t<i_t, f_t> settings_;
   std::shared_ptr<detail::clique_table_t<i_t, f_t>> clique_table_;
   std::future<std::shared_ptr<detail::clique_table_t<i_t, f_t>>> clique_table_future_;
@@ -194,17 +203,16 @@ class branch_and_bound_t {
 
   // Variables for the root node in the search tree.
   std::vector<variable_status_t> root_vstatus_;
-  std::vector<variable_status_t> crossover_vstatus_;
   f_t root_objective_;
   lp_solution_t<i_t, f_t> root_relax_soln_;
-  lp_solution_t<i_t, f_t> root_crossover_soln_;
   std::vector<f_t> edge_norms_;
   std::atomic<bool> root_crossover_solution_set_{false};
   omp_atomic_t<f_t> root_lp_current_lower_bound_;
   omp_atomic_t<bool> solving_root_relaxation_{false};
   bool enable_concurrent_lp_root_solve_{false};
   std::atomic<int> root_concurrent_halt_{0};
-  bool is_root_solution_set{false};
+  cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr_{nullptr};
+  i_t pdlp_root_num_gpus_{1};
 
   // Pseudocosts
   pseudo_costs_t<i_t, f_t> pc_;
diff --git a/cpp/src/dual_simplex/crossover.cpp b/cpp/src/dual_simplex/crossover.cpp
index 988c9c50ad..14624a4f4c 100644
--- a/cpp/src/dual_simplex/crossover.cpp
+++ b/cpp/src/dual_simplex/crossover.cpp
@@ -331,6 +331,7 @@ void compute_dual_solution_from_basis(const lp_problem_t<i_t, f_t>& lp,
 
 template <typename i_t, typename f_t>
 i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
+              const csr_matrix_t<i_t, f_t>& Arow,
               const simplex_solver_settings_t<i_t, f_t>& settings,
               f_t start_time,
               lp_solution_t<i_t, f_t>& solution,
@@ -387,6 +388,9 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
   std::vector<f_t>& y       = solution.y;
   const std::vector<f_t>& x = solution.x;
   i_t num_pushes            = 0;
+  std::vector<f_t> delta_zN(n - m);
+  std::vector<f_t> delta_expanded;  // workspace for sparse path (delta_y is sparse enough)
+  std::vector<f_t> delta_y_dense;   // workspace for dense path (delta_y is not sparse enough)
   while (superbasic_list.size() > 0) {
     const i_t s                   = superbasic_list.back();
     const i_t basic_leaving_index = superbasic_list_index.back();
@@ -401,11 +405,9 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
     es_sparse.x[0] = -delta_zs;
 
     // B^T delta_y = -delta_zs*es
-    std::vector<f_t> delta_y(m);
     sparse_vector_t<i_t, f_t> delta_y_sparse(m, 1);
     sparse_vector_t<i_t, f_t> UTsol_sparse(m, 1);
     ft.b_transpose_solve(es_sparse, delta_y_sparse, UTsol_sparse);
-    delta_y_sparse.scatter(delta_y);
 
     // We solved B^T delta_y = -delta_zs*es, but for the update we need
     // U^T*etilde = es.
@@ -416,16 +418,38 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
     }
 
     // delta_zN = -N^T delta_y
-    std::vector<f_t> delta_zN(n - m);
-    for (i_t k = 0; k < n - m; ++k) {
-      const i_t j         = nonbasic_list[k];
-      const i_t col_start = lp.A.col_start[j];
-      const i_t col_end   = lp.A.col_start[j + 1];
-      f_t dot             = 0.0;
-      for (i_t p = col_start; p < col_end; ++p) {
-        dot += lp.A.x[p] * delta_y[lp.A.i[p]];
+    // Choose sparse vs dense method by delta_y sparsity (match dual simplex: sparse if <= 30% nnz)
+    std::fill(delta_zN.begin(), delta_zN.end(), 0.);
+    const bool use_sparse = (delta_y_sparse.i.size() * 1.0 / m) <= 0.3;
+
+    if (use_sparse) {
+      delta_expanded.resize(n);
+      std::fill(delta_expanded.begin(), delta_expanded.end(), 0.);
+      for (i_t nnz_idx = 0; nnz_idx < static_cast<i_t>(delta_y_sparse.i.size()); ++nnz_idx) {
+        const i_t row       = delta_y_sparse.i[nnz_idx];
+        const f_t val       = delta_y_sparse.x[nnz_idx];
+        const i_t row_start = Arow.row_start[row];
+        const i_t row_end   = Arow.row_start[row + 1];
+        for (i_t p = row_start; p < row_end; ++p) {
+          const i_t col = Arow.j[p];
+          delta_expanded[col] += Arow.x[p] * val;
+        }
+      }
+      for (i_t k = 0; k < n - m; ++k) {
+        delta_zN[k] = -delta_expanded[nonbasic_list[k]];
+      }
+    } else {
+      delta_y_sparse.to_dense(delta_y_dense);
+      for (i_t k = 0; k < n - m; ++k) {
+        const i_t j       = nonbasic_list[k];
+        f_t dot           = 0.0;
+        const i_t c_start = lp.A.col_start[j];
+        const i_t c_end   = lp.A.col_start[j + 1];
+        for (i_t p = c_start; p < c_end; ++p) {
+          dot += lp.A.x[p] * delta_y_dense[lp.A.i[p]];
+        }
+        delta_zN[k] = -dot;
       }
-      delta_zN[k] = -dot;
     }
 
     i_t entering_index          = -1;
@@ -435,8 +459,10 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
     assert(step_length >= -1e-6);
 
     // y <- y + step_length * delta_y
-    for (i_t i = 0; i < m; ++i) {
-      y[i] += step_length * delta_y[i];
+    // Optimized: Only update non-zero elements from sparse representation
+    for (i_t nnz_idx = 0; nnz_idx < delta_y_sparse.i.size(); ++nnz_idx) {
+      const i_t i = delta_y_sparse.i[nnz_idx];
+      y[i] += step_length * delta_y_sparse.x[nnz_idx];
     }
 
     // z <- z + step_length * delta z
@@ -725,7 +751,6 @@ i_t primal_push(const lp_problem_t<i_t, f_t>& lp,
 {
   const i_t m = lp.num_rows;
   const i_t n = lp.num_cols;
-
   settings.log.debug("Primal push: superbasic %ld\n", superbasic_list.size());
 
   std::vector<f_t>& x = solution.x;
@@ -1002,6 +1027,7 @@ i_t primal_push(const lp_problem_t<i_t, f_t>& lp,
   }
   solution.x = x_compare;
   solution.iterations += num_pushes;
+
   return 0;
 }
 
@@ -1190,6 +1216,9 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   f_t crossover_start = tic();
   f_t work_estimate   = 0;
 
+  csr_matrix_t<i_t, f_t> Arow(m, n, 1);
+  lp.A.to_compressed_row(Arow);
+
   settings.log.printf("\n");
   settings.log.printf("Starting crossover\n");
 
@@ -1331,8 +1360,16 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   basis_update_mpf_t ft(L, U, p, settings.refactor_frequency);
   verify_basis<i_t, f_t>(m, n, vstatus);
   compare_vstatus_with_lists<i_t, f_t>(m, n, basic_list, nonbasic_list, vstatus);
-  i_t dual_push_status = dual_push(
-    lp, settings, start_time, solution, ft, basic_list, nonbasic_list, superbasic_list, vstatus);
+  i_t dual_push_status = dual_push(lp,
+                                   Arow,
+                                   settings,
+                                   start_time,
+                                   solution,
+                                   ft,
+                                   basic_list,
+                                   nonbasic_list,
+                                   superbasic_list,
+                                   vstatus);
   if (dual_push_status < 0) { return return_to_status(dual_push_status); }
   settings.log.debug("basic list size %ld m %d\n", basic_list.size(), m);
   settings.log.debug("nonbasic list size %ld n - m %d\n", nonbasic_list.size(), n - m);
diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp
index 43429ba2de..9434f4661a 100644
--- a/cpp/src/dual_simplex/phase2.cpp
+++ b/cpp/src/dual_simplex/phase2.cpp
@@ -3597,10 +3597,6 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                           100.0 * dense_delta_z / (sparse_delta_z + dense_delta_z));
       ft.print_stats();
     }
-    if (settings.inside_mip && settings.concurrent_halt != nullptr) {
-      settings.log.debug("Setting concurrent halt in Dual Simplex Phase 2\n");
-      *settings.concurrent_halt = 1;
-    }
   }
   return status;
 }
diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp
index b9ee419517..e8af7ba514 100644
--- a/cpp/src/dual_simplex/presolve.cpp
+++ b/cpp/src/dual_simplex/presolve.cpp
@@ -7,6 +7,8 @@
 
 #include <dual_simplex/presolve.hpp>
 
+#include <cuopt/linear_programming/constants.h>
+
 #include <dual_simplex/bounds_strengthening.hpp>
 #include <dual_simplex/folding.hpp>
 #include <dual_simplex/right_looking_lu.hpp>
@@ -1571,4 +1573,62 @@ template void uncrush_solution<int, double>(const presolve_info_t<int, double>&
 
 #endif
 
+#if CUOPT_INSTANTIATE_FLOAT
+
+template void convert_user_problem<int, float>(
+  const user_problem_t<int, float>& user_problem,
+  const simplex_solver_settings_t<int, float>& settings,
+  lp_problem_t<int, float>& problem,
+  std::vector<int>& new_slacks,
+  dualize_info_t<int, float>& dualize_info);
+
+template void convert_user_lp_with_guess<int, float>(
+  const user_problem_t<int, float>& user_problem,
+  const lp_solution_t<int, float>& initial_solution,
+  const std::vector<float>& initial_slack,
+  lp_problem_t<int, float>& lp,
+  lp_solution_t<int, float>& converted_solution);
+
+template int presolve<int, float>(const lp_problem_t<int, float>& original,
+                                  const simplex_solver_settings_t<int, float>& settings,
+                                  lp_problem_t<int, float>& presolved,
+                                  presolve_info_t<int, float>& presolve_info);
+
+template void crush_primal_solution<int, float>(const user_problem_t<int, float>& user_problem,
+                                                const lp_problem_t<int, float>& problem,
+                                                const std::vector<float>& user_solution,
+                                                const std::vector<int>& new_slacks,
+                                                std::vector<float>& solution);
+
+template float crush_dual_solution<int, float>(const user_problem_t<int, float>& user_problem,
+                                               const lp_problem_t<int, float>& problem,
+                                               const std::vector<int>& new_slacks,
+                                               const std::vector<float>& user_y,
+                                               const std::vector<float>& user_z,
+                                               std::vector<float>& y,
+                                               std::vector<float>& z);
+
+template void uncrush_primal_solution<int, float>(const user_problem_t<int, float>& user_problem,
+                                                  const lp_problem_t<int, float>& problem,
+                                                  const std::vector<float>& solution,
+                                                  std::vector<float>& user_solution);
+
+template void uncrush_dual_solution<int, float>(const user_problem_t<int, float>& user_problem,
+                                                const lp_problem_t<int, float>& problem,
+                                                const std::vector<float>& y,
+                                                const std::vector<float>& z,
+                                                std::vector<float>& user_y,
+                                                std::vector<float>& user_z);
+
+template void uncrush_solution<int, float>(const presolve_info_t<int, float>& presolve_info,
+                                           const simplex_solver_settings_t<int, float>& settings,
+                                           const std::vector<float>& crushed_x,
+                                           const std::vector<float>& crushed_y,
+                                           const std::vector<float>& crushed_z,
+                                           std::vector<float>& uncrushed_x,
+                                           std::vector<float>& uncrushed_y,
+                                           std::vector<float>& uncrushed_z);
+
+#endif
+
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp
index 657ebc4762..37202000f8 100644
--- a/cpp/src/dual_simplex/right_looking_lu.cpp
+++ b/cpp/src/dual_simplex/right_looking_lu.cpp
@@ -30,7 +30,7 @@ struct element_t {
   f_t x;               // coefficient value
   i_t next_in_column;  // index of the next element in the column: kNone if there is no next element
   i_t next_in_row;     // index of the next element in the row: kNone if there is no next element
-};
+};  // 24 bytes
 constexpr int kNone = -1;
 
 template <typename i_t, typename f_t>
@@ -79,6 +79,29 @@ i_t initialize_degree_data(const csc_matrix_t<i_t, f_t>& A,
   return Bnz;
 }
 
+// Fill col_pos and row_pos so that column j has col_pos[j] = its index in col_count[Cdegree[j]],
+// and row i has row_pos[i] = its index in row_count[Rdegree[i]]. Enables O(1) degree-bucket
+// removal.
+template <typename i_t>
+void initialize_bucket_positions(const std::vector<std::vector<i_t>>& col_count,
+                                 const std::vector<std::vector<i_t>>& row_count,
+                                 i_t col_max_degree,
+                                 i_t row_max_degree,
+                                 std::vector<i_t>& col_pos,
+                                 std::vector<i_t>& row_pos)
+{
+  for (i_t d = 0; d <= col_max_degree; ++d) {
+    for (i_t pos = 0; pos < static_cast<i_t>(col_count[d].size()); ++pos) {
+      col_pos[col_count[d][pos]] = pos;
+    }
+  }
+  for (i_t d = 0; d <= row_max_degree; ++d) {
+    for (i_t pos = 0; pos < static_cast<i_t>(row_count[d].size()); ++pos) {
+      row_pos[row_count[d][pos]] = pos;
+    }
+  }
+}
+
 template <typename i_t, typename f_t>
 i_t load_elements(const csc_matrix_t<i_t, f_t>& A,
                   const std::vector<i_t>& column_list,
@@ -86,11 +109,11 @@ i_t load_elements(const csc_matrix_t<i_t, f_t>& A,
                   std::vector<element_t<i_t, f_t>>& elements,
                   std::vector<i_t>& first_in_row,
                   std::vector<i_t>& first_in_col,
+                  std::vector<i_t>& last_in_row,
                   f_t& work_estimate)
 {
   const i_t m = A.m;
   const i_t n = column_list.size();
-  std::vector<i_t> last_element_in_row(m, kNone);
   work_estimate += m;
 
   i_t nz = 0;
@@ -106,14 +129,14 @@ i_t load_elements(const csc_matrix_t<i_t, f_t>& A,
       elements[nz].next_in_column = kNone;
       if (p > col_start) { elements[nz - 1].next_in_column = nz; }
       elements[nz].next_in_row = kNone;  // set the current next in row to None (since we don't know
-                                         // if there will be more entries in this row)
-      if (last_element_in_row[i] != kNone) {
+      // if there will be more entries in this row yet))
+      if (last_in_row[i] != kNone) {
         // If we have seen an entry in this row before, set the last entry we've seen in this row to
         // point to the current entry
-        elements[last_element_in_row[i]].next_in_row = nz;
+        elements[last_in_row[i]].next_in_row = nz;
       }
       // The current entry becomes the last element seen in the row
-      last_element_in_row[i] = nz;
+      last_in_row[i] = nz;
       if (p == col_start) { first_in_col[k] = nz; }
       if (first_in_row[i] == kNone) { first_in_row[i] = nz; }
       nz++;
@@ -316,10 +339,11 @@ void update_Cdegree_and_col_count(i_t pivot_i,
                                   const std::vector<i_t>& first_in_row,
                                   std::vector<i_t>& Cdegree,
                                   std::vector<std::vector<i_t>>& col_count,
+                                  std::vector<i_t>& col_pos,
                                   std::vector<element_t<i_t, f_t>>& elements,
                                   f_t& work_estimate)
 {
-  // Update Cdegree and col_count
+  // Update Cdegree and col_count (O(1) removal using position array)
   i_t loop_count = 0;
   for (i_t p = first_in_row[pivot_i]; p != kNone; p = elements[p].next_in_row) {
     element_t<i_t, f_t>* entry = &elements[p];
@@ -327,20 +351,20 @@ void update_Cdegree_and_col_count(i_t pivot_i,
     assert(entry->i == pivot_i);
     i_t cdeg = Cdegree[j];
     assert(cdeg >= 0);
-    for (typename std::vector<i_t>::iterator it = col_count[cdeg].begin();
-         it != col_count[cdeg].end();
-         it++) {
-      if (*it == j) {
-        // Remove col j from col_count[cdeg]
-        std::swap(*it, col_count[cdeg].back());
-        col_count[cdeg].pop_back();
-        work_estimate += (it - col_count[cdeg].begin());
-        break;
-      }
+    // O(1) swap-with-last removal
+    {
+      i_t pos              = col_pos[j];
+      i_t other            = col_count[cdeg].back();
+      col_count[cdeg][pos] = other;
+      col_pos[other]       = pos;
+      col_count[cdeg].pop_back();
     }
     cdeg = --Cdegree[j];
     assert(cdeg >= 0);
-    if (j != pivot_j && cdeg >= 0) { col_count[cdeg].push_back(j); }
+    if (j != pivot_j && cdeg >= 0) {
+      col_pos[j] = col_count[cdeg].size();
+      col_count[cdeg].push_back(j);
+    }
     loop_count++;
   }
   work_estimate += 7 * loop_count;
@@ -353,30 +377,31 @@ void update_Rdegree_and_row_count(i_t pivot_i,
                                   const std::vector<i_t>& first_in_col,
                                   std::vector<i_t>& Rdegree,
                                   std::vector<std::vector<i_t>>& row_count,
+                                  std::vector<i_t>& row_pos,
                                   std::vector<element_t<i_t, f_t>>& elements,
                                   f_t& work_estimate)
 {
-  // Update Rdegree and row_count
+  // Update Rdegree and row_count (O(1) removal using position array)
   i_t loop_count = 0;
   for (i_t p = first_in_col[pivot_j]; p != kNone; p = elements[p].next_in_column) {
     element_t<i_t, f_t>* entry = &elements[p];
     const i_t i                = entry->i;
     i_t rdeg                   = Rdegree[i];
     assert(rdeg >= 0);
-    for (typename std::vector<i_t>::iterator it = row_count[rdeg].begin();
-         it != row_count[rdeg].end();
-         it++) {
-      if (*it == i) {
-        // Remove row i from row_count[rdeg]
-        std::swap(*it, row_count[rdeg].back());
-        row_count[rdeg].pop_back();
-        work_estimate += (it - row_count[rdeg].begin());
-        break;
-      }
+    // O(1) swap-with-last removal
+    {
+      i_t pos              = row_pos[i];
+      i_t other            = row_count[rdeg].back();
+      row_count[rdeg][pos] = other;
+      row_pos[other]       = pos;
+      row_count[rdeg].pop_back();
     }
     rdeg = --Rdegree[i];
     assert(rdeg >= 0);
-    if (i != pivot_i && rdeg >= 0) { row_count[rdeg].push_back(i); }
+    if (i != pivot_i && rdeg >= 0) {
+      row_pos[i] = row_count[rdeg].size();
+      row_count[rdeg].push_back(i);
+    }
     loop_count++;
   }
   work_estimate += 7 * loop_count;
@@ -400,18 +425,17 @@ void schur_complement(i_t pivot_i,
                       std::vector<i_t>& Cdegree,
                       std::vector<std::vector<i_t>>& row_count,
                       std::vector<std::vector<i_t>>& col_count,
+                      std::vector<i_t>& last_in_row,
+                      std::vector<i_t>& col_pos,
+                      std::vector<i_t>& row_pos,
                       std::vector<element_t<i_t, f_t>>& elements,
                       f_t& work_estimate)
 {
+  // row_last_workspace: temp copy of last_in_row for this pivot step, updated when adding fill
+  // last_in_row: persistent tail pointer per row
   for (i_t p1 = first_in_col[pivot_j]; p1 != kNone; p1 = elements[p1].next_in_column) {
-    element_t<i_t, f_t>* e = &elements[p1];
-    const i_t i            = e->i;
-    i_t row_last           = kNone;
-    for (i_t p3 = first_in_row[i]; p3 != kNone; p3 = elements[p3].next_in_row) {
-      row_last = p3;
-    }
-    work_estimate += 2 * Rdegree[i];
-    row_last_workspace[i] = row_last;
+    const i_t i           = elements[p1].i;
+    row_last_workspace[i] = last_in_row[i];
   }
   work_estimate += 4 * Cdegree[pivot_j];
 
@@ -478,35 +502,29 @@ void schur_complement(i_t pivot_i,
           first_in_row[i] = fill_p;
         }
         row_last_workspace[i] = fill_p;
-        i_t rdeg              = Rdegree[i];  // Rdgree must increase
-        for (typename std::vector<i_t>::iterator it = row_count[rdeg].begin();
-             it != row_count[rdeg].end();
-             it++) {
-          if (*it == i) {
-            // Remove row i from row_count[rdeg]
-            std::swap(*it, row_count[rdeg].back());
-            row_count[rdeg].pop_back();
-            work_estimate += 2 * (it - row_count[rdeg].begin());
-            break;
-          }
+        last_in_row[i]        = fill_p;  // maintain last_in_row persistent state
+        // Row degree update: O(1) removal using row_pos
+        {
+          i_t rdeg             = Rdegree[i];
+          i_t pos              = row_pos[i];
+          i_t other            = row_count[rdeg].back();
+          row_count[rdeg][pos] = other;
+          row_pos[other]       = pos;
+          row_count[rdeg].pop_back();
+          row_pos[i] = row_count[rdeg + 1].size();
+          row_count[++Rdegree[i]].push_back(i);
         }
-        rdeg = ++Rdegree[i];           // Increase rdeg
-        row_count[rdeg].push_back(i);  // Add row i to row_count[rdeg]
-
-        i_t cdeg = Cdegree[j];  // Cdegree must increase
-        for (typename std::vector<i_t>::iterator it = col_count[cdeg].begin();
-             it != col_count[cdeg].end();
-             it++) {
-          if (*it == j) {
-            // Remove col j from col_count[cdeg]
-            std::swap(*it, col_count[cdeg].back());
-            col_count[cdeg].pop_back();
-            work_estimate += 2 * (it - col_count[cdeg].begin());
-            break;
-          }
+        // Col degree update: O(1) removal using col_pos
+        {
+          i_t cdeg             = Cdegree[j];
+          i_t pos              = col_pos[j];
+          i_t other            = col_count[cdeg].back();
+          col_count[cdeg][pos] = other;
+          col_pos[other]       = pos;
+          col_count[cdeg].pop_back();
+          col_pos[j] = col_count[cdeg + 1].size();
+          col_count[++Cdegree[j]].push_back(j);
         }
-        cdeg = ++Cdegree[j];           // Increase Cdegree
-        col_count[cdeg].push_back(j);  // Add column j to col_count[cdeg]
       }
     }
     work_estimate += 10 * Cdegree[pivot_j];
@@ -532,7 +550,6 @@ void remove_pivot_row(i_t pivot_i,
                       f_t& work_estimate)
 {
   // Remove the pivot row
-
   i_t row_loop_count = 0;
   for (i_t p0 = first_in_row[pivot_i]; p0 != kNone; p0 = elements[p0].next_in_row) {
     element_t<i_t, f_t>* e = &elements[p0];
@@ -574,6 +591,7 @@ void remove_pivot_col(i_t pivot_i,
                       std::vector<i_t>& first_in_col,
                       std::vector<i_t>& first_in_row,
                       std::vector<f_t>& max_in_row,
+                      std::vector<i_t>& last_in_row,
                       std::vector<element_t<i_t, f_t>>& elements,
                       f_t& work_estimate)
 {
@@ -582,7 +600,10 @@ void remove_pivot_col(i_t pivot_i,
   for (i_t p1 = first_in_col[pivot_j]; p1 != kNone; p1 = elements[p1].next_in_column) {
     element_t<i_t, f_t>* e = &elements[p1];
     const i_t i            = e->i;
-    i_t last               = kNone;
+    // Need both: last = previous-in-row (for link update when removing); last_surviving = new row
+    // tail (for last_in_row[i]). They differ when the pivot is the last element in the row.
+    i_t last           = kNone;
+    i_t last_surviving = kNone;
 #ifdef THRESHOLD_ROOK_PIVOTING
     f_t max_in_row_i = 0.0;
 #endif
@@ -598,16 +619,17 @@ void remove_pivot_col(i_t pivot_i,
         entry->i = -1;
         entry->j = -1;
         entry->x = std::numeric_limits<f_t>::quiet_NaN();
-      }
+      } else {
+        last_surviving = p;
 #ifdef THRESHOLD_ROOK_PIVOTING
-      else {
         const f_t abs_entryx = std::abs(entry->x);
         if (abs_entryx > max_in_row_i) { max_in_row_i = abs_entryx; }
-      }
 #endif
+      }
       last = p;
       row_loop_count++;
     }
+    last_in_row[i] = last_surviving;
     work_estimate += 3 * row_loop_count;
 #ifdef THRESHOLD_ROOK_PIVOTING
     max_in_row[i] = max_in_row_i;
@@ -656,11 +678,19 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
 
   const i_t Bnz =
     initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count, work_estimate);
+
+  // Position arrays for O(1) degree-bucket removal (col_count and row_count each have n+1 buckets)
+  std::vector<i_t> col_pos(n);  // if Cdegree[j] = nz, then j is in col_count[nz][col_pos[j]]
+  std::vector<i_t> row_pos(n);  // if Rdegree[i] = nz, then i is in row_count[nz][row_pos[i]]
+  initialize_bucket_positions(col_count, row_count, n, n, col_pos, row_pos);
+
   std::vector<element_t<i_t, f_t>> elements(Bnz);
   std::vector<i_t> first_in_row(n, kNone);
   std::vector<i_t> first_in_col(n, kNone);
+  std::vector<i_t> last_in_row(n, kNone);
   work_estimate += 2 * n + Bnz;
-  load_elements(A, column_list, Bnz, elements, first_in_row, first_in_col, work_estimate);
+  load_elements(
+    A, column_list, Bnz, elements, first_in_row, first_in_col, last_in_row, work_estimate);
 
   std::vector<i_t> column_j_workspace(n, kNone);
   std::vector<i_t> row_last_workspace(n);
@@ -777,9 +807,9 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
 
     // Update Cdegree and col_count
     update_Cdegree_and_col_count(
-      pivot_i, pivot_j, first_in_row, Cdegree, col_count, elements, work_estimate);
+      pivot_i, pivot_j, first_in_row, Cdegree, col_count, col_pos, elements, work_estimate);
     update_Rdegree_and_row_count(
-      pivot_i, pivot_j, first_in_col, Rdegree, row_count, elements, work_estimate);
+      pivot_i, pivot_j, first_in_col, Rdegree, row_count, row_pos, elements, work_estimate);
 
     // A22 <- A22 - l u^T
     schur_complement(pivot_i,
@@ -798,14 +828,23 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
                      Cdegree,
                      row_count,
                      col_count,
+                     last_in_row,
+                     col_pos,
+                     row_pos,
                      elements,
                      work_estimate);
 
     // Remove the pivot row
     remove_pivot_row(
       pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements, work_estimate);
-    remove_pivot_col(
-      pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements, work_estimate);
+    remove_pivot_col(pivot_i,
+                     pivot_j,
+                     first_in_col,
+                     first_in_row,
+                     max_in_row,
+                     last_in_row,
+                     elements,
+                     work_estimate);
 
     // Set pivot entry to sentinel value
     pivot_entry->i = -1;
@@ -1030,10 +1069,18 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
 
   const i_t Bnz =
     initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count, work_estimate);
+
+  // Position arrays for O(1) degree-bucket removal (col_count has m+1 buckets, row_count n+1)
+  std::vector<i_t> col_pos(n);  // if Cdegree[j] = nz, then j is in col_count[nz][col_pos[j]]
+  std::vector<i_t> row_pos(m);  // if Rdegree[i] = nz, then i is in row_count[nz][row_pos[i]]
+  initialize_bucket_positions(col_count, row_count, m, n, col_pos, row_pos);
+
   std::vector<element_t<i_t, f_t>> elements(Bnz);
   std::vector<i_t> first_in_row(m, kNone);
   std::vector<i_t> first_in_col(n, kNone);
-  load_elements(A, column_list, Bnz, elements, first_in_row, first_in_col, work_estimate);
+  std::vector<i_t> last_in_row(m, kNone);
+  load_elements(
+    A, column_list, Bnz, elements, first_in_row, first_in_col, last_in_row, work_estimate);
 
   std::vector<i_t> column_j_workspace(m, kNone);
   std::vector<i_t> row_last_workspace(m);
@@ -1100,9 +1147,9 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
 
     // Update Cdegree and col_count
     update_Cdegree_and_col_count<i_t, f_t>(
-      pivot_i, pivot_j, first_in_row, Cdegree, col_count, elements, work_estimate);
+      pivot_i, pivot_j, first_in_row, Cdegree, col_count, col_pos, elements, work_estimate);
     update_Rdegree_and_row_count<i_t, f_t>(
-      pivot_i, pivot_j, first_in_col, Rdegree, row_count, elements, work_estimate);
+      pivot_i, pivot_j, first_in_col, Rdegree, row_count, row_pos, elements, work_estimate);
 
     // A22 <- A22 - l u^T
     schur_complement<i_t, f_t>(pivot_i,
@@ -1121,14 +1168,23 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
                                Cdegree,
                                row_count,
                                col_count,
+                               last_in_row,
+                               col_pos,
+                               row_pos,
                                elements,
                                work_estimate);
 
     // Remove the pivot row
     remove_pivot_row<i_t, f_t>(
       pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements, work_estimate);
-    remove_pivot_col<i_t, f_t>(
-      pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements, work_estimate);
+    remove_pivot_col<i_t, f_t>(pivot_i,
+                               pivot_j,
+                               first_in_col,
+                               first_in_row,
+                               max_in_row,
+                               last_in_row,
+                               elements,
+                               work_estimate);
 
     // Set pivot entry to sentinel value
     pivot_entry->i = -1;
diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp
index eadd93040c..702699980f 100644
--- a/cpp/src/dual_simplex/simplex_solver_settings.hpp
+++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp
@@ -205,6 +205,9 @@ struct simplex_solver_settings_t {
   std::function<void(const std::vector<f_t>&, f_t)> node_processed_callback;
   std::function<void()> heuristic_preemption_callback;
   std::function<void(std::vector<f_t>&, std::vector<f_t>&, f_t)> set_simplex_solution_callback;
+  // Called by B&B when first LP solution is available (PDLP/Barrier or dual simplex).
+  std::function<void(root_relaxation_first_solution_t<i_t, f_t> const&)>
+    on_first_lp_solution_available;
   std::function<void(f_t)> dual_simplex_objective_callback;  // Called with current dual obj
   mutable logger_t log;
   std::atomic<int>* concurrent_halt;  // if nullptr ignored, if !nullptr, 0 if solver should
diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp
index d300d6011c..d5525891b6 100644
--- a/cpp/src/dual_simplex/solve.cpp
+++ b/cpp/src/dual_simplex/solve.cpp
@@ -706,7 +706,7 @@ i_t solve(const user_problem_t<i_t, f_t>& problem,
 {
   i_t status;
   if (is_mip(problem) && !settings.relaxation) {
-    branch_and_bound_t branch_and_bound(problem, settings, tic());
+    branch_and_bound_t<i_t, f_t> branch_and_bound(problem, settings, tic());
     mip_solution_t<i_t, f_t> mip_solution(problem.num_cols);
     mip_status_t mip_status = branch_and_bound.solve(mip_solution);
     if (mip_status == mip_status_t::OPTIMAL) {
@@ -745,7 +745,7 @@ i_t solve_mip_with_guess(const user_problem_t<i_t, f_t>& problem,
 {
   i_t status;
   if (is_mip(problem)) {
-    branch_and_bound_t branch_and_bound(problem, settings, tic());
+    branch_and_bound_t<i_t, f_t> branch_and_bound(problem, settings, tic());
     branch_and_bound.set_initial_guess(guess);
     mip_status_t mip_status = branch_and_bound.solve(solution);
     if (mip_status == mip_status_t::OPTIMAL) {
diff --git a/cpp/src/dual_simplex/types.hpp b/cpp/src/dual_simplex/types.hpp
index ea46a1f67e..6660a86f0a 100644
--- a/cpp/src/dual_simplex/types.hpp
+++ b/cpp/src/dual_simplex/types.hpp
@@ -9,6 +9,7 @@
 
 #include <cstdint>
 #include <limits>
+#include <vector>
 
 namespace cuopt::linear_programming::dual_simplex {
 
@@ -19,6 +20,18 @@ using float64_t = double;
 
 constexpr float64_t inf = std::numeric_limits<float64_t>::infinity();
 
+// First LP solution from either PDLP/Barrier or dual simplex; used to notify diversity manager
+// without B&B depending on PDLP types.
+template <typename i_t, typename f_t>
+struct root_relaxation_first_solution_t {
+  std::vector<f_t> primal;
+  std::vector<f_t> dual;
+  std::vector<f_t> reduced_costs;
+  f_t objective{0};
+  f_t user_objective{0};
+  i_t iterations{0};
+};
+
 // We return this constant to signal that a concurrent halt has occurred
 #define CONCURRENT_HALT_RETURN -2
 // We return this constant to signal that a time limit has occurred
diff --git a/cpp/src/mip_heuristics/CMakeLists.txt b/cpp/src/mip_heuristics/CMakeLists.txt
index a200d4265b..202ed94bd5 100644
--- a/cpp/src/mip_heuristics/CMakeLists.txt
+++ b/cpp/src/mip_heuristics/CMakeLists.txt
@@ -41,7 +41,8 @@ set(MIP_NON_LP_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/presolve/conflict_graph/clique_table.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump_kernels.cu
-  ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/fj_cpu.cu)
+  ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/fj_cpu.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/root_lp.cu)
 
 # Choose which files to include based on build mode
 if(BUILD_LP_ONLY)
diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
index 0ded8337d8..c659e9788b 100644
--- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu
+++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
@@ -417,7 +417,16 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
   bool bb_thread_solution_exists = simplex_solution_exists.load();
   if (bb_thread_solution_exists) {
     ls.lp_optimal_exists = true;
+  } else if (branch_and_bound_ptr != nullptr &&
+             branch_and_bound_ptr->enable_concurrent_lp_root_solve()) {
+    // B&B drives root relaxation; wait for first solution (PDLP/Barrier or dual simplex)
+    first_solution_ready_.store(false, std::memory_order_release);
+    std::unique_lock<std::mutex> lock(first_solution_mutex_);
+    first_solution_cv_.wait(lock, [this]() { return first_solution_ready_.load(); });
+    lock.unlock();
+    clamp_within_var_bounds(lp_optimal_solution, problem_ptr, problem_ptr->handle_ptr);
   } else if (!fj_only_run) {
+    // Heuristics-only or non-concurrent: diversity manager runs LP solve
     convert_greater_to_less(*problem_ptr);
 
     f_t tolerance_divisor =
@@ -489,38 +498,6 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
       // to bring variables within the bounds
     }
 
-    // Send PDLP relaxed solution to branch and bound
-    if (problem_ptr->set_root_relaxation_solution_callback != nullptr) {
-      auto& d_primal_solution = lp_result.get_primal_solution();
-      auto& d_dual_solution   = lp_result.get_dual_solution();
-      auto& d_reduced_costs   = lp_result.get_reduced_cost();
-
-      std::vector<f_t> host_primal(d_primal_solution.size());
-      std::vector<f_t> host_dual(d_dual_solution.size());
-      std::vector<f_t> host_reduced_costs(d_reduced_costs.size());
-      raft::copy(host_primal.data(),
-                 d_primal_solution.data(),
-                 d_primal_solution.size(),
-                 problem_ptr->handle_ptr->get_stream());
-      raft::copy(host_dual.data(),
-                 d_dual_solution.data(),
-                 d_dual_solution.size(),
-                 problem_ptr->handle_ptr->get_stream());
-      raft::copy(host_reduced_costs.data(),
-                 d_reduced_costs.data(),
-                 d_reduced_costs.size(),
-                 problem_ptr->handle_ptr->get_stream());
-      problem_ptr->handle_ptr->sync_stream();
-
-      // PDLP returns user-space objective (it applies objective_scaling_factor internally)
-      auto user_obj   = lp_result.get_objective_value();
-      auto solver_obj = problem_ptr->get_solver_obj_from_user_obj(user_obj);
-      auto iterations = lp_result.get_additional_termination_information().number_of_steps_taken;
-      // Set for the B&B (param4 expects solver space, param5 expects user space)
-      problem_ptr->set_root_relaxation_solution_callback(
-        host_primal, host_dual, host_reduced_costs, solver_obj, user_obj, iterations);
-    }
-
     // in case the pdlp returned var boudns that are out of bounds
     clamp_within_var_bounds(lp_optimal_solution, problem_ptr, problem_ptr->handle_ptr);
   }
@@ -859,6 +836,35 @@ std::pair<solution_t<i_t, f_t>, bool> diversity_manager_t<i_t, f_t>::recombine(
   return std::make_pair(solution_t<i_t, f_t>(a), false);
 }
 
+template <typename i_t, typename f_t>
+void diversity_manager_t<i_t, f_t>::on_first_lp_solution(
+  cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t> const& result)
+{
+  {
+    std::lock_guard<std::mutex> lock(relaxed_solution_mutex);
+    cuopt_assert(result.primal.size() == lp_optimal_solution.size(),
+                 "First LP solution primal size mismatch");
+    cuopt_assert(result.dual.size() == lp_dual_optimal_solution.size(),
+                 "First LP solution dual size mismatch");
+    raft::copy(lp_optimal_solution.data(),
+               result.primal.data(),
+               result.primal.size(),
+               problem_ptr->handle_ptr->get_stream());
+    raft::copy(lp_dual_optimal_solution.data(),
+               result.dual.data(),
+               result.dual.size(),
+               problem_ptr->handle_ptr->get_stream());
+    problem_ptr->handle_ptr->sync_stream();
+    ls.lp_optimal_exists = true;
+    set_new_user_bound(result.user_objective);
+  }
+  {
+    std::lock_guard<std::mutex> lock(first_solution_mutex_);
+    first_solution_ready_.store(true, std::memory_order_release);
+    first_solution_cv_.notify_all();
+  }
+}
+
 template <typename i_t, typename f_t>
 void diversity_manager_t<i_t, f_t>::set_simplex_solution(const std::vector<f_t>& solution,
                                                          const std::vector<f_t>& dual_solution,
diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
index 4f86192db8..fed937a88b 100644
--- a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
+++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
@@ -21,12 +21,16 @@
 #include <cuopt/linear_programming/mip/solver_settings.hpp>
 #include <cuopt/linear_programming/mip/solver_stats.hpp>
 
+#include <dual_simplex/types.hpp>
+
 #include <mip_heuristics/diversity/lns/rins.cuh>
 #include <mip_heuristics/local_search/local_search.cuh>
 #include <mip_heuristics/solution/solution.cuh>
 #include <mip_heuristics/solver.cuh>
 #include <utilities/timer.hpp>
 
+#include <condition_variable>
+#include <mutex>
 #include <cstdint>
 
 namespace cuopt::linear_programming::detail {
@@ -71,6 +75,12 @@ class diversity_manager_t {
   void set_simplex_solution(const std::vector<f_t>& solution,
                             const std::vector<f_t>& dual_solution,
                             f_t objective);
+
+  // Called by B&B when first LP solution is available (PDLP/Barrier or dual simplex).
+  void on_first_lp_solution(
+    cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t> const&
+      result);
+
   mip_solver_context_t<i_t, f_t>& context;
   dual_simplex::branch_and_bound_t<i_t, f_t>* branch_and_bound_ptr;
   problem_t<i_t, f_t>* problem_ptr;
@@ -98,6 +108,11 @@ class diversity_manager_t {
   // atomic for signalling pdlp to stop
   std::atomic<int> global_concurrent_halt{0};
 
+  // First solution from B&B: wait for B&B to call on_first_lp_solution when run_bb and concurrent
+  std::mutex first_solution_mutex_;
+  std::condition_variable first_solution_cv_;
+  std::atomic<bool> first_solution_ready_{false};
+
   rins_t<i_t, f_t> rins;
 
   bool run_only_ls_recombiner{false};
diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu
index d7d7601014..fcee9ecc89 100644
--- a/cpp/src/mip_heuristics/diversity/lns/rins.cu
+++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu
@@ -248,12 +248,9 @@ void rins_t<i_t, f_t>::run_rins()
 
   // run sub-mip
   namespace dual_simplex = cuopt::linear_programming::dual_simplex;
-  dual_simplex::user_problem_t<i_t, f_t> branch_and_bound_problem(&rins_handle);
   dual_simplex::simplex_solver_settings_t<i_t, f_t> branch_and_bound_settings;
   dual_simplex::mip_solution_t<i_t, f_t> branch_and_bound_solution(1);
   dual_simplex::mip_status_t branch_and_bound_status = dual_simplex::mip_status_t::UNSET;
-  fixed_problem.get_host_user_problem(branch_and_bound_problem);
-  branch_and_bound_solution.resize(branch_and_bound_problem.num_cols);
   // Fill in the settings for branch and bound
   branch_and_bound_settings.time_limit = time_limit;
   // branch_and_bound_settings.node_limit = 5000 + node_count / 100;  // try harder as time goes
@@ -274,8 +271,13 @@ void rins_t<i_t, f_t>::run_rins()
                                                                        f_t objective) {
     rins_solution_queue.push_back(solution);
   };
+  dual_simplex::user_problem_t<i_t, f_t> bb_user_problem(fixed_problem.handle_ptr);
+  fixed_problem.recompute_objective_integrality();
+  bb_user_problem.objective_is_integral = fixed_problem.is_objective_integral();
+  fixed_problem.get_host_user_problem(bb_user_problem);
   dual_simplex::branch_and_bound_t<i_t, f_t> branch_and_bound(
-    branch_and_bound_problem, branch_and_bound_settings, dual_simplex::tic());
+    bb_user_problem, branch_and_bound_settings, dual_simplex::tic(), &fixed_problem, 1);
+  branch_and_bound_solution.resize(branch_and_bound.get_num_cols());
   branch_and_bound.set_initial_guess(cuopt::host_copy(fixed_assignment, rins_handle.get_stream()));
   branch_and_bound_status = branch_and_bound.solve(branch_and_bound_solution);
 
diff --git a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh
index a867141d0a..e55355ecba 100644
--- a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh
+++ b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh
@@ -95,10 +95,7 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
     if (run_sub_mip) {
       // run sub-mip
       namespace dual_simplex = cuopt::linear_programming::dual_simplex;
-      dual_simplex::user_problem_t<i_t, f_t> branch_and_bound_problem(offspring.handle_ptr);
       dual_simplex::simplex_solver_settings_t<i_t, f_t> branch_and_bound_settings;
-      fixed_problem.get_host_user_problem(branch_and_bound_problem);
-      branch_and_bound_solution.resize(branch_and_bound_problem.num_cols);
       // Fill in the settings for branch and bound
       branch_and_bound_settings.time_limit = sub_mip_recombiner_config_t::sub_mip_time_limit;
       branch_and_bound_settings.print_presolve_stats = false;
@@ -117,8 +114,13 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
 
       // disable B&B logs, so that it is not interfering with the main B&B thread
       branch_and_bound_settings.log.log = false;
+      dual_simplex::user_problem_t<i_t, f_t> bb_user_problem(fixed_problem.handle_ptr);
+      fixed_problem.recompute_objective_integrality();
+      bb_user_problem.objective_is_integral = fixed_problem.is_objective_integral();
+      fixed_problem.get_host_user_problem(bb_user_problem);
       dual_simplex::branch_and_bound_t<i_t, f_t> branch_and_bound(
-        branch_and_bound_problem, branch_and_bound_settings, dual_simplex::tic());
+        bb_user_problem, branch_and_bound_settings, dual_simplex::tic(), &fixed_problem, 1);
+      branch_and_bound_solution.resize(branch_and_bound.get_num_cols());
       branch_and_bound_status = branch_and_bound.solve(branch_and_bound_solution);
       if (solution_vector.size() > 0) {
         cuopt_assert(fixed_assignment.size() == branch_and_bound_solution.x.size(),
diff --git a/cpp/src/mip_heuristics/problem/problem.cu b/cpp/src/mip_heuristics/problem/problem.cu
index 90d80f5948..32aeef695a 100644
--- a/cpp/src/mip_heuristics/problem/problem.cu
+++ b/cpp/src/mip_heuristics/problem/problem.cu
@@ -149,8 +149,7 @@ problem_t<i_t, f_t>::problem_t(
     Q_values(problem_.get_quadratic_objective_values())
 {
   op_problem_cstr_body(problem_);
-  branch_and_bound_callback             = nullptr;
-  set_root_relaxation_solution_callback = nullptr;
+  branch_and_bound_callback = nullptr;
 }
 
 template <typename i_t, typename f_t>
@@ -162,7 +161,6 @@ problem_t<i_t, f_t>::problem_t(const problem_t<i_t, f_t>& problem_)
     integer_fixed_problem(problem_.integer_fixed_problem),
     integer_fixed_variable_map(problem_.integer_fixed_variable_map, handle_ptr->get_stream()),
     branch_and_bound_callback(nullptr),
-    set_root_relaxation_solution_callback(nullptr),
     n_variables(problem_.n_variables),
     n_constraints(problem_.n_constraints),
     n_binary_vars(problem_.n_binary_vars),
@@ -219,7 +217,6 @@ problem_t<i_t, f_t>::problem_t(const problem_t<i_t, f_t>& problem_,
     integer_fixed_problem(problem_.integer_fixed_problem),
     integer_fixed_variable_map(problem_.integer_fixed_variable_map, handle_ptr->get_stream()),
     branch_and_bound_callback(nullptr),
-    set_root_relaxation_solution_callback(nullptr),
     n_variables(problem_.n_variables),
     n_constraints(problem_.n_constraints),
     n_binary_vars(problem_.n_binary_vars),
diff --git a/cpp/src/mip_heuristics/problem/problem.cuh b/cpp/src/mip_heuristics/problem/problem.cuh
index 9771bab568..130c97526e 100644
--- a/cpp/src/mip_heuristics/problem/problem.cuh
+++ b/cpp/src/mip_heuristics/problem/problem.cuh
@@ -242,9 +242,6 @@ class problem_t {
   rmm::device_uvector<i_t> integer_fixed_variable_map;
 
   std::function<void(const std::vector<f_t>&)> branch_and_bound_callback;
-  std::function<void(
-    const std::vector<f_t>&, const std::vector<f_t>&, const std::vector<f_t>&, f_t, f_t, i_t)>
-    set_root_relaxation_solution_callback;
 
   typename mip_solver_settings_t<i_t, f_t>::tolerances_t tolerances{};
   i_t n_variables{0};
diff --git a/cpp/src/mip_heuristics/root_lp.cu b/cpp/src/mip_heuristics/root_lp.cu
new file mode 100644
index 0000000000..b181db43cd
--- /dev/null
+++ b/cpp/src/mip_heuristics/root_lp.cu
@@ -0,0 +1,219 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#include <mip_heuristics/problem/problem.cuh>
+#include <mip_heuristics/problem/problem_helpers.cuh>
+#include "root_lp.cuh"
+
+#include <pdlp/pdlp.cuh>
+#include <pdlp/solve.cuh>
+
+#include <dual_simplex/crossover.hpp>
+#include <dual_simplex/presolve.hpp>
+#include <dual_simplex/types.hpp>
+#include <raft/core/copy.hpp>
+#include <utilities/timer.hpp>
+
+namespace cuopt::linear_programming::detail {
+
+namespace {
+template <typename i_t, typename f_t>
+cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>
+copy_lp_result_to_root_solution(problem_t<i_t, f_t>* problem,
+                                const optimization_problem_solution_t<i_t, f_t>& lp_result)
+{
+  cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t> result;
+  auto stream = problem->handle_ptr->get_stream();
+  result.primal.resize(lp_result.get_primal_solution().size());
+  result.dual.resize(lp_result.get_dual_solution().size());
+  result.reduced_costs.resize(lp_result.get_reduced_cost().size());
+  raft::copy(
+    result.primal.data(), lp_result.get_primal_solution().data(), result.primal.size(), stream);
+  raft::copy(result.dual.data(), lp_result.get_dual_solution().data(), result.dual.size(), stream);
+  raft::copy(result.reduced_costs.data(),
+             lp_result.get_reduced_cost().data(),
+             result.reduced_costs.size(),
+             stream);
+  problem->handle_ptr->sync_stream();
+  result.objective      = problem->get_solver_obj_from_user_obj(lp_result.get_objective_value());
+  result.user_objective = lp_result.get_objective_value();
+  result.iterations     = lp_result.get_additional_termination_information().number_of_steps_taken;
+  return result;
+}
+
+}  // namespace
+
+template <typename i_t, typename f_t>
+cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>
+run_solver_for_root_lp(problem_t<i_t, f_t>* problem,
+                       f_t time_limit,
+                       std::atomic<int>* concurrent_halt,
+                       i_t num_gpus,
+                       method_t method)
+{
+  convert_greater_to_less(*problem);
+  f_t tolerance_divisor =
+    problem->tolerances.absolute_tolerance /
+    (problem->tolerances.relative_tolerance > 0 ? problem->tolerances.relative_tolerance : 1);
+  pdlp_solver_settings_t<i_t, f_t> pdlp_settings{};
+  pdlp_settings.tolerances.relative_primal_tolerance =
+    problem->tolerances.absolute_tolerance / tolerance_divisor;
+  pdlp_settings.tolerances.relative_dual_tolerance =
+    problem->tolerances.absolute_tolerance / tolerance_divisor;
+  pdlp_settings.time_limit            = time_limit;
+  pdlp_settings.first_primal_feasible = false;
+  pdlp_settings.concurrent_halt       = concurrent_halt;
+  pdlp_settings.halt_set_by_caller    = true;  // B&B sets halt only after crossover
+  pdlp_settings.method                = method;
+  pdlp_settings.inside_mip            = true;
+  pdlp_settings.num_gpus              = num_gpus;
+  pdlp_settings.presolver             = presolver_t::None;
+  pdlp_settings.crossover             = false;  // B&B does crush + crossover for both paths
+  if (method == method_t::PDLP) { pdlp_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable2; }
+
+  timer_t lp_timer(time_limit);
+  auto lp_result = solve_lp_with_method<i_t, f_t>(*problem, pdlp_settings, lp_timer);
+  return copy_lp_result_to_root_solution(problem, lp_result);
+}
+
+template <typename i_t, typename f_t>
+cuopt::linear_programming::dual_simplex::crossover_status_t run_crush_crossover_and_maybe_win(
+  const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>& result,
+  const cuopt::linear_programming::dual_simplex::user_problem_t<i_t, f_t>& original_problem,
+  const cuopt::linear_programming::dual_simplex::lp_problem_t<i_t, f_t>& original_lp,
+  const std::vector<i_t>& new_slacks,
+  const cuopt::linear_programming::dual_simplex::simplex_solver_settings_t<i_t, f_t>&
+    crossover_settings,
+  f_t start_time,
+  std::atomic<int>* concurrent_halt,
+  std::function<void()> set_halter,
+  std::function<void(
+    const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>&)>
+    on_first_lp_solution,
+  std::mutex* first_solver_mutex,
+  bool* first_solver_callback_done,
+  std::mutex* first_result_mutex,
+  std::atomic<int>* winner,
+  int winner_id,
+  cuopt::linear_programming::dual_simplex::crossover_status_t* first_crossover_status_out,
+  cuopt::linear_programming::dual_simplex::lp_solution_t<i_t, f_t>* winner_crossover_soln,
+  std::vector<cuopt::linear_programming::dual_simplex::variable_status_t>* winner_crossover_vstatus,
+  f_t* winner_root_objective,
+  const char* this_solver_name,
+  std::string* winner_solver_name_out)
+{
+  using namespace cuopt::linear_programming::dual_simplex;
+  if (on_first_lp_solution) {
+    std::lock_guard<std::mutex> lock(*first_solver_mutex);
+    if (!*first_solver_callback_done) {
+      *first_solver_callback_done = true;
+      on_first_lp_solution(result);
+    }
+  }
+  lp_solution_t<i_t, f_t> soln(original_lp.num_rows, original_lp.num_cols);
+  soln.x              = result.primal;
+  soln.y              = result.dual;
+  soln.z              = result.reduced_costs;
+  soln.objective      = result.objective;
+  soln.user_objective = result.user_objective;
+  soln.iterations     = result.iterations;
+  std::vector<f_t> crushed_x;
+  crush_primal_solution(original_problem, original_lp, soln.x, new_slacks, crushed_x);
+  std::vector<f_t> crushed_y;
+  std::vector<f_t> crushed_z;
+  (void)crush_dual_solution(
+    original_problem, original_lp, new_slacks, soln.y, soln.z, crushed_y, crushed_z);
+  soln.x = std::move(crushed_x);
+  soln.y = std::move(crushed_y);
+  soln.z = std::move(crushed_z);
+  lp_solution_t<i_t, f_t> crossover_out(original_lp.num_rows, original_lp.num_cols);
+  std::vector<variable_status_t> vstatus_out(original_lp.num_cols);
+  auto root_crossover_settings = crossover_settings;
+  root_crossover_settings.inside_mip =
+    1;  // root LP crossover; dual_phase2 uses this to set concurrent_halt
+  root_crossover_settings.log.log         = false;
+  root_crossover_settings.concurrent_halt = concurrent_halt;
+  crossover_status_t status =
+    crossover(original_lp, root_crossover_settings, soln, start_time, crossover_out, vstatus_out);
+  {
+    std::lock_guard<std::mutex> lock(*first_result_mutex);
+    int expected = 0;
+    if (status == crossover_status_t::OPTIMAL &&
+        winner->compare_exchange_strong(expected, winner_id, std::memory_order_acq_rel)) {
+      *first_crossover_status_out = status;
+      if (winner_solver_name_out) { *winner_solver_name_out = this_solver_name; }
+      winner_crossover_soln->x              = std::move(crossover_out.x);
+      winner_crossover_soln->y              = std::move(crossover_out.y);
+      winner_crossover_soln->z              = std::move(crossover_out.z);
+      winner_crossover_soln->objective      = result.objective;
+      winner_crossover_soln->user_objective = result.user_objective;
+      winner_crossover_soln->iterations     = result.iterations;
+      *winner_root_objective                = result.objective;
+      *winner_crossover_vstatus             = std::move(vstatus_out);
+      set_halter();
+    } else {
+      if (winner->load(std::memory_order_acquire) != 0) { status = *first_crossover_status_out; }
+    }
+  }
+  return status;
+}
+
+template cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, double>
+run_solver_for_root_lp<int, double>(
+  problem_t<int, double>*, double, std::atomic<int>*, int, method_t);
+template cuopt::linear_programming::dual_simplex::crossover_status_t
+run_crush_crossover_and_maybe_win<int, double>(
+  const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, double>&,
+  const cuopt::linear_programming::dual_simplex::user_problem_t<int, double>&,
+  const cuopt::linear_programming::dual_simplex::lp_problem_t<int, double>&,
+  const std::vector<int>&,
+  const cuopt::linear_programming::dual_simplex::simplex_solver_settings_t<int, double>&,
+  double,
+  std::atomic<int>*,
+  std::function<void()>,
+  std::function<void(
+    const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, double>&)>,
+  std::mutex*,
+  bool*,
+  std::mutex*,
+  std::atomic<int>*,
+  int,
+  cuopt::linear_programming::dual_simplex::crossover_status_t*,
+  cuopt::linear_programming::dual_simplex::lp_solution_t<int, double>*,
+  std::vector<cuopt::linear_programming::dual_simplex::variable_status_t>*,
+  double*,
+  const char*,
+  std::string*);
+
+#ifdef MIP_INSTANTIATION_FLOAT
+template cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, float>
+run_solver_for_root_lp<int, float>(problem_t<int, float>*, float, std::atomic<int>*, int, method_t);
+template cuopt::linear_programming::dual_simplex::crossover_status_t
+run_crush_crossover_and_maybe_win<int, float>(
+  const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, float>&,
+  const cuopt::linear_programming::dual_simplex::user_problem_t<int, float>&,
+  const cuopt::linear_programming::dual_simplex::lp_problem_t<int, float>&,
+  const std::vector<int>&,
+  const cuopt::linear_programming::dual_simplex::simplex_solver_settings_t<int, float>&,
+  float,
+  std::atomic<int>*,
+  std::function<void()>,
+  std::function<void(
+    const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, float>&)>,
+  std::mutex*,
+  bool*,
+  std::mutex*,
+  std::atomic<int>*,
+  int,
+  cuopt::linear_programming::dual_simplex::crossover_status_t*,
+  cuopt::linear_programming::dual_simplex::lp_solution_t<int, float>*,
+  std::vector<cuopt::linear_programming::dual_simplex::variable_status_t>*,
+  float*,
+  const char*,
+  std::string*);
+#endif
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/root_lp.cuh b/cpp/src/mip_heuristics/root_lp.cuh
new file mode 100644
index 0000000000..2f87884fe9
--- /dev/null
+++ b/cpp/src/mip_heuristics/root_lp.cuh
@@ -0,0 +1,66 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <cuopt/linear_programming/pdlp/solver_settings.hpp>
+#include <dual_simplex/crossover.hpp>
+#include <dual_simplex/types.hpp>
+
+#include <atomic>
+#include <cstdint>
+#include <functional>
+#include <mutex>
+#include <string>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+class problem_t;
+
+/** Run PDLP or Barrier for root LP. Uses concurrent_halt to stop; does not set it. Crossover done
+ * by caller. */
+template <typename i_t, typename f_t>
+cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>
+run_solver_for_root_lp(problem_t<i_t, f_t>* problem,
+                       f_t time_limit,
+                       std::atomic<int>* concurrent_halt,
+                       i_t num_gpus,
+                       method_t method);
+
+/**
+ * Run crush + crossover on a root LP solution and optionally store as winner (first to finish).
+ * Used by B&B when running PDLP and Barrier concurrently; both paths call this after their solver
+ * returns.
+ */
+template <typename i_t, typename f_t>
+cuopt::linear_programming::dual_simplex::crossover_status_t run_crush_crossover_and_maybe_win(
+  const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>& result,
+  const cuopt::linear_programming::dual_simplex::user_problem_t<i_t, f_t>& original_problem,
+  const cuopt::linear_programming::dual_simplex::lp_problem_t<i_t, f_t>& original_lp,
+  const std::vector<i_t>& new_slacks,
+  const cuopt::linear_programming::dual_simplex::simplex_solver_settings_t<i_t, f_t>&
+    crossover_settings,
+  f_t start_time,
+  std::atomic<int>* concurrent_halt,
+  std::function<void()> set_halter,
+  std::function<void(
+    const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>&)>
+    on_first_lp_solution,
+  std::mutex* first_solver_mutex,
+  bool* first_solver_callback_done,
+  std::mutex* first_result_mutex,
+  std::atomic<int>* winner,
+  int winner_id,
+  cuopt::linear_programming::dual_simplex::crossover_status_t* first_crossover_status_out,
+  cuopt::linear_programming::dual_simplex::lp_solution_t<i_t, f_t>* winner_crossover_soln,
+  std::vector<cuopt::linear_programming::dual_simplex::variable_status_t>* winner_crossover_vstatus,
+  f_t* winner_root_objective,
+  const char* this_solver_name,
+  std::string* winner_solver_name_out);
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu
index f25c093afb..e426365841 100644
--- a/cpp/src/mip_heuristics/solver.cu
+++ b/cpp/src/mip_heuristics/solver.cu
@@ -180,13 +180,11 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 
   namespace dual_simplex = cuopt::linear_programming::dual_simplex;
   std::future<dual_simplex::mip_status_t> branch_and_bound_status_future;
-  dual_simplex::user_problem_t<i_t, f_t> branch_and_bound_problem(context.problem_ptr->handle_ptr);
   context.problem_ptr->recompute_objective_integrality();
   if (context.problem_ptr->is_objective_integral()) {
     CUOPT_LOG_INFO("Objective function is integral, scale %g",
                    context.problem_ptr->presolve_data.objective_scaling_factor);
   }
-  branch_and_bound_problem.objective_is_integral = context.problem_ptr->is_objective_integral();
   dual_simplex::simplex_solver_settings_t<i_t, f_t> branch_and_bound_settings;
   std::unique_ptr<dual_simplex::branch_and_bound_t<i_t, f_t>> branch_and_bound;
   branch_and_bound_solution_helper_t solution_helper(&dm, branch_and_bound_settings);
@@ -194,11 +192,6 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 
   bool run_bb = !context.settings.heuristics_only;
   if (run_bb) {
-    // Convert the presolved problem to dual_simplex::user_problem_t
-    op_problem_.get_host_user_problem(branch_and_bound_problem);
-    // Resize the solution now that we know the number of columns/variables
-    branch_and_bound_solution.resize(branch_and_bound_problem.num_cols);
-
     // Fill in the settings for branch and bound
     branch_and_bound_settings.time_limit           = timer_.get_time_limit();
     branch_and_bound_settings.node_limit           = context.settings.node_limit;
@@ -262,20 +255,30 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
                   &solution_helper,
                   std::placeholders::_1,
                   std::placeholders::_2);
+
+      branch_and_bound_settings.on_first_lp_solution_available =
+        [&dm](dual_simplex::root_relaxation_first_solution_t<i_t, f_t> const& result) {
+          dm.on_first_lp_solution(result);
+        };
     }
 
-    // Create the branch and bound object
-    branch_and_bound = std::make_unique<dual_simplex::branch_and_bound_t<i_t, f_t>>(
-      branch_and_bound_problem,
-      branch_and_bound_settings,
-      timer_.get_tic_start(),
-      context.problem_ptr->clique_table);
+    dual_simplex::user_problem_t<i_t, f_t> bb_user_problem(context.problem_ptr->handle_ptr);
+    context.problem_ptr->recompute_objective_integrality();
+    bb_user_problem.objective_is_integral = context.problem_ptr->is_objective_integral();
+    context.problem_ptr->get_host_user_problem(bb_user_problem);
+    branch_and_bound =
+      std::make_unique<dual_simplex::branch_and_bound_t<i_t, f_t>>(bb_user_problem,
+                                                                   branch_and_bound_settings,
+                                                                   timer_.get_tic_start(),
+                                                                   context.problem_ptr,
+                                                                   context.settings.num_gpus,
+                                                                  context.problem_ptr->clique_table);
+    branch_and_bound_solution.resize(branch_and_bound->get_num_cols());
     context.branch_and_bound_ptr = branch_and_bound.get();
     auto* stats_ptr              = &context.stats;
     branch_and_bound->set_user_bound_callback(
       [stats_ptr](f_t user_bound) { stats_ptr->set_solution_bound(user_bound); });
 
-    // Set the primal heuristics -> branch and bound callback
     if (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC) {
       branch_and_bound->set_concurrent_lp_root_solve(true);
 
@@ -295,16 +298,6 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     context.work_unit_scheduler_.register_context(branch_and_bound->get_work_unit_context());
     // context.work_unit_scheduler_.verbose = true;
 
-    context.problem_ptr->set_root_relaxation_solution_callback =
-      std::bind(&dual_simplex::branch_and_bound_t<i_t, f_t>::set_root_relaxation_solution,
-                branch_and_bound.get(),
-                std::placeholders::_1,
-                std::placeholders::_2,
-                std::placeholders::_3,
-                std::placeholders::_4,
-                std::placeholders::_5,
-                std::placeholders::_6);
-
     if (timer_.check_time_limit()) {
       CUOPT_LOG_INFO("Time limit reached during B&B setup");
       solution_t<i_t, f_t> sol(*context.problem_ptr);
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 82e79098a7..9424240a08 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -499,9 +499,9 @@ std::optional<optimization_problem_solution_t<i_t, f_t>> pdlp_solver_t<i_t, f_t>
                                              pdlp_termination_status_t::IterationLimit));
   }
 
-  // Check for concurrent limit
-  if (settings_.method == method_t::Concurrent && settings_.concurrent_halt != nullptr &&
-      *settings_.concurrent_halt == 1) {
+  // Check for concurrent limit (whenever caller provides a halt flag, e.g. B&B racing PDLP vs
+  // Barrier)
+  if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
 #ifdef PDLP_VERBOSE_MODE
     RAFT_CUDA_TRY(cudaDeviceSynchronize());
     std::cout << "Concurrent Limit reached, returning current solution" << std::endl;
diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index 2fc9ec08d5..3c3ce1e0eb 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -463,9 +463,10 @@ run_barrier(dual_simplex::user_problem_t<i_t, f_t>& user_problem,
   CUOPT_LOG_CONDITIONAL_INFO(
     !settings.inside_mip, "Barrier finished in %.2f seconds", timer.elapsed_time());
 
-  if (settings.concurrent_halt != nullptr && (status == dual_simplex::lp_status_t::OPTIMAL ||
-                                              status == dual_simplex::lp_status_t::UNBOUNDED ||
-                                              status == dual_simplex::lp_status_t::INFEASIBLE)) {
+  if (!settings.halt_set_by_caller && settings.concurrent_halt != nullptr &&
+      (status == dual_simplex::lp_status_t::OPTIMAL ||
+       status == dual_simplex::lp_status_t::UNBOUNDED ||
+       status == dual_simplex::lp_status_t::INFEASIBLE)) {
     // We finished. Tell PDLP to stop if it is still running.
     *settings.concurrent_halt = 1;
   }
@@ -535,9 +536,10 @@ run_dual_simplex(dual_simplex::user_problem_t<i_t, f_t>& user_problem,
   CUOPT_LOG_CONDITIONAL_INFO(
     !settings.inside_mip, "Dual simplex finished in %.2f seconds", timer.elapsed_time());
 
-  if (settings.concurrent_halt != nullptr && (status == dual_simplex::lp_status_t::OPTIMAL ||
-                                              status == dual_simplex::lp_status_t::UNBOUNDED ||
-                                              status == dual_simplex::lp_status_t::INFEASIBLE)) {
+  if (!settings.halt_set_by_caller && settings.concurrent_halt != nullptr &&
+      (status == dual_simplex::lp_status_t::OPTIMAL ||
+       status == dual_simplex::lp_status_t::UNBOUNDED ||
+       status == dual_simplex::lp_status_t::INFEASIBLE)) {
     // We finished. Tell PDLP to stop if it is still running.
     *settings.concurrent_halt = 1;
   }
@@ -828,7 +830,7 @@ optimization_problem_solution_t<i_t, f_t> run_pdlp(detail::problem_t<i_t, f_t>&
       CUOPT_LOG_CONDITIONAL_INFO(
         !settings.inside_mip, "Crossover status %s", sol.get_termination_status_string().c_str());
     }
-    if (settings.method == method_t::Concurrent && settings.concurrent_halt != nullptr &&
+    if (!settings.halt_set_by_caller && settings.method == method_t::Concurrent && settings.concurrent_halt != nullptr &&
         crossover_info == 0 && sol.get_termination_status() == pdlp_termination_status_t::Optimal) {
       // We finished. Tell dual simplex to stop if it is still running.
       CUOPT_LOG_CONDITIONAL_INFO(!settings.inside_mip, "PDLP finished. Telling others to stop");
diff --git a/cpp/src/pdlp/solver_solution.cu b/cpp/src/pdlp/solver_solution.cu
index 10e6a80593..87ad02e020 100644
--- a/cpp/src/pdlp/solver_solution.cu
+++ b/cpp/src/pdlp/solver_solution.cu
@@ -372,6 +372,12 @@ rmm::device_uvector<f_t>& optimization_problem_solution_t<i_t, f_t>::get_reduced
   return reduced_cost_;
 }
 
+template <typename i_t, typename f_t>
+const rmm::device_uvector<f_t>& optimization_problem_solution_t<i_t, f_t>::get_reduced_cost() const
+{
+  return reduced_cost_;
+}
+
 template <typename i_t, typename f_t>
 pdlp_termination_status_t optimization_problem_solution_t<i_t, f_t>::get_termination_status(
   i_t id) const
diff --git a/docs/cuopt/source/cuopt-python/routing/routing-example.ipynb b/docs/cuopt/source/cuopt-python/routing/routing-example.ipynb
index 9df5e2c0c7..9cfc05f9bb 100644
--- a/docs/cuopt/source/cuopt-python/routing/routing-example.ipynb
+++ b/docs/cuopt/source/cuopt-python/routing/routing-example.ipynb
@@ -147,7 +147,7 @@
    "metadata": {},
    "source": [
     "#### Compressed Sparse Row (CSR) representation of above weighted waypoint graph.\n",
-    "For details on the CSR encoding of the above graph see the [cost_matrix_and_waypoint_graph_creation.ipynb](https://github.com/NVIDIA/cuopt-examples/blob/release/26.04/intra-factory_transport/cost_matrix_and_waypoint_graph_creation.ipynb) notebook."
+    "For details on the CSR encoding of the above graph see the [cost_matrix_and_waypoint_graph_creation.ipynb](https://github.com/NVIDIA/cuopt-examples/blob/main/intra-factory_transport/cost_matrix_and_waypoint_graph_creation.ipynb) notebook."
    ]
   },
   {
diff --git a/docs/cuopt/source/faq.rst b/docs/cuopt/source/faq.rst
index 1985052531..0c3a0e219f 100644
--- a/docs/cuopt/source/faq.rst
+++ b/docs/cuopt/source/faq.rst
@@ -283,7 +283,7 @@ Routing FAQ
 
     So in either case, task locations are actually integer indices into another structure.
 
-    If you have (lat, long) values, then you can generate a cost matrix using a map API. cuOpt does not directly connect to a third-party map engine, but that can be done outside of cuOpt as shown `here <https://github.com/NVIDIA/cuOpt-Resources/blob/release/26.04/notebooks/routing/service/cost_matrix_creation.ipynb>`__.
+    If you have (lat, long) values, then you can generate a cost matrix using a map API. cuOpt does not directly connect to a third-party map engine, but that can be done outside of cuOpt as shown `here <https://github.com/NVIDIA/cuOpt-Resources/blob/main/notebooks/routing/service/cost_matrix_creation.ipynb>`__.
 
 .. dropdown:: Is it possible to define constraints such as refrigerated vehicles required for certain orders?