From 67278741616b022ce48d56f5e4c194adf44b7c9a Mon Sep 17 00:00:00 2001 From: Nathan Flurry Date: Mon, 27 Apr 2026 20:57:49 -0700 Subject: [PATCH] fix(engine-tests): parallelize multi-DC startup + enable 6 multi-DC tests --- engine/packages/engine/tests/common/ctx.rs | 15 +++++----- .../engine/tests/common/test_helpers.rs | 22 +++++++++++++++ .../tests/envoy/api_actors_get_or_create.rs | 11 +++++--- .../engine/tests/envoy/api_actors_list.rs | 23 +++++++++++---- .../tests/envoy/api_actors_list_names.rs | 28 +++++++++++++++---- 5 files changed, 77 insertions(+), 22 deletions(-) diff --git a/engine/packages/engine/tests/common/ctx.rs b/engine/packages/engine/tests/common/ctx.rs index 59b237cab6..3693bd79b0 100644 --- a/engine/packages/engine/tests/common/ctx.rs +++ b/engine/packages/engine/tests/common/ctx.rs @@ -84,17 +84,18 @@ impl TestCtx { .await? .into_iter(); - // Setup all datacenters - let mut dcs = Vec::new(); - for test_deps in test_deps_list { - let dc = Self::setup_instance( + // Setup all datacenters in parallel so each DC's epoxy/peer endpoints can reach the + // others without hitting a startup race (sequential setup would let DC1's epoxy try to + // contact DC2 before DC2's API server is listening, which puts DC1 into a long backoff + // loop). 
+ let setup_futures = test_deps_list.map(|test_deps| { + Self::setup_instance( test_deps, opts.pegboard_outbound, opts.auth_admin_token.clone(), ) - .await?; - dcs.push(dc); - } + }); + let mut dcs: Vec<_> = futures_util::future::try_join_all(setup_futures).await?; dcs.sort_by_key(|dc| dc.config.dc_label()); Ok(Self { dcs, opts }) diff --git a/engine/packages/engine/tests/common/test_helpers.rs b/engine/packages/engine/tests/common/test_helpers.rs index bd9698f90f..45afdf3173 100644 --- a/engine/packages/engine/tests/common/test_helpers.rs +++ b/engine/packages/engine/tests/common/test_helpers.rs @@ -90,6 +90,28 @@ pub async fn setup_test_namespace_with_envoy_for_names( (namespace_name, namespace_id, envoy) } +/// Set up an additional envoy serving the given namespace on a non-leader DC. +/// +/// Use when a test needs envoys on multiple DCs (e.g., cross-DC creation tests). Pass the +/// actor names the envoy should advertise via `prepopulate_actor_names`; each is registered +/// with an `EchoActor` behavior. 
+pub async fn setup_envoy_on_dc( + dc: &super::TestDatacenter, + namespace: &str, + actor_names: Vec<String>, +) -> super::test_envoy::TestEnvoy { + setup_envoy(dc, namespace, move |mut builder| { + builder = builder.with_pool_name(super::TEST_RUNNER_NAME); + for name in actor_names { + builder = builder.with_actor_behavior(&name, |_config| { + Box::new(super::test_envoy::EchoActor::new()) + }); + } + builder + }) + .await +} + pub async fn cleanup_test_namespace(namespace_id: rivet_util::Id, _guard_port: u16) { // TODO: implement namespace deletion when available tracing::info!(?namespace_id, "namespace cleanup (not implemented)"); diff --git a/engine/packages/engine/tests/envoy/api_actors_get_or_create.rs b/engine/packages/engine/tests/envoy/api_actors_get_or_create.rs index fab3f4b973..a8b031e9ea 100644 --- a/engine/packages/engine/tests/envoy/api_actors_get_or_create.rs +++ b/engine/packages/engine/tests/envoy/api_actors_get_or_create.rs @@ -358,10 +358,8 @@ fn get_or_create_returns_winner_on_race() { }); } -// Broken legacy Pegboard Runner multi-DC coverage: full `runner::` sweep times -// out with `test timed out: Elapsed(())`. #[test] -#[ignore = "cross-DC get_or_create not idempotent"] +#[ignore = "cross-DC reserve_key race: concurrent same-key requests from different DCs produce two distinct actors instead of converging"] fn get_or_create_race_condition_across_datacenters() { common::run(common::TestOpts::new(2).with_timeout(45), |ctx| async move { const DC2_RUNNER_NAME: &'static str = "dc-2-runner"; @@ -485,11 +483,16 @@ fn get_or_create_in_current_datacenter() { // Broken legacy Pegboard Runner multi-DC coverage: remote get-or-create returns // `core.internal_error` with `target_replicas must include the local replica`. 
#[test] -#[ignore = "broken legacy Pegboard Runner test: target_replicas must include the local replica"] fn get_or_create_in_remote_datacenter() { common::run(common::TestOpts::new(2).with_timeout(45), |ctx| async move { let (namespace, _, _runner) = common::setup_test_namespace_with_envoy(ctx.leader_dc()).await; + let _runner_dc2 = common::setup_envoy_on_dc( + ctx.get_dc(2), + &namespace, + vec!["remote-dc-actor".to_string()], + ) + .await; // Request from DC1 but specify DC2 let response = common::api::public::actors_get_or_create( diff --git a/engine/packages/engine/tests/envoy/api_actors_list.rs b/engine/packages/engine/tests/envoy/api_actors_list.rs index ebdbef41ab..f9764198e5 100644 --- a/engine/packages/engine/tests/envoy/api_actors_list.rs +++ b/engine/packages/engine/tests/envoy/api_actors_list.rs @@ -545,13 +545,16 @@ fn list_specific_actors_by_ids() { } #[test] -// Broken legacy Pegboard Runner test: full engine sweep can fail creating the -// DC2 actor with `actor.destroyed_during_creation`. -#[ignore = "DC2 actor create hangs / workflow-worker lease failure"] fn list_actors_from_multiple_datacenters() { common::run(common::TestOpts::new(2).with_timeout(45), |ctx| async move { let (namespace, _, _runner) = common::setup_test_namespace_with_envoy(ctx.leader_dc()).await; + let _runner_dc2 = common::setup_envoy_on_dc( + ctx.get_dc(2), + &namespace, + vec!["multi-dc-actor".to_string()], + ) + .await; // Create actors in different DCs let res1 = common::api::public::actors_create( @@ -794,11 +797,16 @@ fn verify_sorting_by_create_ts_descending() { // Broken legacy Pegboard Runner multi-DC coverage: full `runner::` sweep times // out with `test timed out: Elapsed(())`. 
#[test] -#[ignore = "DC2 actor create hangs / workflow-worker lease failure"] fn list_aggregates_results_from_all_datacenters() { common::run(common::TestOpts::new(2).with_timeout(45), |ctx| async move { let (namespace, _, _runner) = common::setup_test_namespace_with_envoy(ctx.leader_dc()).await; + let _runner_dc2 = common::setup_envoy_on_dc( + ctx.get_dc(2), + &namespace, + vec!["fanout-test-actor".to_string()], + ) + .await; let name = "fanout-test-actor"; @@ -1537,11 +1545,16 @@ fn list_invalid_cursor_format() { // Broken legacy Pegboard Runner multi-DC coverage: full `runner::` sweep times // out with `test timed out: Elapsed(())`. #[test] -#[ignore = "DC2 actor create hangs / workflow-worker lease failure"] fn list_cursor_across_datacenters() { common::run(common::TestOpts::new(2).with_timeout(45), |ctx| async move { let (namespace, _, _runner) = common::setup_test_namespace_with_envoy(ctx.leader_dc()).await; + let _runner_dc2 = common::setup_envoy_on_dc( + ctx.get_dc(2), + &namespace, + vec!["multi-dc-cursor-test".to_string()], + ) + .await; let name = "multi-dc-cursor-test"; diff --git a/engine/packages/engine/tests/envoy/api_actors_list_names.rs b/engine/packages/engine/tests/envoy/api_actors_list_names.rs index 906b425988..9c03711852 100644 --- a/engine/packages/engine/tests/envoy/api_actors_list_names.rs +++ b/engine/packages/engine/tests/envoy/api_actors_list_names.rs @@ -208,11 +208,19 @@ fn list_names_with_non_existent_namespace() { // Broken legacy Pegboard Runner multi-DC coverage: full engine sweep returns // `actor.destroyed_during_creation` while creating the DC2 actor. 
#[test] -#[ignore = "DC2 actor create hangs / workflow-worker lease failure"] fn list_names_fanout_to_all_datacenters() { common::run(common::TestOpts::new(2).with_timeout(45), |ctx| async move { - let (namespace, _, _runner) = - common::setup_test_namespace_with_envoy(ctx.leader_dc()).await; + let (namespace, _, _runner) = common::setup_test_namespace_with_envoy_for_names( + ctx.leader_dc(), + vec!["dc1-actor".to_string()], + ) + .await; + let _runner_dc2 = common::setup_envoy_on_dc( + ctx.get_dc(2), + &namespace, + vec!["dc2-actor".to_string()], + ) + .await; // Create actors with different names in different DCs common::api::public::actors_create( @@ -277,11 +285,19 @@ fn list_names_fanout_to_all_datacenters() { #[test] // Broken legacy Pegboard Runner test: full engine sweep timed out in // `list_names_deduplication_across_datacenters`. -#[ignore = "DC2 actor create hangs / workflow-worker lease failure"] fn list_names_deduplication_across_datacenters() { common::run(common::TestOpts::new(2).with_timeout(45), |ctx| async move { - let (namespace, _, _runner) = - common::setup_test_namespace_with_envoy(ctx.leader_dc()).await; + let (namespace, _, _runner) = common::setup_test_namespace_with_envoy_for_names( + ctx.leader_dc(), + vec!["shared-name-actor".to_string()], + ) + .await; + let _runner_dc2 = common::setup_envoy_on_dc( + ctx.get_dc(2), + &namespace, + vec!["shared-name-actor".to_string()], + ) + .await; // Create actors with same name in different DCs let shared_name = "shared-name-actor";