From d9aff3d55a09e0e46e1b4db5c3d4c0ca4570b8af Mon Sep 17 00:00:00 2001 From: Tim Cassell Date: Sun, 31 May 2026 22:23:39 -0400 Subject: [PATCH 1/9] Add JitListener for deterministic JIT stage. Co-Authored-By: Claude Opus 4.8 --- .../Code/DeclarationsProvider.cs | 33 +++ src/BenchmarkDotNet/Engines/Engine.cs | 2 + src/BenchmarkDotNet/Engines/EngineJitStage.cs | 187 +++++++++++++-- .../Engines/EngineParameters.cs | 13 + src/BenchmarkDotNet/Engines/JitListener.cs | 222 ++++++++++++++++++ src/BenchmarkDotNet/Portability/JitInfo.cs | 27 ++- .../Templates/BenchmarkType.txt | 11 + .../Emitters/AsyncStateMachineEmitter.cs | 20 +- .../InProcess/Emit/InProcessEmitRunner.cs | 1 + .../InProcess/NoEmit/InProcessNoEmitRunner.cs | 1 + .../BenchmarkDotNet.IntegrationTests.csproj | 2 - .../NaiveRunnableEmitDiff.cs | 10 +- .../JitListenerTests.cs | 195 +++++++++++++++ .../Engine/EnumerateStagesTests.cs | 8 +- .../Shared/Mocks/MockEngine.cs | 2 + 15 files changed, 689 insertions(+), 45 deletions(-) create mode 100644 src/BenchmarkDotNet/Engines/JitListener.cs create mode 100644 tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs diff --git a/src/BenchmarkDotNet/Code/DeclarationsProvider.cs b/src/BenchmarkDotNet/Code/DeclarationsProvider.cs index ba52e0ac4f..7a4ddc3e80 100644 --- a/src/BenchmarkDotNet/Code/DeclarationsProvider.cs +++ b/src/BenchmarkDotNet/Code/DeclarationsProvider.cs @@ -33,6 +33,9 @@ public SmartStringBuilder ReplaceTemplate(SmartStringBuilder smartStringBuilder) return ReplaceCore(smartStringBuilder) .Replace("$DisassemblerEntryMethodImpl$", GetWorkloadMethodCall(GetPassArgumentsDirect())) .Replace("$OperationsPerInvoke$", Descriptor.OperationsPerInvoke.ToString()) + .Replace("$WorkloadMethodResolve$", GetWorkloadMethodResolve()) + .Replace("$WorkloadMethodReturnTypeModifiers$", GetWorkloadMethodReturnTypeModifiers()) + .Replace("$WorkloadMethodReturnType$", GetWorkloadMethodReturnTypeName()) .Replace("$WorkloadTypeName$", 
Descriptor.Type.GetCorrectCSharpTypeName()); } @@ -89,6 +92,36 @@ private static string GetMethodPrefix(MethodInfo method) protected string GetWorkloadMethodCall(string passArguments) => $"{GetMethodPrefix(Descriptor.WorkloadMethod)}.{Descriptor.WorkloadMethod.Name}({passArguments});"; + // Resolve the benchmark MethodInfo at runtime so the jit stage can watch its tier-up via JIT events. + // Method-group conversion to the generated __WorkloadMethodDelegate, then read its .Method: the compiler + // binds the correct overload and verifies the signature at build time, so it's overload- and inheritance-safe + // without rendering parameter type lists. We deliberately avoid resolving by metadata token — the benchmark + // assembly is built separately and conditional compilation (e.g. #if) can shift token RIDs. + private string GetWorkloadMethodResolve() + { + var method = Descriptor.WorkloadMethod; + return $"((__WorkloadMethodDelegate){GetMethodPrefix(method)}.{method.Name}).Method"; + } + + // The delegate return type, split into modifiers ("", "ref", "ref readonly") and the (non-byref) type name, + // because the benchmark method's return ref-kind must match the delegate's for the method-group conversion. + private string GetWorkloadMethodReturnTypeName() + { + var returnType = Descriptor.WorkloadMethod.ReturnType; + return (returnType.IsByRef ? returnType.GetElementType()! : returnType).GetCorrectCSharpTypeName(); + } + + private string GetWorkloadMethodReturnTypeModifiers() + { + var method = Descriptor.WorkloadMethod; + if (!method.ReturnType.IsByRef) + return string.Empty; + // ref readonly returns carry an InAttribute required modifier on the return; plain ref returns don't. + bool isReadOnly = method.ReturnParameter.GetRequiredCustomModifiers() + .Any(modifier => modifier.FullName == "System.Runtime.InteropServices.InAttribute"); + return isReadOnly ? 
"ref readonly" : "ref"; + } + protected string GetPassArgumentsDirect() => string.Join( ", ", diff --git a/src/BenchmarkDotNet/Engines/Engine.cs b/src/BenchmarkDotNet/Engines/Engine.cs index 489060068c..292620d6cf 100644 --- a/src/BenchmarkDotNet/Engines/Engine.cs +++ b/src/BenchmarkDotNet/Engines/Engine.cs @@ -34,6 +34,8 @@ internal Engine(EngineParameters engineParameters) var job = engineParameters.TargetJob ?? throw new ArgumentNullException(nameof(EngineParameters.TargetJob)); Parameters = new() { + WorkloadMethod = engineParameters.WorkloadMethod ?? throw new ArgumentNullException(nameof(EngineParameters.WorkloadMethod)), + EnableJitListener = engineParameters.EnableJitListener, WorkloadActionNoUnroll = engineParameters.WorkloadActionNoUnroll ?? throw new ArgumentNullException(nameof(EngineParameters.WorkloadActionNoUnroll)), WorkloadActionUnroll = engineParameters.WorkloadActionUnroll ?? throw new ArgumentNullException(nameof(EngineParameters.WorkloadActionUnroll)), OverheadActionNoUnroll = engineParameters.OverheadActionNoUnroll ?? throw new ArgumentNullException(nameof(EngineParameters.OverheadActionNoUnroll)), diff --git a/src/BenchmarkDotNet/Engines/EngineJitStage.cs b/src/BenchmarkDotNet/Engines/EngineJitStage.cs index 1e61e8315e..cb6c2a8bee 100644 --- a/src/BenchmarkDotNet/Engines/EngineJitStage.cs +++ b/src/BenchmarkDotNet/Engines/EngineJitStage.cs @@ -1,3 +1,5 @@ +using System.Reflection; +using System.Runtime.CompilerServices; using BenchmarkDotNet.Jobs; using BenchmarkDotNet.Portability; using BenchmarkDotNet.Reports; @@ -11,8 +13,10 @@ namespace BenchmarkDotNet.Engines; // the following stages (Pilot and Warmup) will likely take it the rest of the way. Long-running benchmarks may never fully reach tier1. internal sealed class EngineJitStage : EngineStage { - // Jit call counting delay is only for when the app starts up. We don't need to wait for every benchmark if multiple benchmarks are ran in-process. 
- private static TimeSpan s_tieredDelay = JitInfo.TieredDelay; + // After a tier's single burst fails to tier-up, we nudge one invocation at a time and wait out the async + // event-delivery lag (~10ms) after each before nudging again — so we stop the instant the next tier's + // MethodLoadVerbose publication confirms the tier-up instead of overshooting by re-bursting the whole budget. + private static readonly TimeSpan EventDeliveryLag = TimeSpan.FromMilliseconds(10); internal bool didStopEarly = false; internal Measurement lastMeasurement; @@ -31,7 +35,9 @@ internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters) : ba private int GetMaxMeasurementCount() { int count = JitInfo.IsTiered - ? JitInfo.MaxTierPromotions * JitInfo.TieredCallCountThreshold + 2 + // Per tier: one full burst plus up to a threshold of single-call nudges (×2 covers the worst case of a + // user-pinned InvocationCount of 1, where the burst is also split into single-call iterations). + ? JitInfo.MaxTierPromotions * JitInfo.TieredCallCountThreshold * 2 + 2 : 1; if (evaluateOverhead) { @@ -44,7 +50,7 @@ internal override bool GetShouldRunIteration(List measurements, out { if (measurements.Count > 0) { - var measurement = measurements[measurements.Count - 1]; + var measurement = measurements[^1]; if (measurement.IterationMode == IterationMode.Workload) { lastMeasurement = measurement; @@ -62,6 +68,14 @@ internal override bool GetShouldRunIteration(List measurements, out private IEnumerator EnumerateIterations() { + // Watch for the method's background tier-up via JIT events so we can proceed as soon as each tier is + // published instead of waiting a fixed delay. Null when watching is disabled, EventSource is disabled, + // or the method is not eligible for tiered compilation, in which case we fall back to the fixed delay. + // Created BEFORE the first invoke so it observes the method's very first (tier0) jit and the surrounding + // TieredCompilation events from the start. 
+ using JitListener? listener = JitListener.Create(parameters.WorkloadMethod, parameters.EnableJitListener); + bool useListener = listener != null; + // If the user pinned InvocationCount (e.g. via [IterationSetup]/[IterationCleanup] which implies RunOncePerIteration), // honor it so IterationSetup/Cleanup runs around each invocation. #3102 bool hasUserInvocationCount = parameters.TargetJob.HasValue(RunMode.InvocationCountCharacteristic); @@ -81,16 +95,41 @@ private IEnumerator EnumerateIterations() yield break; } - // Wait enough time for jit call counting to begin. - Engine.SleepIfPositive(s_tieredDelay); - // Don't make the next jit stage wait if it's ran in the same process. - s_tieredDelay = TimeSpan.Zero; + if (JitInfo.TieredDelay > TimeSpan.Zero) + { + if (useListener) + { + bool waitForTieringActive = true; + if (!listener!.WaitForTieringActivePrimed(JitInfo.TieredDelay + TimeSpan.FromMilliseconds(50), parameters.Host.CancellationToken)) + { + // If we observed no tier0 JIT (of any method) and no TieredCompilationResume/Pause event within the + // timeout, tiering is quiet in the process — which likely means the watched method was pre-warmed to at + // least tier0 and the listener was possibly created after its tiered compilation was resumed. In that + // case, we force a tier0 JIT which forces a TieredCompilationPause event, which guarantees a followup + // TieredCompilationResume that we can wait on deterministically. + if (!TryForceTier0Jit()) + { + // We couldn't establish that the call-counting delay is (or will be) active, and can't force one. + // Stop trusting the listener's tiering gate for the rest of the stage and fall back to the fixed delay. 
+ waitForTieringActive = false; + useListener = false; + Thread.Sleep(JitInfo.TieredDelay); + } + } + if (waitForTieringActive) + { + listener!.WaitForTieringActive(parameters.Host.CancellationToken); + } + } + else + { + Thread.Sleep(JitInfo.TieredDelay); + } + } - // If the first iteration suggests a long-running benchmark (a single invocation already - // takes ~2/3 of IterationTime or more), run one confirmation iteration and bail out if - // it agrees. Same cutoff value that pilot stage uses. - // We do not bail out immediately if the first iteration is long-running because it could - // be due to cctors or other lazy initialization that won't be hit in steady-state. #2004 + // Long-running early-exit: if a single invocation already takes ~2/3 of IterationTime, this is a long-running + // benchmark — bail and let the Pilot/Warmup stages finish tiering. The first invoke can be inflated by JIT or + // cctors, so confirm with one more iteration before bailing (it could be a one-time cost). #2004 // JitTieringMode.Force opts out of this heuristic and always promotes through every tier. TimeInterval iterationTime = parameters.TargetJob.ResolveValue(RunMode.IterationTimeCharacteristic, parameters.Resolver); long remainingCalls = JitInfo.TieredCallCountThreshold; @@ -104,12 +143,20 @@ private IEnumerator EnumerateIterations() didStopEarly = true; yield break; } - remainingCalls -= userInvokeCount; } // Promote methods to tier1. for (int remainingTiers = JitInfo.MaxTierPromotions; remainingTiers > 0; --remainingTiers) { + // Run ONE full burst of this tier's call budget, gated so it's counted rather than wasted into a + // deferred window. The next tier's publication (a non-tier0 MethodLoadVerbose) is the trustworthy + // "the count reached the threshold and the next tier compiled" signal — the persistent per-tier counter + // means calls accumulate, so if the burst doesn't tier up we nudge the rest one at a time below rather + // than re-bursting the whole budget. 
+ if (useListener) + { + listener!.WaitForTieringActive(parameters.Host.CancellationToken); + } while (remainingCalls > 0) { // Run the whole tier's call budget in a single iteration unless the user pinned InvocationCount. @@ -120,7 +167,68 @@ private IEnumerator EnumerateIterations() yield return GetWorkloadIterationData(invokeCount); } - Engine.SleepIfPositive(JitInfo.BackgroundCompilationDelay); + if (useListener) + { + // Background compilation can take an indeterminate amount of time. Ideally we would wait for the MethodJittingStarted event, + // but it doesn't carry tier information, so we can't skip it for the async tier0 events (if we try there is a race condition). + // The only thing we can do safely is wait for the compilation to complete with a sensible timeout via the MethodLoadVerbose event that carries the tier info. + bool tieredUp = listener!.WaitForPublication(JitInfo.BackgroundCompilationDelay, parameters.Host.CancellationToken); + if (!tieredUp) + { + // Unlikely, but technically possible. The call-counting delay could be active, + // but we don't receive the event for it for 10ms, so the initial burst ran some invocations + // that didn't count. In that case it's most likely that most of the invocations did count, so + // we only need a few more to nudge it over the threshold. Re-bursting the whole budget would overshoot + // wastefully (up to threshold * call-time), so nudge one invocation at a time, waiting out the + // async event-delivery lag after each so we stop the instant the tier-up is confirmed. Gate + // first so the stub is live (handles a fully-deferred burst). + // - or - + // The method could have been pre-warmed to tier1 before the stage started (e.g. via InProcess toolchains), + // which would also hit this case. In that case we will waste time with unnecessary calls, + // but it's impossible for us to detect that scenario with the available JIT APIs. 
+ listener!.WaitForTieringActive(parameters.Host.CancellationToken); + long nudgeCalls = hasUserInvocationCount ? userInvokeCount : 1; + for (long nudged = 0; nudged < JitInfo.TieredCallCountThreshold && !tieredUp; nudged += nudgeCalls) + { + ++iterationIndex; + yield return GetWorkloadIterationData(nudgeCalls); + tieredUp = listener!.WaitForPublication(EventDeliveryLag, parameters.Host.CancellationToken); + } + // If a whole threshold of nudges went without a confirmed tier-up, it is most likely the case that the + // method was already pre-warmed to tier1. Wait it out once more just in case, then fall back to the fixed delay. + // We don't bail out here because it's possible the benchmark will call other methods via different control flow + // (e.g. InProcess toolchain with arguments/params). + if (!tieredUp && !listener!.WaitForPublication(JitInfo.BackgroundCompilationDelay, parameters.Host.CancellationToken)) + { + useListener = false; + // We already invoked and waited 2 stages worth, subtract 1 tier and continue the loop here to not waste an extra stage of unnecessary invocations. + remainingCalls = JitInfo.TieredCallCountThreshold; + --remainingTiers; + continue; + } + } + + // A new tier was published; the tier it carried tells us whether the method reached tier1. + if (listener!.ReachedTier1) + { + if (!JitInfo.IsOSRDuplicated) + { + break; + } + // We need to handle the case where the benchmark method reached tier1 but calls another method that is OSR'd. + // The listener only tracks the benchmark method and an unknown callee can't be watched, so + // stop consulting it and let the loop run one more promotion iteration on the fixed delay + // to give such a callee time. MaxTierPromotions already budgets +1 for OSR, so cap the loop + // to one further iteration: Min(remainingTiers, 2) yields Min(remainingTiers - 1, 1) more + // passes once the for-loop's --remainingTiers is applied. 
+ useListener = false; + remainingTiers = Math.Min(remainingTiers, 2); + } + } + else + { + Engine.SleepIfPositive(JitInfo.BackgroundCompilationDelay); + } remainingCalls = JitInfo.TieredCallCountThreshold; } @@ -130,6 +238,55 @@ private IEnumerator EnumerateIterations() yield return GetWorkloadIterationData(userInvokeCount); } + // Throwaway type used only as a generic argument: nesting it (Wrapper>) makes a never-before-seen + // closed type on demand, so each ForceTier0JitTarget instantiation is a distinct MethodDesc that JITs fresh. + private struct Wrapper { } + + // A real (non-trivial, non-inlined) generic method so each value-type instantiation gets its own tier0 JIT — and + // thus runs HandleCallCountingForFirstCall, starting a call-counting delay we can wait on. + [MethodImpl(MethodImplOptions.NoInlining)] + private static long ForceTier0JitTarget(long x) => x * default(T)!.GetHashCode(); + + private static readonly MethodInfo ForceTargetMethod = + typeof(EngineJitStage).GetMethod(nameof(ForceTier0JitTarget), BindingFlags.NonPublic | BindingFlags.Static)!; + private static Type s_nextForceType = typeof(int); + private static readonly object[] BoxedZero = [0L]; + // This (engine) assembly: a forced JIT of ForceTier0JitTarget below would never start a call-counting delay when + // optimizations are disabled here, so we skip forcing in that case. + private static readonly bool OptimizationsDisabled = JitListener.AreOptimizationsDisabledFor(typeof(EngineJitStage)); + + // Manufacture a tier0 JIT to start a call-counting delay, for the rare already-tiered method whose own delay + // elapsed before we were listening. Returns false if it couldn't run — the caller then falls back to a fixed sleep + // instead of waiting for a Resume that would never come. + private bool TryForceTier0Jit() + { + // In a DisableOptimizations build this assembly's methods are never tier-eligible, so forcing a tier0 JIT here + // is a silent no-op that starts no delay. 
Bail rather than commit the caller to an unbounded wait for a Resume. + if (OptimizationsDisabled) + { + return false; + } + try + { + Type forceType; + while (true) + { + forceType = Volatile.Read(ref s_nextForceType); + if (Interlocked.CompareExchange(ref s_nextForceType, typeof(Wrapper<>).MakeGenericType(forceType), forceType) == forceType) + { + break; + } + } + ForceTargetMethod.MakeGenericMethod(forceType).Invoke(null, BoxedZero); + return true; + } + catch (Exception e) + { + parameters.Host.SendError(e.ToString()); + return false; + } + } + private IterationData GetOverheadIterationData(long invokeCount) => new(IterationMode.Overhead, IterationStage.Jitting, iterationIndex, invokeCount, 1, () => new(), () => new(), parameters.OverheadActionNoUnroll); diff --git a/src/BenchmarkDotNet/Engines/EngineParameters.cs b/src/BenchmarkDotNet/Engines/EngineParameters.cs index 7d7dab4b21..7d4d2f7bb4 100644 --- a/src/BenchmarkDotNet/Engines/EngineParameters.cs +++ b/src/BenchmarkDotNet/Engines/EngineParameters.cs @@ -1,3 +1,4 @@ +using System.Reflection; using BenchmarkDotNet.Characteristics; using BenchmarkDotNet.Jobs; using BenchmarkDotNet.Running; @@ -16,6 +17,18 @@ public class EngineParameters public required Func> OverheadActionNoUnroll { get; set; } public required Func> OverheadActionUnroll { get; set; } public Job TargetJob { get; set; } = Job.Default; + + /// + /// The benchmark method, used by the jit stage to watch for its tier-up via JIT events. + /// + public required MethodInfo WorkloadMethod { get; set; } + + /// + /// Whether the jit stage may watch JIT tier-up events. Disabled by the stage-enumeration unit + /// tests, which drive the stage with mock (non-executing) workloads that never raise events. 
+ /// + internal bool EnableJitListener { get; set; } = true; + public long OperationsPerInvoke { get; set; } = 1; public required Func GlobalSetupAction { get; set; } public required Func GlobalCleanupAction { get; set; } diff --git a/src/BenchmarkDotNet/Engines/JitListener.cs b/src/BenchmarkDotNet/Engines/JitListener.cs new file mode 100644 index 0000000000..6a2ee5249f --- /dev/null +++ b/src/BenchmarkDotNet/Engines/JitListener.cs @@ -0,0 +1,222 @@ +using System.Diagnostics.Tracing; +using System.Reflection; +using BenchmarkDotNet.Portability; + +namespace BenchmarkDotNet.Engines; + +// Observes background JIT tier-up of a single (benchmark) method by listening to the runtime's JIT events +// in-process, so the jit stage can proceed as soon as the next tier is actually reached instead of waiting a +// fixed delay. The runtime only announces transitions (there is no API to poll a method's current tier), so we +// must be listening while they happen. +// +// The events it watches, and their roles: +// * MethodLoadVerbose (per-method, JIT keyword) reports each tier publication and carries the tier. A burst that +// reaches the call-count threshold triggers the next tier's compile, which publishes a (non-tier0) load — so the +// first such publication after a burst is the AUTHORITATIVE "the burst tiered up" signal, and the tier it carries +// tells us when the method reached tier1. The stage keeps invoking until it sees one. (WaitForPublication / +// ReachedTier1.) We deliberately do NOT use MethodJittingStarted (compile-began): it carries no tier, so the +// tier0 compile's start is indistinguishable from a tier-up's and would race the tier0 publish that filters it. 
+// * TieredCompilationPause/Resume (the call-counting delay bracket, Compilation keyword) gate the bursts: a burst +// issued while the delay is active isn't counted (the counting stub is deferred), so the stage waits until the +// delay is observed inactive — a Resume, when the stubs are installed — before bursting (WaitForTieringActive), and +// up front checks whether any method's tier0 JIT or an already-active delay is underway +// (WaitForTieringActivePrimed) so it can force one if not. These only avoid wasting bursts — correctness +// comes from the publication. +// +// This is intentionally a per-stage listener: enabling the Jit keyword emits an event for every method jitted +// process-wide, which we must NOT pay during the measurement stages. It is created at the start of the jit stage +// and disposed at the end. +// +// Create returns null (and the caller falls back to the fixed delay) when EventSource is unavailable — it can be +// disabled via the System.Diagnostics.Tracing.EventSource.IsSupported feature switch — or the method isn't eligible +// for tiered compilation (its assembly has optimizations disabled, or it's pinned to a single optimization level). +internal sealed class JitListener : EventListener +{ + private const string RuntimeEventSourceName = "Microsoft-Windows-DotNETRuntime"; + private const EventKeywords JitKeyword = (EventKeywords)0x10; + // The "Compilation" keyword carries the TieredCompilation/Pause|Resume events that bracket the runtime's + // call-counting delay. Low volume (a handful per delay cycle), so enabling it adds no meaningful cost. + private const EventKeywords CompilationKeyword = (EventKeywords)0x1000000000; + private const string TieredCompilationResumeEvent = "TieredCompilationResume"; + private const string TieredCompilationPauseEvent = "TieredCompilationPause"; + // Event-name prefix (the runtime appends a version suffix, e.g. MethodLoadVerbose_V2). 
+ private const string MethodLoadVerbosePrefix = "MethodLoadVerbose"; + + // Optimization tier is packed into MethodFlags bits [7..9]: (MethodFlags >> 7) & 0x7. + // The initial tier0 quick compile is QuickJitted = 3, intermediate instrumented/OSR + // publications report other values (and just count as "a recompilation happened"), + // and the fully-optimized steady-state tier1 is OptimizedTier1 = 4. + private const int OptimizationTierShift = 7; + private const int OptimizationTierMask = 0x7; + private const int QuickJittedTier0 = 3; + private const int OptimizedTier1 = 4; + + private readonly int metadataToken; + private readonly string methodName; + private readonly ManualResetEventSlim publicationSignal = new(false); + private readonly ManualResetEventSlim tieringActiveSignal = new(false); + private readonly ManualResetEventSlim tieringActivePrimedSignal = new(false); + + private volatile bool reachedTier1; + private volatile bool canObserve; + + // Cached payload indices (field order is stable within a process for a given event version). + private int loadTokenIndex = -1; + private int loadFlagsIndex = -1; + private int loadNameIndex = -1; + + private JitListener(MethodInfo method) + { + // NOTE: the base EventListener ctor calls OnEventSourceCreated before these fields are set, + // but that callback only enables events / probes canObserve and never reads them. + metadataToken = method.MetadataToken; + methodName = method.Name; + } + + internal static JitListener? 
Create(MethodInfo method, bool enabled) + { + if (!enabled || !JitInfo.IsTiered || !IsTierable(method)) + { + return null; + } + var listener = new JitListener(method); + if (!listener.canObserve) + { + listener.Dispose(); + return null; + } + return listener; + } + + private static bool IsTierable(MethodInfo method) + => !AreOptimizationsDisabledFor(method) + && (method.MethodImplementationFlags & (MethodImplAttributes.NoOptimization | CodeGenHelper.AggressiveOptimizationOptionForEmit)) == 0; + + internal static bool AreOptimizationsDisabledFor(MemberInfo member) + => member.Module.Assembly.GetCustomAttribute()?.IsJITOptimizerDisabled ?? false; + + internal bool ReachedTier1 => reachedTier1; + + // Reports (within the timeout) whether the call-counting machinery is active in the process: a tier0 (QuickJitted) + // publication for ANY method, or a TieredCompilation Pause/Resume — any of which guarantees a Resume is coming to + // gate on. The stage checks this once before the tier loop: a false result means tiering is quiet and the watched + // method was likely pre-warmed past tier0, so no delay is coming on its own and one must be forced to get a Resume. + internal bool WaitForTieringActivePrimed(TimeSpan timeout, CancellationToken cancellationToken) + => tieringActivePrimedSignal.Wait(timeout, cancellationToken); + + // Waits until the call-counting delay is inactive (a TieredCompilationResume was observed). + internal void WaitForTieringActive(CancellationToken cancellationToken) + { + // No call-counting delay (e.g. AggressiveTiering) — counting is armed immediately, nothing to gate on. + if (JitInfo.TieredDelay > TimeSpan.Zero) + { + tieringActiveSignal.Wait(Timeout.InfiniteTimeSpan, cancellationToken); + } + } + + // Waits for a new tier publication (a non-tier0 MethodLoadVerbose for the method) — i.e. the latest burst drove + // the method to its next tier and the runtime published it. True if one arrived before the timeout; false otherwise. 
+ internal bool WaitForPublication(TimeSpan timeout, CancellationToken cancellationToken) + { + if (!publicationSignal.Wait(timeout, cancellationToken)) + { + return false; + } + // Reset for the next tier. We can't use AutoResetEvent because it doesn't support CancellationToken. + publicationSignal.Reset(); + return true; + } + + protected override void OnEventSourceCreated(EventSource source) + { + if (source.Name == RuntimeEventSourceName) + { + EnableEvents(source, EventLevel.Verbose, JitKeyword | CompilationKeyword); + // IsEnabled is true only when EventSource is supported AND the enable actually took effect + // at the level/keyword we need. + canObserve = source.IsEnabled(EventLevel.Verbose, JitKeyword | CompilationKeyword); + } + } + + protected override void OnEventWritten(EventWrittenEventArgs e) + { + if (!canObserve) + return; + string? name = e.EventName; + if (name is null) + return; + + // The runtime brackets the call-counting delay with these: Pause when a new tier0 method's first call + // (re)starts the delay, Resume when it elapses and the whole pending list of counting stubs is installed. + // tieringActiveSignal is the flip-flop the burst gate waits on (Set on Resume = delay inactive = stubs live, + // Reset on Pause); tieringActivePrimedSignal just records that some delay activity occurred (set by either). 
+ if (name == TieredCompilationResumeEvent) + { + tieringActiveSignal.Set(); + tieringActivePrimedSignal.Set(); + return; + } + if (name == TieredCompilationPauseEvent) + { + tieringActiveSignal.Reset(); + tieringActivePrimedSignal.Set(); + return; + } + + if (name.StartsWith(MethodLoadVerbosePrefix, StringComparison.Ordinal)) + { + HandleMethodLoad(e); + } + } + + private void HandleMethodLoad(EventWrittenEventArgs e) + { + var payloadNames = e.PayloadNames; + var payload = e.Payload; + if (payloadNames is null || payload is null) + return; + + if (loadTokenIndex < 0) + { + loadTokenIndex = payloadNames.IndexOf("MethodToken"); + loadFlagsIndex = payloadNames.IndexOf("MethodFlags"); + loadNameIndex = payloadNames.IndexOf("MethodName"); + if (loadTokenIndex < 0 || loadFlagsIndex < 0 || loadNameIndex < 0) + return; + } + + long tier = (Convert.ToInt64(payload[loadFlagsIndex]) >> OptimizationTierShift) & OptimizationTierMask; + + // A QuickJitted (tier0) publication — for ANY method, not just the one we watch — means an eligible method was + // just tier0-compiled and is about to run, so its first call will start or join the call-counting delay and a + // TieredCompilationResume is coming. That is exactly (and all) the up-front gate (WaitForTieringActivePrimed) + // needs: it only asks "is the tiering machinery active, so a Resume will arrive to gate on?", which is a + // process-wide question. (Pause/Resume prime it too; this also covers the brief window before the first call + // fires Pause.) The tier0 compile itself is the baseline, not a tier-up, so we never raise a publication for it. + if (tier == QuickJittedTier0) + { + tieringActivePrimedSignal.Set(); + return; + } + + // Everything below concerns OUR method reaching its next tier, so filter to it. 
+ if (Convert.ToInt32(payload[loadTokenIndex]) != metadataToken) + return; + if (payload[loadNameIndex] as string != methodName) + return; + + if (tier == OptimizedTier1) + reachedTier1 = true; + + publicationSignal.Set(); + } + + public override void Dispose() + { + // base.Dispose disables the events we enabled (when no other listener wants them). + base.Dispose(); + publicationSignal.Dispose(); + tieringActivePrimedSignal.Dispose(); + tieringActiveSignal.Dispose(); + } +} diff --git a/src/BenchmarkDotNet/Portability/JitInfo.cs b/src/BenchmarkDotNet/Portability/JitInfo.cs index 103dc6881a..6f3a4bcf48 100644 --- a/src/BenchmarkDotNet/Portability/JitInfo.cs +++ b/src/BenchmarkDotNet/Portability/JitInfo.cs @@ -70,6 +70,18 @@ private static bool IsDisabled(string envName, string knobName) // Disabled by default in netcoreapp2.X, check if it's enabled. : IsEnabled(EnvTieredCompilation, KnobTieredCompilation)); + // On-stack-replacement *shouldn't* interfere with promotion velocity, but there is a bug where OSR may cause a method to be tier0 instrumented twice. + // https://github.com/dotnet/runtime/issues/117787#issuecomment-3090771091 + public static readonly bool IsOSRDuplicated = + IsTiered + // Added experimentally in .Net 5. + && Environment.Version.Major >= 5 + && (Environment.Version.Major >= 7 + // Enabled by default in .Net 7, check if it's disabled. + ? !IsEnvVarDisabled(EnvOSR) + // Disabled by default in earlier versions, check if it's enabled. + : IsEnvVarEnabled(EnvOSR)); + /// /// The maximum numbers of jit tiers that a method may be promoted through. This is the maximum number of jit tiers - 1. /// @@ -88,10 +100,8 @@ private static int GetMaxTierPromotions() // Tier0 instrumented ++maxPromotions; } - if (GetIsOSR()) + if (IsOSRDuplicated) { - // On-stack-replacement *shouldn't* interfere with promotion velocity, but there is a bug where OSR may cause a method to be tier0 instrumented twice. 
- // https://github.com/dotnet/runtime/issues/117787#issuecomment-3090771091 ++maxPromotions; } return maxPromotions; @@ -106,15 +116,6 @@ static bool GetIsDPGO() => ? !IsDisabled(EnvPGO, KnobPGO) // Disabled by default in earlier versions, check if it's enabled. : IsEnabled(EnvPGO, KnobPGO)); - - static bool GetIsOSR() => - // Added experimentally in .Net 5. - Environment.Version.Major >= 5 - && (Environment.Version.Major >= 7 - // Enabled by default in .Net 7, check if it's disabled. - ? !IsEnvVarDisabled(EnvOSR) - // Disabled by default in earlier versions, check if it's enabled. - : IsEnvVarEnabled(EnvOSR)); } /// @@ -180,7 +181,7 @@ private static TimeSpan GetTieredDelay() /// public static readonly TimeSpan BackgroundCompilationDelay = IsTiered - // It's impossible for us to know exactly how long to wait without hooking into JIT notifications (which we can't do in-process). + // It's impossible for us to know exactly how long to wait without hooking into JIT notifications. // 100ms should be enough most of the time, but we bump it up to 250ms for higher confidence. // When https://github.com/dotnet/runtime/issues/101868 is resolved, if AggressiveTiering is enabled, we can skip the wait time and return TimeSpan.Zero. ? 
TimeSpan.FromMilliseconds(250) diff --git a/src/BenchmarkDotNet/Templates/BenchmarkType.txt b/src/BenchmarkDotNet/Templates/BenchmarkType.txt index affe8f2f40..3882f7efeb 100644 --- a/src/BenchmarkDotNet/Templates/BenchmarkType.txt +++ b/src/BenchmarkDotNet/Templates/BenchmarkType.txt @@ -39,6 +39,7 @@ global::BenchmarkDotNet.Engines.EngineParameters engineParameters = new global::BenchmarkDotNet.Engines.EngineParameters() { Host = host, + WorkloadMethod = instance.__ResolveWorkloadMethod(), WorkloadActionUnroll = instance.WorkloadActionUnroll, WorkloadActionNoUnroll = instance.WorkloadActionNoUnroll, OverheadActionNoUnroll = instance.OverheadActionNoUnroll, @@ -69,8 +70,18 @@ $CancellationTokenAssignment$ } + private unsafe delegate $WorkloadMethodReturnTypeModifiers$ $WorkloadMethodReturnType$ __WorkloadMethodDelegate($ArgumentsDefinition$); + $DeclareFieldsContainer$ + private global::System.Reflection.MethodInfo __ResolveWorkloadMethod() + { + unsafe + { + return $WorkloadMethodResolve$; + } + } + private $GlobalSetupModifiers$ global::System.Threading.Tasks.ValueTask __GlobalSetup() { $GlobalSetupImpl$ diff --git a/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/AsyncStateMachineEmitter.cs b/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/AsyncStateMachineEmitter.cs index bd0528d122..9334c781e2 100644 --- a/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/AsyncStateMachineEmitter.cs +++ b/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/AsyncStateMachineEmitter.cs @@ -14,16 +14,16 @@ partial class RunnableEmitter // This doesn't really matter for the runtime, but it helps with the NaiveRunnableEmitDiff tests. 
private readonly Dictionary s_asyncMethodToOrdinalMap = new() { - { GlobalSetupMethodName, 4 }, - { GlobalCleanupMethodName, 5 }, - { IterationSetupMethodName, 6 }, - { IterationCleanupMethodName, 7 }, - { OverheadActionUnrollMethodName, 11 }, - { OverheadActionNoUnrollMethodName, 12 }, - { WorkloadActionUnrollMethodName, 13 }, - { WorkloadActionNoUnrollMethodName, 14 }, - { StartWorkloadMethodName, 15 }, - { WorkloadCoreMethodName, 16 }, + { GlobalSetupMethodName, 6 }, + { GlobalCleanupMethodName, 7 }, + { IterationSetupMethodName, 8 }, + { IterationCleanupMethodName, 9 }, + { OverheadActionUnrollMethodName, 13 }, + { OverheadActionNoUnrollMethodName, 14 }, + { WorkloadActionUnrollMethodName, 15 }, + { WorkloadActionNoUnrollMethodName, 16 }, + { StartWorkloadMethodName, 17 }, + { WorkloadCoreMethodName, 18 }, }; private record struct AsyncStateMachineFields(FieldInfo StateField, FieldInfo BuilderField, FieldInfo? ThisField); diff --git a/src/BenchmarkDotNet/Toolchains/InProcess/Emit/InProcessEmitRunner.cs b/src/BenchmarkDotNet/Toolchains/InProcess/Emit/InProcessEmitRunner.cs index f44c6da266..9fd3d5e1a1 100644 --- a/src/BenchmarkDotNet/Toolchains/InProcess/Emit/InProcessEmitRunner.cs +++ b/src/BenchmarkDotNet/Toolchains/InProcess/Emit/InProcessEmitRunner.cs @@ -92,6 +92,7 @@ private static async ValueTask RunCore(Type runnableType, IHost host, ExecutePar var engineParameters = new EngineParameters() { Host = host, + WorkloadMethod = benchmarkCase.Descriptor.WorkloadMethod, WorkloadActionUnroll = LoopCallbackFromMethod(instance, WorkloadActionUnrollMethodName), WorkloadActionNoUnroll = LoopCallbackFromMethod(instance, WorkloadActionNoUnrollMethodName), OverheadActionNoUnroll = LoopCallbackFromMethod(instance, OverheadActionNoUnrollMethodName), diff --git a/src/BenchmarkDotNet/Toolchains/InProcess/NoEmit/InProcessNoEmitRunner.cs b/src/BenchmarkDotNet/Toolchains/InProcess/NoEmit/InProcessNoEmitRunner.cs index 28dc146f36..07946484ab 100644 --- 
a/src/BenchmarkDotNet/Toolchains/InProcess/NoEmit/InProcessNoEmitRunner.cs +++ b/src/BenchmarkDotNet/Toolchains/InProcess/NoEmit/InProcessNoEmitRunner.cs @@ -178,6 +178,7 @@ public static async ValueTask RunCore(IHost host, ExecuteParameters parameters, var engineParameters = new EngineParameters { Host = host, + WorkloadMethod = target.WorkloadMethod, WorkloadActionNoUnroll = workloadAction.InvokeNoUnroll, WorkloadActionUnroll = workloadAction.InvokeUnroll, OverheadActionNoUnroll = overheadAction.InvokeNoUnroll, diff --git a/tests/BenchmarkDotNet.IntegrationTests/BenchmarkDotNet.IntegrationTests.csproj b/tests/BenchmarkDotNet.IntegrationTests/BenchmarkDotNet.IntegrationTests.csproj index 89939ea2fb..aabd80ffeb 100644 --- a/tests/BenchmarkDotNet.IntegrationTests/BenchmarkDotNet.IntegrationTests.csproj +++ b/tests/BenchmarkDotNet.IntegrationTests/BenchmarkDotNet.IntegrationTests.csproj @@ -23,8 +23,6 @@ Always - - diff --git a/tests/BenchmarkDotNet.IntegrationTests/InProcess.EmitTests/NaiveRunnableEmitDiff.cs b/tests/BenchmarkDotNet.IntegrationTests/InProcess.EmitTests/NaiveRunnableEmitDiff.cs index 2ee80f55bc..318fba26f2 100644 --- a/tests/BenchmarkDotNet.IntegrationTests/InProcess.EmitTests/NaiveRunnableEmitDiff.cs +++ b/tests/BenchmarkDotNet.IntegrationTests/InProcess.EmitTests/NaiveRunnableEmitDiff.cs @@ -27,7 +27,13 @@ public class NaiveRunnableEmitDiff private static readonly HashSet IgnoredRunnableMethodNames = [ "Run", - ".ctor" + ".ctor", + "__ResolveWorkloadMethod" + ]; + + private static readonly HashSet IgnoredRunnableNestedTypeNames = + [ + "__WorkloadMethodDelegate" ]; private static readonly IReadOnlyDictionary AltOpCodes = new Dictionary() @@ -409,7 +415,7 @@ private static void DiffMembers(TypeDefinition type1, TypeDefinition type2, ILog var nested2ByName = type2.NestedTypes.ToLookup(t => t.Name); foreach (var nested1 in type1.NestedTypes) { - if (ignoredStateMachineNames.Contains(nested1.FullName)) + if 
(ignoredStateMachineNames.Contains(nested1.FullName) || IgnoredRunnableNestedTypeNames.Contains(nested1.Name)) continue; var nested2 = nested2ByName[nested1.Name].SingleOrDefault() ?? type2.NestedTypes.SingleOrDefault(t => AreSameTypeIgnoreNested(nested1, t)); diff --git a/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs b/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs new file mode 100644 index 0000000000..e6580f2cdf --- /dev/null +++ b/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs @@ -0,0 +1,195 @@ +using System.Reflection; +using System.Runtime.CompilerServices; +using BenchmarkDotNet.Engines; +using BenchmarkDotNet.Jobs; +using BenchmarkDotNet.Portability; +using BenchmarkDotNet.Reports; +using BenchmarkDotNet.Tests.XUnit; +using Perfolizer.Horology; + +namespace BenchmarkDotNet.IntegrationTests; + +public class JitListenerTests +{ + // The jit stage's behavior depends on whether the benchmark method's assembly is optimized, and this test + // assembly is built both ways across configurations, so each case asserts whichever applies: + // * Optimized build: the target method participates in tiered compilation, so the stage drives it to + // OptimizedTier1. We verify through a SECOND, independent JitListener created before the stage runs — + // multiple EventListeners each receive the same runtime events, so it observes exactly what the stage's + // internal listener does, and unlike the internal listener (disposed when the stage ends) it outlives the stage. + // * DisableOptimizations build (e.g. Debug): the assembly is JITted at minopts and never tiers, so the listener + // declines to watch it (Create returns null) and the stage falls back to the fixed-delay loop. + private static readonly bool OptimizationsDisabled = + typeof(JitListenerTests).Assembly.GetCustomAttribute()?.IsJITOptimizerDisabled ?? 
false; + + [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_Cold() + { + Func workloadMethod = Cold; + + using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + + RunJitStageToCompletion(workloadMethod); + + AssertTierUpOrDeclined(observer); + } + + // Tests the case of InProcess benchmarking the same method multiple times. + [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_AlreadyTier1() + { + Func workloadMethod = AlreadyTier1; + + using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + + // The first jit stage brings the method to tier1 (in an optimized build); running the jit stage again for the + // same method should succeed without issue and leave the method in tier1. + RunJitStageToCompletion(workloadMethod); + RunJitStageToCompletion(workloadMethod); + + AssertTierUpOrDeclined(observer); + } + + // Tests the case of InProcess benchmarking a method that the user already invoked before starting the benchmarks when call counting is active. + [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_AlreadyTier0() + { + DeadCodeEliminationHelper.KeepAliveWithoutBoxing(AlreadyTier0(42)); + // Sleep long enough for the tiered call counting to begin. + Engine.SleepIfPositive(JitInfo.TieredDelay + JitInfo.TieredDelay); + Func workloadMethod = AlreadyTier0; + + using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + + RunJitStageToCompletion(workloadMethod); + + AssertTierUpOrDeclined(observer); + } + + // Tests the case of InProcess benchmarking a method that the user already invoked before starting the benchmarks when call counting is delayed. 
+ [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_AlreadyTier0DelayedCallCounting() + { + DeadCodeEliminationHelper.KeepAliveWithoutBoxing(AlreadyTier0DelayedCallCounting(42)); + Func workloadMethod = AlreadyTier0DelayedCallCounting; + + using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + + RunJitStageToCompletion(workloadMethod); + + AssertTierUpOrDeclined(observer); + } + + // A pinned optimization level makes a method ineligible for tiered compilation regardless of the assembly, so the + // listener declines to watch it (Create returns null). In an optimized build that attribute is the sole reason; in a + // DisableOptimizations build the assembly excludes it too — either way there is nothing for the listener to observe. + + [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void Create_DeclinesNoOptimizationMethod() + { + // [MethodImpl(NoOptimization)] pins the method to minopts, so it never tiers. + Func workloadMethod = NoOptimization; + using var listener = JitListener.Create(workloadMethod.Method, enabled: true); + Assert.Null(listener); + } + + [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void Create_DeclinesAggressiveOptimizationMethod() + { + // [MethodImpl(AggressiveOptimization)] pins the method straight to tier1, so it never goes through tier0 -> tier1. + Func workloadMethod = AggressiveOptimization; + using var listener = JitListener.Create(workloadMethod.Method, enabled: true); + Assert.Null(listener); + } + + private static void AssertTierUpOrDeclined(JitListener? observer) + { + if (OptimizationsDisabled) + { + // The method can't tier, so the listener declines to watch it (Create returns null) and the stage falls + // back to the fixed-delay loop without ever reaching tier1. 
+ Assert.Null(observer); + } + else + { + // requires EventSource support in the test host (it's enabled by default) + Assert.NotNull(observer); + Assert.True(observer!.ReachedTier1, "the jit stage should have driven the benchmark method to tier1"); + } + } + + private static void RunJitStageToCompletion(Func workloadMethod) + { + // The per-tier publication wait is unbounded, cancellable only via the host's token. When the method tiers, the + // stage re-bursts until the runtime reports the next-tier compile began, so the wait isn't actually hit — but a + // large timeout guards against a hang if tiering somehow stalls, instead of wedging the whole test run. + using var timeout = new CancellationTokenSource(TimeSpan.FromSeconds(60)); + var host = new CancellableHost(timeout.Token); + Func> empty = (_, _) => new(default(ClockSpan)); + Func> workload = (invokeCount, _) => + { + // Really invoke the benchmark method so it goes through call counting and tiers up for real. + for (long i = 0; i < invokeCount; i++) + workloadMethod(i); + return new(default(ClockSpan)); + }; + + var parameters = new EngineParameters + { + Host = host, + WorkloadMethod = workloadMethod.Method, + WorkloadActionNoUnroll = workload, + WorkloadActionUnroll = workload, + OverheadActionNoUnroll = empty, + OverheadActionUnroll = empty, + GlobalSetupAction = () => new(), + GlobalCleanupAction = () => new(), + IterationSetupAction = () => new(), + IterationCleanupAction = () => new(), + TargetJob = Job.Default, + BenchmarkName = "", + InProcessDiagnoserHandler = new([], host, BenchmarkDotNet.Diagnosers.RunMode.None, null!), + }; + + var stage = new EngineJitStage(evaluateOverhead: false, parameters); + var measurements = stage.GetMeasurementList(); + while (stage.GetShouldRunIteration(measurements, out var data)) + { + data.setupAction().GetAwaiter().GetResult(); + data.workloadAction(data.invokeCount / data.unrollFactor, null!).GetAwaiter().GetResult(); + 
data.cleanupAction().GetAwaiter().GetResult(); + // A zero-time measurement keeps the stage out of its "long-running benchmark" early-exit + // (iterationTime / 0 == Infinity, and Infinity < 1.5 is false). + measurements.Add(new Measurement(1, data.mode, data.stage, data.index, data.invokeCount, 0d)); + } + } + + // Benchmark-method stand-ins. A distinct method per tier-up scenario so that one test tiering a method up doesn't + // affect another's starting state. In a DisableOptimizations build these are JITted at minopts and never tier; in + // an optimized build they reach OptimizedTier1. + [MethodImpl(MethodImplOptions.NoInlining)] + private static long Cold(long x) => x * x + 1; + [MethodImpl(MethodImplOptions.NoInlining)] + private static long AlreadyTier1(long x) => x * x + 1; + [MethodImpl(MethodImplOptions.NoInlining)] + private static long AlreadyTier0(long x) => x * x + 1; + [MethodImpl(MethodImplOptions.NoInlining)] + private static long AlreadyTier0DelayedCallCounting(long x) => x * x + 1; + [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.NoOptimization)] + private static long NoOptimization(long x) => x * x + 1; + [MethodImpl(MethodImplOptions.NoInlining | CodeGenHelper.AggressiveOptimizationOption)] + private static long AggressiveOptimization(long x) => x * x + 1; + + // Minimal host that surfaces a cancellation token so the stage's unbounded per-tier wait stays interruptible. 
+ private sealed class CancellableHost(CancellationToken cancellationToken) : IHost + { + public CancellationToken CancellationToken { get; } = cancellationToken; + public void Dispose() { } + public void WriteLine() { } + public void WriteLine(string message) { } + public void SendError(string message) { } + public void ReportResults(RunResults runResults) { } + public ValueTask SendSignalAsync(HostSignal hostSignal) => new(); + public ValueTask Yield() => new(); + } +} diff --git a/tests/BenchmarkDotNet.Tests/Engine/EnumerateStagesTests.cs b/tests/BenchmarkDotNet.Tests/Engine/EnumerateStagesTests.cs index 569889ba10..33f484f22f 100644 --- a/tests/BenchmarkDotNet.Tests/Engine/EnumerateStagesTests.cs +++ b/tests/BenchmarkDotNet.Tests/Engine/EnumerateStagesTests.cs @@ -226,9 +226,9 @@ public void SlowFirstIterationButFastSteadyStateDoesNotExitJitStageEarly() if (stage is EngineJitStage) break; } - // Pre-loop iter + confirmation + full tiering loop (one yield per tier since the user - // pinned InvocationCount=1, matching JitInfo.MaxTierPromotions * TieredCallCountThreshold) - // + one stabilization iteration. Just assert it ran the full tiering loop rather than bailing. + // Pre-loop iter + confirmation + full tiering loop (one yield per tier since the user pinned + // InvocationCount=1, matching JitInfo.MaxTierPromotions * TieredCallCountThreshold) + the trailing + // stabilization iteration. Just assert it ran the full tiering loop rather than bailing. 
Assert.True(jitWorkloadCount > 2, $"Expected the tiering loop to run after confirmation disagreed, got {jitWorkloadCount} jitting iterations."); } @@ -347,6 +347,8 @@ private EngineParameters CreateEngineParameters(Job job) Func> emptyAction = (_, _) => new(default(ClockSpan)); return new() { + WorkloadMethod = emptyAction.Method, + EnableJitListener = false, GlobalSetupAction = () => new(), GlobalCleanupAction = () => new(), Host = host, diff --git a/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs b/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs index 59dcb72633..5ba9625a52 100644 --- a/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs +++ b/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs @@ -22,6 +22,8 @@ internal MockEngine(ITestOutputHelper output, Job job, Func Date: Mon, 1 Jun 2026 01:03:41 -0400 Subject: [PATCH 2/9] Fix FSharpAnonymousRecordIsSupported test. --- .../Code/DeclarationsProvider.cs | 43 +++++++------------ .../Templates/BenchmarkType.txt | 31 +++++++++++-- .../Emitters/AsyncStateMachineEmitter.cs | 20 ++++----- .../NaiveRunnableEmitDiff.cs | 7 +-- 4 files changed, 54 insertions(+), 47 deletions(-) diff --git a/src/BenchmarkDotNet/Code/DeclarationsProvider.cs b/src/BenchmarkDotNet/Code/DeclarationsProvider.cs index 7a4ddc3e80..120e19006d 100644 --- a/src/BenchmarkDotNet/Code/DeclarationsProvider.cs +++ b/src/BenchmarkDotNet/Code/DeclarationsProvider.cs @@ -33,9 +33,8 @@ public SmartStringBuilder ReplaceTemplate(SmartStringBuilder smartStringBuilder) return ReplaceCore(smartStringBuilder) .Replace("$DisassemblerEntryMethodImpl$", GetWorkloadMethodCall(GetPassArgumentsDirect())) .Replace("$OperationsPerInvoke$", Descriptor.OperationsPerInvoke.ToString()) - .Replace("$WorkloadMethodResolve$", GetWorkloadMethodResolve()) - .Replace("$WorkloadMethodReturnTypeModifiers$", GetWorkloadMethodReturnTypeModifiers()) - .Replace("$WorkloadMethodReturnType$", GetWorkloadMethodReturnTypeName()) + .Replace("$WorkloadMethodName$", 
Descriptor.WorkloadMethod.Name) + .Replace("$WorkloadMethodParameterTypes$", GetWorkloadMethodParameterTypes()) .Replace("$WorkloadTypeName$", Descriptor.Type.GetCorrectCSharpTypeName()); } @@ -92,34 +91,24 @@ private static string GetMethodPrefix(MethodInfo method) protected string GetWorkloadMethodCall(string passArguments) => $"{GetMethodPrefix(Descriptor.WorkloadMethod)}.{Descriptor.WorkloadMethod.Name}({passArguments});"; - // Resolve the benchmark MethodInfo at runtime so the jit stage can watch its tier-up via JIT events. - // Method-group conversion to the generated __WorkloadMethodDelegate, then read its .Method: the compiler - // binds the correct overload and verifies the signature at build time, so it's overload- and inheritance-safe - // without rendering parameter type lists. We deliberately avoid resolving by metadata token — the benchmark - // assembly is built separately and conditional compilation (e.g. #if) can shift token RIDs. - private string GetWorkloadMethodResolve() + // Renders the benchmark method's parameter types as a Type[] for __ResolveWorkloadMethod to match overloads + // exactly. Each is a typeof(...) of the element type, re-wrapping by-ref/pointer via reflection (typeof can't + // express `T&`), so resolution never has to name the method's (possibly unspellable) return type. + private string GetWorkloadMethodParameterTypes() { - var method = Descriptor.WorkloadMethod; - return $"((__WorkloadMethodDelegate){GetMethodPrefix(method)}.{method.Name}).Method"; + var parameters = Descriptor.WorkloadMethod.GetParameters(); + if (parameters.Length == 0) + return "global::System.Array.Empty()"; + return $"new global::System.Type[] {{ {string.Join(", ", parameters.Select(p => GetTypeOfExpression(p.ParameterType)))} }}"; } - // The delegate return type, split into modifiers ("", "ref", "ref readonly") and the (non-byref) type name, - // because the benchmark method's return ref-kind must match the delegate's for the method-group conversion. 
- private string GetWorkloadMethodReturnTypeName() + private static string GetTypeOfExpression(System.Type type) { - var returnType = Descriptor.WorkloadMethod.ReturnType; - return (returnType.IsByRef ? returnType.GetElementType()! : returnType).GetCorrectCSharpTypeName(); - } - - private string GetWorkloadMethodReturnTypeModifiers() - { - var method = Descriptor.WorkloadMethod; - if (!method.ReturnType.IsByRef) - return string.Empty; - // ref readonly returns carry an InAttribute required modifier on the return; plain ref returns don't. - bool isReadOnly = method.ReturnParameter.GetRequiredCustomModifiers() - .Any(modifier => modifier.FullName == "System.Runtime.InteropServices.InAttribute"); - return isReadOnly ? "ref readonly" : "ref"; + if (type.IsByRef) + return $"{GetTypeOfExpression(type.GetElementType()!)}.MakeByRefType()"; + if (type.IsPointer) + return $"{GetTypeOfExpression(type.GetElementType()!)}.MakePointerType()"; + return $"typeof({type.GetCorrectCSharpTypeName()})"; } protected string GetPassArgumentsDirect() diff --git a/src/BenchmarkDotNet/Templates/BenchmarkType.txt b/src/BenchmarkDotNet/Templates/BenchmarkType.txt index 3882f7efeb..ade82eba79 100644 --- a/src/BenchmarkDotNet/Templates/BenchmarkType.txt +++ b/src/BenchmarkDotNet/Templates/BenchmarkType.txt @@ -70,16 +70,39 @@ $CancellationTokenAssignment$ } - private unsafe delegate $WorkloadMethodReturnTypeModifiers$ $WorkloadMethodReturnType$ __WorkloadMethodDelegate($ArgumentsDefinition$); - $DeclareFieldsContainer$ private global::System.Reflection.MethodInfo __ResolveWorkloadMethod() { - unsafe + global::System.Type[] parameterTypes = $WorkloadMethodParameterTypes$; + foreach (global::System.Reflection.MethodInfo candidate in typeof($WorkloadTypeName$).GetMethods( + global::System.Reflection.BindingFlags.Instance | global::System.Reflection.BindingFlags.Static | + global::System.Reflection.BindingFlags.Public | global::System.Reflection.BindingFlags.NonPublic)) { - return 
$WorkloadMethodResolve$; + if (candidate.Name != "$WorkloadMethodName$") + { + continue; + } + global::System.Reflection.ParameterInfo[] parameters = candidate.GetParameters(); + if (parameters.Length != parameterTypes.Length) + { + continue; + } + global::System.Boolean isMatch = true; + for (global::System.Int32 i = 0; i < parameters.Length; i++) + { + if (parameters[i].ParameterType != parameterTypes[i]) + { + isMatch = false; + break; + } + } + if (isMatch) + { + return candidate; + } } + return null; } private $GlobalSetupModifiers$ global::System.Threading.Tasks.ValueTask __GlobalSetup() diff --git a/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/AsyncStateMachineEmitter.cs b/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/AsyncStateMachineEmitter.cs index 9334c781e2..0c93543f32 100644 --- a/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/AsyncStateMachineEmitter.cs +++ b/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/AsyncStateMachineEmitter.cs @@ -14,16 +14,16 @@ partial class RunnableEmitter // This doesn't really matter for the runtime, but it helps with the NaiveRunnableEmitDiff tests. 
private readonly Dictionary s_asyncMethodToOrdinalMap = new() { - { GlobalSetupMethodName, 6 }, - { GlobalCleanupMethodName, 7 }, - { IterationSetupMethodName, 8 }, - { IterationCleanupMethodName, 9 }, - { OverheadActionUnrollMethodName, 13 }, - { OverheadActionNoUnrollMethodName, 14 }, - { WorkloadActionUnrollMethodName, 15 }, - { WorkloadActionNoUnrollMethodName, 16 }, - { StartWorkloadMethodName, 17 }, - { WorkloadCoreMethodName, 18 }, + { GlobalSetupMethodName, 5 }, + { GlobalCleanupMethodName, 6 }, + { IterationSetupMethodName, 7 }, + { IterationCleanupMethodName, 8 }, + { OverheadActionUnrollMethodName, 12 }, + { OverheadActionNoUnrollMethodName, 13 }, + { WorkloadActionUnrollMethodName, 14 }, + { WorkloadActionNoUnrollMethodName, 15 }, + { StartWorkloadMethodName, 16 }, + { WorkloadCoreMethodName, 17 }, }; private record struct AsyncStateMachineFields(FieldInfo StateField, FieldInfo BuilderField, FieldInfo? ThisField); diff --git a/tests/BenchmarkDotNet.IntegrationTests/InProcess.EmitTests/NaiveRunnableEmitDiff.cs b/tests/BenchmarkDotNet.IntegrationTests/InProcess.EmitTests/NaiveRunnableEmitDiff.cs index 318fba26f2..69e1e1c95c 100644 --- a/tests/BenchmarkDotNet.IntegrationTests/InProcess.EmitTests/NaiveRunnableEmitDiff.cs +++ b/tests/BenchmarkDotNet.IntegrationTests/InProcess.EmitTests/NaiveRunnableEmitDiff.cs @@ -31,11 +31,6 @@ public class NaiveRunnableEmitDiff "__ResolveWorkloadMethod" ]; - private static readonly HashSet IgnoredRunnableNestedTypeNames = - [ - "__WorkloadMethodDelegate" - ]; - private static readonly IReadOnlyDictionary AltOpCodes = new Dictionary() { { OpCodes.Br_S, OpCodes.Br }, @@ -415,7 +410,7 @@ private static void DiffMembers(TypeDefinition type1, TypeDefinition type2, ILog var nested2ByName = type2.NestedTypes.ToLookup(t => t.Name); foreach (var nested1 in type1.NestedTypes) { - if (ignoredStateMachineNames.Contains(nested1.FullName) || IgnoredRunnableNestedTypeNames.Contains(nested1.Name)) + if 
(ignoredStateMachineNames.Contains(nested1.FullName)) continue; var nested2 = nested2ByName[nested1.Name].SingleOrDefault() ?? type2.NestedTypes.SingleOrDefault(t => AreSameTypeIgnoreNested(nested1, t)); From 186fffadb63490ff7ea0f8350f1b3763d56ff3a6 Mon Sep 17 00:00:00 2001 From: Tim Cassell Date: Mon, 1 Jun 2026 16:04:20 -0400 Subject: [PATCH 3/9] Don't shorten iterations when tier1 reached. --- src/BenchmarkDotNet/Engines/EngineJitStage.cs | 18 +++++-------- src/BenchmarkDotNet/Portability/JitInfo.cs | 25 +++++++++---------- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/src/BenchmarkDotNet/Engines/EngineJitStage.cs b/src/BenchmarkDotNet/Engines/EngineJitStage.cs index cb6c2a8bee..46cc1c54fa 100644 --- a/src/BenchmarkDotNet/Engines/EngineJitStage.cs +++ b/src/BenchmarkDotNet/Engines/EngineJitStage.cs @@ -208,21 +208,15 @@ private IEnumerator EnumerateIterations() } } - // A new tier was published; the tier it carried tells us whether the method reached tier1. if (listener!.ReachedTier1) { - if (!JitInfo.IsOSRDuplicated) - { - break; - } - // We need to handle the case of the benchmark method reached tier1, but it calls another method that is OSR'd. - // The listener only tracks the benchmark method and an unknown callee can't be watched, so - // stop consulting it and let the loop run one more promotion iteration on the fixed delay - // to give such a callee time. MaxTierPromotions already budgets +1 for OSR, so cap the loop - // to one further iteration: Min(remainingTiers, 2) yields Min(remainingTiers - 1, 1) more - // passes once the for-loop's --remainingTiers is applied. + // If the method has reached tier1 we will not receive any more JIT events for it. + // In case OSR is enabled and the method calls another method that is OSR'd, a runtime bug causes that other method to duplicate a tier (JitInfo.MaxTierPromotions already accounts for it). 
+ // Or the method could have been pre-warmed before the stage started, but the benchmark case uses a different control flow that calls different methods that were not pre-warmed. + // In either case, the listener only tracks the benchmark method, and unknown callees can't be watched, + // so stop consulting it and let the loop run the remaining calculated promotion iterations on the fixed delay. useListener = false; - remainingTiers = Math.Min(remainingTiers, 2); + listener!.WaitForTieringActive(parameters.Host.CancellationToken); } } else diff --git a/src/BenchmarkDotNet/Portability/JitInfo.cs b/src/BenchmarkDotNet/Portability/JitInfo.cs index 6f3a4bcf48..3a85d28bcc 100644 --- a/src/BenchmarkDotNet/Portability/JitInfo.cs +++ b/src/BenchmarkDotNet/Portability/JitInfo.cs @@ -70,18 +70,6 @@ private static bool IsDisabled(string envName, string knobName) // Disabled by default in netcoreapp2.X, check if it's enabled. : IsEnabled(EnvTieredCompilation, KnobTieredCompilation)); - // On-stack-replacement *shouldn't* interfere with promotion velocity, but there is a bug where OSR may cause a method to be tier0 instrumented twice. - // https://github.com/dotnet/runtime/issues/117787#issuecomment-3090771091 - public static readonly bool IsOSRDuplicated = - IsTiered - // Added experimentally in .Net 5. - && Environment.Version.Major >= 5 - && (Environment.Version.Major >= 7 - // Enabled by default in .Net 7, check if it's disabled. - ? !IsEnvVarDisabled(EnvOSR) - // Disabled by default in earlier versions, check if it's enabled. - : IsEnvVarEnabled(EnvOSR)); - /// /// The maximum numbers of jit tiers that a method may be promoted through. This is the maximum number of jit tiers - 1. 
/// @@ -100,8 +88,10 @@ private static int GetMaxTierPromotions() // Tier0 instrumented ++maxPromotions; } - if (IsOSRDuplicated) + if (GetIsOSR()) { + // On-stack-replacement *shouldn't* interfere with promotion velocity, but there is a bug where OSR may cause a method to be tier0 instrumented twice. + // https://github.com/dotnet/runtime/issues/117787#issuecomment-3090771091 ++maxPromotions; } return maxPromotions; @@ -116,6 +106,15 @@ static bool GetIsDPGO() => ? !IsDisabled(EnvPGO, KnobPGO) // Disabled by default in earlier versions, check if it's enabled. : IsEnabled(EnvPGO, KnobPGO)); + + static bool GetIsOSR() => + // Added experimentally in .Net 5. + Environment.Version.Major >= 5 + && (Environment.Version.Major >= 7 + // Enabled by default in .Net 7, check if it's disabled. + ? !IsEnvVarDisabled(EnvOSR) + // Disabled by default in earlier versions, check if it's enabled. + : IsEnvVarEnabled(EnvOSR)); } /// From 93d4bc878bbfbc4e4cf59bf4c5bd1bb42e87d74b Mon Sep 17 00:00:00 2001 From: Tim Cassell Date: Tue, 2 Jun 2026 14:06:06 -0400 Subject: [PATCH 4/9] Test OSR methods --- src/BenchmarkDotNet/Engines/JitListener.cs | 18 +++++- .../JitListenerTests.cs | 62 +++++++++++++++++++ 2 files changed, 77 insertions(+), 3 deletions(-) diff --git a/src/BenchmarkDotNet/Engines/JitListener.cs b/src/BenchmarkDotNet/Engines/JitListener.cs index 6a2ee5249f..05f7129748 100644 --- a/src/BenchmarkDotNet/Engines/JitListener.cs +++ b/src/BenchmarkDotNet/Engines/JitListener.cs @@ -43,13 +43,18 @@ internal sealed class JitListener : EventListener private const string MethodLoadVerbosePrefix = "MethodLoadVerbose"; // Optimization tier is packed into MethodFlags bits [7..9]: (MethodFlags >> 7) & 0x7. - // The initial tier0 quick compile is QuickJitted = 3, intermediate instrumented/OSR - // publications report other values (and just count as "a recompilation happened"), - // and the fully-optimized steady-state tier1 is OptimizedTier1 = 4. 
+ // The initial tier0 quick compile is QuickJitted = 3; the intermediate instrumented (PGO) publication reports + // another value and just counts as "a recompilation happened"; and the fully-optimized steady-state tier1 is + // OptimizedTier1 = 4. OptimizedTier1OSR = 5 is special: an on-stack-replacement of a still-running body with a + // hot loop. It fires off the loop's back-edge counter, NOT off the call-count threshold, so it's orthogonal to + // the call-count tier ladder the stage drives — and a watched method that OSRs in both its tier0 and instrumented + // bodies emits two of them on the way to tier1. We therefore ignore OSR publications for our method (see + // HandleMethodLoad) so they don't consume the stage's per-tier publication budget and stall it short of tier1. private const int OptimizationTierShift = 7; private const int OptimizationTierMask = 0x7; private const int QuickJittedTier0 = 3; private const int OptimizedTier1 = 4; + private const int OptimizedTier1OSR = 5; private readonly int metadataToken; private readonly string methodName; @@ -205,6 +210,13 @@ private void HandleMethodLoad(EventWrittenEventArgs e) if (payload[loadNameIndex] as string != methodName) return; + // An OSR publication is not a step on the call-count tier ladder (it fires off a hot loop's back-edge counter, + // and the method goes on to be call-count-promoted past it), so don't let it count as a tier-up the stage is + // waiting on — otherwise a method that OSRs in multiple bodies overruns the stage's publication budget and + // stops short of tier1. 
+ if (tier == OptimizedTier1OSR) + return; + if (tier == OptimizedTier1) reachedTier1 = true; diff --git a/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs b/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs index e6580f2cdf..d9e63dc71d 100644 --- a/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs +++ b/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs @@ -80,6 +80,44 @@ public void JitStage_AlreadyTier0DelayedCallCounting() AssertTierUpOrDeclined(observer); } + // Tests a benchmark method whose own hot loop is On-Stack-Replaced (OSR) mid-execution. Where OSR is enabled + // (by default in .NET 7+) this drives the method through an OSR publication on top of its normal tier-ups, and the + // stage must still reach OptimizedTier1 — JitInfo.MaxTierPromotions reserves an extra promotion for the OSR-induced + // double tier0-instrumentation. Where OSR is off it is simply a hot-loop method that tiers up normally; either way + // it ends at tier1. + [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_Osr() + { + Func workloadMethod = Osr; + + using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + + RunJitStageToCompletion(workloadMethod); + + AssertTierUpOrDeclined(observer); + } + + // Tests a benchmark method that calls (without inlining) a separate method whose hot loop is OSR'd. The listener + // only watches the benchmark method, never the callee, so it can't observe the callee's tiering at all — this + // exercises the runtime bug where an OSR'd callee gets tier0-instrumented twice (JitInfo.MaxTierPromotions reserves + // the extra promotion the stage spends on it). The benchmark method itself must still be driven to OptimizedTier1. 
+ [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_CallsOsr() + { + Func workloadMethod = CallsOsr; + Func calleeMethod = OsrCallee; + + using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + // The stage only drives (and the engine's listener only watches) the benchmark method, but every call to it + // calls the OSR'd callee, so the callee should be driven all the way to tier1 too. Watch it independently. + using var calleeObserver = JitListener.Create(calleeMethod.Method, enabled: true); + + RunJitStageToCompletion(workloadMethod); + + AssertTierUpOrDeclined(observer); + AssertTierUpOrDeclined(calleeObserver); + } + // A pinned optimization level makes a method ineligible for tiered compilation regardless of the assembly, so the // listener declines to watch it (Create returns null). In an optimized build that attribute is the sole reason; in a // DisableOptimizations build the assembly excludes it too — either way there is nothing for the listener to observe. @@ -180,6 +218,30 @@ private static void RunJitStageToCompletion(Func workloadMethod) [MethodImpl(MethodImplOptions.NoInlining | CodeGenHelper.AggressiveOptimizationOption)] private static long AggressiveOptimization(long x) => x * x + 1; + // A loop long enough to cross the OSR back-edge threshold so these methods are On-Stack-Replaced where OSR is + // enabled. Timing is irrelevant (RunJitStageToCompletion records 0ns measurements, so the stage never takes its + // long-running early-exit), so the only requirement is enough iterations to trigger OSR. + private const int OsrLoopCount = 1_000_000; + [MethodImpl(MethodImplOptions.NoInlining)] + private static long Osr(long x) + { + long sum = x; + for (int i = 0; i < OsrLoopCount; i++) + sum += i; + return sum; + } + // The benchmark method: it does nothing but call the OSR'd method, which NoInlining keeps as a separate jit unit. 
+ [MethodImpl(MethodImplOptions.NoInlining)] + private static long CallsOsr(long x) => OsrCallee(x); + [MethodImpl(MethodImplOptions.NoInlining)] + private static long OsrCallee(long x) + { + long sum = x; + for (int i = 0; i < OsrLoopCount; i++) + sum += i; + return sum; + } + // Minimal host that surfaces a cancellation token so the stage's unbounded per-tier wait stays interruptible. private sealed class CancellableHost(CancellationToken cancellationToken) : IHost { From d158dbf21cbb4b3e50e5bfe5d0516f0921e998a9 Mon Sep 17 00:00:00 2001 From: Tim Cassell Date: Tue, 2 Jun 2026 16:52:24 -0400 Subject: [PATCH 5/9] Always use listener until final tier. --- src/BenchmarkDotNet/Engines/Engine.cs | 1 - src/BenchmarkDotNet/Engines/EngineJitStage.cs | 26 ++-- src/BenchmarkDotNet/Engines/JitListener.cs | 53 ++++---- .../JitListenerTests.cs | 117 +++++++++--------- .../Shared/Mocks/MockEngine.cs | 1 - 5 files changed, 105 insertions(+), 93 deletions(-) diff --git a/src/BenchmarkDotNet/Engines/Engine.cs b/src/BenchmarkDotNet/Engines/Engine.cs index 292620d6cf..8fc2bac853 100644 --- a/src/BenchmarkDotNet/Engines/Engine.cs +++ b/src/BenchmarkDotNet/Engines/Engine.cs @@ -35,7 +35,6 @@ internal Engine(EngineParameters engineParameters) Parameters = new() { WorkloadMethod = engineParameters.WorkloadMethod ?? throw new ArgumentNullException(nameof(EngineParameters.WorkloadMethod)), - EnableJitListener = engineParameters.EnableJitListener, WorkloadActionNoUnroll = engineParameters.WorkloadActionNoUnroll ?? throw new ArgumentNullException(nameof(EngineParameters.WorkloadActionNoUnroll)), WorkloadActionUnroll = engineParameters.WorkloadActionUnroll ?? throw new ArgumentNullException(nameof(EngineParameters.WorkloadActionUnroll)), OverheadActionNoUnroll = engineParameters.OverheadActionNoUnroll ?? 
throw new ArgumentNullException(nameof(EngineParameters.OverheadActionNoUnroll)), diff --git a/src/BenchmarkDotNet/Engines/EngineJitStage.cs b/src/BenchmarkDotNet/Engines/EngineJitStage.cs index 46cc1c54fa..c6a3263be2 100644 --- a/src/BenchmarkDotNet/Engines/EngineJitStage.cs +++ b/src/BenchmarkDotNet/Engines/EngineJitStage.cs @@ -23,9 +23,19 @@ internal sealed class EngineJitStage : EngineStage private readonly IEnumerator enumerator; private readonly bool evaluateOverhead; + // Watches for the method's background tier-up via JIT events so we can proceed as soon as each tier is published. + // Null when watching is disabled or EventSource is disabled, in which case we fall back to the fixed delay. + private readonly JitListener? listener; - internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters) : base(IterationStage.Jitting, IterationMode.Workload, parameters) + internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters) + : this(evaluateOverhead, parameters, JitListener.Create(parameters.WorkloadMethod, parameters.EnableJitListener)) { + } + + internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters, JitListener? listener) + : base(IterationStage.Jitting, IterationMode.Workload, parameters) + { + this.listener = listener; enumerator = EnumerateIterations(); this.evaluateOverhead = evaluateOverhead; } @@ -61,6 +71,7 @@ internal override bool GetShouldRunIteration(List measurements, out iterationData = enumerator.Current; return true; } + listener?.Dispose(); enumerator.Dispose(); iterationData = default; return false; @@ -68,14 +79,6 @@ internal override bool GetShouldRunIteration(List measurements, out private IEnumerator EnumerateIterations() { - // Watch for the method's background tier-up via JIT events so we can proceed as soon as each tier is - // published instead of waiting a fixed delay. 
Null when watching is disabled, EventSource is disabled, - // or the method is not eligible for tiered compilation, in which case we fall back to the fixed delay. - // Created BEFORE the first invoke so it observes the method's very first (tier0) jit and the surrounding - // TieredCompilation events from the start. - using JitListener? listener = JitListener.Create(parameters.WorkloadMethod, parameters.EnableJitListener); - bool useListener = listener != null; - // If the user pinned InvocationCount (e.g. via [IterationSetup]/[IterationCleanup] which implies RunOncePerIteration), // honor it so IterationSetup/Cleanup runs around each invocation. #3102 bool hasUserInvocationCount = parameters.TargetJob.HasValue(RunMode.InvocationCountCharacteristic); @@ -95,6 +98,7 @@ private IEnumerator EnumerateIterations() yield break; } + bool useListener = listener != null; if (JitInfo.TieredDelay > TimeSpan.Zero) { if (useListener) @@ -208,9 +212,9 @@ private IEnumerator EnumerateIterations() } } - if (listener!.ReachedTier1) + if (listener!.ReachedFinalTier) { - // If the method has reached tier1 we will not receive any more JIT events for it. + // If the method has reached its final tier we will not receive any more JIT events for it. // In case OSR is enabled and the method calls another method that is OSR'd, a runtime bug causes that other method to duplicate a tier (JitInfo.MaxTierPromotions already accounts for it). // Or the method could have been pre-warmed before the stage started, but the benchmark case uses a different control flow that calls different methods that were not pre-warmed. 
// In either case, the listener only tracks the benchmark method, and unknown callees can't be watched, diff --git a/src/BenchmarkDotNet/Engines/JitListener.cs b/src/BenchmarkDotNet/Engines/JitListener.cs index 05f7129748..f19cd600bb 100644 --- a/src/BenchmarkDotNet/Engines/JitListener.cs +++ b/src/BenchmarkDotNet/Engines/JitListener.cs @@ -13,8 +13,8 @@ namespace BenchmarkDotNet.Engines; // * MethodLoadVerbose (per-method, JIT keyword) reports each tier publication and carries the tier. A burst that // reaches the call-count threshold triggers the next tier's compile, which publishes a (non-tier0) load — so the // first such publication after a burst is the AUTHORITATIVE "the burst tiered up" signal, and the tier it carries -// tells us when the method reached tier1. The stage keeps invoking until it sees one. (WaitForPublication / -// ReachedTier1.) We deliberately do NOT use MethodJittingStarted (compile-began): it carries no tier, so the +// tells us when the method reached its final tier. The stage keeps invoking until it sees one. (WaitForPublication / +// ReachedFinalTier.) We deliberately do NOT use MethodJittingStarted (compile-began): it carries no tier, so the // tier0 compile's start is indistinguishable from a tier-up's and would race the tier0 publish that filters it. // * TieredCompilationPause/Resume (the call-counting delay bracket, Compilation keyword) gate the bursts: a burst // issued while the delay is active isn't counted (the counting stub is deferred), so the stage waits until the @@ -27,9 +27,10 @@ namespace BenchmarkDotNet.Engines; // process-wide, which we must NOT pay during the measurement stages. It is created at the start of the jit stage // and disposed at the end. 
// -// Create returns null (and the caller falls back to the fixed delay) when EventSource is unavailable — it can be -// disabled via the System.Diagnostics.Tracing.EventSource.IsSupported feature switch — or the method isn't eligible -// for tiered compilation (its assembly has optimizations disabled, or it's pinned to a single optimization level). +// Create returns null (and the caller falls back to the fixed delay) when the runtime has no tiered JIT, or when +// EventSource is unavailable — it can be disabled via the System.Diagnostics.Tracing.EventSource.IsSupported feature +// switch. It otherwise watches the method regardless of whether it looks tier-eligible: a method that can't tier just +// publishes its single final tier (see the tier constants below), which the stage observes and treats as "done". internal sealed class JitListener : EventListener { private const string RuntimeEventSourceName = "Microsoft-Windows-DotNETRuntime"; @@ -44,14 +45,25 @@ internal sealed class JitListener : EventListener // Optimization tier is packed into MethodFlags bits [7..9]: (MethodFlags >> 7) & 0x7. // The initial tier0 quick compile is QuickJitted = 3; the intermediate instrumented (PGO) publication reports - // another value and just counts as "a recompilation happened"; and the fully-optimized steady-state tier1 is - // OptimizedTier1 = 4. OptimizedTier1OSR = 5 is special: an on-stack-replacement of a still-running body with a - // hot loop. It fires off the loop's back-edge counter, NOT off the call-count threshold, so it's orthogonal to - // the call-count tier ladder the stage drives — and a watched method that OSRs in both its tier0 and instrumented - // bodies emits two of them on the way to tier1. We therefore ignore OSR publications for our method (see - // HandleMethodLoad) so they don't consume the stage's per-tier publication budget and stall it short of tier1. + // another value and just counts as "a recompilation happened". 
A method is fully warmed once it reaches one of + // the runtime's FINAL tiers — those from which no further tier-up is coming: + // * OptimizedTier1 = 4 — the usual steady state for a tier-eligible method. + // * Optimized = 2 (NativeCodeVersion::OptimizationTierOptimized) — a method compiled straight to optimized code + // without a tier1 promotion: AggressiveOptimization, or a method with a loop when TC_QuickJitForLoops is off. + // * MinOptJitted = 1 — a method that never tiers at all: NoOptimization, or any method in an + // optimization-disabled assembly. This is its first and only compile. + // Since Create now watches every method (not just ones that look tier-eligible), a non-tiering method publishes + // exactly one of MinOptJitted/Optimized and we recognize it as final immediately, rather than predicting it from + // attributes. OptimizedTier1OSR = 5 is special: an on-stack-replacement of a still-running body with a hot loop. + // It fires off the loop's back-edge counter, NOT off the call-count threshold, so unlike every other tier it is + // never the method's active entry-point code version and is never call-counted — it's orthogonal to the + // call-count tier ladder the stage drives, and a watched method that OSRs in both its tier0 and instrumented + // bodies emits two of them on the way to its final tier. We therefore ignore OSR publications for our method (see + // HandleMethodLoad) so they don't consume the stage's per-tier publication budget and stall it short of the final tier. 
private const int OptimizationTierShift = 7; private const int OptimizationTierMask = 0x7; + private const int MinOptJitted = 1; + private const int Optimized = 2; private const int QuickJittedTier0 = 3; private const int OptimizedTier1 = 4; private const int OptimizedTier1OSR = 5; @@ -62,7 +74,7 @@ internal sealed class JitListener : EventListener private readonly ManualResetEventSlim tieringActiveSignal = new(false); private readonly ManualResetEventSlim tieringActivePrimedSignal = new(false); - private volatile bool reachedTier1; + private volatile bool reachedFinalTier; private volatile bool canObserve; // Cached payload indices (field order is stable within a process for a given event version). @@ -80,7 +92,7 @@ private JitListener(MethodInfo method) internal static JitListener? Create(MethodInfo method, bool enabled) { - if (!enabled || !JitInfo.IsTiered || !IsTierable(method)) + if (!enabled || !JitInfo.IsTiered) { return null; } @@ -93,14 +105,10 @@ private JitListener(MethodInfo method) return listener; } - private static bool IsTierable(MethodInfo method) - => !AreOptimizationsDisabledFor(method) - && (method.MethodImplementationFlags & (MethodImplAttributes.NoOptimization | CodeGenHelper.AggressiveOptimizationOptionForEmit)) == 0; - internal static bool AreOptimizationsDisabledFor(MemberInfo member) => member.Module.Assembly.GetCustomAttribute()?.IsJITOptimizerDisabled ?? 
false; - internal bool ReachedTier1 => reachedTier1; + internal bool ReachedFinalTier => reachedFinalTier; // Reports (within the timeout) whether the call-counting machinery is active in the process: a tier0 (QuickJitted) // publication for ANY method, or a TieredCompilation Pause/Resume — any of which guarantees a Resume is coming to @@ -213,12 +221,15 @@ private void HandleMethodLoad(EventWrittenEventArgs e) // An OSR publication is not a step on the call-count tier ladder (it fires off a hot loop's back-edge counter, // and the method goes on to be call-count-promoted past it), so don't let it count as a tier-up the stage is // waiting on — otherwise a method that OSRs in multiple bodies overruns the stage's publication budget and - // stops short of tier1. + // stops short of the final tier. if (tier == OptimizedTier1OSR) return; - if (tier == OptimizedTier1) - reachedTier1 = true; + // Any of the runtime's final tiers means the method is fully warmed and will emit no further tier-ups — + // whether it tiered all the way up (OptimizedTier1), was compiled straight to optimized code (Optimized), or + // never tiers at all (MinOptJitted). 
+ if (tier == OptimizedTier1 || tier == Optimized || tier == MinOptJitted) + reachedFinalTier = true; publicationSignal.Set(); } diff --git a/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs b/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs index d9e63dc71d..59df0b4f80 100644 --- a/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs +++ b/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs @@ -1,4 +1,3 @@ -using System.Reflection; using System.Runtime.CompilerServices; using BenchmarkDotNet.Engines; using BenchmarkDotNet.Jobs; @@ -11,17 +10,6 @@ namespace BenchmarkDotNet.IntegrationTests; public class JitListenerTests { - // The jit stage's behavior depends on whether the benchmark method's assembly is optimized, and this test - // assembly is built both ways across configurations, so each case asserts whichever applies: - // * Optimized build: the target method participates in tiered compilation, so the stage drives it to - // OptimizedTier1. We verify through a SECOND, independent JitListener created before the stage runs — - // multiple EventListeners each receive the same runtime events, so it observes exactly what the stage's - // internal listener does, and unlike the internal listener (disposed when the stage ends) it outlives the stage. - // * DisableOptimizations build (e.g. Debug): the assembly is JITted at minopts and never tiers, so the listener - // declines to watch it (Create returns null) and the stage falls back to the fixed-delay loop. - private static readonly bool OptimizationsDisabled = - typeof(JitListenerTests).Assembly.GetCustomAttribute()?.IsJITOptimizerDisabled ?? 
false; - [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] public void JitStage_Cold() { @@ -29,9 +17,9 @@ public void JitStage_Cold() using var observer = JitListener.Create(workloadMethod.Method, enabled: true); - RunJitStageToCompletion(workloadMethod); + RunJitStageToCompletion(workloadMethod, observer); - AssertTierUpOrDeclined(observer); + AssertReachedFinalTier(observer); } // Tests the case of InProcess benchmarking the same method multiple times. @@ -42,42 +30,50 @@ public void JitStage_AlreadyTier1() using var observer = JitListener.Create(workloadMethod.Method, enabled: true); - // The first jit stage brings the method to tier1 (in an optimized build); running the jit stage again for the - // same method should succeed without issue and leave the method in tier1. - RunJitStageToCompletion(workloadMethod); - RunJitStageToCompletion(workloadMethod); + // The first jit stage brings the method to tier1 (in an optimized build) and our observer records it. Running + // the jit stage again for the same (now tier1) method should also succeed; it gets a fresh listener, because + // the stage drove the first to completion and reusing one across runs would leave its tiering signals ambiguous. + RunJitStageToCompletion(workloadMethod, observer); + using var observer2 = JitListener.Create(workloadMethod.Method, enabled: true); + RunJitStageToCompletion(workloadMethod, observer2); - AssertTierUpOrDeclined(observer); + AssertReachedFinalTier(observer); } // Tests the case of InProcess benchmarking a method that the user already invoked before starting the benchmarks when call counting is active. 
[FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] public void JitStage_AlreadyTier0() { + Func workloadMethod = AlreadyTier0; + // Watch from before the pre-invoke, and hand this listener to the stage so it doesn't create a second one + // (see RunJitStageToCompletion): in a minopt build the pre-invoke is the method's only compile. + using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + DeadCodeEliminationHelper.KeepAliveWithoutBoxing(AlreadyTier0(42)); // Sleep long enough for the tiered call counting to begin. Engine.SleepIfPositive(JitInfo.TieredDelay + JitInfo.TieredDelay); - Func workloadMethod = AlreadyTier0; - - using var observer = JitListener.Create(workloadMethod.Method, enabled: true); - RunJitStageToCompletion(workloadMethod); + RunJitStageToCompletion(workloadMethod, observer); - AssertTierUpOrDeclined(observer); + AssertReachedFinalTier(observer); } // Tests the case of InProcess benchmarking a method that the user already invoked before starting the benchmarks when call counting is delayed. [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] public void JitStage_AlreadyTier0DelayedCallCounting() { - DeadCodeEliminationHelper.KeepAliveWithoutBoxing(AlreadyTier0DelayedCallCounting(42)); Func workloadMethod = AlreadyTier0DelayedCallCounting; - + // Watch from before the pre-invoke, and hand this listener to the stage (see RunJitStageToCompletion). We do + // NOT sleep first: this test's whole point is that the call-counting delay is still pending when the stage + // starts. Because the stage reuses this one listener, the pre-invoke's event is never lost to a second + // listener's session churn, so no wait is needed to observe the final tier. 
using var observer = JitListener.Create(workloadMethod.Method, enabled: true); - RunJitStageToCompletion(workloadMethod); + DeadCodeEliminationHelper.KeepAliveWithoutBoxing(AlreadyTier0DelayedCallCounting(42)); + + RunJitStageToCompletion(workloadMethod, observer); - AssertTierUpOrDeclined(observer); + AssertReachedFinalTier(observer); } // Tests a benchmark method whose own hot loop is On-Stack-Replaced (OSR) mid-execution. Where OSR is enabled @@ -92,9 +88,9 @@ public void JitStage_Osr() using var observer = JitListener.Create(workloadMethod.Method, enabled: true); - RunJitStageToCompletion(workloadMethod); + RunJitStageToCompletion(workloadMethod, observer); - AssertTierUpOrDeclined(observer); + AssertReachedFinalTier(observer); } // Tests a benchmark method that calls (without inlining) a separate method whose hot loop is OSR'd. The listener @@ -112,51 +108,54 @@ public void JitStage_CallsOsr() // calls the OSR'd callee, so the callee should be driven all the way to tier1 too. Watch it independently. using var calleeObserver = JitListener.Create(calleeMethod.Method, enabled: true); - RunJitStageToCompletion(workloadMethod); + RunJitStageToCompletion(workloadMethod, observer); - AssertTierUpOrDeclined(observer); - AssertTierUpOrDeclined(calleeObserver); + AssertReachedFinalTier(observer); + AssertReachedFinalTier(calleeObserver); } - // A pinned optimization level makes a method ineligible for tiered compilation regardless of the assembly, so the - // listener declines to watch it (Create returns null). In an optimized build that attribute is the sole reason; in a - // DisableOptimizations build the assembly excludes it too — either way there is nothing for the listener to observe. 
+ // A method pinned to a single optimization level never tiers, but the listener still watches it and recognizes + // its one-and-only compile as a final tier (MinOptJitted or Optimized) — so the stage observes "done" rather than + // depending on an attribute heuristic to decline up front. + // [MethodImpl(NoOptimization)] pins the method to minopts, so its final tier is MinOptJitted. [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] - public void Create_DeclinesNoOptimizationMethod() + public void JitStage_NoOptimization() { - // [MethodImpl(NoOptimization)] pins the method to minopts, so it never tiers. Func workloadMethod = NoOptimization; - using var listener = JitListener.Create(workloadMethod.Method, enabled: true); - Assert.Null(listener); + + using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + + RunJitStageToCompletion(workloadMethod, observer); + + AssertReachedFinalTier(observer); } + // [MethodImpl(AggressiveOptimization)] pins the method straight to optimized code, so it never goes through + // tier0 -> tier1 and its final tier is Optimized (or OptimizedTier1, depending on runtime). [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] - public void Create_DeclinesAggressiveOptimizationMethod() + public void JitStage_AggressiveOptimization() { - // [MethodImpl(AggressiveOptimization)] pins the method straight to tier1, so it never goes through tier0 -> tier1. Func workloadMethod = AggressiveOptimization; - using var listener = JitListener.Create(workloadMethod.Method, enabled: true); - Assert.Null(listener); + + using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + + RunJitStageToCompletion(workloadMethod, observer); + + AssertReachedFinalTier(observer); } - private static void AssertTierUpOrDeclined(JitListener? observer) + private static void AssertReachedFinalTier(JitListener? 
observer) { - if (OptimizationsDisabled) - { - // The method can't tier, so the listener declines to watch it (Create returns null) and the stage falls - // back to the fixed-delay loop without ever reaching tier1. - Assert.Null(observer); - } - else - { - // requires EventSource support in the test host (it's enabled by default) - Assert.NotNull(observer); - Assert.True(observer!.ReachedTier1, "the jit stage should have driven the benchmark method to tier1"); - } + // No wait needed: the tier-up event is delivered while the stage is still running (it spans hundreds of ms of + // tiering delays), so by the time the stage returns the observer has already recorded the final tier. + Assert.NotNull(observer); + Assert.True(observer.ReachedFinalTier, "the jit stage should have driven the benchmark method to its final tier"); } - private static void RunJitStageToCompletion(Func workloadMethod) + // The test owns the listener and passes it in; the stage uses that exact instance (it never creates its own), so + // there is never a second EventListener whose setup/teardown could flush an event in flight to the test's listener. + private static void RunJitStageToCompletion(Func workloadMethod, JitListener? listener) { // The per-tier publication wait is unbounded, cancellable only via the host's token. 
When the method tiers, the // stage re-bursts until the runtime reports the next-tier compile began, so the wait isn't actually hit — but a @@ -189,7 +188,7 @@ private static void RunJitStageToCompletion(Func workloadMethod) InProcessDiagnoserHandler = new([], host, BenchmarkDotNet.Diagnosers.RunMode.None, null!), }; - var stage = new EngineJitStage(evaluateOverhead: false, parameters); + var stage = new EngineJitStage(evaluateOverhead: false, parameters, listener); var measurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(measurements, out var data)) { diff --git a/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs b/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs index 5ba9625a52..ee5b37f1f6 100644 --- a/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs +++ b/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs @@ -23,7 +23,6 @@ internal MockEngine(ITestOutputHelper output, Job job, Func Date: Tue, 2 Jun 2026 22:46:37 -0400 Subject: [PATCH 6/9] Wait for quiet JIT fallback. Simplify WaitForInitialTieringActive. --- src/BenchmarkDotNet/Engines/EngineJitStage.cs | 179 ++++++++---------- src/BenchmarkDotNet/Engines/JitListener.cs | 146 +++++++++++--- .../JitListenerTests.cs | 20 +- 3 files changed, 204 insertions(+), 141 deletions(-) diff --git a/src/BenchmarkDotNet/Engines/EngineJitStage.cs b/src/BenchmarkDotNet/Engines/EngineJitStage.cs index c6a3263be2..5f06446a03 100644 --- a/src/BenchmarkDotNet/Engines/EngineJitStage.cs +++ b/src/BenchmarkDotNet/Engines/EngineJitStage.cs @@ -1,5 +1,3 @@ -using System.Reflection; -using System.Runtime.CompilerServices; using BenchmarkDotNet.Jobs; using BenchmarkDotNet.Portability; using BenchmarkDotNet.Reports; @@ -18,6 +16,9 @@ internal sealed class EngineJitStage : EngineStage // MethodLoadVerbose publication confirms the tier-up instead of overshooting by re-bursting the whole budget. 
private static readonly TimeSpan EventDeliveryLag = TimeSpan.FromMilliseconds(10); + // How long to wait for the JIT to be quiet (not compiling any tiered methods in the background). + private static readonly TimeSpan JitQuiescenceWindow = TimeSpan.FromMilliseconds(50); + internal bool didStopEarly = false; internal Measurement lastMeasurement; @@ -26,16 +27,19 @@ internal sealed class EngineJitStage : EngineStage // Watches for the method's background tier-up via JIT events so we can proceed as soon as each tier is published. // Null when watching is disabled or EventSource is disabled, in which case we fall back to the fixed delay. private readonly JitListener? listener; + // True when this stage created the listener and must dispose it; false when a caller (a test) injected one it owns. + private readonly bool disposeListener; internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters) - : this(evaluateOverhead, parameters, JitListener.Create(parameters.WorkloadMethod, parameters.EnableJitListener)) + : this(evaluateOverhead, parameters, JitListener.Create(parameters.WorkloadMethod, parameters.EnableJitListener), disposeListener: true) { } - internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters, JitListener? listener) + internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters, JitListener? 
listener, bool disposeListener = false) : base(IterationStage.Jitting, IterationMode.Workload, parameters) { this.listener = listener; + this.disposeListener = disposeListener; enumerator = EnumerateIterations(); this.evaluateOverhead = evaluateOverhead; } @@ -71,7 +75,10 @@ internal override bool GetShouldRunIteration(List measurements, out iterationData = enumerator.Current; return true; } - listener?.Dispose(); + if (disposeListener) + { + listener?.Dispose(); + } enumerator.Dispose(); iterationData = default; return false; @@ -98,37 +105,18 @@ private IEnumerator EnumerateIterations() yield break; } - bool useListener = listener != null; - if (JitInfo.TieredDelay > TimeSpan.Zero) + bool observeMethod = listener != null; + if (observeMethod) { - if (useListener) - { - bool waitForTieringActive = true; - if (!listener!.WaitForTieringActivePrimed(JitInfo.TieredDelay + TimeSpan.FromMilliseconds(50), parameters.Host.CancellationToken)) - { - // If we observed no tier0 JIT (of any method) and no TieredCompilationResume/Pause event within the - // timeout, tiering is quiet in the process — which likely means the watched method was pre-warmed to at - // least tier0 and the listener was possibly created after its tiered compilation was resumed. In that - // case, we force a tier0 JIT which forces a TieredCompilationPause event, which guarantees a followup - // TieredCompilationResume that we can wait on deterministically. - if (!TryForceTier0Jit()) - { - // We couldn't establish that the call-counting delay is (or will be) active, and can't force one. - // Stop trusting the listener's tiering gate for the rest of the stage and fall back to the fixed delay. 
- waitForTieringActive = false; - useListener = false; - Thread.Sleep(JitInfo.TieredDelay); - } - } - if (waitForTieringActive) - { - listener!.WaitForTieringActive(parameters.Host.CancellationToken); - } - } - else - { - Thread.Sleep(JitInfo.TieredDelay); - } + // Before the tier loop, wait until the call-counting delay is inactive so the first burst is counted — + // or, if tiering is quiet because the method was pre-warmed past tier0, fake it and proceed. The first + // invoke above already fired the watched method's Pause if it was tier0. See WaitForInitialTieringActive. + listener!.WaitForInitialTieringActive(parameters.Host.CancellationToken); + } + else if (JitInfo.TieredDelay > TimeSpan.Zero) + { + // Fall back to a fixed wait for the call-counting delay to elapse. + Thread.Sleep(JitInfo.TieredDelay + TimeSpan.FromMilliseconds(10)); } // Long-running early-exit: if a single invocation already takes ~2/3 of IterationTime, this is a long-running @@ -150,17 +138,14 @@ private IEnumerator EnumerateIterations() } // Promote methods to tier1. - for (int remainingTiers = JitInfo.MaxTierPromotions; remainingTiers > 0; --remainingTiers) + for (int remainingTiers = JitInfo.MaxTierPromotions; remainingTiers > 0; --remainingTiers, remainingCalls = JitInfo.TieredCallCountThreshold) { // Run ONE full burst of this tier's call budget, gated so it's counted rather than wasted into a // deferred window. The next tier's publication (a non-tier0 MethodLoadVerbose) is the trustworthy // "the count reached the threshold and the next tier compiled" signal — the persistent per-tier counter // means calls accumulate, so if the burst doesn't tier up we nudge the rest one at a time below rather // than re-bursting the whole budget. 
- if (useListener) - { - listener!.WaitForTieringActive(parameters.Host.CancellationToken); - } + listener?.WaitForTieringActive(parameters.Host.CancellationToken); while (remainingCalls > 0) { // Run the whole tier's call budget in a single iteration unless the user pinned InvocationCount. @@ -171,12 +156,15 @@ private IEnumerator EnumerateIterations() yield return GetWorkloadIterationData(invokeCount); } - if (useListener) + if (observeMethod) { // Background compilation can take an indeterminate amount of time. Ideally we would wait for the MethodJittingStarted event, // but it doesn't carry tier information, so we can't skip it for the async tier0 events (if we try there is a race condition). // The only thing we can do safely is wait for the compilation to complete with a sensible timeout via the MethodLoadVerbose event that carries the tier info. - bool tieredUp = listener!.WaitForPublication(JitInfo.BackgroundCompilationDelay, parameters.Host.CancellationToken); + // If the publication doesn't arrive in the window the tier-up may still be compiling, so wait for the + // JIT to go quiet and re-check (TryQuiescentPublication) before spending nudges. + bool tieredUp = listener!.WaitForPublication(JitInfo.BackgroundCompilationDelay, parameters.Host.CancellationToken) + || TryQuiescentPublication(parameters.Host.CancellationToken); if (!tieredUp) { // Unlikely, but technically possible. The call-counting delay could be active, @@ -190,44 +178,65 @@ private IEnumerator EnumerateIterations() // The method could have been pre-warmed to tier1 before the stage started (e.g. via InProcess toolchains), // which would also hit this case. In that case we will waste time with unnecessary calls, // but it's impossible for us to detect that scenario with the available JIT APIs. - listener!.WaitForTieringActive(parameters.Host.CancellationToken); + listener.WaitForTieringActive(parameters.Host.CancellationToken); long nudgeCalls = hasUserInvocationCount ? 
userInvokeCount : 1; for (long nudged = 0; nudged < JitInfo.TieredCallCountThreshold && !tieredUp; nudged += nudgeCalls) { ++iterationIndex; yield return GetWorkloadIterationData(nudgeCalls); - tieredUp = listener!.WaitForPublication(EventDeliveryLag, parameters.Host.CancellationToken); + tieredUp = listener.WaitForPublication(EventDeliveryLag, parameters.Host.CancellationToken); } - // If a whole threshold of nudges went without a confirmed tier-up, it is most likely the case that the - // method was already pre-warmed to tier1. Wait it out once more just in case, then fallback to the fixed delay. - // We don't bail out here because it's possible the benchmark will call other methods via different control flow - // (e.g. InProcess toolchain with arguments/params). - if (!tieredUp && !listener!.WaitForPublication(JitInfo.BackgroundCompilationDelay, parameters.Host.CancellationToken)) + // If a whole threshold of nudges went without a confirmed tier-up, it is most likely the case that the method + // was already pre-warmed to its final tier. Wait it out once more (and re-check after quiescence) just in case. + if (!tieredUp) { - useListener = false; - // We already invoked and waited 2 stages worth, subtract 1 tier and continue the loop here to not waste an extra stage of unnecessary invocations. - remainingCalls = JitInfo.TieredCallCountThreshold; - --remainingTiers; - continue; + tieredUp = listener.WaitForPublication(JitInfo.BackgroundCompilationDelay, parameters.Host.CancellationToken) + || TryQuiescentPublication(parameters.Host.CancellationToken); } } - if (listener!.ReachedFinalTier) + if (!tieredUp) + { + // The method didn't tier up — most likely it was pre-warmed past where we can still see its events. + // Stop consulting the listener for the method's tier-ups; the remaining budget warms untracked + // callees on the quiescence path below instead. We already invoked and waited ~2 stages' worth, so + // subtract a tier to not waste an extra one. 
We don't bail out entirely because the benchmark may + // call other (un-pre-warmed) methods via different control flow (e.g. an InProcess toolchain with arguments/params). + observeMethod = false; + --remainingTiers; + continue; + } + + if (listener.ReachedFinalTier) { // If the method has reached its final tier we will not receive any more JIT events for it. // In case OSR is enabled and the method calls another method that is OSR'd, a runtime bug causes that other method to duplicate a tier (JitInfo.MaxTierPromotions already accounts for it). // Or the method could have been pre-warmed before the stage started, but the benchmark case uses a different control flow that calls different methods that were not pre-warmed. - // In either case, the listener only tracks the benchmark method, and unknown callees can't be watched, - // so stop consulting it and let the loop run the remaining calculated promotion iterations on the fixed delay. - useListener = false; - listener!.WaitForTieringActive(parameters.Host.CancellationToken); + // In either case, the listener only tracks the benchmark method, and unknown callees can't be watched, so stop consulting it for tier-ups. + // The remaining budget iterations instead keep bursting to push any such callee through its tiers, waiting for the background JIT queue to go quiet (below) + // instead of sleeping the full fixed delay each time. + observeMethod = false; + } + } + else if (listener != null) + { + // We're no longer driving tier-ups of the benchmark method (it reached its final tier, or its tier-up + // couldn't be confirmed — e.g. it was pre-warmed), but the burst may still push untracked callees + // through their tiers. So wait for the background JIT queue to go quiet instead of sleeping the fixed + // delay: while the worker is (or becomes) busy within the window, drain that batch and re-check; once + // it stays idle for a whole window, this tier's callee work is done and we loop to burst the next. 
+ // Tracking busy/idle STATE means a batch already in flight is seen, and a callee that enqueues just + // after the worker momentarily went idle is still caught within the window. + while (listener.WaitForBackgroundJitBusy(JitQuiescenceWindow, parameters.Host.CancellationToken)) + { + listener.WaitForBackgroundJitIdle(parameters.Host.CancellationToken); } } else { + // No listener at all (no tiered JIT, or EventSource unavailable): fall back to the fixed delay. Engine.SleepIfPositive(JitInfo.BackgroundCompilationDelay); } - remainingCalls = JitInfo.TieredCallCountThreshold; } // Empirical evidence shows that the first call after the method is tiered up may take longer, @@ -236,53 +245,15 @@ private IEnumerator EnumerateIterations() yield return GetWorkloadIterationData(userInvokeCount); } - // Throwaway type used only as a generic argument: nesting it (Wrapper>) makes a never-before-seen - // closed type on demand, so each ForceTier0JitTarget instantiation is a distinct MethodDesc that JITs fresh. - private struct Wrapper { } - - // A real (non-trivial, non-inlined) generic method so each value-type instantiation gets its own tier0 JIT — and - // thus runs HandleCallCountingForFirstCall, starting a call-counting delay we can wait on. - [MethodImpl(MethodImplOptions.NoInlining)] - private static long ForceTier0JitTarget(long x) => x * default(T)!.GetHashCode(); - - private static readonly MethodInfo ForceTargetMethod = - typeof(EngineJitStage).GetMethod(nameof(ForceTier0JitTarget), BindingFlags.NonPublic | BindingFlags.Static)!; - private static Type s_nextForceType = typeof(int); - private static readonly object[] BoxedZero = [0L]; - // This (engine) assembly: a forced JIT of ForceTier0JitTarget below would never start a call-counting delay when - // optimizations are disabled here, so we skip forcing in that case. 
- private static readonly bool OptimizationsDisabled = JitListener.AreOptimizationsDisabledFor(typeof(EngineJitStage)); - - // Manufacture a tier0 JIT to start a call-counting delay, for the rare already-tiered method whose own delay - // elapsed before we were listening. Returns false if it couldn't run — the caller then falls back to a fixed sleep - // instead of waiting for a Resume that would never come. - private bool TryForceTier0Jit() + // After a publication-wait times out, the tier-up may simply still be compiling in the background. + // Wait for the JIT worker to go quiet, then re-check for the publication. + private bool TryQuiescentPublication(CancellationToken cancellationToken) { - // In a DisableOptimizations build this assembly's methods are never tier-eligible, so forcing a tier0 JIT here - // is a silent no-op that starts no delay. Bail rather than commit the caller to an unbounded wait for a Resume. - if (OptimizationsDisabled) - { - return false; - } - try - { - Type forceType; - while (true) - { - forceType = Volatile.Read(ref s_nextForceType); - if (Interlocked.CompareExchange(ref s_nextForceType, typeof(Wrapper<>).MakeGenericType(forceType), forceType) == forceType) - { - break; - } - } - ForceTargetMethod.MakeGenericMethod(forceType).Invoke(null, BoxedZero); - return true; - } - catch (Exception e) + if (listener!.WaitForBackgroundJitBusy(JitQuiescenceWindow, cancellationToken)) { - parameters.Host.SendError(e.ToString()); - return false; + listener.WaitForBackgroundJitIdle(cancellationToken); } + return listener.WaitForPublication(TimeSpan.Zero, cancellationToken); } private IterationData GetOverheadIterationData(long invokeCount) diff --git a/src/BenchmarkDotNet/Engines/JitListener.cs b/src/BenchmarkDotNet/Engines/JitListener.cs index f19cd600bb..d098a47beb 100644 --- a/src/BenchmarkDotNet/Engines/JitListener.cs +++ b/src/BenchmarkDotNet/Engines/JitListener.cs @@ -18,10 +18,10 @@ namespace BenchmarkDotNet.Engines; // tier0 compile's start 
is indistinguishable from a tier-up's and would race the tier0 publish that filters it. // * TieredCompilationPause/Resume (the call-counting delay bracket, Compilation keyword) gate the bursts: a burst // issued while the delay is active isn't counted (the counting stub is deferred), so the stage waits until the -// delay is observed inactive — a Resume, when the stubs are installed — before bursting (WaitForTieringActive), and -// up front checks whether any method's tier0 JIT or an already-active delay is underway -// (WaitForTieringActivePrimed) so it can force one if not. These only avoid wasting bursts — correctness -// comes from the publication. +// delay is observed inactive — a Resume, when the stubs are installed — before bursting (WaitForTieringActive). Up +// front (WaitForInitialTieringActive) it waits for any method's tier0 JIT or a Pause/Resume to confirm a Resume is +// coming; if none arrives the method was pre-warmed and its stub is already installed, so it fakes the inactive +// state and proceeds. These only avoid wasting bursts — correctness comes from the publication. // // This is intentionally a per-stage listener: enabling the Jit keyword emits an event for every method jitted // process-wide, which we must NOT pay during the measurement stages. It is created at the start of the jit stage @@ -40,6 +40,12 @@ internal sealed class JitListener : EventListener private const EventKeywords CompilationKeyword = (EventKeywords)0x1000000000; private const string TieredCompilationResumeEvent = "TieredCompilationResume"; private const string TieredCompilationPauseEvent = "TieredCompilationPause"; + // The background tiering worker brackets each batch with these: Start when it begins draining its queue, Stop when + // it finishes (Stop's PendingMethodCount payload is how many remain — 0 = drained). Unlike per-method JIT events + // these fire ONLY for actual tiered background work, so a Start is a clean "an untracked callee is tiering up" + // signal. 
Used to wait out such callees once the watched method itself is fully warmed. + private const string TieredCompilationBackgroundJitStartEvent = "TieredCompilationBackgroundJitStart"; + private const string TieredCompilationBackgroundJitStopEvent = "TieredCompilationBackgroundJitStop"; // Event-name prefix (the runtime appends a version suffix, e.g. MethodLoadVerbose_V2). private const string MethodLoadVerbosePrefix = "MethodLoadVerbose"; @@ -73,6 +79,13 @@ internal sealed class JitListener : EventListener private readonly ManualResetEventSlim publicationSignal = new(false); private readonly ManualResetEventSlim tieringActiveSignal = new(false); private readonly ManualResetEventSlim tieringActivePrimedSignal = new(false); + // Reflect the background tiering worker's STATE (used only after the watched method reaches its final tier, to + // wait out untracked callees). The runtime brackets each batch with BackgroundJitStart..Stop and the worker is + // single-threaded, so these stay complementary: busy while a batch is running, idle otherwise. Tracking state + // rather than a start edge means a batch already in flight when the stage looks is observed — there is no manual + // reset that could wipe a "started" we hadn't seen yet. + private readonly ManualResetEventSlim backgroundJitBusySignal = new(false); + private readonly ManualResetEventSlim backgroundJitIdleSignal = new(true); private volatile bool reachedFinalTier; private volatile bool canObserve; @@ -81,6 +94,7 @@ internal sealed class JitListener : EventListener private int loadTokenIndex = -1; private int loadFlagsIndex = -1; private int loadNameIndex = -1; + private int backgroundJitStopPendingIndex = -1; private JitListener(MethodInfo method) { @@ -90,7 +104,7 @@ private JitListener(MethodInfo method) methodName = method.Name; } - internal static JitListener? Create(MethodInfo method, bool enabled) + internal static JitListener? 
Create(MethodInfo method, bool enabled = true) { if (!enabled || !JitInfo.IsTiered) { @@ -105,19 +119,39 @@ private JitListener(MethodInfo method) return listener; } - internal static bool AreOptimizationsDisabledFor(MemberInfo member) - => member.Module.Assembly.GetCustomAttribute()?.IsJITOptimizerDisabled ?? false; - internal bool ReachedFinalTier => reachedFinalTier; - // Reports (within the timeout) whether the call-counting machinery is active in the process: a tier0 (QuickJitted) - // publication for ANY method, or a TieredCompilation Pause/Resume — any of which guarantees a Resume is coming to - // gate on. The stage checks this once before the tier loop: a false result means tiering is quiet and the watched - // method was likely pre-warmed past tier0, so no delay is coming on its own and one must be forced to get a Resume. - internal bool WaitForTieringActivePrimed(TimeSpan timeout, CancellationToken cancellationToken) - => tieringActivePrimedSignal.Wait(timeout, cancellationToken); + // Waits until the call-counting delay is observed inactive (a TieredCompilationResume), so the stage's first burst + // will be counted. It first waits up to a timeout for any sign the tiering machinery is active — a tier0 (QuickJitted) + // publication for ANY method, or a TieredCompilation Pause/Resume — which guarantees a Resume is coming to gate on. + // (The stage calls this AFTER its first invoke, so a freshly-tier0 watched method has already fired its Pause.) If + // nothing arrives within the timeout, tiering is quiet: the watched method was pre-warmed past tier0, its stub is + // already installed, and no delay is coming on its own — so we fake the active state and proceed. The lock + IsSet + // re-check makes that fake atomic against a real event landing right at the timeout boundary (the event handlers + // take the same lock), so we never overwrite one; and we wait OUTSIDE the lock so the handlers never block on us. 
+ internal void WaitForInitialTieringActive(CancellationToken cancellationToken) + { + // No call-counting delay (e.g. AggressiveTiering) — counting is armed immediately, nothing to gate on. + if (JitInfo.TieredDelay <= TimeSpan.Zero) + { + return; + } + if (!tieringActivePrimedSignal.Wait(JitInfo.TieredDelay + TimeSpan.FromMilliseconds(50), cancellationToken)) + { + lock (tieringActivePrimedSignal) + { + if (!tieringActivePrimedSignal.IsSet) + { + tieringActivePrimedSignal.Set(); + tieringActiveSignal.Set(); + } + } + } + WaitForTieringActive(cancellationToken); + } - // Waits until the call-counting delay is inactive (a TieredCompilationResume was observed). + // Waits until the call-counting delay is inactive (a TieredCompilationResume was observed). Re-gates each burst in + // the tier loop after WaitForInitialTieringActive established the delay was inactive up front. internal void WaitForTieringActive(CancellationToken cancellationToken) { // No call-counting delay (e.g. AggressiveTiering) — counting is armed immediately, nothing to gate on. @@ -140,6 +174,18 @@ internal bool WaitForPublication(TimeSpan timeout, CancellationToken cancellatio return true; } + // Waits (up to the timeout) for the background tiering worker to be running a batch — either one already in flight + // or one that starts within the window. True if it is/becomes busy; false if it stays idle the whole timeout, + // meaning no background tiering is underway (quiet). + internal bool WaitForBackgroundJitBusy(TimeSpan timeout, CancellationToken cancellationToken) + => backgroundJitBusySignal.Wait(timeout, cancellationToken); + + // Waits for the background tiering worker to go idle (its queue drained — a BackgroundJitStop with + // PendingMethodCount == 0). No timeout: the caller only waits after observing the worker busy, and a running + // batch always finishes, so idle is guaranteed to arrive (the host's token still bounds it). 
+ internal void WaitForBackgroundJitIdle(CancellationToken cancellationToken) + => backgroundJitIdleSignal.Wait(cancellationToken); + protected override void OnEventSourceCreated(EventSource source) { if (source.Name == RuntimeEventSourceName) @@ -165,14 +211,32 @@ protected override void OnEventWritten(EventWrittenEventArgs e) // Reset on Pause); tieringActivePrimedSignal just records that some delay activity occurred (set by either). if (name == TieredCompilationResumeEvent) { - tieringActiveSignal.Set(); - tieringActivePrimedSignal.Set(); + lock (tieringActivePrimedSignal) + { + tieringActiveSignal.Set(); + tieringActivePrimedSignal.Set(); + } return; } if (name == TieredCompilationPauseEvent) { - tieringActiveSignal.Reset(); - tieringActivePrimedSignal.Set(); + lock (tieringActivePrimedSignal) + { + tieringActiveSignal.Reset(); + tieringActivePrimedSignal.Set(); + } + return; + } + if (name == TieredCompilationBackgroundJitStartEvent) + { + // The worker began a batch. Reset idle before setting busy so a reader never sees both set at once. + backgroundJitIdleSignal.Reset(); + backgroundJitBusySignal.Set(); + return; + } + if (name == TieredCompilationBackgroundJitStopEvent) + { + HandleBackgroundJitStop(e); return; } @@ -182,6 +246,29 @@ protected override void OnEventWritten(EventWrittenEventArgs e) } } + private void HandleBackgroundJitStop(EventWrittenEventArgs e) + { + var payloadNames = e.PayloadNames; + var payload = e.Payload; + if (payloadNames is null || payload is null) + return; + + if (backgroundJitStopPendingIndex < 0) + { + backgroundJitStopPendingIndex = payloadNames.IndexOf("PendingMethodCount"); + if (backgroundJitStopPendingIndex < 0) + return; + } + + // The worker stopped; once nothing is left queued it has gone idle (its batch — e.g. an OSR'd callee's + // tier-up — is complete). Reset busy before setting idle so a reader never sees both set at once. 
+ if (Convert.ToInt64(payload[backgroundJitStopPendingIndex]) == 0) + { + backgroundJitBusySignal.Reset(); + backgroundJitIdleSignal.Set(); + } + } + private void HandleMethodLoad(EventWrittenEventArgs e) { var payloadNames = e.PayloadNames; @@ -202,22 +289,19 @@ private void HandleMethodLoad(EventWrittenEventArgs e) // A QuickJitted (tier0) publication — for ANY method, not just the one we watch — means an eligible method was // just tier0-compiled and is about to run, so its first call will start or join the call-counting delay and a - // TieredCompilationResume is coming. That is exactly (and all) the up-front gate (WaitForTieringActivePrimed) + // TieredCompilationResume is coming. That is exactly (and all) the up-front gate (WaitForInitialTieringActive) // needs: it only asks "is the tiering machinery active, so a Resume will arrive to gate on?", which is a // process-wide question. (Pause/Resume prime it too; this also covers the brief window before the first call // fires Pause.) The tier0 compile itself is the baseline, not a tier-up, so we never raise a publication for it. if (tier == QuickJittedTier0) { - tieringActivePrimedSignal.Set(); + lock (tieringActivePrimedSignal) + { + tieringActivePrimedSignal.Set(); + } return; } - // Everything below concerns OUR method reaching its next tier, so filter to it. 
- if (Convert.ToInt32(payload[loadTokenIndex]) != metadataToken) - return; - if (payload[loadNameIndex] as string != methodName) - return; - // An OSR publication is not a step on the call-count tier ladder (it fires off a hot loop's back-edge counter, // and the method goes on to be call-count-promoted past it), so don't let it count as a tier-up the stage is // waiting on — otherwise a method that OSRs in multiple bodies overruns the stage's publication budget and @@ -225,6 +309,12 @@ private void HandleMethodLoad(EventWrittenEventArgs e) if (tier == OptimizedTier1OSR) return; + // Everything below concerns OUR method reaching its next tier, so filter to it. + if (Convert.ToInt32(payload[loadTokenIndex]) != metadataToken) + return; + if (payload[loadNameIndex] as string != methodName) + return; + // Any of the runtime's final tiers means the method is fully warmed and will emit no further tier-ups — // whether it tiered all the way up (OptimizedTier1), was compiled straight to optimized code (Optimized), or // never tiers at all (MinOptJitted). 
@@ -241,5 +331,7 @@ public override void Dispose() publicationSignal.Dispose(); tieringActivePrimedSignal.Dispose(); tieringActiveSignal.Dispose(); + backgroundJitBusySignal.Dispose(); + backgroundJitIdleSignal.Dispose(); } } diff --git a/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs b/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs index 59df0b4f80..54c7ce1408 100644 --- a/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs +++ b/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs @@ -15,7 +15,7 @@ public void JitStage_Cold() { Func workloadMethod = Cold; - using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + using var observer = JitListener.Create(workloadMethod.Method); RunJitStageToCompletion(workloadMethod, observer); @@ -28,13 +28,13 @@ public void JitStage_AlreadyTier1() { Func workloadMethod = AlreadyTier1; - using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + using var observer = JitListener.Create(workloadMethod.Method); // The first jit stage brings the method to tier1 (in an optimized build) and our observer records it. Running // the jit stage again for the same (now tier1) method should also succeed; it gets a fresh listener, because // the stage drove the first to completion and reusing one across runs would leave its tiering signals ambiguous. RunJitStageToCompletion(workloadMethod, observer); - using var observer2 = JitListener.Create(workloadMethod.Method, enabled: true); + using var observer2 = JitListener.Create(workloadMethod.Method); RunJitStageToCompletion(workloadMethod, observer2); AssertReachedFinalTier(observer); @@ -47,7 +47,7 @@ public void JitStage_AlreadyTier0() Func workloadMethod = AlreadyTier0; // Watch from before the pre-invoke, and hand this listener to the stage so it doesn't create a second one // (see RunJitStageToCompletion): in a minopt build the pre-invoke is the method's only compile. 
- using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + using var observer = JitListener.Create(workloadMethod.Method); DeadCodeEliminationHelper.KeepAliveWithoutBoxing(AlreadyTier0(42)); // Sleep long enough for the tiered call counting to begin. @@ -67,7 +67,7 @@ public void JitStage_AlreadyTier0DelayedCallCounting() // NOT sleep first: this test's whole point is that the call-counting delay is still pending when the stage // starts. Because the stage reuses this one listener, the pre-invoke's event is never lost to a second // listener's session churn, so no wait is needed to observe the final tier. - using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + using var observer = JitListener.Create(workloadMethod.Method); DeadCodeEliminationHelper.KeepAliveWithoutBoxing(AlreadyTier0DelayedCallCounting(42)); @@ -86,7 +86,7 @@ public void JitStage_Osr() { Func workloadMethod = Osr; - using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + using var observer = JitListener.Create(workloadMethod.Method); RunJitStageToCompletion(workloadMethod, observer); @@ -103,10 +103,10 @@ public void JitStage_CallsOsr() Func workloadMethod = CallsOsr; Func calleeMethod = OsrCallee; - using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + using var observer = JitListener.Create(workloadMethod.Method); // The stage only drives (and the engine's listener only watches) the benchmark method, but every call to it // calls the OSR'd callee, so the callee should be driven all the way to tier1 too. Watch it independently. 
- using var calleeObserver = JitListener.Create(calleeMethod.Method, enabled: true); + using var calleeObserver = JitListener.Create(calleeMethod.Method); RunJitStageToCompletion(workloadMethod, observer); @@ -124,7 +124,7 @@ public void JitStage_NoOptimization() { Func workloadMethod = NoOptimization; - using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + using var observer = JitListener.Create(workloadMethod.Method); RunJitStageToCompletion(workloadMethod, observer); @@ -138,7 +138,7 @@ public void JitStage_AggressiveOptimization() { Func workloadMethod = AggressiveOptimization; - using var observer = JitListener.Create(workloadMethod.Method, enabled: true); + using var observer = JitListener.Create(workloadMethod.Method); RunJitStageToCompletion(workloadMethod, observer); From 02e7826f529bb4c87e63779bb8c7e3c83e7b5ca8 Mon Sep 17 00:00:00 2001 From: Tim Cassell Date: Wed, 3 Jun 2026 00:46:26 -0400 Subject: [PATCH 7/9] Handle dropped events. Reduce event noise from the framework. --- src/BenchmarkDotNet/Engines/EngineJitStage.cs | 10 ++- src/BenchmarkDotNet/Engines/JitListener.cs | 74 ++++++++++++++----- 2 files changed, 65 insertions(+), 19 deletions(-) diff --git a/src/BenchmarkDotNet/Engines/EngineJitStage.cs b/src/BenchmarkDotNet/Engines/EngineJitStage.cs index 5f06446a03..3232a6bf3a 100644 --- a/src/BenchmarkDotNet/Engines/EngineJitStage.cs +++ b/src/BenchmarkDotNet/Engines/EngineJitStage.cs @@ -1,3 +1,4 @@ +using BenchmarkDotNet.Attributes.CompilerServices; using BenchmarkDotNet.Jobs; using BenchmarkDotNet.Portability; using BenchmarkDotNet.Reports; @@ -9,6 +10,7 @@ namespace BenchmarkDotNet.Engines; // and we purposefully don't spend too much time in this stage, so we can't guarantee it. // This should succeed for 99%+ of microbenchmarks. For any sufficiently short benchmarks where this fails, // the following stages (Pilot and Warmup) will likely take it the rest of the way. Long-running benchmarks may never fully reach tier1. 
+[AggressivelyOptimizeMethods] // Reduce JIT event noise from the jit stage itself. internal sealed class EngineJitStage : EngineStage { // After a tier's single burst fails to tier-up, we nudge one invocation at a time and wait out the async @@ -19,6 +21,10 @@ internal sealed class EngineJitStage : EngineStage // How long to wait for the JIT to be quiet (not compiling any tiered methods in the background). private static readonly TimeSpan JitQuiescenceWindow = TimeSpan.FromMilliseconds(50); + // How long to wait for an observed-busy background JIT batch to drain before assuming its BackgroundJitStop was + // dropped by EventPipe and proceeding. Generous — it only bites on a dropped event; a real drain completes sooner. + private static readonly TimeSpan BackgroundJitDrainTimeout = TimeSpan.FromSeconds(10); + internal bool didStopEarly = false; internal Measurement lastMeasurement; @@ -229,7 +235,7 @@ private IEnumerator EnumerateIterations() // after the worker momentarily went idle is still caught within the window. 
while (listener.WaitForBackgroundJitBusy(JitQuiescenceWindow, parameters.Host.CancellationToken)) { - listener.WaitForBackgroundJitIdle(parameters.Host.CancellationToken); + listener.WaitForBackgroundJitIdle(BackgroundJitDrainTimeout, parameters.Host.CancellationToken); } } else @@ -251,7 +257,7 @@ private bool TryQuiescentPublication(CancellationToken cancellationToken) { if (listener!.WaitForBackgroundJitBusy(JitQuiescenceWindow, cancellationToken)) { - listener.WaitForBackgroundJitIdle(cancellationToken); + listener.WaitForBackgroundJitIdle(BackgroundJitDrainTimeout, cancellationToken); } return listener.WaitForPublication(TimeSpan.Zero, cancellationToken); } diff --git a/src/BenchmarkDotNet/Engines/JitListener.cs b/src/BenchmarkDotNet/Engines/JitListener.cs index d098a47beb..51738b8c0f 100644 --- a/src/BenchmarkDotNet/Engines/JitListener.cs +++ b/src/BenchmarkDotNet/Engines/JitListener.cs @@ -1,5 +1,6 @@ using System.Diagnostics.Tracing; using System.Reflection; +using BenchmarkDotNet.Attributes.CompilerServices; using BenchmarkDotNet.Portability; namespace BenchmarkDotNet.Engines; @@ -31,6 +32,7 @@ namespace BenchmarkDotNet.Engines; // EventSource is unavailable — it can be disabled via the System.Diagnostics.Tracing.EventSource.IsSupported feature // switch. It otherwise watches the method regardless of whether it looks tier-eligible: a method that can't tier just // publishes its single final tier (see the tier constants below), which the stage observes and treats as "done". +[AggressivelyOptimizeMethods] // Reduce JIT event noise from the listener itself. 
internal sealed class JitListener : EventListener { private const string RuntimeEventSourceName = "Microsoft-Windows-DotNETRuntime"; @@ -74,8 +76,15 @@ internal sealed class JitListener : EventListener private const int OptimizedTier1 = 4; private const int OptimizedTier1OSR = 5; + // Margin added on top of the call-counting delay when waiting for a TieredCompilationResume, before assuming it + // was dropped (EventPipe sheds events under buffer pressure) and proceeding as if the delay had elapsed. We add it + // to TieredDelay rather than use a flat cap so a deliberately huge delay can't make the cap shorter than the delay + // itself. Generous vs the ~100ms default delay, so it only ever fires on an actual drop, not on the normal path. + private static readonly TimeSpan TieringActiveTimeoutMargin = TimeSpan.FromSeconds(1); + private readonly int metadataToken; private readonly string methodName; + private readonly object lockObj = new(); private readonly ManualResetEventSlim publicationSignal = new(false); private readonly ManualResetEventSlim tieringActiveSignal = new(false); private readonly ManualResetEventSlim tieringActivePrimedSignal = new(false); @@ -138,7 +147,7 @@ internal void WaitForInitialTieringActive(CancellationToken cancellationToken) } if (!tieringActivePrimedSignal.Wait(JitInfo.TieredDelay + TimeSpan.FromMilliseconds(50), cancellationToken)) { - lock (tieringActivePrimedSignal) + lock (lockObj) { if (!tieringActivePrimedSignal.IsSet) { @@ -151,13 +160,25 @@ internal void WaitForInitialTieringActive(CancellationToken cancellationToken) } // Waits until the call-counting delay is inactive (a TieredCompilationResume was observed). Re-gates each burst in - // the tier loop after WaitForInitialTieringActive established the delay was inactive up front. + // the tier loop after WaitForInitialTieringActive established the delay was inactive up front. 
Bounded: a Resume can + // be dropped by EventPipe under buffer pressure, so rather than block forever we wait up to TieredDelay plus a margin and + // then assume the delay elapsed (stubs installed) and proceed — the same fallback WaitForInitialTieringActive uses. + // The cap only bites on a dropped event; a real Resume normally arrives within the call-counting delay (~100ms). internal void WaitForTieringActive(CancellationToken cancellationToken) { // No call-counting delay (e.g. AggressiveTiering) — counting is armed immediately, nothing to gate on. - if (JitInfo.TieredDelay > TimeSpan.Zero) + if (JitInfo.TieredDelay <= TimeSpan.Zero) { - tieringActiveSignal.Wait(Timeout.InfiniteTimeSpan, cancellationToken); + return; + } + if (!tieringActiveSignal.Wait(JitInfo.TieredDelay + TieringActiveTimeoutMargin, cancellationToken)) + { + // The primed signal is already set (WaitForInitialTieringActive ran first), so only flip the active + // signal. Lock so this can't interleave with a concurrent Pause/Resume handler. + lock (lockObj) + { + tieringActiveSignal.Set(); + } } } @@ -180,11 +201,24 @@ internal bool WaitForPublication(TimeSpan timeout, CancellationToken cancellatio internal bool WaitForBackgroundJitBusy(TimeSpan timeout, CancellationToken cancellationToken) => backgroundJitBusySignal.Wait(timeout, cancellationToken); - // Waits for the background tiering worker to go idle (its queue drained — a BackgroundJitStop with - // PendingMethodCount == 0). No timeout: the caller only waits after observing the worker busy, and a running - // batch always finishes, so idle is guaranteed to arrive (the host's token still bounds it). - internal void WaitForBackgroundJitIdle(CancellationToken cancellationToken) - => backgroundJitIdleSignal.Wait(cancellationToken); + // Waits (up to the timeout) for the background tiering worker to go idle (its queue drained — a BackgroundJitStop + // with PendingMethodCount == 0). 
The caller only waits after observing the worker busy, and a running batch always + // finishes — but the BackgroundJitStop that announces it can be dropped by EventPipe under buffer pressure (most + // likely right here, since a busy drain floods the same buffers with MethodLoadVerbose). So on timeout we force the + // idle state and proceed: this wait is only best-effort warming of untracked callees, and leaving busy stuck set + // would also poison every later quiescence check. The cap only bites on a dropped Stop. + internal void WaitForBackgroundJitIdle(TimeSpan timeout, CancellationToken cancellationToken) + { + if (!backgroundJitIdleSignal.Wait(timeout, cancellationToken)) + { + lock (lockObj) + { + // Reset busy before setting idle so a reader never sees both set at once (matches HandleBackgroundJitStop). + backgroundJitBusySignal.Reset(); + backgroundJitIdleSignal.Set(); + } + } + } protected override void OnEventSourceCreated(EventSource source) { @@ -211,7 +245,7 @@ protected override void OnEventWritten(EventWrittenEventArgs e) // Reset on Pause); tieringActivePrimedSignal just records that some delay activity occurred (set by either). if (name == TieredCompilationResumeEvent) { - lock (tieringActivePrimedSignal) + lock (lockObj) { tieringActiveSignal.Set(); tieringActivePrimedSignal.Set(); @@ -220,7 +254,7 @@ protected override void OnEventWritten(EventWrittenEventArgs e) } if (name == TieredCompilationPauseEvent) { - lock (tieringActivePrimedSignal) + lock (lockObj) { tieringActiveSignal.Reset(); tieringActivePrimedSignal.Set(); @@ -229,9 +263,12 @@ protected override void OnEventWritten(EventWrittenEventArgs e) } if (name == TieredCompilationBackgroundJitStartEvent) { - // The worker began a batch. Reset idle before setting busy so a reader never sees both set at once. - backgroundJitIdleSignal.Reset(); - backgroundJitBusySignal.Set(); + lock (lockObj) + { + // The worker began a batch. 
Reset idle before setting busy so a reader never sees both set at once. + backgroundJitIdleSignal.Reset(); + backgroundJitBusySignal.Set(); + } return; } if (name == TieredCompilationBackgroundJitStopEvent) @@ -264,8 +301,11 @@ private void HandleBackgroundJitStop(EventWrittenEventArgs e) // tier-up — is complete). Reset busy before setting idle so a reader never sees both set at once. if (Convert.ToInt64(payload[backgroundJitStopPendingIndex]) == 0) { - backgroundJitBusySignal.Reset(); - backgroundJitIdleSignal.Set(); + lock (lockObj) + { + backgroundJitBusySignal.Reset(); + backgroundJitIdleSignal.Set(); + } } } @@ -295,7 +335,7 @@ private void HandleMethodLoad(EventWrittenEventArgs e) // fires Pause.) The tier0 compile itself is the baseline, not a tier-up, so we never raise a publication for it. if (tier == QuickJittedTier0) { - lock (tieringActivePrimedSignal) + lock (lockObj) { tieringActivePrimedSignal.Set(); } From ae07d7b7a302f8409f33a6b01bec39d1b4098c12 Mon Sep 17 00:00:00 2001 From: Tim Cassell Date: Sat, 20 Jun 2026 16:07:18 -0400 Subject: [PATCH 8/9] Fix some race conditions. Add function pointer return test. --- src/BenchmarkDotNet/Engines/EngineJitStage.cs | 2 +- src/BenchmarkDotNet/Engines/JitListener.cs | 42 +++++++++++-------- .../ValuesReturnedByBenchmarkTest.cs | 5 +++ 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/src/BenchmarkDotNet/Engines/EngineJitStage.cs b/src/BenchmarkDotNet/Engines/EngineJitStage.cs index 3232a6bf3a..0506d559b9 100644 --- a/src/BenchmarkDotNet/Engines/EngineJitStage.cs +++ b/src/BenchmarkDotNet/Engines/EngineJitStage.cs @@ -167,7 +167,7 @@ private IEnumerator EnumerateIterations() // Background compilation can take an indeterminate amount of time. Ideally we would wait for the MethodJittingStarted event, // but it doesn't carry tier information, so we can't skip it for the async tier0 events (if we try there is a race condition). 
// The only thing we can do safely is wait for the compilation to complete with a sensible timeout via the MethodLoadVerbose event that carries the tier info. - // If the publication doesn't arrive in the window the tier-up may still be compiling, so wait for the + // If the publication doesn't arrive in the window, the tier-up may still be compiling, so wait for the // JIT to go quiet and re-check (TryQuiescentPublication) before spending nudges. bool tieredUp = listener!.WaitForPublication(JitInfo.BackgroundCompilationDelay, parameters.Host.CancellationToken) || TryQuiescentPublication(parameters.Host.CancellationToken); diff --git a/src/BenchmarkDotNet/Engines/JitListener.cs b/src/BenchmarkDotNet/Engines/JitListener.cs index 51738b8c0f..3c7693ca3a 100644 --- a/src/BenchmarkDotNet/Engines/JitListener.cs +++ b/src/BenchmarkDotNet/Engines/JitListener.cs @@ -85,19 +85,15 @@ internal sealed class JitListener : EventListener private readonly int metadataToken; private readonly string methodName; private readonly object lockObj = new(); - private readonly ManualResetEventSlim publicationSignal = new(false); + private readonly SemaphoreSlim publicationSignal = new(0); private readonly ManualResetEventSlim tieringActiveSignal = new(false); private readonly ManualResetEventSlim tieringActivePrimedSignal = new(false); - // Reflect the background tiering worker's STATE (used only after the watched method reaches its final tier, to - // wait out untracked callees). The runtime brackets each batch with BackgroundJitStart..Stop and the worker is - // single-threaded, so these stay complementary: busy while a batch is running, idle otherwise. Tracking state - // rather than a start edge means a batch already in flight when the stage looks is observed — there is no manual - // reset that could wipe a "started" we hadn't seen yet. 
private readonly ManualResetEventSlim backgroundJitBusySignal = new(false); private readonly ManualResetEventSlim backgroundJitIdleSignal = new(true); private volatile bool reachedFinalTier; private volatile bool canObserve; + private bool disposed; // Cached payload indices (field order is stable within a process for a given event version). private int loadTokenIndex = -1; @@ -183,17 +179,10 @@ internal void WaitForTieringActive(CancellationToken cancellationToken) } // Waits for a new tier publication (a non-tier0 MethodLoadVerbose for the method) — i.e. the latest burst drove - // the method to its next tier and the runtime published it. True if one arrived before the timeout; false otherwise. + // the method to its next tier and the runtime published it. Consumes one queued publication permit. True if one + // arrived before the timeout; false otherwise. Wait(TimeSpan.Zero, ...) is a non-blocking probe (TryQuiescentPublication). internal bool WaitForPublication(TimeSpan timeout, CancellationToken cancellationToken) - { - if (!publicationSignal.Wait(timeout, cancellationToken)) - { - return false; - } - // Reset for the next tier. We can't use AutoResetEvent because it doesn't support CancellationToken. - publicationSignal.Reset(); - return true; - } + => publicationSignal.Wait(timeout, cancellationToken); // Waits (up to the timeout) for the background tiering worker to be running a batch — either one already in flight // or one that starts within the window. 
True if it is/becomes busy; false if it stays idle the whole timeout, @@ -247,6 +236,8 @@ protected override void OnEventWritten(EventWrittenEventArgs e) { lock (lockObj) { + if (disposed) + return; tieringActiveSignal.Set(); tieringActivePrimedSignal.Set(); } @@ -256,6 +247,8 @@ protected override void OnEventWritten(EventWrittenEventArgs e) { lock (lockObj) { + if (disposed) + return; tieringActiveSignal.Reset(); tieringActivePrimedSignal.Set(); } @@ -265,6 +258,8 @@ protected override void OnEventWritten(EventWrittenEventArgs e) { lock (lockObj) { + if (disposed) + return; // The worker began a batch. Reset idle before setting busy so a reader never sees both set at once. backgroundJitIdleSignal.Reset(); backgroundJitBusySignal.Set(); @@ -303,6 +298,8 @@ private void HandleBackgroundJitStop(EventWrittenEventArgs e) { lock (lockObj) { + if (disposed) + return; backgroundJitBusySignal.Reset(); backgroundJitIdleSignal.Set(); } @@ -337,6 +334,8 @@ private void HandleMethodLoad(EventWrittenEventArgs e) { lock (lockObj) { + if (disposed) + return; tieringActivePrimedSignal.Set(); } return; @@ -361,11 +360,20 @@ private void HandleMethodLoad(EventWrittenEventArgs e) if (tier == OptimizedTier1 || tier == Optimized || tier == MinOptJitted) reachedFinalTier = true; - publicationSignal.Set(); + lock (lockObj) + { + if (disposed) + return; + publicationSignal.Release(); + } } public override void Dispose() { + lock (lockObj) + { + disposed = true; + } // base.Dispose disables the events we enabled (when no other listener wants them). 
base.Dispose(); publicationSignal.Dispose(); diff --git a/tests/BenchmarkDotNet.IntegrationTests/ValuesReturnedByBenchmarkTest.cs b/tests/BenchmarkDotNet.IntegrationTests/ValuesReturnedByBenchmarkTest.cs index 6c35a91748..77c0fc4567 100644 --- a/tests/BenchmarkDotNet.IntegrationTests/ValuesReturnedByBenchmarkTest.cs +++ b/tests/BenchmarkDotNet.IntegrationTests/ValuesReturnedByBenchmarkTest.cs @@ -107,6 +107,11 @@ public class Job { } [Benchmark] public unsafe int* PointerToUnmanagedType() => (int*)System.IntPtr.Zero.ToPointer(); + [Benchmark] + public unsafe delegate* FunctionPointer() => &ReturnArgument; + + private static int ReturnArgument(int value) => value; + [Benchmark] public System.IntPtr IntPtr() => System.IntPtr.Zero; From 21a7d8ca5cbee02c95d4912dd60083ea779010be Mon Sep 17 00:00:00 2001 From: Tim Cassell Date: Sat, 20 Jun 2026 16:59:35 -0400 Subject: [PATCH 9/9] Make WorkloadMethod optional. Speed up EnumerateStagesTests. --- src/BenchmarkDotNet/Engines/Engine.cs | 2 +- src/BenchmarkDotNet/Engines/EngineJitStage.cs | 12 ++-- .../Engines/EngineParameters.cs | 9 +-- src/BenchmarkDotNet/Engines/EngineStage.cs | 6 +- src/BenchmarkDotNet/Engines/JitListener.cs | 4 +- .../Templates/BenchmarkType.txt | 60 +++++++++++-------- .../Engine/EnumerateStagesTests.cs | 23 ++++--- .../Shared/Mocks/MockEngine.cs | 2 +- 8 files changed, 64 insertions(+), 54 deletions(-) diff --git a/src/BenchmarkDotNet/Engines/Engine.cs b/src/BenchmarkDotNet/Engines/Engine.cs index 8fc2bac853..e794af08d0 100644 --- a/src/BenchmarkDotNet/Engines/Engine.cs +++ b/src/BenchmarkDotNet/Engines/Engine.cs @@ -34,7 +34,7 @@ internal Engine(EngineParameters engineParameters) var job = engineParameters.TargetJob ?? throw new ArgumentNullException(nameof(EngineParameters.TargetJob)); Parameters = new() { - WorkloadMethod = engineParameters.WorkloadMethod ?? 
throw new ArgumentNullException(nameof(EngineParameters.WorkloadMethod)), + WorkloadMethod = engineParameters.WorkloadMethod, WorkloadActionNoUnroll = engineParameters.WorkloadActionNoUnroll ?? throw new ArgumentNullException(nameof(EngineParameters.WorkloadActionNoUnroll)), WorkloadActionUnroll = engineParameters.WorkloadActionUnroll ?? throw new ArgumentNullException(nameof(EngineParameters.WorkloadActionUnroll)), OverheadActionNoUnroll = engineParameters.OverheadActionNoUnroll ?? throw new ArgumentNullException(nameof(EngineParameters.OverheadActionNoUnroll)), diff --git a/src/BenchmarkDotNet/Engines/EngineJitStage.cs b/src/BenchmarkDotNet/Engines/EngineJitStage.cs index 0506d559b9..87f6df0471 100644 --- a/src/BenchmarkDotNet/Engines/EngineJitStage.cs +++ b/src/BenchmarkDotNet/Engines/EngineJitStage.cs @@ -30,24 +30,26 @@ internal sealed class EngineJitStage : EngineStage private readonly IEnumerator enumerator; private readonly bool evaluateOverhead; + private readonly bool skipDelays; // Watches for the method's background tier-up via JIT events so we can proceed as soon as each tier is published. // Null when watching is disabled or EventSource is disabled, in which case we fall back to the fixed delay. private readonly JitListener? listener; // True when this stage created the listener and must dispose it; false when a caller (a test) injected one it owns. private readonly bool disposeListener; - internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters) - : this(evaluateOverhead, parameters, JitListener.Create(parameters.WorkloadMethod, parameters.EnableJitListener), disposeListener: true) + internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters, bool skipDelays) + : this(evaluateOverhead, parameters, JitListener.Create(parameters.WorkloadMethod), disposeListener: true, skipDelays: skipDelays) { } - internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters, JitListener? 
listener, bool disposeListener = false) + internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters, JitListener? listener, bool disposeListener = false, bool skipDelays = false) : base(IterationStage.Jitting, IterationMode.Workload, parameters) { this.listener = listener; this.disposeListener = disposeListener; enumerator = EnumerateIterations(); this.evaluateOverhead = evaluateOverhead; + this.skipDelays = skipDelays; } internal override List GetMeasurementList() => new(GetMaxMeasurementCount()); @@ -119,7 +121,7 @@ private IEnumerator EnumerateIterations() // invoke above already fired the watched method's Pause if it was tier0. See WaitForInitialTieringActive. listener!.WaitForInitialTieringActive(parameters.Host.CancellationToken); } - else if (JitInfo.TieredDelay > TimeSpan.Zero) + else if (!skipDelays && JitInfo.TieredDelay > TimeSpan.Zero) { // Fall back to a fixed wait for the call-counting delay to elapse. Thread.Sleep(JitInfo.TieredDelay + TimeSpan.FromMilliseconds(10)); @@ -238,7 +240,7 @@ private IEnumerator EnumerateIterations() listener.WaitForBackgroundJitIdle(BackgroundJitDrainTimeout, parameters.Host.CancellationToken); } } - else + else if (!skipDelays) { // No listener at all (no tiered JIT, or EventSource unavailable): fall back to the fixed delay. Engine.SleepIfPositive(JitInfo.BackgroundCompilationDelay); diff --git a/src/BenchmarkDotNet/Engines/EngineParameters.cs b/src/BenchmarkDotNet/Engines/EngineParameters.cs index 7d4d2f7bb4..93ecdb160e 100644 --- a/src/BenchmarkDotNet/Engines/EngineParameters.cs +++ b/src/BenchmarkDotNet/Engines/EngineParameters.cs @@ -20,14 +20,9 @@ public class EngineParameters /// /// The benchmark method, used by the jit stage to watch for its tier-up via JIT events. + /// If , falls back to a fixed delay. /// - public required MethodInfo WorkloadMethod { get; set; } - - /// - /// Whether the jit stage may watch JIT tier-up events. 
Disabled by the stage-enumeration unit - /// tests, which drive the stage with mock (non-executing) workloads that never raise events. - /// - internal bool EnableJitListener { get; set; } = true; + public required MethodInfo? WorkloadMethod { get; set; } public long OperationsPerInvoke { get; set; } = 1; public required Func GlobalSetupAction { get; set; } diff --git a/src/BenchmarkDotNet/Engines/EngineStage.cs b/src/BenchmarkDotNet/Engines/EngineStage.cs index 7d1333f390..8b23bfacc4 100644 --- a/src/BenchmarkDotNet/Engines/EngineStage.cs +++ b/src/BenchmarkDotNet/Engines/EngineStage.cs @@ -15,7 +15,9 @@ internal abstract class EngineStage(IterationStage stage, IterationMode mode, En internal abstract bool GetShouldRunIteration(List measurements, out IterationData iterationData); [MethodImpl(MethodImplOptions.NoInlining)] - internal static IEnumerable EnumerateStages(EngineParameters parameters) + // skipJitDelays is used by EnumerateStagesTests to skip waiting when it's only testing the stage logic, not real JIT compilation. + // Real JIT compilation is tested in JitListenerTests. + internal static IEnumerable EnumerateStages(EngineParameters parameters, bool skipJitDelays = false) { var strategy = parameters.TargetJob.ResolveValue(RunMode.RunStrategyCharacteristic, parameters.Resolver); var invokeCount = parameters.TargetJob.ResolveValue(RunMode.InvocationCountCharacteristic, parameters.Resolver, 1); @@ -31,7 +33,7 @@ internal static IEnumerable EnumerateStages(EngineParameters parame int minInvokeCount = parameters.TargetJob.ResolveValue(AccuracyMode.MinInvokeCountCharacteristic, parameters.Resolver); // AOT technically doesn't have a JIT, but we run jit stage regardless because of static constructors. 
#2004 - var jitStage = new EngineJitStage(evaluateOverhead, parameters); + var jitStage = new EngineJitStage(evaluateOverhead, parameters, skipJitDelays); yield return jitStage; bool hasUnrollFactor = parameters.TargetJob.HasValue(RunMode.UnrollFactorCharacteristic); diff --git a/src/BenchmarkDotNet/Engines/JitListener.cs b/src/BenchmarkDotNet/Engines/JitListener.cs index 3c7693ca3a..f733ac8f06 100644 --- a/src/BenchmarkDotNet/Engines/JitListener.cs +++ b/src/BenchmarkDotNet/Engines/JitListener.cs @@ -109,9 +109,9 @@ private JitListener(MethodInfo method) methodName = method.Name; } - internal static JitListener? Create(MethodInfo method, bool enabled = true) + internal static JitListener? Create(MethodInfo? method) { - if (!enabled || !JitInfo.IsTiered) + if (method is null || !JitInfo.IsTiered) { return null; } diff --git a/src/BenchmarkDotNet/Templates/BenchmarkType.txt b/src/BenchmarkDotNet/Templates/BenchmarkType.txt index ade82eba79..b5d113e19d 100644 --- a/src/BenchmarkDotNet/Templates/BenchmarkType.txt +++ b/src/BenchmarkDotNet/Templates/BenchmarkType.txt @@ -39,7 +39,7 @@ global::BenchmarkDotNet.Engines.EngineParameters engineParameters = new global::BenchmarkDotNet.Engines.EngineParameters() { Host = host, - WorkloadMethod = instance.__ResolveWorkloadMethod(), + WorkloadMethod = instance.__ResolveWorkloadMethod(host), WorkloadActionUnroll = instance.WorkloadActionUnroll, WorkloadActionNoUnroll = instance.WorkloadActionNoUnroll, OverheadActionNoUnroll = instance.OverheadActionNoUnroll, @@ -72,36 +72,48 @@ $DeclareFieldsContainer$ - private global::System.Reflection.MethodInfo __ResolveWorkloadMethod() + private global::System.Reflection.MethodInfo __ResolveWorkloadMethod(global::BenchmarkDotNet.Engines.IHost host) { - global::System.Type[] parameterTypes = $WorkloadMethodParameterTypes$; - foreach (global::System.Reflection.MethodInfo candidate in typeof($WorkloadTypeName$).GetMethods( - global::System.Reflection.BindingFlags.Instance | 
global::System.Reflection.BindingFlags.Static | - global::System.Reflection.BindingFlags.Public | global::System.Reflection.BindingFlags.NonPublic)) + // Best-effort: the jit stage uses the resolved method to watch its JIT tier-up events, and falls back to a + // fixed delay when WorkloadMethod is null. So neither a missed match nor a reflection failure (e.g. a + // same-named overload's parameter type fails to load) may break the benchmark — report and return null. + try { - if (candidate.Name != "$WorkloadMethodName$") + global::System.Type[] parameterTypes = $WorkloadMethodParameterTypes$; + foreach (global::System.Reflection.MethodInfo candidate in typeof($WorkloadTypeName$).GetMethods( + global::System.Reflection.BindingFlags.Instance | global::System.Reflection.BindingFlags.Static | + global::System.Reflection.BindingFlags.Public | global::System.Reflection.BindingFlags.NonPublic)) { - continue; - } - global::System.Reflection.ParameterInfo[] parameters = candidate.GetParameters(); - if (parameters.Length != parameterTypes.Length) - { - continue; - } - global::System.Boolean isMatch = true; - for (global::System.Int32 i = 0; i < parameters.Length; i++) - { - if (parameters[i].ParameterType != parameterTypes[i]) + if (candidate.Name != "$WorkloadMethodName$") { - isMatch = false; - break; + continue; + } + global::System.Reflection.ParameterInfo[] parameters = candidate.GetParameters(); + if (parameters.Length != parameterTypes.Length) + { + continue; + } + global::System.Boolean isMatch = true; + for (global::System.Int32 i = 0; i < parameters.Length; i++) + { + if (parameters[i].ParameterType != parameterTypes[i]) + { + isMatch = false; + break; + } + } + if (isMatch) + { + return candidate; } } - if (isMatch) - { - return candidate; - } } + catch (global::System.Exception e) + { + host.SendError($"Exception during __ResolveWorkloadMethod!{(global::System.Environment.NewLine)}{e}"); + return null; + } + host.WriteLine("// Could not resolve the benchmark 
method '$WorkloadMethodName$' to watch JIT tier-up events; the jit stage will fall back to a fixed delay."); return null; } diff --git a/tests/BenchmarkDotNet.Tests/Engine/EnumerateStagesTests.cs b/tests/BenchmarkDotNet.Tests/Engine/EnumerateStagesTests.cs index 33f484f22f..d0c26aec7d 100644 --- a/tests/BenchmarkDotNet.Tests/Engine/EnumerateStagesTests.cs +++ b/tests/BenchmarkDotNet.Tests/Engine/EnumerateStagesTests.cs @@ -31,7 +31,7 @@ public void JobsThatDontRequireJittingSkipJitStage(string jobName) var engineParameters = CreateEngineParameters(job); bool didRunStages = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { Assert.True(stage is not EngineJitStage); didRunStages = true; @@ -47,7 +47,7 @@ public void DefaultSettingsVeryTimeConsumingBenchmarksAreExecutedOncePerIteratio var engineParameters = CreateEngineParameters(Job.Default); bool didRunActualStage = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { Assert.NotEqual(IterationMode.Overhead, stage.Mode); @@ -81,7 +81,7 @@ public void BenchmarksThatRunLongerThanIterationTimeOnlyDuringFirstInvocationAre var engineParameters = CreateEngineParameters(Job.Default.WithIterationTime(TimeInterval.FromMilliseconds(iterationTime))); bool didRunActualStage = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -119,7 +119,7 @@ private void AssertUnroll(Job job) var engineParameters = CreateEngineParameters(job); bool didRunUnroll = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage 
in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -150,7 +150,7 @@ public void JobWithExplicitInvocationCount(long invocationCount) // A short measurement encourages the JIT stage to batch many invocations into a single iteration, // which is the regression introduced by #2806. var fastMeasurement = TimeInterval.FromMicroseconds(1); - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -177,7 +177,7 @@ public void LongRunningBenchmarksExitJitStageEarly() var engineParameters = CreateEngineParameters(job); int jitWorkloadCount = 0; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -210,7 +210,7 @@ public void SlowFirstIterationButFastSteadyStateDoesNotExitJitStageEarly() var engineParameters = CreateEngineParameters(Job.Default.WithInvocationCount(1).WithUnrollFactor(1)); int jitWorkloadCount = 0; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -247,7 +247,7 @@ public void ForceJitTieringModeRunsFullTieringLoopEvenForLongRunningBenchmarks() int jitWorkloadCount = 0; bool didStopEarly = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in 
EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -282,7 +282,7 @@ public void SkipJitTieringModeSkipsTierPromotion() int jitWorkloadCount = 0; bool didStopEarly = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -316,7 +316,7 @@ public void MediumTimeConsumingBenchmarksStartPilotFrom2AndIncrementItWithEveryS var engineParameters = CreateEngineParameters(Job.Default); bool didRunPilotStage = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -347,8 +347,7 @@ private EngineParameters CreateEngineParameters(Job job) Func> emptyAction = (_, _) => new(default(ClockSpan)); return new() { - WorkloadMethod = emptyAction.Method, - EnableJitListener = false, + WorkloadMethod = null, GlobalSetupAction = () => new(), GlobalCleanupAction = () => new(), Host = host, diff --git a/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs b/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs index ee5b37f1f6..1929667221 100644 --- a/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs +++ b/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs @@ -22,7 +22,7 @@ internal MockEngine(ITestOutputHelper output, Job job, Func