diff --git a/src/BenchmarkDotNet/Code/DeclarationsProvider.cs b/src/BenchmarkDotNet/Code/DeclarationsProvider.cs index 107c2bcb6a..155fba85ee 100644 --- a/src/BenchmarkDotNet/Code/DeclarationsProvider.cs +++ b/src/BenchmarkDotNet/Code/DeclarationsProvider.cs @@ -34,6 +34,8 @@ public SmartStringBuilder ReplaceTemplate(SmartStringBuilder smartStringBuilder) return ReplaceCore(smartStringBuilder) .Replace("$DisassemblerEntryMethodImpl$", GetWorkloadMethodCall(GetPassArgumentsDirect())) .Replace("$OperationsPerInvoke$", Descriptor.OperationsPerInvoke.ToString()) + .Replace("$WorkloadMethodName$", Descriptor.WorkloadMethod.Name) + .Replace("$WorkloadMethodParameterTypes$", GetWorkloadMethodParameterTypes()) .Replace("$WorkloadTypeName$", Descriptor.Type.GetCorrectCSharpTypeName()); } @@ -108,6 +110,26 @@ protected string GetPassArguments() .Select((parameter, index) => $"{CodeGenerator.GetParameterModifier(parameter)} arg{index}") ); + // Renders the benchmark method's parameter types as a Type[] for __ResolveWorkloadMethods to match overloads + // exactly. Each is a typeof(...) of the element type, re-wrapping by-ref/pointer via reflection (typeof can't + // express `T&`), so resolution never has to name the method's (possibly unspellable) return type. 
+ private string GetWorkloadMethodParameterTypes() + { + var parameters = Descriptor.WorkloadMethod.GetParameters(); + if (parameters.Length == 0) + return "global::System.Array.Empty()"; + return $"new global::System.Type[] {{ {string.Join(", ", parameters.Select(p => GetTypeOfExpression(p.ParameterType)))} }}"; + } + + private static string GetTypeOfExpression(System.Type type) + { + if (type.IsByRef) + return $"{GetTypeOfExpression(type.GetElementType()!)}.MakeByRefType()"; + if (type.IsPointer) + return $"{GetTypeOfExpression(type.GetElementType()!)}.MakePointerType()"; + return $"typeof({type.GetCorrectCSharpTypeName()})"; + } + protected string GetPassArgumentsDirect() => string.Join( ", ", diff --git a/src/BenchmarkDotNet/Engines/Engine.cs b/src/BenchmarkDotNet/Engines/Engine.cs index a2c4e9eb4b..269280937e 100644 --- a/src/BenchmarkDotNet/Engines/Engine.cs +++ b/src/BenchmarkDotNet/Engines/Engine.cs @@ -34,6 +34,7 @@ internal Engine(EngineParameters engineParameters) var job = engineParameters.TargetJob ?? throw new ArgumentNullException(nameof(EngineParameters.TargetJob)); Parameters = new() { + WorkloadMethods = engineParameters.WorkloadMethods ?? throw new ArgumentNullException(nameof(EngineParameters.WorkloadMethods)), WorkloadActionNoUnroll = engineParameters.WorkloadActionNoUnroll ?? throw new ArgumentNullException(nameof(EngineParameters.WorkloadActionNoUnroll)), WorkloadActionUnroll = engineParameters.WorkloadActionUnroll ?? throw new ArgumentNullException(nameof(EngineParameters.WorkloadActionUnroll)), OverheadActionNoUnroll = engineParameters.OverheadActionNoUnroll ?? 
throw new ArgumentNullException(nameof(EngineParameters.OverheadActionNoUnroll)), diff --git a/src/BenchmarkDotNet/Engines/EngineJitStage.cs b/src/BenchmarkDotNet/Engines/EngineJitStage.cs index 1e61e8315e..005e569e03 100644 --- a/src/BenchmarkDotNet/Engines/EngineJitStage.cs +++ b/src/BenchmarkDotNet/Engines/EngineJitStage.cs @@ -1,3 +1,4 @@ +using BenchmarkDotNet.Attributes.CompilerServices; using BenchmarkDotNet.Jobs; using BenchmarkDotNet.Portability; using BenchmarkDotNet.Reports; @@ -9,29 +10,49 @@ namespace BenchmarkDotNet.Engines; // and we purposefully don't spend too much time in this stage, so we can't guarantee it. // This should succeed for 99%+ of microbenchmarks. For any sufficiently short benchmarks where this fails, // the following stages (Pilot and Warmup) will likely take it the rest of the way. Long-running benchmarks may never fully reach tier1. +[AggressivelyOptimizeMethods] // Reduce JIT event noise from the jit stage itself. internal sealed class EngineJitStage : EngineStage { - // Jit call counting delay is only for when the app starts up. We don't need to wait for every benchmark if multiple benchmarks are ran in-process. - private static TimeSpan s_tieredDelay = JitInfo.TieredDelay; + // After a tier's single burst fails to tier-up, we nudge one invocation at a time, giving the background worker a + // short window (~10ms) to pick each nudge up before trying the next — so we stop the instant the tier-up lands + // instead of overshooting by re-bursting the whole budget. Passed to WaitForTierUp as its busy-wait timeout. 
+ private static readonly TimeSpan EventDeliveryLag = TimeSpan.FromMilliseconds(10); internal bool didStopEarly = false; internal Measurement lastMeasurement; private readonly IEnumerator enumerator; private readonly bool evaluateOverhead; + private readonly bool skipDelays; + // Watches the benchmark method(s)' background tier-up via JIT events so we can proceed once the JIT goes quiet after + // each tier (the whole call tree warmed), instead of waiting a fixed delay. Null when there's nothing to watch or + // EventSource is disabled, in which case we fall back to the fixed delay. + private readonly JitListener? listener; + // True when this stage created the listener and must dispose it; false when a caller (a test) injected one it owns. + private readonly bool disposeListener; + + internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters, bool skipDelays) + : this(evaluateOverhead, parameters, JitListener.Create(parameters.WorkloadMethods), disposeListener: true, skipDelays: skipDelays) + { + } - internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters) : base(IterationStage.Jitting, IterationMode.Workload, parameters) + internal EngineJitStage(bool evaluateOverhead, EngineParameters parameters, JitListener? listener, bool disposeListener = false, bool skipDelays = false) + : base(IterationStage.Jitting, IterationMode.Workload, parameters) { + this.listener = listener; + this.disposeListener = disposeListener; enumerator = EnumerateIterations(); this.evaluateOverhead = evaluateOverhead; + this.skipDelays = skipDelays; } internal override List GetMeasurementList() => new(GetMaxMeasurementCount()); private int GetMaxMeasurementCount() { + int nudgeMultiplier = JitInfo.TieredDelay > TimeSpan.Zero ? 2 : 1; int count = JitInfo.IsTiered - ? JitInfo.MaxTierPromotions * JitInfo.TieredCallCountThreshold + 2 + ? 
JitInfo.MaxTierPromotions * JitInfo.TieredCallCountThreshold * nudgeMultiplier + 2 : 1; if (evaluateOverhead) { @@ -44,7 +65,7 @@ internal override bool GetShouldRunIteration(List measurements, out { if (measurements.Count > 0) { - var measurement = measurements[measurements.Count - 1]; + var measurement = measurements[^1]; if (measurement.IterationMode == IterationMode.Workload) { lastMeasurement = measurement; @@ -55,6 +76,10 @@ internal override bool GetShouldRunIteration(List measurements, out iterationData = enumerator.Current; return true; } + if (disposeListener) + { + listener?.Dispose(); + } enumerator.Dispose(); iterationData = default; return false; @@ -81,16 +106,23 @@ private IEnumerator EnumerateIterations() yield break; } - // Wait enough time for jit call counting to begin. - Engine.SleepIfPositive(s_tieredDelay); - // Don't make the next jit stage wait if it's ran in the same process. - s_tieredDelay = TimeSpan.Zero; + bool observeMethod = listener != null; + if (observeMethod) + { + // Before the tier loop, wait until the call-counting delay is inactive so the first burst is counted — + // or, if tiering is quiet because the method was pre-warmed past tier0, fake it and proceed. The first + // invoke above already fired the watched method's Pause if it was tier0. See WaitForInitialTieringActive. + listener!.WaitForInitialTieringActive(parameters.Host.CancellationToken); + } + else if (!skipDelays && JitInfo.TieredDelay > TimeSpan.Zero) + { + // Fall back to a fixed wait for the call-counting delay to elapse. + Thread.Sleep(JitInfo.TieredDelay + TimeSpan.FromMilliseconds(10)); + } - // If the first iteration suggests a long-running benchmark (a single invocation already - // takes ~2/3 of IterationTime or more), run one confirmation iteration and bail out if - // it agrees. Same cutoff value that pilot stage uses. 
- // We do not bail out immediately if the first iteration is long-running because it could - // be due to cctors or other lazy initialization that won't be hit in steady-state. #2004 + // Long-running early-exit: if a single invocation already takes ~2/3 of IterationTime, this is a long-running + // benchmark — bail and let the Pilot/Warmup stages finish tiering. The first invoke can be inflated by JIT or + // cctors, so confirm with one more iteration before bailing (it could be a one-time cost). #2004 // JitTieringMode.Force opts out of this heuristic and always promotes through every tier. TimeInterval iterationTime = parameters.TargetJob.ResolveValue(RunMode.IterationTimeCharacteristic, parameters.Resolver); long remainingCalls = JitInfo.TieredCallCountThreshold; @@ -104,12 +136,18 @@ private IEnumerator EnumerateIterations() didStopEarly = true; yield break; } - remainingCalls -= userInvokeCount; } // Promote methods to tier1. - for (int remainingTiers = JitInfo.MaxTierPromotions; remainingTiers > 0; --remainingTiers) + for (int tierCount = 0; tierCount < JitInfo.MaxTierPromotions; ++tierCount, remainingCalls = JitInfo.TieredCallCountThreshold) { + // Run ONE full burst of this tier's call budget, gated so it's counted rather than wasted into a + // deferred window. After it, wait for the background JIT to go QUIET (WaitForQuiescentTierUp): once the + // worker is idle, this tier's compiles — the watched method(s) AND their untracked callees — have all + // landed, so the next burst / the following stage won't race them. The per-tier counter persists, so if + // the burst didn't tier the watched method(s) up we nudge the rest one at a time below rather than + // re-bursting the whole budget. + listener?.WaitForTieringActive(parameters.Host.CancellationToken); while (remainingCalls > 0) { // Run the whole tier's call budget in a single iteration unless the user pinned InvocationCount. 
@@ -120,8 +158,71 @@ private IEnumerator EnumerateIterations() yield return GetWorkloadIterationData(invokeCount); } - Engine.SleepIfPositive(JitInfo.BackgroundCompilationDelay); - remainingCalls = JitInfo.TieredCallCountThreshold; + if (listener != null) + { + // Wait for the background JIT to go quiet (the watched method(s) and their callees settle), then read + // whether the watched method(s) actually advanced this burst. Once we've stopped observing them the + // advanced result is ignored, but this still drains untracked-callee tier-ups before the next burst. + bool advanced = listener.WaitForQuiescentTierUp(tierCount, parameters.Host.CancellationToken); + if (observeMethod) + { + if (!advanced) + { + // The burst didn't tier the watched method(s) up. With NO call-counting delay, the burst's whole + // budget was counted, so a miss means they were pre-warmed past this tier (or are otherwise + // unobservable) — nudging can't help, so stop consulting the listener for them. We don't bail out + // entirely because the benchmark may call other (un-pre-warmed) methods via different control + // flow (e.g. an InProcess toolchain with arguments/params); the remaining bursts warm those + // callees. + if (JitInfo.TieredDelay <= TimeSpan.Zero) + { + observeMethod = false; + continue; + } + + // Otherwise the call-counting delay was probably active for the first ~10ms of the burst due to event + // delivery lag, so some invocations didn't count and we just need a few more. Re-bursting the whole + // budget would overshoot wastefully (up to threshold * call-time), so nudge one invocation at a time, + // detecting the tier-up cheaply (WaitForTierUp, no full quiescence settle per nudge), then + // settle once at the end so this tier's callees are warm. + listener.WaitForTieringActive(parameters.Host.CancellationToken); + long nudgeCalls = hasUserInvocationCount ? 
userInvokeCount : 1; + for (long nudged = 0; nudged < JitInfo.TieredCallCountThreshold && !advanced; nudged += nudgeCalls) + { + ++iterationIndex; + yield return GetWorkloadIterationData(nudgeCalls); + advanced = listener.WaitForTierUp(tierCount, EventDeliveryLag, parameters.Host.CancellationToken); + } + // Settle the callees pushed by the nudges (and re-read the tier state race-free); this also + // catches a tier-up whose publication arrived just after the last cheap WaitForTierUp window. + advanced = listener.WaitForQuiescentTierUp(tierCount, parameters.Host.CancellationToken); + if (!advanced) + { + // Even nudging didn't tier them up — most likely pre-warmed to their final tier before the + // stage started (e.g. via InProcess toolchains). Stop consulting the listener (same as the + // no-delay case above). We already spent ~2 tiers' worth here, so skip a tier (the extra + // ++tierCount on top of the loop's) so we don't overspend the budget. + observeMethod = false; + ++tierCount; + continue; + } + } + + if (listener.ReachedFinalTier) + { + // ReachedFinalTier is the aggregate: every watched method is fully warmed, so we will not + // receive any more tier-up JIT events for the method(s) we track. (OSR adds a quirk: a runtime + // bug double-instruments an OSR'd callee, which JitInfo.MaxTierPromotions already budgets for.) + // Keep bursting to push any untracked callees through their tiers — the quiescence wait above + // handles them — but stop consulting the listener for the watched method(s). + observeMethod = false; + } + } + } + else if (!skipDelays) + { + // No listener (nothing to watch, or EventSource unavailable), fall back to the fixed delay. 
+ Engine.SleepIfPositive(JitInfo.BackgroundCompilationDelay); + } } // Empirical evidence shows that the first call after the method is tiered up may take longer, diff --git a/src/BenchmarkDotNet/Engines/EngineParameters.cs b/src/BenchmarkDotNet/Engines/EngineParameters.cs index 7d7dab4b21..a496cfa116 100644 --- a/src/BenchmarkDotNet/Engines/EngineParameters.cs +++ b/src/BenchmarkDotNet/Engines/EngineParameters.cs @@ -1,3 +1,4 @@ +using System.Reflection; using BenchmarkDotNet.Characteristics; using BenchmarkDotNet.Jobs; using BenchmarkDotNet.Running; @@ -16,6 +17,13 @@ public class EngineParameters public required Func> OverheadActionNoUnroll { get; set; } public required Func> OverheadActionUnroll { get; set; } public Job TargetJob { get; set; } = Job.Default; + + /// + /// The benchmark method(s), used by the jit stage to watch for their tier-up via JIT events. + /// When empty (nothing to watch, or resolution failed), the jit stage falls back to a fixed delay. + /// + public required IEnumerable WorkloadMethods { get; set; } + public long OperationsPerInvoke { get; set; } = 1; public required Func GlobalSetupAction { get; set; } public required Func GlobalCleanupAction { get; set; } diff --git a/src/BenchmarkDotNet/Engines/EngineStage.cs b/src/BenchmarkDotNet/Engines/EngineStage.cs index 7d1333f390..8b23bfacc4 100644 --- a/src/BenchmarkDotNet/Engines/EngineStage.cs +++ b/src/BenchmarkDotNet/Engines/EngineStage.cs @@ -15,7 +15,9 @@ internal abstract class EngineStage(IterationStage stage, IterationMode mode, En internal abstract bool GetShouldRunIteration(List measurements, out IterationData iterationData); [MethodImpl(MethodImplOptions.NoInlining)] - internal static IEnumerable EnumerateStages(EngineParameters parameters) + // skipJitDelays is used by EnumerateStagesTests to skip waiting when it's only testing the stage logic, not real JIT compilation. + // Real JIT compilation is tested in JitListenerTests. 
+ internal static IEnumerable EnumerateStages(EngineParameters parameters, bool skipJitDelays = false) { var strategy = parameters.TargetJob.ResolveValue(RunMode.RunStrategyCharacteristic, parameters.Resolver); var invokeCount = parameters.TargetJob.ResolveValue(RunMode.InvocationCountCharacteristic, parameters.Resolver, 1); @@ -31,7 +33,7 @@ internal static IEnumerable EnumerateStages(EngineParameters parame int minInvokeCount = parameters.TargetJob.ResolveValue(AccuracyMode.MinInvokeCountCharacteristic, parameters.Resolver); // AOT technically doesn't have a JIT, but we run jit stage regardless because of static constructors. #2004 - var jitStage = new EngineJitStage(evaluateOverhead, parameters); + var jitStage = new EngineJitStage(evaluateOverhead, parameters, skipJitDelays); yield return jitStage; bool hasUnrollFactor = parameters.TargetJob.HasValue(RunMode.UnrollFactorCharacteristic); diff --git a/src/BenchmarkDotNet/Engines/JitListener.cs b/src/BenchmarkDotNet/Engines/JitListener.cs new file mode 100644 index 0000000000..884cf2d138 --- /dev/null +++ b/src/BenchmarkDotNet/Engines/JitListener.cs @@ -0,0 +1,500 @@ +using System.Diagnostics.Tracing; +using System.Reflection; +using BenchmarkDotNet.Attributes.CompilerServices; +using BenchmarkDotNet.Portability; + +namespace BenchmarkDotNet.Engines; + +// Observes background JIT tier-up of one or more (benchmark) methods by listening to the runtime's JIT events +// in-process, so the jit stage can proceed as soon as the call tree is actually warmed instead of waiting a fixed +// delay. The runtime only announces transitions (there is no API to poll a method's current tier), so we must be +// listening while they happen. A single listener can watch several methods at once — ReachedFinalTier is the aggregate +// (true only once EVERY watched method has reached its final tier). 
(Today the stage watches just the benchmark method; +// watching multiple is in place so scenarios that drive several methods need no contract change. #147) +// +// The core signal is JIT QUIESCENCE, not the individual tier-up. A burst tiers up the watched method AND its (untracked) +// callees on the same background worker; proceeding the instant the watched method publishes tier1 would leave its +// callees still compiling and race them into the next burst / the following stage. So each tier the stage waits for the +// background worker to go idle (WaitForQuiescentTierUp): once it's quiet the whole tree reached this tier, and the +// watched methods' tier counts can be read race-free. +// +// The events it watches, and their roles: +// * TieredCompilationBackgroundJitStart/Stop (Compilation keyword) bracket the background tiering worker draining its +// queue — Start when it begins, Stop when it finishes (Stop's PendingMethodCount payload is how many remain; 0 = +// drained). These fire ONLY for actual tiered background work, so they are how we detect quiescence: a burst's +// methods and their callees tier up in a train of back-to-back batches, and WaitForQuiescentTierUp waits for the +// worker to be idle and STAY idle for a short settle window. A batch that began-and-finished before we looked is not +// lost — it already bumped the tier counts, which we only read after observing the worker idle (see MethodLoadVerbose). +// * MethodLoadVerbose (per-method, JIT keyword) reports each tier publication and carries the tier. A non-tier0 load +// for a watched method bumps its tier-up count (so callers can detect it advanced beyond a given tier) and, when the +// tier is a final one, marks it done. We deliberately do NOT use MethodJittingStarted (compile-began): it carries no +// tier, so the tier0 compile's start is indistinguishable from a tier-up's and would race the tier0 publish that +// filters it. 
+// * TieredCompilationPause/Resume (the tiering delay bracket, Compilation keyword) bound the call-counting delay. Two +// roles: (1) a burst issued while the delay is active isn't counted (the counting stub is deferred), so the stage +// waits until the delay is observed inactive — a Resume — before bursting (WaitForTieringActive); up front +// (WaitForInitialTieringActive) it waits for any method's tier0 JIT or a Pause/Resume to confirm a Resume is coming, +// and if none arrives the method was pre-warmed so it fakes the inactive state and proceeds. (2) While the delay is +// active the background worker is paused — it won't compile even already-enqueued tier-ups until the Resume — so +// "worker idle" during a pause is NOT quiescence. WaitForQuiescentTierUp therefore calls WaitForTieringActive each +// loop turn to wait the pause out before timing the idle settle window. +// +// The busy/idle state is a ManualResetEventSlim pair; the per-method tier/completion counts are mutated under syncRoot +// (which OnEventWritten already holds) but declared volatile so the waiters read them lock-free — their ordering comes +// from draining the in-flight batch first, not from the lock. So the waiters are lock-free (the events handle their own +// timeouts and cancellation), and WaitForTieringActive composes naturally into the quiescence loop. +// +// This is intentionally a per-stage listener: enabling the Jit keyword emits an event for every method jitted +// process-wide, which we must NOT pay during the measurement stages. It is created at the start of the jit stage +// and disposed at the end. +// +// Create returns null (and the caller falls back to the fixed delay) when the runtime has no tiered JIT, or when +// EventSource is unavailable — it can be disabled via the System.Diagnostics.Tracing.EventSource.IsSupported feature +// switch. 
It otherwise watches each method regardless of whether it looks tier-eligible: a method that can't tier just +// publishes its single final tier (see the tier constants below), which the stage observes and treats as "done". +[AggressivelyOptimizeMethods] // Reduce JIT event noise from the listener itself. +internal sealed class JitListener : EventListener +{ + private const string RuntimeEventSourceName = "Microsoft-Windows-DotNETRuntime"; + private const EventKeywords JitKeyword = (EventKeywords)0x10; + // The "Compilation" keyword carries the TieredCompilation/Pause|Resume and BackgroundJit Start/Stop events. Low + // volume (a handful per delay/batch cycle), so enabling it adds no meaningful cost. + private const EventKeywords CompilationKeyword = (EventKeywords)0x1000000000; + private const string TieredCompilationResumeEvent = "TieredCompilationResume"; + private const string TieredCompilationPauseEvent = "TieredCompilationPause"; + // The background tiering worker brackets each batch with these: Start when it begins draining its queue, Stop when + // it finishes (Stop's PendingMethodCount payload is how many remain — 0 = drained). They are the quiescence signal. + private const string TieredCompilationBackgroundJitStartEvent = "TieredCompilationBackgroundJitStart"; + private const string TieredCompilationBackgroundJitStopEvent = "TieredCompilationBackgroundJitStop"; + // Event-name prefix (the runtime appends a version suffix, e.g. MethodLoadVerbose_V2). + private const string MethodLoadVerbosePrefix = "MethodLoadVerbose"; + + // Optimization tier is packed into MethodFlags bits [7..9]: (MethodFlags >> 7) & 0x7. + // The initial tier0 quick compile is QuickJitted = 3; the intermediate instrumented (PGO) publication reports + // another value and just counts as "a recompilation happened". 
A method is fully warmed once it reaches one of + // the runtime's FINAL tiers — those from which no further tier-up is coming: + // * OptimizedTier1 = 4 — the usual steady state for a tier-eligible method. + // * Optimized = 2 (NativeCodeVersion::OptimizationTierOptimized) — a method compiled straight to optimized code + // without a tier1 promotion: AggressiveOptimization, or a method with a loop when TC_QuickJitForLoops is off. + // * MinOptJitted = 1 — a method that never tiers at all: NoOptimization, or any method in an + // optimization-disabled assembly. This is its first and only compile. + // Since Create now watches every method (not just ones that look tier-eligible), a non-tiering method publishes + // exactly one of MinOptJitted/Optimized and we recognize it as final immediately, rather than predicting it from + // attributes. OptimizedTier1OSR = 5 is special: an on-stack-replacement of a still-running body with a hot loop. + // It fires off the loop's back-edge counter, NOT off the call-count threshold, so unlike every other tier it is + // never the method's active entry-point code version and is never call-counted — it's orthogonal to the + // call-count tier ladder the stage drives, and a watched method that OSRs in both its tier0 and instrumented + // bodies emits two of them on the way to its final tier. We therefore ignore OSR publications for our method (see + // HandleMethodLoad) so they don't inflate its tier count and stall the stage short of the final tier. 
+ private const int OptimizationTierShift = 7; + private const int OptimizationTierMask = 0x7; + private const int MinOptJitted = 1; + private const int Optimized = 2; + private const int QuickJittedTier0 = 3; + private const int OptimizedTier1 = 4; + private const int OptimizedTier1OSR = 5; + + // Margin added on top of the call-counting delay when waiting for a TieredCompilationResume, before assuming it + // was dropped (EventPipe sheds events under buffer pressure) and proceeding as if the delay had elapsed. We add it + // to TieredDelay rather than use a flat cap so a deliberately huge delay can't make the cap shorter than the delay + // itself. Generous vs the ~100ms default delay, so it only ever fires on an actual drop, not on the normal path. + private static readonly TimeSpan TieringActiveTimeoutMargin = TimeSpan.FromSeconds(1); + + // How long the background worker must stay idle (no new batch) for us to declare quiescence. A burst's methods and + // their callees tier up in a TRAIN of back-to-back background batches (a few tens of ms apart), so the window has to + // bridge those gaps and only conclude "quiet" once a full window passes with no new batch. 30ms comfortably spans + // the inter-batch gap while keeping the per-tier settle cost small. + private static readonly TimeSpan QuiescenceSettleWindow = TimeSpan.FromMilliseconds(30); + // How long to wait for an observed-busy background JIT batch to drain before assuming its BackgroundJitStop was + // dropped (EventPipe sheds events under pressure) and proceeding. Generous — it only bites on a dropped Stop, and a + // large compile queue can legitimately take a while; leaving "busy" stuck would poison every later quiescence check. + private static readonly TimeSpan BackgroundJitDrainTimeout = TimeSpan.FromSeconds(10); + + // One entry per watched method. Small (a handful at most), so HandleMethodLoad scans it linearly per publication. 
+ private readonly WatchedMethod[] watchedMethods; + private readonly object syncRoot = new(); + private readonly ManualResetEventSlim tieringActiveSignal = new(false); + private readonly ManualResetEventSlim tieringActivePrimedSignal = new(false); + // The background tiering worker's busy/idle state (Start..Stop-with-0-pending). Kept as a paired flip-flop so a + // reader never sees both set at once. Set/Reset under syncRoot. + private readonly ManualResetEventSlim backgroundJitBusySignal = new(false); + private readonly ManualResetEventSlim backgroundJitIdleSignal = new(true); + + // Number of watched methods that have reached a final tier (guarded by syncRoot). reachedFinalTier mirrors + // "finalTierCount == watchedMethods.Length" for a lock-free read; both flip together once every method is done. + private int finalTierCount; + private volatile bool reachedFinalTier; + private volatile bool canObserve; + private bool disposed; + + // Cached payload indices (field order is stable within a process for a given event version). + private int loadTokenIndex = -1; + private int loadFlagsIndex = -1; + private int loadNameIndex = -1; + private int backgroundJitStopPendingIndex = -1; + + private JitListener(WatchedMethod[] methods) + { + // NOTE: the base EventListener ctor calls OnEventSourceCreated before this field is set, but that callback only + // enables events / probes canObserve and never reads watchedMethods. + watchedMethods = methods; + } + + // Watches every method in the collection. Returns null — so the caller falls back to the fixed delay — when there + // is nothing to watch, the runtime has no tiered JIT, or EventSource is unavailable. + internal static JitListener? 
Create(IEnumerable methods) + { + if (!JitInfo.IsTiered) + { + return null; + } + var watched = methods.Select(m => new WatchedMethod(m)).ToArray(); + if (watched.Length == 0) + { + return null; + } + var listener = new JitListener(watched); + if (!listener.canObserve) + { + listener.Dispose(); + return null; + } + return listener; + } + + // True only once EVERY watched method has reached a final tier. + internal bool ReachedFinalTier => reachedFinalTier; + + // Waits until the call-counting delay is observed inactive (a TieredCompilationResume), so the stage's first burst + // will be counted. It first waits up to a timeout for any sign the tiering machinery is active — a tier0 (QuickJitted) + // publication for ANY method, or a TieredCompilation Pause/Resume — which guarantees a Resume is coming to gate on. + // (The stage calls this AFTER its first invoke, so a freshly-tier0 watched method has already fired its Pause.) If + // nothing arrives within the timeout, tiering is quiet: the watched method was pre-warmed past tier0, its stub is + // already installed, and no delay is coming on its own — so we fake the active state and proceed. The lock + IsSet + // re-check makes that fake atomic against a real event landing right at the timeout boundary (the event handlers + // take the same lock), so we never overwrite one; and we wait OUTSIDE the lock so the handlers never block on us. + internal void WaitForInitialTieringActive(CancellationToken cancellationToken) + { + // No call-counting delay (e.g. AggressiveTiering) — counting is armed immediately, nothing to gate on. 
+ if (JitInfo.TieredDelay <= TimeSpan.Zero) + { + return; + } + if (!tieringActivePrimedSignal.Wait(JitInfo.TieredDelay + TimeSpan.FromMilliseconds(50), cancellationToken)) + { + lock (syncRoot) + { + if (!tieringActivePrimedSignal.IsSet) + { + tieringActivePrimedSignal.Set(); + tieringActiveSignal.Set(); + } + } + } + WaitForTieringActive(cancellationToken); + } + + // Waits until the call-counting delay is inactive (a TieredCompilationResume was observed). Re-gates each burst in + // the tier loop after WaitForInitialTieringActive established the delay was inactive up front. Bounded: a Resume can + // be dropped by EventPipe under buffer pressure, so rather than block forever we wait up to TieredDelay plus a margin and + // then assume the delay elapsed (stubs installed) and proceed — the same fallback WaitForInitialTieringActive uses. + // The cap only bites on a dropped event; a real Resume normally arrives within the call-counting delay (~100ms). + internal void WaitForTieringActive(CancellationToken cancellationToken) + { + // No call-counting delay (e.g. AggressiveTiering) — counting is armed immediately, nothing to gate on. + if (JitInfo.TieredDelay <= TimeSpan.Zero) + { + return; + } + if (!tieringActiveSignal.Wait(JitInfo.TieredDelay + TieringActiveTimeoutMargin, cancellationToken)) + { + // The primed signal is already set (WaitForInitialTieringActive ran first), so only flip the active + // signal. Lock so this can't interleave with a concurrent Pause/Resume handler. + lock (syncRoot) + { + tieringActiveSignal.Set(); + } + } + } + + // Waits for the background tiering worker to go quiet, then reports whether every still-tiering watched method has + // advanced beyond `previousTierCounter` tier-ups (or already reached its final tier). 
Quiescence is what makes + // proceeding safe: when the worker is idle, this tier's compiles (the watched method(s) AND their untracked callees, + // which tier up in a train of back-to-back batches) have all landed, so the next burst / the following stage won't + // race them. We wait for the worker to be idle and STAY idle for a settle window — a new batch within the window (a + // callee tiering up, or the watched-method batch starting after a slow enqueue) wakes us to drain it and re-settle; + // a full window with no new batch means the tree is warm. We read the (volatile) tier counts only AFTER observing + // the worker idle, so a batch that started-and-finished before we looked has already bumped them. (When the caller + // has stopped observing the watched methods it ignores the result and just uses this to drain untracked callees.) + internal bool WaitForQuiescentTierUp(int previousTierCounter, CancellationToken cancellationToken) + { + while (true) + { + // Wait out any tiering pause first: while paused, the worker won't compile even already-enqueued methods, so + // "idle" isn't quiescent — WaitForTieringActive returns once counting is active again (the delay elapsed / + // a Resume was observed). In the common case (not paused) it returns immediately. + WaitForTieringActive(cancellationToken); + // Wait the settle window for the worker to (re)start a batch. A burst's methods and their callees tier up in + // a train of back-to-back batches, so if one starts we drain it and re-check; if none starts within the + // window the tree has settled. + if (!WaitForBackgroundJitBusy(QuiescenceSettleWindow, cancellationToken)) + { + return AllAdvanced(previousTierCounter); + } + WaitForBackgroundJitIdle(BackgroundJitDrainTimeout, cancellationToken); + } + } + + // Waits up to the timeout for every still-tiering watched method to advance beyond `previousTierCounter`, WITHOUT + // waiting out a full settle window. 
Used to cheaply detect a tier-up while nudging one call at a time: it returns + // immediately if they've already advanced, otherwise waits out any pause and gives the worker a short window + // (jitBusyTimeout) to pick the nudge up, draining it if it does. The caller does a final WaitForQuiescentTierUp + // afterwards to settle callees. True if all advanced; false otherwise. + internal bool WaitForTierUp(int previousTierCounter, TimeSpan jitBusyTimeout, CancellationToken cancellationToken) + { + if (AllAdvanced(previousTierCounter)) + { + return true; + } + WaitForTieringActive(cancellationToken); + if (WaitForBackgroundJitBusy(jitBusyTimeout, cancellationToken)) + { + WaitForBackgroundJitIdle(BackgroundJitDrainTimeout, cancellationToken); + } + return AllAdvanced(previousTierCounter); + } + + // Waits up to the timeout for the background tiering worker to be running a batch. True if it is/becomes busy. + private bool WaitForBackgroundJitBusy(TimeSpan timeout, CancellationToken cancellationToken) + => backgroundJitBusySignal.Wait(timeout, cancellationToken); + + // Waits (up to the timeout) for the background tiering worker to go idle (its queue drained — a BackgroundJitStop + // with PendingMethodCount == 0). The caller only waits after observing the worker busy, and a running batch always + // finishes — but the BackgroundJitStop can be dropped by EventPipe under buffer pressure (most likely right here, + // since a busy drain floods the same buffers with MethodLoadVerbose). So on timeout we force the idle state and + // proceed: leaving "busy" stuck would poison every later quiescence check. The cap only bites on a dropped Stop. + private void WaitForBackgroundJitIdle(TimeSpan timeout, CancellationToken cancellationToken) + { + if (!backgroundJitIdleSignal.Wait(timeout, cancellationToken)) + { + lock (syncRoot) + { + // Reset busy before setting idle so a reader never sees both set at once (matches HandleBackgroundJitStop). 
+ backgroundJitBusySignal.Reset(); + backgroundJitIdleSignal.Set(); + } + } + } + + // Whether every watched method either advanced its tier count or already reached its final tier. + private bool AllAdvanced(int previousTierCount) + { + for (int i = 0; i < watchedMethods.Length; i++) + { + var method = watchedMethods[i]; + if (!method.reachedFinalTier && method.tierUpCount <= previousTierCount) + { + return false; + } + } + return true; + } + + protected override void OnEventSourceCreated(EventSource source) + { + if (source.Name == RuntimeEventSourceName) + { + EnableEvents(source, EventLevel.Verbose, JitKeyword | CompilationKeyword); + // IsEnabled is true only when EventSource is supported AND the enable actually took effect + // at the level/keyword we need. + canObserve = source.IsEnabled(EventLevel.Verbose, JitKeyword | CompilationKeyword); + } + } + + protected override void OnEventWritten(EventWrittenEventArgs e) + { + if (!canObserve) + return; + string? name = e.EventName; + if (name is null) + return; + + // The runtime brackets the call-counting delay with these: Pause when a new tier0 method's first call + // (re)starts the delay, Resume when it elapses and the whole pending list of counting stubs is installed. + // tieringActiveSignal is the flip-flop the burst gate waits on (Set on Resume = delay inactive = stubs live, + // Reset on Pause); tieringActivePrimedSignal just records that some delay activity occurred (set by either). 
+ if (name == TieredCompilationResumeEvent) + { + lock (syncRoot) + { + if (disposed) + return; + tieringActiveSignal.Set(); + tieringActivePrimedSignal.Set(); + } + return; + } + if (name == TieredCompilationPauseEvent) + { + lock (syncRoot) + { + if (disposed) + return; + tieringActiveSignal.Reset(); + tieringActivePrimedSignal.Set(); + } + return; + } + if (name == TieredCompilationBackgroundJitStartEvent) + { + lock (syncRoot) + { + if (disposed) + return; + // The worker began a batch (the watched method(s) and/or untracked callees tiering up). Reset idle + // before setting busy so a reader never sees both set at once. + backgroundJitIdleSignal.Reset(); + backgroundJitBusySignal.Set(); + } + return; + } + if (name == TieredCompilationBackgroundJitStopEvent) + { + HandleBackgroundJitStop(e); + return; + } + + if (name.StartsWith(MethodLoadVerbosePrefix, StringComparison.Ordinal)) + { + HandleMethodLoad(e); + } + } + + private void HandleBackgroundJitStop(EventWrittenEventArgs e) + { + var payloadNames = e.PayloadNames; + var payload = e.Payload; + if (payloadNames is null || payload is null) + return; + + if (backgroundJitStopPendingIndex < 0) + { + backgroundJitStopPendingIndex = payloadNames.IndexOf("PendingMethodCount"); + if (backgroundJitStopPendingIndex < 0) + return; + } + + // The worker stopped; once nothing is left queued the batch is fully drained and the JIT is idle. + if (Convert.ToInt64(payload[backgroundJitStopPendingIndex]) == 0) + { + lock (syncRoot) + { + if (disposed) + return; + // Reset busy before setting idle so a reader never sees both set at once. 
+ backgroundJitBusySignal.Reset(); + backgroundJitIdleSignal.Set(); + } + } + } + + private void HandleMethodLoad(EventWrittenEventArgs e) + { + var payloadNames = e.PayloadNames; + var payload = e.Payload; + if (payloadNames is null || payload is null) + return; + + if (loadTokenIndex < 0) + { + loadTokenIndex = payloadNames.IndexOf("MethodToken"); + loadFlagsIndex = payloadNames.IndexOf("MethodFlags"); + loadNameIndex = payloadNames.IndexOf("MethodName"); + if (loadTokenIndex < 0 || loadFlagsIndex < 0 || loadNameIndex < 0) + return; + } + + long tier = (Convert.ToInt64(payload[loadFlagsIndex]) >> OptimizationTierShift) & OptimizationTierMask; + + // A QuickJitted (tier0) publication — for ANY method, not just the one we watch — means an eligible method was + // just tier0-compiled and is about to run, so its first call will start or join the call-counting delay and a + // TieredCompilationResume is coming. That is exactly (and all) the up-front gate (WaitForInitialTieringActive) + // needs: it only asks "is the tiering machinery active, so a Resume will arrive to gate on?", which is a + // process-wide question. (Pause/Resume prime it too; this also covers the brief window before the first call + // fires Pause.) The tier0 compile itself is the baseline, not a tier-up, so we never count it. + if (tier == QuickJittedTier0) + { + lock (syncRoot) + { + if (disposed) + return; + tieringActivePrimedSignal.Set(); + } + return; + } + + // An OSR publication is not a step on the call-count tier ladder (it fires off a hot loop's back-edge counter, + // and the method goes on to be call-count-promoted past it), so don't let it count as a tier-up — otherwise a + // method that OSRs in multiple bodies overruns its tier count and the stage stops short of the final tier. + if (tier == OptimizedTier1OSR) + return; + + // Everything below concerns one of OUR watched methods reaching its next tier, so find the matching one (if any). 
+ int token = Convert.ToInt32(payload[loadTokenIndex]); + string? name = payload[loadNameIndex] as string; + WatchedMethod? matched = null; + foreach (var candidate in watchedMethods) + { + if (candidate.MetadataToken == token && candidate.Name == name) + { + matched = candidate; + break; + } + } + if (matched is null) + return; + + // Any of the runtime's final tiers means the method is fully warmed and will emit no further tier-ups — + // whether it tiered all the way up (OptimizedTier1), was compiled straight to optimized code (Optimized), or + // never tiers at all (MinOptJitted). + bool isFinalTier = tier == OptimizedTier1 || tier == Optimized || tier == MinOptJitted; + + lock (syncRoot) + { + if (disposed) + return; + // Count this tier-up so callers can detect the method advanced beyond a given tier. + matched.tierUpCount++; + // Track per-method completion so reachedFinalTier flips only once the LAST watched method is done. + if (isFinalTier && !matched.reachedFinalTier) + { + matched.reachedFinalTier = true; + if (++finalTierCount == watchedMethods.Length) + reachedFinalTier = true; + } + } + } + + private sealed class WatchedMethod(MethodInfo method) + { + internal volatile int tierUpCount; + internal volatile bool reachedFinalTier; + + internal int MetadataToken => method.MetadataToken; + internal string Name => method.Name; + } + + public override void Dispose() + { + lock (syncRoot) + { + disposed = true; + } + // base.Dispose disables the events we enabled (when no other listener wants them). 
+ base.Dispose(); + tieringActivePrimedSignal.Dispose(); + tieringActiveSignal.Dispose(); + backgroundJitBusySignal.Dispose(); + backgroundJitIdleSignal.Dispose(); + } +} diff --git a/src/BenchmarkDotNet/Portability/JitInfo.cs b/src/BenchmarkDotNet/Portability/JitInfo.cs index 103dc6881a..3a85d28bcc 100644 --- a/src/BenchmarkDotNet/Portability/JitInfo.cs +++ b/src/BenchmarkDotNet/Portability/JitInfo.cs @@ -180,7 +180,7 @@ private static TimeSpan GetTieredDelay() /// public static readonly TimeSpan BackgroundCompilationDelay = IsTiered - // It's impossible for us to know exactly how long to wait without hooking into JIT notifications (which we can't do in-process). + // It's impossible for us to know exactly how long to wait without hooking into JIT notifications. // 100ms should be enough most of the time, but we bump it up to 250ms for higher confidence. // When https://github.com/dotnet/runtime/issues/101868 is resolved, if AggressiveTiering is enabled, we can skip the wait time and return TimeSpan.Zero. ? 
TimeSpan.FromMilliseconds(250) diff --git a/src/BenchmarkDotNet/Templates/BenchmarkType.txt b/src/BenchmarkDotNet/Templates/BenchmarkType.txt index 53d97e831b..ac9d89c84c 100644 --- a/src/BenchmarkDotNet/Templates/BenchmarkType.txt +++ b/src/BenchmarkDotNet/Templates/BenchmarkType.txt @@ -39,6 +39,7 @@ global::BenchmarkDotNet.Engines.EngineParameters engineParameters = new global::BenchmarkDotNet.Engines.EngineParameters() { Host = host, + WorkloadMethods = instance.__ResolveWorkloadMethods(host), WorkloadActionUnroll = instance.WorkloadActionUnroll, WorkloadActionNoUnroll = instance.WorkloadActionNoUnroll, OverheadActionNoUnroll = instance.OverheadActionNoUnroll, @@ -71,6 +72,51 @@ $DeclareFieldsContainer$ + private global::System.Reflection.MethodInfo[] __ResolveWorkloadMethods(global::BenchmarkDotNet.Engines.IHost host) + { + // Best-effort: the jit stage uses the resolved method(s) to watch their JIT tier-up events, and falls back + // to a fixed delay when none are resolved. So neither a missed match nor a reflection failure (e.g. a + // same-named overload's parameter type fails to load) may break the benchmark — report and return empty. 
+ try + { + global::System.Type[] parameterTypes = $WorkloadMethodParameterTypes$; + foreach (global::System.Reflection.MethodInfo candidate in typeof($WorkloadTypeName$).GetMethods( + global::System.Reflection.BindingFlags.Instance | global::System.Reflection.BindingFlags.Static | + global::System.Reflection.BindingFlags.Public | global::System.Reflection.BindingFlags.NonPublic)) + { + if (candidate.Name != "$WorkloadMethodName$") + { + continue; + } + global::System.Reflection.ParameterInfo[] parameters = candidate.GetParameters(); + if (parameters.Length != parameterTypes.Length) + { + continue; + } + global::System.Boolean isMatch = true; + for (global::System.Int32 i = 0; i < parameters.Length; i++) + { + if (parameters[i].ParameterType != parameterTypes[i]) + { + isMatch = false; + break; + } + } + if (isMatch) + { + return new global::System.Reflection.MethodInfo[] { candidate }; + } + } + } + catch (global::System.Exception e) + { + host.SendError($"Exception during __ResolveWorkloadMethods!{(global::System.Environment.NewLine)}{e}"); + return global::System.Array.Empty(); + } + host.WriteLine("// Could not resolve the benchmark method '$WorkloadMethodName$' to watch JIT tier-up events; the jit stage will fall back to a fixed delay."); + return global::System.Array.Empty(); + } + private $GlobalSetupModifiers$ global::System.Threading.Tasks.ValueTask __GlobalSetup() { $GlobalSetupImpl$ diff --git a/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/AsyncStateMachineEmitter.cs b/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/AsyncStateMachineEmitter.cs index 81519664d6..7646bddf84 100644 --- a/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/AsyncStateMachineEmitter.cs +++ b/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/AsyncStateMachineEmitter.cs @@ -14,16 +14,16 @@ partial class RunnableEmitter // This doesn't really matter for the runtime, but it helps with the 
NaiveRunnableEmitDiff tests. protected virtual IReadOnlyDictionary AsyncMethodToOrdinalMap { get; } = new Dictionary { - { GlobalSetupMethodName, 4 }, - { GlobalCleanupMethodName, 5 }, - { IterationSetupMethodName, 6 }, - { IterationCleanupMethodName, 7 }, - { OverheadActionUnrollMethodName, 11 }, - { OverheadActionNoUnrollMethodName, 12 }, - { WorkloadActionUnrollMethodName, 13 }, - { WorkloadActionNoUnrollMethodName, 14 }, - { StartWorkloadMethodName, 15 }, - { WorkloadCoreMethodName, 16 }, + { GlobalSetupMethodName, 5 }, + { GlobalCleanupMethodName, 6 }, + { IterationSetupMethodName, 7 }, + { IterationCleanupMethodName, 8 }, + { OverheadActionUnrollMethodName, 12 }, + { OverheadActionNoUnrollMethodName, 13 }, + { WorkloadActionUnrollMethodName, 14 }, + { WorkloadActionNoUnrollMethodName, 15 }, + { StartWorkloadMethodName, 16 }, + { WorkloadCoreMethodName, 17 }, }; private record struct AsyncStateMachineFields(FieldInfo StateField, FieldInfo BuilderField, FieldInfo? ThisField); diff --git a/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/SyncTaskCoreEmitter.cs b/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/SyncTaskCoreEmitter.cs index b9a338b07f..1e4350f3d6 100644 --- a/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/SyncTaskCoreEmitter.cs +++ b/src/BenchmarkDotNet/Toolchains/InProcess/Emit/Implementation/Emitters/SyncTaskCoreEmitter.cs @@ -25,10 +25,10 @@ protected override IReadOnlyDictionary AsyncMethodToOrdinalMap ? base.AsyncMethodToOrdinalMap : new Dictionary { - { GlobalSetupMethodName, 2 }, - { GlobalCleanupMethodName, 3 }, - { IterationSetupMethodName, 4 }, - { IterationCleanupMethodName, 5 }, + { GlobalSetupMethodName, 3 }, + { GlobalCleanupMethodName, 4 }, + { IterationSetupMethodName, 5 }, + { IterationCleanupMethodName, 6 }, }; protected override void EmitExtraGlobalCleanup(ILGenerator ilBuilder, LocalBuilder? 
thisLocal) { } diff --git a/src/BenchmarkDotNet/Toolchains/InProcess/Emit/InProcessEmitRunner.cs b/src/BenchmarkDotNet/Toolchains/InProcess/Emit/InProcessEmitRunner.cs index 4813b111e4..37a1e04973 100644 --- a/src/BenchmarkDotNet/Toolchains/InProcess/Emit/InProcessEmitRunner.cs +++ b/src/BenchmarkDotNet/Toolchains/InProcess/Emit/InProcessEmitRunner.cs @@ -92,6 +92,7 @@ private static async ValueTask RunCore(Type runnableType, IHost host, ExecutePar var engineParameters = new EngineParameters() { Host = host, + WorkloadMethods = [benchmarkCase.Descriptor.WorkloadMethod], WorkloadActionUnroll = LoopCallbackFromMethod(instance, WorkloadActionUnrollMethodName), WorkloadActionNoUnroll = LoopCallbackFromMethod(instance, WorkloadActionNoUnrollMethodName), OverheadActionNoUnroll = LoopCallbackFromMethod(instance, OverheadActionNoUnrollMethodName), diff --git a/src/BenchmarkDotNet/Toolchains/InProcess/NoEmit/InProcessNoEmitRunner.cs b/src/BenchmarkDotNet/Toolchains/InProcess/NoEmit/InProcessNoEmitRunner.cs index a0f3b1f78d..d124b765e0 100644 --- a/src/BenchmarkDotNet/Toolchains/InProcess/NoEmit/InProcessNoEmitRunner.cs +++ b/src/BenchmarkDotNet/Toolchains/InProcess/NoEmit/InProcessNoEmitRunner.cs @@ -179,6 +179,7 @@ public static async ValueTask RunCore(IHost host, ExecuteParameters parameters, var engineParameters = new EngineParameters { Host = host, + WorkloadMethods = [target.WorkloadMethod], WorkloadActionNoUnroll = workloadAction.InvokeNoUnroll, WorkloadActionUnroll = workloadAction.InvokeUnroll, OverheadActionNoUnroll = overheadAction.InvokeNoUnroll, diff --git a/tests/BenchmarkDotNet.IntegrationTests/BenchmarkDotNet.IntegrationTests.csproj b/tests/BenchmarkDotNet.IntegrationTests/BenchmarkDotNet.IntegrationTests.csproj index 89939ea2fb..aabd80ffeb 100644 --- a/tests/BenchmarkDotNet.IntegrationTests/BenchmarkDotNet.IntegrationTests.csproj +++ b/tests/BenchmarkDotNet.IntegrationTests/BenchmarkDotNet.IntegrationTests.csproj @@ -23,8 +23,6 @@ Always - - diff --git 
a/tests/BenchmarkDotNet.IntegrationTests/InProcess.EmitTests/NaiveRunnableEmitDiff.cs b/tests/BenchmarkDotNet.IntegrationTests/InProcess.EmitTests/NaiveRunnableEmitDiff.cs index 2ee80f55bc..8cea409606 100644 --- a/tests/BenchmarkDotNet.IntegrationTests/InProcess.EmitTests/NaiveRunnableEmitDiff.cs +++ b/tests/BenchmarkDotNet.IntegrationTests/InProcess.EmitTests/NaiveRunnableEmitDiff.cs @@ -27,7 +27,8 @@ public class NaiveRunnableEmitDiff private static readonly HashSet IgnoredRunnableMethodNames = [ "Run", - ".ctor" + ".ctor", + "__ResolveWorkloadMethods" ]; private static readonly IReadOnlyDictionary AltOpCodes = new Dictionary() diff --git a/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs b/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs new file mode 100644 index 0000000000..d7e3b6ac59 --- /dev/null +++ b/tests/BenchmarkDotNet.IntegrationTests/JitListenerTests.cs @@ -0,0 +1,320 @@ +using System.Reflection; +using System.Runtime.CompilerServices; +using BenchmarkDotNet.Engines; +using BenchmarkDotNet.Jobs; +using BenchmarkDotNet.Portability; +using BenchmarkDotNet.Reports; +using BenchmarkDotNet.Tests.XUnit; +using Perfolizer.Horology; + +namespace BenchmarkDotNet.IntegrationTests; + +public class JitListenerTests +{ + [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_Cold() + { + Func workloadMethod = Cold; + + using var observer = JitListener.Create([workloadMethod.Method]); + + RunJitStageToCompletion(workloadMethod, observer); + + AssertReachedFinalTier(observer); + } + + // Tests the case of InProcess benchmarking the same method multiple times. 
+ [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_AlreadyTier1() + { + Func workloadMethod = AlreadyTier1; + + using var observer = JitListener.Create([workloadMethod.Method]); + + // The first jit stage brings the method to tier1 (in an optimized build) and our observer records it. Running + // the jit stage again for the same (now tier1) method should also succeed; it gets a fresh listener, because + // the stage drove the first to completion and reusing one across runs would leave its tiering signals ambiguous. + RunJitStageToCompletion(workloadMethod, observer); + using var observer2 = JitListener.Create([workloadMethod.Method]); + RunJitStageToCompletion(workloadMethod, observer2); + + AssertReachedFinalTier(observer); + } + + // Tests the case of InProcess benchmarking a method that the user already invoked before starting the benchmarks when call counting is active. + [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_AlreadyTier0() + { + Func workloadMethod = AlreadyTier0; + // Watch from before the pre-invoke, and hand this listener to the stage so it doesn't create a second one + // (see RunJitStageToCompletion): in a minopt build the pre-invoke is the method's only compile. + using var observer = JitListener.Create([workloadMethod.Method]); + + DeadCodeEliminationHelper.KeepAliveWithoutBoxing(AlreadyTier0(42)); + // Sleep long enough for the tiered call counting to begin. + Engine.SleepIfPositive(JitInfo.TieredDelay + JitInfo.TieredDelay); + + RunJitStageToCompletion(workloadMethod, observer); + + AssertReachedFinalTier(observer); + } + + // Tests the case of InProcess benchmarking a method that the user already invoked before starting the benchmarks when call counting is delayed. 
+ [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_AlreadyTier0DelayedCallCounting() + { + Func workloadMethod = AlreadyTier0DelayedCallCounting; + // Watch from before the pre-invoke, and hand this listener to the stage (see RunJitStageToCompletion). We do + // NOT sleep first: this test's whole point is that the call-counting delay is still pending when the stage + // starts. Because the stage reuses this one listener, the pre-invoke's event is never lost to a second + // listener's session churn, so no wait is needed to observe the final tier. + using var observer = JitListener.Create([workloadMethod.Method]); + + DeadCodeEliminationHelper.KeepAliveWithoutBoxing(AlreadyTier0DelayedCallCounting(42)); + + RunJitStageToCompletion(workloadMethod, observer); + + AssertReachedFinalTier(observer); + } + + // Tests a benchmark method whose own hot loop is On-Stack-Replaced (OSR) mid-execution. Where OSR is enabled + // (by default in .NET 7+) this drives the method through an OSR publication on top of its normal tier-ups, and the + // stage must still reach OptimizedTier1 — JitInfo.MaxTierPromotions reserves an extra promotion for the OSR-induced + // double tier0-instrumentation. Where OSR is off it is simply a hot-loop method that tiers up normally; either way + // it ends at tier1. + [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_Osr() + { + Func workloadMethod = Osr; + + using var observer = JitListener.Create([workloadMethod.Method]); + + RunJitStageToCompletion(workloadMethod, observer); + + AssertReachedFinalTier(observer); + } + + // Tests a benchmark method that calls (without inlining) a separate method whose hot loop is OSR'd. 
The listener + // only watches the benchmark method, never the callee, so it can't observe the callee's tiering at all — this + // exercises the runtime bug where an OSR'd callee gets tier0-instrumented twice (JitInfo.MaxTierPromotions reserves + // the extra promotion the stage spends on it). The benchmark method itself must still be driven to OptimizedTier1. + [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_CallsOsr() + { + Func workloadMethod = CallsOsr; + Func calleeMethod = OsrCallee; + + using var observer = JitListener.Create([workloadMethod.Method]); + // The stage only drives (and the engine's listener only watches) the benchmark method, but every call to it + // calls the OSR'd callee, so the callee should be driven all the way to tier1 too. Watch it independently. + using var calleeObserver = JitListener.Create([calleeMethod.Method]); + + RunJitStageToCompletion(workloadMethod, observer); + + AssertReachedFinalTier(observer); + AssertReachedFinalTier(calleeObserver); + } + + // A method pinned to a single optimization level never tiers, but the listener still watches it and recognizes + // its one-and-only compile as a final tier (MinOptJitted or Optimized) — so the stage observes "done" rather than + // depending on an attribute heuristic to decline up front. + + // [MethodImpl(NoOptimization)] pins the method to minopts, so its final tier is MinOptJitted. + [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_NoOptimization() + { + Func workloadMethod = NoOptimization; + + using var observer = JitListener.Create([workloadMethod.Method]); + + RunJitStageToCompletion(workloadMethod, observer); + + AssertReachedFinalTier(observer); + } + + // [MethodImpl(AggressiveOptimization)] pins the method straight to optimized code, so it never goes through + // tier0 -> tier1 and its final tier is Optimized (or OptimizedTier1, depending on runtime). 
+ [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_AggressiveOptimization() + { + Func workloadMethod = AggressiveOptimization; + + using var observer = JitListener.Create([workloadMethod.Method]); + + RunJitStageToCompletion(workloadMethod, observer); + + AssertReachedFinalTier(observer); + } + + // A single listener can watch several methods at once. Here one listener watches two distinct methods, and the + // stage drives both (the workload action calls each per invocation). ReachedFinalTier is the aggregate, so it only + // becomes true once BOTH have reached their final tier — exercising the multi-method path that scenarios like + // dotnet/BenchmarkDotNet#147 (driving several methods) will rely on. + [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_MultipleMethods() + { + Func first = MultiFirst; + Func second = MultiSecond; + + using var observer = JitListener.Create([first.Method, second.Method]); + + RunJitStageToCompletion(observer, [first.Method, second.Method], i => { first(i); second(i); }); + + AssertReachedFinalTier(observer); + } + + // Watched methods need not be invoked at the same rate (e.g. dotnet/BenchmarkDotNet#147 may call one more often + // than another). Here the "fast" method is invoked twice per iteration and the "slow" one once, so the fast method + // crosses its call-count thresholds — and thus tiers up — sooner. Each method banks its own publication permits, so + // the fast method's extra/earlier promotions aren't dropped while the slow one catches up; both must still reach + // their final tier. 
+ [FactEnvSpecific("Only CoreCLR supports tiered JIT", EnvRequirement.DotNetCoreOnly)] + public void JitStage_MultipleMethodsUnevenInvocationRates() + { + Func fast = MultiUnevenFast; + Func slow = MultiUnevenSlow; + + using var observer = JitListener.Create([fast.Method, slow.Method]); + + int invokeCount = 0; + int tieredCount = 0; + RunJitStageToCompletion(observer, [fast.Method, slow.Method], i => + { + fast(i); + fast(i); + fast(i); + if (++tieredCount > 0 && tieredCount <= JitInfo.MaxTierPromotions && ++invokeCount * 3 > JitInfo.TieredCallCountThreshold) + { + Thread.Sleep(200); // Sleep to let the JIT compile the fast method before the slow method. + invokeCount = 0; + } + slow(i); + }); + + AssertReachedFinalTier(observer); + } + + private static void AssertReachedFinalTier(JitListener? observer) + { + // No wait needed: the tier-up event is delivered while the stage is still running (it spans hundreds of ms of + // tiering delays), so by the time the stage returns the observer has already recorded the final tier. + Assert.NotNull(observer); + Assert.True(observer.ReachedFinalTier, "the jit stage should have driven the benchmark method to its final tier"); + } + + // The test owns the listener and passes it in; the stage uses that exact instance (it never creates its own), so + // there is never a second EventListener whose setup/teardown could flush an event in flight to the test's listener. + private static void RunJitStageToCompletion(Func workloadMethod, JitListener? listener) + => RunJitStageToCompletion(listener, [workloadMethod.Method], i => workloadMethod(i)); + + // Core harness: the stage watches/drives the given workloadMethods, and each iteration runs invokeOnce (which the + // caller wires to actually call those methods) invokeCount times so they go through call counting and tier up. + private static void RunJitStageToCompletion(JitListener? 
listener, MethodInfo[] workloadMethods, Action invokeOnce) + { + // The per-tier publication wait is unbounded, cancellable only via the host's token. When the method tiers, the + // stage re-bursts until the runtime reports the next-tier compile began, so the wait isn't actually hit — but a + // large timeout guards against a hang if tiering somehow stalls, instead of wedging the whole test run. + using var timeout = new CancellationTokenSource(TimeSpan.FromSeconds(60)); + var host = new CancellableHost(timeout.Token); + Func> empty = (_, _) => new(default(ClockSpan)); + Func> workload = (invokeCount, _) => + { + // Really invoke the benchmark method(s) so they go through call counting and tier up for real. + for (long i = 0; i < invokeCount; i++) + invokeOnce(i); + return new(default(ClockSpan)); + }; + + var parameters = new EngineParameters + { + Host = host, + WorkloadMethods = workloadMethods, + WorkloadActionNoUnroll = workload, + WorkloadActionUnroll = workload, + OverheadActionNoUnroll = empty, + OverheadActionUnroll = empty, + GlobalSetupAction = () => new(), + GlobalCleanupAction = () => new(), + IterationSetupAction = () => new(), + IterationCleanupAction = () => new(), + TargetJob = Job.Default, + BenchmarkName = "", + InProcessDiagnoserHandler = new([], host, BenchmarkDotNet.Diagnosers.RunMode.None, null!), + }; + + var stage = new EngineJitStage(evaluateOverhead: false, parameters, listener); + var measurements = stage.GetMeasurementList(); + while (stage.GetShouldRunIteration(measurements, out var data)) + { + data.setupAction().GetAwaiter().GetResult(); + data.workloadAction(data.invokeCount / data.unrollFactor, null!).GetAwaiter().GetResult(); + data.cleanupAction().GetAwaiter().GetResult(); + // A zero-time measurement keeps the stage out of its "long-running benchmark" early-exit + // (iterationTime / 0 == Infinity, and Infinity < 1.5 is false). 
+ measurements.Add(new Measurement(1, data.mode, data.stage, data.index, data.invokeCount, 0d)); + } + } + + // Benchmark-method stand-ins. A distinct method per tier-up scenario so that one test tiering a method up doesn't + // affect another's starting state. In a DisableOptimizations build these are JITted at minopts and never tier; in + // an optimized build they reach OptimizedTier1. + [MethodImpl(MethodImplOptions.NoInlining)] + private static long Cold(long x) => x * x + 1; + [MethodImpl(MethodImplOptions.NoInlining)] + private static long AlreadyTier1(long x) => x * x + 1; + [MethodImpl(MethodImplOptions.NoInlining)] + private static long AlreadyTier0(long x) => x * x + 1; + [MethodImpl(MethodImplOptions.NoInlining)] + private static long AlreadyTier0DelayedCallCounting(long x) => x * x + 1; + [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.NoOptimization)] + private static long NoOptimization(long x) => x * x + 1; + [MethodImpl(MethodImplOptions.NoInlining | CodeGenHelper.AggressiveOptimizationOption)] + private static long AggressiveOptimization(long x) => x * x + 1; + // Two distinct methods watched together by one listener (JitStage_MultipleMethods). + [MethodImpl(MethodImplOptions.NoInlining)] + private static long MultiFirst(long x) => x * x + 1; + [MethodImpl(MethodImplOptions.NoInlining)] + private static long MultiSecond(long x) => x * x + 1; + // Watched together but invoked at different rates (JitStage_MultipleMethodsUnevenInvocationRates). + [MethodImpl(MethodImplOptions.NoInlining)] + private static long MultiUnevenFast(long x) => x * x + 1; + [MethodImpl(MethodImplOptions.NoInlining)] + private static long MultiUnevenSlow(long x) => x * x + 1; + + // A loop long enough to cross the OSR back-edge threshold so these methods are On-Stack-Replaced where OSR is + // enabled. 
Timing is irrelevant (RunJitStageToCompletion records 0ns measurements, so the stage never takes its + // long-running early-exit), so the only requirement is enough iterations to trigger OSR. + private const int OsrLoopCount = 1_000_000; + [MethodImpl(MethodImplOptions.NoInlining)] + private static long Osr(long x) + { + long sum = x; + for (int i = 0; i < OsrLoopCount; i++) + sum += i; + return sum; + } + // The benchmark method: it does nothing but call the OSR'd method, which NoInlining keeps as a separate jit unit. + [MethodImpl(MethodImplOptions.NoInlining)] + private static long CallsOsr(long x) => OsrCallee(x); + [MethodImpl(MethodImplOptions.NoInlining)] + private static long OsrCallee(long x) + { + long sum = x; + for (int i = 0; i < OsrLoopCount; i++) + sum += i; + return sum; + } + + // Minimal host that surfaces a cancellation token so the stage's unbounded per-tier wait stays interruptible. + private sealed class CancellableHost(CancellationToken cancellationToken) : IHost + { + public CancellationToken CancellationToken { get; } = cancellationToken; + public void Dispose() { } + public void WriteLine() { } + public void WriteLine(string message) { } + public void SendError(string message) { } + public void ReportResults(RunResults runResults) { } + public ValueTask SendSignalAsync(HostSignal hostSignal) => new(); + public ValueTask Yield() => new(); + } +} diff --git a/tests/BenchmarkDotNet.IntegrationTests/ValuesReturnedByBenchmarkTest.cs b/tests/BenchmarkDotNet.IntegrationTests/ValuesReturnedByBenchmarkTest.cs index 6c35a91748..77c0fc4567 100644 --- a/tests/BenchmarkDotNet.IntegrationTests/ValuesReturnedByBenchmarkTest.cs +++ b/tests/BenchmarkDotNet.IntegrationTests/ValuesReturnedByBenchmarkTest.cs @@ -107,6 +107,11 @@ public class Job { } [Benchmark] public unsafe int* PointerToUnmanagedType() => (int*)System.IntPtr.Zero.ToPointer(); + [Benchmark] + public unsafe delegate* FunctionPointer() => &ReturnArgument; + + private static int 
ReturnArgument(int value) => value; + [Benchmark] public System.IntPtr IntPtr() => System.IntPtr.Zero; diff --git a/tests/BenchmarkDotNet.Tests/Engine/EnumerateStagesTests.cs b/tests/BenchmarkDotNet.Tests/Engine/EnumerateStagesTests.cs index 569889ba10..470a5a7fa6 100644 --- a/tests/BenchmarkDotNet.Tests/Engine/EnumerateStagesTests.cs +++ b/tests/BenchmarkDotNet.Tests/Engine/EnumerateStagesTests.cs @@ -31,7 +31,7 @@ public void JobsThatDontRequireJittingSkipJitStage(string jobName) var engineParameters = CreateEngineParameters(job); bool didRunStages = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { Assert.True(stage is not EngineJitStage); didRunStages = true; @@ -47,7 +47,7 @@ public void DefaultSettingsVeryTimeConsumingBenchmarksAreExecutedOncePerIteratio var engineParameters = CreateEngineParameters(Job.Default); bool didRunActualStage = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { Assert.NotEqual(IterationMode.Overhead, stage.Mode); @@ -81,7 +81,7 @@ public void BenchmarksThatRunLongerThanIterationTimeOnlyDuringFirstInvocationAre var engineParameters = CreateEngineParameters(Job.Default.WithIterationTime(TimeInterval.FromMilliseconds(iterationTime))); bool didRunActualStage = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -119,7 +119,7 @@ private void AssertUnroll(Job job) var engineParameters = CreateEngineParameters(job); bool didRunUnroll = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in 
EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -150,7 +150,7 @@ public void JobWithExplicitInvocationCount(long invocationCount) // A short measurement encourages the JIT stage to batch many invocations into a single iteration, // which is the regression introduced by #2806. var fastMeasurement = TimeInterval.FromMicroseconds(1); - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -177,7 +177,7 @@ public void LongRunningBenchmarksExitJitStageEarly() var engineParameters = CreateEngineParameters(job); int jitWorkloadCount = 0; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -210,7 +210,7 @@ public void SlowFirstIterationButFastSteadyStateDoesNotExitJitStageEarly() var engineParameters = CreateEngineParameters(Job.Default.WithInvocationCount(1).WithUnrollFactor(1)); int jitWorkloadCount = 0; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -226,9 +226,9 @@ public void SlowFirstIterationButFastSteadyStateDoesNotExitJitStageEarly() if (stage is EngineJitStage) break; } - // Pre-loop iter + confirmation + full tiering loop (one yield per tier since the user - // pinned 
InvocationCount=1, matching JitInfo.MaxTierPromotions * TieredCallCountThreshold) - // + one stabilization iteration. Just assert it ran the full tiering loop rather than bailing. + // Pre-loop iter + confirmation + full tiering loop (one yield per tier since the user pinned + // InvocationCount=1, matching JitInfo.MaxTierPromotions * TieredCallCountThreshold) + the trailing + // stabilization iteration. Just assert it ran the full tiering loop rather than bailing. Assert.True(jitWorkloadCount > 2, $"Expected the tiering loop to run after confirmation disagreed, got {jitWorkloadCount} jitting iterations."); } @@ -247,7 +247,7 @@ public void ForceJitTieringModeRunsFullTieringLoopEvenForLongRunningBenchmarks() int jitWorkloadCount = 0; bool didStopEarly = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -282,7 +282,7 @@ public void SkipJitTieringModeSkipsTierPromotion() int jitWorkloadCount = 0; bool didStopEarly = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -316,7 +316,7 @@ public void MediumTimeConsumingBenchmarksStartPilotFrom2AndIncrementItWithEveryS var engineParameters = CreateEngineParameters(Job.Default); bool didRunPilotStage = false; - foreach (var stage in EngineStage.EnumerateStages(engineParameters)) + foreach (var stage in EngineStage.EnumerateStages(engineParameters, skipJitDelays: true)) { var stageMeasurements = stage.GetMeasurementList(); while (stage.GetShouldRunIteration(stageMeasurements, out var iterationData)) @@ -347,6 +347,7 @@ private 
EngineParameters CreateEngineParameters(Job job) Func> emptyAction = (_, _) => new(default(ClockSpan)); return new() { + WorkloadMethods = [], GlobalSetupAction = () => new(), GlobalCleanupAction = () => new(), Host = host, diff --git a/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs b/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs index 59dcb72633..a6e6ab7803 100644 --- a/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs +++ b/tests/BenchmarkDotNet.Tests/Shared/Mocks/MockEngine.cs @@ -22,6 +22,7 @@ internal MockEngine(ITestOutputHelper output, Job job, Func