Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .changeset/handler-max-deliveries.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
"@workflow/errors": patch
"@workflow/core": patch
"@workflow/world-local": patch
"@workflow/builders": patch
"@workflow/sveltekit": patch
---

Remove VQS maxDeliveries cap and enforce max delivery limit in workflow/step handlers with graceful failure
2 changes: 0 additions & 2 deletions packages/builders/src/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ export const STEP_QUEUE_TRIGGER = {
type: 'queue/v2beta' as const,
topic: '__wkf_step_*',
consumer: 'default',
maxDeliveries: 64, // Maximum number of delivery attempts (default: 3)
retryAfterSeconds: 5, // Delay between retries (default: 60)
initialDelaySeconds: 0, // Initial delay before first delivery (default: 0)
};
Expand All @@ -19,7 +18,6 @@ export const WORKFLOW_QUEUE_TRIGGER = {
type: 'queue/v2beta' as const,
topic: '__wkf_workflow_*',
consumer: 'default',
maxDeliveries: 64, // Maximum number of delivery attempts (default: 3)
retryAfterSeconds: 5, // Delay between retries (default: 60)
initialDelaySeconds: 0, // Initial delay before first delivery (default: 0)
};
51 changes: 51 additions & 0 deletions packages/core/src/runtime.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
WorkflowRuntimeError,
} from '@workflow/errors';
import { classifyRunError } from './classify-error.js';
import { MAX_QUEUE_DELIVERIES } from './runtime/constants.js';
import { parseWorkflowName } from '@workflow/utils/parse-name';
import {
type Event,
Expand Down Expand Up @@ -107,6 +108,56 @@ export function workflowEntrypoint(
const { requestId } = metadata;
// Extract the workflow name from the topic name
const workflowName = metadata.queueName.slice('__wkf_workflow_'.length);

// --- Max delivery check ---
// Enforce max delivery limit before any infrastructure calls.
// This prevents runaway workflows from consuming infinite queue deliveries.
[Review comment — collaborator/author, suggestion applied in the commit below]
Suggested change: expand the comment after "// This prevents runaway workflows
from consuming infinite queue deliveries." to also explain that this path does
the minimal amount of work — no fetching of workflow events; it only attempts
to mark the run as failed, and if that fails the message is still consumed with
adequate logging that the run could not be failed.
// At this point, we want to do the minimal amount of work (no fetching
// of the workflow events, etc. We simply attempt to mark the run as failed
// and if that fails, the message is still consumed but with adequate logging
// that an error occurred preventing us from failing the run.
if (metadata.attempt > MAX_QUEUE_DELIVERIES) {
runtimeLogger.error(
`Workflow handler exceeded max deliveries (${metadata.attempt}/${MAX_QUEUE_DELIVERIES})`,
{ workflowRunId: runId, workflowName, attempt: metadata.attempt }
);
try {
const world = getWorld();
await world.events.create(
runId,
{
eventType: 'run_failed',
specVersion: SPEC_VERSION_CURRENT,
eventData: {
error: {
message: `Workflow exceeded maximum queue deliveries (${metadata.attempt}/${MAX_QUEUE_DELIVERIES})`,
},
errorCode: RUN_ERROR_CODES.MAX_DELIVERIES_EXCEEDED,
},
},
{ requestId }
);
} catch (err) {
if (EntityConflictError.is(err) || RunExpiredError.is(err)) {
// Run already finished, consume the message silently
return;
}
runtimeLogger.error(
`Failed to mark run as failed after ${metadata.attempt} delivery attempts. ` +
`A persistent error is preventing the run from being terminated. ` +
`The run will remain in its current state until manually resolved. ` +
`This is most likely due to a persistent outage of the workflow backend ` +
`or a bug in the workflow runtime and should be reported to the Workflow team.`,
{
workflowRunId: runId,
error: err instanceof Error ? err.message : String(err),
attempt: metadata.attempt,
}
);
}
return;
}

const spanLinks = await linkToCurrentContext();

// Invoke user workflow within the propagated trace context and baggage
Expand Down
13 changes: 13 additions & 0 deletions packages/core/src/runtime/constants.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Maximum number of queue delivery attempts before the handler gives up and
// gracefully fails the run/step. This must be bounded under the VQS message
// max visibility window (24 hours) so that our handler-side failure path
// reliably executes before VQS expires the message.
//
// VQS retry schedule (with retryAfterSeconds: 5):
//   Attempts 1–32: linear backoff at 5s each → 32 × 5s = 160s (~2.7 min)
//   Attempts 33+:  exponential backoff: 60s × 2^(attempt - 32),
//                  capped at 7,200s (2h), floored at retryAfterSeconds
//
// At 48 attempts the cumulative delay between deliveries is approximately
// 20 hours (160s + ~7,560s ramp + 9 × 7,200s at the cap), which is safely
// under the 24-hour message visibility limit.
// NOTE(review): these figures assume the VQS schedule above — re-derive the
// bound if retryAfterSeconds or the backoff cap ever changes.
export const MAX_QUEUE_DELIVERIES = 48;
76 changes: 76 additions & 0 deletions packages/core/src/runtime/step-handler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ vi.mock('@workflow/utils/get-port', () => ({
// Import the module AFTER all mocks are set up - this triggers createQueueHandler
// which populates capturedHandlerRef
import './step-handler.js';
import { MAX_QUEUE_DELIVERIES } from './constants.js';
import { getStepFunction } from '../private.js';
import {
getErrorName,
Expand Down Expand Up @@ -497,3 +498,78 @@ describe('step-handler 409 handling', () => {
});
});
});

describe('step-handler max deliveries', () => {
beforeEach(() => {
vi.clearAllMocks();
vi.mocked(getStepFunction).mockReturnValue(mockStepFn);
mockStepFn.mockReset().mockResolvedValue('step-result');
mockStepFn.maxRetries = 3;
mockQueueMessage.mockResolvedValue(undefined);
vi.mocked(getWorld).mockReturnValue({
events: { create: mockEventsCreate },
queue: mockQueue,
getEncryptionKeyForRun: vi.fn().mockResolvedValue(undefined),
} as any);
mockEventsCreate.mockReset().mockResolvedValue({
step: {
stepId: 'step_abc',
status: 'running',
attempt: 1,
startedAt: new Date(),
input: [],
},
event: {},
});
});

afterEach(() => {
vi.restoreAllMocks();
});

it('should post step_failed and re-queue workflow when delivery count exceeds max', async () => {
const result = await capturedHandler(
createMessage(),
{ ...createMetadata('myStep'), attempt: MAX_QUEUE_DELIVERIES + 1 }
);

expect(result).toBeUndefined();
expect(mockEventsCreate).toHaveBeenCalledWith(
'wrun_test123',
expect.objectContaining({
eventType: 'step_failed',
correlationId: 'step_abc',
}),
expect.anything()
);
expect(mockQueueMessage).toHaveBeenCalled();
expect(mockRuntimeLogger.error).toHaveBeenCalledWith(
expect.stringContaining('exceeded max deliveries'),
expect.objectContaining({ workflowRunId: 'wrun_test123' })
);
});

it('should consume message silently when step_failed fails with EntityConflictError', async () => {
mockEventsCreate.mockRejectedValue(
new EntityConflictError('Step already completed')
);

const result = await capturedHandler(
createMessage(),
{ ...createMetadata('myStep'), attempt: MAX_QUEUE_DELIVERIES + 1 }
);

expect(result).toBeUndefined();
expect(mockStepFn).not.toHaveBeenCalled();
});

it('should not trigger max deliveries check when under limit', async () => {
const result = await capturedHandler(
createMessage(),
{ ...createMetadata('myStep'), attempt: MAX_QUEUE_DELIVERIES }
);
Comment on lines +530 to +570

// Should proceed normally (step function executes)
expect(mockStepFn).toHaveBeenCalled();
});
});
62 changes: 62 additions & 0 deletions packages/core/src/runtime/step-handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import {
queueMessage,
withHealthCheck,
} from './helpers.js';
import { MAX_QUEUE_DELIVERIES } from './constants.js';
import { getWorld, getWorldHandlers } from './world.js';

const DEFAULT_STEP_MAX_RETRIES = 3;
Expand All @@ -67,6 +68,67 @@ const stepHandler = getWorldHandlers().createQueueHandler(
requestedAt,
} = StepInvokePayloadSchema.parse(message_);
const { requestId } = metadata;

// --- Max delivery check ---
// Enforce max delivery limit before any infrastructure calls.
// This prevents runaway steps from consuming infinite queue deliveries.
[Review comment — collaborator/author, suggestion applied in the commit below]
Suggested change: expand the comment after "// This prevents runaway steps from
consuming infinite queue deliveries." to also explain that this path does the
minimal amount of work — no fetching of step details; it only attempts to mark
the step as failed and enqueue the workflow once, and if either of those fails
the message is still consumed with adequate logging that an error occurred.
// At this point, we want to do the minimal amount of work (no fetching
// of the step details, etc. We simply attempt to mark the step as failed
// and enqueue the workflow once, and if either of those fails, the message
// is still consumed but with adequate logging that an error occurred.
if (metadata.attempt > MAX_QUEUE_DELIVERIES) {
runtimeLogger.error(
`Step handler exceeded max deliveries (${metadata.attempt}/${MAX_QUEUE_DELIVERIES})`,
{
workflowRunId,
stepId,
stepName: metadata.queueName.slice('__wkf_step_'.length),
attempt: metadata.attempt,
}
);
try {
const world = getWorld();
await world.events.create(
workflowRunId,
{
eventType: 'step_failed',
specVersion: SPEC_VERSION_CURRENT,
correlationId: stepId,
eventData: {
error: `Step exceeded maximum queue deliveries (${metadata.attempt}/${MAX_QUEUE_DELIVERIES})`,
},
},
{ requestId }
);
// Re-queue the workflow to handle the failed step
await queueMessage(world, getWorkflowQueueName(workflowName), {
runId: workflowRunId,
traceCarrier: await serializeTraceCarrier(),
requestedAt: new Date(),
});
} catch (err) {
if (EntityConflictError.is(err) || RunExpiredError.is(err)) {
return;
}
// Can't even mark the step as failed. Consume the message to stop
// further retries. The run will remain in its current state.
runtimeLogger.error(
`Failed to mark step as failed after ${metadata.attempt} delivery attempts. ` +
`A persistent error is preventing the step from being terminated. ` +
`The run will remain in its current state until manually resolved. ` +
`This is most likely due to a persistent outage of the workflow backend ` +
`or a bug in the workflow runtime and should be reported to the Workflow team.`,
{
workflowRunId,
stepId,
attempt: metadata.attempt,
error: err instanceof Error ? err.message : String(err),
}
);
}
return;
}

const spanLinks = await linkToCurrentContext();
// Execute step within the propagated trace context
return await withTraceContext(traceContext, async () => {
Expand Down
2 changes: 2 additions & 0 deletions packages/errors/src/error-codes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ export const RUN_ERROR_CODES = {
USER_ERROR: 'USER_ERROR',
/** Internal runtime error (corrupted event log, missing timestamps) */
RUNTIME_ERROR: 'RUNTIME_ERROR',
/** Run exceeded the maximum number of queue deliveries */
MAX_DELIVERIES_EXCEEDED: 'MAX_DELIVERIES_EXCEEDED',
} as const;

export type RunErrorCode =
Expand Down
2 changes: 0 additions & 2 deletions packages/sveltekit/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ process.on('beforeExit', () => {
type: 'queue/v2beta',
topic: '__wkf_workflow_*',
consumer: 'default',
maxDeliveries: 64,
retryAfterSeconds: 5,
initialDelaySeconds: 0,
},
Expand All @@ -41,7 +40,6 @@ process.on('beforeExit', () => {
type: 'queue/v2beta',
topic: '__wkf_step_*',
consumer: 'default',
maxDeliveries: 64,
retryAfterSeconds: 5,
initialDelaySeconds: 0,
},
Expand Down
48 changes: 36 additions & 12 deletions packages/world-local/src/queue.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,14 @@ export function createQueue(config: Partial<Config>): LocalQueue {
const { pathname, prefix } = getQueueRoute(queueName);
const messageId = MessageId.parse(`msg_${generateId()}`);

// Extract identifiers from the message for structured logging.
// Workflow messages have `runId`, step messages have `workflowRunId` + `stepId`.
const msg = message as Record<string, unknown>;
const runId = (msg.runId ?? msg.workflowRunId ?? undefined) as
| string
| undefined;
const stepId = (msg.stepId ?? undefined) as string | undefined;

if (opts?.idempotencyKey) {
const key = opts.idempotencyKey;
inflightMessages.set(key, messageId);
Expand All @@ -106,12 +114,12 @@ export function createQueue(config: Partial<Config>): LocalQueue {
);
await semaphore.acquire();
}
// Safety limit to prevent infinite loops in the local queue.
// The actual max delivery enforcement happens in the workflow/step handlers
// (at MAX_QUEUE_DELIVERIES = 48), so this just needs to be comfortably higher.
const MAX_LOCAL_SAFETY_LIMIT = 256;
try {
const maxAttempts = 3;
let defaultRetriesLeft = maxAttempts;
for (let attempt = 0; defaultRetriesLeft > 0; attempt++) {
defaultRetriesLeft--;

for (let attempt = 0; attempt < MAX_LOCAL_SAFETY_LIMIT; attempt++) {
[Review comment on lines +117 to +122 — collaborator/author]
"Yeah, there's no reason this has to be 1000. It just needs to be higher than
the max queue attempts. Let's go with 256."

const headers: Record<string, string> = {
...opts?.headers,
'content-type': 'application/json',
Expand Down Expand Up @@ -163,23 +171,39 @@ export function createQueue(config: Partial<Config>): LocalQueue {
);
await setTimeout(timeoutMs);
}
defaultRetriesLeft++;
continue;
}
} catch {}
return;
}

console.error(
`[world-local] Queue message failed (attempt ${attempt + 1}/${maxAttempts}, status ${response.status}): ${text}`,
{ queueName, messageId }
`[world-local] Queue message failed (attempt ${attempt + 1}, HTTP ${response.status})`,
{
queueName,
messageId,
...(runId && { runId }),
...(stepId && { stepId }),
handlerError: text,
}
);

// 5s linear backoff to approximate VQS retry timing in local dev.
// VQS uses 5s linear for attempts 1–32, then exponential, but for
// local dev linear 5s is sufficient — the handler enforces the real
// cap at MAX_QUEUE_DELIVERIES (48) which keeps total time under ~4min.
await setTimeout(5000);
}

console.error(`[world-local] Queue message exhausted all retries`, {
queueName,
messageId,
});
console.error(
`[world-local] Queue message exhausted safety limit (${MAX_LOCAL_SAFETY_LIMIT} attempts)`,
{
queueName,
messageId,
...(runId && { runId }),
...(stepId && { stepId }),
}
);
} finally {
semaphore.release();
}
Expand Down
1 change: 1 addition & 0 deletions packages/world-vercel/src/events.ts
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ const hookEventsRequiringExistence = new Set([
'hook_received',
]);


export async function createWorkflowRunEvent(
id: string | null,
data: AnyEventRequest,
Expand Down
Loading