zstd: Replace 5-permute WAVE_PROPAGATE_STEP series with 1-permute zstdgpu_WavePropogateFseTableIndex. by Jonathan-Weinstein-AMD · Pull Request #105 · microsoft/DirectStorage

Jonathan-Weinstein-AMD · 2026-05-15T20:55:00Z

In zstdgpu_ShaderEntry_ParseCompressedBlocks(), replace both series of 5 WAVE_PROPAGATE_STEPs

    WAVE_PROPAGATE_STEP(x,  2)
    WAVE_PROPAGATE_STEP(x,  4)
    WAVE_PROPAGATE_STEP(x,  8)
    WAVE_PROPAGATE_STEP(x, 16)
    WAVE_PROPAGATE_STEP(x, 32)

with

    x = zstdgpu_WavePropogateFseTableIndex(x)

which is leaner because it performs a single lane permute instead of five. It arguably also makes it clearer what is being computed.

NOTE: zstdgpu_WavePropogateFseTableIndex doesn't handle active lanes at or beyond index 32, but neither did the original (it would need a WAVE_PROPAGATE_STEP(x, 64) for wave64), and support should be easy to add if desired. A preprocessor check/#error on kzstdgpu_TgSizeX_ParseCompressedBlocks was added, similar to the existing check in ZstdGpuPrefixSequenceOffsets.hlsl on kzstdgpu_TgSizeX_PrefixSequenceOffsets.

Testing

I dispatched this HLSL shader with 256*256 = 2**16 total groups in a side-app and inspected the output. For the first 8 lanes in a wave, this tests all combinations of { Unused, Repeat, 2, 3 }. This isn't a perfect test, but I also ran zstdgpu_demo --chk-gpu with some real inputs.

// Thread-group width (X dimension) used by the [numthreads] attribute of the test entry point.
// NOTE: makes lane masks easier; WaveTryReplicateFillerUpwardsToHoles doesn't bother with wave64+:
static const uint32_t kzstdgpu_TgSizeX_ParseCompressedBlocks = 32;

// Sentinel table-index values. The propagation code in this file treats any index
// >= kzstdgpu_FseProbTableIndex_Repeat (i.e. both Repeat and Unused) as a value that may be
// overwritten, and any index below it as a real value that may be propagated.
static const uint32_t kzstdgpu_FseProbTableIndex_Unused = 0x3fffffff;
static const uint32_t kzstdgpu_FseProbTableIndex_Repeat = kzstdgpu_FseProbTableIndex_Unused - 1;

// Baseline implementation: the series of five WAVE_PROPAGATE_STEPs, used in this test as the
// expected result for WavePropogateFseTableIndex_V2.
//
// Each WAVE_PROPAGATE_STEP(p, N) works on groups of N consecutive lanes. Every lane reads the
// value held by the last lane of the lower half of its group; lanes in the upper half of the
// group then replace their own value with it, but only when their own value is Repeat/Unused
// (>= kzstdgpu_FseProbTableIndex_Repeat) and the value read is a real index
// (< kzstdgpu_FseProbTableIndex_Repeat). Steps run for N = 2, 4, 8, 16, 32, each one gated on
// blockSize >= N.
//
// x: this lane's table index (possibly Repeat/Unused).
// Returns this lane's value after all enabled steps have been applied.
uint WavePropogateFseTableIndex_Reference(uint x)
{
    // Largest group size that will be processed: the wave width, capped at the thread-group width.
    const uint32_t blockSize = min(WaveGetLaneCount(), kzstdgpu_TgSizeX_ParseCompressedBlocks);

    // Reads `v` from the lane whose index is ((own lane index & and_mask) | or_mask) ^ xor_mask.
    #define WAVE_SHUFFLE(v, and_mask, or_mask, xor_mask) WaveReadLaneAt(v, ((WaveGetLaneIndex() & (and_mask)) | (or_mask)) ^ (xor_mask))

    // Reads `v` from lane `group_lane` of the `group_size`-aligned group that contains the calling lane.
    #define WAVE_BROADCAST(v, group_size, group_lane) WAVE_SHUFFLE(v, ~(group_size - 1u), group_lane, 0)

    #define WAVE_PROPAGATE_STEP(p, group_size)  \
        if (blockSize >= group_size /** this condition is expected to be a compile-time condition, so no real branch */) \
        { \
            /* for every group of `group_size` consecutive lanes, broadcast the value from the last lane of the "odd" sub-group of 2x smaller size) */     \
            uint32_t b = WAVE_BROADCAST(p, group_size, group_size / 2u - 1u);                                   \
            /* for every group of `group_size` consecutive lanes */                                             \
            /* propagate element from the last lane of the "odd" sub-group of 2x smaller size  */               \
            /* into all elements of the "even" sub-group of 2x smaller size when propagated value makes sense */\
            [flatten] if ((WaveGetLaneIndex() & (group_size / 2u)))                                             \
            {                                                                                                   \
                /* We propagate only non-Repeat and not-Unused values to lanes containing Repeat/Unused values*/\
                if (p >= kzstdgpu_FseProbTableIndex_Repeat && b < kzstdgpu_FseProbTableIndex_Repeat)            \
                    p = b;                                                                                      \
            }                                                                                                   \
        }

    // Smallest groups first, so values merged by an earlier step can be carried further by later ones.
    WAVE_PROPAGATE_STEP(x, 2)
    WAVE_PROPAGATE_STEP(x, 4)
    WAVE_PROPAGATE_STEP(x, 8)
    WAVE_PROPAGATE_STEP(x, 16)
    WAVE_PROPAGATE_STEP(x, 32)
    return x;

    // Preprocessor directives are unaffected by the `return` above: the helper macros are still
    // undefined here so they do not leak into the rest of the file.
    #undef WAVE_PROPAGATE_STEP
    #undef WAVE_BROADCAST
    #undef WAVE_SHUFFLE
}

// Every active lane holds either a "filler" value or a "hole" value (never both).
//
// A hole lane takes the value of the closest lower-indexed lane that holds a filler.
// Filler lanes, and hole lanes with no filler anywhere below them, keep their own value.
//
// NOTE: only the first 32 bits of the ballot are used, so keep
// kzstdgpu_TgSizeX_ParseCompressedBlocks <= 32 (lane masks then fit in a single uint).
//
// "Wave8" example, lowest lane ID on the left, fillers are even integers and holes are odd:
//      input  = { 1, 4, 3, 3, 6, 8, 5, 5 }
//      output = { 1, 4, 4, 4, 6, 8, 8, 8 }
//
// v_value:    this lane's value.
// v_isFiller: true when v_value is a filler, false when it is a hole.
// Returns the value this lane ends up with.
uint WaveTryReplicateFillerUpwardsToHoles(uint v_value, bool v_isFiller)
{
    // Single-bit mask identifying the calling lane.
    const uint ownLaneBit = 1u << WaveGetLaneIndex();

    // One bit per lane that holds a filler (assume <= Wave32), restricted to lanes below this one.
    const uint fillerLanes = WaveActiveBallot(v_isFiller).x;
    const uint fillerLanesBelow = fillerLanes & (ownLaneBit - 1);

    // Read from ourselves when we already hold a filler or when nothing below can supply one;
    // otherwise the highest set bit of the candidates picks the nearest filler lane.
    const bool readOwnValue = v_isFiller || (fillerLanesBelow == 0);
    const uint candidateLanes = readOwnValue ? ownLaneBit : fillerLanesBelow;

    return WaveReadLaneAt(v_value, firstbithigh(candidateLanes));
}

// Single-permute propagation of table indices across the wave.
//
// Indices below kzstdgpu_FseProbTableIndex_Repeat count as fillers; Repeat and anything above it
// (which includes Unused) count as holes to be filled from the nearest lower filler lane.
//
// tableIndex: this lane's table index.
// Returns this lane's table index after propagation.
uint WavePropogateFseTableIndex_V2(uint tableIndex)
{
    return WaveTryReplicateFillerUpwardsToHoles(tableIndex,
                                                tableIndex < kzstdgpu_FseProbTableIndex_Repeat);
}

// Test output buffer. main() writes 0xEEEEEEEE to element 0 when the two implementations disagree,
// and stores each thread's V2 result at
// (combinationKey + 1) * kzstdgpu_TgSizeX_ParseCompressedBlocks + threadIdInGroup.
RWStructuredBuffer<uint32_t> uav : register(u4, space2);

// Test entry point: each thread group checks one combination of per-lane inputs and compares
// WavePropogateFseTableIndex_V2 against WavePropogateFseTableIndex_Reference.
//
// combinationKey2: group ID; x and y are combined into a single key (y * 256 + x).
// threadIdInGroup: thread index within the group.
[numthreads(kzstdgpu_TgSizeX_ParseCompressedBlocks, 1, 1)]
void main(uint2 combinationKey2 : SV_GroupId,
          uint  threadIdInGroup : SV_GroupThreadID)
{
    const uint key = combinationKey2.y * 256 + combinationKey2.x;

    // Let's cover every combination for a "wave8" (less data to deal with), with 4 possible
    // values per lane encoded as 2 bits of the key. Threads 8 and above reuse the same 8 slots;
    // their results are not really of interest.
    // Example: key 0b00'00'00'00'11'00'10'00 should produce [Unused, 2, 2, 3, 3, 3, 3, 3].
    const uint slotShift = (threadIdInGroup % 8u) * 2;
    const uint code = (key >> slotShift) & 0x3;

    // Codes 0 and 1 map to the two sentinels; codes 2 and 3 are used directly as table indices.
    const uint laneInput = (code == 0) ? kzstdgpu_FseProbTableIndex_Unused
                         : (code == 1) ? kzstdgpu_FseProbTableIndex_Repeat
                         : code;

    const uint expected = WavePropogateFseTableIndex_Reference(laneInput);
    const uint actual = WavePropogateFseTableIndex_V2(laneInput);

    // Flag any disagreement in element 0 so a single read reveals a failure.
    if (expected != actual) {
        uav[0] = 0xEEEEEEEE;
    }

    // Per-group results start one group-sized stride in, leaving the first stride for the flag.
    const uint outputIndex = (key + 1) * kzstdgpu_TgSizeX_ParseCompressedBlocks + threadIdInGroup;
    uav[outputIndex] = actual;
}

…dgpu_WavePropogateFseTableIndex. In `zstdgpu_ShaderEntry_ParseCompressedBlocks()`, replace both series of 5 `WAVE_PROPAGATE_STEP`s ``` WAVE_PROPAGATE_STEP(x, 2) WAVE_PROPAGATE_STEP(x, 4) WAVE_PROPAGATE_STEP(x, 8) WAVE_PROPAGATE_STEP(x, 16) WAVE_PROPAGATE_STEP(x, 32) ``` with ``` x = zstdgpu_WavePropogateFseTableIndex(x) ``` which is leaner since it does a single lane permute, instead of 5. It also maybe makes it a bit clearer what is being computed. --- NOTE: `zstdgpu_WavePropogateFseTableIndex` doesn't handle active lanes at or beyond index 32, but that should be easy to add if desired, and neither did the original (it would need a `WAVE_PROPAGATE_STEP(x, 64)` for wave64). A pre-processor check/`#error` on `kzstdgpu_TgSizeX_ParseCompressedBlocks` was added, similar to the existing in `ZstdGpuPrefixSequenceOffsets.hlsl` on `kzstdgpu_TgSizeX_PrefixSequenceOffsets`. **Testing** I dispatched this HLSL shader with 256\*256 = 2\**16 total groups in a side-app and inspected the output. For the first 8 lanes in a wave, this tests all combinations of `{ Unused, Repeat, 2, 3 }`. This isn't a perfect test, but I also ran `zstdgpu_demo --chk-gpu` with some real inputs. 
```hlsl // NOTE: makes lane masks easier; WaveTryReplicateFillerUpwardsToHoles doesn't bother with wave64+: static const uint32_t kzstdgpu_TgSizeX_ParseCompressedBlocks = 32; static const uint32_t kzstdgpu_FseProbTableIndex_Unused = 0x3fffffff; static const uint32_t kzstdgpu_FseProbTableIndex_Repeat = kzstdgpu_FseProbTableIndex_Unused - 1; uint WavePropogateFseTableIndex_Reference(uint x) { const uint32_t blockSize = min(WaveGetLaneCount(), kzstdgpu_TgSizeX_ParseCompressedBlocks); #define WAVE_SHUFFLE(v, and_mask, or_mask, xor_mask) WaveReadLaneAt(v, ((WaveGetLaneIndex() & (and_mask)) | (or_mask)) ^ (xor_mask)) #define WAVE_BROADCAST(v, group_size, group_lane) WAVE_SHUFFLE(v, ~(group_size - 1u), group_lane, 0) #define WAVE_PROPAGATE_STEP(p, group_size) \ if (blockSize >= group_size /** this condition is expected to be a compile-time condition, so no real branch */) \ { \ /* for every group of `group_size` consecutive lanes, broadcast the value from the last lane of the "odd" sub-group of 2x smaller size) */ \ uint32_t b = WAVE_BROADCAST(p, group_size, group_size / 2u - 1u); \ /* for every group of `group_size` consecutive lanes */ \ /* propagate element from the last lane of the "odd" sub-group of 2x smaller size */ \ /* into all elements of the "even" sub-group of 2x smaller size when propagated value makes sense */\ [flatten] if ((WaveGetLaneIndex() & (group_size / 2u))) \ { \ /* We propagate only non-Repeat and not-Unused values to lanes containing Repeat/Unused values*/\ if (p >= kzstdgpu_FseProbTableIndex_Repeat && b < kzstdgpu_FseProbTableIndex_Repeat) \ p = b; \ } \ } WAVE_PROPAGATE_STEP(x, 2) WAVE_PROPAGATE_STEP(x, 4) WAVE_PROPAGATE_STEP(x, 8) WAVE_PROPAGATE_STEP(x, 16) WAVE_PROPAGATE_STEP(x, 32) return x; #undef WAVE_PROPAGATE_STEP #undef WAVE_BROADCAST #undef WAVE_SHUFFLE } // Active lanes either contain a "filler" xor a "hole" value. 
// // If a lane with a hole value can't have a filler value propagated to it from a lower lane, // its value is unchanged (remains a hole). // // NOTE: ensure kzstdgpu_TgSizeX_ParseCompressedBlocks <= 32 // so HLSL lane masks are easy to work with. // // Example with lower lane IDs on the left for "Wave8" where filler values are even integers (holes are odd integers): // input = { 1, 4, 3, 3, 6, 8, 5, 5 } // output = { 1, 4, 4, 4, 6, 8, 8, 8 } uint WaveTryReplicateFillerUpwardsToHoles(uint v_value, bool v_isFiller) { const uint s_hasFillerMask = WaveActiveBallot(v_isFiller).x; // assume <= Wave32 const uint v_selfMask = 1u << WaveGetLaneIndex(); uint v_srcLanesMask = s_hasFillerMask & (v_selfMask - 1); // If this lane already has a filler value, or it has no lane with a filler value to read from, make it read from itself: if (v_isFiller || v_srcLanesMask == 0) { v_srcLanesMask = v_selfMask; } return WaveReadLaneAt(v_value, firstbithigh(v_srcLanesMask)); } uint WavePropogateFseTableIndex_V2(uint tableIndex) { const bool isFiller = tableIndex < kzstdgpu_FseProbTableIndex_Repeat; return WaveTryReplicateFillerUpwardsToHoles(tableIndex, isFiller); } RWStructuredBuffer<uint32_t> uav : register(u4, space2); [numthreads(kzstdgpu_TgSizeX_ParseCompressedBlocks, 1, 1)] void main(uint2 combinationKey2 : SV_GroupId, uint threadIdInGroup : SV_GroupThreadID) { const uint combinationKey = combinationKey2.y * 256 + combinationKey2.x; // Lets test all combinations of "wave8" (to deal with less data) with up to 4 values per lane. // The shader is actually wave32; we don't really care about values at lane index 8+. 
// For a combination key of 0b00'00'00'00'11'00'10'00, the output should be [Unused, 2, 2, 3, 3, 3, 3, 3] const uint slotId = (threadIdInGroup % 8u); uint v = (combinationKey >> (slotId * 2)) & 0x3; if (v == 0) { v = kzstdgpu_FseProbTableIndex_Unused; } else if (v == 1) { v = kzstdgpu_FseProbTableIndex_Repeat; } const uint output_ref = WavePropogateFseTableIndex_Reference(v); const uint output_v2 = WavePropogateFseTableIndex_V2(v); if (output_ref != output_v2) { uav[0] = 0xEEEEEEEE; } uav[(combinationKey + 1) * kzstdgpu_TgSizeX_ParseCompressedBlocks + threadIdInGroup] = output_v2; } ```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

zstd: Replace 5-permute WAVE_PROPAGATE_STEP series with 1-permute zstdgpu_WavePropogateFseTableIndex.#105

zstd: Replace 5-permute WAVE_PROPAGATE_STEP series with 1-permute zstdgpu_WavePropogateFseTableIndex.#105
Jonathan-Weinstein-AMD wants to merge 1 commit into
microsoft:development from
Jonathan-Weinstein-AMD:zstd-propogate-fse-index-within-wave-opt

Jonathan-Weinstein-AMD commented May 15, 2026

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

1 participant

Conversation

Jonathan-Weinstein-AMD commented May 15, 2026

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

1 participant