Skip to content

zstd: Replace 5-permute WAVE_PROPAGATE_STEP series with 1-permute zstdgpu_WavePropogateFseTableIndex.#105

Open
Jonathan-Weinstein-AMD wants to merge 1 commit into
microsoft:developmentfrom
Jonathan-Weinstein-AMD:zstd-propogate-fse-index-within-wave-opt
Open

zstd: Replace 5-permute WAVE_PROPAGATE_STEP series with 1-permute zstdgpu_WavePropogateFseTableIndex.#105
Jonathan-Weinstein-AMD wants to merge 1 commit into
microsoft:developmentfrom
Jonathan-Weinstein-AMD:zstd-propogate-fse-index-within-wave-opt

Conversation

@Jonathan-Weinstein-AMD
Copy link
Copy Markdown

In zstdgpu_ShaderEntry_ParseCompressedBlocks(), replace both series of 5 WAVE_PROPAGATE_STEPs

    WAVE_PROPAGATE_STEP(x,  2)
    WAVE_PROPAGATE_STEP(x,  4)
    WAVE_PROPAGATE_STEP(x,  8)
    WAVE_PROPAGATE_STEP(x, 16)
    WAVE_PROPAGATE_STEP(x, 32)

with

    x = zstdgpu_WavePropogateFseTableIndex(x)

which is leaner since it does a single lane permute instead of 5. It may also make it a bit clearer what is being computed.


NOTE: zstdgpu_WavePropogateFseTableIndex doesn't handle active lanes at or beyond index 32, but neither did the original (it would need a WAVE_PROPAGATE_STEP(x, 64) for wave64), and that should be easy to add if desired. A pre-processor check/#error on kzstdgpu_TgSizeX_ParseCompressedBlocks was added, similar to the existing check in ZstdGpuPrefixSequenceOffsets.hlsl on kzstdgpu_TgSizeX_PrefixSequenceOffsets.

Testing

I dispatched this HLSL shader with 256*256 = 2**16 total groups in a side-app and inspected the output. For the first 8 lanes in a wave, this tests all combinations of { Unused, Repeat, 2, 3 }. This isn't a perfect test, but I also ran zstdgpu_demo --chk-gpu with some real inputs.

// NOTE: makes lane masks easier; WaveTryReplicateFillerUpwardsToHoles doesn't bother with wave64+:
// Thread-group width used by [numthreads] on main() in this listing. It is also the cap applied to
// WaveGetLaneCount() in WavePropogateFseTableIndex_Reference.
static const uint32_t kzstdgpu_TgSizeX_ParseCompressedBlocks = 32;

// Sentinel table-index values. The propagate functions in this listing treat any value below
// kzstdgpu_FseProbTableIndex_Repeat as a real index that may be copied to higher lanes, and any
// value >= kzstdgpu_FseProbTableIndex_Repeat (which includes both sentinels) as a slot to overwrite.
static const uint32_t kzstdgpu_FseProbTableIndex_Unused = 0x3fffffff;
static const uint32_t kzstdgpu_FseProbTableIndex_Repeat = kzstdgpu_FseProbTableIndex_Unused - 1;

// Reference implementation: the series of five WAVE_PROPAGATE_STEP passes (group sizes 2, 4, 8,
// 16, 32) that the single-permute version is compared against.
//
// Each pass works on aligned groups of `group_size` consecutive lanes. Lanes in the upper half of
// a group (lane-index bit `group_size / 2` set) look at the value held by the last lane of the
// lower half, and take it only when their own value is >= kzstdgpu_FseProbTableIndex_Repeat and
// the looked-up value is < kzstdgpu_FseProbTableIndex_Repeat. Lanes in the lower half are never
// modified by that pass.
//
// x       : this lane's table index (possibly a Repeat/Unused sentinel).
// returns : this lane's value after all passes.
uint WavePropogateFseTableIndex_Reference(uint x)
{
    // A pass is skipped when its group size exceeds this value.
    const uint32_t blockSize = min(WaveGetLaneCount(), kzstdgpu_TgSizeX_ParseCompressedBlocks);

    // Reads `v` from the lane whose index is ((own lane index & and_mask) | or_mask) ^ xor_mask.
    #define WAVE_SHUFFLE(v, and_mask, or_mask, xor_mask) WaveReadLaneAt(v, ((WaveGetLaneIndex() & (and_mask)) | (or_mask)) ^ (xor_mask))

    // Reads `v` from lane `group_lane` of the aligned group of `group_size` lanes containing this lane.
    // The mask ~(group_size - 1u) only clears the in-group bits when group_size is a power of two.
    #define WAVE_BROADCAST(v, group_size, group_lane) WAVE_SHUFFLE(v, ~(group_size - 1u), group_lane, 0)

    // One propagation pass; see the function header for what it does to `p`.
    #define WAVE_PROPAGATE_STEP(p, group_size)  \
        if (blockSize >= group_size /** this condition is expected to be a compile-time condition, so no real branch */) \
        { \
            /* for every group of `group_size` consecutive lanes, broadcast the value from the last lane of the "odd" sub-group of 2x smaller size) */     \
            uint32_t b = WAVE_BROADCAST(p, group_size, group_size / 2u - 1u);                                   \
            /* for every group of `group_size` consecutive lanes */                                             \
            /* propagate element from the last lane of the "odd" sub-group of 2x smaller size  */               \
            /* into all elements of the "even" sub-group of 2x smaller size when propagated value makes sense */\
            [flatten] if ((WaveGetLaneIndex() & (group_size / 2u)))                                             \
            {                                                                                                   \
                /* We propagate only non-Repeat and not-Unused values to lanes containing Repeat/Unused values*/\
                if (p >= kzstdgpu_FseProbTableIndex_Repeat && b < kzstdgpu_FseProbTableIndex_Repeat)            \
                    p = b;                                                                                      \
            }                                                                                                   \
        }

    // Passes run from the smallest group to the largest, so each pass sees the results of the previous ones.
    WAVE_PROPAGATE_STEP(x, 2)
    WAVE_PROPAGATE_STEP(x, 4)
    WAVE_PROPAGATE_STEP(x, 8)
    WAVE_PROPAGATE_STEP(x, 16)
    WAVE_PROPAGATE_STEP(x, 32)
    return x;

    // Preprocessor directives are unaffected by the `return` above: the helper macros are still
    // undefined from this point on.
    #undef WAVE_PROPAGATE_STEP
    #undef WAVE_BROADCAST
    #undef WAVE_SHUFFLE
}

// Every active lane holds either a "filler" value or a "hole" value (never both).
//
// A lane holding a hole receives the value of the nearest lower lane that holds a filler.
// When no lower lane holds a filler, the hole is returned unchanged. Lanes holding a filler
// always return their own value.
//
// NOTE: keep kzstdgpu_TgSizeX_ParseCompressedBlocks <= 32 so that a single 32-bit lane mask
// is enough; only the first component of the ballot is inspected.
//
// "Wave8" example, lowest lane on the left, fillers are even integers and holes are odd:
//      input  = { 1, 4, 3, 3, 6, 8, 5, 5 }
//      output = { 1, 4, 4, 4, 6, 8, 8, 8 }
//
// v_value    : this lane's value.
// v_isFiller : true when v_value is a filler, false when it is a hole.
uint WaveTryReplicateFillerUpwardsToHoles(uint v_value, bool v_isFiller)
{
    // Bit i is set when lane i holds a filler (assume <= Wave32).
    const uint s_fillerLanes = WaveActiveBallot(v_isFiller).x;
    const uint v_ownLaneBit = 1u << WaveGetLaneIndex();

    // Filler lanes strictly below this one.
    const uint v_fillersBelow = s_fillerLanes & (v_ownLaneBit - 1);

    // A lane reads from itself when it is already a filler or when nothing below can fill it;
    // otherwise it reads from the highest filler lane below it.
    const bool v_readsSelf = v_isFiller || (v_fillersBelow == 0);
    const uint v_sourceMask = v_readsSelf ? v_ownLaneBit : v_fillersBelow;

    return WaveReadLaneAt(v_value, firstbithigh(v_sourceMask));
}

// Single-permute propagation of FSE table indices: a lane whose tableIndex is
// >= kzstdgpu_FseProbTableIndex_Repeat is a hole, anything smaller is a filler.
// Returns the value WaveTryReplicateFillerUpwardsToHoles produces for this lane.
uint WavePropogateFseTableIndex_V2(uint tableIndex)
{
    return WaveTryReplicateFillerUpwardsToHoles(
        tableIndex,
        tableIndex < kzstdgpu_FseProbTableIndex_Repeat);
}

// Output buffer. Element 0 is the mismatch flag; each group's per-thread results start at
// element (combinationKey + 1) * kzstdgpu_TgSizeX_ParseCompressedBlocks.
RWStructuredBuffer<uint32_t> uav : register(u4, space2);

// Test entry point: each thread group derives one combination key from its group ID, builds a
// per-lane input from it, runs both implementations and records the V2 result.
[numthreads(kzstdgpu_TgSizeX_ParseCompressedBlocks, 1, 1)]
void main(uint2 combinationKey2 : SV_GroupId,
          uint  threadIdInGroup : SV_GroupThreadID)
{
    const uint combinationKey = combinationKey2.x + 256 * combinationKey2.y;

    // Let's test all combinations of "wave8" (to deal with less data) with up to 4 values per lane.
    // The shader is actually wave32; we don't really care about values at lane index 8+,
    // which simply repeat the 8-lane pattern.
    // For a combination key of 0b00'00'00'00'11'00'10'00, the output should be [Unused, 2, 2, 3, 3, 3, 3, 3]
    const uint patternLane = threadIdInGroup & 7u;
    const uint code = (combinationKey >> (2 * patternLane)) & 0x3;

    // Codes 0 and 1 stand for the two sentinels; codes 2 and 3 are used as table indices directly.
    const uint laneInput = (code == 0) ? kzstdgpu_FseProbTableIndex_Unused
                         : (code == 1) ? kzstdgpu_FseProbTableIndex_Repeat
                         : code;

    const uint expected = WavePropogateFseTableIndex_Reference(laneInput);
    const uint actual = WavePropogateFseTableIndex_V2(laneInput);

    // Any disagreement between the two implementations raises the flag in element 0.
    if (actual != expected) {
        uav[0] = 0xEEEEEEEE;
    }

    const uint resultIndex = (combinationKey + 1) * kzstdgpu_TgSizeX_ParseCompressedBlocks + threadIdInGroup;
    uav[resultIndex] = actual;
}

…dgpu_WavePropogateFseTableIndex.

In `zstdgpu_ShaderEntry_ParseCompressedBlocks()`, replace both series of 5 `WAVE_PROPAGATE_STEP`s
```
    WAVE_PROPAGATE_STEP(x,  2)
    WAVE_PROPAGATE_STEP(x,  4)
    WAVE_PROPAGATE_STEP(x,  8)
    WAVE_PROPAGATE_STEP(x, 16)
    WAVE_PROPAGATE_STEP(x, 32)
```
with
```
    x = zstdgpu_WavePropogateFseTableIndex(x)
```
which is leaner since it does a single lane permute instead of 5. It may also make it a bit clearer what is being computed.

---

NOTE: `zstdgpu_WavePropogateFseTableIndex` doesn't handle active lanes at or beyond index 32, but neither did the original (it would need a `WAVE_PROPAGATE_STEP(x, 64)` for wave64), and that should be easy to add if desired. A pre-processor check/`#error` on `kzstdgpu_TgSizeX_ParseCompressedBlocks` was added, similar to the existing check in `ZstdGpuPrefixSequenceOffsets.hlsl` on `kzstdgpu_TgSizeX_PrefixSequenceOffsets`.

**Testing**

I dispatched this HLSL shader with 256\*256 = 2\**16 total groups in a side-app and inspected the output. For the first 8 lanes in a wave, this tests all combinations of `{ Unused, Repeat, 2, 3 }`. This isn't a perfect test, but I also ran `zstdgpu_demo --chk-gpu` with some real inputs.

```hlsl
// NOTE: makes lane masks easier; WaveTryReplicateFillerUpwardsToHoles doesn't bother with wave64+:
// Thread-group width used by [numthreads] on main() in this listing. It is also the cap applied to
// WaveGetLaneCount() in WavePropogateFseTableIndex_Reference.
static const uint32_t kzstdgpu_TgSizeX_ParseCompressedBlocks = 32;

// Sentinel table-index values. The propagate functions in this listing treat any value below
// kzstdgpu_FseProbTableIndex_Repeat as a real index that may be copied to higher lanes, and any
// value >= kzstdgpu_FseProbTableIndex_Repeat (which includes both sentinels) as a slot to overwrite.
static const uint32_t kzstdgpu_FseProbTableIndex_Unused = 0x3fffffff;
static const uint32_t kzstdgpu_FseProbTableIndex_Repeat = kzstdgpu_FseProbTableIndex_Unused - 1;

// Reference implementation: the series of five WAVE_PROPAGATE_STEP passes (group sizes 2, 4, 8,
// 16, 32) that the single-permute version is compared against.
//
// Each pass works on aligned groups of `group_size` consecutive lanes. Lanes in the upper half of
// a group (lane-index bit `group_size / 2` set) look at the value held by the last lane of the
// lower half, and take it only when their own value is >= kzstdgpu_FseProbTableIndex_Repeat and
// the looked-up value is < kzstdgpu_FseProbTableIndex_Repeat. Lanes in the lower half are never
// modified by that pass.
//
// x       : this lane's table index (possibly a Repeat/Unused sentinel).
// returns : this lane's value after all passes.
uint WavePropogateFseTableIndex_Reference(uint x)
{
    // A pass is skipped when its group size exceeds this value.
    const uint32_t blockSize = min(WaveGetLaneCount(), kzstdgpu_TgSizeX_ParseCompressedBlocks);

    // Reads `v` from the lane whose index is ((own lane index & and_mask) | or_mask) ^ xor_mask.
    #define WAVE_SHUFFLE(v, and_mask, or_mask, xor_mask) WaveReadLaneAt(v, ((WaveGetLaneIndex() & (and_mask)) | (or_mask)) ^ (xor_mask))

    // Reads `v` from lane `group_lane` of the aligned group of `group_size` lanes containing this lane.
    // The mask ~(group_size - 1u) only clears the in-group bits when group_size is a power of two.
    #define WAVE_BROADCAST(v, group_size, group_lane) WAVE_SHUFFLE(v, ~(group_size - 1u), group_lane, 0)

    // One propagation pass; see the function header for what it does to `p`.
    #define WAVE_PROPAGATE_STEP(p, group_size)  \
        if (blockSize >= group_size /** this condition is expected to be a compile-time condition, so no real branch */) \
        { \
            /* for every group of `group_size` consecutive lanes, broadcast the value from the last lane of the "odd" sub-group of 2x smaller size) */     \
            uint32_t b = WAVE_BROADCAST(p, group_size, group_size / 2u - 1u);                                   \
            /* for every group of `group_size` consecutive lanes */                                             \
            /* propagate element from the last lane of the "odd" sub-group of 2x smaller size  */               \
            /* into all elements of the "even" sub-group of 2x smaller size when propagated value makes sense */\
            [flatten] if ((WaveGetLaneIndex() & (group_size / 2u)))                                             \
            {                                                                                                   \
                /* We propagate only non-Repeat and not-Unused values to lanes containing Repeat/Unused values*/\
                if (p >= kzstdgpu_FseProbTableIndex_Repeat && b < kzstdgpu_FseProbTableIndex_Repeat)            \
                    p = b;                                                                                      \
            }                                                                                                   \
        }

    // Passes run from the smallest group to the largest, so each pass sees the results of the previous ones.
    WAVE_PROPAGATE_STEP(x, 2)
    WAVE_PROPAGATE_STEP(x, 4)
    WAVE_PROPAGATE_STEP(x, 8)
    WAVE_PROPAGATE_STEP(x, 16)
    WAVE_PROPAGATE_STEP(x, 32)
    return x;

    // Preprocessor directives are unaffected by the `return` above: the helper macros are still
    // undefined from this point on.
    #undef WAVE_PROPAGATE_STEP
    #undef WAVE_BROADCAST
    #undef WAVE_SHUFFLE
}

// Every active lane holds either a "filler" value or a "hole" value (never both).
//
// A lane holding a hole receives the value of the nearest lower lane that holds a filler.
// When no lower lane holds a filler, the hole is returned unchanged. Lanes holding a filler
// always return their own value.
//
// NOTE: keep kzstdgpu_TgSizeX_ParseCompressedBlocks <= 32 so that a single 32-bit lane mask
// is enough; only the first component of the ballot is inspected.
//
// "Wave8" example, lowest lane on the left, fillers are even integers and holes are odd:
//      input  = { 1, 4, 3, 3, 6, 8, 5, 5 }
//      output = { 1, 4, 4, 4, 6, 8, 8, 8 }
//
// v_value    : this lane's value.
// v_isFiller : true when v_value is a filler, false when it is a hole.
uint WaveTryReplicateFillerUpwardsToHoles(uint v_value, bool v_isFiller)
{
    // Bit i is set when lane i holds a filler (assume <= Wave32).
    const uint s_fillerLanes = WaveActiveBallot(v_isFiller).x;
    const uint v_ownLaneBit = 1u << WaveGetLaneIndex();

    // Filler lanes strictly below this one.
    const uint v_fillersBelow = s_fillerLanes & (v_ownLaneBit - 1);

    // A lane reads from itself when it is already a filler or when nothing below can fill it;
    // otherwise it reads from the highest filler lane below it.
    const bool v_readsSelf = v_isFiller || (v_fillersBelow == 0);
    const uint v_sourceMask = v_readsSelf ? v_ownLaneBit : v_fillersBelow;

    return WaveReadLaneAt(v_value, firstbithigh(v_sourceMask));
}

// Single-permute propagation of FSE table indices: a lane whose tableIndex is
// >= kzstdgpu_FseProbTableIndex_Repeat is a hole, anything smaller is a filler.
// Returns the value WaveTryReplicateFillerUpwardsToHoles produces for this lane.
uint WavePropogateFseTableIndex_V2(uint tableIndex)
{
    return WaveTryReplicateFillerUpwardsToHoles(
        tableIndex,
        tableIndex < kzstdgpu_FseProbTableIndex_Repeat);
}

// Output buffer. Element 0 is the mismatch flag; each group's per-thread results start at
// element (combinationKey + 1) * kzstdgpu_TgSizeX_ParseCompressedBlocks.
RWStructuredBuffer<uint32_t> uav : register(u4, space2);

// Test entry point: each thread group derives one combination key from its group ID, builds a
// per-lane input from it, runs both implementations and records the V2 result.
[numthreads(kzstdgpu_TgSizeX_ParseCompressedBlocks, 1, 1)]
void main(uint2 combinationKey2 : SV_GroupId,
          uint  threadIdInGroup : SV_GroupThreadID)
{
    const uint combinationKey = combinationKey2.x + 256 * combinationKey2.y;

    // Let's test all combinations of "wave8" (to deal with less data) with up to 4 values per lane.
    // The shader is actually wave32; we don't really care about values at lane index 8+,
    // which simply repeat the 8-lane pattern.
    // For a combination key of 0b00'00'00'00'11'00'10'00, the output should be [Unused, 2, 2, 3, 3, 3, 3, 3]
    const uint patternLane = threadIdInGroup & 7u;
    const uint code = (combinationKey >> (2 * patternLane)) & 0x3;

    // Codes 0 and 1 stand for the two sentinels; codes 2 and 3 are used as table indices directly.
    const uint laneInput = (code == 0) ? kzstdgpu_FseProbTableIndex_Unused
                         : (code == 1) ? kzstdgpu_FseProbTableIndex_Repeat
                         : code;

    const uint expected = WavePropogateFseTableIndex_Reference(laneInput);
    const uint actual = WavePropogateFseTableIndex_V2(laneInput);

    // Any disagreement between the two implementations raises the flag in element 0.
    if (actual != expected) {
        uav[0] = 0xEEEEEEEE;
    }

    const uint resultIndex = (combinationKey + 1) * kzstdgpu_TgSizeX_ParseCompressedBlocks + threadIdInGroup;
    uav[resultIndex] = actual;
}
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant