diff --git a/apps/typegpu-docs/src/examples/algorithms/mnist-inference/data.ts b/apps/typegpu-docs/src/examples/algorithms/mnist-inference/data.ts index 7f9ddd7694..32ffa367c9 100644 --- a/apps/typegpu-docs/src/examples/algorithms/mnist-inference/data.ts +++ b/apps/typegpu-docs/src/examples/algorithms/mnist-inference/data.ts @@ -1,40 +1,20 @@ -import tgpu, { d, type StorageFlag, type TgpuBuffer } from 'typegpu'; - -export const ReadonlyFloats = { - storage: d.arrayOf(d.f32), - access: 'readonly', -} as const; - -export const MutableFloats = { - storage: d.arrayOf(d.f32), - access: 'mutable', -} as const; - -export const ioLayout = tgpu.bindGroupLayout({ - input: ReadonlyFloats, - output: MutableFloats, -}); - -export const weightsBiasesLayout = tgpu.bindGroupLayout({ - weights: ReadonlyFloats, - biases: ReadonlyFloats, -}); +import { d, type StorageFlag, type TgpuBuffer } from 'typegpu'; export interface LayerData { shape: readonly [number] | readonly [number, number]; - buffer: TgpuBuffer> & StorageFlag; + buffer: TgpuBuffer> & StorageFlag; } export interface Layer { - weights: TgpuBuffer> & StorageFlag; - biases: TgpuBuffer> & StorageFlag; - state: TgpuBuffer> & StorageFlag; + weights: TgpuBuffer> & StorageFlag; + biases: TgpuBuffer> & StorageFlag; + state: TgpuBuffer> & StorageFlag; } export interface Network { layers: Layer[]; - input: TgpuBuffer> & StorageFlag; - output: TgpuBuffer> & StorageFlag; + input: TgpuBuffer> & StorageFlag; + output: TgpuBuffer> & StorageFlag; inference(data: number[]): Promise; } diff --git a/apps/typegpu-docs/src/examples/algorithms/mnist-inference/helpers.ts b/apps/typegpu-docs/src/examples/algorithms/mnist-inference/helpers.ts index ea015559a3..cd28150a09 100644 --- a/apps/typegpu-docs/src/examples/algorithms/mnist-inference/helpers.ts +++ b/apps/typegpu-docs/src/examples/algorithms/mnist-inference/helpers.ts @@ -37,7 +37,10 @@ function getLayerData(layer: ArrayBuffer): { }; } -export function downloadLayers(root: TgpuRoot): 
Promise<[LayerData, LayerData][]> { +export function downloadLayers( + root: TgpuRoot, + floatSchema: d.F32 | d.F16, +): Promise<[LayerData, LayerData][]> { const downloadLayer = async (fileName: string): Promise => { const buffer = await fetch(`/TypeGPU/assets/mnist-weights/${fileName}`).then((res) => res.arrayBuffer(), ); @@ -46,7 +49,7 @@ export function downloadLayers(root: TgpuRoot): Promise<[LayerData, LayerData][] const { shape, data } = getLayerData(buffer); const layerBuffer = root - .createBuffer(d.arrayOf(d.f32, data.length), [...data]) + .createBuffer(d.arrayOf(floatSchema, data.length), [...data]) .$usage('storage'); return { diff --git a/apps/typegpu-docs/src/examples/algorithms/mnist-inference/index.ts b/apps/typegpu-docs/src/examples/algorithms/mnist-inference/index.ts index 4fe7ed0d89..4aff083e17 100644 --- a/apps/typegpu-docs/src/examples/algorithms/mnist-inference/index.ts +++ b/apps/typegpu-docs/src/examples/algorithms/mnist-inference/index.ts @@ -1,38 +1,58 @@ import tgpu, { d, std } from 'typegpu'; -import { ioLayout, type LayerData, type Network, weightsBiasesLayout } from './data.ts'; +import type { LayerData, Network } from './data.ts'; import { downloadLayers } from './helpers.ts'; import { defineControls } from '../../common/defineControls.ts'; const SIZE = 28; +const WORKGROUP_SIZE = 64; const root = await tgpu.init({ - device: { - optionalFeatures: ['timestamp-query', 'subgroups'], - }, + device: { optionalFeatures: ['timestamp-query', 'subgroups', 'shader-f16'] }, }); const hasTimestampQuery = root.enabledFeatures.has('timestamp-query'); const hasSubgroups = root.enabledFeatures.has('subgroups'); +const hasShaderF16 = root.enabledFeatures.has('shader-f16'); let useSubgroups = hasSubgroups; +const float = hasShaderF16 ?
d.f16 : d.f32; + +const ioLayout = tgpu.bindGroupLayout({ + input: { storage: d.arrayOf(float) }, + output: { + storage: d.arrayOf(float), + access: 'mutable', + }, +}); + +const weightsBiasesLayout = tgpu.bindGroupLayout({ + weights: { storage: d.arrayOf(float) }, + biases: { storage: d.arrayOf(float) }, +}); + const canvasData = Array.from({ length: SIZE ** 2 }, () => 0); // Shaders -const relu = tgpu.fn([d.f32], d.f32)((x) => std.max(0, x)); +function relu(x: number): number { + 'use gpu'; + return std.max(0, x); +} const defaultCompute = tgpu.computeFn({ - in: { - gid: d.builtin.globalInvocationId, - }, - workgroupSize: [1], + in: { gid: d.builtin.globalInvocationId }, + workgroupSize: [WORKGROUP_SIZE], })(({ gid }) => { - const inputSize = ioLayout.$.input.length; - const i = gid.x; + const outLen = ioLayout.$.output.length; + if (i >= outLen) { + return; + } + + const inputSize = ioLayout.$.input.length; const weightsOffset = i * inputSize; - let sum = d.f32(); + let sum = float(); - for (let j = d.u32(); j < inputSize; j++) { + for (let j = d.u32(0); j < inputSize; j++) { sum = std.fma(ioLayout.$.input[j], weightsBiasesLayout.$.weights[weightsOffset + j], sum); } @@ -40,30 +60,30 @@ const defaultCompute = tgpu.computeFn({ ioLayout.$.output[i] = relu(total); }); -const workgroupSize = tgpu.const(d.u32, 128); const subgroupCompute = tgpu.computeFn({ in: { - lid: d.builtin.localInvocationId, wid: d.builtin.workgroupId, sid: d.builtin.subgroupInvocationId, - ssize: d.builtin.subgroupSize, + sgid: d.builtin.subgroupId, + nsg: d.builtin.numSubgroups, }, - workgroupSize: [128], -})(({ lid, wid, sid, ssize }) => { - const subgroupId = d.u32(lid.x / ssize); - const outputsPerWG = d.u32(workgroupSize.$ / ssize); - const neuronIndex = wid.x * outputsPerWG + subgroupId; - + workgroupSize: [WORKGROUP_SIZE], +})(({ wid, sid, sgid, nsg }) => { const outLen = ioLayout.$.output.length; + const inputSize = ioLayout.$.input.length; + + const neuronIndex = wid.x * nsg + sgid; 
const valid = neuronIndex < outLen; - const inputSize = ioLayout.$.input.length; + // Actual number of active lanes in this subgroup. + const laneCount = std.subgroupAdd(1); - let partial = d.f32(); + let partial = float(0); if (valid) { const weightsOffset = neuronIndex * inputSize; - for (let j = sid; j < inputSize; j += ssize) { + + for (let j = sid; j < inputSize; j += laneCount) { partial = std.fma( ioLayout.$.input[j], weightsBiasesLayout.$.weights[weightsOffset + j], @@ -74,7 +94,7 @@ const subgroupCompute = tgpu.computeFn({ const sum = std.subgroupAdd(partial); - if (valid && sid === 0) { + if (valid && std.subgroupElect()) { ioLayout.$.output[neuronIndex] = relu(sum + weightsBiasesLayout.$.biases[neuronIndex]); } }); @@ -107,11 +127,11 @@ function createNetwork(layers: [LayerData, LayerData][]): Network { return { weights: weights.buffer, biases: biases.buffer, - state: root.createBuffer(d.arrayOf(d.f32, biases.shape[0])).$usage('storage'), + state: root.createBuffer(d.arrayOf(float, biases.shape[0])).$usage('storage'), }; }); - const input = root.createBuffer(d.arrayOf(d.f32, layers[0][0].shape[0])).$usage('storage'); + const input = root.createBuffer(d.arrayOf(float, layers[0][0].shape[0])).$usage('storage'); const output = buffers[buffers.length - 1].state; const ioBindGroups = buffers.map((_, i) => @@ -137,7 +157,8 @@ function createNetwork(layers: [LayerData, LayerData][]): Network { } input.write(data); - const pipeline = useSubgroups && pipelines.subgroup ? pipelines.subgroup : pipelines.default; + const subgroupPipeline = useSubgroups ? pipelines.subgroup : null; + const pipeline = subgroupPipeline ?? 
pipelines.default; // Run the network for (let i = 0; i < buffers.length; i++) { @@ -155,7 +176,10 @@ function createNetwork(layers: [LayerData, LayerData][]): Network { boundPipeline = boundPipeline.withTimestampWrites(descriptor); } - boundPipeline.dispatchWorkgroups(buffers[i].biases.dataType.elementCount); + const outputCount = buffers[i].biases.dataType.elementCount; + boundPipeline.dispatchWorkgroups( + subgroupPipeline ? outputCount : Math.ceil(outputCount / WORKGROUP_SIZE), + ); } if (querySet?.available) { @@ -180,7 +204,7 @@ function createNetwork(layers: [LayerData, LayerData][]): Network { }; } -const network = createNetwork(await downloadLayers(root)); +const network = createNetwork(await downloadLayers(root, float)); // #region Example controls and cleanup @@ -386,7 +410,7 @@ export const controls = defineControls({ 'Test Resolution': import.meta.env.DEV && { onButtonClick: () => [defaultCompute, subgroupCompute] - .map((fn) => tgpu.resolve([fn], { enableExtensions: ['subgroups'] })) + .map((fn) => tgpu.resolve([fn], { enableExtensions: ['subgroups', 'f16'] })) .map((r) => root.device.createShaderModule({ code: r })), }, }); diff --git a/apps/typegpu-docs/tests/individual-example-tests/mnist-inference.test.ts b/apps/typegpu-docs/tests/individual-example-tests/mnist-inference.test.ts index 2174bedee4..9a852ab94b 100644 --- a/apps/typegpu-docs/tests/individual-example-tests/mnist-inference.test.ts +++ b/apps/typegpu-docs/tests/individual-example-tests/mnist-inference.test.ts @@ -24,6 +24,9 @@ describe('mnist inference example', () => { expect(shaderCodes).toMatchInlineSnapshot(` "enable subgroups; + enable f16; + + @group(0) @binding(1) var output: array; @group(0) @binding(0) var input: array; @@ -31,15 +34,17 @@ describe('mnist inference example', () => { @group(1) @binding(1) var biases: array; - @group(0) @binding(1) var output: array; - fn relu(x: f32) -> f32 { return max(0f, x); } - @compute @workgroup_size(1) fn 
defaultCompute(@builtin(global_invocation_id) gid: vec3u) { - let inputSize = arrayLength(&input); + @compute @workgroup_size(64) fn defaultCompute(@builtin(global_invocation_id) gid: vec3u) { let i = gid.x; + let outLen = arrayLength(&output); + if ((i >= outLen)) { + return; + } + let inputSize = arrayLength(&input); let weightsOffset = (i * inputSize); var sum = 0f; for (var j = 0u; (j < inputSize); j++) { @@ -50,8 +55,7 @@ describe('mnist inference example', () => { } enable subgroups; - - const workgroupSize: u32 = 128u; + enable f16; @group(0) @binding(1) var output: array; @@ -65,22 +69,21 @@ describe('mnist inference example', () => { return max(0f, x); } - @compute @workgroup_size(128) fn subgroupCompute(@builtin(local_invocation_id) lid: vec3u, @builtin(workgroup_id) wid: vec3u, @builtin(subgroup_invocation_id) sid: u32, @builtin(subgroup_size) ssize: u32) { - let subgroupId = u32((f32(lid.x) / f32(ssize))); - let outputsPerWG = u32((f32(workgroupSize) / f32(ssize))); - let neuronIndex = ((wid.x * outputsPerWG) + subgroupId); + @compute @workgroup_size(64) fn subgroupCompute(@builtin(workgroup_id) wid: vec3u, @builtin(subgroup_invocation_id) sid: u32, @builtin(subgroup_id) sgid: u32, @builtin(num_subgroups) nsg: u32) { let outLen = arrayLength(&output); - let valid = (neuronIndex < outLen); let inputSize = arrayLength(&input); + let neuronIndex = ((wid.x * nsg) + sgid); + let valid = (neuronIndex < outLen); + let laneCount = subgroupAdd(1); var partial = 0f; if (valid) { let weightsOffset = (neuronIndex * inputSize); - for (var j = sid; (j < inputSize); j += ssize) { + for (var j = sid; (j < inputSize); j += u32(laneCount)) { partial = fma(input[j], weights[(weightsOffset + j)], partial); } } let sum = subgroupAdd(partial); - if ((valid && (sid == 0u))) { + if ((valid && subgroupElect())) { output[neuronIndex] = relu((sum + biases[neuronIndex])); } }"