Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
4fc0a38
Implemented persistent nvfp4 kernel
Oleg-Goncharov Nov 21, 2025
03198ae
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 21, 2025
18211de
Merge branch 'main' into pr_nvfp4_persistent_kernel
Oleg-Goncharov Nov 21, 2025
4f1e8d8
Fix FP4 guard in ptx
Oleg-Goncharov Nov 22, 2025
236d7ee
Fix
Oleg-Goncharov Nov 22, 2025
558c126
Fix in ptx. reduxf32 guard
Oleg-Goncharov Nov 22, 2025
a7a0652
Fix in ptx. reduxf32 guard
Oleg-Goncharov Nov 22, 2025
c8062d3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 22, 2025
b14b3fd
Fix
Oleg-Goncharov Nov 22, 2025
f9cf5e0
Fixes per PR review
Oleg-Goncharov Dec 8, 2025
c2fd9f0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 8, 2025
bd06fb2
Merge branch 'main' into pr_nvfp4_persistent_kernel
Oleg-Goncharov Dec 8, 2025
452ea66
Fixes per PR review. Added parameter to turn off the persistency
Oleg-Goncharov Dec 8, 2025
3eb453b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 8, 2025
7b11f00
Modified reference CPU implementation in C++ unit tests to match GPU …
Oleg-Goncharov Dec 12, 2025
a38eeff
Disabled persistency by default, as non-persistent kernel is more per…
Oleg-Goncharov Dec 12, 2025
a7015f8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 12, 2025
b6290ae
Merge branch 'main' into pr_nvfp4_persistent_kernel
Oleg-Goncharov Dec 12, 2025
b8a2c60
Use the tuned kernel also for the rowwise only quantization
Oleg-Goncharov Dec 15, 2025
08a82d7
Fixed typo
Oleg-Goncharov Dec 15, 2025
c86ac00
Merge branch 'main' into pr_nvfp4_persistent_kernel
Oleg-Goncharov Dec 15, 2025
b6728bd
Initial version of the grouped MXFP8 kernel. Work in progress.
Oleg-Goncharov Jan 12, 2026
81ededa
Added support for all shapes. Fixed bugs. Work in progress.
Oleg-Goncharov Jan 14, 2026
fb83f08
Added acquire memory fence for tensor map
Oleg-Goncharov Jan 14, 2026
cd92f72
Fixed stride values in TMA descriptors (should be in bytes)
Oleg-Goncharov Jan 14, 2026
fc2a53f
Clean up. Small fixes.
Oleg-Goncharov Jan 15, 2026
74a7917
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions tests/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ cmake_minimum_required(VERSION 3.18)

# Pick default CUDA architectures only when the caller has not chosen any.
# For a quick single-architecture build, pass the value on the command line
# (e.g. -DCMAKE_CUDA_ARCHITECTURES=100) instead of editing this file.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  # Architectures 100 and 120 are requested only for CUDA Toolkit 12.8 or
  # newer; older toolkits get the list without them.
  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.8)
    set(CMAKE_CUDA_ARCHITECTURES 75 80 89 90 100 120)
  else()
    set(CMAKE_CUDA_ARCHITECTURES 75 80 89 90)
  endif()
endif()

Expand Down
55 changes: 28 additions & 27 deletions tests/cpp/operator/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,34 @@
# See LICENSE for license information.

# Single test binary built from every operator test source listed below, plus
# test_common.cu from the parent directory. Keep the list explicit and free of
# commented-out entries so that a diff shows exactly which tests are built.
add_executable(test_operator
  test_cast.cu
  test_cast_current_scaling.cu
  test_cast_dbias.cu
  test_cast_dbias_dgelu.cu
  test_cast_gated_swiglu.cu
  test_cast_mxfp8_gated_swiglu.cu
  test_qdq.cu
  test_cast_mxfp8.cu
  test_cast_mxfp8_grouped.cu
  test_cast_nvfp4_transpose.cu
  test_cast_float8blockwise.cu
  test_dequantize_mxfp8.cu
  test_transpose.cu
  test_cast_transpose.cu
  test_cast_transpose_current_scaling.cu
  test_cast_transpose_dbias.cu
  test_cast_transpose_dbias_dgelu.cu
  test_cast_transpose_dgeglu.cu
  test_act.cu
  test_normalization.cu
  test_normalization_mxfp8.cu
  test_memset.cu
  test_multi_cast_transpose.cu
  test_multi_padding.cu
  test_multi_unpadding.cu
  test_causal_softmax.cu
  test_swizzle.cu
  test_swap_first_dims.cu
  ../test_common.cu)

# Find required packages
Expand Down
Loading