diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index edf9c4d6f19..d5c6155007e 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -797,6 +797,11 @@ def pre_quantize( preview_input_ids = next(iter(calib_dataloader))[ "input_features" if model_type == "whisper" else "input_ids" ][0:1] + # Strip leading padding tokens so the preview input shows real content + if model_type != "whisper" and tokenizer is not None and tokenizer.pad_token_id is not None: + first_non_pad = (preview_input_ids[0] != tokenizer.pad_token_id).nonzero(as_tuple=True)[0] + if first_non_pad.numel() > 0: + preview_input_ids = preview_input_ids[:, first_non_pad[0] :] # Generate preview before quantization if args.skip_generate: @@ -897,7 +902,7 @@ def input_decode(input_ids): if processor is not None and isinstance(processor, WhisperProcessor): return first_text_speech_dataset elif tokenizer is not None: - return tokenizer.batch_decode(input_ids) + return tokenizer.batch_decode(input_ids, skip_special_tokens=True) else: raise ValueError("The processor or tokenizer must be set") diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py index 952ed1e39c1..6df3b01bc90 100644 --- a/modelopt/torch/export/moe_utils.py +++ b/modelopt/torch/export/moe_utils.py @@ -76,8 +76,12 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: ) i_quantizer = gate_up_input_q if is_gate_up else down_input_q - # gate/up share a weight quantizer — clone so each gets independent amax. - w_quantizer = copy.deepcopy(w_quantizer_src) if is_gate_up else w_quantizer_src + # gate/up share a quantizer — deepcopy so gate_proj and up_proj get + # independent quantizers that can hold different amax slices. + if is_gate_up: + w_quantizer = copy.deepcopy(w_quantizer_src) + else: + w_quantizer = w_quantizer_src # For per-channel amax (dim >= 1), proportionally slice dim-0 # to match the split weight. @@ -91,7 +95,7 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: if fused_total % amax_dim0 == 0: slice_start = fused_start * amax_dim0 // fused_total slice_end = (fused_start + weight_slice.shape[0]) * amax_dim0 // fused_total - w_quantizer.amax = amax[slice_start:slice_end].contiguous() + w_quantizer._amax = amax[slice_start:slice_end].contiguous() else: warnings.warn( f"Expert {idx} {proj_name}: fused amax dim0 ({amax_dim0}) does not " @@ -100,20 +104,73 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: stacklevel=2, ) - # If the weight quantizer was never calibrated, compute amax from weights. + # Patch invalid per-block amax entries (NaN/inf/negative/zero/too-small/too-large) + # with weight-derived fallback values. 
+ min_valid_amax = 2e-3 # floor matches FP8 E4M3FN minimum subnormal (2^-9 ≈ 0.00195) + max_valid_amax = 1e6 + if ( + hasattr(w_quantizer, "_amax") + and w_quantizer._amax is not None + and w_quantizer._amax.numel() > 1 + and (getattr(w_quantizer, "block_sizes", None) or {}).get(-1) is not None + ): + amax_cpu = w_quantizer._amax + invalid_mask = ~( + torch.isfinite(amax_cpu) + & (amax_cpu >= min_valid_amax) + & (amax_cpu <= max_valid_amax) + ) + if invalid_mask.any(): + _block_size = (getattr(w_quantizer, "block_sizes", None) or {}).get(-1, 16) + per_block_fallback = ( + weight_slice.detach() + .reshape(-1, _block_size) + .abs() + .amax(dim=1, keepdim=True) + .cpu() + .float() + .clamp(min=2e-3) + .reshape(amax_cpu.shape) + ) + amax_cpu[invalid_mask] = per_block_fallback[invalid_mask] + w_quantizer._amax = amax_cpu + + # For uncalibrated experts (amax missing or invalid scalar), fall back to + # per-block amax from weights so the static export path can reshape it correctly. + # Only applies to per-block (NVFP4) quantizers — non-block quantizers have + # no block_sizes and should not be routed to the static NVFP4 export path. if ( hasattr(w_quantizer, "is_enabled") and w_quantizer.is_enabled + and (getattr(w_quantizer, "block_sizes", None) or {}).get(-1) is not None and ( not hasattr(w_quantizer, "_amax") or w_quantizer._amax is None - or torch.all(w_quantizer._amax == 0) + or ( + w_quantizer._amax.numel() == 1 + and not ( + torch.isfinite(w_quantizer._amax) + and w_quantizer._amax >= min_valid_amax + and w_quantizer._amax <= max_valid_amax + ) + ) ) ): - w_quantizer.amax = weight_slice.abs().amax().to(torch.float32) + _block_size = (getattr(w_quantizer, "block_sizes", None) or {}).get(-1, 16) + fallback_per_block = ( + weight_slice.detach() + .reshape(-1, _block_size) + .abs() + .amax(dim=1, keepdim=True) + .cpu() + .float() + .clamp(min=2e-3) + .reshape(*weight_slice.shape[:-1], weight_slice.shape[-1] // _block_size) + ) + w_quantizer._amax = fallback_per_block warnings.warn( f"Expert {idx} {proj_name} weight quantizer was not calibrated " - f"(amax missing or zero). Using weight-derived amax as fallback. " + f"(amax missing or zero). Using weight-derived per-block amax as fallback. " f"Consider using more calibration data to activate all experts.", stacklevel=2, ) @@ -123,6 +180,20 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: wrapper.weight_quantizer = w_quantizer wrapper.input_quantizer = i_quantizer + # Set global_amax to route to the static NVFP4 export path (reads per-block _amax). + # Always recompute from the current (possibly patched) _amax — a stale zero + # global_amax causes division-by-zero in the per-block scale formula. + # Guard: only per-block (NVFP4) quantizers have block_sizes; skip for others. 
+ wq = wrapper.weight_quantizer + if ( + hasattr(wq, "_amax") + and wq._amax is not None + and wq._amax.numel() > 1 + and (getattr(wq, "block_sizes", None) or {}).get(-1) is not None + ): + wq._amax = wq._amax.to(weight_slice.device) + wq.global_amax = wq._amax.float().amax().clamp(min=2e-3) + _export_quantized_weight(wrapper, dtype) proj = nn.Module() diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index 4ce0f62a75d..416a468b2a3 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -20,7 +20,7 @@ import warnings from collections.abc import Callable from functools import partial -from typing import TypeAlias +from typing import Any, TypeAlias import torch import torch.distributed as dist @@ -351,7 +351,7 @@ def mse_calibrate( # Step 2: Replace calibrators with MseCalibrator for enabled quantizers # and identify weight quantizers - weight_quantizers = [] + weight_quantizers: list[tuple[Any, Any, TensorQuantizer]] = [] seen_modules = set() for name, module in list(model.named_modules()): @@ -410,7 +410,12 @@ def mse_calibrate( quant_func=partial(_mse_quant_func, quantizer=module), ) - # Identify weight quantizers by checking if they have corresponding weight parameters + # Collect weight quantizers (standard + fused-experts per-expert lists). + try: + from modelopt.torch.quantization.plugins.huggingface import _QuantFusedExperts as _qfe_cls + except ImportError: + _qfe_cls = None # type: ignore[misc] + name_to_module = dict(model.named_modules()) for parent_module in name_to_module.values(): if parent_module in seen_modules: @@ -421,8 +426,56 @@ def mse_calibrate( if isinstance(weight_quantizer, TensorQuantizer) and weight_quantizer.is_enabled: if getattr(weight_quantizer, "_calibrator", None) is not None: weight_quantizers.append((parent_module, weight_name, weight_quantizer)) + # Enqueue per-expert quantizers from {param}_weight_quantizers ModuleLists. + if _qfe_cls is not None and isinstance(parent_module, _qfe_cls): + for param_name, param in parent_module.named_parameters(recurse=False): + qlist = getattr(parent_module, f"{param_name}_weight_quantizers", None) + if not isinstance(qlist, nn.ModuleList): + continue + if len(qlist) != param.shape[0]: + warnings.warn( + f"Skipping {param_name}_weight_quantizers: list length {len(qlist)} " + f"does not match parameter leading dimension {param.shape[0]}. " + "This may indicate a misconfigured fused-experts module.", + stacklevel=2, + ) + continue + for expert_idx, wq in enumerate(qlist): + if isinstance(wq, TensorQuantizer) and wq.is_enabled: + if getattr(wq, "_calibrator", None) is not None: + weight_quantizers.append((parent_module, (param_name, expert_idx), wq)) + seen_modules.add(parent_module) + # Warn about enabled weight quantizers that weren't scheduled for MSE calibration. 
+ picked_ids = {id(wq) for _, _, wq in weight_quantizers} + + def _is_active_unpicked(q: Any) -> bool: + return ( + isinstance(q, TensorQuantizer) + and q.is_enabled + and getattr(q, "_calibrator", None) is not None + and id(q) not in picked_ids + ) + + missed: list[str] = [] + for mod_name, module in name_to_module.items(): + for attr_name, attr in module._modules.items(): + if isinstance(attr, TensorQuantizer) and attr_name.endswith("weight_quantizer"): + if _is_active_unpicked(attr): + missed.append(f"{mod_name}.{attr_name}") + elif isinstance(attr, nn.ModuleList) and attr_name.endswith("_weight_quantizers"): + for i, wq in enumerate(attr): + if _is_active_unpicked(wq): + missed.append(f"{mod_name}.{attr_name}[{i}]") + if missed: + warnings.warn( + f"MSE weight calibration: {len(missed)} weight quantizer(s) are enabled but were " + f"not scheduled for calibration and will retain max-calibration amax values. " + f"First {min(5, len(missed))}: {missed[:5]}", + stacklevel=2, + ) + # Step 3: Calibrate weight quantizers ONE AT A TIME with immediate amax computation # This prevents massive memory accumulation seen in large models for idx, (parent_module, weight_name, weight_quantizer) in enumerate( @@ -432,7 +485,11 @@ def mse_calibrate( weight_quantizer.disable_quant() weight_quantizer.enable_calib() with enable_weight_access_and_writeback(parent_module, model, name_to_module): - weight = getattr(parent_module, weight_name) + if isinstance(weight_name, tuple): + param_name, expert_idx = weight_name + weight = getattr(parent_module, param_name)[expert_idx] + else: + weight = getattr(parent_module, weight_name) weight_quantizer(weight) # IMMEDIATELY compute amax and reset calibrator to free memory @@ -778,7 +835,7 @@ def finish_stats_collection(model: nn.Module, method: str | None = None, **kwarg cal = getattr(module, "_calibrator", None) if cal and not getattr(module, "_dynamic", False): - if method in {"entropy"}: + if method == "entropy": if cal.compute_amax(method) is not None: module.load_calib_amax("entropy", **kwargs) elif cal.compute_amax(**kwargs) is not None: diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index 5e65f9cc1d4..3582223c4d3 100644 --- a/modelopt/torch/quantization/model_quant.py +++ b/modelopt/torch/quantization/model_quant.py @@ -595,6 +595,7 @@ def print_quant_summary(model: nn.Module, output_dir: str | None = None): lines.append(f"{len(lines)} TensorQuantizers found in model") if output_dir: + os.makedirs(output_dir, exist_ok=True) path = os.path.join(output_dir, ".quant_summary.txt") with open(path, "w", encoding="utf-8") as f: f.write("\n".join(lines) + "\n") diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py index 3ff7401ec3e..928e319dfff 100644 --- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py +++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py @@ -1112,7 +1112,7 @@ def forward(self, inputs): return outputs - def _short_amax(self, fmt=".4f"): + def _short_amax(self, fmt=".2e"): """Short description of amax. 
Returns: @@ -1130,7 +1130,7 @@ def _short_amax(self, fmt=".4f"): return "meta" return self._short_tensor(self._amax, fmt) - def _short_tensor(self, tensor: torch.Tensor, fmt=".4f"): + def _short_tensor(self, tensor: torch.Tensor, fmt=".2e"): """Short description of tensor.""" if tensor.numel() == 1: return f"{tensor.item():{fmt}}" diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index fe30e283c2d..71083980169 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -124,8 +124,11 @@ def get_weights_scaling_factor_from_quantizer( # Quantize scales to FP8 if not keep_high_precision: - per_block_scale = (per_block_scale * 448.0 / per_block_scale_max).to( - torch.float8_e4m3fn + fp8_e4m3fn_min = 2**-9 # 0.001953125 — smallest positive subnormal + per_block_scale = ( + (per_block_scale * 448.0 / per_block_scale_max) + .clamp(min=fp8_e4m3fn_min) + .to(torch.float8_e4m3fn) ) return per_block_scale, weights_scaling_factor_2 else: @@ -167,6 +170,12 @@ def get_weights_scaling_factor( per_block_scale[per_block_scale == 0] = 1.0 # Convert to torch.float8_e4m3fn if not keep_high_precision: + # Clamp to the minimum positive FP8 E4M3FN subnormal (~0.00195 = 2^-9) before + # casting. Without this, blocks whose scale falls below the FP8 representable + # range silently underflow to 0, causing those blocks to produce zero output at + # inference even when the weights are non-trivial. + fp8_e4m3fn_min = 2**-9 # 0.001953125 — smallest positive subnormal + per_block_scale = per_block_scale.clamp(min=fp8_e4m3fn_min) per_block_scale = per_block_scale.to(torch.float8_e4m3fn) return per_block_scale, weights_scaling_factor_2 diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml new file mode 100644 index 00000000000..76d50b760f0 --- /dev/null +++ b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml @@ -0,0 +1,130 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +metadata: + recipe_type: ptq + description: > + NVFP4 W4A4 for MoE routed experts only. Static weight scales via MSE + FP8 scale sweep; + dynamic activation scales. Supports sequential experts (nn.Linear-based) and fused experts + (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style). 
+quantize: + algorithm: + method: mse + fp8_scale_sweep: true + layerwise: false + quant_cfg: + # ── Disable everything first ───────────────────────────────────────────── + - quantizer_name: '*' + enable: false + + # ── Sequential experts (nn.Linear per expert) ──────────────────────────── + - quantizer_name: '*mlp.experts*weight_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: static + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*mlp.experts*input_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + + # ── Sequential experts: Mixtral / block_sparse_moe style ──────────────── + - quantizer_name: '*block_sparse_moe*weight_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: static + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*block_sparse_moe*input_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + + # ── Fused experts (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style) ── + - quantizer_name: '*gate_up_proj_weight_quantizers*' + enable: true + cfg: + block_sizes: + -1: 16 + type: static + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*gate_up_proj_input_quantizer*' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*down_proj_weight_quantizers*' + enable: true + cfg: + block_sizes: + -1: 16 + type: static + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*down_proj_input_quantizer*' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + + # ── Exclusions: shared experts, attention, routers, lm_head ───────────── + - quantizer_name: '*block_sparse_moe.gate*' + enable: false + - quantizer_name: '*linear_attn.conv1d*' + enable: false + - quantizer_name: '*lm_head*' + enable: false + - quantizer_name: '*mlp.gate.*' + enable: false + - quantizer_name: '*mlp.shared_expert*' + enable: false + - quantizer_name: '*mlp.shared_expert_gate.*' + enable: false + - quantizer_name: '*router*' + enable: false + - quantizer_name: 'output.*' + enable: false + - parent_class: 'nn.BatchNorm1d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.BatchNorm2d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.BatchNorm3d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.LeakyReLU' + quantizer_name: '*' + enable: false diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py index 29435827748..cf785fb235b 100644 --- a/tests/unit/torch/quantization/plugins/test_fused_experts.py +++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py @@ -27,7 +27,6 @@ from modelopt.torch.quantization.plugins.huggingface import ( _is_fused_experts_module, _is_sparse_sequaential_moe_block, - _QuantFusedExperts, force_eager_experts_impl_on_the_fly, register_fused_experts_on_the_fly, register_sparse_moe_on_the_fly, @@ -256,27 +255,51 @@ def test_expert_index_recovery(self): # Tests for export # --------------------------------------------------------------------------- class TestExportFusedExperts: + @staticmethod + def _cleanup_registry(mod_type): + if QuantModuleRegistry.get(mod_type) is not None: + QuantModuleRegistry.unregister(mod_type) + def test_export_creates_per_expert_submodules(self): """_export_fused_experts should create per-expert submodules with standard naming.""" + import modelopt.torch.quantization 
as mtq from modelopt.torch.export.moe_utils import _export_fused_experts - experts = _SyntheticFusedExperts() - expert_type = type(experts) + model = _TinyMoEModel() + expert_type = type(model.moe.experts) + self._cleanup_registry(expert_type) - # Manually register and convert - if QuantModuleRegistry.get(expert_type) is None: - QuantModuleRegistry.register({expert_type: "test.SyntheticFusedExperts"})( - _QuantFusedExperts - ) - converted = QuantModuleRegistry.convert(experts) + quant_cfg = { + "quant_cfg": [ + {"quantizer_name": "*", "enable": False}, + { + "quantizer_name": "*gate_up_proj_input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*down_proj_input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*gate_up_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": 0}, + }, + { + "quantizer_name": "*down_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": 0}, + }, + ], + "algorithm": "max", + } - # Run a forward pass to calibrate (set amaxes) - seq_len = 16 - hidden_states = torch.randn(seq_len, HIDDEN_DIM) - top_k_index = torch.randint(0, NUM_EXPERTS, (seq_len, TOP_K)) - top_k_weights = torch.softmax(torch.randn(seq_len, TOP_K), dim=-1) - with torch.no_grad(): - converted(hidden_states, top_k_index, top_k_weights) + def forward_loop(m): + torch.manual_seed(0) + for _ in range(2): + x = torch.randn(1, 4, HIDDEN_DIM) + m(x) + + mtq.quantize(model, quant_cfg, forward_loop=forward_loop) + converted = model.moe.experts _export_fused_experts(converted, torch.float16) @@ -297,8 +320,7 @@ def test_export_creates_per_expert_submodules(self): assert not hasattr(converted, "down_proj") assert not hasattr(converted, "gate_up_proj_weight_quantizers") - if QuantModuleRegistry.get(expert_type) is not None: - QuantModuleRegistry.unregister(expert_type) + self._cleanup_registry(expert_type) # --------------------------------------------------------------------------- @@ -612,3 +634,126 @@ def test_unrelated_dotted_number_unchanged(self): _normalize_fused_experts_quantizer_name("moe.layers.3.gate.weight") == "moe.layers.3.gate.weight" ) + + +# Verifies that MSE calibration discovers and calibrates every per-expert weight quantizer +# inside a fused-expert ModuleList (both gate_up_proj and down_proj, for all experts). 
+class TestFusedExpertsMSECalibration: + @staticmethod + def _cleanup_registry(mod_type): + if QuantModuleRegistry.get(mod_type) is not None: + QuantModuleRegistry.unregister(mod_type) + + def test_mse_calibration_populates_all_expert_quantizers(self): + import modelopt.torch.quantization as mtq + + model = _TinyMoEModel() + expert_type = type(model.moe.experts) + self._cleanup_registry(expert_type) + + mtq.quantize( + model, + { + "quant_cfg": [ + {"quantizer_name": "*", "enable": False}, + { + "quantizer_name": "*gate_up_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*down_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + ], + "algorithm": "mse", + }, + forward_loop=lambda m: [m(torch.randn(1, 4, HIDDEN_DIM)) for _ in range(2)], + ) + + experts = model.moe.experts + for idx in range(NUM_EXPERTS): + assert experts.gate_up_proj_weight_quantizers[idx].amax is not None, ( + f"gate_up_proj_weight_quantizers[{idx}] not calibrated — Bug 1 regression" + ) + assert experts.down_proj_weight_quantizers[idx].amax is not None, ( + f"down_proj_weight_quantizers[{idx}] not calibrated" + ) + self._cleanup_registry(expert_type) + + +# Verifies that _export_fused_experts emits a fallback warning and computes weight-derived +# per-block amax when an expert's _amax is None or zero. Only applies to per-block (NVFP4) +# quantizers. _export_quantized_weight is mocked to isolate from FP8/GPU requirements. +class TestExportFusedExpertsUncalibratedFallback: + @staticmethod + def _cleanup_registry(mod_type): + if QuantModuleRegistry.get(mod_type) is not None: + QuantModuleRegistry.unregister(mod_type) + + def _quantize_with_block_sizes(self): + import modelopt.torch.quantization as mtq + + model = _TinyMoEModel() + expert_type = type(model.moe.experts) + self._cleanup_registry(expert_type) + block_cfg = {"num_bits": 8, "axis": None, "block_sizes": {-1: 16}} + mtq.quantize( + model, + { + "quant_cfg": [ + {"quantizer_name": "*", "enable": False}, + {"quantizer_name": "*gate_up_proj_weight_quantizer", "cfg": block_cfg}, + {"quantizer_name": "*down_proj_weight_quantizer", "cfg": block_cfg}, + ], + "algorithm": "max", + }, + forward_loop=lambda m: [m(torch.randn(1, 4, HIDDEN_DIM)) for _ in range(2)], + ) + return model.moe.experts, expert_type + + @pytest.mark.parametrize("zero_amax", [False, True]) + def test_fallback_warning_emitted(self, zero_amax): + """Fallback warning must fire and produce valid per-block _amax + global_amax.""" + import warnings + from unittest.mock import patch + + from modelopt.torch.export.moe_utils import _export_fused_experts + + converted, expert_type = self._quantize_with_block_sizes() + bad_amax = torch.tensor(0.0) if zero_amax else None + for idx in range(NUM_EXPERTS): + converted.gate_up_proj_weight_quantizers[idx]._amax = bad_amax + converted.down_proj_weight_quantizers[idx]._amax = bad_amax + + captured_wrappers = [] + + def _capture(wrapper, dtype): + captured_wrappers.append(wrapper) + + with ( + patch( + "modelopt.torch.export.unified_export_hf._export_quantized_weight", + side_effect=_capture, + ), + warnings.catch_warnings(record=True) as caught, + ): + warnings.simplefilter("always") + _export_fused_experts(converted, torch.float16) + + assert any("weight-derived per-block amax" in str(w.message) for w in caught), ( + f"No fallback warning emitted for {'zero' if zero_amax else 'None'} amax — Bug 3 regression" + ) + + # Every per-block weight quantizer must have a repaired per-block _amax and global_amax. 
+        for wrapper in captured_wrappers:
+            wq = wrapper.weight_quantizer
+            if not (getattr(wq, "block_sizes", None) or {}).get(-1):
+                continue
+            assert wq._amax is not None and wq._amax.numel() > 1, (
+                "Fallback did not produce per-block _amax"
+            )
+            assert hasattr(wq, "global_amax") and wq.global_amax > 0, (
+                "global_amax missing or zero after fallback"
+            )
+
+        self._cleanup_registry(expert_type)
diff --git a/tests/unit/torch/quantization/test_nvfp4_tensor.py b/tests/unit/torch/quantization/test_nvfp4_tensor.py
new file mode 100644
index 00000000000..d66809a3cfc
--- /dev/null
+++ b/tests/unit/torch/quantization/test_nvfp4_tensor.py
@@ -0,0 +1,69 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for NVFP4QTensor per-block FP8 scale underflow clamping."""
+
+import torch
+
+from modelopt.torch.quantization.qtensor.nvfp4_tensor import NVFP4QTensor
+
+_FP8_E4M3FN_MIN = 2**-9  # 0.001953125 — smallest positive FP8 E4M3FN subnormal
+
+
+class TestNVFP4ScaleClamping:
+    """Per-block weight scales below the FP8 E4M3FN minimum must be clamped, not rounded to zero."""
+
+    def test_no_zero_scales_for_tiny_weights(self):
+        """Tiny per-block amax (< FP8 E4M3FN minimum) must yield clamped, non-zero scales."""
+        block_size = 16
+        tiny_weight = torch.full((8, block_size), 1e-12)
+
+        # Scales are cast to FP8 E4M3FN inside get_weights_scaling_factor.
+        per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor(tiny_weight, block_size)
+        per_block_scale_f32 = per_block_scale.float()
+
+        # No block may underflow to a zero FP8 scale after the cast.
+        assert (per_block_scale_f32 > 0).all(), (
+            f"Zero per-block scales found after FP8 cast: {per_block_scale_f32.tolist()}. "
+            "FP8 scale underflow clamping likely regressed."
+        )
+        assert (per_block_scale_f32 >= _FP8_E4M3FN_MIN).all(), (
+            "Per-block scales below FP8 minimum subnormal found after cast."
+        )
+
+    def test_normal_weights_unaffected_by_clamp(self):
+        """Weights with typical magnitudes must not be affected by the underflow clamp."""
+        block_size = 16
+        torch.manual_seed(42)
+        normal_weight = torch.randn(8, block_size)
+
+        per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor(normal_weight, block_size)
+        assert (per_block_scale.float() > 0).all(), "Normal weights produced zero scales."
+
+    def test_mixed_weight_no_zeros(self):
+        """Mixed-magnitude tensor (normal + tiny blocks) must have no zero scales."""
+        block_size = 16
+        weight = torch.cat(
+            [
+                torch.randn(4, block_size),
+                torch.full((4, block_size), 1e-12),
+            ],
+            dim=0,
+        )
+
+        per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor(weight, block_size)
+        assert (per_block_scale.float() > 0).all(), (
+            "Zero scales in mixed-magnitude tensor after FP8 cast."
+        )
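
Reviewer note (illustrative sketch, not part of the patch): the snippet below shows the FP8 E4M3FN underflow behavior that the clamp added in nvfp4_tensor.py guards against. It assumes a PyTorch build (2.1 or newer) that exposes torch.float8_e4m3fn; the scale value is arbitrary and chosen only for demonstration.

# Illustrative only; values are made up, not taken from the patch or its tests.
import torch

FP8_E4M3FN_MIN = 2**-9  # smallest positive E4M3FN subnormal, 0.001953125

# A per-block scale that falls below the FP8 E4M3FN representable range.
tiny_scale = torch.tensor([1e-4])

# Without the clamp, the cast silently underflows to zero, so every weight in
# that block dequantizes to zero at inference time.
print(tiny_scale.to(torch.float8_e4m3fn).float())  # tensor([0.])

# With the clamp applied before the cast (as get_weights_scaling_factor now does),
# the scale survives as the smallest representable subnormal.
print(tiny_scale.clamp(min=FP8_E4M3FN_MIN).to(torch.float8_e4m3fn).float())  # tensor([0.0020])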