diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index edf9c4d6f19..d5c6155007e 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -797,6 +797,11 @@ def pre_quantize( preview_input_ids = next(iter(calib_dataloader))[ "input_features" if model_type == "whisper" else "input_ids" ][0:1] + # Strip leading padding tokens so the preview input shows real content + if model_type != "whisper" and tokenizer is not None and tokenizer.pad_token_id is not None: + first_non_pad = (preview_input_ids[0] != tokenizer.pad_token_id).nonzero(as_tuple=True)[0] + if first_non_pad.numel() > 0: + preview_input_ids = preview_input_ids[:, first_non_pad[0] :] # Generate preview before quantization if args.skip_generate: @@ -897,7 +902,7 @@ def input_decode(input_ids): if processor is not None and isinstance(processor, WhisperProcessor): return first_text_speech_dataset elif tokenizer is not None: - return tokenizer.batch_decode(input_ids) + return tokenizer.batch_decode(input_ids, skip_special_tokens=True) else: raise ValueError("The processor or tokenizer must be set") diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py index 952ed1e39c1..6df3b01bc90 100644 --- a/modelopt/torch/export/moe_utils.py +++ b/modelopt/torch/export/moe_utils.py @@ -76,8 +76,12 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: ) i_quantizer = gate_up_input_q if is_gate_up else down_input_q - # gate/up share a weight quantizer — clone so each gets independent amax. - w_quantizer = copy.deepcopy(w_quantizer_src) if is_gate_up else w_quantizer_src + # gate/up share a quantizer — deepcopy so gate_proj and up_proj get + # independent quantizers that can hold different amax slices. + if is_gate_up: + w_quantizer = copy.deepcopy(w_quantizer_src) + else: + w_quantizer = w_quantizer_src # For per-channel amax (dim >= 1), proportionally slice dim-0 # to match the split weight. @@ -91,7 +95,7 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: if fused_total % amax_dim0 == 0: slice_start = fused_start * amax_dim0 // fused_total slice_end = (fused_start + weight_slice.shape[0]) * amax_dim0 // fused_total - w_quantizer.amax = amax[slice_start:slice_end].contiguous() + w_quantizer._amax = amax[slice_start:slice_end].contiguous() else: warnings.warn( f"Expert {idx} {proj_name}: fused amax dim0 ({amax_dim0}) does not " @@ -100,20 +104,73 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: stacklevel=2, ) - # If the weight quantizer was never calibrated, compute amax from weights. + # Patch invalid per-block amax entries (NaN/inf/negative/zero/too-small/too-large) + # with weight-derived fallback values. 
+ min_valid_amax = 2e-3 # floor matches FP8 E4M3FN minimum subnormal (2^-9 ≈ 0.00195) + max_valid_amax = 1e6 + if ( + hasattr(w_quantizer, "_amax") + and w_quantizer._amax is not None + and w_quantizer._amax.numel() > 1 + and (getattr(w_quantizer, "block_sizes", None) or {}).get(-1) is not None + ): + amax_cpu = w_quantizer._amax + invalid_mask = ~( + torch.isfinite(amax_cpu) + & (amax_cpu >= min_valid_amax) + & (amax_cpu <= max_valid_amax) + ) + if invalid_mask.any(): + _block_size = (getattr(w_quantizer, "block_sizes", None) or {}).get(-1, 16) + per_block_fallback = ( + weight_slice.detach() + .reshape(-1, _block_size) + .abs() + .amax(dim=1, keepdim=True) + .cpu() + .float() + .clamp(min=2e-3) + .reshape(amax_cpu.shape) + ) + amax_cpu[invalid_mask] = per_block_fallback[invalid_mask] + w_quantizer._amax = amax_cpu + + # For uncalibrated experts (amax missing or invalid scalar), fall back to + # per-block amax from weights so the static export path can reshape it correctly. + # Only applies to per-block (NVFP4) quantizers — non-block quantizers have + # no block_sizes and should not be routed to the static NVFP4 export path. if ( hasattr(w_quantizer, "is_enabled") and w_quantizer.is_enabled + and (getattr(w_quantizer, "block_sizes", None) or {}).get(-1) is not None and ( not hasattr(w_quantizer, "_amax") or w_quantizer._amax is None - or torch.all(w_quantizer._amax == 0) + or ( + w_quantizer._amax.numel() == 1 + and not ( + torch.isfinite(w_quantizer._amax) + and w_quantizer._amax >= min_valid_amax + and w_quantizer._amax <= max_valid_amax + ) + ) ) ): - w_quantizer.amax = weight_slice.abs().amax().to(torch.float32) + _block_size = (getattr(w_quantizer, "block_sizes", None) or {}).get(-1, 16) + fallback_per_block = ( + weight_slice.detach() + .reshape(-1, _block_size) + .abs() + .amax(dim=1, keepdim=True) + .cpu() + .float() + .clamp(min=2e-3) + .reshape(*weight_slice.shape[:-1], weight_slice.shape[-1] // _block_size) + ) + w_quantizer._amax = fallback_per_block warnings.warn( f"Expert {idx} {proj_name} weight quantizer was not calibrated " - f"(amax missing or zero). Using weight-derived amax as fallback. " + f"(amax missing or zero). Using weight-derived per-block amax as fallback. " f"Consider using more calibration data to activate all experts.", stacklevel=2, ) @@ -123,6 +180,20 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: wrapper.weight_quantizer = w_quantizer wrapper.input_quantizer = i_quantizer + # Set global_amax to route to the static NVFP4 export path (reads per-block _amax). + # Always recompute from the current (possibly patched) _amax — a stale zero + # global_amax causes division-by-zero in the per-block scale formula. + # Guard: only per-block (NVFP4) quantizers have block_sizes; skip for others. 
+ wq = wrapper.weight_quantizer + if ( + hasattr(wq, "_amax") + and wq._amax is not None + and wq._amax.numel() > 1 + and (getattr(wq, "block_sizes", None) or {}).get(-1) is not None + ): + wq._amax = wq._amax.to(weight_slice.device) + wq.global_amax = wq._amax.float().amax().clamp(min=2e-3) + _export_quantized_weight(wrapper, dtype) proj = nn.Module() diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index 4ce0f62a75d..416a468b2a3 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -20,7 +20,7 @@ import warnings from collections.abc import Callable from functools import partial -from typing import TypeAlias +from typing import Any, TypeAlias import torch import torch.distributed as dist @@ -351,7 +351,7 @@ def mse_calibrate( # Step 2: Replace calibrators with MseCalibrator for enabled quantizers # and identify weight quantizers - weight_quantizers = [] + weight_quantizers: list[tuple[Any, Any, TensorQuantizer]] = [] seen_modules = set() for name, module in list(model.named_modules()): @@ -410,7 +410,12 @@ def mse_calibrate( quant_func=partial(_mse_quant_func, quantizer=module), ) - # Identify weight quantizers by checking if they have corresponding weight parameters + # Collect weight quantizers (standard + fused-experts per-expert lists). + try: + from modelopt.torch.quantization.plugins.huggingface import _QuantFusedExperts as _qfe_cls + except ImportError: + _qfe_cls = None # type: ignore[misc] + name_to_module = dict(model.named_modules()) for parent_module in name_to_module.values(): if parent_module in seen_modules: @@ -421,8 +426,56 @@ def mse_calibrate( if isinstance(weight_quantizer, TensorQuantizer) and weight_quantizer.is_enabled: if getattr(weight_quantizer, "_calibrator", None) is not None: weight_quantizers.append((parent_module, weight_name, weight_quantizer)) + # Enqueue per-expert quantizers from {param}_weight_quantizers ModuleLists. + if _qfe_cls is not None and isinstance(parent_module, _qfe_cls): + for param_name, param in parent_module.named_parameters(recurse=False): + qlist = getattr(parent_module, f"{param_name}_weight_quantizers", None) + if not isinstance(qlist, nn.ModuleList): + continue + if len(qlist) != param.shape[0]: + warnings.warn( + f"Skipping {param_name}_weight_quantizers: list length {len(qlist)} " + f"does not match parameter leading dimension {param.shape[0]}. " + "This may indicate a misconfigured fused-experts module.", + stacklevel=2, + ) + continue + for expert_idx, wq in enumerate(qlist): + if isinstance(wq, TensorQuantizer) and wq.is_enabled: + if getattr(wq, "_calibrator", None) is not None: + weight_quantizers.append((parent_module, (param_name, expert_idx), wq)) + seen_modules.add(parent_module) + # Warn about enabled weight quantizers that weren't scheduled for MSE calibration. 
+ picked_ids = {id(wq) for _, _, wq in weight_quantizers} + + def _is_active_unpicked(q: Any) -> bool: + return ( + isinstance(q, TensorQuantizer) + and q.is_enabled + and getattr(q, "_calibrator", None) is not None + and id(q) not in picked_ids + ) + + missed: list[str] = [] + for mod_name, module in name_to_module.items(): + for attr_name, attr in module._modules.items(): + if isinstance(attr, TensorQuantizer) and attr_name.endswith("weight_quantizer"): + if _is_active_unpicked(attr): + missed.append(f"{mod_name}.{attr_name}") + elif isinstance(attr, nn.ModuleList) and attr_name.endswith("_weight_quantizers"): + for i, wq in enumerate(attr): + if _is_active_unpicked(wq): + missed.append(f"{mod_name}.{attr_name}[{i}]") + if missed: + warnings.warn( + f"MSE weight calibration: {len(missed)} weight quantizer(s) are enabled but were " + f"not scheduled for calibration and will retain max-calibration amax values. " + f"First {min(5, len(missed))}: {missed[:5]}", + stacklevel=2, + ) + # Step 3: Calibrate weight quantizers ONE AT A TIME with immediate amax computation # This prevents massive memory accumulation seen in large models for idx, (parent_module, weight_name, weight_quantizer) in enumerate( @@ -432,7 +485,11 @@ def mse_calibrate( weight_quantizer.disable_quant() weight_quantizer.enable_calib() with enable_weight_access_and_writeback(parent_module, model, name_to_module): - weight = getattr(parent_module, weight_name) + if isinstance(weight_name, tuple): + param_name, expert_idx = weight_name + weight = getattr(parent_module, param_name)[expert_idx] + else: + weight = getattr(parent_module, weight_name) weight_quantizer(weight) # IMMEDIATELY compute amax and reset calibrator to free memory @@ -778,7 +835,7 @@ def finish_stats_collection(model: nn.Module, method: str | None = None, **kwarg cal = getattr(module, "_calibrator", None) if cal and not getattr(module, "_dynamic", False): - if method in {"entropy"}: + if method == "entropy": if cal.compute_amax(method) is not None: module.load_calib_amax("entropy", **kwargs) elif cal.compute_amax(**kwargs) is not None: diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index 5e65f9cc1d4..3582223c4d3 100644 --- a/modelopt/torch/quantization/model_quant.py +++ b/modelopt/torch/quantization/model_quant.py @@ -595,6 +595,7 @@ def print_quant_summary(model: nn.Module, output_dir: str | None = None): lines.append(f"{len(lines)} TensorQuantizers found in model") if output_dir: + os.makedirs(output_dir, exist_ok=True) path = os.path.join(output_dir, ".quant_summary.txt") with open(path, "w", encoding="utf-8") as f: f.write("\n".join(lines) + "\n") diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py index 3ff7401ec3e..928e319dfff 100644 --- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py +++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py @@ -1112,7 +1112,7 @@ def forward(self, inputs): return outputs - def _short_amax(self, fmt=".4f"): + def _short_amax(self, fmt=".2e"): """Short description of amax. 
Returns: @@ -1130,7 +1130,7 @@ def _short_amax(self, fmt=".4f"): return "meta" return self._short_tensor(self._amax, fmt) - def _short_tensor(self, tensor: torch.Tensor, fmt=".4f"): + def _short_tensor(self, tensor: torch.Tensor, fmt=".2e"): """Short description of tensor.""" if tensor.numel() == 1: return f"{tensor.item():{fmt}}" diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index fe30e283c2d..71083980169 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -124,8 +124,11 @@ def get_weights_scaling_factor_from_quantizer( # Quantize scales to FP8 if not keep_high_precision: - per_block_scale = (per_block_scale * 448.0 / per_block_scale_max).to( - torch.float8_e4m3fn + fp8_e4m3fn_min = 2**-9 # 0.001953125 — smallest positive subnormal + per_block_scale = ( + (per_block_scale * 448.0 / per_block_scale_max) + .clamp(min=fp8_e4m3fn_min) + .to(torch.float8_e4m3fn) ) return per_block_scale, weights_scaling_factor_2 else: @@ -167,6 +170,12 @@ def get_weights_scaling_factor( per_block_scale[per_block_scale == 0] = 1.0 # Convert to torch.float8_e4m3fn if not keep_high_precision: + # Clamp to the minimum positive FP8 E4M3FN subnormal (~0.00195 = 2^-9) before + # casting. Without this, blocks whose scale falls below the FP8 representable + # range silently underflow to 0, causing those blocks to produce zero output at + # inference even when the weights are non-trivial. + fp8_e4m3fn_min = 2**-9 # 0.001953125 — smallest positive subnormal + per_block_scale = per_block_scale.clamp(min=fp8_e4m3fn_min) per_block_scale = per_block_scale.to(torch.float8_e4m3fn) return per_block_scale, weights_scaling_factor_2 diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml new file mode 100644 index 00000000000..76d50b760f0 --- /dev/null +++ b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml @@ -0,0 +1,130 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +metadata: + recipe_type: ptq + description: > + NVFP4 W4A4 for MoE routed experts only. Static weight scales via MSE + FP8 scale sweep; + dynamic activation scales. Supports sequential experts (nn.Linear-based) and fused experts + (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style). 
+quantize: + algorithm: + method: mse + fp8_scale_sweep: true + layerwise: false + quant_cfg: + # ── Disable everything first ───────────────────────────────────────────── + - quantizer_name: '*' + enable: false + + # ── Sequential experts (nn.Linear per expert) ──────────────────────────── + - quantizer_name: '*mlp.experts*weight_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: static + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*mlp.experts*input_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + + # ── Sequential experts: Mixtral / block_sparse_moe style ──────────────── + - quantizer_name: '*block_sparse_moe*weight_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: static + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*block_sparse_moe*input_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + + # ── Fused experts (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style) ── + - quantizer_name: '*gate_up_proj_weight_quantizers*' + enable: true + cfg: + block_sizes: + -1: 16 + type: static + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*gate_up_proj_input_quantizer*' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*down_proj_weight_quantizers*' + enable: true + cfg: + block_sizes: + -1: 16 + type: static + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*down_proj_input_quantizer*' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + + # ── Exclusions: shared experts, attention, routers, lm_head ───────────── + - quantizer_name: '*block_sparse_moe.gate*' + enable: false + - quantizer_name: '*linear_attn.conv1d*' + enable: false + - quantizer_name: '*lm_head*' + enable: false + - quantizer_name: '*mlp.gate.*' + enable: false + - quantizer_name: '*mlp.shared_expert*' + enable: false + - quantizer_name: '*mlp.shared_expert_gate.*' + enable: false + - quantizer_name: '*router*' + enable: false + - quantizer_name: 'output.*' + enable: false + - parent_class: 'nn.BatchNorm1d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.BatchNorm2d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.BatchNorm3d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.LeakyReLU' + quantizer_name: '*' + enable: false diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py index 29435827748..cf785fb235b 100644 --- a/tests/unit/torch/quantization/plugins/test_fused_experts.py +++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py @@ -27,7 +27,6 @@ from modelopt.torch.quantization.plugins.huggingface import ( _is_fused_experts_module, _is_sparse_sequaential_moe_block, - _QuantFusedExperts, force_eager_experts_impl_on_the_fly, register_fused_experts_on_the_fly, register_sparse_moe_on_the_fly, @@ -256,27 +255,51 @@ def test_expert_index_recovery(self): # Tests for export # --------------------------------------------------------------------------- class TestExportFusedExperts: + @staticmethod + def _cleanup_registry(mod_type): + if QuantModuleRegistry.get(mod_type) is not None: + QuantModuleRegistry.unregister(mod_type) + def test_export_creates_per_expert_submodules(self): """_export_fused_experts should create per-expert submodules with standard naming.""" + import modelopt.torch.quantization 
as mtq from modelopt.torch.export.moe_utils import _export_fused_experts - experts = _SyntheticFusedExperts() - expert_type = type(experts) + model = _TinyMoEModel() + expert_type = type(model.moe.experts) + self._cleanup_registry(expert_type) - # Manually register and convert - if QuantModuleRegistry.get(expert_type) is None: - QuantModuleRegistry.register({expert_type: "test.SyntheticFusedExperts"})( - _QuantFusedExperts - ) - converted = QuantModuleRegistry.convert(experts) + quant_cfg = { + "quant_cfg": [ + {"quantizer_name": "*", "enable": False}, + { + "quantizer_name": "*gate_up_proj_input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*down_proj_input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*gate_up_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": 0}, + }, + { + "quantizer_name": "*down_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": 0}, + }, + ], + "algorithm": "max", + } - # Run a forward pass to calibrate (set amaxes) - seq_len = 16 - hidden_states = torch.randn(seq_len, HIDDEN_DIM) - top_k_index = torch.randint(0, NUM_EXPERTS, (seq_len, TOP_K)) - top_k_weights = torch.softmax(torch.randn(seq_len, TOP_K), dim=-1) - with torch.no_grad(): - converted(hidden_states, top_k_index, top_k_weights) + def forward_loop(m): + torch.manual_seed(0) + for _ in range(2): + x = torch.randn(1, 4, HIDDEN_DIM) + m(x) + + mtq.quantize(model, quant_cfg, forward_loop=forward_loop) + converted = model.moe.experts _export_fused_experts(converted, torch.float16) @@ -297,8 +320,7 @@ def test_export_creates_per_expert_submodules(self): assert not hasattr(converted, "down_proj") assert not hasattr(converted, "gate_up_proj_weight_quantizers") - if QuantModuleRegistry.get(expert_type) is not None: - QuantModuleRegistry.unregister(expert_type) + self._cleanup_registry(expert_type) # --------------------------------------------------------------------------- @@ -612,3 +634,126 @@ def test_unrelated_dotted_number_unchanged(self): _normalize_fused_experts_quantizer_name("moe.layers.3.gate.weight") == "moe.layers.3.gate.weight" ) + + +# Verifies that MSE calibration discovers and calibrates every per-expert weight quantizer +# inside a fused-expert ModuleList (both gate_up_proj and down_proj, for all experts). 
+class TestFusedExpertsMSECalibration: + @staticmethod + def _cleanup_registry(mod_type): + if QuantModuleRegistry.get(mod_type) is not None: + QuantModuleRegistry.unregister(mod_type) + + def test_mse_calibration_populates_all_expert_quantizers(self): + import modelopt.torch.quantization as mtq + + model = _TinyMoEModel() + expert_type = type(model.moe.experts) + self._cleanup_registry(expert_type) + + mtq.quantize( + model, + { + "quant_cfg": [ + {"quantizer_name": "*", "enable": False}, + { + "quantizer_name": "*gate_up_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*down_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + ], + "algorithm": "mse", + }, + forward_loop=lambda m: [m(torch.randn(1, 4, HIDDEN_DIM)) for _ in range(2)], + ) + + experts = model.moe.experts + for idx in range(NUM_EXPERTS): + assert experts.gate_up_proj_weight_quantizers[idx].amax is not None, ( + f"gate_up_proj_weight_quantizers[{idx}] not calibrated — Bug 1 regression" + ) + assert experts.down_proj_weight_quantizers[idx].amax is not None, ( + f"down_proj_weight_quantizers[{idx}] not calibrated" + ) + self._cleanup_registry(expert_type) + + +# Verifies that _export_fused_experts emits a fallback warning and computes weight-derived +# per-block amax when an expert's _amax is None or zero. Only applies to per-block (NVFP4) +# quantizers. _export_quantized_weight is mocked to isolate from FP8/GPU requirements. +class TestExportFusedExpertsUncalibratedFallback: + @staticmethod + def _cleanup_registry(mod_type): + if QuantModuleRegistry.get(mod_type) is not None: + QuantModuleRegistry.unregister(mod_type) + + def _quantize_with_block_sizes(self): + import modelopt.torch.quantization as mtq + + model = _TinyMoEModel() + expert_type = type(model.moe.experts) + self._cleanup_registry(expert_type) + block_cfg = {"num_bits": 8, "axis": None, "block_sizes": {-1: 16}} + mtq.quantize( + model, + { + "quant_cfg": [ + {"quantizer_name": "*", "enable": False}, + {"quantizer_name": "*gate_up_proj_weight_quantizer", "cfg": block_cfg}, + {"quantizer_name": "*down_proj_weight_quantizer", "cfg": block_cfg}, + ], + "algorithm": "max", + }, + forward_loop=lambda m: [m(torch.randn(1, 4, HIDDEN_DIM)) for _ in range(2)], + ) + return model.moe.experts, expert_type + + @pytest.mark.parametrize("zero_amax", [False, True]) + def test_fallback_warning_emitted(self, zero_amax): + """Fallback warning must fire and produce valid per-block _amax + global_amax.""" + import warnings + from unittest.mock import patch + + from modelopt.torch.export.moe_utils import _export_fused_experts + + converted, expert_type = self._quantize_with_block_sizes() + bad_amax = torch.tensor(0.0) if zero_amax else None + for idx in range(NUM_EXPERTS): + converted.gate_up_proj_weight_quantizers[idx]._amax = bad_amax + converted.down_proj_weight_quantizers[idx]._amax = bad_amax + + captured_wrappers = [] + + def _capture(wrapper, dtype): + captured_wrappers.append(wrapper) + + with ( + patch( + "modelopt.torch.export.unified_export_hf._export_quantized_weight", + side_effect=_capture, + ), + warnings.catch_warnings(record=True) as caught, + ): + warnings.simplefilter("always") + _export_fused_experts(converted, torch.float16) + + assert any("weight-derived per-block amax" in str(w.message) for w in caught), ( + f"No fallback warning emitted for {'zero' if zero_amax else 'None'} amax — Bug 3 regression" + ) + + # Every per-block weight quantizer must have a repaired per-block _amax and global_amax. 
+        for wrapper in captured_wrappers:
+            wq = wrapper.weight_quantizer
+            if not (getattr(wq, "block_sizes", None) or {}).get(-1):
+                continue
+            assert wq._amax is not None and wq._amax.numel() > 1, (
+                "Fallback did not produce per-block _amax"
+            )
+            assert hasattr(wq, "global_amax") and wq.global_amax > 0, (
+                "global_amax missing or zero after fallback"
+            )
+
+        self._cleanup_registry(expert_type)
diff --git a/tests/unit/torch/quantization/test_nvfp4_tensor.py b/tests/unit/torch/quantization/test_nvfp4_tensor.py
new file mode 100644
index 00000000000..d66809a3cfc
--- /dev/null
+++ b/tests/unit/torch/quantization/test_nvfp4_tensor.py
@@ -0,0 +1,69 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for NVFP4QTensor per-block FP8 scale underflow clamping."""
+
+import torch
+
+from modelopt.torch.quantization.qtensor.nvfp4_tensor import NVFP4QTensor
+
+_FP8_E4M3FN_MIN = 2**-9  # 0.001953125 — smallest positive FP8 E4M3FN subnormal
+
+
+class TestNVFP4ScaleClamping:
+    """Per-block weight scales below the FP8 E4M3FN minimum must be clamped, not rounded to zero."""
+
+    def test_no_zero_scales_for_tiny_weights(self):
+        """Tiny per-block amax (< FP8 E4M3FN minimum) must yield clamped, non-zero scales."""
+        block_size = 16
+        tiny_weight = torch.full((8, block_size), 1e-12)
+
+        # Scales are cast to FP8 E4M3FN inside get_weights_scaling_factor.
+        per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor(tiny_weight, block_size)
+        per_block_scale_f32 = per_block_scale.float()
+
+        # No block may underflow to a zero FP8 scale after the cast.
+        assert (per_block_scale_f32 > 0).all(), (
+            f"Zero per-block scales found after FP8 cast: {per_block_scale_f32.tolist()}. "
+            "FP8 scale underflow clamping likely regressed."
+        )
+        assert (per_block_scale_f32 >= _FP8_E4M3FN_MIN).all(), (
+            "Per-block scales below FP8 minimum subnormal found after cast."
+        )
+
+    def test_normal_weights_unaffected_by_clamp(self):
+        """Weights with typical magnitudes must not be affected by the underflow clamp."""
+        block_size = 16
+        torch.manual_seed(42)
+        normal_weight = torch.randn(8, block_size)
+
+        per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor(normal_weight, block_size)
+        assert (per_block_scale.float() > 0).all(), "Normal weights produced zero scales."
+
+    def test_mixed_weight_no_zeros(self):
+        """Mixed-magnitude tensor (normal + tiny blocks) must have no zero scales."""
+        block_size = 16
+        weight = torch.cat(
+            [
+                torch.randn(4, block_size),
+                torch.full((4, block_size), 1e-12),
+            ],
+            dim=0,
+        )
+
+        per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor(weight, block_size)
+        assert (per_block_scale.float() > 0).all(), (
+            "Zero scales in mixed-magnitude tensor after FP8 cast."
+        )
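
Reviewer note (illustrative sketch, not part of the patch): the snippet below shows the FP8 E4M3FN underflow behavior that the clamp added in nvfp4_tensor.py guards against. It assumes a PyTorch build (2.1 or newer) that exposes torch.float8_e4m3fn; the scale value is arbitrary and chosen only for demonstration.

# Illustrative only; values are made up, not taken from the patch or its tests.
import torch

FP8_E4M3FN_MIN = 2**-9  # smallest positive E4M3FN subnormal, 0.001953125

# A per-block scale that falls below the FP8 E4M3FN representable range.
tiny_scale = torch.tensor([1e-4])

# Without the clamp, the cast silently underflows to zero, so every weight in
# that block dequantizes to zero at inference time.
print(tiny_scale.to(torch.float8_e4m3fn).float())  # tensor([0.])

# With the clamp applied before the cast (as get_weights_scaling_factor now does),
# the scale survives as the smallest representable subnormal.
print(tiny_scale.clamp(min=FP8_E4M3FN_MIN).to(torch.float8_e4m3fn).float())  # tensor([0.0020])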