Skip to content

Commit 21a4010

Browse files
soodoshll, shengliangxu, Edwardf0t1
authored
Add Quantizers for Qwen3VLMoeTextDecoderLayer (#666)
## What does this PR do? **Type of change:** ? new feature **Overview:** ? huggingface transformers library implements Qwen3VL Moe layer as a monolithic module, instead of assembling it using Linear layers, which cannot be recognized by modelopt's quantizer now. This PR introduces a conversion from hf's qwen3vl_moe MoE layers to qwen3_moe MoE layers which consist of a set of Linear layers. ## Testing Tested with ```python python hf_ptq.py --pyt_ckpt_path=Qwen/Qwen3-VL-30B-A3B-Instruct --qformat=nvfp4 --dataset wikipedia ``` ## Before your PR is "*Ready for review*" <!-- If you haven't finished some of the above items you can still open `Draft` PR. --> - **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed. - **Is this change backward compatible?**: Yes/No <!--- If No, explain why. --> - **Did you write any new necessary tests?**: Yes/No - **Did you add or update any necessary documentation?**: Yes/No - **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No <!--- Only for new features, API changes, critical bug fixes or bw breaking changes. --> ## Additional Information <!-- E.g. related issue. --> <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit ## Release Notes * **New Features** * Added quantization support for Qwen3VL models with sparse mixture-of-experts (MoE) architecture, enabling efficient model compression for this model type. <sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub> <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: Qidong Su <qidongs@nvidia.com> Signed-off-by: Qidong Su <soodoshll@gmail.com> Co-authored-by: Shengliang Xu <106840466+shengliangxu@users.noreply.github.com> Co-authored-by: Zhiyu <bestczy317@gmail.com>
1 parent b0e7d9f commit 21a4010

1 file changed

Lines changed: 98 additions & 0 deletions

File tree

modelopt/torch/quantization/plugins/huggingface.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,86 @@ def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor:
571571
return self.w2_linear[expert_idx](x1)
572572

573573

574+
class _QuantQwen3VLMoeTextExperts(QuantModule):
    """Quantized wrapper for HF ``Qwen3VLMoeTextExperts``.

    The upstream module stores all experts in two fused 3-D parameters
    (``gate_up_proj`` and ``down_proj``), which modelopt's quantizer cannot
    recognize. ``_setup`` unpacks them into per-expert ``nn.Linear`` layers so
    each projection can be quantized individually.
    """

    def _setup(self):
        """Modify the Qwen3VLMoeTextExperts by using nn.Linear layers."""
        from accelerate import init_empty_weights

        # Target dtype/device taken from the fused parameter being replaced.
        dtype, device = self.gate_up_proj.dtype, self.gate_up_proj.device

        def _copy_weight(module, weight):
            # Materialize the meta-device Linear, then copy the given slice of
            # the fused parameter into it.
            module.to_empty(device=device)
            with torch.no_grad():
                module.weight.data = weight.detach().data.to(dtype=dtype, device=device)

        # The attribute name was changed from `intermediate_size` to `intermediate_dim` in
        # https://github.com/huggingface/transformers/commit/0642963ba13f2dae0596fe489415569e1d91fbda
        if hasattr(self, "intermediate_size"):
            expert_dim = self.intermediate_size
        elif hasattr(self, "intermediate_dim"):
            expert_dim = self.intermediate_dim
        else:
            raise AttributeError("Could not find intermediate dimension size in model")

        # Allocate the per-expert Linear layers on the meta device first; real
        # storage is created by `to_empty` inside `_copy_weight`.
        with init_empty_weights():
            gate_proj = nn.ModuleList(
                [
                    nn.Linear(self.hidden_size, expert_dim, bias=False)
                    for _ in range(self.num_experts)
                ]
            )
            up_proj = nn.ModuleList(
                [
                    nn.Linear(self.hidden_size, expert_dim, bias=False)
                    for _ in range(self.num_experts)
                ]
            )
            down_proj = nn.ModuleList(
                [
                    nn.Linear(expert_dim, self.hidden_size, bias=False)
                    for _ in range(self.num_experts)
                ]
            )

        # gate_up_proj fuses gate (first expert_dim columns) and up (the rest);
        # slices are transposed because nn.Linear stores (out_features, in_features).
        for idx in range(self.num_experts):
            _copy_weight(gate_proj[idx], self.gate_up_proj[idx, :, :expert_dim].T)
            _copy_weight(up_proj[idx], self.gate_up_proj[idx, :, expert_dim:].T)
            _copy_weight(down_proj[idx], self.down_proj[idx, :].T)

        # Drop the fused parameters and expose the quantizable Linear layers.
        delattr(self, "gate_up_proj")
        delattr(self, "down_proj")
        self.gate_proj = gate_proj
        self.up_proj = up_proj
        self.down_proj = down_proj

    def forward(
        self,
        hidden_states: torch.Tensor,
        routing_weights: torch.Tensor,
        router_indices: torch.Tensor,
    ) -> torch.Tensor:
        """Run tokens through their routed experts and combine the outputs.

        Assumes ``hidden_states`` is (batch, seq, hidden_size) — it is
        flattened to (tokens, hidden_size) and restored at the end; presumably
        ``routing_weights`` is dense (tokens, num_experts) and
        ``router_indices`` holds top-k expert ids per token — TODO confirm
        against the HF caller.
        """
        batch_size = hidden_states.shape[0]
        hidden_states = hidden_states.reshape(-1, self.hidden_size)
        next_states = torch.zeros_like(hidden_states)
        with torch.no_grad():
            # One-hot over experts, then keep only experts that received >= 1 token.
            expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=self.num_experts)
            expert_mask = expert_mask.permute(2, 1, 0)
            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
        for expert_idx in expert_hit:
            with torch.no_grad():
                # Token positions routed to this expert.
                _, token_idx = torch.where(expert_mask[expert_idx[0]])
            current_state = hidden_states[token_idx]
            # SwiGLU-style gating: down( up(x) * act(gate(x)) ).
            gate = self.gate_proj[expert_idx](current_state)
            up = self.up_proj[expert_idx](current_state)
            gated_output = up * self.act_fn(gate)
            out = self.down_proj[expert_idx](gated_output)
            weighted_output = out * routing_weights[token_idx, expert_idx, None]
            # Accumulate per-expert contributions back at the token positions.
            next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
        next_states = next_states.view(batch_size, -1, self.hidden_size)

        return next_states
574654
class _QuantDbrxFFN(_QuantSparseMoe):
575655
@property
576656
def num_experts(self):
@@ -660,6 +740,24 @@ def top_k(self, value):
660740
except ImportError:
661741
pass
662742

743+
try:
    from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import (
        Qwen3VLMoeTextExperts,
        Qwen3VLMoeTextSparseMoeBlock,
    )
except ImportError:
    # Installed transformers version does not ship Qwen3VL-MoE; skip registration.
    pass
else:
    # Register quantized replacements for the Qwen3VL MoE modules, unless a
    # prior plugin already registered these classes.
    if Qwen3VLMoeTextSparseMoeBlock not in QuantModuleRegistry:
        QuantModuleRegistry.register(
            {Qwen3VLMoeTextSparseMoeBlock: "hf.Qwen3VLMoeTextSparseMoeBlock"}
        )(_QuantSparseMoe)

    if Qwen3VLMoeTextExperts not in QuantModuleRegistry:
        QuantModuleRegistry.register({Qwen3VLMoeTextExperts: "hf.Qwen3VLMoeTextExperts"})(
            _QuantQwen3VLMoeTextExperts
        )
663761

664762
class _QuantGptOssExperts(_QuantFunctionalMixin):
665763
"""Quantized wrapper for `transformers.GptOssExperts`.

0 commit comments

Comments
 (0)