# Patch summary (review notes; text before the first "diff --git" header is not part of any hunk).
#
# NOTE: the whole patch below is stored on a single physical line. Its line
# breaks and indentation have to be restored before it can be applied.
#
# 1. src/mcore_bridge/model/gpts/qwen3_next_gdn.py, hunk in build_model:
#    For every decoder layer whose self_attention has an `out_norm`, the old
#    code asserted that `out_norm` has `zero_centered_gamma` and set it to
#    False. The new code sets `out_norm.zero_centered_gamma = False` when that
#    attribute exists; otherwise, when `out_norm` has a `config` attribute, it
#    sets `out_norm.config.layernorm_zero_centered_gamma = False`.
#    - When `out_norm` has neither attribute, nothing is changed and no error
#      is raised (the removed assert would have failed in that case).
#    - NOTE(review): if `out_norm.config` is an object shared with other
#      modules, this assignment affects them too -- TODO confirm.
#
# 2. src/mcore_bridge/model/modules/gated_delta_net.py, hunk in forward:
#    Adds `groups=qkv.shape[1]` to the call whose stride/padding/dilation are
#    taken from `self.conv1d` and whose result is sliced as
#    `conv_out[..., :seq_len]`.
#    - NOTE(review): presumably `qkv` is laid out (b, d, s) at this point, so
#      this is one group per channel -- verify against the full function.
#
# 3. src/mcore_bridge/version.py:
#    `__version__` changes from '1.4.0.dev0' to '1.4.0', and
#    `__release_datetime__` from '2099-12-31 23:59:59' to '2026-05-17 23:59:59'.
diff --git a/src/mcore_bridge/model/gpts/qwen3_next_gdn.py b/src/mcore_bridge/model/gpts/qwen3_next_gdn.py index 36ad746..4d7905f 100644 --- a/src/mcore_bridge/model/gpts/qwen3_next_gdn.py +++ b/src/mcore_bridge/model/gpts/qwen3_next_gdn.py @@ -138,8 +138,11 @@ def build_model( lm_model = model.language_model if hasattr(model, 'language_model') else model for layer in lm_model.decoder.layers: if hasattr(layer.self_attention, 'out_norm'): - assert hasattr(layer.self_attention.out_norm, 'zero_centered_gamma') - layer.self_attention.out_norm.zero_centered_gamma = False + out_norm = layer.self_attention.out_norm + if hasattr(out_norm, 'zero_centered_gamma'): + out_norm.zero_centered_gamma = False + elif hasattr(out_norm, 'config'): + out_norm.config.layernorm_zero_centered_gamma = False return model diff --git a/src/mcore_bridge/model/modules/gated_delta_net.py b/src/mcore_bridge/model/modules/gated_delta_net.py index 088419c..0c42a04 100644 --- a/src/mcore_bridge/model/modules/gated_delta_net.py +++ b/src/mcore_bridge/model/modules/gated_delta_net.py @@ -273,6 +273,7 @@ def forward( stride=self.conv1d.stride, padding=self.conv1d.padding, dilation=self.conv1d.dilation, + groups=qkv.shape[1], ) qkv = self.act_fn(conv_out[..., :seq_len]) qkv = qkv.transpose(1, 2) # b, d, s -> b, s, d diff --git a/src/mcore_bridge/version.py b/src/mcore_bridge/version.py index f3d9892..58614da 100644 --- a/src/mcore_bridge/version.py +++ b/src/mcore_bridge/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. -__version__ = '1.4.0.dev0' +__version__ = '1.4.0' # default release datetime for branches under active development is set # to be a time far-far-away-into-the-future -__release_datetime__ = '2099-12-31 23:59:59' +__release_datetime__ = '2026-05-17 23:59:59'