Skip to content

Commit a0795b1

Browse files
committed
perturbed attn + vocoder fix
1 parent: bae29eb · commit: a0795b1

File tree

5 files changed

+90
-7
lines changed

5 files changed

+90
-7
lines changed

src/maxdiffusion/configs/ltx2_3_video.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ sampler: "from_checkpoint"
2828
global_batch_size_to_train_on: 1
2929
num_inference_steps: 40
3030
guidance_scale: 3.0
31+
stg_scale: 0.0
32+
spatio_temporal_guidance_blocks: []
3133
fps: 24
3234
pipeline_type: multi-scale
3335
prompt: "A man in a brightly lit room talks on a vintage telephone. In a low, heavy voice, he says, 'I understand. I won't call again. Goodbye.' He hangs up the receiver and looks down with a sad expression. He holds the black rotary phone to his right ear with his right hand, his left hand holding a rocks glass with amber liquid. He wears a brown suit jacket over a white shirt, and a gold ring on his left ring finger. His short hair is neatly combed, and he has light skin with visible wrinkles around his eyes. The camera remains stationary, focused on his face and upper body. The room is brightly lit by a warm light source off-screen to the left, casting shadows on the wall behind him. The scene appears to be from a dramatic movie."

src/maxdiffusion/models/ltx2/attention_ltx2.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,7 @@ def __call__(
460460
attention_mask: Optional[Array] = None,
461461
rotary_emb: Optional[Tuple[Array, Array]] = None,
462462
k_rotary_emb: Optional[Tuple[Array, Array]] = None,
463+
perturbation_mask: Optional[Array] = None,
463464
) -> Array:
464465
# Determine context (Self or Cross)
465466
context = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
@@ -503,6 +504,11 @@ def __call__(
503504
# NNXAttentionOp expects flattened input [B, S, InnerDim] for flash kernel
504505
attn_output = self.attention_op.apply_attention(query=query, key=key, value=value, attention_mask=attention_mask)
505506

507+
if perturbation_mask is not None:
508+
# value is [B, S, InnerDim]
509+
# attn_output is [B, S, InnerDim]
510+
attn_output = value + perturbation_mask * (attn_output - value)
511+
506512
if getattr(self, "to_gate_logits", None) is not None:
507513
gate_logits = self.to_gate_logits(hidden_states)
508514
b, s, _ = attn_output.shape

src/maxdiffusion/models/ltx2/transformer_ltx2.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -616,8 +616,10 @@ def __init__(
616616
gated_attn: bool = False,
617617
cross_attn_mod: bool = False,
618618
use_prompt_embeddings: bool = True,
619+
spatio_temporal_guidance_blocks: Tuple[int, ...] = (),
619620
**kwargs,
620621
):
622+
self.spatio_temporal_guidance_blocks = spatio_temporal_guidance_blocks
621623
self.in_channels = in_channels
622624
self.out_channels = out_channels
623625
self.patch_size = patch_size
@@ -978,6 +980,7 @@ def __call__(
978980
audio_coords: Optional[jax.Array] = None,
979981
attention_kwargs: Optional[Dict[str, Any]] = None,
980982
return_dict: bool = True,
983+
perturbation_mask: Optional[jax.Array] = None,
981984
) -> Any:
982985
# Determine timestep for audio.
983986
audio_timestep = audio_timestep if audio_timestep is not None else timestep
@@ -1065,8 +1068,19 @@ def __call__(
10651068
)
10661069
audio_encoder_hidden_states = audio_encoder_hidden_states.reshape(batch_size, -1, audio_hidden_states.shape[-1])
10671070

1071+
# Construct perturbation_mask_per_layer for STG
1072+
if perturbation_mask is None:
1073+
perturbation_mask_per_layer = jnp.ones((self.num_layers, batch_size, 1, 1), dtype=self.dtype)
1074+
else:
1075+
masks = jnp.ones((self.num_layers, batch_size, 1, 1), dtype=self.dtype)
1076+
for i in self.spatio_temporal_guidance_blocks:
1077+
if i < self.num_layers:
1078+
masks = masks.at[i].set(perturbation_mask)
1079+
perturbation_mask_per_layer = masks
1080+
10681081
# 5. Run transformer blocks
1069-
def scan_fn(carry, block):
1082+
def scan_fn(carry, block_and_mask):
1083+
block, mask = block_and_mask
10701084
hidden_states, audio_hidden_states, rngs_carry = carry
10711085
with jax.named_scope("Transformer Layer"):
10721086
hidden_states_out, audio_hidden_states_out = block(
@@ -1086,6 +1100,7 @@ def scan_fn(carry, block):
10861100
ca_audio_rotary_emb=audio_cross_attn_rotary_emb,
10871101
encoder_attention_mask=encoder_attention_mask,
10881102
audio_encoder_attention_mask=audio_encoder_attention_mask,
1103+
perturbation_mask=mask,
10891104
)
10901105
return (
10911106
hidden_states_out.astype(hidden_states.dtype),
@@ -1105,9 +1120,10 @@ def scan_fn(carry, block):
11051120
in_axes=(nnx.Carry, 0),
11061121
out_axes=(nnx.Carry, 0),
11071122
transform_metadata={nnx.PARTITION_NAME: "layers"},
1108-
)(carry, self.transformer_blocks)
1123+
)(carry, (self.transformer_blocks, perturbation_mask_per_layer))
11091124
else:
1110-
for block in self.transformer_blocks:
1125+
for i, block in enumerate(self.transformer_blocks):
1126+
mask = perturbation_mask_per_layer[i] if perturbation_mask_per_layer is not None else None
11111127
hidden_states, audio_hidden_states = block(
11121128
hidden_states=hidden_states,
11131129
audio_hidden_states=audio_hidden_states,

src/maxdiffusion/models/ltx2/vocoder_bwe_ltx2.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -585,7 +585,6 @@ def __call__(self, mel_spec: Array) -> Array:
585585
mel_for_bwe = jnp.transpose(mel, (0, 1, 3, 2)) # (B, C, T, F)
586586

587587
residual = self.bwe_generator(mel_for_bwe)
588-
skip = self.resampler(x)
589588

590589
# Transpose x to (B, T, C) for resampler?
591590
# UpSample1d expects (B, T, C).

src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1317,17 +1317,52 @@ def __call__(
13171317
prompt_embeds_jax = prompt_embeds
13181318
prompt_attention_mask_jax = prompt_attention_mask
13191319

1320-
if guidance_scale > 1.0:
1320+
do_cfg = guidance_scale > 1.0
1321+
do_stg = getattr(self.config, "stg_scale", 0.0) > 0.0
1322+
1323+
if do_cfg and do_stg:
1324+
negative_prompt_embeds_jax = negative_prompt_embeds
1325+
negative_prompt_attention_mask_jax = negative_prompt_attention_mask
1326+
1327+
if isinstance(prompt_embeds_jax, list):
1328+
prompt_embeds_jax = [jnp.concatenate([n, p, p], axis=0) for n, p in zip(negative_prompt_embeds_jax, prompt_embeds_jax)]
1329+
else:
1330+
prompt_embeds_jax = jnp.concatenate([negative_prompt_embeds_jax, prompt_embeds_jax, prompt_embeds_jax], axis=0)
1331+
1332+
prompt_attention_mask_jax = jnp.concatenate([negative_prompt_attention_mask_jax, prompt_attention_mask_jax, prompt_attention_mask_jax], axis=0)
1333+
latents_jax = jnp.concatenate([latents_jax] * 3, axis=0)
1334+
audio_latents_jax = jnp.concatenate([audio_latents_jax] * 3, axis=0)
1335+
1336+
N = latents.shape[0]
1337+
perturbation_mask = jnp.concatenate([jnp.ones((2 * N, 1, 1), dtype=dtype), jnp.zeros((N, 1, 1), dtype=dtype)], axis=0)
1338+
1339+
elif do_cfg:
13211340
negative_prompt_embeds_jax = negative_prompt_embeds
13221341
negative_prompt_attention_mask_jax = negative_prompt_attention_mask
13231342
if isinstance(prompt_embeds_jax, list):
13241343
prompt_embeds_jax = [jnp.concatenate([n, p], axis=0) for n, p in zip(negative_prompt_embeds_jax, prompt_embeds_jax)]
13251344
else:
13261345
prompt_embeds_jax = jnp.concatenate([negative_prompt_embeds_jax, prompt_embeds_jax], axis=0)
1327-
1346+
13281347
prompt_attention_mask_jax = jnp.concatenate([negative_prompt_attention_mask_jax, prompt_attention_mask_jax], axis=0)
13291348
latents_jax = jnp.concatenate([latents_jax] * 2, axis=0)
13301349
audio_latents_jax = jnp.concatenate([audio_latents_jax] * 2, axis=0)
1350+
perturbation_mask = None
1351+
1352+
elif do_stg:
1353+
if isinstance(prompt_embeds_jax, list):
1354+
prompt_embeds_jax = [jnp.concatenate([p, p], axis=0) for p in prompt_embeds_jax]
1355+
else:
1356+
prompt_embeds_jax = jnp.concatenate([prompt_embeds_jax, prompt_embeds_jax], axis=0)
1357+
1358+
prompt_attention_mask_jax = jnp.concatenate([prompt_attention_mask_jax, prompt_attention_mask_jax], axis=0)
1359+
latents_jax = jnp.concatenate([latents_jax] * 2, axis=0)
1360+
audio_latents_jax = jnp.concatenate([audio_latents_jax] * 2, axis=0)
1361+
1362+
N = latents.shape[0]
1363+
perturbation_mask = jnp.concatenate([jnp.ones((N, 1, 1), dtype=dtype), jnp.zeros((N, 1, 1), dtype=dtype)], axis=0)
1364+
else:
1365+
perturbation_mask = None
13311366

13321367
if hasattr(self, "mesh") and self.mesh is not None:
13331368
data_sharding_3d = NamedSharding(self.mesh, P())
@@ -1405,14 +1440,37 @@ def run_connectors(graphdef, state, hidden_states, attention_mask):
14051440
latent_width,
14061441
audio_num_frames,
14071442
frame_rate,
1443+
perturbation_mask=perturbation_mask,
14081444
)
14091445

1410-
if guidance_scale > 1.0:
1446+
do_stg = getattr(self.config, "stg_scale", 0.0) > 0.0
1447+
1448+
if guidance_scale > 1.0 and do_stg:
1449+
noise_pred_uncond, noise_pred_text, noise_pred_perturb = jnp.split(noise_pred, 3, axis=0)
1450+
noise_pred = (
1451+
noise_pred_uncond
1452+
+ guidance_scale * (noise_pred_text - noise_pred_uncond)
1453+
+ self.config.stg_scale * (noise_pred_text - noise_pred_perturb)
1454+
)
1455+
# Audio guidance
1456+
noise_pred_audio_uncond, noise_pred_audio_text, noise_pred_audio_perturb = jnp.split(noise_pred_audio, 3, axis=0)
1457+
noise_pred_audio = (
1458+
noise_pred_audio_uncond
1459+
+ guidance_scale * (noise_pred_audio_text - noise_pred_audio_uncond)
1460+
+ self.config.stg_scale * (noise_pred_audio_text - noise_pred_audio_perturb)
1461+
)
1462+
elif guidance_scale > 1.0:
14111463
noise_pred_uncond, noise_pred_text = jnp.split(noise_pred, 2, axis=0)
14121464
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
14131465
# Audio guidance
14141466
noise_pred_audio_uncond, noise_pred_audio_text = jnp.split(noise_pred_audio, 2, axis=0)
14151467
noise_pred_audio = noise_pred_audio_uncond + guidance_scale * (noise_pred_audio_text - noise_pred_audio_uncond)
1468+
elif do_stg:
1469+
noise_pred_text, noise_pred_perturb = jnp.split(noise_pred, 2, axis=0)
1470+
noise_pred = noise_pred_text + self.config.stg_scale * (noise_pred_text - noise_pred_perturb)
1471+
1472+
noise_pred_audio_text, noise_pred_audio_perturb = jnp.split(noise_pred_audio, 2, axis=0)
1473+
noise_pred_audio = noise_pred_audio_text + self.config.stg_scale * (noise_pred_audio_text - noise_pred_audio_perturb)
14161474

14171475
latents_step = latents_jax[batch_size:]
14181476
audio_latents_step = audio_latents_jax[batch_size:]
@@ -1556,6 +1614,7 @@ def transformer_forward_pass(
15561614
latent_width,
15571615
audio_num_frames,
15581616
fps,
1617+
perturbation_mask=None,
15591618
):
15601619
transformer = nnx.merge(graphdef, state)
15611620

@@ -1576,6 +1635,7 @@ def transformer_forward_pass(
15761635
fps=fps,
15771636
audio_num_frames=audio_num_frames,
15781637
return_dict=False,
1638+
perturbation_mask=perturbation_mask,
15791639
)
15801640

15811641
return noise_pred, noise_pred_audio

0 commit comments

Comments (0)