Commit ad8520c

pzelasko and claude committed
refactor(speechlm2): drop dead BSHD+CP code paths in SALMAutomodel
The fit-start validator already rejects BSHD + CP > 1 with a hard error pointing users to model.packed_sequences=true (see validate_parallelism_compatibility in parts/parallel.py), so any code that exists only to support BSHD under CP is unreachable.

In SALMAutomodel.prepare_inputs, the BSHD branch's ``if cp_size > 1: shard_bshd_for_cp(...)`` call and the ``llm_attention_mask = None if cp_size > 1 else attention_mask`` ternary both presupposed BSHD + CP > 1; remove them and inline the TP truncation into the BSHD path. Drop the now-unused shard_bshd_for_cp helper from cp_helpers.py and update its module docstring and the cp_helpers test docstring accordingly.

No behavior change for any reachable configuration.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent df2b08e commit ad8520c
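
For context, a minimal sketch of the kind of fit-start guard the message refers to. The real validate_parallelism_compatibility in parts/parallel.py is authoritative; the signature and arguments below are assumptions for illustration only.

```python
# Hypothetical sketch only -- mirrors the behavior described in the commit
# message, not the actual implementation in parts/parallel.py.
def validate_parallelism_compatibility(packed_sequences: bool, cp_size: int) -> None:
    # BSHD (padded, batch-first) batches cannot be fed through context
    # parallelism here, so the combination is rejected before fit starts.
    if cp_size > 1 and not packed_sequences:
        raise ValueError(
            "BSHD layout is incompatible with context parallelism (CP > 1); "
            "set model.packed_sequences=true to use the THD path instead."
        )
```

Because this error fires before training begins, any branch in prepare_inputs that only runs when both BSHD and CP > 1 are active can never execute, which is what makes the deletions below safe.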

3 files changed: 17 additions and 89 deletions


nemo/collections/speechlm2/models/salm_automodel.py

Lines changed: 8 additions & 26 deletions
```diff
@@ -197,10 +197,9 @@ def prepare_inputs(self, batch: dict):
         from nemo.collections.speechlm2.parts.cp_helpers import (
             encode_audio_with_cp_distribution,
             get_cp_mesh,
-            shard_bshd_for_cp,
         )
 
-        cp_mesh, cp_size, _ = get_cp_mesh(getattr(self, "_device_mesh", None))
+        cp_mesh, _, _ = get_cp_mesh(getattr(self, "_device_mesh", None))
 
         # Source audio encoding (distributed across CP ranks when CP is active).
         # Input audio: (B_aud, T_samples) → list of (L_i, H) embeddings.
@@ -243,37 +242,20 @@ def prepare_inputs(self, batch: dict):
         attention_mask = attention_mask[:, :-1]
         target_ids = target_ids[:, 1:]
 
-        # Sequence-length divisibility for sequence/context parallelism.
-        # CP path: pad to 2*cp_size*tp_size and partition along the seq dim
-        # (the existing TP truncation is folded into the CP padding). BSHD-only
-        # path keeps the original TP-truncation behavior.
-        tp_size = self.device_mesh["tp"].size() if self._use_tp else 1
-        if cp_size > 1:
-            sharded = shard_bshd_for_cp(input_embs, attention_mask, target_ids, cp_mesh, tp_size=tp_size)
-            input_embs = sharded["input_embs"]
-            attention_mask = sharded["attention_mask"]
-            target_ids = sharded["target_ids"]
-        elif self._use_tp:
+        # BSHD path runs only when CP is inactive (the fit-start validator
+        # rejects BSHD + CP > 1, see _validate_parallelism_compatibility).
+        # Truncate the seq dim to be divisible by tp_size so sequence
+        # parallelism doesn't reshape the input under us.
+        if self._use_tp:
+            tp_size = self.device_mesh["tp"].size()
             if (remainder := (input_embs.shape[1] - 1) % tp_size) != 0:
-                # Truncate some tokens from the end to make the sequence length shape divisible by tensor parallelism
-                # world size. Otherwise, sequence parallelism will change the input shape making leading to mismatches.
                 input_embs = input_embs[:, :-remainder]
                 attention_mask = attention_mask[:, :-remainder]
                 target_ids = target_ids[:, :-remainder]
 
-        # TE's fused-attention CP path rejects ``padding_causal``; only ``causal``
-        # is supported. BSHD batches are left-padded so dropping the padding mask
-        # lets pad K/V leak into real-token attention — empirically this drives
-        # the loss to NaN at step 2 (the gradient through the LoRA / projection
-        # parameters is corrupted by the leak after one optimizer step). BSHD +
-        # CP is therefore not a supported configuration; set
-        # ``model.packed_sequences: true`` to use the THD path under CP, which
-        # uses cu_seqlens-aware attention and has no equivalent issue.
-        llm_attention_mask = None if cp_size > 1 else attention_mask
-
         return {
             "input_embeds": input_embs,
-            "attention_mask": llm_attention_mask,
+            "attention_mask": attention_mask,
             "target_ids": target_ids,
             "llm_kwargs": {},
         }
```
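
To make the surviving BSHD/TP branch concrete, here is a standalone rerun of the kept truncation logic. The tensors and sizes below are made up for illustration; the truncation itself is the code the diff keeps.

```python
import torch

# Illustrative shapes only: B=2, T=38, H=16, with tp_size=4.
tp_size = 4
input_embs = torch.randn(2, 38, 16)                    # [B, T, H]
attention_mask = torch.ones(2, 38, dtype=torch.bool)   # [B, T]
target_ids = torch.randint(0, 100, (2, 38))            # [B, T]

# Same truncation the diff keeps: drop trailing tokens so the sequence
# length satisfies the divisibility that sequence parallelism expects.
if (remainder := (input_embs.shape[1] - 1) % tp_size) != 0:
    input_embs = input_embs[:, :-remainder]
    attention_mask = attention_mask[:, :-remainder]
    target_ids = target_ids[:, :-remainder]

# 38 -> 37 here, since (38 - 1) % 4 == 1 and (37 - 1) % 4 == 0.
assert (input_embs.shape[1] - 1) % tp_size == 0
```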

nemo/collections/speechlm2/parts/cp_helpers.py

Lines changed: 5 additions & 58 deletions
```diff
@@ -13,17 +13,14 @@
 # limitations under the License.
 """Context-Parallelism (CP) helpers for SALMAutomodel.
 
-These helpers consolidate the CP-shape work needed to feed both BSHD and THD
-batches into a Nemotron-V3 LLM whose attention/Mamba layers were CP-wired by
-the Automodel parallelizer (`set_context_parallel_group()` / `mixer.cp =
-MambaContextParallel(...)`). Three concerns:
+These helpers consolidate the CP-shape work needed to feed THD packed
+batches into a Nemotron-V3 LLM whose attention/Mamba layers were CP-wired
+by the Automodel parallelizer (``set_context_parallel_group()`` /
+``mixer.cp = MambaContextParallel(...)``). Two concerns:
 
 1. ``get_cp_mesh`` — read the CP submesh out of a device mesh, returning
    ``(None, 1, 0)`` when CP is inactive so callers can short-circuit.
-2. ``shard_bshd_for_cp`` — pad and partition a BSHD batch along the seq dim
-   using TE's DualChunkSwap pattern (matches Automodel's Config 1 reference
-   test in ``run_hybrid_nemotron_v3_cp.py``).
-3. ``encode_audio_with_cp_distribution`` — distribute the audio encoder
+2. ``encode_audio_with_cp_distribution`` — distribute the audio encoder
    forward across CP ranks so it isn't recomputed cp_size times. Pads to a
    multiple of cp_size with dummy zero-audios so every rank participates in
    FSDP all-gather; dummies are dropped after the post-encoder all-gather.
@@ -34,7 +31,6 @@
 
 import torch
 import torch.distributed as dist
-import torch.nn.functional as F
 from torch import Tensor
 
 from nemo.collections.speechlm2.parts.encoder_chunking import encode_audio_with_optional_chunking
@@ -52,55 +48,6 @@ def get_cp_mesh(device_mesh) -> tuple[Optional[object], int, int]:
     return cp_mesh, cp_mesh.size(), cp_rank
 
 
-def shard_bshd_for_cp(
-    input_embs: Tensor,
-    attention_mask: Tensor,
-    target_ids: Tensor,
-    cp_mesh,
-    tp_size: int = 1,
-) -> dict[str, Tensor]:
-    """Pre-shard a BSHD batch across CP ranks via TE's DualChunkSwap pattern.
-
-    Right-pads the seq dim to a multiple of ``2 * cp_size * tp_size`` (TE-CP
-    requires ``2 * cp_size``; SP requires per-rank len divisible by ``tp_size``)
-    and partitions along the seq dim using
-    ``transformer_engine_torch.thd_get_partitioned_indices``.
-
-    Args:
-        input_embs: ``[B, T, H]`` float.
-        attention_mask: ``[B, T]`` bool/long; pad slots become 0.
-        target_ids: ``[B, T]`` int64; pad slots become ``-100``.
-        cp_mesh: the CP submesh of size ``cp_size > 1``.
-        tp_size: tensor-parallel world size (1 if TP is inactive).
-
-    Returns dict with keys ``input_embs``, ``attention_mask``, ``target_ids``,
-    each shape ``[B, T_padded // cp_size, ...]``.
-    """
-    import transformer_engine_torch as tex
-
-    cp_size = cp_mesh.size()
-    cp_rank = dist.get_rank(group=cp_mesh.get_group())
-    device = input_embs.device
-
-    B, T, H = input_embs.shape
-    mult = 2 * cp_size * max(1, tp_size)
-    T_padded = ((T + mult - 1) // mult) * mult
-    pad_n = T_padded - T
-    if pad_n > 0:
-        input_embs = F.pad(input_embs, (0, 0, 0, pad_n), value=0.0)
-        attention_mask = F.pad(attention_mask.to(torch.long), (0, pad_n), value=0).to(torch.bool)
-        target_ids = F.pad(target_ids, (0, pad_n), value=-100)
-
-    cu_seqlens = torch.tensor([0, T_padded], dtype=torch.int32, device=device)
-    indices = tex.thd_get_partitioned_indices(cu_seqlens, T_padded, cp_size, cp_rank)
-
-    return {
-        "input_embs": input_embs.index_select(1, indices).contiguous(),
-        "attention_mask": attention_mask.index_select(1, indices).contiguous(),
-        "target_ids": target_ids.index_select(1, indices).contiguous(),
-    }
-
-
 def encode_audio_with_cp_distribution(
     perception,
     audios: Tensor,
```
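
The dummy-padding step the surviving docstring describes for encode_audio_with_cp_distribution boils down to ceil-to-multiple arithmetic. A sketch under stated assumptions (the helper name and shapes below are invented for illustration; the real function does this internally):

```python
import torch

# Hypothetical helper illustrating the padding described in the docstring:
# pad the audio batch to a multiple of cp_size with zero-audios so every
# CP rank has work for the encoder forward and the FSDP all-gather.
def pad_audios_to_cp_multiple(audios: torch.Tensor, cp_size: int) -> tuple[torch.Tensor, int]:
    n_dummy = (-audios.shape[0]) % cp_size  # 0 when already a multiple
    if n_dummy > 0:
        dummies = torch.zeros(n_dummy, *audios.shape[1:], dtype=audios.dtype, device=audios.device)
        audios = torch.cat([audios, dummies], dim=0)
    # Caller drops the last n_dummy embeddings after the post-encoder all-gather.
    return audios, n_dummy
```

For example, a batch of 5 audios with cp_size=2 comes back as 6 rows with n_dummy == 1, so both ranks encode 3 audios each and one dummy embedding is discarded afterwards.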

tests/collections/speechlm2/test_salm_cp_helpers.py

Lines changed: 4 additions & 5 deletions
```diff
@@ -13,11 +13,10 @@
 # limitations under the License.
 """CPU-only tests for the CP-helper module.
 
-The ``cp_size > 1`` paths in ``shard_bshd_for_cp`` and
-``encode_audio_with_cp_distribution`` require ``transformer_engine_torch``
-and a real ``torch.distributed`` process group respectively; they're
-exercised by the 2-GPU smoke. These tests cover the fallback contracts
-that run on every machine (``cp_mesh is None``, ``B_aud == 0``).
+The ``cp_size > 1`` path in ``encode_audio_with_cp_distribution`` requires
+a real ``torch.distributed`` process group; it's exercised by the 2-GPU
+smoke. These tests cover the fallback contracts that run on every machine
+(``cp_mesh is None``, ``B_aud == 0``).
 """
 import torch
 
```
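
In the spirit of the docstring's "fallback contracts", a minimal CPU-only check of the get_cp_mesh short-circuit might look like the sketch below. It assumes passing None is the inactive case, per the documented (None, 1, 0) return; the test name is invented.

```python
from nemo.collections.speechlm2.parts.cp_helpers import get_cp_mesh

def test_get_cp_mesh_returns_none_1_0_when_cp_inactive():
    # Documented contract: no CP submesh -> (None, 1, 0), so callers
    # can short-circuit without touching torch.distributed.
    cp_mesh, cp_size, cp_rank = get_cp_mesh(None)
    assert cp_mesh is None
    assert cp_size == 1
    assert cp_rank == 0
```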
