[torchrec][LocalShardsWrapper] Implement tensor padding for local shards wrapper (pytorch#163183)

jeffkbkim · facebook-github-bot · commit af4f5c0ddf4f · 2025-09-17T12:31:06.000-07:00
Summary: X-link: pytorch/torchrec#3382 This diff implements the constant padding functionality (aten.constant_pad_nd.default) for `LocalShardsWrapper`. The method applies constant padding to the local shards based on the provided padding specification. Depending on the sharding type (RW, CW), the padding on [left, right, top, bottom] directions will be either applied to the first/last shard, or all local shards. New unit tests cover: - 1D (RW) top/bottom paddings - 2D (CW) left, right, top, bottom paddings - empty shards, number of dimensions > 2 Test Plan: ``` buck2 test fbcode//mode/opt fbcode//torchrec/distributed/tests:test_shards_wrapper <...> Buck UI: https://www.internalfb.com/buck2/9fff7732-346a-43eb-b1a0-f0e43e2e8815 Test UI: https://www.internalfb.com/intern/testinfra/testrun/18014398620870153 Network: Up: 110KiB Down: 95KiB (reSessionID-c0cdcb56-f82e-4f42-9fb8-54d8a3fb74eb) Analyzing targets. Remaining 0/191 Executing actions. Remaining 0/12849 7.6s exec time total Command: test. Finished 5 local Time elapsed: 1:40.1s Test execution completed but tests were skipped Tests finished: Pass 14. Fail 0. Fatal 0. Skip 3. Build failure 0 ``` Rollback Plan: Differential Revision: D82663766
diff --git a/torch/distributed/tensor/_shards_wrapper.py b/torch/distributed/tensor/_shards_wrapper.py
@@ -109,6 +109,7 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None):  # type: ignore[
             aten.detach.default: cls.handle_detach,
             aten.clone.default: cls.handle_clone,
             aten.new_empty.default: cls.handle_new_empty,
+            aten.constant_pad_nd.default: cls.handle_constant_pad_nd,
         }
 
         if func in dispatcher:
@@ -223,6 +224,185 @@ def handle_new_empty(args, kwargs) -> "LocalShardsWrapper":
             self_ls.local_offsets(),
         )
 
+    @staticmethod
+    def handle_constant_pad_nd(args, kwargs) -> "LocalShardsWrapper":
+        """
+        Apply constant padding (aten.constant_pad_nd) to a LocalShardsWrapper.
+
+        The resulting wrapper represents the padded version of the logical tensor:
+        - A single local shard is padded directly with the given pad spec.
+        - 1D (row-wise) shards: top padding goes on the first shard, bottom on the last.
+        - 2D (column-wise) shards: left/right padding goes on the first/last shard
+          only, while top/bottom padding goes on every shard.
+
+        Args:
+            args: (wrapper, pad_spec[, pad_value]) as passed by the dispatcher.
+            kwargs: Optional keyword arguments; "value" supplies the pad value.
+
+        Raises:
+            NotImplementedError: Empty wrapper; or, with several shards, shards
+                that are not 1D/2D or a negative (cropping) pad entry.
+            ValueError: With several shards, a pad_spec of odd or excessive length.
+        """
+        self_lsw = args[0]
+        pad_spec = args[1]
+        pad_value = args[2] if len(args) > 2 else (kwargs or {}).get("value", 0.0)
+
+        local_shards = self_lsw.local_shards()
+        if not local_shards:
+            raise NotImplementedError("Padding empty LocalShardsWrapper is not supported.")
+
+        if len(local_shards) == 1:
+            padded_shard = torch.nn.functional.pad(
+                local_shards[0], pad_spec, mode="constant", value=pad_value
+            )
+            return LocalShardsWrapper([padded_shard], self_lsw.local_offsets())
+
+        ndim = local_shards[0].ndim
+        if ndim not in (1, 2):
+            raise NotImplementedError(
+                f"Padding for {ndim}D tensors is not supported. "
+                f"Only 1D and 2D tensors are currently supported."
+            )
+
+        # constant_pad_nd allows pad_spec to cover only the trailing dimension(s);
+        # zero-fill the rest so the code below (and the metadata helper) can index it.
+        if len(pad_spec) % 2 != 0 or len(pad_spec) > 2 * ndim:
+            raise ValueError(f"Invalid pad_spec {list(pad_spec)} for {ndim}D shards.")
+        full_spec = list(pad_spec) + [0] * (2 * ndim - len(pad_spec))
+        if any(pad < 0 for pad in full_spec):
+            # Cropping is not implemented; skipping it would desync shards and global size.
+            raise NotImplementedError("Negative padding is not supported for multiple shards.")
+
+        padded_shards = list(local_shards)
+        if ndim == 2:
+            # 2D column-wise sharding: [pad_left, pad_right, pad_top, pad_bottom]
+            pad_left, pad_right, pad_top, pad_bottom = full_spec
+            if pad_top > 0 or pad_bottom > 0:
+                # Rows are shared by every CW shard, so all shards grow together.
+                row_pad = [0, 0, pad_top, pad_bottom]
+                padded_shards = [
+                    torch.nn.functional.pad(
+                        shard, row_pad, mode="constant", value=pad_value
+                    )
+                    for shard in padded_shards
+                ]
+            first_pad, last_pad = [pad_left, 0, 0, 0], [0, pad_right, 0, 0]
+        else:
+            # 1D row-wise sharding: [pad_top, pad_bottom]
+            first_pad, last_pad = [full_spec[0], 0], [0, full_spec[1]]
+
+        # Only the outermost shards absorb padding along the sharded dimension.
+        if first_pad[0] > 0:
+            padded_shards[0] = torch.nn.functional.pad(
+                padded_shards[0], first_pad, mode="constant", value=pad_value
+            )
+        if last_pad[1] > 0:
+            padded_shards[-1] = torch.nn.functional.pad(
+                padded_shards[-1], last_pad, mode="constant", value=pad_value
+            )
+
+        # Update offsets and storage metadata
+        updated_offsets, updated_storage = LocalShardsWrapper._compute_updated_metadata(
+            self_lsw.storage_metadata(),
+            self_lsw.local_offsets(),
+            full_spec,
+            ndim,
+            padded_shards,
+        )
+
+        result = LocalShardsWrapper(padded_shards, updated_offsets)
+        result._storage_meta = updated_storage
+        return result
+
+    @staticmethod
+    def _compute_updated_metadata(
+        original_storage: TensorStorageMetadata,
+        original_offsets: list[torch.Size],
+        pad_spec: list[int],
+        ndim: int,
+        padded_shards: list[torch.Tensor],
+    ) -> tuple[list[torch.Size], TensorStorageMetadata]:
+        """
+        Compute updated offsets and storage metadata after padding is applied.
+
+        The first shard keeps its offset because it absorbs the leading padding
+        along the sharded dimension; every later shard is shifted by that amount.
+        The global size grows by the total padding applied to each dimension.
+
+        NOTE(review): index 0 of original_offsets is treated as the leading shard,
+        which presumably relies on shards being ordered by offset - confirm.
+
+        Args:
+            original_storage: Original storage metadata
+            original_offsets: Original shard offsets
+            pad_spec: Padding specification: [pad_top, pad_bottom] for 1D and
+                [pad_left, pad_right, pad_top, pad_bottom] for 2D. Missing
+                trailing entries are treated as 0.
+            ndim: Number of dimensions (1=RW or 2=CW)
+            padded_shards: Padded shard tensors
+
+        Returns:
+            Tuple of (updated_offsets, updated_storage_metadata)
+
+        Raises:
+            NotImplementedError: If ndim is neither 1 nor 2.
+        """
+        if ndim not in (1, 2):
+            raise NotImplementedError(f"Metadata computation for {ndim}D not supported")
+
+        # constant_pad_nd accepts a pad_spec that covers only the trailing
+        # dimension(s); zero-fill the rest so the unpacking below cannot fail.
+        full_spec = list(pad_spec) + [0] * max(0, 2 * ndim - len(pad_spec))
+
+        if ndim == 1:  # 1D RW
+            pad_top, pad_bottom = full_spec[:2]
+
+            # The first shard absorbs the top padding and keeps its offset;
+            # every later shard is shifted by the top padding amount.
+            updated_offsets = [
+                offset if i == 0 else torch.Size((offset[0] + pad_top,))
+                for i, offset in enumerate(original_offsets)
+            ]
+            new_global_size = torch.Size(
+                [original_storage.size[0] + pad_top + pad_bottom]
+            )
+        else:  # 2D CW
+            pad_left, pad_right, pad_top, pad_bottom = full_spec[:4]
+
+            # Top/bottom padding doesn't affect offsets. The first shard absorbs
+            # the left padding and keeps its column offset; every later shard's
+            # column offset is shifted by the left padding amount.
+            updated_offsets = [
+                torch.Size(
+                    (offset[0], offset[1] if i == 0 else offset[1] + pad_left)
+                )
+                for i, offset in enumerate(original_offsets)
+            ]
+            new_global_size = torch.Size(
+                [
+                    original_storage.size[0] + pad_top + pad_bottom,
+                    original_storage.size[1] + pad_left + pad_right,
+                ]
+            )
+
+        # Each chunk pairs a shard's updated offset with its padded size.
+        updated_chunks = [
+            ChunkStorageMetadata(
+                offsets=offset,
+                sizes=shard.size(),
+            )
+            for offset, shard in zip(updated_offsets, padded_shards)
+        ]
+
+        updated_storage = TensorStorageMetadata(
+            properties=original_storage.properties,
+            size=new_global_size,
+            chunks=updated_chunks,
+        )
+
+        return updated_offsets, updated_storage
+
     @property
     def device(self) -> torch._C.device:  # type: ignore[override]
         return (