PaddlePaddle · SigureMo · Aug 7, 2024 · Aug 6, 2024
diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
 import paddle
 from paddle import _C_ops, _legacy_C_ops
 from paddle.autograd import PyLayer
@@ -23,6 +27,10 @@
 
 from ....communication.reduce import ReduceOp, _get_reduce_op
 
+if TYPE_CHECKING:
+    from paddle import Tensor
+    from paddle._typing import ParamAttrLike, Size2
+
 
 class c_identity_eager(PyLayer):
     @staticmethod
@@ -696,16 +704,16 @@ def _parallel_embedding(
 
 
 def split(
-    x,
-    size,
-    operation,
-    axis=0,
-    num_partitions=1,
-    gather_out=True,
-    weight_attr=None,
-    bias_attr=None,
-    name=None,
-):
+    x: Tensor,
+    size: Size2,
+    operation: Literal['linear', 'embedding'],
+    axis: int = 0,
+    num_partitions: int = 1,
+    gather_out: bool = True,
+    weight_attr: ParamAttrLike | None = None,
+    bias_attr: ParamAttrLike | None = None,
+    name: str | None = None,
+) -> Tensor:
     """
 
     Split the weight of the specified operation into multiple devices

diff --git a/python/paddle/distributed/fleet/recompute/recompute.py b/python/paddle/distributed/fleet/recompute/recompute.py
@@ -12,10 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import contextlib
 import copy
 import inspect
 import weakref
+from typing import TYPE_CHECKING, Any, TypedDict
 
 import paddle
 from paddle import framework
@@ -28,6 +31,18 @@
 
 from ..utils.log_util import logger
 
+if TYPE_CHECKING:
+    from collections.abc import Callable, Sequence
+
+    from typing_extensions import NotRequired
+
+    from paddle.nn import Sequential
+
+    class _Ctx(TypedDict):
+        segments: NotRequired[int]  # optional (no defaults in TypedDict)
+        preserve_rng_state: NotRequired[bool]  # optional key
+
+
 __all__ = []
 
 
@@ -584,7 +599,12 @@ def recompute(function, *args, **kwargs):
         return _recompute_without_reentrant(function, preserve, *args, **kwargs)
 
 
-def recompute_sequential(ctx, functions, *args, **kwargs):
+def recompute_sequential(
+    ctx: _Ctx,
+    functions: Sequential | Sequence[Callable[..., Any]],
+    *args: Any,
+    **kwargs: Any,
+) -> Any:
     """
     recompute intermediate activations to save the memory for 'Sequential' models. use 'ctx' to transmit some context params, it is similar to 'recompute_hybrid' API.
 

diff --git a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
@@ -11,6 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, TypedDict
 
 import paddle
 from paddle import framework
@@ -25,6 +28,20 @@
     switch_rng_state_tracker,
 )
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from typing_extensions import NotRequired
+
+    from paddle.distributed.communication.group import Group
+    from paddle.nn import Layer
+
+    class _Ctx(TypedDict):
+        mp_group: Group  # required key
+        offload: NotRequired[bool]  # optional key
+        partition: NotRequired[bool]  # optional key
+
+
 __all__ = []
 
 
@@ -245,7 +262,9 @@ def backward(ctx, *args):
             return grads
 
 
-def recompute_hybrid(ctx, function, *args, **kwargs):
+def recompute_hybrid(
+    ctx: _Ctx, function: Layer | Callable[..., Any], *args: Any, **kwargs: Any
+) -> Any:
     """
     recompute intermediate activations to save the memory in hybrid parallel scene.
     # NOTE(shenliang03)The current hybrid parallel recompute has limitations.