-from pathlib import Path
-from typing import TYPE_CHECKING, Any, TypedDict, cast
+from typing import Any, Callable, TypedDict
+from weakref import WeakKeyDictionary

 import torch
-from accelerate import Accelerator  # type: ignore
 from jaxtyping import Float
 from torch import Tensor, nn
+from torch.utils.hooks import RemovableHandle

 from diffulab.networks.denoisers.mmdit import MMDiT
 from diffulab.networks.repa.common import REPA
 from diffulab.networks.repa.dinov2 import DinoV2
 from diffulab.networks.repa.perceiver_resampler import PerceiverResampler
 from diffulab.training.losses.common import LossFunction

-if TYPE_CHECKING:
-    from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
-    from torch.nn.parallel import DistributedDataParallel
+try:
+    from torch._dynamo import disable as _dynamo_disable  # type: ignore
+except Exception:
+
+    def _dynamo_disable(fn: Any) -> Any:
+        return fn


 class ResamplerParams(TypedDict):
@@ -27,7 +30,53 @@ class ResamplerParams(TypedDict):


 class RepaLoss(LossFunction):
+    """Representation Alignment (REPA) loss.
+
+    Aligns intermediate features from a denoiser (MMDiT) to features from an
+    external vision encoder (e.g., DINOv2) using a projection MLP and, optionally,
+    a Perceiver resampler. Denoiser features are captured via a forward hook on a
+    specified transformer block and compared to encoder features using cosine
+    similarity. The loss is averaged over the sequence dimension and scaled by
+    ``coeff``.
+
+    Typical usage:
+        loss_fn = RepaLoss(...)
+        loss_fn.set_model(denoiser)
+        # Run a forward pass through the denoiser to populate captured features
+        loss = loss_fn(x0=batch_images)  # or pass dst_features=...
+
+    Args:
+        repa_encoder: Key of the encoder to instantiate. Supported values are
+            keys of ``encoder_registry``, e.g. "dinov2".
+        encoder_args: Keyword arguments forwarded to the encoder constructor.
+        alignment_layer: 1-based index of the MMDiT layer from which to capture
+            features.
+        denoiser_dimension: Feature dimensionality of the denoiser at the
+            alignment layer.
+        hidden_dim: Hidden size of the projection MLP.
+        load_dino: Whether to instantiate and load the encoder. Set to ``False``
+            when precomputed ``dst_features`` will be supplied at call time.
+        embedding_dim: Target embedding dimensionality when the encoder is not
+            instantiated (i.e., when ``load_dino=False``).
+        use_resampler: Whether to apply a :class:`PerceiverResampler` after the
+            projection MLP.
+        resampler_params: Configuration for the :class:`PerceiverResampler`.
+            Required if ``use_resampler=True``.
+        coeff: Multiplicative weight applied to the returned loss value.
+
+    Attributes:
+        repa_encoder: The instantiated encoder or ``None`` if
+            ``load_dino=False``.
+        proj: Projection MLP mapping denoiser features to the encoder embedding
+            space.
+        resampler: Optional :class:`PerceiverResampler` applied after the
+            projection.
+        alignment_layer: 1-based index of the hooked MMDiT layer.
+        coeff: Multiplicative weight applied to the returned loss.
+    """
+
     encoder_registry: dict[str, type[REPA]] = {"dinov2": DinoV2}
+    name: str = "RepaLoss"

     def __init__(
         self,
@@ -69,79 +118,77 @@ def __init__(
                 **resampler_params,
             )
         self.alignment_layer = alignment_layer
-        self._hook_handle = None
-        self.src_features: Tensor | None = None
+        self._handles: "WeakKeyDictionary[nn.Module, RemovableHandle]" = WeakKeyDictionary()
+        self._features: "WeakKeyDictionary[nn.Module, torch.Tensor]" = WeakKeyDictionary()
+        self._active_model: nn.Module | None = None
+        self._hook_layer_idx = self.alignment_layer - 1  # as before
         self.coeff = coeff

-    def _register_hook(self, model: MMDiT) -> None:
-        """Register the forward hook on the specified layer of the model."""
-        self._unregister_hook()  # Ensure no previous hook is registered
-        self._hook_handle = model.layers[self.alignment_layer - 1].register_forward_hook(self._forward_hook)
-
-    def _unregister_hook(self) -> None:
-        """Remove the forward hook."""
-        if self._hook_handle is not None:
-            self._hook_handle.remove()
-            self._hook_handle = None
+    @_dynamo_disable
+    def _make_forward_hook(self, key_model: MMDiT) -> Callable[[nn.Module, tuple[Any, ...], torch.Tensor], None]:
+        def _hook(_mod: nn.Module, _inp: tuple[Any, ...], out: torch.Tensor):
+            self._features[key_model] = out

-    def set_model(self, model: MMDiT) -> None:  # type: ignore
-        """Switch the hook to a different model (e.g., EMA model)."""
-        self._register_hook(model)
+        return _hook

-    def _forward_hook(self, net: nn.Module, input: tuple[Any, ...], output: Tensor) -> None:
-        """
-        Hook to capture the output of the specified layer during the forward pass.
-        """
-        self.src_features = output
-
-    def save(self, path: str | Path, accelerator: Accelerator) -> None:
-        """
-        Save state dict containing projection (and resampler if present).
-
-        Args:
-            path (str | Path): Path to save the loss function.
-            accelerator (Accelerator | None): Accelerator instance for distributed training. Uses
-                accelerator.save if provided.
-        """
-        file_path = Path(path) / "RepaLoss.pt"
+    def _attach_once(self, model: MMDiT) -> None:
+        if model in self._handles:
+            return
+        layer = model.layers[self._hook_layer_idx]
+        handle = layer.register_forward_hook(self._make_forward_hook(model))  # type: ignore
+        self._handles[model] = handle

-        unwrapped_proj = cast(nn.Module, accelerator.unwrap_model(self.proj))  # type: ignore
-        merged_state = {}
-        for k, v in unwrapped_proj.state_dict().items():
-            merged_state[f"proj.{k}"] = v
-        if self.resampler is not None:
-            unwrapped_resampler = cast(nn.Module, accelerator.unwrap_model(self.resampler))  # type: ignore
-            for k, v in unwrapped_resampler.state_dict().items():
-                merged_state[f"resampler.{k}"] = v
+    def set_model(self, model: MMDiT) -> None:  # type: ignore
+        """Register the model to capture features from a specific layer.

-        accelerator.save(merged_state, file_path)  # type: ignore
-
-    def accelerate_prepare(
-        self, accelerator: Accelerator
-    ) -> "list[nn.Module | DistributedDataParallel | FullyShardedDataParallel]":
-        """
-        Prepare the loss function for distributed training.
+        This attaches a forward hook to the specified ``alignment_layer`` of the
+        provided model (only once). A forward pass on ``model`` must be executed
+        after calling this method so that features are captured before computing
+        the loss.

         Args:
-            accelerator (Accelerator): Accelerator instance for distributed training.
+            model (MMDiT): The model whose intermediate features will be
+                aligned to the encoder features.
         """
-        trainable_modules: "list[nn.Module | DistributedDataParallel | FullyShardedDataParallel]" = []
-        self.proj = accelerator.prepare_model(self.proj)  # type: ignore
-        trainable_modules.append(self.proj)  # type: ignore
-        if self.resampler is not None:
-            self.resampler = accelerator.prepare_model(self.resampler)  # type: ignore
-            trainable_modules.append(self.resampler)  # type: ignore
-        if self.repa_encoder is not None:
-            self.repa_encoder = accelerator.prepare_model(self.repa_encoder)  # type: ignore
+        self._attach_once(model)
+        self._active_model = model

-        return trainable_modules
+    def _unregister_all(self) -> None:
+        for h in list(self._handles.values()):
+            h.remove()
+        self._handles.clear()
+        self._features.clear()
+        self._active_model = None

     def forward(
         self,
         x0: Float[Tensor, "batch 3 H W"] | None = None,
         dst_features: Float[Tensor, "batch seq_len n_dim"] | None = None,
     ) -> Tensor:
-        assert self.src_features is not None, "Source features are not computed. Ensure the forward hook is registered."
+        """Compute the REPA cosine-similarity loss.
+
+        Either provide input images via ``x0`` to compute destination features
+        with the encoder, or pass precomputed ``dst_features`` directly.
+
+        Args:
+            x0 (Tensor): Input images of shape ``[B, 3, H, W]`` used to compute encoder
+                features when an encoder is available.
+            dst_features (Tensor): Precomputed encoder features of shape ``[B, S, D]``.
+                If provided, ``x0`` is ignored.
+
+        Returns:
+            Tensor: A scalar tensor containing the REPA loss.
+
+        Raises:
+            RuntimeError: If no captured features are available for the active
+                model. Ensure ``set_model(...)`` was called and a forward pass
+                on the model was executed first.
+            AssertionError: If neither ``x0`` nor ``dst_features`` is provided.
+        """
+        if self._active_model is None or self._active_model not in self._features:
+            raise RuntimeError(
+                "REPA: no captured features for the active model. Did you call set_model(...) and run a forward pass?"
+            )
         assert x0 is not None or dst_features is not None, "Either x0 or dst_features must be provided."
         if dst_features is None:
             assert self.repa_encoder is not None, "REPA encoder must be initialized to compute features."
@@ -151,7 +198,8 @@ def forward(
             )  # batch size seqlen embedding_dim  # SEE HOW TO HANDLE THE PRE COMPUTING OF FEATURES
         assert dst_features is not None, "Destination features must be provided or computed."

-        projected_src_features: Tensor = self.proj(self.src_features)  # type: ignore
+        src_features = self._features[self._active_model]
+        projected_src_features: Tensor = self.proj(src_features)

         if self.resampler is not None:
             projected_src_features = self.resampler(projected_src_features)
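
The diff replaces the single `_hook_handle`/`src_features` pair with per-model bookkeeping keyed by weak references, so hooking an EMA copy neither keeps it alive nor overwrites features captured from the training model. Below is a minimal, self-contained sketch of that pattern; `TinyBlocks` and `FeatureTap` are invented stand-ins for illustration, not diffulab code.

from weakref import WeakKeyDictionary

import torch
from torch import nn
from torch.utils.hooks import RemovableHandle


class TinyBlocks(nn.Module):
    """Made-up stand-in for MMDiT: a few blocks exposed as .layers."""

    def __init__(self) -> None:
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(3)])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for layer in self.layers:
            x = layer(x)
        return x


class FeatureTap:
    """Per-model hook bookkeeping, mirroring the WeakKeyDictionary pattern in the diff."""

    def __init__(self, layer_idx: int) -> None:
        self.layer_idx = layer_idx
        self._handles: "WeakKeyDictionary[nn.Module, RemovableHandle]" = WeakKeyDictionary()
        self._features: "WeakKeyDictionary[nn.Module, torch.Tensor]" = WeakKeyDictionary()

    def attach_once(self, model: TinyBlocks) -> None:
        if model in self._handles:
            return  # already hooked; attaching twice would capture duplicates

        def _hook(_mod: nn.Module, _inp: tuple, out: torch.Tensor) -> None:
            self._features[model] = out  # features keyed by the hooked model

        self._handles[model] = model.layers[self.layer_idx].register_forward_hook(_hook)


model = TinyBlocks()
tap = FeatureTap(layer_idx=1)
tap.attach_once(model)
_ = model(torch.randn(2, 8))       # forward pass fires the hook
print(tap._features[model].shape)  # torch.Size([2, 8])

Because both dictionaries hold only weak references to the model, dropping the last strong reference to an EMA copy lets its hook handle and cached features be garbage-collected without an explicit unregister call.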
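For reference, the objective described in the new class docstring (cosine similarity between projected denoiser features and encoder features, averaged over the sequence dimension and scaled by ``coeff``) corresponds to something like the sketch below. The exact reduction and the ``coeff`` default are assumptions, since the tail of ``forward`` falls outside this diff.

import torch
import torch.nn.functional as F


def repa_alignment_loss(
    projected_src: torch.Tensor,  # [B, S, D] projected denoiser features
    dst_features: torch.Tensor,   # [B, S, D] encoder features (e.g. DINOv2)
    coeff: float = 0.5,           # illustrative default; the real value comes from the constructor
) -> torch.Tensor:
    # Negative cosine similarity per token, averaged over batch and sequence.
    cos = F.cosine_similarity(projected_src, dst_features, dim=-1)  # [B, S]
    return coeff * (-cos.mean())


loss = repa_alignment_loss(torch.randn(2, 16, 768), torch.randn(2, 16, 768))
print(loss.item())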