Skip to content

Commit 96399c3

Browse files
SunMarc and sayakpaul authored
Fix sharding when no device_map is passed (#8531)
* Fix sharding when no device_map is passed
* style
* add tests
* align
* add docstring
* format

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
1 parent 10d3220 commit 96399c3

File tree

3 files changed

+63
-4
lines changed

3 files changed

+63
-4
lines changed

src/diffusers/models/modeling_utils.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
462462
device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
463463
A map that specifies where each submodule should go. It doesn't need to be defined for each
464464
parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
465-
same device.
465+
same device. Defaults to `None`, meaning that the model will be loaded on CPU.
466466
467467
Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
468468
more information about each option see [designing a device
@@ -774,7 +774,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
774774
else: # else let accelerate handle loading and dispatching.
775775
# Load weights and dispatch according to the device_map
776776
# by default the device_map is None and the weights are loaded on the CPU
777+
force_hook = True
777778
device_map = _determine_device_map(model, device_map, max_memory, torch_dtype)
779+
if device_map is None and is_sharded:
780+
# we load the parameters on the cpu
781+
device_map = {"": "cpu"}
782+
force_hook = False
778783
try:
779784
accelerate.load_checkpoint_and_dispatch(
780785
model,
@@ -784,7 +789,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
784789
offload_folder=offload_folder,
785790
offload_state_dict=offload_state_dict,
786791
dtype=torch_dtype,
787-
force_hooks=True,
792+
force_hooks=force_hook,
788793
strict=True,
789794
)
790795
except AttributeError as e:
@@ -808,12 +813,14 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
808813
model._temp_convert_self_to_deprecated_attention_blocks()
809814
accelerate.load_checkpoint_and_dispatch(
810815
model,
811-
model_file,
816+
model_file if not is_sharded else sharded_ckpt_cached_folder,
812817
device_map,
813818
max_memory=max_memory,
814819
offload_folder=offload_folder,
815820
offload_state_dict=offload_state_dict,
816821
dtype=torch_dtype,
822+
force_hook=force_hook,
823+
strict=True,
817824
)
818825
model._undo_temp_convert_self_to_deprecated_attention_blocks()
819826
else:

tests/models/test_modeling_common.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -872,6 +872,39 @@ def test_model_parallelism(self):
872872

873873
@require_torch_gpu
874874
def test_sharded_checkpoints(self):
875+
config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
876+
model = self.model_class(**config).eval()
877+
model = model.to(torch_device)
878+
879+
torch.manual_seed(0)
880+
base_output = model(**inputs_dict)
881+
882+
model_size = compute_module_sizes(model)[""]
883+
max_shard_size = int((model_size * 0.75) / (2**10)) # Convert to KB as these test models are small.
884+
with tempfile.TemporaryDirectory() as tmp_dir:
885+
model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
886+
self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
887+
888+
# Now check if the right number of shards exists. First, let's get the number of shards.
889+
# Since this number can be dependent on the model being tested, it's important that we calculate it
890+
# instead of hardcoding it.
891+
with open(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)) as f:
892+
weight_map_dict = json.load(f)["weight_map"]
893+
first_key = list(weight_map_dict.keys())[0]
894+
weight_loc = weight_map_dict[first_key] # e.g., diffusion_pytorch_model-00001-of-00002.safetensors
895+
expected_num_shards = int(weight_loc.split("-")[-1].split(".")[0])
896+
897+
actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
898+
self.assertTrue(actual_num_shards == expected_num_shards)
899+
900+
new_model = self.model_class.from_pretrained(tmp_dir)
901+
902+
torch.manual_seed(0)
903+
new_output = new_model(**inputs_dict)
904+
self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
905+
906+
@require_torch_gpu
907+
def test_sharded_checkpoints_device_map(self):
875908
config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
876909
model = self.model_class(**config).eval()
877910
if model._no_split_modules is None:

tests/models/unets/test_models_unet_2d_condition.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1038,14 +1038,33 @@ def test_ip_adapter_plus(self):
10381038
@require_torch_gpu
10391039
def test_load_sharded_checkpoint_from_hub(self):
10401040
_, inputs_dict = self.prepare_init_args_and_inputs_for_common()
1041-
loaded_model = self.model_class.from_pretrained("hf-internal-testing/unet2d-sharded-dummy", device_map="auto")
1041+
loaded_model = self.model_class.from_pretrained("hf-internal-testing/unet2d-sharded-dummy")
10421042
new_output = loaded_model(**inputs_dict)
10431043

10441044
assert loaded_model
10451045
assert new_output.sample.shape == (4, 4, 16, 16)
10461046

10471047
@require_torch_gpu
10481048
def test_load_sharded_checkpoint_from_hub_local(self):
1049+
_, inputs_dict = self.prepare_init_args_and_inputs_for_common()
1050+
ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy")
1051+
loaded_model = self.model_class.from_pretrained(ckpt_path, local_files_only=True)
1052+
new_output = loaded_model(**inputs_dict)
1053+
1054+
assert loaded_model
1055+
assert new_output.sample.shape == (4, 4, 16, 16)
1056+
1057+
@require_torch_gpu
1058+
def test_load_sharded_checkpoint_device_map_from_hub(self):
1059+
_, inputs_dict = self.prepare_init_args_and_inputs_for_common()
1060+
loaded_model = self.model_class.from_pretrained("hf-internal-testing/unet2d-sharded-dummy", device_map="auto")
1061+
new_output = loaded_model(**inputs_dict)
1062+
1063+
assert loaded_model
1064+
assert new_output.sample.shape == (4, 4, 16, 16)
1065+
1066+
@require_torch_gpu
1067+
def test_load_sharded_checkpoint_device_map_from_hub_local(self):
10491068
_, inputs_dict = self.prepare_init_args_and_inputs_for_common()
10501069
ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy")
10511070
loaded_model = self.model_class.from_pretrained(ckpt_path, local_files_only=True, device_map="auto")

0 commit comments

Comments (0)