Initial attempt: keep refit weights as torch.Tensor instead of NumPy arrays

cehongwang · cehongwang · commit cf064c523e40 · 2025-06-13T22:34:24.000Z
diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -400,7 +400,7 @@ def _construct_trt_network_def(self) -> None:
     @staticmethod
     def find_weight(
         weight_name: str,
-        np_map: dict[str, Any],
+        weight_refit_map: dict[str, Any],
         state_dict: dict[str, Any],
         device: torch.device,
     ) -> str:
@@ -413,7 +413,7 @@ def find_weight(
         state_dict: state of the graph module
         """
         with unset_fake_temporarily():
-            network_weight = torch.from_numpy(np_map[weight_name]).to(device)
+            network_weight = weight_refit_map[weight_name].to(device)
             for sd_w_name, sd_weight in state_dict.items():
                 if TRTInterpreter.check_weight_equal(sd_weight, network_weight, device):
                     del state_dict[sd_w_name]
@@ -427,8 +427,8 @@ def check_weight_equal(
         device: torch.device,
     ) -> Any:
         with unset_fake_temporarily():
-            if not isinstance(network_weight, torch.Tensor):
-                network_weight = torch.from_numpy(network_weight).to(device)
+            if network_weight.device != device:
+                network_weight = network_weight.to(device)
             try:
                 return sd_weight.shape == network_weight.shape and torch.all(
                     torch.abs(sd_weight - network_weight) < 0.01
@@ -497,8 +497,8 @@ def _save_weight_mapping(self) -> None:
         self.module.to(torch_device)
         sd = self.module.state_dict()
         weight_name_map: dict[str, Any] = {}
-        np_map = self.ctx.weight_refit_map
-        constant_mapping = {k: v for k, v in np_map.items() if v.size == 1}
+        weight_refit_map = self.ctx.weight_refit_map
+        constant_mapping = {k: v for k, v in weight_refit_map.items() if v.size == 1}
         net = self.ctx.net
         for i in range(net.num_layers):
             layer = net[i]
@@ -540,7 +540,7 @@ def _save_weight_mapping(self) -> None:
                     else:
                         sd_weight_name = f"{sd_weight_name}.{torch_attr}"
 
-                    if engine_weight_name in np_map:
+                    if engine_weight_name in weight_refit_map:
                         weight_name_map[engine_weight_name] = sd_weight_name
 
         # Stage 2: Value mapping
@@ -549,10 +549,10 @@ def _save_weight_mapping(self) -> None:
                 # There is no direct connection in batch_norm layer. So skip it
                 pass
             elif sd_weight_name not in sd or not TRTInterpreter.check_weight_equal(
-                sd[sd_weight_name], np_map[engine_weight_name], torch_device
+                sd[sd_weight_name], weight_refit_map[engine_weight_name], torch_device
             ):
                 weight_name_map[engine_weight_name] = TRTInterpreter.find_weight(
-                    engine_weight_name, np_map, sd, torch_device
+                    engine_weight_name, weight_refit_map, sd, torch_device
                 )
                 if (
                     weight_name_map[engine_weight_name] != ""
@@ -563,12 +563,13 @@ def _save_weight_mapping(self) -> None:
 
             weight_name_map[engine_weight_name] = [
                 weight_name_map[engine_weight_name],
-                np_map[engine_weight_name].dtype,
+                weight_refit_map[engine_weight_name].dtype,
             ]
 
         weight_name_map["constant_mapping"] = constant_mapping
         self.weight_name_map = weight_name_map
-        del np_map, sd
+
+        del weight_refit_map, sd
         gc.collect()
         torch.cuda.empty_cache()
 
diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py
@@ -418,15 +418,8 @@ def create_constant(
                 ctx.cpu_weights_reference_holder[name + " FP4_CONSTANT"] = torch_value
                 return constant.get_output(0)
 
-            # TODO: Refit map uses numpy arrays. Remove this once refit is updated to use torch.Tensor
-            if torch_value.dtype == torch.bfloat16:
-                torch_value_fp32 = torch_value.to(torch.float32)
-                numpy_value = torch_value_fp32.numpy()
-            else:
-                numpy_value = torch_value.numpy()
-
             # Used for refit
-            ctx.weight_refit_map[name + " CONSTANT"] = numpy_value.reshape(-1)
+            ctx.weight_refit_map[name + " CONSTANT"] = torch_value
 
             # This is a buffer to hold the torch.Tensor so that they are alive during the course of TRT compilation.
             ctx.cpu_weights_reference_holder[name] = torch_value