From 8c69f8d16c0e7c390e40de1775f45ff3830180e7 Mon Sep 17 00:00:00 2001
From: Evan Li
Date: Wed, 2 Jul 2025 15:32:25 -0700
Subject: [PATCH 1/5] fix perf bug

---
 .../lowering/passes/constant_folding.py       |   4 +-
 tools/perf/perf_run.py                        | 131 ++++++++++++------
 tools/perf/utils.py                           |   2 +
 3 files changed, 91 insertions(+), 46 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py b/py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py
index 928b7284fe..7be199d52a 100644
--- a/py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py
+++ b/py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py
@@ -36,6 +36,8 @@ def constant_fold(
     # The constants are created on CPU to save GPU memory for TensorRT compilation.
     # For TRT INetwork construction the constants are moved to CPU in get_attr call.
     for node, constant in cf.node_replacements.items():
+        if node.target == torch.ops.aten.embedding.default:
+            continue
         replace_node_with_constant(
             gm, node, torch.nn.Parameter(constant, requires_grad=False)
         )
@@ -103,7 +105,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
         self.quantization_ops: Set[torch._ops.OpOverload] = set()
         try:
             # modelopt import ensures torch.ops.tensorrt.quantize_op.default is registered
-            import modelopt.torch.quantization as mtq
+            import modelopt.torch.quantization as mtq  # noqa: F401
 
             assert torch.ops.tensorrt.quantize_op.default
             self.quantization_ops.add(torch.ops.tensorrt.quantize_op.default)
diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py
index ca37316ea8..8028dd8c33 100644
--- a/tools/perf/perf_run.py
+++ b/tools/perf/perf_run.py
@@ -174,8 +174,7 @@ def run_ts_trt(model, input_tensors, params, precision, batch_size):
     compile_settings = {
         "inputs": input_tensors,
         "enabled_precisions": {precision_to_dtype(precision)},
-        "truncate_long_and_double": params.get("truncate", False),
-        "use_python_runtime": params.get("use_python_runtime", False),
+        "truncate_double": params.get("truncate", False),
     }
 
     if precision == "int8":
@@ -274,8 +273,7 @@ def run_dynamo(model, input_tensors, params, precision, batch_size):
         ir="dynamo",
         enabled_precisions={precision_to_dtype(precision)},
         min_block_size=params.get("min_block_size", 1),
-        debug=False,
-        truncate_long_and_double=params.get("truncate", False),
+        truncate_double=params.get("truncate", False),
         immutable_weights=params.get("immutable_weights", True),
         strip_engine_weights=params.get("strip_engine_weights", False),
         refit_identical_engine_weights=params.get(
@@ -284,6 +282,7 @@
         cache_built_engines=params.get("cache_built_engines", False),
         reuse_cached_engines=params.get("reuse_cached_engines", False),
         use_python_runtime=params.get("use_python_runtime", False),
+        optimization_level=5,
     )
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
@@ -437,25 +436,30 @@ def run_tensorrt(
     precision,
     batch_size=1,
 ):
-    # Export an ONNX model and convert to TRT
-    torch.onnx.export(model.eval().cuda(), tuple(input_tensors), "./tmp.onnx")
     logger = trt.Logger(trt.Logger.WARNING)
-    builder = trt.Builder(logger)
-    network = builder.create_network(
-        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-    )
-    parser = trt.OnnxParser(network, logger)
-    success = parser.parse_from_file("./tmp.onnx")
-    if not success:
-        raise ValueError("ONNX conversion failed")
-
-    config = builder.create_builder_config()
-    if precision == "fp16":
-        config.set_flag(trt.BuilderFlag.FP16)
-    start_compile = timeit.default_timer()
-    serialized_engine = builder.build_serialized_network(network, config)
-    end_compile = timeit.default_timer()
-    compile_time_s = end_compile - start_compile
+    compile_time_s = 0
+    if params["is_trt_engine"]:
+        serialized_engine = model
+    else:
+        # Export an ONNX model and convert to TRT
+        torch.onnx.export(model.eval().cuda(), tuple(input_tensors), "./tmp.onnx")
+        builder = trt.Builder(logger)
+        network = builder.create_network(
+            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+        )
+        parser = trt.OnnxParser(network, logger)
+        success = parser.parse_from_file("./tmp.onnx")
+        if not success:
+            raise ValueError("ONNX conversion failed")
+
+        config = builder.create_builder_config()
+        if precision == "fp16":
+            config.set_flag(trt.BuilderFlag.FP16)
+        config.builder_optimization_level = 5
+        start_compile = timeit.default_timer()
+        serialized_engine = builder.build_serialized_network(network, config)
+        end_compile = timeit.default_timer()
+        compile_time_s = end_compile - start_compile
     # Deserialize the TensorRT engine
     with trt.Runtime(logger) as runtime:
         engine = runtime.deserialize_cuda_engine(serialized_engine)
@@ -463,31 +467,66 @@
     print("Running TensorRT for precision: ", precision, " batch_size : ", batch_size)
     iters = params.get("iterations", 20)
 
-    # Compiling the bindings
-    bindings = engine.num_bindings * [None]
-    k = 0
-    for idx, _ in enumerate(bindings):
-        dtype = torch_dtype_from_trt(engine.get_binding_dtype(idx))
-        shape = tuple(engine.get_binding_shape(idx))
-        device = torch_device_from_trt(engine.get_location(idx))
-        if not engine.binding_is_input(idx):
-            # Output bindings
-            output = torch.empty(size=shape, dtype=dtype, device=device)
-            bindings[idx] = output.data_ptr()
-        else:
-            # Input bindings
-            bindings[idx] = input_tensors[k].data_ptr()
-            k += 1
+    # Get I/O tensor information using TensorRT 10 API
+    input_names = []
+    output_names = []
+    input_dtypes = []
+    output_dtypes = []
+    input_shapes = []
+    output_shapes = []
+
+    for i in range(engine.num_io_tensors):
+        tensor_name = engine.get_tensor_name(i)
+        tensor_mode = engine.get_tensor_mode(tensor_name)
+        tensor_dtype = engine.get_tensor_dtype(tensor_name)
+        tensor_shape = engine.get_tensor_shape(tensor_name)
+
+        if tensor_mode == trt.TensorIOMode.INPUT:
+            input_names.append(tensor_name)
+            input_dtypes.append(torch_dtype_from_trt(tensor_dtype))
+            input_shapes.append(tuple(tensor_shape))
+        else:  # trt.TensorIOMode.OUTPUT
+            output_names.append(tensor_name)
+            output_dtypes.append(torch_dtype_from_trt(tensor_dtype))
+            output_shapes.append(tuple(tensor_shape))
+
+    # Create output tensors
+    output_tensors = []
+    for i, (shape, dtype) in enumerate(zip(output_shapes, output_dtypes)):
+        output = torch.empty(size=shape, dtype=dtype, device="cuda")
+        output_tensors.append(output)
 
     timings = []
     with engine.create_execution_context() as context:
+        # Set input tensor addresses
+        for i, (input_name, input_tensor) in enumerate(zip(input_names, input_tensors)):
+            context.set_tensor_address(input_name, input_tensor.data_ptr())
+
+        # Set output tensor addresses
+        for output_name, output_tensor in zip(output_names, output_tensors):
+            context.set_tensor_address(output_name, output_tensor.data_ptr())
+
+        # Create a dedicated stream for TensorRT execution
+        dedicated_stream = torch.cuda.Stream()
+        current_stream = torch.cuda.current_stream()
+
+        # Warm up
         for i in range(WARMUP_ITER):
-            context.execute_async_v2(bindings, torch.cuda.current_stream().cuda_stream)
+            # Wait for current stream to finish
+            dedicated_stream.wait_stream(current_stream)
+            context.execute_async_v3(dedicated_stream.cuda_stream)
+            # Wait for TensorRT stream to finish
+            current_stream.wait_stream(dedicated_stream)
         torch.cuda.synchronize()
 
+        # Performance measurement
         for i in range(iters):
             start_time = timeit.default_timer()
-            context.execute_async_v2(bindings, torch.cuda.current_stream().cuda_stream)
+            # Wait for current stream to finish
+            dedicated_stream.wait_stream(current_stream)
+            context.execute_async_v3(dedicated_stream.cuda_stream)
+            # Wait for TensorRT stream to finish
+            current_stream.wait_stream(dedicated_stream)
             torch.cuda.synchronize()
             end_time = timeit.default_timer()
             meas_time = end_time - start_time
@@ -504,7 +543,6 @@ def run(
     params,
     precision,
     batch_size=1,
-    is_trt_engine=False,
    model_torch=None,
 ):
     for backend in backends:
@@ -551,7 +589,6 @@ def run(
                 input_tensors,
                 params,
                 precision,
-                is_trt_engine,
                 batch_size,
             )
             run_dynamo(model_torch, input_tensors, params, precision, batch_size)
@@ -569,7 +606,7 @@ def run(
             )
         elif backend == "tensorrt":
             run_tensorrt(
-                model_torch,
+                model,
                 input_tensors,
                 params,
                 precision,
@@ -702,8 +739,13 @@
 
     # Load TorchScript model, if provided
     if os.path.exists(model_name):
-        print("Loading user provided torchscript model: ", model_name)
-        model = torch.jit.load(model_name).cuda().eval()
+        if params["is_trt_engine"]:
+            with open(model_name, "rb") as f:
+                model = f.read()
+            print("Loading user provided trt engine: ", model_name)
+        else:
+            print("Loading user provided torchscript model: ", model_name)
+            model = torch.jit.load(model_name).cuda().eval()
 
     # Load PyTorch Model, if provided
     if len(model_name_torch) > 0 and os.path.exists(model_name_torch):
@@ -746,7 +788,6 @@
             params,
             precision,
             batch_size,
-            is_trt_engine,
             model_torch=model_torch,
         )
 
diff --git a/tools/perf/utils.py b/tools/perf/utils.py
index 5dae807892..0fd38e6447 100644
--- a/tools/perf/utils.py
+++ b/tools/perf/utils.py
@@ -176,6 +176,8 @@ def torch_dtype_from_trt(dtype):
         return torch.bool
     elif dtype == trt.int32:
         return torch.int32
+    elif dtype == trt.int64:
+        return torch.int64
     elif dtype == trt.float16:
         return torch.float16
     elif dtype == trt.float32:

From 962fb48338632a82ac62d4ee317e2b51f36b0245 Mon Sep 17 00:00:00 2001
From: Evan Li
Date: Wed, 2 Jul 2025 15:42:25 -0700
Subject: [PATCH 2/5] minor update

---
 tools/perf/perf_run.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py
index 8028dd8c33..fe2ba3073c 100644
--- a/tools/perf/perf_run.py
+++ b/tools/perf/perf_run.py
@@ -282,7 +282,7 @@ def run_dynamo(model, input_tensors, params, precision, batch_size):
         cache_built_engines=params.get("cache_built_engines", False),
         reuse_cached_engines=params.get("reuse_cached_engines", False),
         use_python_runtime=params.get("use_python_runtime", False),
-        optimization_level=5,
+        optimization_level=params.get("optimization_level", 5),
     )
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
@@ -455,7 +455,7 @@ def run_tensorrt(
         config = builder.create_builder_config()
         if precision == "fp16":
             config.set_flag(trt.BuilderFlag.FP16)
-        config.builder_optimization_level = 5
+        config.builder_optimization_level = params.get("optimization_level", 5)
         start_compile = timeit.default_timer()
         serialized_engine = builder.build_serialized_network(network, config)
         end_compile = timeit.default_timer()
@@ -680,6 +680,12 @@
     arg_parser.add_argument(
         "--truncate",
         action="store_true",
         help="Truncate long and double weights in the network in Torch-TensorRT",
     )
+    arg_parser.add_argument(
+        "--optimization_level",
+        type=int,
+        default=5,
+        help="Builder optimization level for TensorRT",
+    )
     arg_parser.add_argument(
         "--is_trt_engine",
         action="store_true",

From 785c25a2f8edc39b9dfb3893a51f31d690c1c39b Mon Sep 17 00:00:00 2001
From: Evan Li
Date: Thu, 17 Jul 2025 14:19:15 -0700
Subject: [PATCH 3/5] revert linear converter

---
 .../dynamo/conversion/aten_ops_converters.py  | 20 +++++++
 .../dynamo/conversion/impl/__init__.py        |  1 +
 .../dynamo/conversion/impl/linear.py          | 54 +++++++++++++++++++
 .../dynamo/lowering/_decomposition_groups.py  |  1 +
 .../lowering/passes/_aten_lowering_pass.py    |  2 +
 .../dynamo/lowering/passes/lower_linear.py    | 42 +++++++++++++++
 tools/perf/perf_run.py                        | 33 ++++++++----
 7 files changed, 143 insertions(+), 10 deletions(-)
 create mode 100644 py/torch_tensorrt/dynamo/conversion/impl/linear.py
 create mode 100644 py/torch_tensorrt/dynamo/lowering/passes/lower_linear.py

diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py
index 8c0706539c..5e7a65d551 100644
--- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py
+++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py
@@ -3579,3 +3579,23 @@ def aten_ops_nonzero(
         name,
         args[0],
     )
+
+
+@dynamo_tensorrt_converter(torch.ops.aten.linear.default, supports_dynamic_shapes=True)
+@dynamo_tensorrt_converter(torch.ops.aten.linear, supports_dynamic_shapes=True)
+def aten_ops_linear(
+    ctx: ConversionContext,
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> Union[TRTTensor, Sequence[TRTTensor]]:
+    return impl.linear.linear(
+        ctx,
+        target,
+        SourceIR.ATEN,
+        name,
+        input=args[0],
+        weight=args[1],
+        bias=args_bounds_check(args, 2, None),
+    )
diff --git a/py/torch_tensorrt/dynamo/conversion/impl/__init__.py b/py/torch_tensorrt/dynamo/conversion/impl/__init__.py
index 10af2ad892..61728392da 100644
--- a/py/torch_tensorrt/dynamo/conversion/impl/__init__.py
+++ b/py/torch_tensorrt/dynamo/conversion/impl/__init__.py
@@ -12,6 +12,7 @@
     embedding,
     full,
     grid,
+    linear,
     matmul,
     nccl_ops,
     normalization,
diff --git a/py/torch_tensorrt/dynamo/conversion/impl/linear.py b/py/torch_tensorrt/dynamo/conversion/impl/linear.py
new file mode 100644
index 0000000000..69ef73964d
--- /dev/null
+++ b/py/torch_tensorrt/dynamo/conversion/impl/linear.py
@@ -0,0 +1,54 @@
+from typing import Optional, Union
+
+import numpy as np
+import tensorrt as trt
+import torch
+from torch.fx.node import Target
+from torch_tensorrt.dynamo.conversion import impl
+from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
+from torch_tensorrt.dynamo.conversion.converter_utils import SourceIR, get_trt_tensor
+from torch_tensorrt.fx.types import TRTTensor
+
+
+def linear(
+    ctx: ConversionContext,
+    target: Union[Target, str],
+    source_ir: Optional[SourceIR],
+    name: str,
+    input: TRTTensor,
+    weight: Union[TRTTensor, torch.Tensor, np.ndarray],
+    bias: Optional[Union[TRTTensor, torch.Tensor, np.ndarray]],
+) -> TRTTensor:
+    # Process weight terms
+    if not isinstance(weight, (TRTTensor, torch.Tensor, np.ndarray)):
+        raise RuntimeError(
+            f"Linear layer {name} has weight of type {type(weight)}, Expect Union[TRTTensor, torch.Tensor, np.ndarray],"
+        )
+    elif isinstance(weight, (torch.Tensor, np.ndarray)):
+        weight = get_trt_tensor(ctx, weight, f"{name}_weight")
+
+    # Process bias terms
+    if bias is not None and not isinstance(bias, (TRTTensor, torch.Tensor, np.ndarray)):
+        raise RuntimeError(
+            f"Linear layer {name} has bias of type {type(bias)}, Expect Union[TRTTensor, torch.Tensor, np.ndarray],"
+        )
+    elif isinstance(bias, (torch.Tensor, np.ndarray)):
+        bias = get_trt_tensor(ctx, bias, f"{name}_bias")
+
+    # add IMatrixMultiplyLayer
+    out = impl.matmul.matrix_multiply(
+        ctx,
+        target,
+        source_ir,
+        name,
+        input,
+        weight,
+        input_matrix_op=trt.MatrixOperation.NONE,
+        other_matrix_op=trt.MatrixOperation.TRANSPOSE,
+    )
+
+    if bias is not None:
+        # add bias
+        out = impl.elementwise.add(ctx, target, source_ir, name, out, bias)
+
+    return out
diff --git a/py/torch_tensorrt/dynamo/lowering/_decomposition_groups.py b/py/torch_tensorrt/dynamo/lowering/_decomposition_groups.py
index 825be75076..52b541d3a8 100644
--- a/py/torch_tensorrt/dynamo/lowering/_decomposition_groups.py
+++ b/py/torch_tensorrt/dynamo/lowering/_decomposition_groups.py
@@ -171,6 +171,7 @@
     aten.upsample_bilinear2d.vec,
     aten.upsample_trilinear3d.vec,
     aten.upsample_bicubic2d.vec,
+    aten.linear.default,
 }
diff --git a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py
index fff4473b47..96cda00f44 100644
--- a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py
+++ b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py
@@ -10,6 +10,7 @@
 from .constant_folding import constant_fold
 from .fuse_distributed_ops import fuse_distributed_ops
 from .fuse_prims_broadcast import fuse_prims_broadcast
+from .lower_linear import lower_linear
 from .pass_manager import DynamoPassManager
 from .remove_assert_nodes import remove_assert_nodes
 from .remove_detach import remove_detach
@@ -28,6 +29,7 @@
     accumulate_fp32_matmul,
     remove_num_users_is_0_nodes,
     complex_graph_detection,
+    lower_linear,
 ]
 
 pre_lowering_pass_list = [
diff --git a/py/torch_tensorrt/dynamo/lowering/passes/lower_linear.py b/py/torch_tensorrt/dynamo/lowering/passes/lower_linear.py
new file mode 100644
index 0000000000..dca3d9ed47
--- /dev/null
+++ b/py/torch_tensorrt/dynamo/lowering/passes/lower_linear.py
@@ -0,0 +1,42 @@
+import logging
+
+import torch
+from torch_tensorrt.dynamo._settings import CompilationSettings
+from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
+    clean_up_graph_after_modifications,
+)
+from torch_tensorrt.dynamo.utils import get_metadata, set_metadata
+
+logger = logging.getLogger(__name__)
+
+
+def lower_linear(
+    gm: torch.fx.GraphModule, settings: CompilationSettings
+) -> torch.fx.GraphModule:
+    """Replace aten.linear with an equivalent implementation which can be easily converted to TRT"""
+    orig_op = torch.ops.aten.addmm.default
+    replacement_op = torch.ops.aten.linear.default
+
+    # Original graph
+    def orig(
+        input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
+    ) -> torch.Tensor:
+        W_T = torch.ops.aten.permute.default(weight, [1, 0])
+        out = orig_op(bias, input, W_T)
+        return out
+
+    # Replacement graph
+    def replacement(
+        input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
+    ) -> torch.Tensor:
+        return replacement_op(input, weight, bias)
+
+    metadata = get_metadata(gm, orig_op)
+    replaced_nodes = torch.fx.subgraph_rewriter.replace_pattern(gm, orig, replacement)
+
+    if len(replaced_nodes) > 0:
+        gm = clean_up_graph_after_modifications(gm)
+        set_metadata(gm, replacement_op, metadata)
+        logger.debug(f"Graph after lowering linear:\n{gm.graph}")
+
+    return gm
diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py
index fe2ba3073c..2a06e0ce0c 100644
--- a/tools/perf/perf_run.py
+++ b/tools/perf/perf_run.py
@@ -282,7 +282,7 @@ def run_dynamo(model, input_tensors, params, precision, batch_size):
         cache_built_engines=params.get("cache_built_engines", False),
         reuse_cached_engines=params.get("reuse_cached_engines", False),
         use_python_runtime=params.get("use_python_runtime", False),
-        optimization_level=params.get("optimization_level", 5),
+        optimization_level=params.get("optimization_level", 3),
     )
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
@@ -441,21 +441,26 @@ def run_tensorrt(
     if params["is_trt_engine"]:
         serialized_engine = model
     else:
-        # Export an ONNX model and convert to TRT
-        torch.onnx.export(model.eval().cuda(), tuple(input_tensors), "./tmp.onnx")
+        if params["onnx"]:
+            onnx_path = params["onnx"]
+        else:
+            # Export an ONNX model and convert to TRT
+            onnx_path = "./onnx-trt.onnx"
+            exp_program = torch.export.export(model.eval().cuda(), tuple(input_tensors))
+            torch.onnx.export(exp_program, tuple(input_tensors), onnx_path)
         builder = trt.Builder(logger)
         network = builder.create_network(
             1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
         )
         parser = trt.OnnxParser(network, logger)
-        success = parser.parse_from_file("./tmp.onnx")
+        success = parser.parse_from_file(onnx_path)
         if not success:
             raise ValueError("ONNX conversion failed")
 
         config = builder.create_builder_config()
         if precision == "fp16":
             config.set_flag(trt.BuilderFlag.FP16)
-        config.builder_optimization_level = params.get("optimization_level", 5)
+        config.builder_optimization_level = params.get("optimization_level", 3)
         start_compile = timeit.default_timer()
         serialized_engine = builder.build_serialized_network(network, config)
         end_compile = timeit.default_timer()
@@ -561,7 +566,7 @@ def run(
             print("int8 precision expects calibration cache file for inference")
             return False
 
-        if (model is None) and (backend in ("tensorrt", "ts_trt", "all")):
+        if (model is None) and (backend in ("ts_trt", "all")):
             warnings.warn(
                 f"Requested backend {backend} without specifying a TorchScript Model, "
                 + "skipping this backend"
@@ -585,7 +590,7 @@ def run(
                 batch_size,
             )
             run_tensorrt(
-                model,
+                model_torch,
                 input_tensors,
                 params,
                 precision,
@@ -606,7 +611,7 @@ def run(
             )
         elif backend == "tensorrt":
             run_tensorrt(
-                model,
+                model_torch,
                 input_tensors,
                 params,
                 precision,
@@ -641,6 +646,12 @@
         default="",
         help="Name of torch model file",
     )
+    arg_parser.add_argument(
+        "--onnx",
+        type=str,
+        default="",
+        help="ONNX model file which helps bypass the step of exporting ONNX from torchscript model. If this argument is provided, the ONNX will be directly converted to TRT engine",
+    )
     arg_parser.add_argument(
         "--inputs",
         type=str,
@@ -683,7 +694,7 @@
     arg_parser.add_argument(
         "--optimization_level",
         type=int,
-        default=5,
+        default=3,
         help="Builder optimization level for TensorRT",
     )
     arg_parser.add_argument(
@@ -767,7 +778,9 @@
     )
 
     backends = parse_backends(params["backends"])
-    if ("dynamo" in backends or "torch_compile" in backends) and (model_torch is None):
+    if any(
+        backend in ["dynamo", "torch_compile", "tensorrt"] for backend in backends
+    ) and (model_torch is None):
         raise ValueError(
             "No Pytorch model (nn.Module) is provided for torchdynamo compilation. Please provide a pytorch model using --model_torch argument"
         )

From 22721c29dc909da8c34cccb1d37746f370bc4563 Mon Sep 17 00:00:00 2001
From: Evan Li
Date: Thu, 17 Jul 2025 18:49:23 -0700
Subject: [PATCH 4/5] fix comments

---
 .../lowering/passes/_aten_lowering_pass.py |  2 -
 .../dynamo/lowering/passes/lower_linear.py | 42 ------------------
 tools/perf/README.md                       |  4 +-
 tools/perf/perf_run.py                     | 10 ++---
 tools/perf/requirements.txt                |  3 +-
 5 files changed, 7 insertions(+), 54 deletions(-)
 delete mode 100644 py/torch_tensorrt/dynamo/lowering/passes/lower_linear.py

diff --git a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py
index 96cda00f44..fff4473b47 100644
--- a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py
+++ b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py
@@ -10,7 +10,6 @@
 from .constant_folding import constant_fold
 from .fuse_distributed_ops import fuse_distributed_ops
 from .fuse_prims_broadcast import fuse_prims_broadcast
-from .lower_linear import lower_linear
 from .pass_manager import DynamoPassManager
 from .remove_assert_nodes import remove_assert_nodes
 from .remove_detach import remove_detach
@@ -28,7 +28,6 @@
     accumulate_fp32_matmul,
     remove_num_users_is_0_nodes,
     complex_graph_detection,
-    lower_linear,
 ]
 
 pre_lowering_pass_list = [
diff --git a/py/torch_tensorrt/dynamo/lowering/passes/lower_linear.py b/py/torch_tensorrt/dynamo/lowering/passes/lower_linear.py
deleted file mode 100644
index dca3d9ed47..0000000000
--- a/py/torch_tensorrt/dynamo/lowering/passes/lower_linear.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import logging
-
-import torch
-from torch_tensorrt.dynamo._settings import CompilationSettings
-from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
-    clean_up_graph_after_modifications,
-)
-from torch_tensorrt.dynamo.utils import get_metadata, set_metadata
-
-logger = logging.getLogger(__name__)
-
-
-def lower_linear(
-    gm: torch.fx.GraphModule, settings: CompilationSettings
-) -> torch.fx.GraphModule:
-    """Replace aten.linear with an equivalent implementation which can be easily converted to TRT"""
-    orig_op = torch.ops.aten.addmm.default
-    replacement_op = torch.ops.aten.linear.default
-
-    # Original graph
-    def orig(
-        input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
-    ) -> torch.Tensor:
-        W_T = torch.ops.aten.permute.default(weight, [1, 0])
-        out = orig_op(bias, input, W_T)
-        return out
-
-    # Replacement graph
-    def replacement(
-        input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
-    ) -> torch.Tensor:
-        return replacement_op(input, weight, bias)
-
-    metadata = get_metadata(gm, orig_op)
-    replaced_nodes = torch.fx.subgraph_rewriter.replace_pattern(gm, orig, replacement)
-
-    if len(replaced_nodes) > 0:
-        gm = clean_up_graph_after_modifications(gm)
-        set_metadata(gm, replacement_op, metadata)
-        logger.debug(f"Graph after lowering linear:\n{gm.graph}")
-
-    return gm
diff --git a/tools/perf/README.md b/tools/perf/README.md
index 4d4579efb4..36c85386f7 100644
--- a/tools/perf/README.md
+++ b/tools/perf/README.md
@@ -9,8 +9,6 @@ This is a comprehensive Python benchmark suite to run perf runs using different
 
 5. TensorRT
 
-Note: Please note that for ONNX models, user can convert the ONNX model to TensorRT serialized engine and then use this package.
-
 ## Prerequisite
 
 Benchmark scripts depends on following Python packages in addition to requirements.txt packages
@@ -47,6 +45,7 @@ Here are the list of `CompileSpec` options that can be provided directly to comp
 * `--backends` : Comma separated string of backends. Eg: torch, torch_compile, dynamo, tensorrt
 * `--model` : Name of the model file (Can be a torchscript module or a tensorrt engine (ending in `.plan` extension)). If the backend is `dynamo` or `torch_compile`, the input should be a Pytorch module (instead of a torchscript module).
 * `--model_torch` : Name of the PyTorch model file (optional, only necessary if `dynamo` or `torch_compile` is a chosen backend)
+* `--onnx` : ONNX model file which helps bypass the step of exporting ONNX from `model_torch`. If this argument is provided, the ONNX will be directly converted to TRT engine
 * `--inputs` : List of input shapes & dtypes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT
 * `--batch_size` : Batch size
 * `--precision` : Comma separated list of precisions to build TensorRT engine Eg: fp32,fp16
@@ -54,6 +53,7 @@
 * `--truncate` : Truncate long and double weights in the network in Torch-TensorRT
 * `--is_trt_engine` : Boolean flag to be enabled if the model file provided is a TensorRT engine.
 * `--report` : Path of the output file where performance summary is written.
+* `--optimization_level` : Builder optimization level for TensorRT (from 1 to 5, 5 is the highest optimization).
 
 Eg:
diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py
index 2a06e0ce0c..65892683ae 100644
--- a/tools/perf/perf_run.py
+++ b/tools/perf/perf_run.py
@@ -444,10 +444,8 @@ def run_tensorrt(
         if params["onnx"]:
             onnx_path = params["onnx"]
         else:
-            # Export an ONNX model and convert to TRT
             onnx_path = "./onnx-trt.onnx"
-            exp_program = torch.export.export(model.eval().cuda(), tuple(input_tensors))
-            torch.onnx.export(exp_program, tuple(input_tensors), onnx_path)
+            torch.onnx.export(model, tuple(input_tensors), onnx_path, dynamo=True)
         builder = trt.Builder(logger)
         network = builder.create_network(
             1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
@@ -472,6 +470,7 @@
     print("Running TensorRT for precision: ", precision, " batch_size : ", batch_size)
     iters = params.get("iterations", 20)
 
+    start_time = timeit.default_timer()
     # Get I/O tensor information using TensorRT 10 API
     input_names = []
     output_names = []
@@ -526,7 +525,6 @@
         # Performance measurement
         for i in range(iters):
-            start_time = timeit.default_timer()
             # Wait for current stream to finish
             dedicated_stream.wait_stream(current_stream)
             context.execute_async_v3(dedicated_stream.cuda_stream)
             # Wait for TensorRT stream to finish
@@ -534,8 +532,8 @@
             current_stream.wait_stream(dedicated_stream)
             torch.cuda.synchronize()
             end_time = timeit.default_timer()
-            meas_time = end_time - start_time
-            timings.append(meas_time)
+            infer_time = end_time - start_time
+            timings.append(infer_time)
 
     recordStats("TensorRT", timings, precision, batch_size, compile_time_s)
diff --git a/tools/perf/requirements.txt b/tools/perf/requirements.txt
index fcfb0b3d53..efc11a05b5 100644
--- a/tools/perf/requirements.txt
+++ b/tools/perf/requirements.txt
@@ -4,6 +4,5 @@ pyyaml
 onnx
 pandas
 transformers
-diffusers==0.21.4
+diffusers
 timm==0.9.8
-

From a18580333967cd430f7fa8e38290b78d81b2a41c Mon Sep 17 00:00:00 2001
From: Evan Li
Date: Mon, 21 Jul 2025 17:50:06 -0700
Subject: [PATCH 5/5] fix comments and bugs
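
Beyond the comment fixes, this revision registers only the
torch.ops.aten.linear.default overload (dropping the duplicate
torch.ops.aten.linear registration), imports TRTTensor from
torch_tensorrt.dynamo.types and gives the converter's matmul/add layers
unique names, removes the stray `from tempfile import tempdir` import in
favor of the local `tmpdir`, drops the unused input dtype/shape
bookkeeping, and reworks the standalone-TensorRT timing so the reported
number is a per-iteration average that folds in the one-time I/O setup
cost, "apples to apples" with the Torch-TRT dynamo path. In sketch form:

    # per-iteration cost reported for the "tensorrt" backend
    infer_time = ((end_time - infer_start_time) + (setup_time - start_time)) / iters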
---
 .../dynamo/conversion/aten_ops_converters.py     |  1 -
 .../dynamo/conversion/impl/linear.py             |  6 +++---
 .../dynamo/runtime/_PythonTorchTensorRTModule.py |  3 +--
 tools/perf/perf_run.py                           | 16 +++++++++-------
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py
index 5e7a65d551..f1a7f9a8fc 100644
--- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py
+++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py
@@ -3582,7 +3582,6 @@ def aten_ops_nonzero(
 
 
 @dynamo_tensorrt_converter(torch.ops.aten.linear.default, supports_dynamic_shapes=True)
-@dynamo_tensorrt_converter(torch.ops.aten.linear, supports_dynamic_shapes=True)
 def aten_ops_linear(
     ctx: ConversionContext,
     target: Target,
diff --git a/py/torch_tensorrt/dynamo/conversion/impl/linear.py b/py/torch_tensorrt/dynamo/conversion/impl/linear.py
index 69ef73964d..ac94876dca 100644
--- a/py/torch_tensorrt/dynamo/conversion/impl/linear.py
+++ b/py/torch_tensorrt/dynamo/conversion/impl/linear.py
@@ -7,7 +7,7 @@
 from torch_tensorrt.dynamo.conversion import impl
 from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
 from torch_tensorrt.dynamo.conversion.converter_utils import SourceIR, get_trt_tensor
-from torch_tensorrt.fx.types import TRTTensor
+from torch_tensorrt.dynamo.types import TRTTensor
 
 
 def linear(
@@ -40,7 +40,7 @@ def linear(
         ctx,
         target,
         source_ir,
-        name,
+        f"{name}_matrix_multiply",
         input,
         weight,
         input_matrix_op=trt.MatrixOperation.NONE,
@@ -49,6 +49,6 @@ def linear(
 
     if bias is not None:
         # add bias
-        out = impl.elementwise.add(ctx, target, source_ir, name, out, bias)
+        out = impl.elementwise.add(ctx, target, source_ir, f"{name}_add", out, bias)
 
     return out
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
index 1d619b6ce3..777bb32a2d 100644
--- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -2,7 +2,6 @@
 
 import logging
 from contextlib import nullcontext
-from tempfile import tempdir
 from typing import Any, Dict, List, Optional, Sequence, Tuple
 
 import tensorrt as trt
@@ -539,7 +538,7 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
 
                     with tempfile.TemporaryDirectory() as tmpdir:
                         self.cudagraph.debug_dump(
-                            f"{tempdir}/{self.name}_cudagraph.dot"
+                            f"{tmpdir}/{self.name}_cudagraph.dot"
                         )
 
                 self.cudagraph.replay()  # type: ignore
diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py
index 65892683ae..f7bc94d27d 100644
--- a/tools/perf/perf_run.py
+++ b/tools/perf/perf_run.py
@@ -474,9 +474,7 @@ def run_tensorrt(
     # Get I/O tensor information using TensorRT 10 API
     input_names = []
     output_names = []
-    input_dtypes = []
     output_dtypes = []
-    input_shapes = []
     output_shapes = []
 
     for i in range(engine.num_io_tensors):
@@ -487,8 +485,6 @@ def run_tensorrt(
         if tensor_mode == trt.TensorIOMode.INPUT:
             input_names.append(tensor_name)
-            input_dtypes.append(torch_dtype_from_trt(tensor_dtype))
-            input_shapes.append(tuple(tensor_shape))
         else:  # trt.TensorIOMode.OUTPUT
             output_names.append(tensor_name)
             output_dtypes.append(torch_dtype_from_trt(tensor_dtype))
             output_shapes.append(tuple(tensor_shape))
@@ -514,6 +510,8 @@ def run_tensorrt(
         dedicated_stream = torch.cuda.Stream()
         current_stream = torch.cuda.current_stream()
 
+        setup_time = timeit.default_timer()
+
         # Warm up
         for i in range(WARMUP_ITER):
             # Wait for current stream to finish
@@ -523,6 +521,7 @@
             current_stream.wait_stream(dedicated_stream)
         torch.cuda.synchronize()
 
+        infer_start_time = timeit.default_timer()
         # Performance measurement
         for i in range(iters):
             # Wait for current stream to finish
@@ -531,9 +530,12 @@
             # Wait for TensorRT stream to finish
             current_stream.wait_stream(dedicated_stream)
             torch.cuda.synchronize()
-            end_time = timeit.default_timer()
-            infer_time = end_time - start_time
-            timings.append(infer_time)
+
+        end_time = timeit.default_timer()
+
+        # to compare against torch-trt dynamo apples to apples
+        infer_time = (end_time - infer_start_time + setup_time - start_time) / iters
+        timings.append(infer_time)
 
     recordStats("TensorRT", timings, precision, batch_size, compile_time_s)
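
A few reference sketches for reviewers follow; they are illustrations
distilled from the diffs above, not part of the patches themselves.

First, the TensorRT 10 runtime pattern that patches 1 and 5 migrate
tools/perf/perf_run.py to: execute_async_v2 and the integer bindings
list are gone in TensorRT 10, so I/O tensors are enumerated by name,
bound with set_tensor_address, and enqueued with execute_async_v3 on an
explicit stream. The sketch assumes a deserialized `engine` with static
I/O shapes, CUDA-resident `input_tensors`, and the `torch_dtype_from_trt`
helper from tools/perf/utils.py:

    import tensorrt as trt
    import torch

    def infer_once(engine, input_tensors, torch_dtype_from_trt):
        # Enumerate I/O tensors by name (TensorRT 10 drops integer bindings).
        input_names, outputs = [], {}
        for i in range(engine.num_io_tensors):
            name = engine.get_tensor_name(i)
            if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                input_names.append(name)
            else:
                outputs[name] = torch.empty(
                    size=tuple(engine.get_tensor_shape(name)),
                    dtype=torch_dtype_from_trt(engine.get_tensor_dtype(name)),
                    device="cuda",
                )

        with engine.create_execution_context() as context:
            # Bind device pointers by tensor name instead of a bindings list.
            for name, tensor in zip(input_names, input_tensors):
                context.set_tensor_address(name, tensor.data_ptr())
            for name, tensor in outputs.items():
                context.set_tensor_address(name, tensor.data_ptr())

            # Run on a dedicated stream, fenced against the current stream so
            # inputs are ready before enqueue and outputs before consumption.
            dedicated = torch.cuda.Stream()
            current = torch.cuda.current_stream()
            dedicated.wait_stream(current)
            context.execute_async_v3(dedicated.cuda_stream)
            current.wait_stream(dedicated)
            torch.cuda.synchronize()

        return outputs

Fencing the dedicated stream with wait_stream on both sides is what the
patched warm-up and measurement loops do as well; it orders the TensorRT
work against the default stream without a device-wide synchronize inside
the enqueue itself.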
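
Second, the semantics of the aten.linear converter added in patch 3 (and
trimmed to the .default overload in patch 5): it builds a matrix multiply
with MatrixOperation.TRANSPOSE applied to the weight, then an elementwise
add for the bias, i.e. the usual y = x @ W.T + b with W stored as
(out_features, in_features). A quick eager-mode check of that
equivalence, with illustrative shapes:

    import torch

    x = torch.randn(2, 4)  # (batch, in_features)
    W = torch.randn(3, 4)  # (out_features, in_features)
    b = torch.randn(3)     # (out_features,)

    # What the converter lowers to: transposed matmul plus bias add.
    manual = x @ W.transpose(0, 1) + b
    assert torch.allclose(torch.ops.aten.linear.default(x, W, b), manual)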
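
Finally, an illustrative invocation combining the flags this series adds
(--onnx, --optimization_level) with the existing ones documented in
tools/perf/README.md; the model and report paths are hypothetical:

    python perf_run.py \
      --backends tensorrt,dynamo \
      --model_torch ./resnet50.pt \
      --onnx ./resnet50.onnx \
      --inputs "(1, 3, 224, 224)@fp32" \
      --precision fp32,fp16 \
      --optimization_level 3 \
      --batch_size 1 \
      --report ./perf_summary.csv

To benchmark a prebuilt serialized engine instead, pass it via --model
together with --is_trt_engine; patch 1 makes the tensorrt backend read
that file as raw engine bytes rather than exporting ONNX.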