pytorch · zewenli98 · Jul 2, 2025 · Jul 2, 2025 · Jul 17, 2025 · Jul 18, 2025
diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py
@@ -3579,3 +3579,22 @@ def aten_ops_nonzero(
         name,
         args[0],
     )
+
+
+@dynamo_tensorrt_converter(torch.ops.aten.linear.default, supports_dynamic_shapes=True)
+def aten_ops_linear(  # Converter registered for aten.linear.default; forwards to impl.linear.linear
+    ctx: ConversionContext,
+    target: Target,
+    args: Tuple[Argument, ...],  # positional op arguments: (input, weight[, bias])
+    kwargs: Dict[str, Argument],  # not read by this converter
+    name: str,
+) -> Union[TRTTensor, Sequence[TRTTensor]]:
+    return impl.linear.linear(
+        ctx,
+        target,
+        SourceIR.ATEN,  # marks the op as originating from the ATen IR
+        name,
+        input=args[0],
+        weight=args[1],
+        bias=args_bounds_check(args, 2, None),  # bias is optional; presumably args[2] if present, else None (helper defined elsewhere - confirm)
+    )
diff --git a/py/torch_tensorrt/dynamo/conversion/impl/__init__.py b/py/torch_tensorrt/dynamo/conversion/impl/__init__.py
@@ -12,6 +12,7 @@
     embedding,
     full,
     grid,
+    linear,
     matmul,
     nccl_ops,
     normalization,

diff --git a/py/torch_tensorrt/dynamo/conversion/impl/linear.py b/py/torch_tensorrt/dynamo/conversion/impl/linear.py
@@ -0,0 +1,54 @@
+from typing import Optional, Union
+
+import numpy as np
+import tensorrt as trt
+import torch
+from torch.fx.node import Target
+from torch_tensorrt.dynamo.conversion import impl
+from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
+from torch_tensorrt.dynamo.conversion.converter_utils import SourceIR, get_trt_tensor
+from torch_tensorrt.dynamo.types import TRTTensor
+
+
+def linear(  # Builds a linear op: matmul of input with the transposed weight, then an optional bias add
+    ctx: ConversionContext,
+    target: Union[Target, str],
+    source_ir: Optional[SourceIR],
+    name: str,  # prefix for the names passed to the helpers below (_weight, _bias, _matrix_multiply, _add)
+    input: TRTTensor,
+    weight: Union[TRTTensor, torch.Tensor, np.ndarray],
+    bias: Optional[Union[TRTTensor, torch.Tensor, np.ndarray]],
+) -> TRTTensor:
+    # Process weight terms: reject unsupported types; torch.Tensor / np.ndarray go through get_trt_tensor
+    if not isinstance(weight, (TRTTensor, torch.Tensor, np.ndarray)):
+        raise RuntimeError(
+            f"Linear layer {name} has weight of type {type(weight)}, Expect Union[TRTTensor, torch.Tensor, np.ndarray],"
+        )
+    elif isinstance(weight, (torch.Tensor, np.ndarray)):
+        weight = get_trt_tensor(ctx, weight, f"{name}_weight")
+
+    # Process bias terms: same checks as the weight, except that None is allowed and skips both branches
+    if bias is not None and not isinstance(bias, (TRTTensor, torch.Tensor, np.ndarray)):
+        raise RuntimeError(
+            f"Linear layer {name} has bias of type {type(bias)}, Expect Union[TRTTensor, torch.Tensor, np.ndarray],"
+        )
+    elif isinstance(bias, (torch.Tensor, np.ndarray)):
+        bias = get_trt_tensor(ctx, bias, f"{name}_bias")
+
+    # add IMatrixMultiplyLayer (via impl.matmul.matrix_multiply): input is used as-is, weight is transposed
+    out = impl.matmul.matrix_multiply(
+        ctx,
+        target,
+        source_ir,
+        f"{name}_matrix_multiply",
+        input,
+        weight,
+        input_matrix_op=trt.MatrixOperation.NONE,
+        other_matrix_op=trt.MatrixOperation.TRANSPOSE,  # applies to the second operand, i.e. weight
+    )
+
+    if bias is not None:
+        # add bias to the matmul output with an elementwise add
+        out = impl.elementwise.add(ctx, target, source_ir, f"{name}_add", out, bias)
+
+    return out
diff --git a/py/torch_tensorrt/dynamo/lowering/_decomposition_groups.py b/py/torch_tensorrt/dynamo/lowering/_decomposition_groups.py
@@ -171,6 +171,7 @@
     aten.upsample_bilinear2d.vec,
     aten.upsample_trilinear3d.vec,
     aten.upsample_bicubic2d.vec,
+    aten.linear.default,
 }
 
 

diff --git a/py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py b/py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py
@@ -36,6 +36,8 @@ def constant_fold(
     # The constants are created on CPU to save GPU memory for TensorRT compilation.
     # For TRT INetwork construction the constants are moved to CPU in get_attr call.
     for node, constant in cf.node_replacements.items():
+        if node.target == torch.ops.aten.embedding.default:
+            continue
         replace_node_with_constant(
             gm, node, torch.nn.Parameter(constant, requires_grad=False)
         )
@@ -103,7 +105,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
         self.quantization_ops: Set[torch._ops.OpOverload] = set()
         try:
             # modelopt import ensures torch.ops.tensorrt.quantize_op.default is registered
-            import modelopt.torch.quantization as mtq
+            import modelopt.torch.quantization as mtq  # noqa: F401
 
             assert torch.ops.tensorrt.quantize_op.default
             self.quantization_ops.add(torch.ops.tensorrt.quantize_op.default)

diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -2,7 +2,6 @@
 
 import logging
 from contextlib import nullcontext
-from tempfile import tempdir
 from typing import Any, Dict, List, Optional, Sequence, Tuple
 
 import tensorrt as trt
@@ -539,7 +538,7 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
 
                                 with tempfile.TemporaryDirectory() as tmpdir:
                                     self.cudagraph.debug_dump(
-                                        f"{tempdir}/{self.name}_cudagraph.dot"
+                                        f"{tmpdir}/{self.name}_cudagraph.dot"
                                     )
 
                         self.cudagraph.replay()  # type: ignore

diff --git a/tools/perf/README.md b/tools/perf/README.md
@@ -9,8 +9,6 @@ This is a comprehensive Python benchmark suite to run perf runs using different
 5. TensorRT
 
 
-Note: Please note that for ONNX models, user can convert the ONNX model to TensorRT serialized engine and then use this package.
-
 ## Prerequisite
 
 Benchmark scripts depends on following Python packages in addition to requirements.txt packages
@@ -47,13 +45,15 @@ Here are the list of `CompileSpec` options that can be provided directly to comp
 * `--backends` : Comma separated string of backends. Eg: torch, torch_compile, dynamo, tensorrt
 * `--model` : Name of the model file (Can be a torchscript module or a tensorrt engine (ending in `.plan` extension)). If the backend is `dynamo` or `torch_compile`, the input should be a Pytorch module (instead of a torchscript module).
 * `--model_torch` : Name of the PyTorch model file (optional, only necessary if `dynamo` or `torch_compile` is a chosen backend)
+* `--onnx` : ONNX model file. If provided, it is converted directly to a TensorRT engine, bypassing the step of exporting ONNX from `model_torch`.
 * `--inputs` : List of input shapes & dtypes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT
 * `--batch_size` : Batch size
 * `--precision` : Comma separated list of precisions to build TensorRT engine Eg: fp32,fp16
 * `--device` : Device ID
 * `--truncate` : Truncate long and double weights in the network in Torch-TensorRT
 * `--is_trt_engine` : Boolean flag to be enabled if the model file provided is a TensorRT engine.
 * `--report` : Path of the output file where performance summary is written.
+* `--optimization_level` : Builder optimization level for TensorRT (from 1 to 5, 5 is the highest optimization).
 
 Eg:
-Original file line number
+Diff line change
@@ Expand Up / @@ -12,6 +12,7 @@ @@
         embedding,
         full,
         grid,
+        linear,
         matmul,
         nccl_ops,
         normalization,
@@ Expand Down @@