diff --git a/docs/module_usage/instructions/benchmark.md b/docs/module_usage/instructions/benchmark.md index b48e17e73a..3365ecd54f 100644 --- a/docs/module_usage/instructions/benchmark.md +++ b/docs/module_usage/instructions/benchmark.md @@ -6,7 +6,7 @@ PaddleX 支持统计模型推理耗时,需通过环境变量进行设置,具 * `PADDLE_PDX_INFER_BENCHMARK_WARMUP`:设置 warm up,在开始测试前,使用随机数据循环迭代 n 次,默认为 `0`; * `PADDLE_PDX_INFER_BENCHMARK_DATA_SIZE`: 设置随机数据的尺寸,默认为 `224`; * `PADDLE_PDX_INFER_BENCHMARK_ITER`:使用随机数据进行 Benchmark 测试的循环次数,仅当输入数据为 `None` 时,将使用随机数据进行测试; -* `PADDLE_PDX_INFER_BENCHMARK_OUTPUT`:用于设置保存本次 benchmark 指标到 `txt` 文件,如 `./benchmark.txt`,默认为 `None`,表示不保存 Benchmark 指标; +* `PADDLE_PDX_INFER_BENCHMARK_OUTPUT`:用于设置保存的目录,如 `./benchmark`,默认为 `None`,表示不保存 Benchmark 指标; 使用示例如下: @@ -15,51 +15,60 @@ PADDLE_PDX_INFER_BENCHMARK=True \ PADDLE_PDX_INFER_BENCHMARK_WARMUP=5 \ PADDLE_PDX_INFER_BENCHMARK_DATA_SIZE=320 \ PADDLE_PDX_INFER_BENCHMARK_ITER=10 \ -PADDLE_PDX_INFER_BENCHMARK_OUTPUT=./benchmark.txt \ +PADDLE_PDX_INFER_BENCHMARK_OUTPUT=./benchmark \ python main.py \ -c ./paddlex/configs/object_detection/PicoDet-XS.yaml \ -o Global.mode=predict \ -o Predict.model_dir=None \ + -o Predict.batch_size=2 \ -o Predict.input=None ``` 在开启 Benchmark 后,将自动打印 benchmark 指标: ``` -+-------------------+-----------------+------+---------------+ -| Stage | Total Time (ms) | Nums | Avg Time (ms) | -+-------------------+-----------------+------+---------------+ -| ReadCmp | 49.95107651 | 10 | 4.99510765 | -| Resize | 8.48054886 | 10 | 0.84805489 | -| Normalize | 23.08964729 | 10 | 2.30896473 | -| ToCHWImage | 0.02717972 | 10 | 0.00271797 | -| ImageDetPredictor | 75.94108582 | 10 | 7.59410858 | -| DetPostProcess | 0.26535988 | 10 | 0.02653599 | -+-------------------+-----------------+------+---------------+ -+-------------+-----------------+------+---------------+ -| Stage | Total Time (ms) | Nums | Avg Time (ms) | -+-------------+-----------------+------+---------------+ -| PreProcess | 81.54845238 | 10 | 8.15484524 | -| 
Inference | 75.94108582 | 10 | 7.59410858 | -| PostProcess | 0.26535988 | 10 | 0.02653599 | -| End2End | 161.07797623 | 10 | 16.10779762 | -| WarmUp | 5496.41847610 | 5 | 1099.28369522 | -+-------------+-----------------+------+---------------+ ++----------------+-----------------+-----------------+------------------------+ +| Component | Total Time (ms) | Number of Calls | Avg Time Per Call (ms) | ++----------------+-----------------+-----------------+------------------------+ +| ReadCmp | 102.39458084 | 10 | 10.23945808 | +| Resize | 11.20400429 | 20 | 0.56020021 | +| Normalize | 34.11078453 | 20 | 1.70553923 | +| ToCHWImage | 0.05555153 | 20 | 0.00277758 | +| Copy2GPU | 9.10568237 | 10 | 0.91056824 | +| Infer | 98.22225571 | 10 | 9.82222557 | +| Copy2CPU | 14.30845261 | 10 | 1.43084526 | +| DetPostProcess | 0.45251846 | 20 | 0.02262592 | ++----------------+-----------------+-----------------+------------------------+ ++-------------+-----------------+---------------------+----------------------------+ +| Stage | Total Time (ms) | Number of Instances | Avg Time Per Instance (ms) | ++-------------+-----------------+---------------------+----------------------------+ +| PreProcess | 147.76492119 | 20 | 7.38824606 | +| Inference | 121.63639069 | 20 | 6.08181953 | +| PostProcess | 0.45251846 | 20 | 0.02262592 | +| End2End | 294.03519630 | 20 | 14.70175982 | +| WarmUp | 7937.82591820 | 5 | 1587.56518364 | ++-------------+-----------------+---------------------+----------------------------+ ``` -在 Benchmark 结果中,会统计该模型全部组件(`Component`)的总耗时(`Total Time`,单位为“毫秒”)、调用次数(`Nums`)、调用平均执行耗时(`Avg Time`,单位为“毫秒”),以及按预热(`WarmUp`)、预处理(`PreProcess`)、模型推理(`Inference`)、后处理(`PostProcess`)和端到端(`End2End`)进行划分的耗时统计,包括每个阶段的总耗时(`Total Time`,单位为“毫秒”)、样本数(`Nums`)和单样本平均执行耗时(`Avg Time`,单位为“毫秒”),同时,保存相关指标会到本地 `./benchmark.csv` 文件中: +在 Benchmark 结果中,会统计该模型全部组件(`Component`)的总耗时(`Total Time`,单位为“毫秒”)、**调用次数**(`Number of Calls`)、**调用**平均执行耗时(`Avg Time Per 
Call`,单位为“毫秒”),以及按预热(`WarmUp`)、预处理(`PreProcess`)、模型推理(`Inference`)、后处理(`PostProcess`)和端到端(`End2End`)进行划分的耗时统计,包括每个阶段的总耗时(`Total Time`,单位为“毫秒”)、**样本数**(`Number of Instances`)和**单样本**平均执行耗时(`Avg Time Per Instance`,单位为“毫秒”),同时,上述指标会保存到本地: `./benchmark/detail.csv` 和 `./benchmark/summary.csv`:
 
 ```csv
-Stage,Total Time (ms),Nums,Avg Time (ms)
-ReadCmp,0.04995107650756836,10,0.004995107650756836
-Resize,0.008480548858642578,10,0.0008480548858642578
-Normalize,0.02308964729309082,10,0.002308964729309082
-ToCHWImage,2.7179718017578125e-05,10,2.7179718017578126e-06
-ImageDetPredictor,0.07594108581542969,10,0.007594108581542969
-DetPostProcess,0.00026535987854003906,10,2.6535987854003906e-05
-PreProcess,0.08154845237731934,10,0.008154845237731934
-Inference,0.07594108581542969,10,0.007594108581542969
-PostProcess,0.00026535987854003906,10,2.6535987854003906e-05
-End2End,0.16107797622680664,10,0.016107797622680664
-WarmUp,5.496418476104736,5,1.0992836952209473
+Component,Total Time (ms),Number of Calls,Avg Time Per Call (ms)
+ReadCmp,0.10199093818664551,10,0.01019909381866455
+Resize,0.011309385299682617,20,0.0005654692649841309
+Normalize,0.035140275955200195,20,0.0017570137977600097
+ToCHWImage,4.744529724121094e-05,20,2.3722648620605467e-06
+Copy2GPU,0.00861215591430664,10,0.000861215591430664
+Infer,0.820899248123169,10,0.08208992481231689
+Copy2CPU,0.006002187728881836,10,0.0006002187728881836
+DetPostProcess,0.0004436969757080078,20,2.218484878540039e-05
+```
+
+```csv
+Stage,Total Time (ms),Number of Instances,Avg Time Per Instance (ms)
+PreProcess,0.14848804473876953,20,0.007424402236938477
+Inference,0.8355135917663574,20,0.04177567958831787
+PostProcess,0.0004436969757080078,20,2.218484878540039e-05
+End2End,1.0054960250854492,20,0.05027480125427246
+WarmUp,8.869974851608276,5,1.7739949703216553
 ```
diff --git a/paddlex/inference/components/base.py b/paddlex/inference/components/base.py index 8f9bdcfe9c..9554cf43e9 100644 --- a/paddlex/inference/components/base.py +++ 
b/paddlex/inference/components/base.py @@ -107,6 +107,9 @@ def _check_args_key(args): f"The parameter ({param.name}) is needed by {self.__class__.__name__}, but {list(args.keys())} only found!" ) + if self.inputs is None: + return [({}, None)] + if self.need_batch_input: args = {} for input_ in input_list: @@ -266,6 +269,10 @@ def keep_input(self): def name(self): return getattr(self, "NAME", self.__class__.__name__) + @property + def sub_cmps(self): + return None + @abstractmethod def apply(self, input): raise NotImplementedError diff --git a/paddlex/inference/components/paddle_predictor/predictor.py b/paddlex/inference/components/paddle_predictor/predictor.py index 04f6752c9b..d14b29dfc3 100644 --- a/paddlex/inference/components/paddle_predictor/predictor.py +++ b/paddlex/inference/components/paddle_predictor/predictor.py @@ -23,6 +23,42 @@ from ..base import BaseComponent +class Copy2GPU(BaseComponent): + + def __init__(self, input_handlers): + super().__init__() + self.input_handlers = input_handlers + + def apply(self, x): + for idx in range(len(x)): + self.input_handlers[idx].reshape(x[idx].shape) + self.input_handlers[idx].copy_from_cpu(x[idx]) + + +class Copy2CPU(BaseComponent): + + def __init__(self, output_handlers): + super().__init__() + self.output_handlers = output_handlers + + def apply(self): + output = [] + for out_tensor in self.output_handlers: + batch = out_tensor.copy_to_cpu() + output.append(batch) + return output + + +class Infer(BaseComponent): + + def __init__(self, predictor): + super().__init__() + self.predictor = predictor + + def apply(self): + self.predictor.run() + + class BasePaddlePredictor(BaseComponent): """Predictor based on Paddle Inference""" @@ -56,12 +92,13 @@ def _reset(self): self.option = PaddlePredictorOption() logging.debug(f"Env: {self.option}") ( - self.predictor, - self.inference_config, - self.input_names, - self.input_handlers, - self.output_handlers, + predictor, + input_handlers, + output_handlers, ) = 
self._create() + self.copy2gpu = Copy2GPU(input_handlers) + self.copy2cpu = Copy2CPU(output_handlers) + self.infer = Infer(predictor) self.option.changed = False def _create(self): @@ -161,43 +198,46 @@ def _create(self): for output_name in output_names: output_handler = predictor.get_output_handle(output_name) output_handlers.append(output_handler) - return predictor, config, input_names, input_handlers, output_handlers - - def get_input_names(self): - """get input names""" - return self.input_names + return predictor, input_handlers, output_handlers def apply(self, **kwargs): if self.option.changed: self._reset() - x = self.to_batch(**kwargs) - for idx in range(len(x)): - self.input_handlers[idx].reshape(x[idx].shape) - self.input_handlers[idx].copy_from_cpu(x[idx]) - - self.predictor.run() - output = [] - for out_tensor in self.output_handlers: - batch = out_tensor.copy_to_cpu() - output.append(batch) - return self.format_output(output) + batches = self.to_batch(**kwargs) + self.copy2gpu.apply(batches) + self.infer.apply() + pred = self.copy2cpu.apply() + return self.format_output(pred) - def format_output(self, pred): - return [{"pred": res} for res in zip(*pred)] + @property + def sub_cmps(self): + return { + "Copy2GPU": self.copy2gpu, + "Infer": self.infer, + "Copy2CPU": self.copy2cpu, + } @abstractmethod def to_batch(self): raise NotImplementedError + @abstractmethod + def format_output(self, pred): + return [{"pred": res} for res in zip(*pred)] + class ImagePredictor(BasePaddlePredictor): - INPUT_KEYS = "img" + OUTPUT_KEYS = "pred" DEAULT_INPUTS = {"img": "img"} + DEAULT_OUTPUTS = {"pred": "pred"} def to_batch(self, img): return [np.stack(img, axis=0).astype(dtype=np.float32, copy=False)] + def format_output(self, pred): + return [{"pred": res} for res in zip(*pred)] + class ImageDetPredictor(BasePaddlePredictor): @@ -268,9 +308,14 @@ def format_output(self, pred): class TSPPPredictor(BasePaddlePredictor): INPUT_KEYS = "ts" + OUTPUT_KEYS = "pred" 
DEAULT_INPUTS = {"ts": "ts"} + DEAULT_OUTPUTS = {"pred": "pred"} def to_batch(self, ts): n = len(ts[0]) x = [np.stack([lst[i] for lst in ts], axis=0) for i in range(n)] return x + + def format_output(self, pred): + return [{"pred": res} for res in zip(*pred)] diff --git a/paddlex/inference/utils/benchmark.py b/paddlex/inference/utils/benchmark.py index 832901f310..a45b362871 100644 --- a/paddlex/inference/utils/benchmark.py +++ b/paddlex/inference/utils/benchmark.py @@ -16,6 +16,7 @@ import functools from types import GeneratorType import time +from pathlib import Path import numpy as np from prettytable import PrettyTable @@ -42,11 +43,18 @@ def warmup_stop(self, warmup_num): self._reset() def _reset(self): - for name in self._components: - cmp = self._components[name] + for name, cmp in self.iterate_cmp(self._components): cmp.timer.reset() self._e2e_tic = time.time() + def iterate_cmp(self, cmps): + if cmps is None: + return + for name, cmp in cmps.items(): + if cmp.sub_cmps is not None: + yield from self.iterate_cmp(cmp.sub_cmps) + yield name, cmp + def gather(self, e2e_num): # lazy import for avoiding circular import from ..components.paddle_predictor import BasePaddlePredictor @@ -54,17 +62,23 @@ def gather(self, e2e_num): detail = [] summary = {"preprocess": 0, "inference": 0, "postprocess": 0} op_tag = "preprocess" - for name in self._components: - cmp = self._components[name] - times = cmp.timer.logs - counts = len(times) - avg = np.mean(times) - total = np.sum(times) - detail.append((name, total, counts, avg)) + for name, cmp in self._components.items(): if isinstance(cmp, BasePaddlePredictor): - summary["inference"] += total + # TODO(gaotingquan): show by hierarchy. Now dont show xxxPredictor benchmark info to ensure mutual exclusivity between components. 
+ for name, sub_cmp in cmp.sub_cmps.items(): + times = sub_cmp.timer.logs + counts = len(times) + avg = np.mean(times) + total = np.sum(times) + detail.append((name, total, counts, avg)) + summary["inference"] += total op_tag = "postprocess" else: + times = cmp.timer.logs + counts = len(times) + avg = np.mean(times) + total = np.sum(times) + detail.append((name, total, counts, avg)) summary[op_tag] += total summary = [ @@ -103,8 +117,13 @@ def collect(self, e2e_num): self._e2e_elapse = time.time() - self._e2e_tic detail, summary = self.gather(e2e_num) - table_head = ["Stage", "Total Time (ms)", "Nums", "Avg Time (ms)"] - table = PrettyTable(table_head) + detail_head = [ + "Component", + "Total Time (ms)", + "Number of Calls", + "Avg Time Per Call (ms)", + ] + table = PrettyTable(detail_head) table.add_rows( [ (name, f"{total * 1000:.8f}", cnts, f"{avg * 1000:.8f}") @@ -113,7 +132,13 @@ def collect(self, e2e_num): ) logging.info(table) - table = PrettyTable(table_head) + summary_head = [ + "Stage", + "Total Time (ms)", + "Number of Instances", + "Avg Time Per Instance (ms)", + ] + table = PrettyTable(summary_head) table.add_rows( [ (name, f"{total * 1000:.8f}", cnts, f"{avg * 1000:.8f}") @@ -123,10 +148,17 @@ def collect(self, e2e_num): logging.info(table) if INFER_BENCHMARK_OUTPUT: - csv_data = [table_head] - csv_data.extend(detail) - csv_data.extend(summary) - with open("benchmark.csv", "w", newline="") as file: + save_dir = Path(INFER_BENCHMARK_OUTPUT) + save_dir.mkdir(parents=True, exist_ok=True) + csv_data = [detail_head, *detail] + # csv_data.extend(detail) + with open(Path(save_dir) / "detail.csv", "w", newline="") as file: + writer = csv.writer(file) + writer.writerows(csv_data) + + csv_data = [summary_head, *summary] + # csv_data.extend(summary) + with open(Path(save_dir) / "summary.csv", "w", newline="") as file: writer = csv.writer(file) writer.writerows(csv_data)