diff --git a/docs/module_usage/instructions/benchmark.md b/docs/module_usage/instructions/benchmark.md index b48e17e73a..3365ecd54f 100644 --- a/docs/module_usage/instructions/benchmark.md +++ b/docs/module_usage/instructions/benchmark.md @@ -6,7 +6,7 @@ PaddleX 支持统计模型推理耗时,需通过环境变量进行设置,具 * `PADDLE_PDX_INFER_BENCHMARK_WARMUP`:设置 warm up,在开始测试前,使用随机数据循环迭代 n 次,默认为 `0`; * `PADDLE_PDX_INFER_BENCHMARK_DATA_SIZE`: 设置随机数据的尺寸,默认为 `224`; * `PADDLE_PDX_INFER_BENCHMARK_ITER`:使用随机数据进行 Benchmark 测试的循环次数,仅当输入数据为 `None` 时,将使用随机数据进行测试; -* `PADDLE_PDX_INFER_BENCHMARK_OUTPUT`:用于设置保存本次 benchmark 指标到 `txt` 文件,如 `./benchmark.txt`,默认为 `None`,表示不保存 Benchmark 指标; +* `PADDLE_PDX_INFER_BENCHMARK_OUTPUT`:用于设置保存的目录,如 `./benchmark`,默认为 `None`,表示不保存 Benchmark 指标; 使用示例如下: @@ -15,51 +15,60 @@ PADDLE_PDX_INFER_BENCHMARK=True \ PADDLE_PDX_INFER_BENCHMARK_WARMUP=5 \ PADDLE_PDX_INFER_BENCHMARK_DATA_SIZE=320 \ PADDLE_PDX_INFER_BENCHMARK_ITER=10 \ -PADDLE_PDX_INFER_BENCHMARK_OUTPUT=./benchmark.txt \ +PADDLE_PDX_INFER_BENCHMARK_OUTPUT=./benchmark \ python main.py \ -c ./paddlex/configs/object_detection/PicoDet-XS.yaml \ -o Global.mode=predict \ -o Predict.model_dir=None \ + -o Predict.batch_size=2 \ -o Predict.input=None ``` 在开启 Benchmark 后,将自动打印 benchmark 指标: ``` -+-------------------+-----------------+------+---------------+ -| Stage | Total Time (ms) | Nums | Avg Time (ms) | -+-------------------+-----------------+------+---------------+ -| ReadCmp | 49.95107651 | 10 | 4.99510765 | -| Resize | 8.48054886 | 10 | 0.84805489 | -| Normalize | 23.08964729 | 10 | 2.30896473 | -| ToCHWImage | 0.02717972 | 10 | 0.00271797 | -| ImageDetPredictor | 75.94108582 | 10 | 7.59410858 | -| DetPostProcess | 0.26535988 | 10 | 0.02653599 | -+-------------------+-----------------+------+---------------+ -+-------------+-----------------+------+---------------+ -| Stage | Total Time (ms) | Nums | Avg Time (ms) | -+-------------+-----------------+------+---------------+ -| PreProcess | 81.54845238 | 10 | 8.15484524 | -| 
Inference | 75.94108582 | 10 | 7.59410858 | -| PostProcess | 0.26535988 | 10 | 0.02653599 | -| End2End | 161.07797623 | 10 | 16.10779762 | -| WarmUp | 5496.41847610 | 5 | 1099.28369522 | -+-------------+-----------------+------+---------------+ ++----------------+-----------------+-----------------+------------------------+ +| Component | Total Time (ms) | Number of Calls | Avg Time Per Call (ms) | ++----------------+-----------------+-----------------+------------------------+ +| ReadCmp | 102.39458084 | 10 | 10.23945808 | +| Resize | 11.20400429 | 20 | 0.56020021 | +| Normalize | 34.11078453 | 20 | 1.70553923 | +| ToCHWImage | 0.05555153 | 20 | 0.00277758 | +| Copy2GPU | 9.10568237 | 10 | 0.91056824 | +| Infer | 98.22225571 | 10 | 9.82222557 | +| Copy2CPU | 14.30845261 | 10 | 1.43084526 | +| DetPostProcess | 0.45251846 | 20 | 0.02262592 | ++----------------+-----------------+-----------------+------------------------+ ++-------------+-----------------+---------------------+----------------------------+ +| Stage | Total Time (ms) | Number of Instances | Avg Time Per Instance (ms) | ++-------------+-----------------+---------------------+----------------------------+ +| PreProcess | 147.76492119 | 20 | 7.38824606 | +| Inference | 121.63639069 | 20 | 6.08181953 | +| PostProcess | 0.45251846 | 20 | 0.02262592 | +| End2End | 294.03519630 | 20 | 14.70175982 | +| WarmUp | 7937.82591820 | 5 | 1587.56518364 | ++-------------+-----------------+---------------------+----------------------------+ ``` -在 Benchmark 结果中,会统计该模型全部组件(`Component`)的总耗时(`Total Time`,单位为“毫秒”)、调用次数(`Nums`)、调用平均执行耗时(`Avg Time`,单位为“毫秒”),以及按预热(`WarmUp`)、预处理(`PreProcess`)、模型推理(`Inference`)、后处理(`PostProcess`)和端到端(`End2End`)进行划分的耗时统计,包括每个阶段的总耗时(`Total Time`,单位为“毫秒”)、样本数(`Nums`)和单样本平均执行耗时(`Avg Time`,单位为“毫秒”),同时,保存相关指标会到本地 `./benchmark.csv` 文件中: +在 Benchmark 结果中,会统计该模型全部组件(`Component`)的总耗时(`Total Time`,单位为“毫秒”)、**调用次数**(`Number of Calls`)、**调用**平均执行耗时(`Avg Time Per 
Call`,单位为“毫秒”),以及按预热(`WarmUp`)、预处理(`PreProcess`)、模型推理(`Inference`)、后处理(`PostProcess`)和端到端(`End2End`)进行划分的耗时统计,包括每个阶段的总耗时(`Total Time`,单位为“毫秒”)、**样本数**(`Number of Instances`)和**单样本**平均执行耗时(`Avg Time Per Instance`,单位为“毫秒”),同时,上述指标会保存到本地: `./benchmark/detail.csv` 和 `./benchmark/summary.csv`:
 
 ```csv
-Stage,Total Time (ms),Nums,Avg Time (ms)
-ReadCmp,0.04995107650756836,10,0.004995107650756836
-Resize,0.008480548858642578,10,0.0008480548858642578
-Normalize,0.02308964729309082,10,0.002308964729309082
-ToCHWImage,2.7179718017578125e-05,10,2.7179718017578126e-06
-ImageDetPredictor,0.07594108581542969,10,0.007594108581542969
-DetPostProcess,0.00026535987854003906,10,2.6535987854003906e-05
-PreProcess,0.08154845237731934,10,0.008154845237731934
-Inference,0.07594108581542969,10,0.007594108581542969
-PostProcess,0.00026535987854003906,10,2.6535987854003906e-05
-End2End,0.16107797622680664,10,0.016107797622680664
-WarmUp,5.496418476104736,5,1.0992836952209473
+Component,Total Time (ms),Number of Calls,Avg Time Per Call (ms)
+ReadCmp,0.10199093818664551,10,0.01019909381866455
+Resize,0.011309385299682617,20,0.0005654692649841309
+Normalize,0.035140275955200195,20,0.0017570137977600097
+ToCHWImage,4.744529724121094e-05,20,2.3722648620605467e-06
+Copy2GPU,0.00861215591430664,10,0.000861215591430664
+Infer,0.820899248123169,10,0.08208992481231689
+Copy2CPU,0.006002187728881836,10,0.0006002187728881836
+DetPostProcess,0.0004436969757080078,20,2.218484878540039e-05
+```
+
+```csv
+Stage,Total Time (ms),Number of Instances,Avg Time Per Instance (ms)
+PreProcess,0.14848804473876953,20,0.007424402236938477
+Inference,0.8355135917663574,20,0.04177567958831787
+PostProcess,0.0004436969757080078,20,2.218484878540039e-05
+End2End,1.0054960250854492,20,0.05027480125427246
+WarmUp,8.869974851608276,5,1.7739949703216553
 ```
diff --git a/paddlex/inference/components/base.py b/paddlex/inference/components/base.py index 8f9bdcfe9c..9554cf43e9 100644 --- a/paddlex/inference/components/base.py +++ 
b/paddlex/inference/components/base.py @@ -107,6 +107,9 @@ def _check_args_key(args): f"The parameter ({param.name}) is needed by {self.__class__.__name__}, but {list(args.keys())} only found!" ) + if self.inputs is None: + return [({}, None)] + if self.need_batch_input: args = {} for input_ in input_list: @@ -266,6 +269,10 @@ def keep_input(self): def name(self): return getattr(self, "NAME", self.__class__.__name__) + @property + def sub_cmps(self): + return None + @abstractmethod def apply(self, input): raise NotImplementedError diff --git a/paddlex/inference/components/paddle_predictor/predictor.py b/paddlex/inference/components/paddle_predictor/predictor.py index 04f6752c9b..d14b29dfc3 100644 --- a/paddlex/inference/components/paddle_predictor/predictor.py +++ b/paddlex/inference/components/paddle_predictor/predictor.py @@ -23,6 +23,42 @@ from ..base import BaseComponent +class Copy2GPU(BaseComponent): + + def __init__(self, input_handlers): + super().__init__() + self.input_handlers = input_handlers + + def apply(self, x): + for idx in range(len(x)): + self.input_handlers[idx].reshape(x[idx].shape) + self.input_handlers[idx].copy_from_cpu(x[idx]) + + +class Copy2CPU(BaseComponent): + + def __init__(self, output_handlers): + super().__init__() + self.output_handlers = output_handlers + + def apply(self): + output = [] + for out_tensor in self.output_handlers: + batch = out_tensor.copy_to_cpu() + output.append(batch) + return output + + +class Infer(BaseComponent): + + def __init__(self, predictor): + super().__init__() + self.predictor = predictor + + def apply(self): + self.predictor.run() + + class BasePaddlePredictor(BaseComponent): """Predictor based on Paddle Inference""" @@ -56,12 +92,13 @@ def _reset(self): self.option = PaddlePredictorOption() logging.debug(f"Env: {self.option}") ( - self.predictor, - self.inference_config, - self.input_names, - self.input_handlers, - self.output_handlers, + predictor, + input_handlers, + output_handlers, ) = 
self._create() + self.copy2gpu = Copy2GPU(input_handlers) + self.copy2cpu = Copy2CPU(output_handlers) + self.infer = Infer(predictor) self.option.changed = False def _create(self): @@ -161,43 +198,46 @@ def _create(self): for output_name in output_names: output_handler = predictor.get_output_handle(output_name) output_handlers.append(output_handler) - return predictor, config, input_names, input_handlers, output_handlers - - def get_input_names(self): - """get input names""" - return self.input_names + return predictor, input_handlers, output_handlers def apply(self, **kwargs): if self.option.changed: self._reset() - x = self.to_batch(**kwargs) - for idx in range(len(x)): - self.input_handlers[idx].reshape(x[idx].shape) - self.input_handlers[idx].copy_from_cpu(x[idx]) - - self.predictor.run() - output = [] - for out_tensor in self.output_handlers: - batch = out_tensor.copy_to_cpu() - output.append(batch) - return self.format_output(output) + batches = self.to_batch(**kwargs) + self.copy2gpu.apply(batches) + self.infer.apply() + pred = self.copy2cpu.apply() + return self.format_output(pred) - def format_output(self, pred): - return [{"pred": res} for res in zip(*pred)] + @property + def sub_cmps(self): + return { + "Copy2GPU": self.copy2gpu, + "Infer": self.infer, + "Copy2CPU": self.copy2cpu, + } @abstractmethod def to_batch(self): raise NotImplementedError + @abstractmethod + def format_output(self, pred): + return [{"pred": res} for res in zip(*pred)] + class ImagePredictor(BasePaddlePredictor): - INPUT_KEYS = "img" + OUTPUT_KEYS = "pred" DEAULT_INPUTS = {"img": "img"} + DEAULT_OUTPUTS = {"pred": "pred"} def to_batch(self, img): return [np.stack(img, axis=0).astype(dtype=np.float32, copy=False)] + def format_output(self, pred): + return [{"pred": res} for res in zip(*pred)] + class ImageDetPredictor(BasePaddlePredictor): @@ -268,9 +308,14 @@ def format_output(self, pred): class TSPPPredictor(BasePaddlePredictor): INPUT_KEYS = "ts" + OUTPUT_KEYS = "pred" 
DEAULT_INPUTS = {"ts": "ts"} + DEAULT_OUTPUTS = {"pred": "pred"} def to_batch(self, ts): n = len(ts[0]) x = [np.stack([lst[i] for lst in ts], axis=0) for i in range(n)] return x + + def format_output(self, pred): + return [{"pred": res} for res in zip(*pred)] diff --git a/paddlex/inference/utils/benchmark.py b/paddlex/inference/utils/benchmark.py index 832901f310..a45b362871 100644 --- a/paddlex/inference/utils/benchmark.py +++ b/paddlex/inference/utils/benchmark.py @@ -16,6 +16,7 @@ import functools from types import GeneratorType import time +from pathlib import Path import numpy as np from prettytable import PrettyTable @@ -42,11 +43,18 @@ def warmup_stop(self, warmup_num): self._reset() def _reset(self): - for name in self._components: - cmp = self._components[name] + for name, cmp in self.iterate_cmp(self._components): cmp.timer.reset() self._e2e_tic = time.time() + def iterate_cmp(self, cmps): + if cmps is None: + return + for name, cmp in cmps.items(): + if cmp.sub_cmps is not None: + yield from self.iterate_cmp(cmp.sub_cmps) + yield name, cmp + def gather(self, e2e_num): # lazy import for avoiding circular import from ..components.paddle_predictor import BasePaddlePredictor @@ -54,17 +62,23 @@ def gather(self, e2e_num): detail = [] summary = {"preprocess": 0, "inference": 0, "postprocess": 0} op_tag = "preprocess" - for name in self._components: - cmp = self._components[name] - times = cmp.timer.logs - counts = len(times) - avg = np.mean(times) - total = np.sum(times) - detail.append((name, total, counts, avg)) + for name, cmp in self._components.items(): if isinstance(cmp, BasePaddlePredictor): - summary["inference"] += total + # TODO(gaotingquan): show by hierarchy. Now dont show xxxPredictor benchmark info to ensure mutual exclusivity between components. 
+ for name, sub_cmp in cmp.sub_cmps.items(): + times = sub_cmp.timer.logs + counts = len(times) + avg = np.mean(times) + total = np.sum(times) + detail.append((name, total, counts, avg)) + summary["inference"] += total op_tag = "postprocess" else: + times = cmp.timer.logs + counts = len(times) + avg = np.mean(times) + total = np.sum(times) + detail.append((name, total, counts, avg)) summary[op_tag] += total summary = [ @@ -103,8 +117,13 @@ def collect(self, e2e_num): self._e2e_elapse = time.time() - self._e2e_tic detail, summary = self.gather(e2e_num) - table_head = ["Stage", "Total Time (ms)", "Nums", "Avg Time (ms)"] - table = PrettyTable(table_head) + detail_head = [ + "Component", + "Total Time (ms)", + "Number of Calls", + "Avg Time Per Call (ms)", + ] + table = PrettyTable(detail_head) table.add_rows( [ (name, f"{total * 1000:.8f}", cnts, f"{avg * 1000:.8f}") @@ -113,7 +132,13 @@ def collect(self, e2e_num): ) logging.info(table) - table = PrettyTable(table_head) + summary_head = [ + "Stage", + "Total Time (ms)", + "Number of Instances", + "Avg Time Per Instance (ms)", + ] + table = PrettyTable(summary_head) table.add_rows( [ (name, f"{total * 1000:.8f}", cnts, f"{avg * 1000:.8f}") @@ -123,10 +148,17 @@ def collect(self, e2e_num): logging.info(table) if INFER_BENCHMARK_OUTPUT: - csv_data = [table_head] - csv_data.extend(detail) - csv_data.extend(summary) - with open("benchmark.csv", "w", newline="") as file: + save_dir = Path(INFER_BENCHMARK_OUTPUT) + save_dir.mkdir(parents=True, exist_ok=True) + csv_data = [detail_head, *detail] + # csv_data.extend(detail) + with open(Path(save_dir) / "detail.csv", "w", newline="") as file: + writer = csv.writer(file) + writer.writerows(csv_data) + + csv_data = [summary_head, *summary] + # csv_data.extend(summary) + with open(Path(save_dir) / "summary.csv", "w", newline="") as file: writer = csv.writer(file) writer.writerows(csv_data)