add infer (fp32/fp16/int8) benchmark

juncaipeng · juncaipeng · commit 92bc9e0a4382 · 2021-09-08T12:04:16.000Z
diff --git a/deploy/python/infer.py b/deploy/python/infer.py
@@ -71,13 +71,13 @@ def __init__(self, args):
         if not args.print_detail:
             pred_cfg.disable_glog_info()
         pred_cfg.enable_memory_optim()
+        pred_cfg.switch_ir_optim(True)
 
         if args.device == 'gpu':
             # set GPU configs accordingly
             # such as initialize the gpu memory, enable tensorrt
             logger.info("Use GPU")
             pred_cfg.enable_use_gpu(100, 0)
-            pred_cfg.switch_ir_optim(True)
             precision_map = {
                 "fp16": PrecisionType.Half,
                 "fp32": PrecisionType.Float32,
@@ -96,7 +96,7 @@ def __init__(self, args):
                     use_calib_mode=False)
                 min_input_shape = {"x": [1, 3, 100, 100]}
                 max_input_shape = {"x": [1, 3, 2000, 3000]}
-                opt_input_shape = {"x": [1, 3, 192, 192]}
+                opt_input_shape = {"x": [1, 3, 512, 1024]}
                 pred_cfg.set_trt_dynamic_shape_info(
                     min_input_shape, max_input_shape, opt_input_shape)
         else:
@@ -105,6 +105,7 @@ def __init__(self, args):
             logger.info("Use CPU")
             pred_cfg.disable_gpu()
             if args.enable_mkldnn:
+                logger.info("Use MKLDNN")
                 # cache 10 different shapes for mkldnn to avoid memory leak
                 pred_cfg.set_mkldnn_cache_capacity(10)
                 pred_cfg.enable_mkldnn()
diff --git a/docs/deployment/inference/infer_benchmark.md b/docs/deployment/inference/infer_benchmark.md
@@ -0,0 +1,52 @@
+# 推理Benchmark
+
+测试环境：
+* GPU: V100 32G
+* CPU: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+* CUDA: 10.1
+* cuDNN: 7.6
+* TensorRT: 6.0.1.5
+* Paddle: 2.1.1
+
+
+GPU上分割模型的测试方法:
+1. 使用Cityscapes的全量验证数据集(1024x2048)进行测试
+2. 单GPU，Batchsize为1
+3. 运行耗时为纯模型预测时间
+4. 使用Paddle Inference的[Python API](./python_inference.md)测试，通过use_trt参数设置是否使用TRT，使用precision参数设置预测类型
+
+
+|       模型                |  使用TRT   |   预测类型  |  mIoU  |   耗时(s/img)   |
+|        -                 |   :-:      |   :-:     |   :-:   |   :-:           |
+| ANN_ResNet50_OS8         |   N        |    FP32    |  0.7909  |  0.274  |  
+| ANN_ResNet50_OS8         |   Y        |    FP32    |  0.7909  |  0.281  |
+| ANN_ResNet50_OS8         |   Y        |    FP16    |  0.7909  |  0.168  |
+| ANN_ResNet50_OS8         |   Y        |    INT8    |  0.7906  |  0.195  |
+| DANet_ResNet50_OS8         |   N        |    FP32    |  0.8027  |  0.371  |  
+| DANet_ResNet50_OS8         |   Y        |    FP32    |  0.8027  |  0.330  |
+| DANet_ResNet50_OS8         |   Y        |    FP16    |  0.8027  |  0.183  |
+| DANet_ResNet50_OS8         |   Y        |    INT8    |  0.8039  |  0.266  |
+| DeepLabV3P_ResNet50_OS8         |   N        |    FP32    |  0.8036  | 0.165  |  
+| DeepLabV3P_ResNet50_OS8         |   Y        |    FP32    |  0.8036  | 0.206  |
+| DeepLabV3P_ResNet50_OS8         |   Y        |    FP16    |  0.8036  | 0.196  |
+| DeepLabV3P_ResNet50_OS8         |   Y        |    INT8    |  0.8044  | 0.083  |
+| DNLNet_ResNet50_OS8         |   N        |    FP32    |  0.7995  |  0.381  |  
+| DNLNet_ResNet50_OS8         |   Y        |    FP32    |  0.7995  |  0.360  |
+| DNLNet_ResNet50_OS8         |   Y        |    FP16    |  0.7995  |  0.230  |
+| DNLNet_ResNet50_OS8         |   Y        |    INT8    |  0.7989  |  0.236  |
+| EMANet_ResNet50_OS8         |   N        |    FP32    |  0.7905  |  0.208  |  
+| EMANet_ResNet50_OS8         |   Y        |    FP32    |  0.7905  |  0.186  |
+| EMANet_ResNet50_OS8         |   Y        |    FP16    |  0.7904  |  0.062  |
+| EMANet_ResNet50_OS8         |   Y        |    INT8    |  0.7939  |  0.106  |
+| GCNet_ResNet50_OS8         |   N        |    FP32    |  0.7950  |  0.247  |  
+| GCNet_ResNet50_OS8         |   Y        |    FP32    |  0.7950  |  0.228  |
+| GCNet_ResNet50_OS8         |   Y        |    FP16    |  0.7950  |  0.100  |
+| GCNet_ResNet50_OS8         |   Y        |    INT8    |  0.7959  |  0.144  |
+| PSPNet_ResNet50_OS8         |   N        |    FP32    |  0.7883 | 0.327  |
+| PSPNet_ResNet50_OS8         |   Y        |    FP32    |  0.7883 | 0.324  |
+| PSPNet_ResNet50_OS8         |   Y        |    FP16    |  0.7883 | 0.218  |
+| PSPNet_ResNet50_OS8         |   Y        |    INT8    |  0.7915 | 0.223  |
+| UNet         |   N        |    FP32    |  0.6500  |  0.071  |  
+| UNet         |   Y        |    FP32    |  0.6500  |  0.099  |
+| UNet         |   Y        |    FP16    |  0.6500  |  0.099  |
+| UNet         |   Y        |    INT8    |  0.6503  |  0.099  |
diff --git a/docs/deployment/inference/python_inference.md b/docs/deployment/inference/python_inference.md
@@ -54,8 +54,18 @@ python deploy/python/infer.py \
 
 **注意**
 
-1. 使用TensorRT需要使用支持TRT功能的Paddle库，请参考[附录](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-release)下载带有trt的PaddlePaddle安装包，或者参考[源码编译](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/compile/fromsource.html)自行编译。
+1. 如果使用TensorRT预测，需要安装支持TRT功能的Paddle库。Paddle支持`cuda10.1+cudnn7+trt6.0.1.5`和`cuda10.2+cudnn8.1+trt7.1.3.4`两种版本，大家可以根据实际情况选择，通过如下链接进行下载。
+```
+https://paddle-inference-dist.bj.bcebos.com/tensorrt_test/cuda10.1-cudnn7.6-trt6.0.tar
+https://paddle-inference-dist.bj.bcebos.com/tensorrt_test/cuda10.2-cudnn8.0-trt7.1.tgz
+```
+
+* 配置安装cuda和cudnn。
+* 下载TRT，设置`export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<tensorrt_path>`。
+* 参考[附录](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-release)下载带有trt的PaddlePaddle安装包或者参考[源码编译](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/compile/fromsource.html)自行编译。
+* 安装PaddlePaddle。
+* 部署模型。
 
-2. 当使用量化模型在GPU上预测时，需要设置device=gpu、use_trt=True、precision=int8
+2. 当使用量化模型在GPU上预测时，需要设置device=gpu、use_trt=True、precision=int8。
 
 3. 要开启`--benchmark`的话需要安装auto_log，请参考[安装方式](https://github.com/LDOUBLEV/AutoLog)。
diff --git a/docs/slim/quant/quant.md b/docs/slim/quant/quant.md
@@ -1,16 +1,54 @@
 # 模型量化教程
 
-模型量化是使用整数替代浮点数进行存储和计算的方法。举例而言，模型量化可以将32bit浮点数转换成8bit整数，则模型存储空间可以减少4倍，同时整数运算替换浮点数运算，可以加快模型推理速度、降低计算内存。
+## 1 概述
+
+模型量化是一种常见的模型压缩方法，是使用整数替代浮点数进行存储和计算。
+
+比如，模型量化将32bit浮点数转换成8bit整数，则模型存储空间可以减少4倍，同时整数运算替换浮点数运算，可以加快模型推理速度、降低计算内存。
 
 PaddleSeg基于PaddleSlim，集成了量化训练（QAT）方法，特点如下：
-* 概述：使用大量训练数据，在训练过程中更新权重，减小量化损失。
-* 注意事项：训练数据需要有Ground Truth。
+* 概述：使用训练数据，在训练过程中更新权重，减小量化损失。
 * 优点：量化模型的精度高；使用该量化模型预测，可以减少计算量、降低计算内存、减小模型大小。
 * 缺点：易用性稍差，需要一定时间产出量化模型
 
-下面，本文以一个示例来介绍如何产出和部署量化模型。
+## 2 量化模型精度和性能
+
+测试环境：
+* GPU: V100 32G
+* CPU: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+* CUDA: 10.1
+* cuDNN: 7.6
+* TensorRT: 6.0.1.5
+* Paddle: 2.1.1
 
-## 1 环境准备
+测试方法:
+1. 在GPU上使用TensorRT测试原始模型和量化模型
+2. 使用Cityscapes的全量验证数据集(1024x2048)进行测试
+3. 单GPU，Batchsize为1
+4. 运行耗时为纯模型预测时间
+5. 使用Paddle Inference的[Python API](../../deployment/inference/python_inference.md)测试，通过use_trt参数设置是否使用TRT，使用precision参数设置预测类型。
+
+| 模型 | 类型 | mIoU |  耗时(s/img) | 量化加速比 |
+| - | :-: | :-: | :-: | :-: |
+| ANN_ResNet50_OS8 | FP32 | 0.7909  |  0.281  | - |
+| ANN_ResNet50_OS8 | INT8 | 0.7906  |  0.195  | 30.6% |
+| DANet_ResNet50_OS8 | FP32 | 0.8027  |  0.330  | - |
+| DANet_ResNet50_OS8 | INT8 | 0.8039  |  0.266  | 19.4% |
+| DeepLabV3P_ResNet50_OS8 | FP32 | 0.8036  | 0.206  |  - |  
+| DeepLabV3P_ResNet50_OS8 | INT8 | 0.8044  | 0.083  | 59.7% |
+| DNLNet_ResNet50_OS8 | FP32 | 0.7995  |  0.360  |  - |
+| DNLNet_ResNet50_OS8 | INT8 | 0.7989  |  0.236  | 34.4% |
+| EMANet_ResNet50_OS8 | FP32 |  0.7905  |  0.186  |  - |
+| EMANet_ResNet50_OS8 | INT8 | 0.7939  |  0.106  | 43.0% |
+| GCNet_ResNet50_OS8 | FP32 | 0.7950  |  0.228  |  - |
+| GCNet_ResNet50_OS8 | INT8 | 0.7959  |  0.144  | 36.8% |
+| PSPNet_ResNet50_OS8 | FP32 | 0.7883 | 0.324  |  - |
+| PSPNet_ResNet50_OS8 | INT8 | 0.7915 | 0.223  | 31.2% |
+
+## 3 示例
+
+我们以一个示例来介绍如何产出和部署量化模型。
+### 3.1 环境准备
 
 请参考[安装文档](../../install.md)准备好PaddleSeg的基础环境。由于量化功能要求最新的PaddlePaddle版本，所以请参考[文档](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)安装develop(Nightly build)版本。
 
@@ -26,15 +64,15 @@ git reset --hard 15ef0c7dcee5a622787b7445f21ad9d1dea0a933
 python setup.py install
 ```
 
-## 2 产出量化模型
+### 3.2 产出量化模型
 
-### 2.1 训练FP32模型
+#### 3.2.1 训练FP32模型
 
 在产出量化模型之前，我们需要提前准备训练或者finetune好的FP32模型。
 
-此处，我们选用视盘分割数据集和BiseNetV2模型，从头开始训练模型。
+此处，我们选用视盘分割数据集和BiseNetV2模型，使用train.py从头开始训练模型。train.py输入参数的介绍，请参考[文档](../../train/train.md)。
 
-在PaddleSeg目录下，执行如下脚本，会自动下载数据集进行训练。训练结束后，精度最高的权重会保存到`output_fp32/best_model`目录下。
+在PaddleSeg目录下，执行如下脚本，会自动下载数据集进行训练。
 
 ```shell
 # 设置1张可用的GPU卡
@@ -50,11 +88,33 @@ python train.py \
        --save_dir output_fp32
 ```
 
-### 2.2 使用量化训练方法产出量化模型
+训练结束后，精度最高的权重会保存到`output_fp32/best_model`目录下。
 
-**训练量化模型**
+#### 3.2.2 使用量化训练方法产出量化模型
 
-基于2.1步骤中训练好的FP32模型权重，执行如下命令，使用`slim/quant/qat_train.py`脚本进行量化训练。
+**1）产出量化模型**
+
+基于训练好的FP32模型权重，使用`slim/quant/qat_train.py`进行量化训练。
+
+qat_train.py和train.py的输入参数基本相似（如下）。注意，量化训练的学习率需要调小，使用`model_path`参数指定FP32模型的权重。
+
+| 参数名              | 用途                                                         | 是否必选项 | 默认值           |
+| ------------------- | ------------------------------------------------------------ | ---------- | ---------------- |
+| config              | FP32模型的配置文件                                            | 是         | -                |
+| model_path          | FP32模型的预训练权重                                        | 是  | - |
+| iters               | 训练迭代次数                                                 | 否         | 配置文件中指定值 |
+| batch_size          | 单卡batch size                                              | 否         | 配置文件中指定值 |
+| learning_rate       | 初始学习率                                                   | 否         | 配置文件中指定值 |  
+| save_dir            | 模型和visualdl日志文件的保存根路径                           | 否         | output           |
+| num_workers         | 用于异步读取数据的进程数量， 大于等于1时开启子进程读取数据   | 否         | 0                |
+| use_vdl             | 是否开启visualdl记录训练数据                                 | 否         | 否               |
+| save_interval_iters | 模型保存的间隔步数                                           | 否         | 1000             |
+| do_eval             | 是否在保存模型时启动评估, 启动时将会根据mIoU保存最佳模型至best_model | 否         | 否               |
+| log_iters           | 打印日志的间隔步数                                           | 否         | 10               |
+| resume_model        | 恢复训练模型路径，如：`output/iter_1000`                     | 否         | None             |
+
+
+执行如下命令，进行量化训练。量化训练结束后，精度最高的量化模型权重保存在`output_quant/best_model`目录下。
 
 ```shell
 python slim/quant/qat_train.py \
@@ -67,23 +127,29 @@ python slim/quant/qat_train.py \
        --save_dir output_quant
 ```
 
-上述脚本的输入参数和常规训练相似，复用2.1步骤的config文件，使用`model_path`参数指定FP32模型的权重，初始学习率相应调小。
-
-训练结束后，精度最高的量化模型权重会保存到`output_quant/best_model`目录下。
-
-**测试量化模型**
+**2）测试量化模型**
 
-执行如下命令，使用`slim/quant/qat_val.py`脚本加载量化模型的权重，测试模型量化的精度。
+如果需要，可以执行如下命令，使用`slim/quant/qat_val.py`脚本加载量化模型的权重，测试量化模型的精度。
 
 ```
 python slim/quant/qat_val.py \
        --config configs/quick_start/bisenet_optic_disc_512x512_1k.yml \
        --model_path output_quant/best_model/model.pdparams
 ```
 
-**导出量化预测模型**
+**3）导出量化预测模型**
 
-基于此前训练好的量化模型权重，执行如下命令，使用`slim/quant/qat_export.py`导出预测量化模型，保存在`output_quant_infer`目录下。
+基于训练好的量化模型权重，使用`slim/quant/qat_export.py`导出预测量化模型，脚本输入参数如下。
+
+|参数名|用途|是否必选项|默认值|
+|-|-|-|-|
+|config|模型配置文件|是|-|
+|save_dir|预测量化模型保存的文件夹|否|output|
+|model_path|量化模型的权重|否|配置文件中指定值|
+|with_softmax|在网络末端添加softmax算子。由于PaddleSeg组网默认返回logits，如果想要部署模型获取概率值，可以置为True|否|False|
+|without_argmax|是否不在网络末端添加argmax算子。由于PaddleSeg组网默认返回logits，为部署模型可以直接获取预测结果，我们默认在网络末端添加argmax算子|否|False|
+
+执行如下命令，导出预测量化模型保存在`output_quant_infer`目录。
 
 ```
 python slim/quant/qat_export.py \
@@ -92,35 +158,15 @@ python slim/quant/qat_export.py \
        --save_dir output_quant_infer
 ```
 
-## 3 部署
+### 3.3 部署量化模型
+
+得到量化预测模型后，我们可以进行部署应用，请参考如下教程。
 
-得到量化预测模型后，我们可以进行部署应用。
 * [Paddle Inference Python部署](../../deployment/inference/python_inference.md)
 * [Paddle Inference C++部署](../../deployment/inference/cpp_inference.md)
-* [PaddleLite部署](../../deployment/lite/lite.md)
-
-## 4 量化加速比
-
-测试环境：
-* GPU: V100
-* CPU: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
-* CUDA: 10.2
-* cuDNN: 7.6
-* TensorRT: 6.0.1.5
-
-测试方法:
-1. 运行耗时为纯模型预测时间，测试图片cityspcaes(1024x2048)
-2. 预测10次作为热启动，连续预测50次取平均得到预测时间
-3. 使用GPU + TensorRT测试
-
-|模型|未量化运行耗时(ms)|量化运行耗时(ms)|加速比|
-|-|-|-|-|
-|deeplabv3_resnet50_os8|204.2|150.1|26.49%|
-|deeplabv3p_resnet50_os8|147.2|89.5|39.20%|
-|gcnet_resnet50_os8|201.8|126.1|37.51%|
-|pspnet_resnet50_os8|266.8|206.8|22.49%|  
+* [PaddleLite部署](../../deployment/lite/lite.md)  
 
-## 5 参考资料
+### 3.4 参考资料
 
 * [PaddleSlim Github](https://github.com/PaddlePaddle/PaddleSlim)
 * [PaddleSlim 文档](https://paddleslim.readthedocs.io/zh_CN/latest/)
diff --git a/slim/quant/qat_val.py b/slim/quant/qat_val.py
@@ -25,9 +25,11 @@
 from paddleseg.core import evaluate
 from paddleseg.utils import get_sys_env, logger, config_check, utils
 from qat_config import quant_config
+from qat_train import skip_quant
 
 from paddleslim import QAT
 
+
 def get_test_config(cfg, args):
 
     test_config = cfg.test_config
@@ -163,6 +165,7 @@ def main(args):
 
     model = cfg.model
 
+    skip_quant(model)
     quantizer = QAT(config=quant_config)
     quant_model = quantizer.quantize(model)
     logger.info('Quantize the model successfully')
@@ -174,7 +177,8 @@ def main(args):
     test_config = get_test_config(cfg, args)
     config_check(cfg, val_dataset=val_dataset)
 
-    evaluate(quant_model, val_dataset, num_workers=args.num_workers, **test_config)
+    evaluate(
+        quant_model, val_dataset, num_workers=args.num_workers, **test_config)
 
 
 if __name__ == '__main__':