From 37df2f3108d4f400c79aa06a5308f70f8bbe05cb Mon Sep 17 00:00:00 2001 From: Qixiang Xu Date: Fri, 7 Nov 2025 14:51:55 +0800 Subject: [PATCH] Add How to Benchmark a Single KleidiAI Micro-kernel in ExecuTorch --- .../01-env-setup.md | 54 +++++ .../02-cross-compile.md | 78 ++++++ .../03-executorch-node-kai-kernel.md | 59 +++++ .../04-create-fc-model.md | 229 ++++++++++++++++++ .../05-create-conv2d-model.md | 176 ++++++++++++++ .../06-create-matrix-mul-model.md | 91 +++++++ .../07-run-model.md | 47 ++++ .../08-analyze-etdump.md | 55 +++++ .../_index.md | 50 ++++ .../_next-steps.md | 8 + .../export-conv2d.py | 107 ++++++++ .../export-linear-model.py | 145 +++++++++++ .../export-matrix-mul.py | 44 ++++ 13 files changed, 1143 insertions(+) create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/01-env-setup.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/02-cross-compile.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/03-executorch-node-kai-kernel.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/04-create-fc-model.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/05-create-conv2d-model.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/06-create-matrix-mul-model.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/07-run-model.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/08-analyze-etdump.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_index.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_next-steps.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-conv2d.py create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-linear-model.py create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-matrix-mul.py diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/01-env-setup.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/01-env-setup.md new file mode 100644 index 0000000000..1e26ff1d6a --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/01-env-setup.md @@ -0,0 +1,54 @@ +--- +title: Environment setup +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + + +### Python Environment Setup + +Before building ExecuTorch, it is highly recommended to create an isolated Python environment. +This prevents dependency conflicts with your system Python installation and ensures a clean build environment. + +```bash +cd $WORKSPACE +python3 -m venv pyenv +source pyenv/bin/activate + +``` +All subsequent steps should be executed within this Python virtual environment. 
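+
+You can quickly confirm that the virtual environment is active; the interpreter should resolve to the `pyenv` directory created above (a quick check, assuming a POSIX shell):
+
+```bash
+which python3
+# Expected output (exact path may vary): $WORKSPACE/pyenv/bin/python3
+```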
+
+### Download the ExecuTorch Source Code
+
+Clone the ExecuTorch repository from GitHub. The following command checks out the stable v1.0.0 release and ensures all required submodules are fetched.
+
+```bash
+cd $WORKSPACE
+git clone -b v1.0.0 --recurse-submodules https://github.com/pytorch/executorch.git
+```
+
+ > **Note:**
+ > The instructions in this guide are based on **ExecuTorch v1.0.0**.
+ > Commands or configuration options may differ in later releases.
+
+### Build and Install the ExecuTorch Python Components
+
+Next, build the Python bindings and install them into your environment. The following command uses the provided installation script to configure, compile, and install ExecuTorch with developer tools enabled.
+
+```bash
+cd $WORKSPACE/executorch
+CMAKE_ARGS="-DEXECUTORCH_BUILD_DEVTOOLS=ON" ./install_executorch.sh
+```
+
+This will build ExecuTorch and its dependencies using CMake, enabling optional developer utilities such as ETDump and Inspector.
+
+After installation completes successfully, you can verify the environment by running:
+
+```bash
+python -c "import executorch; print('ExecuTorch built and installed successfully.')"
+```
+
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/02-cross-compile.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/02-cross-compile.md
new file mode 100644
index 0000000000..0b0386694c
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/02-cross-compile.md
@@ -0,0 +1,78 @@
+---
+title: Cross-compile ExecuTorch for the AArch64 platform
+weight: 3
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+
+This section describes how to cross-compile ExecuTorch for an AArch64 target platform with XNNPACK and KleidiAI support enabled.
+All commands below are intended to be executed on an x86-64 Linux host with an appropriate cross-compilation toolchain installed (e.g., aarch64-linux-gnu-gcc).
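+
+If the toolchain is not already present, on Ubuntu hosts it can typically be installed from the distribution packages. The package names below are illustrative and can vary by release:
+
+```bash
+sudo apt-get install -y gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
+aarch64-linux-gnu-gcc --version
+```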
+
+
+### Run CMake Configuration
+
+Use CMake to configure the ExecuTorch build for AArch64. The example below enables key extensions, developer tools, and XNNPACK with KleidiAI acceleration:
+
+```bash
+cd $WORKSPACE
+mkdir -p build-arm64
+cd build-arm64
+
+cmake -GNinja \
+    -DCMAKE_BUILD_TYPE=Debug \
+    -DCMAKE_SYSTEM_NAME=Linux \
+    -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
+    -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
+    -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
+    -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+    -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH \
+    -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY \
+    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_XNNPACK=ON \
+    -DEXECUTORCH_BUILD_DEVTOOLS=ON \
+    -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+    -DEXECUTORCH_ENABLE_LOGGING=ON \
+    -DEXECUTORCH_LOG_LEVEL=debug \
+    -DEXECUTORCH_XNNPACK_ENABLE_KLEIDI=ON \
+    ../executorch
+```
+
+#### Key Build Options
+
+| **CMake Option** | **Description** |
+| --- | --- |
+| `EXECUTORCH_BUILD_XNNPACK` | Builds the **XNNPACK backend**, which provides highly optimized CPU operators (GEMM, convolution, etc.) for AArch64 platforms. |
+| `EXECUTORCH_XNNPACK_ENABLE_KLEIDI` | Enables **Arm KleidiAI** acceleration for XNNPACK kernels, providing further performance improvements on Armv8.2+ CPUs. |
+| `EXECUTORCH_BUILD_DEVTOOLS` | Builds **developer tools** such as the ExecuTorch Inspector and diagnostic utilities for profiling and debugging. |
+| `EXECUTORCH_BUILD_EXTENSION_MODULE` | Builds the **Module API** extension, which provides a high-level abstraction for model loading and execution using `Module` objects. |
+| `EXECUTORCH_BUILD_EXTENSION_TENSOR` | Builds the **Tensor API** extension, providing convenience functions for creating, manipulating, and managing tensors in the C++ runtime. |
+| `EXECUTORCH_BUILD_KERNELS_OPTIMIZED` | Enables building **optimized kernel implementations** for better performance on supported architectures. |
+| `EXECUTORCH_ENABLE_EVENT_TRACER` | Enables the **event tracing** feature, which records performance and operator timing information for runtime analysis. |
+
+
+
+### Build ExecuTorch
+
+```bash
+cmake --build . -j$(nproc)
+```
+
+If the build completes successfully, you should find the executor_runner binary at:
+
+```bash
+build-arm64/executor_runner
+```
+
+This binary can be used to run ExecuTorch models on the AArch64 target device using the XNNPACK backend with KleidiAI acceleration.
+
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/03-executorch-node-kai-kernel.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/03-executorch-node-kai-kernel.md
new file mode 100644
index 0000000000..7bb5ffffd4
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/03-executorch-node-kai-kernel.md
@@ -0,0 +1,59 @@
+---
+title: KleidiAI micro-kernels support in ExecuTorch
+weight: 4
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+ExecuTorch uses XNNPACK as its primary CPU backend for operator execution and performance optimization.
+
+Within this architecture, only a subset of KleidiAI SME (Scalable Matrix Extension) micro-kernels has been integrated into XNNPACK to provide additional acceleration on supported Arm platforms.
+
+These specialized micro-kernels are designed to accelerate operators with specific data types and quantization configurations in ExecuTorch models.
+
+When an operator matches one of the supported configurations, ExecuTorch automatically dispatches it through the KleidiAI-optimized path.
+
+Operators that are not covered by KleidiAI fall back to the standard XNNPACK implementations during inference, ensuring functional correctness across all models.
+
+In ExecuTorch v1.0.0, the following operator types are implemented through the XNNPACK backend and can potentially benefit from KleidiAI acceleration:
+- XNNFullyConnected – Fully connected (dense) layers
+- XNNConv2d – Standard 2D convolution layers
+- XNNBatchMatrixMultiply – Batched matrix multiplication operations
+
+However, not all instances of these operators are accelerated by KleidiAI.
+
+Acceleration eligibility depends on several operator attributes and backend support, including:
+- Data types (e.g., float32, int8, int4)
+- Quantization schemes (e.g., symmetric/asymmetric, per-tensor/per-channel)
+- Tensor memory layout and alignment
+- Kernel dimensions and stride settings
+
+The following section provides detailed information on which operator configurations can benefit from KleidiAI acceleration, along with their corresponding data type and quantization support.
+
+
+### XNNFullyConnected
+
+| XNNPACK GEMM Variant | Activations DataType | Weights DataType | Output DataType |
+| --- | --- | --- | --- |
+| pf16_gemm | FP16 | FP16 | FP16 |
+| pf32_gemm | FP32 | FP32 | FP32 |
+| qp8_f32_qc8w_gemm | Asymmetric INT8 per-row quantization | Per-channel symmetric INT8 quantization | FP32 |
+| pqs8_qc8w_gemm | Asymmetric INT8 quantization | Per-channel symmetric INT8 quantization | Asymmetric INT8 quantization |
+| qp8_f32_qb4w_gemm | Asymmetric INT8 per-row quantization | INT4 (signed), shared blockwise quantization | FP32 |
+
+
+### XNNConv2d
+
+| XNNPACK GEMM Variant | Input DataType | Filter DataType | Output DataType |
+| --- | --- | --- | --- |
+| pf32_gemm | FP32 | FP32, pointwise (1×1) | FP32 |
+| pqs8_qc8w_gemm | Asymmetric INT8 quantization (NHWC) | Per-channel or per-tensor symmetric INT8 quantization | Asymmetric INT8 quantization (NHWC) |
+
+
+### XNNBatchMatrixMultiply
+
+| XNNPACK GEMM Variant | Input A DataType | Input B DataType | Output DataType |
+| --- | --- | --- | --- |
+| pf32_gemm | FP32 | FP32 | FP32 |
+| pf16_gemm | FP16 | FP16 | FP16 |
+
+
+
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/04-create-fc-model.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/04-create-fc-model.md
new file mode 100644
index 0000000000..7be11240db
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/04-create-fc-model.md
@@ -0,0 +1,229 @@
+---
+title: Create and quantize linear layer benchmark model
+weight: 5
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+In the previous section, we discussed that the Fully Connected operator supports multiple GEMM (General Matrix Multiplication) variants.
+
+To evaluate the performance of these variants across different hardware platforms, we will construct a series of benchmark models that use the Fully Connected operator with different GEMM implementations for comparative analysis.
+
+
+### Fully connected benchmark model
+
+The following example defines a simple model that generates nodes that can be accelerated by KleidiAI.
+
+By adjusting some of the model's input parameters, we can also simulate the behavior of nodes that appear in real-world models.
+
+
+```python
+import torch
+import torch.nn as nn
+
+class DemoLinearModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(256, 256)
+
+    def forward(self, x):
+        y = self.linear(x)
+        return y
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 256, dtype=dtype),)
+```
+
+### Export FP16/FP32 model for pf16_gemm/pf32_gemm variants
+
+| XNNPACK GEMM Variant | Activations DataType | Weights DataType | Output DataType |
+| --- | --- | --- | --- |
+| pf16_gemm | FP16 | FP16 | FP16 |
+| pf32_gemm | FP32 | FP32 | FP32 |
+
+The following code demonstrates how to lower and export a model that leverages the pf16_gemm variant to accelerate computation:
+
+```python
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType
+from executorch.exir import to_edge_transform_and_lower
+
+def export_executorch_model(dtype: torch.dtype, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoLinearModel().eval().to(dtype)
+    example_inputs = model.get_example_inputs(dtype)
+
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_executorch_model(torch.float16, "linear_model_pf16_gemm")
+```
+
+To generate a model that uses the pf32_gemm variant, simply change the dtype in the previous code to torch.float32, as shown below:
+
+```python
+export_executorch_model(torch.float32, "linear_model_pf32_gemm")
+```
+
+### Export INT8 quantized model for pqs8_qc8w_gemm and qp8_f32_qc8w_gemm variants
+
+| XNNPACK GEMM Variant | Activations DataType | Weights DataType | Output DataType |
+| --- | --- | --- | --- |
+| qp8_f32_qc8w_gemm | Asymmetric INT8 per-row quantization | Per-channel symmetric INT8 quantization | FP32 |
+| pqs8_qc8w_gemm | Asymmetric INT8 quantization | Per-channel symmetric INT8 quantization | Asymmetric INT8 quantization |
+
+The following code demonstrates how to quantize a model so that it leverages the pqs8_qc8w_gemm/qp8_f32_qc8w_gemm variants to accelerate computation:
+
+```python
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+    XNNPACKQuantizer,
+)
+
+def export_int8_quantize_model(dynamic: bool, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoLinearModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    # Quantize model
+    model = torch.export.export(model, example_inputs).module()
+    quantizer = XNNPACKQuantizer()
+    operator_config = get_symmetric_quantization_config(
+        is_per_channel=True,
+        is_dynamic=dynamic
+    )
+
+    quantizer.set_global(operator_config)
+    quantize_model = prepare_pt2e(model, quantizer)
+    quantize_model(*example_inputs)
+    quantize_model = convert_pt2e(quantize_model)
+
+    # Lower and export model
+    exported_program = torch.export.export(quantize_model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_int8_quantize_model(False, "linear_model_pqs8_qc8w_gemm")
+export_int8_quantize_model(True, "linear_model_qp8_f32_qc8w_gemm")
+```
+
+### Export INT4 quantized model for qp8_f32_qb4w_gemm variant
+
+| XNNPACK GEMM Variant | Activations DataType | Weights DataType | Output DataType |
+| --- | --- | --- | --- |
+| qp8_f32_qb4w_gemm | Asymmetric INT8 per-row quantization | INT4 (signed), shared blockwise quantization | FP32 |
+
+The following code demonstrates how to quantize a model so that it leverages the qp8_f32_qb4w_gemm variant to accelerate computation:
+
+```python
+from torchao.quantization.granularity import PerGroup, PerAxis
+from torchao.quantization.quant_api import (
+    IntxWeightOnlyConfig,
+    Int8DynamicActivationIntxWeightConfig,
+    quantize_,
+)
+
+def export_int4_quantize_model(dynamic: bool, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoLinearModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    # Quantize model: INT8 dynamic activations with blockwise INT4 weights
+    linear_config = Int8DynamicActivationIntxWeightConfig(
+        weight_dtype=torch.int4,
+        weight_granularity=PerGroup(32),
+    )
+
+    quantize_(model, linear_config)
+
+    # Lower and export model
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_int4_quantize_model(False, "linear_model_qp8_f32_qb4w_gemm")
+```
+
+**NOTE:**
+
+When exporting models, the **generate_etrecord** option is enabled to produce the .etrecord file alongside the .pte model file.
+These ETRecord files are essential for subsequent model inspection and performance analysis using the ExecuTorch Inspector API.
+
+
+After running this script, both the PTE model file and the etrecord file are generated.
+ +``` bash +$ ls model/ -1 +linear_model_pf16_gemm.etrecord +linear_model_pf16_gemm.pte +linear_model_pf32_gemm.etrecord +linear_model_pf32_gemm.pte +linear_model_pqs8_qc8w_gemm.etrecord +linear_model_pqs8_qc8w_gemm.pte +linear_model_qp8_f32_qb4w_gemm.etrecord +linear_model_qp8_f32_qb4w_gemm.pte +linear_model_qp8_f32_qc8w_gemm.etrecord +linear_model_qp8_f32_qc8w_gemm.pte +``` + +The complete source code is available [here](../export-linear-model.py). diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/05-create-conv2d-model.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/05-create-conv2d-model.md new file mode 100644 index 0000000000..685a7ce397 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/05-create-conv2d-model.md @@ -0,0 +1,176 @@ +--- +title: Create and quantize convolution layer benchmark model +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In the previous section, we discussed that both INT8-quantized Conv2d and pointwise (1×1) Conv2d operators can be accelerated using KleidiAI’s matrix-multiplication micro-kernels. + + +| XNNPACK GEMM Variant | Input DataType| Filter DataType | Output DataType | +| ------------------ | ---------------------------- | --------------------------------------- | ---------------------------- | +| pqs8_qc8w_gemm | Asymmetric INT8 quantization(NHWC) | Per-channel or per-tensor symmetric INT8 quantization | Asymmetric INT8 quantization(NHWC) | +| pf32_gemm | FP32 | FP32, pointwise (1×1) | FP32 | + +To evaluate the performance of Conv2d operators across multiple hardware platforms, we create a set of benchmark models that utilize different GEMM implementation variants within the convolution operators for systematic comparative analysis. + + +### INT8-quantized Conv2d benchmark model + +The following example defines a simple model to generate INT8-quantized Conv2d nodes that can be accelerated by KleidiAI. + +By adjusting some of the model’s input parameters, we can also simulate the behavior of nodes that appear in real-world models. 
+
+```python
+import torch
+import torch.nn as nn
+
+class DemoQInt8Conv2dModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 6, 3)
+
+    def forward(self, x):
+        x = self.conv(x)
+        return x
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 3, 16, 16, dtype=dtype),)
+```
+
+The following code can be used to quantize and export the model:
+
+```python
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType
+from executorch.exir import to_edge_transform_and_lower
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+    XNNPACKQuantizer,
+)
+
+def export_int8_quantize_conv2d_model(model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoQInt8Conv2dModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    # Quantize model
+    model = torch.export.export(model, example_inputs).module()
+    quantizer = XNNPACKQuantizer()
+    operator_config = get_symmetric_quantization_config(
+        is_per_channel=False,
+        is_dynamic=False
+    )
+
+    quantizer.set_global(operator_config)
+    quantize_model = prepare_pt2e(model, quantizer)
+    quantize_model(*example_inputs)
+    quantize_model = convert_pt2e(quantize_model)
+
+    # Export model
+    exported_program = torch.export.export(quantize_model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_int8_quantize_conv2d_model("qint8_conv2d_pqs8_qc8w_gemm")
+```
+
+### Pointwise Conv2d benchmark model
+
+The following example defines a simple model to generate pointwise (1×1) Conv2d nodes that can be accelerated by KleidiAI.
+
+As before, input parameters can be adjusted to simulate real-world model behavior.
+ + +``` python +import torch +import torch.nn as nn +class DemoConv2dModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.pointwiseconv = torch.nn.Conv2d(3, 2, 1,groups=1) + + def forward(self,x): + x = self.pointwiseconv(x) + return x + + def get_example_inputs(self,dtype=torch.float32): + return (torch.randn(1, 3, 16, 16, dtype=dtype),) + +``` + +The following code can be used to lower and export the model: + +```python +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType +from executorch.exir import to_edge_transform_and_lower + +def export_pointwise_model(model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name + ".pte" + etr_file = mode_file_name + ".etrecord" + + model = DemoConv2dModel().eval().to(torch.float32) + example_inputs = model.get_example_inputs(torch.float32) + + exported_program = torch.export.export(model, example_inputs) + + partitioner = XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + etrecord.save(etr_file) + +export_pointwise_model("pointwise_conv2d_pf32_gemm") + +``` + +**NOTES:** + +When exporting models, the generate_etrecord option is enabled to produce the .etrecord file alongside the .pte model file. +These ETRecord files are essential for subsequent model analysis and performance evaluation. + +After running this script, both the PTE model file and the etrecord file are generated. + +``` bash +$ ls model/ -1 +qint8_conv2d_pqs8_qc8w_gemm.etrecord +qint8_conv2d_pqs8_qc8w_gemm.pte +pointwise_conv2d_pf32_gemm.etrecord +pointwise_conv2d_pf32_gemm.pte +``` + +The complete source code is available [here](../export-conv2d.py). diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/06-create-matrix-mul-model.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/06-create-matrix-mul-model.md new file mode 100644 index 0000000000..901e2a8883 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/06-create-matrix-mul-model.md @@ -0,0 +1,91 @@ +--- +title: Create matrix multiply layer benchmark model +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In the previous section, we discussed that the Batch Matrix Multiply operator supports multiple GEMM (General Matrix Multiplication) variants. + +To evaluate the performance of these variants across different hardware platforms, we construct a set of benchmark models that utilize the batch matrix multiply operator with different GEMM implementations for comparative analysis. + + +### Matrix multiply benchmark model + +The following example defines a simple model to generate nodes that can be accelerated by KleidiAI. + +By adjusting the input parameters, this model can also simulate the behavior of nodes commonly found in real-world models. 
+
+```python
+import torch
+import torch.nn as nn
+
+class DemoBatchMatMulModel(nn.Module):
+    def forward(self, x, y):
+        return torch.bmm(x, y)
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 256, 256, dtype=dtype), torch.randn(1, 256, 256, dtype=dtype))
+```
+
+### Export FP16/FP32 model for pf16_gemm/pf32_gemm variants
+
+| XNNPACK GEMM Variant | Input A DataType | Input B DataType | Output DataType |
+| --- | --- | --- | --- |
+| pf32_gemm | FP32 | FP32 | FP32 |
+| pf16_gemm | FP16 | FP16 | FP16 |
+
+The following code snippet demonstrates how to lower and export a model that leverages the pf16_gemm and pf32_gemm variants to accelerate computation:
+
+```python
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType
+from executorch.exir import to_edge_transform_and_lower
+
+def export_matrix_mul_model(dtype: torch.dtype, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoBatchMatMulModel().eval().to(dtype)
+    example_inputs = model.get_example_inputs(dtype)
+
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_matrix_mul_model(torch.float16, "matrix_mul_pf16_gemm")
+export_matrix_mul_model(torch.float32, "matrix_mul_pf32_gemm")
+```
+
+**NOTE:**
+
+When exporting models, the **generate_etrecord** option is enabled to produce the .etrecord file alongside the .pte model file.
+These ETRecord files are essential for subsequent model analysis and performance evaluation.
+
+
+After running this script, both the PTE model file and the etrecord file are generated.
+
+```bash
+$ ls model/ -1
+matrix_mul_pf16_gemm.etrecord
+matrix_mul_pf16_gemm.pte
+matrix_mul_pf32_gemm.etrecord
+matrix_mul_pf32_gemm.pte
+```
+
+The complete source code is available [here](../export-matrix-mul.py).
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/07-run-model.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/07-run-model.md
new file mode 100644
index 0000000000..2831a1cd97
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/07-run-model.md
@@ -0,0 +1,47 @@
+---
+title: Run the model and generate an ETDump
+weight: 8
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+After generating the model, we can now run it on an AArch64 platform using the following command:
+
+```bash
+cd $WORKSPACE
+./build-arm64/executor_runner -etdump_path model/linear_model_pf32_gemm.etdump -model_path model/linear_model_pf32_gemm.pte -num_executions=1 -cpu_threads 1
+```
+
+You can adjust the number of execution threads and the number of times the model is invoked.
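+
+For more stable timing statistics, you can increase the number of executions and experiment with the thread count. For example, 100 iterations on 4 threads (the values are illustrative):
+
+```bash
+./build-arm64/executor_runner -etdump_path model/linear_model_pf32_gemm.etdump -model_path model/linear_model_pf32_gemm.pte -num_executions=100 -cpu_threads 4
+```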
+
+You should see output similar to the example below.
+
+```bash
+D 00:00:00.015988 executorch:XNNPACKBackend.cpp:57] Creating XNN workspace
+D 00:00:00.018719 executorch:XNNPACKBackend.cpp:69] Created XNN workspace: 0xaff21c2323e0
+D 00:00:00.027595 executorch:operator_registry.cpp:96] Successfully registered all kernels from shared library: NOT_SUPPORTED
+I 00:00:00.035506 executorch:executor_runner.cpp:157] Resetting threadpool with num threads = 1
+I 00:00:00.048120 executorch:threadpool.cpp:48] Resetting threadpool to 1 threads.
+I 00:00:00.051509 executorch:executor_runner.cpp:218] Model file model/linear_model_pf32_gemm.pte is loaded.
+I 00:00:00.051531 executorch:executor_runner.cpp:227] Using method forward
+I 00:00:00.051541 executorch:executor_runner.cpp:278] Setting up planned buffer 0, size 2112.
+D 00:00:00.051630 executorch:method.cpp:793] Loading method: forward.
+....
+
+D 00:00:00.091432 executorch:XNNExecutor.cpp:236] Resizing output tensor to a new shape
+I 00:00:00.091459 executorch:executor_runner.cpp:340] Model executed successfully 1 time(s) in 2.904883 ms.
+I 00:00:00.091477 executorch:executor_runner.cpp:349] 1 outputs:
+Output 0: tensor(sizes=[1, 256], [
+  0.0106399, 0.0951964, 1.04854, -0.290168, -0.278126, -0.355151, 0.0583736, -0.431953, -0.0773305, -0.32844,
+  ...,
+  0.553568, -0.0339369, 0.562088, -1.21021, -0.769254, 0.677771, -0.264338, 1.05453, 0.724467, 0.53182,
+])
+I 00:00:00.093912 executorch:executor_runner.cpp:125] ETDump written to file 'model/linear_model_pf32_gemm.etdump'.
+```
+
+If the execution is successful, an etdump file will also be generated.
+
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/08-analyze-etdump.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/08-analyze-etdump.md
new file mode 100644
index 0000000000..d5e6845530
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/08-analyze-etdump.md
@@ -0,0 +1,55 @@
+---
+title: Analyzing ETRecord and ETDump
+weight: 9
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+In the final step, we create an Inspector instance by providing the paths to the generated ETDump and ETRecord.
+The Inspector analyzes the runtime data from the ETDump file and maps it to the corresponding operators in the Edge Dialect Graph.
+
+
+To visualize all runtime events in a tabular format and save them to a TSV file, run:
+
+```python
+import os
+import sys
+from executorch.devtools.inspector import Inspector
+
+if len(sys.argv) < 2:
+    print(f"Usage: python {sys.argv[0]} <model.pte>")
+    sys.exit(1)
+
+pte_file = sys.argv[1]
+
+# Derive the ETRecord/ETDump paths from the .pte file name
+base = os.path.splitext(pte_file)[0]
+
+etrecord = f"{base}.etrecord"
+etdump = f"{base}.etdump"
+tsvfile = f"{base}.tsv"
+
+ins = Inspector(etrecord=etrecord, etdump_path=etdump)
+ins.print_data_tabular(include_delegate_debug_data=True, include_units=False)
+
+with open(tsvfile, "w", encoding="utf-8") as f:
+    ins.save_data_to_tsv(f)
+```
+
+Next, you can examine the generated TSV file to view the execution time information for each node in the model.
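+
+If you prefer to post-process the results programmatically rather than reading the TSV file, the Inspector can also expose the same event data as a pandas DataFrame. The sketch below assumes the `to_dataframe()` helper available in ExecuTorch v1.0.0, that pandas is installed, and that the column names match the tabular output shown below:
+
+```python
+# Keep only the GEMM events reported by the delegated XNNPACK backend
+df = ins.to_dataframe()
+gemm_events = df[df["event_name"].str.contains("GEMM", na=False)]
+print(gemm_events[["event_name", "avg", "min", "max"]])
+```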
+
+Below is an example showing the runtime data corresponding to the Fully Connected node.
+
+
+| event_block_name | event_name | p10 (ms) | p50 (ms) | p90 (ms) | avg (ms) | min (ms) | max (ms) | op_types | is_delegated_op | delegate_backend_name |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| Default | Method::init | 33.277046 | 33.277046 | 33.277046 | 33.277046 | 33.277046 | 33.277046 | [] | FALSE | |
+| Default | Program::load_method | 33.300006 | 33.300006 | 33.300006 | 33.300006 | 33.300006 | 33.300006 | [] | FALSE | |
+| Execute | Fully Connected (NC, F32) GEMM #1 | 0.016 | 0.018 | 0.019 | 0.018745 | 0.015 | 4.244 | [] | TRUE | XnnpackBackend |
+| Execute | DELEGATE_CALL | 0.04136 | 0.04464 | 0.04792 | 0.046082 | 0.03372 | 4.390585 | ['aten.linear.default'] | FALSE | XnnpackBackend |
+| Execute | Method::execute | 0.04848 | 0.05256 | 0.05756 | 0.054066 | 0.03944 | 4.404385 | [] | FALSE | |
+
+
+You can experiment with different models and matrix sizes to obtain various performance results.
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_index.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_index.md
new file mode 100644
index 0000000000..510945ac63
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_index.md
@@ -0,0 +1,50 @@
+---
+title: How to Benchmark a Single KleidiAI Micro-kernel in ExecuTorch
+
+minutes_to_complete: 30
+
+who_is_this_for: This article is intended for advanced developers who want to leverage KleidiAI to accelerate ExecuTorch model inference on the AArch64 platform.
+
+learning_objectives:
+    - Cross-compile ExecuTorch for the AArch64 platform with XNNPACK and KleidiAI enabled, including SME/SME2 support.
+    - Build and export ExecuTorch models that can be accelerated by KleidiAI using SME/SME2 instructions.
+    - Use the `executor_runner` tool to collect ETDump profiling data.
+    - Inspect and analyze ETRecord and ETDump files using the ExecuTorch Inspector API.
+
+prerequisites:
+    - An x86_64 Linux host machine running Ubuntu, with at least 15 GB of free disk space.
+    - An AArch64 target system with support for SME or SME2.
+
+author: Qixiang Xu
+
+### Tags
+skilllevels: Advanced
+subjects: ML
+armips:
+    - Cortex-A
+    - SME
+    - KleidiAI
+
+tools_software_languages:
+    - Python
+    - cmake
+    - XNNPACK
+
+operatingsystems:
+    - Linux
+
+
+further_reading:
+    - resource:
+        title: ExecuTorch User Guide
+        link: https://docs.pytorch.org/executorch/stable/intro-section.html
+        type: documentation
+
+
+
+### FIXED, DO NOT MODIFY
+# ================================================================================
+weight: 1                       # _index.md always has weight of 1 to order correctly
+layout: "learningpathall"       # All files under learning paths have this same wrapper
+learning_path_main_page: "yes"  # This should be surfaced when looking for related content. Only set for _index.md of learning path content.
+---
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_next-steps.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_next-steps.md
new file mode 100644
index 0000000000..c3db0de5a2
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_next-steps.md
@@ -0,0 +1,8 @@
+---
+# ================================================================================
+# FIXED, DO NOT MODIFY THIS FILE
+# ================================================================================
+weight: 21                  # Set to always be larger than the content in this path to be at the end of the navigation.
+title: "Next Steps"         # Always the same, html page title.
+layout: "learningpathall"   # All files under learning paths have this same wrapper for Hugo processing.
+---
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-conv2d.py b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-conv2d.py
new file mode 100644
index 0000000000..b976be70cc
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-conv2d.py
@@ -0,0 +1,107 @@
+
+import os
+
+import torch
+import torch.nn as nn
+
+# Ensure the output directory for the exported files exists
+os.makedirs("model", exist_ok=True)
+
+class DemoConv2dModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        # Pointwise (1x1) convolution, eligible for the pf32_gemm variant
+        self.pointwiseconv = torch.nn.Conv2d(3, 2, 1, groups=1)
+
+    def forward(self, x):
+        x = self.pointwiseconv(x)
+        return x
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 3, 16, 16, dtype=dtype),)
+
+class DemoQInt8Conv2dModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 6, 3)
+
+    def forward(self, x):
+        x = self.conv(x)
+        return x
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 3, 16, 16, dtype=dtype),)
+
+
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType
+from executorch.exir import to_edge_transform_and_lower
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+    XNNPACKQuantizer,
+)
+
+def export_int8_quantize_conv2d_model(model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoQInt8Conv2dModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    # Quantize model
+    model = torch.export.export(model, example_inputs).module()
+    quantizer = XNNPACKQuantizer()
+    operator_config = get_symmetric_quantization_config(
+        is_per_channel=False,
+        is_dynamic=False
+    )
+
+    quantizer.set_global(operator_config)
+    quantize_model = prepare_pt2e(model, quantizer)
+    quantize_model(*example_inputs)
+    quantize_model = convert_pt2e(quantize_model)
+
+    # Export model
+    exported_program = torch.export.export(quantize_model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_int8_quantize_conv2d_model("qint8_conv2d_pqs8_qc8w_gemm")
+
+
+def export_pointwise_model(model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoConv2dModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+
+export_pointwise_model("pointwise_conv2d_pf32_gemm")
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-linear-model.py b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-linear-model.py
new file mode 100644
index 0000000000..0f78dab2cc
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-linear-model.py
@@ -0,0 +1,145 @@
+
+import os
+
+import torch
+import torch.nn as nn
+
+# Ensure the output directory for the exported files exists
+os.makedirs("model", exist_ok=True)
+
+class DemoLinearModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(256, 256)
+
+    def forward(self, x):
+        y = self.linear(x)
+        return y
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 256, dtype=dtype),)
+
+
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType
+from executorch.exir import to_edge_transform_and_lower
+
+def export_executorch_model(dtype: torch.dtype, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoLinearModel().eval().to(dtype)
+    example_inputs = model.get_example_inputs(dtype)
+
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_executorch_model(torch.float16, "linear_model_pf16_gemm")
+export_executorch_model(torch.float32, "linear_model_pf32_gemm")
+
+
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+    XNNPACKQuantizer,
+)
+
+def export_int8_quantize_model(dynamic: bool, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoLinearModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    # Quantize model
+    model = torch.export.export(model, example_inputs).module()
+    quantizer = XNNPACKQuantizer()
+    operator_config = get_symmetric_quantization_config(
+        is_per_channel=True,
+        is_dynamic=dynamic
+    )
+
+    quantizer.set_global(operator_config)
+    quantize_model = prepare_pt2e(model, quantizer)
+    quantize_model(*example_inputs)
+    quantize_model = convert_pt2e(quantize_model)
+
+    # Export model
+    exported_program = torch.export.export(quantize_model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_int8_quantize_model(False, "linear_model_pqs8_qc8w_gemm")
+export_int8_quantize_model(True, "linear_model_qp8_f32_qc8w_gemm")
+
+
+from torchao.quantization.granularity import PerGroup, PerAxis
+from torchao.quantization.quant_api import (
+    IntxWeightOnlyConfig,
+    Int8DynamicActivationIntxWeightConfig,
+    quantize_,
+)
+
+def export_int4_quantize_model(dynamic: bool, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoLinearModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    # Quantize model: INT8 dynamic activations with blockwise INT4 weights
+    linear_config = Int8DynamicActivationIntxWeightConfig(
+        weight_dtype=torch.int4,
+        weight_granularity=PerGroup(32),
+    )
+
+    quantize_(model, linear_config)
+
+    # Export model
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_int4_quantize_model(False, "linear_model_qp8_f32_qb4w_gemm")
+
+
+
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-matrix-mul.py b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-matrix-mul.py
new file mode 100644
index 0000000000..19eab1b356
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-matrix-mul.py
@@ -0,0 +1,44 @@
+
+import os
+
+import torch
+import torch.nn as nn
+
+# Ensure the output directory for the exported files exists
+os.makedirs("model", exist_ok=True)
+
+class DemoBatchMatMulModel(nn.Module):
+    def forward(self, x, y):
+        return torch.bmm(x, y)
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 256, 256, dtype=dtype), torch.randn(1, 256, 256, dtype=dtype))
+
+
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType
+from executorch.exir import to_edge_transform_and_lower
+
+def export_matrix_mul_model(dtype: torch.dtype, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoBatchMatMulModel().eval().to(dtype)
+    example_inputs = model.get_example_inputs(dtype)
+
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_matrix_mul_model(torch.float16, "matrix_mul_pf16_gemm")
+export_matrix_mul_model(torch.float32, "matrix_mul_pf32_gemm")