From 37df2f3108d4f400c79aa06a5308f70f8bbe05cb Mon Sep 17 00:00:00 2001 From: Qixiang Xu Date: Fri, 7 Nov 2025 14:51:55 +0800 Subject: [PATCH] Add How to Benchmark a Single KleidiAI Micro-kernel in ExecuTorch --- .../01-env-setup.md | 54 +++++ .../02-cross-compile.md | 78 ++++++ .../03-executorch-node-kai-kernel.md | 59 +++++ .../04-create-fc-model.md | 229 ++++++++++++++++++ .../05-create-conv2d-model.md | 176 ++++++++++++++ .../06-create-matrix-mul-model.md | 91 +++++++ .../07-run-model.md | 47 ++++ .../08-analyze-etdump.md | 55 +++++ .../_index.md | 50 ++++ .../_next-steps.md | 8 + .../export-conv2d.py | 107 ++++++++ .../export-linear-model.py | 145 +++++++++++ .../export-matrix-mul.py | 44 ++++ 13 files changed, 1143 insertions(+) create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/01-env-setup.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/02-cross-compile.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/03-executorch-node-kai-kernel.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/04-create-fc-model.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/05-create-conv2d-model.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/06-create-matrix-mul-model.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/07-run-model.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/08-analyze-etdump.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_index.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_next-steps.md create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-conv2d.py create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-linear-model.py create mode 100644 content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-matrix-mul.py diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/01-env-setup.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/01-env-setup.md new file mode 100644 index 0000000000..1e26ff1d6a --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/01-env-setup.md @@ -0,0 +1,54 @@ +--- +title: Environment setup +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + + +### Python Environment Setup + +Before building ExecuTorch, it is highly recommended to create an isolated Python environment. +This prevents dependency conflicts with your system Python installation and ensures a clean build environment. + +```bash +cd $WORKSPACE +python3 -m venv pyenv +source pyenv/bin/activate + +``` +All subsequent steps should be executed within this Python virtual environment. 
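+
+You can quickly confirm that the virtual environment is active; the interpreter should resolve to the `pyenv` directory created above (a quick check, assuming a POSIX shell):
+
+```bash
+which python3
+# Expected output (exact path may vary): $WORKSPACE/pyenv/bin/python3
+```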
+
+### Download the ExecuTorch Source Code
+
+Clone the ExecuTorch repository from GitHub. The following command checks out the stable v1.0.0 release and ensures all required submodules are fetched.
+
+```bash
+cd $WORKSPACE
+git clone -b v1.0.0 --recurse-submodules https://github.com/pytorch/executorch.git
+```
+
+ > **Note:**
+ > The instructions in this guide are based on **ExecuTorch v1.0.0**.
+ > Commands or configuration options may differ in later releases.
+
+### Build and Install the ExecuTorch Python Components
+
+Next, build the Python bindings and install them into your environment. The following command uses the provided installation script to configure, compile, and install ExecuTorch with developer tools enabled.
+
+```bash
+cd $WORKSPACE/executorch
+CMAKE_ARGS="-DEXECUTORCH_BUILD_DEVTOOLS=ON" ./install_executorch.sh
+```
+
+This will build ExecuTorch and its dependencies using CMake, enabling optional developer utilities such as ETDump and Inspector.
+
+After installation completes successfully, you can verify the environment by running:
+
+```bash
+python -c "import executorch; print('ExecuTorch built and installed successfully.')"
+```
+
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/02-cross-compile.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/02-cross-compile.md
new file mode 100644
index 0000000000..0b0386694c
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/02-cross-compile.md
@@ -0,0 +1,78 @@
+---
+title: Cross-compile ExecuTorch for the AArch64 platform
+weight: 3
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+
+This section describes how to cross-compile ExecuTorch for an AArch64 target platform with XNNPACK and KleidiAI support enabled.
+All commands below are intended to be executed on an x86-64 Linux host with an appropriate cross-compilation toolchain installed (e.g., aarch64-linux-gnu-gcc).
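+
+If the toolchain is not already present, on Ubuntu hosts it can typically be installed from the distribution packages. The package names below are illustrative and can vary by release:
+
+```bash
+sudo apt-get install -y gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
+aarch64-linux-gnu-gcc --version
+```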
+
+
+### Run CMake Configuration
+
+Use CMake to configure the ExecuTorch build for AArch64. The example below enables key extensions, developer tools, and XNNPACK with KleidiAI acceleration:
+
+```bash
+cd $WORKSPACE
+mkdir -p build-arm64
+cd build-arm64
+
+cmake -GNinja \
+    -DCMAKE_BUILD_TYPE=Debug \
+    -DCMAKE_SYSTEM_NAME=Linux \
+    -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
+    -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
+    -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
+    -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+    -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH \
+    -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY \
+    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_XNNPACK=ON \
+    -DEXECUTORCH_BUILD_DEVTOOLS=ON \
+    -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+    -DEXECUTORCH_ENABLE_LOGGING=ON \
+    -DEXECUTORCH_LOG_LEVEL=debug \
+    -DEXECUTORCH_XNNPACK_ENABLE_KLEIDI=ON \
+    ../executorch
+```
+
+#### Key Build Options
+
+| **CMake Option** | **Description** |
+| --- | --- |
+| `EXECUTORCH_BUILD_XNNPACK` | Builds the **XNNPACK backend**, which provides highly optimized CPU operators (GEMM, convolution, etc.) for AArch64 platforms. |
+| `EXECUTORCH_XNNPACK_ENABLE_KLEIDI` | Enables **Arm KleidiAI** acceleration for XNNPACK kernels, providing further performance improvements on Armv8.2+ CPUs. |
+| `EXECUTORCH_BUILD_DEVTOOLS` | Builds **developer tools** such as the ExecuTorch Inspector and diagnostic utilities for profiling and debugging. |
+| `EXECUTORCH_BUILD_EXTENSION_MODULE` | Builds the **Module API** extension, which provides a high-level abstraction for model loading and execution using `Module` objects. |
+| `EXECUTORCH_BUILD_EXTENSION_TENSOR` | Builds the **Tensor API** extension, providing convenience functions for creating, manipulating, and managing tensors in the C++ runtime. |
+| `EXECUTORCH_BUILD_KERNELS_OPTIMIZED` | Enables building **optimized kernel implementations** for better performance on supported architectures. |
+| `EXECUTORCH_ENABLE_EVENT_TRACER` | Enables the **event tracing** feature, which records performance and operator timing information for runtime analysis. |
+
+
+
+### Build ExecuTorch
+
+```bash
+cmake --build . -j$(nproc)
+```
+
+If the build completes successfully, you should find the executor_runner binary at:
+
+```bash
+build-arm64/executor_runner
+```
+
+This binary can be used to run ExecuTorch models on the AArch64 target device using the XNNPACK backend with KleidiAI acceleration.
+
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/03-executorch-node-kai-kernel.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/03-executorch-node-kai-kernel.md
new file mode 100644
index 0000000000..7bb5ffffd4
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/03-executorch-node-kai-kernel.md
@@ -0,0 +1,59 @@
+---
+title: KleidiAI micro-kernels support in ExecuTorch
+weight: 4
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+ExecuTorch uses XNNPACK as its primary CPU backend for operator execution and performance optimization.
+
+Within this architecture, only a subset of KleidiAI SME (Scalable Matrix Extension) micro-kernels has been integrated into XNNPACK to provide additional acceleration on supported Arm platforms.
+
+These specialized micro-kernels are designed to accelerate operators with specific data types and quantization configurations in ExecuTorch models.
+
+When an operator matches one of the supported configurations, ExecuTorch automatically dispatches it through the KleidiAI-optimized path.
+
+Operators that are not covered by KleidiAI fall back to the standard XNNPACK implementations during inference, ensuring functional correctness across all models.
+
+In ExecuTorch v1.0.0, the following operator types are implemented through the XNNPACK backend and can potentially benefit from KleidiAI acceleration:
+- XNNFullyConnected – Fully connected (dense) layers
+- XNNConv2d – Standard 2D convolution layers
+- XNNBatchMatrixMultiply – Batched matrix multiplication operations
+
+However, not all instances of these operators are accelerated by KleidiAI.
+
+Acceleration eligibility depends on several operator attributes and backend support, including:
+- Data types (e.g., float32, int8, int4)
+- Quantization schemes (e.g., symmetric/asymmetric, per-tensor/per-channel)
+- Tensor memory layout and alignment
+- Kernel dimensions and stride settings
+
+The following section provides detailed information on which operator configurations can benefit from KleidiAI acceleration, along with their corresponding data type and quantization support.
+
+
+### XNNFullyConnected
+
+| XNNPACK GEMM Variant | Activations DataType | Weights DataType | Output DataType |
+| --- | --- | --- | --- |
+| pf16_gemm | FP16 | FP16 | FP16 |
+| pf32_gemm | FP32 | FP32 | FP32 |
+| qp8_f32_qc8w_gemm | Asymmetric INT8 per-row quantization | Per-channel symmetric INT8 quantization | FP32 |
+| pqs8_qc8w_gemm | Asymmetric INT8 quantization | Per-channel symmetric INT8 quantization | Asymmetric INT8 quantization |
+| qp8_f32_qb4w_gemm | Asymmetric INT8 per-row quantization | INT4 (signed), shared blockwise quantization | FP32 |
+
+
+### XNNConv2d
+
+| XNNPACK GEMM Variant | Input DataType | Filter DataType | Output DataType |
+| --- | --- | --- | --- |
+| pf32_gemm | FP32 | FP32, pointwise (1×1) | FP32 |
+| pqs8_qc8w_gemm | Asymmetric INT8 quantization (NHWC) | Per-channel or per-tensor symmetric INT8 quantization | Asymmetric INT8 quantization (NHWC) |
+
+
+### XNNBatchMatrixMultiply
+
+| XNNPACK GEMM Variant | Input A DataType | Input B DataType | Output DataType |
+| --- | --- | --- | --- |
+| pf32_gemm | FP32 | FP32 | FP32 |
+| pf16_gemm | FP16 | FP16 | FP16 |
+
+
+
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/04-create-fc-model.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/04-create-fc-model.md
new file mode 100644
index 0000000000..7be11240db
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/04-create-fc-model.md
@@ -0,0 +1,229 @@
+---
+title: Create and quantize linear layer benchmark model
+weight: 5
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+In the previous section, we discussed that the Fully Connected operator supports multiple GEMM (General Matrix Multiplication) variants.
+
+To evaluate the performance of these variants across different hardware platforms, we will construct a series of benchmark models that use the Fully Connected operator with different GEMM implementations for comparative analysis.
+
+
+### Fully connected benchmark model
+
+The following example defines a simple model that generates nodes that can be accelerated by KleidiAI.
+
+By adjusting some of the model's input parameters, we can also simulate the behavior of nodes that appear in real-world models.
+
+
+```python
+import torch
+import torch.nn as nn
+
+class DemoLinearModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(256, 256)
+
+    def forward(self, x):
+        y = self.linear(x)
+        return y
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 256, dtype=dtype),)
+```
+
+### Export FP16/FP32 model for pf16_gemm/pf32_gemm variants
+
+| XNNPACK GEMM Variant | Activations DataType | Weights DataType | Output DataType |
+| --- | --- | --- | --- |
+| pf16_gemm | FP16 | FP16 | FP16 |
+| pf32_gemm | FP32 | FP32 | FP32 |
+
+The following code demonstrates how to lower and export a model that leverages the pf16_gemm variant to accelerate computation:
+
+```python
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType
+from executorch.exir import to_edge_transform_and_lower
+
+def export_executorch_model(dtype: torch.dtype, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoLinearModel().eval().to(dtype)
+    example_inputs = model.get_example_inputs(dtype)
+
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_executorch_model(torch.float16, "linear_model_pf16_gemm")
+```
+
+To generate a model that uses the pf32_gemm variant, simply change the dtype in the previous code to torch.float32, as shown below:
+
+```python
+export_executorch_model(torch.float32, "linear_model_pf32_gemm")
+```
+
+### Export INT8 quantized model for pqs8_qc8w_gemm and qp8_f32_qc8w_gemm variants
+
+| XNNPACK GEMM Variant | Activations DataType | Weights DataType | Output DataType |
+| --- | --- | --- | --- |
+| qp8_f32_qc8w_gemm | Asymmetric INT8 per-row quantization | Per-channel symmetric INT8 quantization | FP32 |
+| pqs8_qc8w_gemm | Asymmetric INT8 quantization | Per-channel symmetric INT8 quantization | Asymmetric INT8 quantization |
+
+The following code demonstrates how to quantize a model so that it leverages the pqs8_qc8w_gemm/qp8_f32_qc8w_gemm variants to accelerate computation:
+
+```python
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+    XNNPACKQuantizer,
+)
+
+def export_int8_quantize_model(dynamic: bool, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoLinearModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    # Quantize model
+    model = torch.export.export(model, example_inputs).module()
+    quantizer = XNNPACKQuantizer()
+    operator_config = get_symmetric_quantization_config(
+        is_per_channel=True,
+        is_dynamic=dynamic
+    )
+
+    quantizer.set_global(operator_config)
+    quantize_model = prepare_pt2e(model, quantizer)
+    quantize_model(*example_inputs)
+    quantize_model = convert_pt2e(quantize_model)
+
+    # Lower and export model
+    exported_program = torch.export.export(quantize_model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_int8_quantize_model(False, "linear_model_pqs8_qc8w_gemm")
+export_int8_quantize_model(True, "linear_model_qp8_f32_qc8w_gemm")
+```
+
+### Export INT4 quantized model for qp8_f32_qb4w_gemm variant
+
+| XNNPACK GEMM Variant | Activations DataType | Weights DataType | Output DataType |
+| --- | --- | --- | --- |
+| qp8_f32_qb4w_gemm | Asymmetric INT8 per-row quantization | INT4 (signed), shared blockwise quantization | FP32 |
+
+The following code demonstrates how to quantize a model so that it leverages the qp8_f32_qb4w_gemm variant to accelerate computation:
+
+```python
+from torchao.quantization.granularity import PerGroup, PerAxis
+from torchao.quantization.quant_api import (
+    IntxWeightOnlyConfig,
+    Int8DynamicActivationIntxWeightConfig,
+    quantize_,
+)
+
+def export_int4_quantize_model(dynamic: bool, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoLinearModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    # Quantize model: INT8 dynamic activations with blockwise INT4 weights
+    linear_config = Int8DynamicActivationIntxWeightConfig(
+        weight_dtype=torch.int4,
+        weight_granularity=PerGroup(32),
+    )
+
+    quantize_(model, linear_config)
+
+    # Lower and export model
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_int4_quantize_model(False, "linear_model_qp8_f32_qb4w_gemm")
+```
+
+**NOTE:**
+
+When exporting models, the **generate_etrecord** option is enabled to produce the .etrecord file alongside the .pte model file.
+These ETRecord files are essential for subsequent model inspection and performance analysis using the ExecuTorch Inspector API.
+
+
+After running this script, both the PTE model file and the etrecord file are generated.
+ +``` bash +$ ls model/ -1 +linear_model_pf16_gemm.etrecord +linear_model_pf16_gemm.pte +linear_model_pf32_gemm.etrecord +linear_model_pf32_gemm.pte +linear_model_pqs8_qc8w_gemm.etrecord +linear_model_pqs8_qc8w_gemm.pte +linear_model_qp8_f32_qb4w_gemm.etrecord +linear_model_qp8_f32_qb4w_gemm.pte +linear_model_qp8_f32_qc8w_gemm.etrecord +linear_model_qp8_f32_qc8w_gemm.pte +``` + +The complete source code is available [here](../export-linear-model.py). diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/05-create-conv2d-model.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/05-create-conv2d-model.md new file mode 100644 index 0000000000..685a7ce397 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/05-create-conv2d-model.md @@ -0,0 +1,176 @@ +--- +title: Create and quantize convolution layer benchmark model +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In the previous section, we discussed that both INT8-quantized Conv2d and pointwise (1×1) Conv2d operators can be accelerated using KleidiAI’s matrix-multiplication micro-kernels. + + +| XNNPACK GEMM Variant | Input DataType| Filter DataType | Output DataType | +| ------------------ | ---------------------------- | --------------------------------------- | ---------------------------- | +| pqs8_qc8w_gemm | Asymmetric INT8 quantization(NHWC) | Per-channel or per-tensor symmetric INT8 quantization | Asymmetric INT8 quantization(NHWC) | +| pf32_gemm | FP32 | FP32, pointwise (1×1) | FP32 | + +To evaluate the performance of Conv2d operators across multiple hardware platforms, we create a set of benchmark models that utilize different GEMM implementation variants within the convolution operators for systematic comparative analysis. + + +### INT8-quantized Conv2d benchmark model + +The following example defines a simple model to generate INT8-quantized Conv2d nodes that can be accelerated by KleidiAI. + +By adjusting some of the model’s input parameters, we can also simulate the behavior of nodes that appear in real-world models. 
+
+```python
+import torch
+import torch.nn as nn
+
+class DemoQInt8Conv2dModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 6, 3)
+
+    def forward(self, x):
+        x = self.conv(x)
+        return x
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 3, 16, 16, dtype=dtype),)
+```
+
+The following code can be used to quantize and export the model:
+
+```python
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType
+from executorch.exir import to_edge_transform_and_lower
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+    XNNPACKQuantizer,
+)
+
+def export_int8_quantize_conv2d_model(model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoQInt8Conv2dModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    # Quantize model
+    model = torch.export.export(model, example_inputs).module()
+    quantizer = XNNPACKQuantizer()
+    operator_config = get_symmetric_quantization_config(
+        is_per_channel=False,
+        is_dynamic=False
+    )
+
+    quantizer.set_global(operator_config)
+    quantize_model = prepare_pt2e(model, quantizer)
+    quantize_model(*example_inputs)
+    quantize_model = convert_pt2e(quantize_model)
+
+    # Export model
+    exported_program = torch.export.export(quantize_model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_int8_quantize_conv2d_model("qint8_conv2d_pqs8_qc8w_gemm")
+```
+
+### Pointwise Conv2d benchmark model
+
+The following example defines a simple model to generate pointwise (1×1) Conv2d nodes that can be accelerated by KleidiAI.
+
+As before, input parameters can be adjusted to simulate real-world model behavior.
+ + +``` python +import torch +import torch.nn as nn +class DemoConv2dModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.pointwiseconv = torch.nn.Conv2d(3, 2, 1,groups=1) + + def forward(self,x): + x = self.pointwiseconv(x) + return x + + def get_example_inputs(self,dtype=torch.float32): + return (torch.randn(1, 3, 16, 16, dtype=dtype),) + +``` + +The following code can be used to lower and export the model: + +```python +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType +from executorch.exir import to_edge_transform_and_lower + +def export_pointwise_model(model_name: str): + mode_file_name = "model/" + model_name + pte_file = mode_file_name + ".pte" + etr_file = mode_file_name + ".etrecord" + + model = DemoConv2dModel().eval().to(torch.float32) + example_inputs = model.get_example_inputs(torch.float32) + + exported_program = torch.export.export(model, example_inputs) + + partitioner = XnnpackPartitioner() + edge_program = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + generate_etrecord=True + ) + + et_program = edge_program.to_executorch() + with open(pte_file, "wb") as f: + f.write(et_program.buffer) + + # Get and save ETRecord + etrecord = et_program.get_etrecord() + etrecord.save(etr_file) + +export_pointwise_model("pointwise_conv2d_pf32_gemm") + +``` + +**NOTES:** + +When exporting models, the generate_etrecord option is enabled to produce the .etrecord file alongside the .pte model file. +These ETRecord files are essential for subsequent model analysis and performance evaluation. + +After running this script, both the PTE model file and the etrecord file are generated. + +``` bash +$ ls model/ -1 +qint8_conv2d_pqs8_qc8w_gemm.etrecord +qint8_conv2d_pqs8_qc8w_gemm.pte +pointwise_conv2d_pf32_gemm.etrecord +pointwise_conv2d_pf32_gemm.pte +``` + +The complete source code is available [here](../export-conv2d.py). diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/06-create-matrix-mul-model.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/06-create-matrix-mul-model.md new file mode 100644 index 0000000000..901e2a8883 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/06-create-matrix-mul-model.md @@ -0,0 +1,91 @@ +--- +title: Create matrix multiply layer benchmark model +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In the previous section, we discussed that the Batch Matrix Multiply operator supports multiple GEMM (General Matrix Multiplication) variants. + +To evaluate the performance of these variants across different hardware platforms, we construct a set of benchmark models that utilize the batch matrix multiply operator with different GEMM implementations for comparative analysis. + + +### Matrix multiply benchmark model + +The following example defines a simple model to generate nodes that can be accelerated by KleidiAI. + +By adjusting the input parameters, this model can also simulate the behavior of nodes commonly found in real-world models. 
+
+```python
+import torch
+import torch.nn as nn
+
+class DemoBatchMatMulModel(nn.Module):
+    def forward(self, x, y):
+        return torch.bmm(x, y)
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 256, 256, dtype=dtype), torch.randn(1, 256, 256, dtype=dtype))
+```
+
+### Export FP16/FP32 model for pf16_gemm/pf32_gemm variants
+
+| XNNPACK GEMM Variant | Input A DataType | Input B DataType | Output DataType |
+| --- | --- | --- | --- |
+| pf32_gemm | FP32 | FP32 | FP32 |
+| pf16_gemm | FP16 | FP16 | FP16 |
+
+The following code snippet demonstrates how to lower and export a model that leverages the pf16_gemm and pf32_gemm variants to accelerate computation:
+
+```python
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType
+from executorch.exir import to_edge_transform_and_lower
+
+def export_matrix_mul_model(dtype: torch.dtype, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoBatchMatMulModel().eval().to(dtype)
+    example_inputs = model.get_example_inputs(dtype)
+
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_matrix_mul_model(torch.float16, "matrix_mul_pf16_gemm")
+export_matrix_mul_model(torch.float32, "matrix_mul_pf32_gemm")
+```
+
+**NOTE:**
+
+When exporting models, the **generate_etrecord** option is enabled to produce the .etrecord file alongside the .pte model file.
+These ETRecord files are essential for subsequent model analysis and performance evaluation.
+
+
+After running this script, both the PTE model file and the etrecord file are generated.
+
+```bash
+$ ls model/ -1
+matrix_mul_pf16_gemm.etrecord
+matrix_mul_pf16_gemm.pte
+matrix_mul_pf32_gemm.etrecord
+matrix_mul_pf32_gemm.pte
+```
+
+The complete source code is available [here](../export-matrix-mul.py).
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/07-run-model.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/07-run-model.md
new file mode 100644
index 0000000000..2831a1cd97
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/07-run-model.md
@@ -0,0 +1,47 @@
+---
+title: Run the model and generate an ETDump
+weight: 8
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+After generating the model, we can now run it on an AArch64 platform using the following command:
+
+```bash
+cd $WORKSPACE
+./build-arm64/executor_runner -etdump_path model/linear_model_pf32_gemm.etdump -model_path model/linear_model_pf32_gemm.pte -num_executions=1 -cpu_threads 1
+```
+
+You can adjust the number of execution threads and the number of times the model is invoked.
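+
+For more stable timing statistics, you can increase the number of executions and experiment with the thread count. For example, 100 iterations on 4 threads (the values are illustrative):
+
+```bash
+./build-arm64/executor_runner -etdump_path model/linear_model_pf32_gemm.etdump -model_path model/linear_model_pf32_gemm.pte -num_executions=100 -cpu_threads 4
+```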
+
+You should see output similar to the example below.
+
+```bash
+D 00:00:00.015988 executorch:XNNPACKBackend.cpp:57] Creating XNN workspace
+D 00:00:00.018719 executorch:XNNPACKBackend.cpp:69] Created XNN workspace: 0xaff21c2323e0
+D 00:00:00.027595 executorch:operator_registry.cpp:96] Successfully registered all kernels from shared library: NOT_SUPPORTED
+I 00:00:00.035506 executorch:executor_runner.cpp:157] Resetting threadpool with num threads = 1
+I 00:00:00.048120 executorch:threadpool.cpp:48] Resetting threadpool to 1 threads.
+I 00:00:00.051509 executorch:executor_runner.cpp:218] Model file model/linear_model_pf32_gemm.pte is loaded.
+I 00:00:00.051531 executorch:executor_runner.cpp:227] Using method forward
+I 00:00:00.051541 executorch:executor_runner.cpp:278] Setting up planned buffer 0, size 2112.
+D 00:00:00.051630 executorch:method.cpp:793] Loading method: forward.
+....
+
+D 00:00:00.091432 executorch:XNNExecutor.cpp:236] Resizing output tensor to a new shape
+I 00:00:00.091459 executorch:executor_runner.cpp:340] Model executed successfully 1 time(s) in 2.904883 ms.
+I 00:00:00.091477 executorch:executor_runner.cpp:349] 1 outputs:
+Output 0: tensor(sizes=[1, 256], [
+  0.0106399, 0.0951964, 1.04854, -0.290168, -0.278126, -0.355151, 0.0583736, -0.431953, -0.0773305, -0.32844,
+  ...,
+  0.553568, -0.0339369, 0.562088, -1.21021, -0.769254, 0.677771, -0.264338, 1.05453, 0.724467, 0.53182,
+])
+I 00:00:00.093912 executorch:executor_runner.cpp:125] ETDump written to file 'model/linear_model_pf32_gemm.etdump'.
+```
+
+If the execution is successful, an etdump file will also be generated.
+
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/08-analyze-etdump.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/08-analyze-etdump.md
new file mode 100644
index 0000000000..d5e6845530
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/08-analyze-etdump.md
@@ -0,0 +1,55 @@
+---
+title: Analyzing ETRecord and ETDump
+weight: 9
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+In the final step, we create an Inspector instance by providing the paths to the generated ETDump and ETRecord.
+The Inspector analyzes the runtime data from the ETDump file and maps it to the corresponding operators in the Edge Dialect Graph.
+
+
+To visualize all runtime events in a tabular format and save them to a TSV file, run:
+
+```python
+import os
+import sys
+from executorch.devtools.inspector import Inspector
+
+if len(sys.argv) < 2:
+    print(f"Usage: python {sys.argv[0]} <model.pte>")
+    sys.exit(1)
+
+pte_file = sys.argv[1]
+
+# Derive the ETRecord/ETDump paths from the .pte file name
+base = os.path.splitext(pte_file)[0]
+
+etrecord = f"{base}.etrecord"
+etdump = f"{base}.etdump"
+tsvfile = f"{base}.tsv"
+
+ins = Inspector(etrecord=etrecord, etdump_path=etdump)
+ins.print_data_tabular(include_delegate_debug_data=True, include_units=False)
+
+with open(tsvfile, "w", encoding="utf-8") as f:
+    ins.save_data_to_tsv(f)
+```
+
+Next, you can examine the generated TSV file to view the execution time information for each node in the model.
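+
+If you prefer to post-process the results programmatically rather than reading the TSV file, the Inspector can also expose the same event data as a pandas DataFrame. The sketch below assumes the `to_dataframe()` helper available in ExecuTorch v1.0.0, that pandas is installed, and that the column names match the tabular output shown below:
+
+```python
+# Keep only the GEMM events reported by the delegated XNNPACK backend
+df = ins.to_dataframe()
+gemm_events = df[df["event_name"].str.contains("GEMM", na=False)]
+print(gemm_events[["event_name", "avg", "min", "max"]])
+```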
+
+Below is an example showing the runtime data corresponding to the Fully Connected node.
+
+
+| event_block_name | event_name | p10 (ms) | p50 (ms) | p90 (ms) | avg (ms) | min (ms) | max (ms) | op_types | is_delegated_op | delegate_backend_name |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| Default | Method::init | 33.277046 | 33.277046 | 33.277046 | 33.277046 | 33.277046 | 33.277046 | [] | FALSE | |
+| Default | Program::load_method | 33.300006 | 33.300006 | 33.300006 | 33.300006 | 33.300006 | 33.300006 | [] | FALSE | |
+| Execute | Fully Connected (NC, F32) GEMM #1 | 0.016 | 0.018 | 0.019 | 0.018745 | 0.015 | 4.244 | [] | TRUE | XnnpackBackend |
+| Execute | DELEGATE_CALL | 0.04136 | 0.04464 | 0.04792 | 0.046082 | 0.03372 | 4.390585 | ['aten.linear.default'] | FALSE | XnnpackBackend |
+| Execute | Method::execute | 0.04848 | 0.05256 | 0.05756 | 0.054066 | 0.03944 | 4.404385 | [] | FALSE | |
+
+
+You can experiment with different models and matrix sizes to obtain various performance results.
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_index.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_index.md
new file mode 100644
index 0000000000..510945ac63
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_index.md
@@ -0,0 +1,50 @@
+---
+title: How to Benchmark a Single KleidiAI Micro-kernel in ExecuTorch
+
+minutes_to_complete: 30
+
+who_is_this_for: This article is intended for advanced developers who want to leverage KleidiAI to accelerate ExecuTorch model inference on the AArch64 platform.
+
+learning_objectives:
+    - Cross-compile ExecuTorch for the AArch64 platform with XNNPACK and KleidiAI enabled, including SME/SME2 support.
+    - Build and export ExecuTorch models that can be accelerated by KleidiAI using SME/SME2 instructions.
+    - Use the `executor_runner` tool to collect ETDump profiling data.
+    - Inspect and analyze ETRecord and ETDump files using the ExecuTorch Inspector API.
+
+prerequisites:
+    - An x86_64 Linux host machine running Ubuntu, with at least 15 GB of free disk space.
+    - An AArch64 target system with support for SME or SME2.
+
+author: Qixiang Xu
+
+### Tags
+skilllevels: Advanced
+subjects: ML
+armips:
+    - Cortex-A
+    - SME
+    - KleidiAI
+
+tools_software_languages:
+    - Python
+    - cmake
+    - XNNPACK
+
+operatingsystems:
+    - Linux
+
+
+further_reading:
+    - resource:
+        title: ExecuTorch User Guide
+        link: https://docs.pytorch.org/executorch/stable/intro-section.html
+        type: documentation
+
+
+
+### FIXED, DO NOT MODIFY
+# ================================================================================
+weight: 1                       # _index.md always has weight of 1 to order correctly
+layout: "learningpathall"       # All files under learning paths have this same wrapper
+learning_path_main_page: "yes"  # This should be surfaced when looking for related content. Only set for _index.md of learning path content.
+---
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_next-steps.md b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_next-steps.md
new file mode 100644
index 0000000000..c3db0de5a2
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/_next-steps.md
@@ -0,0 +1,8 @@
+---
+# ================================================================================
+# FIXED, DO NOT MODIFY THIS FILE
+# ================================================================================
+weight: 21                  # Set to always be larger than the content in this path to be at the end of the navigation.
+title: "Next Steps"         # Always the same, html page title.
+layout: "learningpathall"   # All files under learning paths have this same wrapper for Hugo processing.
+---
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-conv2d.py b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-conv2d.py
new file mode 100644
index 0000000000..b976be70cc
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-conv2d.py
@@ -0,0 +1,107 @@
+
+import os
+
+import torch
+import torch.nn as nn
+
+# Ensure the output directory for the exported files exists
+os.makedirs("model", exist_ok=True)
+
+class DemoConv2dModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        # Pointwise (1x1) convolution, eligible for the pf32_gemm variant
+        self.pointwiseconv = torch.nn.Conv2d(3, 2, 1, groups=1)
+
+    def forward(self, x):
+        x = self.pointwiseconv(x)
+        return x
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 3, 16, 16, dtype=dtype),)
+
+class DemoQInt8Conv2dModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 6, 3)
+
+    def forward(self, x):
+        x = self.conv(x)
+        return x
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 3, 16, 16, dtype=dtype),)
+
+
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType
+from executorch.exir import to_edge_transform_and_lower
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+    XNNPACKQuantizer,
+)
+
+def export_int8_quantize_conv2d_model(model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoQInt8Conv2dModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    # Quantize model
+    model = torch.export.export(model, example_inputs).module()
+    quantizer = XNNPACKQuantizer()
+    operator_config = get_symmetric_quantization_config(
+        is_per_channel=False,
+        is_dynamic=False
+    )
+
+    quantizer.set_global(operator_config)
+    quantize_model = prepare_pt2e(model, quantizer)
+    quantize_model(*example_inputs)
+    quantize_model = convert_pt2e(quantize_model)
+
+    # Export model
+    exported_program = torch.export.export(quantize_model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_int8_quantize_conv2d_model("qint8_conv2d_pqs8_qc8w_gemm")
+
+
+def export_pointwise_model(model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoConv2dModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+
+export_pointwise_model("pointwise_conv2d_pf32_gemm")
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-linear-model.py b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-linear-model.py
new file mode 100644
index 0000000000..0f78dab2cc
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-linear-model.py
@@ -0,0 +1,145 @@
+
+import os
+
+import torch
+import torch.nn as nn
+
+# Ensure the output directory for the exported files exists
+os.makedirs("model", exist_ok=True)
+
+class DemoLinearModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(256, 256)
+
+    def forward(self, x):
+        y = self.linear(x)
+        return y
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 256, dtype=dtype),)
+
+
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType
+from executorch.exir import to_edge_transform_and_lower
+
+def export_executorch_model(dtype: torch.dtype, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoLinearModel().eval().to(dtype)
+    example_inputs = model.get_example_inputs(dtype)
+
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_executorch_model(torch.float16, "linear_model_pf16_gemm")
+export_executorch_model(torch.float32, "linear_model_pf32_gemm")
+
+
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+    XNNPACKQuantizer,
+)
+
+def export_int8_quantize_model(dynamic: bool, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoLinearModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    # Quantize model
+    model = torch.export.export(model, example_inputs).module()
+    quantizer = XNNPACKQuantizer()
+    operator_config = get_symmetric_quantization_config(
+        is_per_channel=True,
+        is_dynamic=dynamic
+    )
+
+    quantizer.set_global(operator_config)
+    quantize_model = prepare_pt2e(model, quantizer)
+    quantize_model(*example_inputs)
+    quantize_model = convert_pt2e(quantize_model)
+
+    # Export model
+    exported_program = torch.export.export(quantize_model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_int8_quantize_model(False, "linear_model_pqs8_qc8w_gemm")
+export_int8_quantize_model(True, "linear_model_qp8_f32_qc8w_gemm")
+
+
+from torchao.quantization.granularity import PerGroup, PerAxis
+from torchao.quantization.quant_api import (
+    IntxWeightOnlyConfig,
+    Int8DynamicActivationIntxWeightConfig,
+    quantize_,
+)
+
+def export_int4_quantize_model(dynamic: bool, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoLinearModel().eval().to(torch.float32)
+    example_inputs = model.get_example_inputs(torch.float32)
+
+    # Quantize model: INT8 dynamic activations with blockwise INT4 weights
+    linear_config = Int8DynamicActivationIntxWeightConfig(
+        weight_dtype=torch.int4,
+        weight_granularity=PerGroup(32),
+    )
+
+    quantize_(model, linear_config)
+
+    # Export model
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_int4_quantize_model(False, "linear_model_qp8_f32_qb4w_gemm")
+
+
+
diff --git a/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-matrix-mul.py b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-matrix-mul.py
new file mode 100644
index 0000000000..19eab1b356
--- /dev/null
+++ b/content/learning-paths/mobile-graphics-and-gaming/measure-kleidiai-kernel-performance-on-executorch/export-matrix-mul.py
@@ -0,0 +1,44 @@
+
+import os
+
+import torch
+import torch.nn as nn
+
+# Ensure the output directory for the exported files exists
+os.makedirs("model", exist_ok=True)
+
+class DemoBatchMatMulModel(nn.Module):
+    def forward(self, x, y):
+        return torch.bmm(x, y)
+
+    def get_example_inputs(self, dtype=torch.float32):
+        return (torch.randn(1, 256, 256, dtype=dtype), torch.randn(1, 256, 256, dtype=dtype))
+
+
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType
+from executorch.exir import to_edge_transform_and_lower
+
+def export_matrix_mul_model(dtype: torch.dtype, model_name: str):
+    mode_file_name = "model/" + model_name
+    pte_file = mode_file_name + ".pte"
+    etr_file = mode_file_name + ".etrecord"
+
+    model = DemoBatchMatMulModel().eval().to(dtype)
+    example_inputs = model.get_example_inputs(dtype)
+
+    exported_program = torch.export.export(model, example_inputs)
+
+    partitioner = XnnpackPartitioner()
+    edge_program = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        generate_etrecord=True
+    )
+
+    et_program = edge_program.to_executorch()
+    with open(pte_file, "wb") as f:
+        f.write(et_program.buffer)
+
+    # Get and save ETRecord
+    etrecord = et_program.get_etrecord()
+    etrecord.save(etr_file)
+
+export_matrix_mul_model(torch.float16, "matrix_mul_pf16_gemm")
+export_matrix_mul_model(torch.float32, "matrix_mul_pf32_gemm")