From 0e0a3b990cd8720cf8add5f6441f2884d8ba53c8 Mon Sep 17 00:00:00 2001 From: TFLM-bot Date: Tue, 14 Oct 2025 14:03:04 +0000 Subject: [PATCH] Sync from upstream TF. --- .../kernels/internal/portable_tensor_utils.cc | 98 ++++++++++++++++--- .../kernels/internal/portable_tensor_utils.h | 33 +++++-- tensorflow/lite/tools/visualize.py | 2 +- 3 files changed, 111 insertions(+), 22 deletions(-) diff --git a/tensorflow/lite/kernels/internal/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/portable_tensor_utils.cc index 0928d4b0d0d..efc6ba5a9c0 100644 --- a/tensorflow/lite/kernels/internal/portable_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/portable_tensor_utils.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/portable_tensor_utils.h" #include +#include #include #include @@ -92,23 +93,90 @@ void UnpackDenseInt4IntoInt8(const int8_t* src_buffer, int num_elements, } } -void PackInt8IntoDenseInt4(const int8_t* src_buffer, int num_elements, - int8_t* dst_buffer) { - // num_elements means the number of elements regardless of packed or unpacked. - // For example, 3 elements means both - // 1) Packed: 3 int4's = 12 bit -> 16 bits (padded) = 2 bytes. - // stored in src_buffer[0] and src_buffer[1] (i = 0..1) - // 2) Unpacked: 3 int8's = 3 bytes. - // stored in dst_buffer[0], dst_buffer[1] and dst_buffer[2] (j = 0..2) - for (int i = 0; i < num_elements - 1; i += 2) { - dst_buffer[i / 2] = src_buffer[i] & 0x0F; - dst_buffer[i / 2] |= src_buffer[i + 1] << 4; +void UnpackPackedIntToInt8(const int8_t* src_buffer, int num_elements, + int bit_width, int8_t* dst_buffer) { + assert(bit_width == 2 || bit_width == 4); + if (bit_width == 4) { + // num_elements means the number of elements regardless of packed or + // unpacked. For example, 3 elements means both + // 1) Packed: 3 int4's = 12 bit -> 16 bits (padded) = 2 bytes. + // stored in src_buffer[0] and src_buffer[1] (i = 0..1) + // 2) Unpacked: 3 int8's = 3 bytes. + //. 
stored in dst_buffer[0], dst_buffer[1] and dst_buffer[2] (j = 0..2) + for (int i = 0; i < num_elements / 2; i++) { + int8_t byte = src_buffer[i]; + // Shift left first so that sign is properly extended when shifted right + int8_t lower = static_cast(byte << 4) >> 4; + int8_t higher = byte >> 4; + dst_buffer[2 * i] = lower; + dst_buffer[2 * i + 1] = higher; + } + + // If the buffer size is odd, extract the final lower nibble. + if (num_elements % 2 != 0) { + dst_buffer[num_elements - 1] = + static_cast(src_buffer[num_elements / 2] << 4) >> 4; + } + } else if (bit_width == 2) { + for (int i = 0; i < num_elements / 4; i++) { + int8_t byte = src_buffer[i]; + // Shift left first so that sign is properly extended when shifted right + int8_t val1 = static_cast(byte << 6) >> 6; + int8_t val2 = static_cast((byte << 4) & 0xFF) >> 6; + int8_t val3 = static_cast((byte << 2) & 0xFF) >> 6; + int8_t val4 = byte >> 6; + dst_buffer[4 * i] = val1; + dst_buffer[4 * i + 1] = val2; + dst_buffer[4 * i + 2] = val3; + dst_buffer[4 * i + 3] = val4; + } + + // Handle the remaining elements. + int remaining_elements = num_elements % 4; + if (remaining_elements > 0) { + int8_t byte = src_buffer[num_elements / 4]; + for (int i = 0; i < remaining_elements; i++) { + dst_buffer[num_elements - remaining_elements + i] = + static_cast((byte << (6 - 2 * i)) & 0xFF) >> 6; + } + } } - auto packed_size = (num_elements + 1) / 2; +} - // Copy the final nibble if the buffer is odd-lengthed - if (num_elements % 2 != 0) { - dst_buffer[packed_size - 1] = src_buffer[num_elements - 1] & 0x0F; +void PackInt8IntoDenseInt(const int8_t* src_buffer, int num_elements, + int bit_width, int8_t* dst_buffer) { + assert(bit_width == 2 || bit_width == 4); + if (bit_width == 4) { + // num_elements means the number of elements regardless of packed or + // unpacked. For example, 3 elements means both + // 1) Unpacked: 3 int8's = 3 bytes. 
+ // stored in src_buffer[0], src_buffer[1] and src_buffer[2] (j = 0..2) + // 2) Packed: 3 int4's = 12 bit -> 16 bits (padded) = 2 bytes. + // stored in dst_buffer[0] and dst_buffer[1] (i = 0..1) + for (int i = 0; i < num_elements / 2; ++i) { + dst_buffer[i] = (src_buffer[2 * i] & 0x0F) | (src_buffer[2 * i + 1] << 4); + } + // If the buffer size is odd, pack the final nibble. + if (num_elements % 2 != 0) { + dst_buffer[num_elements / 2] = src_buffer[num_elements - 1] & 0x0F; + } + } else if (bit_width == 2) { + for (int i = 0; i < num_elements / 4; ++i) { + dst_buffer[i] = (src_buffer[4 * i] & 0x03) | + ((src_buffer[4 * i + 1] & 0x03) << 2) | + ((src_buffer[4 * i + 2] & 0x03) << 4) | + ((src_buffer[4 * i + 3] & 0x03) << 6); + } + // Handle the remaining elements. + int remaining_elements = num_elements % 4; + if (remaining_elements > 0) { + int8_t packed_val = 0; + for (int i = 0; i < remaining_elements; ++i) { + packed_val |= (src_buffer[num_elements - remaining_elements + i] & 0x03) + << (i * 2); + } + dst_buffer[num_elements / 4] = packed_val; + } } } diff --git a/tensorflow/lite/kernels/internal/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/portable_tensor_utils.h index a361a2d0e5d..c70ac94db5f 100644 --- a/tensorflow/lite/kernels/internal/portable_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/portable_tensor_utils.h @@ -618,20 +618,41 @@ void ApplySignbitToVector(const float* __restrict__ vector, int v_size, void UnpackDenseInt4IntoInt8(const int8_t* src_buffer, int num_elements, int8_t* dst_buffer); -// Pack `src_buffer` into a densely packed buffer of int4 values. +// Unpack or inflate `src_buffer` by taking each byte and splitting it into +// multiple elements into `dst_buffer`. Supports 2-bit and 4-bit packed integers // Parameters: -// src_buffer : Buffer containing int4 values stored in int8 memory. +// src_buffer : Densely packed buffer containing int2 or int4 values. 
+// num_elements : Number of unpacked elements to be read from the buffer. +// This should be equal to the size of `dst_buffer`. +// bit_width : The bit width of the packed elements (either 2 or 4). +// dst_buffer : Buffer to unpack into. Should be allocated by the caller. +// Size should be at least `num_elements`. +// Notes: +// For 4-bit unpacking: e.g., `src_buffer = {0x12, 0x34};` (num_elements = 4) +// will return `dst_buffer = {0x02, 0x01, 0x04, 0x03}`. +// For 2-bit unpacking: e.g., `src_buffer = {0x12};` (num_elements = 4) +// will return `dst_buffer = {0xFE, 0x00, 0x01, 0x00}` (sign extended). +void UnpackPackedIntToInt8(const int8_t* src_buffer, int num_elements, + int bit_width, int8_t* dst_buffer); + +// Pack `src_buffer` into a densely packed buffer of int2 or int4 values. +// Parameters: +// src_buffer : Buffer containing int2 or int4 values stored in int8 +// memory. // num_elements : Number of elements stored in the buffer. Note that this can // be smaller than the size of `src_buffer` by 1 if it's odd, // in which case the last nibble in `src_buffer` is ignored. // This should be equal to the size of `dst_buffer`. +// bit_width : The bit width of the packed elements (either 2 or 4). // dst_buffer : Buffer to pack into. Should be allocated by the caller. // Size should be at least `num_elements`. // Notes: -// For example, given `src_buffer = {0x02, 0x01, 0x04, 0x03}`, calling this -// function will return `dst_buffer = {0x12, 0x34}`. -void PackInt8IntoDenseInt4(const int8_t* src_buffer, int num_elements, - int8_t* dst_buffer); +// For 4-bit packing: e.g., given `src_buffer = {0x02, 0x01, 0x04, 0x03}`, +// calling this function will return `dst_buffer = {0x12, 0x34}`. +// For 2-bit packing: e.g., given `src_buffer = {0x00, 0x01, 0x00, 0x02}`, +// calling this function will return `dst_buffer = {0x84}`. 
+void PackInt8IntoDenseInt(const int8_t* src_buffer, int num_elements, + int bit_width, int8_t* dst_buffer); } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/lite/tools/visualize.py b/tensorflow/lite/tools/visualize.py index de7ef820079..cd4bcfa7aaf 100644 --- a/tensorflow/lite/tools/visualize.py +++ b/tensorflow/lite/tools/visualize.py @@ -33,7 +33,7 @@ from tflite_micro.tensorflow.lite.python import schema_py_generated as schema_fb else: # This file is part of tflite_runtime package. - from tflite_runtime import schema_py_generated as schema_fb + from tflite_micro.tensorflow.lite_runtime import schema_py_generated as schema_fb # A CSS description for making the visualizer _CSS = """