This repository was archived by the owner on May 12, 2024. It is now read-only.

Commit 7b55f61

Added Unity Barracuda support option for GatherND (gather_nd)
1 parent 37297b9 commit 7b55f61

File tree

4 files changed (+121 lines, -16 lines)

README.md

Lines changed: 4 additions & 0 deletions
@@ -350,6 +350,7 @@ usage: tflite2tensorflow
        [--optimizing_for_edgetpu]
        [--replace_prelu_and_minmax]
        [--disable_experimental_new_quantizer]
+       [--optimizing_barracuda]
        [--locationids_of_the_terminating_output]

 optional arguments:
@@ -455,6 +456,9 @@ optional arguments:
   --disable_experimental_new_quantizer
         Disable MLIR's new quantization feature during INT8 quantization
         in TensorFlowLite.
+  --optimizing_barracuda
+        Generates ONNX by replacing Barracuda's unsupported layers
+        with standard layers. For example, GatherND.
   --locationids_of_the_terminating_output
         A comma-separated list of LocationIDs to be used as output layers.
         e.g. --locationids_of_the_terminating_output 100,201,560
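
For readers unfamiliar with the replacement, the idea behind --optimizing_barracuda can be sketched with plain TF ops. This is a minimal, hedged illustration only; the shapes are made up and the code is not taken from this commit, but it mirrors the flat-index trick that the commit's barracuda_gather_nd helper implements with Reshape, Mul, ReduceSum, and Gather.

    import tensorflow as tf

    # Illustrative shapes only: a [48, 48, 32] feature map and [8, 8, 2] integer (Y, X) indices.
    params = tf.random.normal([48, 48, 32])
    indices = tf.random.uniform([8, 8, 2], minval=0, maxval=48, dtype=tf.int32)

    # GatherND rewritten with simpler ops: flatten the two leading axes of params,
    # turn each (Y, X) pair into the flat offset Y*48 + X, then use a plain Gather.
    params_flat = tf.reshape(params, [-1, 32])         # [2304, 32]
    flat_idx = indices[..., 0] * 48 + indices[..., 1]  # [8, 8]
    replaced = tf.gather(params_flat, flat_idx)        # [8, 8, 32]

    reference = tf.gather_nd(params, indices)          # [8, 8, 32], same values as `replaced`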

setup.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 setup(
     name="tflite2tensorflow",
     scripts=scripts,
-    version="1.20.6",
+    version="1.20.7",
     description="Generate saved_model, tfjs, tf-trt, EdgeTPU, CoreML, quantized tflite, ONNX, OpenVINO, Myriad Inference Engine blob and .pb from .tflite.",
     long_description=long_description,
     long_description_content_type="text/markdown",

tflite2tensorflow/mediapipeCustomOp.py

Lines changed: 72 additions & 7 deletions
@@ -24,9 +24,35 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

+import sys
 import tensorflow.compat.v1 as tf
 import numpy as np

+class Color:
+    BLACK = '\033[30m'
+    RED = '\033[31m'
+    GREEN = '\033[32m'
+    YELLOW = '\033[33m'
+    BLUE = '\033[34m'
+    MAGENTA = '\033[35m'
+    CYAN = '\033[36m'
+    WHITE = '\033[37m'
+    COLOR_DEFAULT = '\033[39m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+    INVISIBLE = '\033[08m'
+    REVERCE = '\033[07m'
+    BG_BLACK = '\033[40m'
+    BG_RED = '\033[41m'
+    BG_GREEN = '\033[42m'
+    BG_YELLOW = '\033[43m'
+    BG_BLUE = '\033[44m'
+    BG_MAGENTA = '\033[45m'
+    BG_CYAN = '\033[46m'
+    BG_WHITE = '\033[47m'
+    BG_DEFAULT = '\033[49m'
+    RESET = '\033[0m'
+
 #Affine transform points
 def TransformLandmarks(operator, custom_options, tensors, interpreter, landmarks2d=None, mat=None):
     if landmarks2d is None:
@@ -46,7 +72,7 @@ def TransformLandmarks(operator, custom_options, tensors, interpreter, landmarks
     return landmarks2d_transformed

 #Affine transform images using bilinear interpolation
-def TransformTensorBilinear(operator, custom_options, tensors, interpreter, features=None, mat=None):
+def TransformTensorBilinear(operator, custom_options, tensors, interpreter, optimizing_barracuda, features=None, mat=None):
     if features is None:
         features = tensors[operator['inputs'][0]] #float32 [b,48,48,32] feature maps
     if mat is None:
@@ -102,11 +128,46 @@ def TransformTensorBilinear(operator, custom_options, tensors, interpreter, feat
     in_coord_floor = tf.concat([in_coord_floor[:,:,:,1:2], in_coord_floor[:,:,:,0:1]], axis=3) #[b,h,w,YX]
     in_coord_ceil_ = tf.concat([in_coord_ceil_[:,:,:,1:2], in_coord_ceil_[:,:,:,0:1]], axis=3) #[b,h,w,YX]

+    def barracuda_gather_nd(params, indices):
+        if len(indices.shape) == 4 and indices.shape[0] == 1:
+            indices = indices[0]
+        elif len(indices.shape) == 3:
+            pass
+        else:
+            print(f'{Color.RED}ERROR:{Color.RESET} gather_nd when optimizing_barracuda is enabled must have 4 dimensions and batch size = 1 or 3 dimensions.')
+            print(f'{Color.RED}ERROR:{Color.RESET} params.shape: {params.shape}, indices.shape: {indices.shape}')
+            sys.exit(-1)
+        if len(params.shape) == 4 and params.shape[0] == 1:
+            params = params[0]
+        elif len(params.shape) == 3:
+            pass
+        else:
+            print(f'{Color.RED}ERROR:{Color.RESET} gather_nd when optimizing_barracuda is enabled must have 4 dimensions and batch size = 1 or 3 dimensions.')
+            print(f'{Color.RED}ERROR:{Color.RESET} params.shape: {params.shape}, indices.shape: {indices.shape}')
+            sys.exit(-1)
+        idx_shape = indices.shape
+        params_shape = params.shape
+        idx_dims = idx_shape[-1]
+        gather_shape = params_shape[idx_dims:]
+        params_flat = tf.reshape(params, tf.concat([[-1], gather_shape], axis=0))
+        axis_step = tf.math.cumprod(params_shape[:idx_dims], exclusive=True, reverse=True)
+        mul = tf.math.multiply(indices, axis_step)
+        indices_flat = tf.reduce_sum(mul, axis=-1)
+        result_flat = tf.gather(params_flat, indices_flat)
+        return tf.expand_dims(tf.reshape(result_flat, tf.concat([idx_shape[:-1], gather_shape], axis=0)), axis=0)
+
     # calc final pixel value
-    value_floor = tf.gather_nd(params=features, indices=in_coord_floor, batch_dims=1) #[b,h,w,32]
-    value_ceilX = tf.gather_nd(params=features, indices=in_coord_ceilX, batch_dims=1) #[b,h,w,32]
-    value_ceilY = tf.gather_nd(params=features, indices=in_coord_ceilY, batch_dims=1) #[b,h,w,32]
-    value_ceil_ = tf.gather_nd(params=features, indices=in_coord_ceil_, batch_dims=1) #[b,h,w,32]
+    if not optimizing_barracuda:
+        value_floor = tf.gather_nd(params=features, indices=in_coord_floor, batch_dims=1) #[b,h,w,32]
+        value_ceilX = tf.gather_nd(params=features, indices=in_coord_ceilX, batch_dims=1) #[b,h,w,32]
+        value_ceilY = tf.gather_nd(params=features, indices=in_coord_ceilY, batch_dims=1) #[b,h,w,32]
+        value_ceil_ = tf.gather_nd(params=features, indices=in_coord_ceil_, batch_dims=1) #[b,h,w,32]
+    else:
+        value_floor = barracuda_gather_nd(params=features, indices=in_coord_floor) #[b,h,w,32]
+        value_ceilX = barracuda_gather_nd(params=features, indices=in_coord_ceilX) #[b,h,w,32]
+        value_ceilY = barracuda_gather_nd(params=features, indices=in_coord_ceilY) #[b,h,w,32]
+        value_ceil_ = barracuda_gather_nd(params=features, indices=in_coord_ceil_) #[b,h,w,32]
+
     value_floor_fraction = tf.multiply(value_floor, weight_floor)
     value_ceil__fraction = tf.multiply(value_ceil_, weight_ceil_)
     value_ceilX_fraction = tf.multiply(value_ceilX, weight_ceilX)
@@ -132,8 +193,12 @@ def Landmarks2TransformMatrix(operator, custom_options, tensors, interpreter, la
     ######################################
     # calc rotation
     ######################################
-    rot90_t = tf.constant([[ 0.0, 1.0],
-                           [ -1.0, 0.0]]) #[2,2], already transposed
+    rot90_t = tf.constant(
+        [
+            [ 0.0, 1.0],
+            [ -1.0, 0.0]
+        ]
+    ) #[2,2], already transposed

     idx_rot_l = custom_options['left_rotation_idx']
     idx_rot_r = custom_options['right_rotation_idx']
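
The least obvious step in the barracuda_gather_nd helper above is the stride computation. A small sketch (values are illustrative, run eagerly with TF 2.x) of what the exclusive, reversed cumprod produces for a [48, 48, 32] params tensor and 2-component (Y, X) indices:

    import tensorflow as tf

    # Leading dims that get flattened when idx_dims == 2 and params is [48, 48, 32].
    leading = tf.constant([48, 48])

    # exclusive=True, reverse=True gives the row-major strides [48, 1], so the flat
    # offset is Y*48 + X*1, i.e. an index into tf.reshape(params, [-1, 32]).
    strides = tf.math.cumprod(leading, exclusive=True, reverse=True)
    print(strides.numpy())                               # [48  1]

    yx = tf.constant([[3, 7]])                           # a single (Y, X) pair
    print(tf.reduce_sum(yx * strides, axis=-1).numpy())  # [151] == 3*48 + 7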

tflite2tensorflow/tflite2tensorflow.py

Lines changed: 44 additions & 8 deletions
@@ -418,6 +418,7 @@ def make_graph(
     optimizing_for_openvino_and_myriad,
     rigorous_optimization_for_myriad,
     optimizing_for_coreml,
+    optimizing_barracuda,
 ):

     import tensorflow.compat.v1 as tf
@@ -3455,11 +3456,44 @@ def pad_v2(x, paddings, constant_values):
             except:
                 input_tensor2 = interpreter.get_tensor(positions_detail['index'])
             output_detail = interpreter._get_tensor_details(op['outputs'][0])
-            output_tensor = tf.gather_nd(
-                input_tensor1,
-                input_tensor2,
-                name=get_op_name(output_detail['name'])
-            )
+
+            def barracuda_gather_nd(params, indices):
+                if len(indices.shape) == 4 and indices.shape[0] == 1:
+                    indices = indices[0]
+                elif len(indices.shape) == 3:
+                    pass
+                else:
+                    print(f'{Color.RED}ERROR:{Color.RESET} gather_nd when optimizing_barracuda is enabled must have 4 dimensions and batch size = 1 or 3 dimensions.')
+                    print(f'{Color.RED}ERROR:{Color.RESET} params.shape: {params.shape}, indices.shape: {indices.shape}')
+                    sys.exit(-1)
+                if len(params.shape) == 4 and params.shape[0] == 1:
+                    params = params[0]
+                elif len(params.shape) == 3:
+                    pass
+                else:
+                    print(f'{Color.RED}ERROR:{Color.RESET} gather_nd when optimizing_barracuda is enabled must have 4 dimensions and batch size = 1 or 3 dimensions.')
+                    print(f'{Color.RED}ERROR:{Color.RESET} params.shape: {params.shape}, indices.shape: {indices.shape}')
+                    sys.exit(-1)
+                idx_shape = indices.shape
+                params_shape = params.shape
+                idx_dims = idx_shape[-1]
+                gather_shape = params_shape[idx_dims:]
+                params_flat = tf.reshape(params, tf.concat([[-1], gather_shape], axis=0))
+                axis_step = tf.math.cumprod(params_shape[:idx_dims], exclusive=True, reverse=True)
+                mul = tf.math.multiply(indices, axis_step)
+                indices_flat = tf.reduce_sum(mul, axis=-1)
+                result_flat = tf.gather(params_flat, indices_flat)
+                return tf.expand_dims(tf.reshape(result_flat, tf.concat([idx_shape[:-1], gather_shape], axis=0)), axis=0)
+
+            if not optimizing_barracuda:
+                output_tensor = tf.gather_nd(
+                    input_tensor1,
+                    input_tensor2,
+                    name=get_op_name(output_detail['name'])
+                )
+            else:
+                output_tensor = barracuda_gather_nd(input_tensor1, input_tensor2)
+
             tensors[output_detail['index']] = output_tensor

         elif op_type == 'COS':
@@ -5081,7 +5115,6 @@ def complexabs_(x, tout):
             )
             tensors[output_detail['index']] = output_tensor

-
         # MediaPipe v0.8.9
         elif custom_op_type == 'Landmarks2TransformMatrix':
             options = op['custom_options']
@@ -5093,7 +5126,7 @@ def complexabs_(x, tout):
             options = op['custom_options']
             custom_options = read_flexbuffer(np.array(options, dtype=np.uint8).tobytes())
             output_detail = interpreter._get_tensor_details(op['outputs'][0])
-            tensors[output_detail['index']] = TransformTensorBilinear(op, custom_options, tensors, interpreter)
+            tensors[output_detail['index']] = TransformTensorBilinear(op, custom_options, tensors, interpreter, optimizing_barracuda)

         elif custom_op_type == 'TransformLandmarks':
             custom_options = None
@@ -5642,6 +5675,7 @@ def main():
     parser.add_argument('--optimizing_for_edgetpu', action='store_true', help='Optimizing for edgetpu')
     parser.add_argument('--replace_prelu_and_minmax', action='store_true', help='Replace prelu and minimum/maximum with each other')
     parser.add_argument('--disable_experimental_new_quantizer', action='store_true', help='Disable MLIR\'s new quantization feature during INT8 quantization in TensorFlowLite.')
+    parser.add_argument('--optimizing_barracuda', action='store_true', help='Generates ONNX by replacing Barracuda\'s unsupported layers with standard layers.')
     parser.add_argument('--locationids_of_the_terminating_output', type=str, default='', help='A comma-separated list of location IDs to be used as output layers. Default: \'\'')
     args = parser.parse_args()

@@ -5691,6 +5725,7 @@ def main():
     optimizing_for_edgetpu = args.optimizing_for_edgetpu
     replace_prelu_and_minmax = args.replace_prelu_and_minmax
     use_experimental_new_quantizer = not args.disable_experimental_new_quantizer
+    optimizing_barracuda = args.optimizing_barracuda
     locationids_of_the_terminating_output_tmp = args.locationids_of_the_terminating_output
     locationids_of_the_terminating_output = None
     if locationids_of_the_terminating_output_tmp:
@@ -5844,7 +5879,8 @@ def main():
         optimizing_for_edgetpu_flg,
         optimizing_for_openvino_and_myriad,
         rigorous_optimization_for_myriad,
-        optimizing_for_coreml
+        optimizing_for_coreml,
+        optimizing_barracuda
     )
     print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
     print('outputs:')
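
As a quick sanity check of the replacement logic used for GATHER_ND, the steps of barracuda_gather_nd can be restated in a few self-contained lines and compared against tf.gather_nd with batch_dims=1. This is a condensed, hedged sketch with made-up shapes (batch size 1), not code from the repository:

    import numpy as np
    import tensorflow as tf

    feat = tf.random.normal([1, 48, 48, 32])                                    # [b, h, w, c], b == 1
    idx = tf.random.uniform([1, 8, 8, 2], minval=0, maxval=48, dtype=tf.int32)  # [b, h, w, (Y, X)]

    # Condensed restatement of the barracuda_gather_nd steps shown in the diff above.
    p, i = feat[0], idx[0]                                                # drop the unit batch dim
    params_flat = tf.reshape(p, [-1, p.shape[-1]])                        # [48*48, 32]
    strides = tf.math.cumprod(p.shape[:2], exclusive=True, reverse=True)  # [48, 1]
    flat_idx = tf.reduce_sum(i * strides, axis=-1)                        # [8, 8]
    alt = tf.expand_dims(tf.gather(params_flat, flat_idx), axis=0)        # [1, 8, 8, 32]

    ref = tf.gather_nd(params=feat, indices=idx, batch_dims=1)            # [1, 8, 8, 32]
    print(np.allclose(ref.numpy(), alt.numpy()))                          # expected: True

The Gather/Reshape/Mul/ReduceSum form avoids the GatherND layer that the commit describes as unsupported by Barracuda, at the cost of assuming a unit batch dimension (which the helper's error messages enforce).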

0 commit comments
