From 73a2d35c4f4e11d44896850fb49df0ed107df9e3 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Sat, 15 Jun 2024 09:37:00 -0700 Subject: [PATCH 1/5] Rem runner loops; use OpenCV --- objectdetection_coral_multitpu.py | 46 ++- options.py | 216 +++++------ requirements.txt | 5 +- segment_and_test.py | 187 ++++++---- tpu_runner.py | 595 ++++++++++++++++-------------- 5 files changed, 583 insertions(+), 466 deletions(-) diff --git a/objectdetection_coral_multitpu.py b/objectdetection_coral_multitpu.py index e7bcb4b..96f708d 100644 --- a/objectdetection_coral_multitpu.py +++ b/objectdetection_coral_multitpu.py @@ -42,6 +42,7 @@ import os import threading import time +import cv2 #import tracemalloc from PIL import Image @@ -218,7 +219,8 @@ def main(): options.downsample_by = 100 options.label_file = args.labels - image = Image.open(args.input) + #image = Image.open(args.input) + image = cv2.imread(args.input, cv2.IMREAD_COLOR) init_detect(options, args.num_tpus) print('----INFERENCE TIME----') @@ -229,11 +231,15 @@ def main(): thread_cnt = 16 tot_infr_time = 0 + q_infr_count = 0 + q_wall_start = None + half_wall_start = None half_infr_count = 0 + start = time.perf_counter() + if args.count > 1: with concurrent.futures.ThreadPoolExecutor(max_workers=thread_cnt) as executor: - start = time.perf_counter() for chunk_i in range(0, args.count-1, thread_cnt*8): fs = [executor.submit(_tpu_runner.process_image, options, copy.copy(image), args.threshold) for i in range(min(thread_cnt*8, args.count-1 - chunk_i))] @@ -241,18 +247,16 @@ def main(): _, infr_time, _ = f.result() tot_infr_time += infr_time - # Start a timer for the last ~half of the run for more accurate benchmark - if chunk_i > (args.count-1) / 2.0: - half_infr_count += 1 - if half_wall_start is None: - half_wall_start = time.perf_counter() + # Start a timer for the last ~quarter of the run for more accurate benchmark + if chunk_i > (args.count-1) * 3.0 / 4.0: + q_infr_count += 1 + if q_wall_start is None: + q_wall_start = time.perf_counter() # Uncomment for testing # import random # logging.info("Pause") # time.sleep(random.randint(0,INTERPRETER_LIFESPAN_SECONDS*3)) - else: - start = time.perf_counter() # snapshot = tracemalloc.take_snapshot() # top_stats = snapshot.statistics('lineno') @@ -262,20 +266,27 @@ def main(): start_one = time.perf_counter() objs, infr_time, _ = _tpu_runner.process_image(options, copy.copy(image), args.threshold) tot_infr_time += infr_time - half_infr_count += 1 + q_infr_count += 1 wall_time = time.perf_counter() - start - half_wall_time = 0.0 - if half_wall_start is not None: - half_wall_time = time.perf_counter() - half_wall_start + q_wall_time = 0.0 + mpps = 0.0 + if q_wall_start is not None: + q_wall_time = time.perf_counter() - q_wall_start + + mpps = (_tpu_runner.input_details['shape'][1] - options.tile_overlap) \ + * (_tpu_runner.input_details['shape'][2] - options.tile_overlap) \ + * q_infr_count \ + / (q_wall_time * 1000000) logging.info('completed one run every %.2fms for %d runs; %.2fms wall time for a single run' % (wall_time * 1000 / args.count, args.count, (time.perf_counter() - start_one) * 1000)) - logging.info('%.2fms avg time blocked across %d threads; %.3fms ea for final %d inferences' % + logging.info('%.2fms avg time blocked across %d threads; %.2fms ea for final %d inferences; %.2f tensor MPx / sec' % (tot_infr_time / args.count, thread_cnt, - half_wall_time * 1000 / half_infr_count, half_infr_count)) + q_wall_time * 1000 / q_infr_count, q_infr_count, + mpps)) 
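A minimal, self-contained sketch of the benchmark arithmetic above (the helper names `tensor_mpx_per_sec` and `timed_run` are illustrative, not part of this patch): only the final quarter of iterations is timed, so warm-up and pipeline-fill costs drop out, and throughput is reported as tensor megapixels per second based on the model's input tensor dimensions minus the tile overlap. Note that `cv2.imread` returns a BGR array, so any later hand-off to PIL for drawing should go through `cv2.cvtColor(image, cv2.COLOR_BGR2RGB)`.

```python
import time

def tensor_mpx_per_sec(tensor_h, tensor_w, tile_overlap, inference_count, wall_secs):
    # Effective tensor megapixels pushed through the TPUs per second,
    # mirroring the formula logged by the benchmark above.
    if wall_secs <= 0.0 or inference_count == 0:
        return 0.0
    return ((tensor_h - tile_overlap) * (tensor_w - tile_overlap)
            * inference_count / (wall_secs * 1_000_000))

def timed_run(run_once, total_count):
    # Run `run_once` total_count times; time the whole run and, separately,
    # only the final quarter of iterations for a steadier throughput figure.
    q_start, q_count = None, 0
    start = time.perf_counter()
    for i in range(total_count):
        run_once()
        if i > (total_count - 1) * 3.0 / 4.0:
            if q_start is None:
                q_start = time.perf_counter()
            q_count += 1
    wall_secs = time.perf_counter() - start
    q_secs = (time.perf_counter() - q_start) if q_start is not None else 0.0
    return wall_secs, q_secs, q_count
```

As a worked example, a 448x448 input tensor with no overlap and 1000 timed inferences over a 20 s tail works out to roughly 10 tensor MPx/sec.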
logging.info('-------RESULTS--------') if not objs: @@ -290,10 +301,11 @@ def main(): logging.info(f' bbox: {obj.bbox}') if args.output: - image = image.convert('RGB') + # image = image.convert('RGB') + image = Image.fromarray(image) draw_objects(ImageDraw.Draw(image), objs, _tpu_runner.labels) image.save(args.output, subsampling=2, quality=95) - #image.show() + image.show() if __name__ == '__main__': diff --git a/options.py b/options.py index aefb7c5..5db3823 100644 --- a/options.py +++ b/options.py @@ -14,137 +14,145 @@ def __init__(self, model_name: str, model_name_pattern: str, std_model_name: str self.labels_name = labels_name self.MODEL_SEGMENTS = { - 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq': { - # 104.2 ms per inference + 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq': { + # 176.2 ms per inference (5.7 FPS) for 1 TPUs using 1 segment + # 97.8 ms per inference (10.2 FPS) for 2 TPUs using 2 segments + # 66.2 ms per inference (15.1 FPS) for 3 TPUs using 2 segments + # 48.8 ms per inference (20.5 FPS) for 4 TPUs using 1 segment + # 37.4 ms per inference (26.8 FPS) for 5 TPUs using 2 segments 2: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], - # 67.5 ms per inference - 3: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'], - # 49.1 ms per inference - 4: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], - # 43.5 ms per inference - 5: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'], - # 37.0 ms per inference - 6: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'], - # 31.1 ms per inference - 7: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], - # 27.1 ms per inference - 8: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'], + 3: ['dumb_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'dumb_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], + 5: ['2x_last_seg_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], }, 'efficientdet_lite2_448_ptq': { - # 32.1 ms per inference - 2: ['all_segments_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], - # 19.5 ms per inference - 3: 
['166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], - # 16.5 ms per inference - 4: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_3_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_3_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_2_of_3_edgetpu.tflite'], - # 13.6 ms per inference - 5: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], - # 11.5 ms per inference - 7: ['166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], - # 11.3 ms per inference - 8: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], + # 60.0 ms per inference (16.7 FPS) for 1 TPUs using 1 segment + # 30.6 ms per inference (32.7 FPS) for 2 TPUs using 1 segment + # 20.4 ms per inference (49.1 FPS) for 3 TPUs using 2 segments + # 17.4 ms per inference (57.4 FPS) for 4 TPUs using 2 segments + # 14.5 ms per inference (68.8 FPS) for 5 TPUs using 2 segments + 3: ['2x_last_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], + 4: ['all_segments_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], + 5: ['166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], }, 'efficientdet_lite3_512_ptq': { - # 20.9 ms per inference - 4: ['15x_last_seg_efficientdet_lite3_512_ptq_segment_0_of_2_edgetpu.tflite', '15x_last_seg_efficientdet_lite3_512_ptq_segment_1_of_2_edgetpu.tflite'], + # 75.7 ms per inference (13.2 FPS) for 1 TPUs using 1 segment + # 38.1 ms per inference (26.2 FPS) for 2 TPUs using 1 segment + # 26.8 ms per inference (37.3 FPS) for 3 TPUs using 1 segment + # 20.7 ms per inference (48.4 FPS) for 4 TPUs using 1 segment + # 18.0 ms per inference (55.5 FPS) for 5 TPUs using 1 segment }, 'efficientdet_lite3x_640_ptq': { - # 95.0 ms per inference - 2: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], - # 70.6 ms per inference - 3: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_2_of_3_edgetpu.tflite'], - # 47.9 ms per inference - 4: ['2x_first_seg_efficientdet_lite3x_640_ptq_segment_0_of_3_edgetpu.tflite', '2x_first_seg_efficientdet_lite3x_640_ptq_segment_1_of_3_edgetpu.tflite', '2x_first_seg_efficientdet_lite3x_640_ptq_segment_2_of_3_edgetpu.tflite'], - # 38.7 ms per inference + # 181.6 ms per inference (5.5 FPS) for 1 TPUs using 1 segment + # 91.5 ms per inference (10.9 FPS) for 2 TPUs using 1 segment + # 62.9 ms per inference (15.9 FPS) for 3 TPUs using 2 segments + # 49.6 ms per inference (20.2 FPS) for 4 TPUs using 1 segment + # 40.4 ms per inference (24.7 FPS) for 5 TPUs using 2 segments + 3: ['2x_last_seg_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], 5: 
['15x_first_seg_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], - # 35.1 ms per inference - 6: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], - # 30.6 ms per inference - 7: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], - # 27.3 ms per inference - 8: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], + }, + 'yolov5s-int8': { + # 36.4 ms per inference (27.5 FPS) for 1 TPUs using 1 segment + # 18.9 ms per inference (53.0 FPS) for 2 TPUs using 1 segment + # 14.4 ms per inference (69.7 FPS) for 3 TPUs using 1 segment + # 11.7 ms per inference (85.4 FPS) for 4 TPUs using 1 segment + # 10.8 ms per inference (92.6 FPS) for 5 TPUs using 1 segment }, 'yolov5m-int8': { - # 56.3 ms per inference - 2: ['all_segments_yolov5m-int8_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], - # 32.2 ms per inference + # 100.3 ms per inference (10.0 FPS) for 1 TPUs using 1 segment + # 50.5 ms per inference (19.8 FPS) for 2 TPUs using 1 segment + # 31.7 ms per inference (31.5 FPS) for 3 TPUs using 2 segments + # 26.0 ms per inference (38.5 FPS) for 4 TPUs using 2 segments + # 20.1 ms per inference (49.9 FPS) for 5 TPUs using 2 segments 3: ['15x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], - # 25.9 ms per inference - 4: ['2x_last_seg_yolov5m-int8_segment_0_of_4_edgetpu.tflite', '2x_last_seg_yolov5m-int8_segment_1_of_4_edgetpu.tflite', '2x_last_seg_yolov5m-int8_segment_2_of_4_edgetpu.tflite', '2x_last_seg_yolov5m-int8_segment_3_of_4_edgetpu.tflite'], - # 21.2 ms per inference - 5: ['all_segments_yolov5m-int8_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], - # 18.8 ms per inference - 6: ['15x_last_seg_yolov5m-int8_segment_0_of_3_edgetpu.tflite', '15x_last_seg_yolov5m-int8_segment_1_of_3_edgetpu.tflite', '15x_last_seg_yolov5m-int8_segment_2_of_3_edgetpu.tflite'], - # 14.7 ms per inference - 7: ['all_segments_yolov5m-int8_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_3_of_4_edgetpu.tflite'], - # 14.6 ms per inference - 8: ['all_segments_yolov5m-int8_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_2_of_3_edgetpu.tflite'], + 4: ['4x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], + 5: ['4x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], }, 'yolov5l-int8': { - # 61.1 ms per inference - 3: ['all_segments_yolov5l-int8_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_3_edgetpu.tflite'], - # 48.0 ms per inference - 4: ['all_segments_yolov5l-int8_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_3_of_4_edgetpu.tflite'], - # 39.0 ms 
per inference - 5: ['all_segments_yolov5l-int8_segment_0_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_3_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_4_of_5_edgetpu.tflite'], - # 31.5 ms per inference - 6: ['all_segments_yolov5l-int8_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_3_edgetpu.tflite'], - # 26.7 ms per inference - 7: ['dumb_yolov5l-int8_segment_0_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_1_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_2_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_3_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_4_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_5_of_6_edgetpu.tflite'], - # 24.4 ms per inference - 8: ['all_segments_yolov5l-int8_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_3_of_4_edgetpu.tflite'], + # 182.8 ms per inference (5.5 FPS) for 1 TPUs using 1 segment + # 85.6 ms per inference (11.7 FPS) for 2 TPUs using 2 segments + # 56.5 ms per inference (17.7 FPS) for 3 TPUs using 2 segments + # 43.8 ms per inference (22.8 FPS) for 4 TPUs using 2 segments + # 34.0 ms per inference (29.4 FPS) for 5 TPUs using 3 segments + 2: ['dumb_yolov5l-int8_segment_0_of_2_edgetpu.tflite', 'dumb_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], + 3: ['2x_last_seg_yolov5l-int8_segment_0_of_2_edgetpu.tflite', '2x_last_seg_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], + 4: ['dumb_yolov5l-int8_segment_0_of_2_edgetpu.tflite', 'dumb_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], + 5: ['3x_first_seg_yolov5l-int8_segment_0_of_3_edgetpu.tflite', '3x_first_seg_yolov5l-int8_segment_1_of_3_edgetpu.tflite', '3x_first_seg_yolov5l-int8_segment_2_of_3_edgetpu.tflite'], }, 'yolov8s_416_640px': { - # 25.6 ms per inference - 3: ['166x_first_seg_yolov8s_416_640px_segment_0_of_2_edgetpu.tflite', '166x_first_seg_yolov8s_416_640px_segment_1_of_2_edgetpu.tflite'], + # 67.5 ms per inference (14.8 FPS) for 1 TPUs using 1 segment + # 34.5 ms per inference (29.0 FPS) for 2 TPUs using 1 segment + # 22.8 ms per inference (43.8 FPS) for 3 TPUs using 1 segment + # 17.0 ms per inference (58.9 FPS) for 4 TPUs using 2 segments + # 13.1 ms per inference (76.1 FPS) for 5 TPUs using 2 segments + 4: ['3x_first_seg_yolov8s_416_640px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov8s_416_640px_segment_1_of_2_edgetpu.tflite'], + 5: ['3x_first_seg_yolov8s_416_640px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov8s_416_640px_segment_1_of_2_edgetpu.tflite'], }, 'yolov8m_416_640px': { - # 114.4 ms per inference + # 272.3 ms per inference (3.7 FPS) for 1 TPUs using 1 segment + # 95.6 ms per inference (10.5 FPS) for 2 TPUs using 2 segments + # 59.5 ms per inference (16.8 FPS) for 3 TPUs using 3 segments + # 43.8 ms per inference (22.8 FPS) for 4 TPUs using 2 segments + # 35.5 ms per inference (28.2 FPS) for 5 TPUs using 2 segments 2: ['all_segments_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], - # 71.9 ms per inference 3: ['all_segments_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'], - # 53.0 ms per inference - 4: 
['2x_first_seg_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'], - # 43.5 ms per inference - 5: ['166x_first_seg_yolov8m_416_640px_segment_0_of_4_edgetpu.tflite', '166x_first_seg_yolov8m_416_640px_segment_1_of_4_edgetpu.tflite', '166x_first_seg_yolov8m_416_640px_segment_2_of_4_edgetpu.tflite', '166x_first_seg_yolov8m_416_640px_segment_3_of_4_edgetpu.tflite'], - # 31.8 ms per inference - 6: ['2x_first_seg_yolov8m_416_640px_segment_0_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_1_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_2_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_3_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_4_of_5_edgetpu.tflite'], - # 29.5 ms per inference - 7: ['all_segments_yolov8m_416_640px_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_3_of_4_edgetpu.tflite'], - # 26.0 ms per inference - 8: ['all_segments_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'], + 4: ['2x_first_seg_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], + 5: ['3x_first_seg_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], }, 'yolov8l_416_640px': { - # 169.6 ms per inference + # 1053.4 ms per inference (0.9 FPS) for 1 TPUs using 1 segment + # 155.1 ms per inference (6.4 FPS) for 2 TPUs using 2 segments + # 98.1 ms per inference (10.2 FPS) for 3 TPUs using 2 segments + # 78.3 ms per inference (12.8 FPS) for 4 TPUs using 2 segments + # 61.4 ms per inference (16.3 FPS) for 5 TPUs using 2 segments 2: ['all_segments_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], - # 115.8 ms per inference 3: ['15x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], - # 89.7 ms per inference 4: ['all_segments_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], - # 77.7 ms per inference 5: ['4x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], - # 64.2 ms per inference - 6: ['15x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], - # 57.3 ms per inference - 7: ['3x_first_seg_yolov8l_416_640px_segment_0_of_3_edgetpu.tflite', '3x_first_seg_yolov8l_416_640px_segment_1_of_3_edgetpu.tflite', '3x_first_seg_yolov8l_416_640px_segment_2_of_3_edgetpu.tflite'], - # 52.2 ms per inference - 8: ['166x_first_seg_yolov8l_416_640px_segment_0_of_3_edgetpu.tflite', '166x_first_seg_yolov8l_416_640px_segment_1_of_3_edgetpu.tflite', '166x_first_seg_yolov8l_416_640px_segment_2_of_3_edgetpu.tflite'], + }, + 'yolov8s_640px': { + # 541.0 ms per inference (1.8 FPS) for 1 TPUs using 1 segment + # 83.7 ms per inference (11.9 FPS) for 2 TPUs using 2 segments + # 54.1 ms per inference (18.5 FPS) for 3 TPUs using 3 segments + # 40.8 ms per inference (24.5 FPS) for 4 TPUs using 3 segments + # 32.9 ms per inference (30.4 
FPS) for 5 TPUs using 3 segments + 2: ['15x_last_seg_yolov8s_640px_segment_0_of_2_edgetpu.tflite', '15x_last_seg_yolov8s_640px_segment_1_of_2_edgetpu.tflite'], + 3: ['all_segments_yolov8s_640px_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov8s_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8s_640px_segment_2_of_3_edgetpu.tflite'], + 4: ['all_segments_yolov8s_640px_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov8s_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8s_640px_segment_2_of_3_edgetpu.tflite'], + 5: ['all_segments_yolov8s_640px_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov8s_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8s_640px_segment_2_of_3_edgetpu.tflite'], + }, + 'yolov8m_640px': { + # 353.8 ms per inference (2.8 FPS) for 1 TPUs using 1 segment + # 165.9 ms per inference (6.0 FPS) for 2 TPUs using 2 segments + # 95.4 ms per inference (10.5 FPS) for 3 TPUs using 2 segments + # 71.9 ms per inference (13.9 FPS) for 4 TPUs using 2 segments + # 56.6 ms per inference (17.7 FPS) for 5 TPUs using 2 segments + 2: ['all_segments_yolov8m_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8m_640px_segment_1_of_2_edgetpu.tflite'], + 3: ['2x_first_seg_yolov8m_640px_segment_0_of_2_edgetpu.tflite', '2x_first_seg_yolov8m_640px_segment_1_of_2_edgetpu.tflite'], + 4: ['3x_first_seg_yolov8m_640px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov8m_640px_segment_1_of_2_edgetpu.tflite'], + 5: ['4x_first_seg_yolov8m_640px_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov8m_640px_segment_1_of_2_edgetpu.tflite'], + }, + 'yolov8l_640px': { + # 1517.3 ms per inference (0.7 FPS) for 1 TPUs using 1 segment + # 389.8 ms per inference (2.6 FPS) for 2 TPUs using 2 segments + # 206.5 ms per inference (4.8 FPS) for 3 TPUs using 2 segments + # 149.0 ms per inference (6.7 FPS) for 4 TPUs using 2 segments + # 132.4 ms per inference (7.6 FPS) for 5 TPUs using 2 segments + 2: ['15x_first_seg_yolov8l_640px_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov8l_640px_segment_1_of_2_edgetpu.tflite'], + 3: ['15x_first_seg_yolov8l_640px_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov8l_640px_segment_1_of_2_edgetpu.tflite'], + 4: ['2x_last_seg_yolov8l_640px_segment_0_of_2_edgetpu.tflite', '2x_last_seg_yolov8l_640px_segment_1_of_2_edgetpu.tflite'], + 5: ['2x_first_seg_yolov8l_640px_segment_0_of_2_edgetpu.tflite', '2x_first_seg_yolov8l_640px_segment_1_of_2_edgetpu.tflite'], }, 'ipcam-general-v8': { - # 53.4 ms per inference - 2: ['2x_last_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '2x_last_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], - # 24.3 ms per inference - 3: ['all_segments_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', 'all_segments_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], - # 19.9 ms per inference - 4: ['15x_first_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], - # 15.6 ms per inference - 5: ['15x_last_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], - # 15.2 ms per inference - 6: ['15x_last_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], - # 12.3 ms per inference - 7: ['15x_first_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', 
'15x_first_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], - # 10.9 ms per inference - 8: ['2x_last_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '2x_last_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '2x_last_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], + # 241.2 ms per inference (4.1 FPS) for 1 TPUs using 1 segment + # 44.7 ms per inference (22.4 FPS) for 2 TPUs using 2 segments + # 22.5 ms per inference (44.4 FPS) for 3 TPUs using 2 segments + # 16.1 ms per inference (62.0 FPS) for 4 TPUs using 2 segments + # 12.2 ms per inference (82.2 FPS) for 5 TPUs using 2 segments + 2: ['15x_last_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], + 3: ['166x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '166x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], + 4: ['2x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '2x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], + 5: ['2x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '2x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], + }, } diff --git a/requirements.txt b/requirements.txt index bb2471e..a631804 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,8 @@ pycoral~=2.0 # Installing PyCoral numpy>=1.16.0 # Installing NumPy, the fundamental package for array computing with Python Pillow>=4.0.0,<10.0 # Installing Pillow, a Python Image Library - +opencv-python-headless # Install OpenCV for highly optimized image resizing. Otherwise we'd need pillow-simd, which is much more tricky to get installed. CodeProject-AI-SDK # Installing the CodeProject.AI SDK -# last line empty \ No newline at end of file + +# last line empty diff --git a/segment_and_test.py b/segment_and_test.py index 72db254..07eb1ac 100644 --- a/segment_and_test.py +++ b/segment_and_test.py @@ -6,28 +6,27 @@ #''' fn_list = [ - #'tf2_ssd_mobilenet_v2_coco17_ptq', - #'ssd_mobilenet_v2_coco_quant_postprocess', - #'ssdlite_mobiledet_coco_qat_postprocess', - #'ssd_mobilenet_v1_coco_quant_postprocess', - #'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq', - #'efficientdet_lite0_320_ptq', - #'efficientdet_lite1_384_ptq', - #'efficientdet_lite2_448_ptq', - #'efficientdet_lite3_512_ptq', - #'efficientdet_lite3x_640_ptq', - #'yolov5n-int8', - #'yolov5s-int8', - #'yolov5m-int8', - #'yolov5l-int8', - #'yolov8n_416_640px', # lg 1st seg - #'yolov8s_416_640px', # lg 1st seg - #'yolov8m_416_640px', # lg 1st seg - #'yolov8l_416_640px', # lg 1st seg - #'yolov8n_640px', - #'yolov8s_640px', - #'yolov8m_640px', # lg 1st seg - #'yolov8l_640px', # lg 1st seg + 'tf2_ssd_mobilenet_v2_coco17_ptq', + 'ssd_mobilenet_v2_coco_quant_postprocess', + 'ssdlite_mobiledet_coco_qat_postprocess', + 'ssd_mobilenet_v1_coco_quant_postprocess', + 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq', + 'efficientdet_lite0_320_ptq', + 'efficientdet_lite1_384_ptq', + 'efficientdet_lite2_448_ptq', + 'efficientdet_lite3_512_ptq', + 'efficientdet_lite3x_640_ptq', + 'yolov5n-int8', + 'yolov5s-int8', + 'yolov5m-int8', + 'yolov5l-int8', + + ['yolov8n_416_640px', 'yolov8n_384_640px', 'yolov8n_384_608px', 'yolov8n_352_608px'], + ['yolov8s_416_640px', 'yolov8s_384_640px', 'yolov8s_384_608px', 'yolov8s_352_608px'], + ['yolov8m_416_640px', 'yolov8m_384_640px', 'yolov8m_384_608px', 'yolov8m_352_608px'], + ['yolov8l_416_640px', 'yolov8l_384_640px', 'yolov8l_384_608px', 'yolov8l_352_608px'], + + ['yolov9c_416_640px', 
'yolov9c_384_640px', 'yolov9c_384_608px', 'yolov9c_352_608px', 'yolov9c_352_576px'], 'ipcam-general-v8'] custom_args = { @@ -77,7 +76,21 @@ 5: ["--partition_search_step","2"], 6: ["--partition_search_step","3"], 7: ["--partition_search_step","4"], - 8: ["--partition_search_step","5"]}}#''' + 8: ["--partition_search_step","5"]}, + 'yolov9c_416_640px': { + 2: ["--delegate_search_step","10"]}, + 'yolov9c_384_640px': { + 1: ["--delegate_search_step","10"], + 2: ["--delegate_search_step","10"]}, + 'yolov9c_384_608px': { + 1: ["--delegate_search_step","10"], + 2: ["--delegate_search_step","10"]}, + 'yolov9c_352_608px': { + 1: ["--delegate_search_step","10"], + 2: ["--delegate_search_step","10"]}, + 'yolov9c_352_576px': { + 1: ["--delegate_search_step","10"], + 2: ["--delegate_search_step","10"]}}#''' ''' fn_list = [ @@ -181,8 +194,8 @@ 7: ["--partition_search_step","4"], 8: ["--partition_search_step","5"]}}#''' -seg_dir = "/media/seth/FAT_THUMB/all_segments/" -seg_types = ['', '2x_first_seg/', '15x_first_seg/', '166x_first_seg/', '3x_first_seg/', '4x_first_seg/', 'inc_seg/', 'dumb/'] +seg_dir = "/home/seth/Documents/all_segments/" +seg_types = ['', '2x_first_seg/', '15x_first_seg/', '3x_first_seg/', '4x_first_seg/', '15x_last_seg/', '2x_last_seg/', 'dumb/'] def seg_exists(filename, segment_type, segment_count): @@ -195,12 +208,20 @@ def seg_exists(filename, segment_type, segment_count): seg_list = [seg_dir+segment_type+filename+'_segment_{}_of_{}_edgetpu.tflite'.format(i, segment_count) for i in range(segment_count)] return (seg_list, any([True for s in seg_list if not os.path.exists(s)])) -MAX_TPU_COUNT = 4 +MAX_TPU_COUNT = 6 ''' # Generate segment files for sn in range(1,MAX_TPU_COUNT+1): + flat_fn_list = [] for fn in fn_list: + if isinstance(fn, list): + flat_fn_list += fn + else: + flat_fn_list.append(fn) + + + for fn in flat_fn_list: for seg_type in seg_types: seg_list, file_missing = seg_exists(fn, seg_type, sn) @@ -281,62 +302,92 @@ def seg_exists(filename, segment_type, segment_count): partition_with_profiling_dir = "libcoral/tools.last15" elif '2x_last_seg' in seg_type: partition_with_profiling_dir = "libcoral/tools.last2" - elif 'inc_seg' == seg_type: + elif '125x_last_inc_seg/' == seg_type: + partition_with_profiling_dir = "libcoral/tools.last125_inc_seg" + elif '2x_first_125x_last_inc_seg/' == seg_type: + partition_with_profiling_dir = "libcoral/tools.2last125_inc_seg" + elif 'inc_seg/' == seg_type: partition_with_profiling_dir = "libcoral/tools.inc_seg" else: partition_with_profiling_dir = "libcoral/tools.orig" cmd = [partition_with_profiling_dir+"/partitioner/partition_with_profiling","--output_dir",seg_dir+seg_type,"--edgetpu_compiler_binary", "/usr/bin/edgetpu_compiler","--model_path",seg_dir+fn+".tflite","--num_segments",str(sn)] - - try: - cmd += custom_args[fn][sn] - except: - pass + + try: + cmd += custom_args[fn][sn] + except: + pass print(cmd) subprocess.run(cmd)#''' -seg_types += ['133x_first_seg/', '15x_last_seg/', '2x_last_seg/'] +seg_types += ['133x_first_seg/', '166x_first_seg/', 'inc_seg/', '125x_last_inc_seg/', '2x_first_125x_last_inc_seg/'] # Test timings fin_timings = {} fin_fnames = {} for fn in fn_list: + if isinstance(fn, list): + fn_size_list = fn + fn = fn[0] + else: + fn_size_list = [fn] + timings = [] fin_timings[fn] = {} fin_fnames[fn] = {} - for num_tpus in range(2,MAX_TPU_COUNT+1): + for num_tpus in range(1,MAX_TPU_COUNT+1): - for seg_type in seg_types: - max_seg = 0 - for sn in range(1,num_tpus+1): + for this_fn in fn_size_list: + for seg_type in 
seg_types: + max_seg = 0 + for sn in range(1,num_tpus+1): + # No need to run many slow single TPU tests, just one + if sn == 1 and seg_type != '': + continue - # Test against orig code - exe_file = "/home/seth/CodeProject.AI-Server/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py" + # Test against orig code + exe_file = "/home/seth/CodeProject.AI-Server/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py" - # Get file types - seg_list, file_missing = seg_exists(fn, seg_type, sn) + # Get file types + seg_list, file_missing = seg_exists(this_fn, seg_type, sn) - if file_missing: - continue - max_seg = sn + if file_missing: + continue + max_seg = sn + + cmd = ["python3.9",exe_file,"--model"] + \ + seg_list + ["--labels","coral/pycoral/test_data/coco_labels.txt","--input","/home/seth/coral/pycoral/test_data/grace_hopper.bmp", + "--count","4000","--num-tpus",str(num_tpus)] + print(cmd) + + # Clock runtime + #start_time = time.perf_counter() + #subprocess.run(cmd) + #ms_time = 1000 * (time.perf_counter() - start_time) / 4000 # ms * total time / iterations - cmd = ["python3",exe_file,"--model"] + \ - seg_list + ["--labels","coral/pycoral/test_data/coco_labels.txt","--input","/home/seth/coral/pycoral/test_data/grace_hopper.bmp", - "--count","2000","--num-tpus",str(num_tpus)] - print(cmd) - c = subprocess.run(cmd, check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - print(c.stdout) - print(c.stderr) - ms_time = float(re.compile(r'threads; ([\d\.]+)ms ea').findall(c.stderr)[0]) - timings.append((ms_time, num_tpus, fn, seg_type, sn)) + # Last quarter runtime + try: + c = subprocess.run(cmd, check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=3600*2) + except subprocess.TimeoutExpired: + print("Timed out!") + continue + print(c.stdout) + print(c.stderr) + ms_time = float(re.compile(r'threads; ([\d\.]+)ms ea').findall(c.stderr)[0]) + mpps_time = float(re.compile(r'; ([\d\.]+) tensor MPx').findall(c.stderr)[0]) - timings = sorted(timings, key=lambda t: t[0]) + timings.append((ms_time, num_tpus, this_fn, seg_type, sn, mpps_time)) + subprocess.run(['uptime']) - # Print the top three + timings = sorted(timings, key=lambda t: t[5], reverse=True) + if not any(timings): + continue + + # Print the top ten print(f"TIMINGS FOR {num_tpus} TPUs AND {fn} MODEL:") for t in range(min(10,len(timings))): print(timings[t]) @@ -344,12 +395,9 @@ def seg_exists(filename, segment_type, segment_count): # Get best segments, but # Skip if it's not 'orig_code' and > 1 segment t = [t for t in timings if t[3] != 'orig_code'][0] - if t[4] == 1: - continue - - # Add segment to the final list fin_timings[fn][num_tpus] = timings[0] + # Add segment to the final list # Copy best to local dir seg_list, _ = seg_exists(t[2], t[3], t[4]) fin_fnames[fn][num_tpus] = [] @@ -360,9 +408,12 @@ def seg_exists(filename, segment_type, segment_count): fin_fnames[fn][num_tpus].append(out_fname) # Create archive for this model / TPU count - if any(fin_fnames[fn][num_tpus]): - cmd = ['zip', '-9', f'objectdetection-{fn}-{num_tpus}-edgetpu.zip'] + fin_fnames[fn][num_tpus] + if len(fin_fnames[fn][num_tpus]) > 1 or num_tpus == 1: + zip_name = f'objectdetection-{fn}-{num_tpus}-edgetpu.zip' + cmd = ['zip', '-9', zip_name] + fin_fnames[fn][num_tpus] print(cmd) + if os.path.exists(zip_name): + os.unlink(zip_name) subprocess.run(cmd) print(fin_timings) @@ -371,7 +422,17 @@ def seg_exists(filename, segment_type, segment_count): # Pretty print all of the 
segments we've timed and selected for fn, v in fin_fnames.items(): print(" '%s': {" % fn) + for tpu_count, timing in fin_timings[fn].items(): + if tpu_count in v: + seg_str = f"{len(v[tpu_count])} segments" + else: + seg_str = "1 segment " + + fps = 1000.0 / timing[0] + + print(f"#{timing[0]:6.1f} ms/inference ({fps:5.1f} FPS;{timing[5]:5.1f} tensor MPx/sec) for {tpu_count} TPUs using {seg_str}: {timing[2]}") + for tpu_count, out_fnames in v.items(): - print(f" # {fin_timings[fn][tpu_count][0]:6.1f} ms per inference") - print(f" {tpu_count}: "+str(out_fnames)+",") + if len(out_fnames) > 1: + print(f"{tpu_count}: "+str(out_fnames)+",") print(" },") diff --git a/tpu_runner.py b/tpu_runner.py index c784a31..a927b53 100644 --- a/tpu_runner.py +++ b/tpu_runner.py @@ -20,9 +20,10 @@ import time import logging import queue -import gc import math +import cv2 +import concurrent.futures from datetime import datetime import numpy as np @@ -68,15 +69,17 @@ class TPUException(Exception): class DynamicInterpreter(object): - def __init__(self, fname_list: list, tpu_name: str, queues: list, rebalancing_lock: threading.Lock): + def __init__(self, fname_list: list, tpu_name: str, queues: list): self.fname_list = fname_list self.tpu_name = tpu_name self.queues = queues - self.rebalancing_lock = rebalancing_lock - self.timings = [0.0] * len(fname_list) - self.q_len = [0] * len(fname_list) - self.exec_count = [0] * len(fname_list) + # Keep track of how productive this TPU is + self.stats_lock = threading.Lock() + self.output_lock = threading.Lock() + self.timings = [0.0] * len(fname_list) + self.q_len = [0] * len(fname_list) + self.exec_count = [0] * len(fname_list) try: self.delegate = edgetpu.load_edgetpu_delegate({'device': tpu_name}) @@ -93,95 +96,80 @@ def __init__(self, fname_list: list, tpu_name: str, queues: list, rebalancing_lo def start(self, seg_idx: int, fbytes: bytes): logging.info(f"Loading {self.tpu_name}: {self.fname_list[seg_idx]}") - try: - self.interpreter = edgetpu.make_interpreter(fbytes, delegate=self.delegate) - except Exception as in_ex: - # If we fail to create even one of the interpreters then fail all. - # Throw exception and caller can try to recreate without the TPU. - # An option here is to remove the failed TPU from the list - # of TPUs and try the others. Maybe there's paired PCI cards - # and a USB, and the USB is failing? - logging.warning(f"Unable to create interpreter for TPU {self.tpu_name}: {in_ex}") - raise TPUException(self.tpu_name) - - self.interpreter.allocate_tensors() - self.input_details = self.interpreter.get_input_details() - self.output_details = self.interpreter.get_output_details() - - # Start processing loop per TPU - self.thread = threading.Thread(target=self._interpreter_runner, args=[seg_idx]) - self.thread.start() - - - def _interpreter_runner(self, seg_idx: int): - in_names = [d['name'] for d in self.input_details ] - out_names = [d['name'] for d in self.output_details] - indices = [d['index'] for d in self.output_details] - first_in_name = in_names.pop(0) + with self.output_lock: + try: + self.interpreter = edgetpu.make_interpreter(fbytes, delegate=self.delegate) + except Exception as in_ex: + # If we fail to create even one of the interpreters then fail all. + # Throw exception and caller can try to recreate without the TPU. + # An option here is to remove the failed TPU from the list + # of TPUs and try the others. Maybe there's paired PCI cards + # and a USB, and the USB is failing? 
+ logging.warning(f"Unable to create interpreter for TPU {self.tpu_name}: {in_ex}") + raise TPUException(self.tpu_name) + + self.interpreter.allocate_tensors() + self.input_details = self.interpreter.get_input_details() + self.output_details = self.interpreter.get_output_details() + + # Setup local interpreter vars + self.seg_idx = seg_idx + self.this_q = self.queues[seg_idx] + self.next_q = None + if len(self.queues) > seg_idx+1: + self.next_q = self.queues[seg_idx+1] + + self.in_info = [(d['name'], d['index'], self.interpreter.tensor(d['index'])) for d in self.input_details ] + self.out_info = [(d['name'], d['index'], self.interpreter.tensor(d['index'])) for d in self.output_details] + self.first_in_name, _, _ = self.in_info.pop(0) - # Setup input/output queues - in_q = self.queues[seg_idx] - out_q = None - if len(self.queues) > seg_idx+1: - out_q = self.queues[seg_idx+1] + self.expected_input_size = np.prod(self.input_details[0]['shape']) + self.interpreter_handle = self.interpreter._native_handle() - # Input tensors for this interpreter - input_tensors = {} - for details in self.input_details: - input_tensors[details['name']] = self.interpreter.tensor(details['index']) - output_tensors = [] - if not out_q: - for details in self.output_details: - output_tensors.append(self.interpreter.tensor(details['index'])) + # Add self to priority queue + self.this_q.put(self) - expected_input_size = np.prod(self.input_details[0]['shape']) - interpreter_handle = self.interpreter._native_handle() + def invoke(self, working_tensors): + start_inference_time = time.perf_counter_ns() - # Run interpreter loop; consume & produce results - while True: - # Pull next input from the queue - working_tensors = in_q.get() - - # Exit if the pipeline is done - if working_tensors is False: - logging.debug("Get EOF in tid {}".format(threading.get_ident())) - self.interpreter = None - self.input_details = None - self.output_details = None - if self.rebalancing_lock.locked(): - self.rebalancing_lock.release() - return - - start_inference_time = time.perf_counter_ns() - - # Set inputs beyond the first - for name in in_names: - input_tensors[name]()[0] = working_tensors[0][name] + # Set inputs beyond the first + for name, _, t in self.in_info: + t()[0] = working_tensors[name] + # The next thread needs to wait for us to finish copying the output + with self.output_lock: # Invoke_with_membuffer() directly on numpy memory, # but only works with a single input - edgetpu.invoke_with_membuffer(interpreter_handle, - working_tensors[0][first_in_name].ctypes.data, - expected_input_size) + edgetpu.invoke_with_membuffer(self.interpreter_handle, + working_tensors[self.first_in_name].ctypes.data, + self.expected_input_size) - if out_q: - # Fetch results - for name, index in zip(out_names, indices): - working_tensors[0][name] = self.interpreter.get_tensor(index) + # Save locally in case it is moved to a different queue + this_q = self.this_q + next_q = self.next_q + seg_idx = self.seg_idx + + # Make TPU available to begin next round + this_q.put(self) - # Deliver to next queue in pipeline - out_q.put(working_tensors) + if next_q: + # Fetch results + for name, index, _ in self.out_info: + working_tensors[name] = self.interpreter.get_tensor(index) else: # Fetch pointer to results # Copy and convert to float - # Deliver to final results queue - working_tensors[1].put([t().astype(np.float32) for t in output_tensors]) + output = [t().astype(np.float32) for _,_,t in self.out_info] + with self.stats_lock: # Convert elapsed time to double 
precision ms self.timings[seg_idx] += (time.perf_counter_ns() - start_inference_time) / (1000.0 * 1000.0) - self.q_len[seg_idx] += in_q.qsize() + self.q_len[seg_idx] += this_q.qsize() self.exec_count[seg_idx] += 1 + # Return results + return next_q.get(timeout=MAX_WAIT_TIME).invoke(working_tensors) if next_q else output + def __del__(self): # Print performance info t_str = "" @@ -203,6 +191,18 @@ def __del__(self): self.delegate = None self.queues = None + def __lt__(self, other): + """Allow interpreters to be sorted in a PriorityQueue by speed.""" + selfPriority = 0.0 + if self.exec_count[self.seg_idx] > 0: + selfPriority = self.timings[self.seg_idx] / self.exec_count[self.seg_idx] + + otherPriority = 0.0 + if other.exec_count[other.seg_idx] > 0: + otherPriority = other.timings[other.seg_idx] / other.exec_count[other.seg_idx] + + return selfPriority < otherPriority + class DynamicPipeline(object): @@ -214,17 +214,14 @@ def __init__(self, tpu_list: list, fname_list: list): self.fname_list = fname_list self.tpu_list = tpu_list - self.interpreters = [[] for i in range(seg_count)] + self.interpreters = [[] for _ in fname_list] # Input queues for each segment; if we go over maxsize, something went wrong - self.queues = [queue.Queue(maxsize=self.max_pipeline_queue_length) for i in range(seg_count)] + self.queues = [queue.PriorityQueue(maxsize=self.max_pipeline_queue_length) for _ in fname_list] # Lock for internal reorganization self.balance_lock = threading.Lock() - # Lock for interpreter use - self.rebalancing_lock = threading.Lock() - # Read file data self.fbytes_list = [] for fname in fname_list: @@ -243,15 +240,17 @@ def __init__(self, tpu_list: list, fname_list: list): self._init_interpreters() def _init_interpreters(self): + assert self.balance_lock.locked() + # Set a Time To Live for balancing so we don't thrash - self.balance_ttl = len(self.tpu_list) * 2 + self.balance_ttl = len(self.tpu_list) * 3 start_boot_time = time.perf_counter_ns() # Fill TPUs with interpreters for i, tpu_name in enumerate(self.tpu_list): seg_idx = i % len(self.fname_list) - i = DynamicInterpreter(self.fname_list, tpu_name, self.queues, self.rebalancing_lock) + i = DynamicInterpreter(self.fname_list, tpu_name, self.queues) i.start(seg_idx, self.fbytes_list[seg_idx]) self.interpreters[seg_idx].append(i) @@ -261,12 +260,16 @@ def _init_interpreters(self): logging.info(f"Initialized pipeline interpreters in {boot_time:.1f}ms") - def enqueue(self, in_tensor, out_q: queue.Queue): + def invoke(self, in_tensor): with self.balance_lock: if not self.first_name: self._init_interpreters() + fn = self.first_name - self.queues[0].put(({self.first_name: in_tensor}, out_q)) + # It's possible the interpreters will get deleted before we get() one, + # but that's a risk we'll take to drop the lock and not block. If it happens, + # we'll end up blocking until timeout or another process re-inits. + return self.queues[0].get(timeout=MAX_WAIT_TIME).invoke({fn: in_tensor}) def _eval_timings(self, interpreter_counts): @@ -339,11 +342,13 @@ def _eval_timings(self, interpreter_counts): new_swap_t = 0.0 if i.exec_count[interp_i] > VALID_CNT_THRESH: new_swap_t = i.timings[interp_i] / i.exec_count[interp_i] + + #print(f"i {interp_i} t {max_i} cnt {i.exec_count[max_i]} mt {max_t} nmt {new_max_t} nst {new_swap_t}") # If TPU has already found to be faster on this segment # and we aren't making the other segment the new worst # and we are choosing the best available candidate. 
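A rough sketch of the scheduling idea the runner now uses, with invented names (`ToyInterpreter`, `stage_queues`) rather than the module's real classes: each pipeline segment owns a `PriorityQueue` of interpreters, an interpreter re-queues itself as soon as it has dispatched its work, `__lt__` makes `get()` hand back the interpreter with the lowest average latency for that segment, and a result moves down the pipeline by pulling the next segment's best interpreter and calling `invoke` on it.

```python
import queue
import time

class ToyInterpreter:
    """Stand-in for one interpreter bound to one pipeline segment."""
    def __init__(self, name, stage_queues, seg_idx):
        self.name = name
        self.stage_queues = stage_queues
        self.seg_idx = seg_idx
        self.total_ms = 0.0
        self.count = 0

    def __lt__(self, other):
        # PriorityQueue pops the historically fastest interpreter first.
        mine   = self.total_ms / self.count if self.count else 0.0
        theirs = other.total_ms / other.count if other.count else 0.0
        return mine < theirs

    def invoke(self, tensor):
        start = time.perf_counter()
        result = tensor + 1                        # stand-in for real TPU work
        self.stage_queues[self.seg_idx].put(self)  # immediately available again
        self.total_ms += (time.perf_counter() - start) * 1000.0
        self.count += 1
        nxt = self.seg_idx + 1
        if nxt < len(self.stage_queues):           # chain to the next segment
            return self.stage_queues[nxt].get(timeout=10).invoke(result)
        return result

queues = [queue.PriorityQueue(), queue.PriorityQueue()]
for seg in range(2):
    queues[seg].put(ToyInterpreter(f"tpu{seg}", queues, seg))

print(queues[0].get(timeout=10).invoke(0))         # -> 2 after both segments run
```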
- if max_t-0.5 > new_max_t and max_t > new_swap_t and swap_t > new_max_t: + if min(max_t-0.5, max_t*0.99) > new_max_t and max_t > new_swap_t and swap_t > new_max_t: swap_i = interp_i swap_t = new_max_t @@ -375,8 +380,8 @@ def balance_queues(self): elif swap_i is not None: # 2nd Priority: Swap slow segments with faster ones to see if we can # run them faster. Hopefully still a good way to optimize for - # heterogenous hardware. - logging.info(f"Auto-tuning between queues {swap_i} and {max_i}") + # heterogeneous hardware. + logging.info(f"Auto-tuning queues {swap_i} and {max_i}") # Stop them new_max = self._rem_interpreter_from(swap_i) @@ -399,29 +404,10 @@ def balance_queues(self): def _rem_interpreter_from(self, interp_i): - # Sending False kills the processing loop - self.rebalancing_lock.acquire() - self.queues[interp_i].put(False) - - # This is ugly, but I can't think of something better - # Threads are blocked by queues. Queues may not have a stream - # of work cycling them. Therefore must kill with an - # enqueued command. But we don't know which thread picks - # up the command from the queue. - - # Block & wait - realloc_interp = None - with self.rebalancing_lock: - for idx, interpreter in enumerate(self.interpreters[interp_i]): - if not interpreter.interpreter: - realloc_interp = self.interpreters[interp_i].pop(idx) - break - - if not realloc_interp: - logging.warning("Unable to find killed interpreter") - self.balance_lock.release() - return realloc_interp - + assert self.balance_lock.locked() + interp = self.queues[interp_i].get() + self.interpreters[interp_i].remove(interp) + return interp def print_queue_len(self): len_str = "" @@ -431,43 +417,23 @@ def print_queue_len(self): seg_str += " {:2}".format(len(i)) logging.info(f"Queue len: ({len_str}); Segment alloc: ({seg_str})") - def __del__(self): self.delete() - - def _halt_interpreters(self, seg_idx: int): - - if not self.interpreters or seg_idx < 0 or seg_idx >= len(self.interpreters): - return - - # Insert EOF to each queue - for i in self.interpreters[seg_idx]: - self.queues[seg_idx].put(False) - - # Wait for threads to finish - for interpreter in self.interpreters[seg_idx]: - t = interpreter.thread - logging.debug("Joining thread {} for DynamicPipeline.delete()".format(t.native_id)) - t.join(timeout=MAX_WAIT_TIME) - if t.is_alive(): - logging.warning("Pipe thread didn't join!") - - def delete(self): # Kill interpreters. Maybe refresh later; maybe delete object. 
+ # Hold lock so no more work can be enqueued with self.balance_lock: - # Insert EOF to each queue - # Wait for threads to finish - # Init structures - for q_idx, q in enumerate(self.queues): - self._halt_interpreters(q_idx) - self.queues[q_idx] = queue.Queue(maxsize=self.max_pipeline_queue_length) + # Empty interpreter lists + # Empty interpreter queues + for i_list, q in zip(self.interpreters, self.queues): + # Make sure we dequeue the expected number of items + for _ in i_list: + q.get() - if self.interpreters and len(self.interpreters) > q_idx: - self.interpreters[q_idx] = [] + i_list.clear() - self.first_name = None + self.first_name = None class TPURunner(object): @@ -494,19 +460,22 @@ def __init__(self, tpu_limit: int = -1): self.model_name = None # Name of current model in use self.model_size = None # Size of current model in use self.labels = None # set of labels for this model - - self.runner_lock = threading.Lock() self.last_check_time = None self.printed_shape_map = {} + self.runner_lock = threading.Lock() + self.watchdog_time = None self.watchdog_shutdown = False self.watchdog_thread = threading.Thread(target=self._watchdog) self.watchdog_thread.start() + self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=32) + logging.info(f"edgetpu version: {edgetpu.get_runtime_version()}") logging.info(f"{Image.__name__} version: {Image.__version__}") + logging.info(f"OpenCV version: {cv2.__version__}") # Find the temperature file # https://coral.ai/docs/pcie-parameters/ @@ -532,14 +501,11 @@ def __init__(self, tpu_limit: int = -1): def _watchdog(self): self.watchdog_time = time.time() while not self.watchdog_shutdown: - if self.pipe and self.pipe.first_name is not None and \ - time.time() - self.watchdog_time > self.max_idle_secs_before_recycle: - logging.info("No work in {} seconds, watchdog shutting down TPUs.".format(self.max_idle_secs_before_recycle)) - self.runner_lock.acquire(timeout=MAX_WAIT_TIME) - if self.pipe is not None: - # Avoid possible race condition. + with self.runner_lock: + if self.pipe and self.pipe.first_name is not None and \ + time.time() - self.watchdog_time > self.max_idle_secs_before_recycle: + logging.info("No work in {} seconds, watchdog shutting down TPUs.".format(self.max_idle_secs_before_recycle)) self.pipe.delete() - self.runner_lock.release() # Pipeline will reinitialize itself as needed time.sleep(self.watchdog_idle_secs) @@ -589,10 +555,13 @@ def _get_model_filenames(self, options: Options, tpu_list: list) -> list: device_count = len(tpu_list) # TPUs. We've at least found one self.device_type = 'Multi-TPU' if device_count == 1: - self.device_type = 'TPU' + self.device_type = 'Single TPU' # If TPU found then default is single TPU model file (no segments) if not any(options.tpu_segments_lists) or device_count == 1: + if not os.path.exists(options.model_tpu_file): + logging.warning(f"Missing TPU file: {options.model_tpu_file}; falling back to CPU") + return self._get_model_filenames(options, []) return [options.model_tpu_file] # We have a list of segment files @@ -602,14 +571,19 @@ def _get_model_filenames(self, options: Options, tpu_list: list) -> list: # so best performance above that can probably be had by extrapolation. 
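The idle watchdog follows a common teardown-on-idle pattern; here is a stripped-down sketch under invented names (`IdleWatchdog`, `teardown`), not the runner's actual API: a background thread compares the time of the last completed request against an idle limit and releases the expensive resource, which is then rebuilt lazily by the next request.

```python
import threading
import time

class IdleWatchdog:
    # Tears a resource down after `max_idle_secs` without work; the owner
    # calls `touch()` whenever a request completes and `close()` on shutdown.
    def __init__(self, teardown, max_idle_secs=60.0, poll_secs=5.0):
        self.teardown = teardown
        self.max_idle_secs = max_idle_secs
        self.poll_secs = poll_secs
        self.last_work = time.time()
        self.lock = threading.Lock()
        self.shutdown = False
        self.thread = threading.Thread(target=self._run, daemon=True)
        self.thread.start()

    def touch(self):
        with self.lock:
            self.last_work = time.time()

    def _run(self):
        while not self.shutdown:
            with self.lock:
                if time.time() - self.last_work > self.max_idle_secs:
                    self.teardown()               # e.g. release TPU interpreters
                    self.last_work = time.time()  # avoid repeated teardown calls
            time.sleep(self.poll_secs)

    def close(self):
        self.shutdown = True
        self.thread.join()
```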
device_count = min(device_count, 8) if device_count in options.tpu_segments_lists: - return options.tpu_segments_lists[device_count] + seg_fnames = options.tpu_segments_lists[device_count] + for fn in seg_fnames: + if not os.path.exists(fn): + logging.warning(f"Missing TPU segment file: {fn}; falling back to single segment") + return self._get_model_filenames(options, (tpu_list[0],)) + return seg_fnames else: # Only one list of segments; use it regardless of even match to TPU count if len(options.tpu_segments_lists) <= device_count: return options.tpu_segments_lists # Couldn't find a good fit, use single segment - return [options.model_tpu_file] + return self._get_model_filenames(options, (tpu_list[0],)) # Should be called while holding runner_lock (if called at run time) @@ -634,10 +608,7 @@ def init_pipe(self, options: Options) -> tuple: tpu_model_files = self._get_model_filenames(options, tpu_list) # Read labels - try: - self.labels = read_label_file(options.label_file) if options.label_file else {} - except: - labels = {} + self.labels = read_label_file(options.label_file) if options.label_file else {} # Initialize EdgeTPU pipe. self.device_type = "Multi-TPU" @@ -646,15 +617,15 @@ def init_pipe(self, options: Options) -> tuple: self.pipe = DynamicPipeline(tpu_list, tpu_model_files) except TPUException as tpu_ex: self.pipe = None - logging.warning(f"TPU Exception creating interpreter: {tpu_ex}") + logging.exception(f"TPU Exception creating interpreter: {tpu_ex}") error = "Failed to create interpreter (Coral issue)" except FileNotFoundError as ex: self.pipe = None - logging.warning(f"Model file not found: {ex}") + logging.exception(f"Model file not found: {ex}") error = "Model file not found. Please download the model if possible" except Exception as ex: self.pipe = None - logging.warning(f"Exception creating interpreter: {ex}") + logging.exception(f"Exception creating interpreter: {ex}") error = "Unable to create the interpreter" if not self.pipe: @@ -677,6 +648,10 @@ def init_pipe(self, options: Options) -> tuple: self.input_details = self.pipe.interpreters[0][0].input_details[0] self.output_details = self.pipe.interpreters[-1][0].output_details[0] + # Rescale the input from uint8 to the TPU input tensor + self.input_zero = float(self.input_details['quantization'][1]) + self.input_scale = 1.0 / (255.0 * self.input_details['quantization'][0]) + # Print debug logging.info("{} device & segment counts: {} & {}" .format(self.device_type, @@ -685,6 +660,10 @@ def init_pipe(self, options: Options) -> tuple: logging.debug(f"Input details: {self.input_details}") logging.debug(f"Output details: {self.output_details}") + # Reduce OpenCV usage of threads + if os.cpu_count() is not None: + cv2.setNumThreads(min(8, os.cpu_count())) + return (self.device_type, error) @@ -713,6 +692,7 @@ def _periodic_check(self, options: Options, force: bool = False, """ error = None now_ts = datetime.now() + assert self.runner_lock.locked() if not self.pipe: logging.debug("No pipe found. 
Recreating.") @@ -732,25 +712,24 @@ def _periodic_check(self, options: Options, force: bool = False, self.last_check_time = now_ts # Check temperatures - if check_temp: - if self.temp_fname_format != None and self.pipe: - msg = "TPU {} is {}C and will likely be throttled" - temp_arr = [] - for i in range(len(self.pipe.tpu_list)): - if os.path.exists(self.temp_fname_format.format(i)): - with open(self.temp_fname_format.format(i), "r") as fp: - # Convert from millidegree C to degree C - temp = int(fp.read()) // 1000 - temp_arr.append(temp) - if self.warn_temperature_thresh_C <= temp: - logging.warning(msg.format(i, temp)) - if any(temp_arr): - logging.debug("Temperatures: {} avg; {} max; {} total".format( - sum(temp_arr) // len(temp_arr), - max(temp_arr), - len(temp_arr))) - else: - logging.warning("Unable to find temperatures!") + if check_temp and self.temp_fname_format != None and self.pipe: + msg = "TPU {} is {}C and will likely be throttled" + temp_arr = [] + for i in range(len(self.pipe.tpu_list)): + if os.path.exists(self.temp_fname_format.format(i)): + with open(self.temp_fname_format.format(i), "r") as fp: + # Convert from millidegree C to degree C + temp = int(fp.read()) // 1000 + temp_arr.append(temp) + if self.warn_temperature_thresh_C <= temp: + logging.warning(msg.format(i, temp)) + if any(temp_arr): + logging.debug("Temperatures: {} avg; {} max; {} total".format( + sum(temp_arr) // len(temp_arr), + max(temp_arr), + len(temp_arr))) + else: + logging.warning("Unable to find temperatures!") # Once an hour, refresh the pipe if (force or check_refresh) and self.pipe: @@ -789,7 +768,6 @@ def __del__(self): def _delete(self): # Close pipeline if self.pipe: - self.pipe.delete() self.pipe = None def pipeline_ok(self) -> bool: @@ -829,68 +807,64 @@ def _process_image(self, - Return inference timing. Note that the image object is modified in place to resize it - for in input tensor. + to fit the model's input tensor. """ + with self.runner_lock: + # Recreate the pipe if it is stale, but also check if we can + # and have created the pipe. It's not always successful... + (pipe_ok, error) = self._periodic_check(options) + if not pipe_ok: + return None, 0, error + + # Grab a reference so we know it isn't deleted + pipe = self.pipe + + tiles = self._get_tiles(options, image) + + start_inference_time = time.perf_counter() all_objects = [] - all_queues = [] - _, m_height, m_width, _ = self.input_details['shape'] - - # Potentially resize & pipeline a number of tiles - for rs_image, rs_loc in self._get_tiles(options, image): - rs_queue = queue.Queue(maxsize=1) - all_queues.append((rs_queue, rs_loc)) - logging.debug("Enqueuing tile in pipeline") + if len(tiles) > 1: + # Submit tile processing to thread pool + future_to_inference = {self.executor.submit(pipe.invoke, image): loc for image, loc in tiles} - with self.runner_lock: - # Recreate the pipe if it is stale, but also check if we can - # and have created the pipe. It's not always successful... - (pipe_ok, error) = self._periodic_check(options) - if not pipe_ok: - return None, 0, error - - self.pipe.enqueue(rs_image, rs_queue) - - # Wait for the results here - tot_inference_time = 0 - for rs_queue, rs_loc in all_queues: - # Wait for results - # We may have to wait a few seconds at most, but I'd expect the - # pipeline to clear fairly quickly. 
- start_inference_time = time.perf_counter() - result = rs_queue.get(timeout=MAX_WAIT_TIME) - tot_inference_time += time.perf_counter() - start_inference_time - assert result - - boxes, class_ids, scores, count = self._decode_result(result, score_threshold) - - logging.debug("BBox scaling params: {}x{}, ({},{}), {:.2f}x{:.2f}". - format(m_width, m_height, *rs_loc)) - - # Create Objects for each valid result - for i in range(int(count[0])): - if scores[0][i] < score_threshold: - continue - - ymin, xmin, ymax, xmax = boxes[0][i] - - bbox = detect.BBox(xmin=(max(xmin, 0.0)*m_width + rs_loc[0])*rs_loc[2], - ymin=(max(ymin, 0.0)*m_height + rs_loc[1])*rs_loc[3], - xmax=(min(xmax, 1.0)*m_width + rs_loc[0])*rs_loc[2], - ymax=(min(ymax, 1.0)*m_height + rs_loc[1])*rs_loc[3]) - - all_objects.append(detect.Object(id=int(class_ids[0][i]), - score=float(scores[0][i]), - bbox=bbox.map(int))) + # Wait for the results here + for future in concurrent.futures.as_completed(future_to_inference): + self._rs_to_obj(future.result(), score_threshold, all_objects, future_to_inference[future]) + else: + self._rs_to_obj(pipe.invoke(tiles[0][0]), score_threshold, all_objects, tiles[0][1]) + tot_inference_time = time.perf_counter() - start_inference_time # Convert to ms tot_inference_time = int(tot_inference_time * 1000) - # Remove duplicate objects - unique_indexes = self._non_max_suppression(all_objects, options.iou_threshold) + # We got here, so the pipe must be relatively healthy. self.watchdog_time = time.time() + return (all_objects, tot_inference_time, None) + + + def _rs_to_obj(self, rs, score_threshold, all_objects, rs_loc): + _, m_height, m_width, _ = self.input_details['shape'] + boxes, class_ids, scores, count = self._decode_result(rs, score_threshold) - return ([all_objects[i] for i in unique_indexes], tot_inference_time, None) - + logging.debug("BBox scaling params: {}x{}, ({},{}), {:.2f}x{:.2f}". + format(m_width, m_height, *rs_loc)) + + # Create Objects for each valid result + for i in range(int(count[0])): + if scores[0][i] < score_threshold: + continue + + ymin, xmin, ymax, xmax = boxes[0][i] + + bbox = detect.BBox(xmin=(max(xmin, 0.0)*m_width + rs_loc[0])*rs_loc[2], + ymin=(max(ymin, 0.0)*m_height + rs_loc[1])*rs_loc[3], + xmax=(min(xmax, 1.0)*m_width + rs_loc[0])*rs_loc[2], + ymax=(min(ymax, 1.0)*m_height + rs_loc[1])*rs_loc[3]) + + all_objects.append(detect.Object(id=int(class_ids[0][i]), + score=float(scores[0][i]), + bbox=bbox.map(int))) + def _decode_result(self, result_list, score_threshold: float): if len(result_list) == 4: @@ -978,7 +952,7 @@ def _nms(self, dets, scores, thresh): inds = np.where(ovr <= thresh)[0] order = order[inds + 1] - return np.array(keep) + return np.asarray(keep) def _yolov8_non_max_suppression(self, prediction, conf_thres=0.25, iou_thres=0.45, @@ -1101,9 +1075,11 @@ def _yolov5_non_max_suppression(self, return output - def _non_max_suppression(self, objects: list, threshold: float) -> list: + def non_max_suppression(self, objects: list, threshold: float) -> list: """Returns a list of indexes of objects passing the NMS. + Can be optionally used on tiled results by advanced users. + Args: objects: result candidates. threshold: the threshold of overlapping IoU to merge the boxes. 
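A usage note on the renamed method above: since _process_image() no longer runs NMS over the merged tile results itself, a caller that tiles large images can apply the now-public non_max_suppression() to the combined detections. The sketch below is an editor's illustration, not part of the patch; `runner` stands in for an initialized TPURunner, `opts` for its Options instance, and the 0.4 score threshold is only an example value.

    # runner, opts and image are assumed to already exist; see note above
    objs, infr_ms, err = runner.process_image(opts, image, 0.4)  # example score threshold
    if objs:
        # De-duplicate overlapping boxes from neighbouring tiles
        keep = runner.non_max_suppression(objs, opts.iou_threshold)
        objs = [objs[i] for i in keep]
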
@@ -1116,7 +1092,7 @@ def _non_max_suppression(self, objects: list, threshold: float) -> list: if len(objects) == 1: return [0] - boxes = np.array([o.bbox for o in objects]) + boxes = np.asarray([o.bbox for o in objects]) x_mins = boxes[:, 0] y_mins = boxes[:, 1] x_maxs = boxes[:, 2] @@ -1161,54 +1137,50 @@ def _non_max_suppression(self, objects: list, threshold: float) -> list: def _resize_and_chop_tiles(self, options: Options, - image: Image, + image_full: Image, m_width: int, m_height: int): """ Image resizing is one of the more expensive things we're doing here. - It's expensive enough that it may take as much CPU time as inference - under some circumstances. The Lanczos resampling kernel in particular - is expensive, but results in quality output. - - For example, see the resizing performance charts here: - https://python-pillow.org/pillow-perf - - Pillow is the highly optimized version of PIL and it only runs at - ~100 MP/sec when making a thumbnail with the Lanczos kernel. That's - only 12.6 4k frames per second, maximum, in a Python process. We are - hoping to process more than that with TPU hardware. - - We can also improve performance by installing - the 'pillow-simd' Python library. And improve it even more by - re-compiling it to use AVX2 instructions. See: - https://github.com/uploadcare/pillow-simd#pillow-simd + It's expensive enough that it may take more wall time than inference + under some circumstances. """ - i_width, i_height = image.size + i_height, i_width, _ = image_full.shape # What tile dim do we want? tiles_x = int(max(1, round(i_width / (options.downsample_by * m_width)))) tiles_y = int(max(1, round(i_height / (options.downsample_by * m_height)))) logging.debug("Chunking to {} x {} tiles".format(tiles_x, tiles_y)) - # Fit image within target size - resamp_x = int(m_width + (tiles_x - 1) * (m_width - options.tile_overlap)) - resamp_y = int(m_height + (tiles_y - 1) * (m_height - options.tile_overlap)) + # Find target size for new image + resamp_x = m_width + (tiles_x - 1) * (m_width - options.tile_overlap) + resamp_y = m_height + (tiles_y - 1) * (m_height - options.tile_overlap) - # Chop & resize image piece - if image.mode != 'RGB': - image = image.convert('RGB') - image.thumbnail((resamp_x, resamp_y), Image.LANCZOS) - logging.debug("Resizing to {} x {} for tiling".format(image.width, image.height)) + # Find a scaled image size limiting to 10% distortion + if resamp_y / i_height > resamp_x / i_width: + fin_x = int(resamp_x) + fin_y = int(min(1.1 * i_height * resamp_x / i_width, resamp_y)) + else: + fin_x = int(min(1.1 * i_width * resamp_y / i_height, resamp_x)) + fin_y = int(resamp_y) + + # Chop & resize image + if isinstance(image, np.array): + if fin_x < i_width or fin_y < i_height: + image_full = cv2.resize(image_full, (fin_x, fin_y), interpolation=cv2.INTER_AREA) + image = cv2.cvtColor(image_full, cv2.COLOR_BGR2RGB) + img_h, img_w, _ = image.shape + else: + image.thumbnail((resamp_x, resamp_y), Image.LANCZOS) + img_h, img_w = image.height, image.width - # Rescale the input from uint8 - input_zero = float(self.input_details['quantization'][1]) - input_scale = 1.0 / (255.0 * self.input_details['quantization'][0]) + logging.debug(f"Resizing to {img_w}x{img_h} for tiling ({m_width}x{m_height} tensor)") # It'd be useful to print this once at the beginning of the run - key = "{} {}".format(*image.size) + key = "{} {}".format(img_h, img_w) if key not in self.printed_shape_map: logging.info( - "Mapping {} image to {}x{} tiles".format(image.size, tiles_x, tiles_y)) + 
f"Mapping {img_w}x{img_h} ({m_width}x{m_height} tensor) image to {tiles_x}x{tiles_y} tiles") self.printed_shape_map[key] = True # Do chunking @@ -1216,30 +1188,93 @@ def _resize_and_chop_tiles(self, tiles = [] step_x = 1 if tiles_x > 1: - step_x = int(math.ceil((image.width - m_width)/(tiles_x-1))) + step_x = int(math.ceil((img_w - m_width)/(tiles_x-1))) step_y = 1 if tiles_y > 1: - step_y = int(math.ceil((image.height - m_height)/(tiles_y-1))) + step_y = int(math.ceil((img_h - m_height)/(tiles_y-1))) - for x_off in range(0, max(image.width - m_width, 0) + tiles_x, step_x): - for y_off in range(0, max(image.height - m_height, 0) + tiles_y, step_y): + for x_off in range(0, max(img_w - m_width, 0) + tiles_x, step_x): + for y_off in range(0, max(img_h - m_height, 0) + tiles_y, step_y): # Adjust contrast on a per-chunk basis; we will likely be quantizing the image during scaling - image_chunk = ImageOps.autocontrast(image.crop((x_off, - y_off, - x_off + m_width, - y_off + m_height)), 1) - # Normalize to whatever the input is - cropped_arr = np.asarray(image_chunk, np.float32) * input_scale + input_zero + if isinstance(image, np.array): + cropped_arr = self._autocontrast_scale_np(image, (x_off, y_off, + x_off + m_width, + y_off + m_height)) + else: + cropped_arr = self._pil_autocontrast_scale_np(image, (x_off, y_off, + x_off + m_width, + y_off + m_height)) logging.debug("Resampled image tile {} at offset {}, {}".format(cropped_arr.shape, x_off, y_off)) - resamp_info = (x_off, y_off, i_width/image.width, i_height/image.height) + resamp_info = (x_off, y_off, i_width/img_w, i_height/img_h) + + # Cast and clip + tile_arr = np.zeros(cropped_arr.shape, dtype=self.input_details['dtype']) + dinfo = np.iinfo(self.input_details['dtype']) + np.clip(cropped_arr, dinfo.min, dinfo.max, out=tile_arr, casting='unsafe') - tiles.append((cropped_arr.astype(self.input_details['dtype']), resamp_info)) + # Ensure this is the tensor-ready size + tile_height, tile_width, tile_c = tile_arr.shape + if tile_width != m_width or tile_height != m_height: + tile_arr = np.pad(tile_arr, ((0, m_height - tile_height), (0, m_width - tile_width), (0, 0))) + tiles.append((tile_arr, resamp_info)) + # Debug: + #Image.fromarray((tiles[-1][0]).astype(np.uint8)).save(f"test_{x_off}_{y_off}.png") return tiles + def _pil_autocontrast_scale_np(self, image, crop_dim): + image_chunk = ImageOps.autocontrast(image.crop(crop_dim), 1) + return np.asarray(image_chunk, np.float32) * self.input_scale + self.input_zero + + def _autocontrast_scale_np(self, image, crop_dim): + cropped_img = image[crop_dim[1]:crop_dim[3],crop_dim[0]:crop_dim[2]] + + # Convert to gret for histogram + gray = cv2.cvtColor(cropped_img, cv2.COLOR_RGB2GRAY) + + # Calculate grayscale histogram + hist = cv2.calcHist([gray],[0],None,[256],[0,256]) + hist_size = len(hist) + + # Calculate cumulative distribution from the histogram + accumulator = [] + accumulator.append(float(hist[0])) + for index in range(1, hist_size): + accumulator.append(accumulator[index -1] + float(hist[index])) + + # Locate points to clip + maximum = accumulator[-1] + clip_hist_percent = 2.0 # == 2% pixels outside nominal input tensor range + clip_hist_percent *= (maximum/100.0) + clip_hist_percent /= 2.0 + + # Locate left cut + minimum_gray = 0 + while accumulator[minimum_gray] < clip_hist_percent: + minimum_gray += 1 + + # Locate right cut + maximum_gray = hist_size -1 + while accumulator[maximum_gray] >= (maximum - clip_hist_percent): + maximum_gray -= 1 + + # Calculate alpha and beta values + alpha = 
255.0 / (maximum_gray - minimum_gray) + beta = -minimum_gray * alpha + + # Combine the image tile contrast adjustment with the input tensor scaling. + # Advantages of doing this here in one step is: + # - Reducing quantization error. + # - Not clamping dynamic range to uint8 before scaling to the input tensor. + # - Fewer operations per pixel. + return np.asarray(cropped_img, np.float32) \ + * (alpha * self.input_scale) \ + + (beta * self.input_scale + self.input_zero) + + def _get_tiles(self, options: Options, image: Image): """ Returns an iterator that yields image tiles and associated location. From 429888ea898bf21cb8becebbace05cda39e1c6e8 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Sun, 16 Jun 2024 19:52:21 -0700 Subject: [PATCH 2/5] Small bugfixes --- options.py | 10 +++++++++- tpu_runner.py | 11 ++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/options.py b/options.py index 5db3823..e29b058 100644 --- a/options.py +++ b/options.py @@ -2,7 +2,15 @@ import platform # Import CodeProject.AI SDK -from codeproject_ai_sdk import ModuleOptions +try: + from codeproject_ai_sdk import ModuleOptions +except ImportError: + print("Unable to import CPAI SDK! Faking Options.") + class ModuleOptions: + module_path = '' + + def getEnvVariable(env_name, env_def): + return env_def class Settings: def __init__(self, model_name: str, model_name_pattern: str, std_model_name: str, diff --git a/tpu_runner.py b/tpu_runner.py index a927b53..4abae60 100644 --- a/tpu_runner.py +++ b/tpu_runner.py @@ -21,7 +21,11 @@ import logging import queue import math -import cv2 + +try: + import cv2 +except ImportError: + logging.info("Unable to import OpenCV in TPURunner.") import concurrent.futures from datetime import datetime @@ -1165,12 +1169,13 @@ def _resize_and_chop_tiles(self, fin_y = int(resamp_y) # Chop & resize image - if isinstance(image, np.array): + if isinstance(image_full, np.ndarray): if fin_x < i_width or fin_y < i_height: image_full = cv2.resize(image_full, (fin_x, fin_y), interpolation=cv2.INTER_AREA) image = cv2.cvtColor(image_full, cv2.COLOR_BGR2RGB) img_h, img_w, _ = image.shape else: + image = image_full image.thumbnail((resamp_x, resamp_y), Image.LANCZOS) img_h, img_w = image.height, image.width @@ -1196,7 +1201,7 @@ def _resize_and_chop_tiles(self, for x_off in range(0, max(img_w - m_width, 0) + tiles_x, step_x): for y_off in range(0, max(img_h - m_height, 0) + tiles_y, step_y): # Adjust contrast on a per-chunk basis; we will likely be quantizing the image during scaling - if isinstance(image, np.array): + if isinstance(image, np.ndarray): cropped_arr = self._autocontrast_scale_np(image, (x_off, y_off, x_off + m_width, y_off + m_height)) From beda4ce7caaf71baee9c51c230834b78712b14e9 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Mon, 24 Jun 2024 13:17:37 -0700 Subject: [PATCH 3/5] New TPU segment files --- options.py | 372 +++++++++++++++++++++++++++++++++----------------- tpu_runner.py | 4 +- 2 files changed, 249 insertions(+), 127 deletions(-) diff --git a/options.py b/options.py index e29b058..c3847bd 100644 --- a/options.py +++ b/options.py @@ -22,146 +22,250 @@ def __init__(self, model_name: str, model_name_pattern: str, std_model_name: str self.labels_name = labels_name self.MODEL_SEGMENTS = { - 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq': { - # 176.2 ms per inference (5.7 FPS) for 1 TPUs using 1 segment - # 97.8 ms per inference (10.2 FPS) for 2 TPUs using 2 segments - # 66.2 ms per inference (15.1 FPS) for 3 TPUs using 2 segments - # 48.8 ms per inference 
(20.5 FPS) for 4 TPUs using 1 segment - # 37.4 ms per inference (26.8 FPS) for 5 TPUs using 2 segments - 2: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], - 3: ['dumb_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'dumb_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], - 5: ['2x_last_seg_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], + 'tf2_ssd_mobilenet_v2_coco17_ptq': { +# 6.9 ms/inference (144.3 FPS; 11.7 tensor MPx/sec) for 1 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq +# 3.9 ms/inference (255.1 FPS; 20.8 tensor MPx/sec) for 2 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq +# 2.8 ms/inference (354.6 FPS; 28.8 tensor MPx/sec) for 3 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq +# 2.3 ms/inference (434.8 FPS; 35.4 tensor MPx/sec) for 4 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq +# 2.2 ms/inference (452.5 FPS; 36.7 tensor MPx/sec) for 5 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq +# 2.2 ms/inference (452.5 FPS; 36.7 tensor MPx/sec) for 6 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq + }, + 'ssd_mobilenet_v2_coco_quant_postprocess': { +# 7.1 ms/inference (140.1 FPS; 11.4 tensor MPx/sec) for 1 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess +# 3.9 ms/inference (259.1 FPS; 21.0 tensor MPx/sec) for 2 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess +# 2.7 ms/inference (366.3 FPS; 29.8 tensor MPx/sec) for 3 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess +# 2.2 ms/inference (444.4 FPS; 36.1 tensor MPx/sec) for 4 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess +# 2.1 ms/inference (478.5 FPS; 38.9 tensor MPx/sec) for 5 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess +# 2.1 ms/inference (478.5 FPS; 38.9 tensor MPx/sec) for 6 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess + }, + 'ssdlite_mobiledet_coco_qat_postprocess': { +# 8.8 ms/inference (113.8 FPS; 10.6 tensor MPx/sec) for 1 TPUs using 1 segments: ssdlite_mobiledet_coco_qat_postprocess +# 4.6 ms/inference (217.9 FPS; 20.2 tensor MPx/sec) for 2 TPUs using 1 segments: ssdlite_mobiledet_coco_qat_postprocess +# 3.3 ms/inference (305.8 FPS; 28.4 tensor MPx/sec) for 3 TPUs using 2 segments: ssdlite_mobiledet_coco_qat_postprocess +# 2.8 ms/inference (363.6 FPS; 33.9 tensor MPx/sec) for 4 TPUs using 2 segments: ssdlite_mobiledet_coco_qat_postprocess +# 2.8 ms/inference (363.6 FPS; 33.9 tensor MPx/sec) for 5 TPUs using 2 segments: ssdlite_mobiledet_coco_qat_postprocess +# 2.8 ms/inference (363.6 FPS; 33.9 tensor MPx/sec) for 6 TPUs using 2 segments: ssdlite_mobiledet_coco_qat_postprocess +3: ['15x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_0_of_2_edgetpu.tflite', '15x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_1_of_2_edgetpu.tflite'], +4: ['4x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_0_of_2_edgetpu.tflite', '4x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_1_of_2_edgetpu.tflite'], +5: ['4x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_0_of_2_edgetpu.tflite', '4x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_1_of_2_edgetpu.tflite'], +6: ['4x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_0_of_2_edgetpu.tflite', 
'4x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_1_of_2_edgetpu.tflite'], + }, + 'ssd_mobilenet_v1_coco_quant_postprocess': { +# 6.7 ms/inference (149.7 FPS; 12.2 tensor MPx/sec) for 1 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess +# 3.5 ms/inference (289.0 FPS; 23.5 tensor MPx/sec) for 2 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess +# 2.4 ms/inference (411.5 FPS; 33.5 tensor MPx/sec) for 3 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess +# 2.0 ms/inference (490.2 FPS; 39.8 tensor MPx/sec) for 4 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess +# 2.0 ms/inference (502.5 FPS; 40.8 tensor MPx/sec) for 5 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess +# 2.0 ms/inference (505.1 FPS; 41.1 tensor MPx/sec) for 6 TPUs using 2 segments: ssd_mobilenet_v1_coco_quant_postprocess +6: ['dumb_ssd_mobilenet_v1_coco_quant_postprocess_segment_0_of_2_edgetpu.tflite', 'dumb_ssd_mobilenet_v1_coco_quant_postprocess_segment_1_of_2_edgetpu.tflite'], + }, + 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq': { +# 175.7 ms/inference ( 5.7 FPS; 2.2 tensor MPx/sec) for 1 TPUs using 1 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +# 89.4 ms/inference ( 11.2 FPS; 4.4 tensor MPx/sec) for 2 TPUs using 1 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +# 60.9 ms/inference ( 16.4 FPS; 6.4 tensor MPx/sec) for 3 TPUs using 1 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +# 46.3 ms/inference ( 21.6 FPS; 8.4 tensor MPx/sec) for 4 TPUs using 1 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +# 38.6 ms/inference ( 25.9 FPS; 10.1 tensor MPx/sec) for 5 TPUs using 2 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +# 34.5 ms/inference ( 29.0 FPS; 11.3 tensor MPx/sec) for 6 TPUs using 2 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +5: ['2x_last_seg_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], +6: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], + }, + 'efficientdet_lite0_320_ptq': { +# 23.2 ms/inference ( 43.2 FPS; 4.0 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite0_320_ptq +# 13.9 ms/inference ( 71.7 FPS; 6.7 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite0_320_ptq +# 9.8 ms/inference (102.4 FPS; 9.5 tensor MPx/sec) for 3 TPUs using 2 segments: efficientdet_lite0_320_ptq +# 9.2 ms/inference (109.3 FPS; 10.2 tensor MPx/sec) for 4 TPUs using 2 segments: efficientdet_lite0_320_ptq +# 7.7 ms/inference (129.5 FPS; 12.1 tensor MPx/sec) for 5 TPUs using 2 segments: efficientdet_lite0_320_ptq +# 7.5 ms/inference (134.0 FPS; 12.5 tensor MPx/sec) for 6 TPUs using 2 segments: efficientdet_lite0_320_ptq +3: ['3x_first_seg_efficientdet_lite0_320_ptq_segment_0_of_2_edgetpu.tflite', '3x_first_seg_efficientdet_lite0_320_ptq_segment_1_of_2_edgetpu.tflite'], +4: ['133x_first_seg_efficientdet_lite0_320_ptq_segment_0_of_2_edgetpu.tflite', '133x_first_seg_efficientdet_lite0_320_ptq_segment_1_of_2_edgetpu.tflite'], +5: ['133x_first_seg_efficientdet_lite0_320_ptq_segment_0_of_2_edgetpu.tflite', '133x_first_seg_efficientdet_lite0_320_ptq_segment_1_of_2_edgetpu.tflite'], +6: ['166x_first_seg_efficientdet_lite0_320_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite0_320_ptq_segment_1_of_2_edgetpu.tflite'], + }, + 
'efficientdet_lite1_384_ptq': { +# 34.7 ms/inference ( 28.8 FPS; 3.9 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite1_384_ptq +# 21.5 ms/inference ( 46.5 FPS; 6.3 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite1_384_ptq +# 14.0 ms/inference ( 71.5 FPS; 9.7 tensor MPx/sec) for 3 TPUs using 1 segments: efficientdet_lite1_384_ptq +# 12.3 ms/inference ( 81.5 FPS; 11.1 tensor MPx/sec) for 4 TPUs using 2 segments: efficientdet_lite1_384_ptq +# 11.2 ms/inference ( 89.0 FPS; 12.1 tensor MPx/sec) for 5 TPUs using 2 segments: efficientdet_lite1_384_ptq +# 10.6 ms/inference ( 94.6 FPS; 12.9 tensor MPx/sec) for 6 TPUs using 2 segments: efficientdet_lite1_384_ptq +4: ['dumb_efficientdet_lite1_384_ptq_segment_0_of_2_edgetpu.tflite', 'dumb_efficientdet_lite1_384_ptq_segment_1_of_2_edgetpu.tflite'], +5: ['133x_first_seg_efficientdet_lite1_384_ptq_segment_0_of_2_edgetpu.tflite', '133x_first_seg_efficientdet_lite1_384_ptq_segment_1_of_2_edgetpu.tflite'], +6: ['15x_first_seg_efficientdet_lite1_384_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite1_384_ptq_segment_1_of_2_edgetpu.tflite'], }, 'efficientdet_lite2_448_ptq': { - # 60.0 ms per inference (16.7 FPS) for 1 TPUs using 1 segment - # 30.6 ms per inference (32.7 FPS) for 2 TPUs using 1 segment - # 20.4 ms per inference (49.1 FPS) for 3 TPUs using 2 segments - # 17.4 ms per inference (57.4 FPS) for 4 TPUs using 2 segments - # 14.5 ms per inference (68.8 FPS) for 5 TPUs using 2 segments - 3: ['2x_last_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], - 4: ['all_segments_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], - 5: ['166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], +# 60.6 ms/inference ( 16.5 FPS; 3.1 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite2_448_ptq +# 31.4 ms/inference ( 31.9 FPS; 6.0 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite2_448_ptq +# 20.6 ms/inference ( 48.7 FPS; 9.1 tensor MPx/sec) for 3 TPUs using 2 segments: efficientdet_lite2_448_ptq +# 18.1 ms/inference ( 55.2 FPS; 10.4 tensor MPx/sec) for 4 TPUs using 2 segments: efficientdet_lite2_448_ptq +# 15.3 ms/inference ( 65.4 FPS; 12.3 tensor MPx/sec) for 5 TPUs using 2 segments: efficientdet_lite2_448_ptq +# 14.4 ms/inference ( 69.3 FPS; 13.0 tensor MPx/sec) for 6 TPUs using 3 segments: efficientdet_lite2_448_ptq +3: ['2x_last_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], +4: ['4x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '4x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], +5: ['166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], +6: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_3_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_3_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_2_of_3_edgetpu.tflite'], }, 'efficientdet_lite3_512_ptq': { - # 75.7 ms per inference (13.2 FPS) for 1 TPUs using 1 segment - # 38.1 ms per inference (26.2 FPS) for 2 TPUs using 1 segment - # 26.8 ms per inference (37.3 FPS) for 3 TPUs using 1 segment - # 20.7 ms per inference 
(48.4 FPS) for 4 TPUs using 1 segment - # 18.0 ms per inference (55.5 FPS) for 5 TPUs using 1 segment +# 76.7 ms/inference ( 13.0 FPS; 3.2 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite3_512_ptq +# 38.5 ms/inference ( 25.9 FPS; 6.4 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite3_512_ptq +# 26.9 ms/inference ( 37.2 FPS; 9.2 tensor MPx/sec) for 3 TPUs using 1 segments: efficientdet_lite3_512_ptq +# 21.1 ms/inference ( 47.3 FPS; 11.7 tensor MPx/sec) for 4 TPUs using 1 segments: efficientdet_lite3_512_ptq +# 17.5 ms/inference ( 57.3 FPS; 14.2 tensor MPx/sec) for 5 TPUs using 1 segments: efficientdet_lite3_512_ptq +# 17.1 ms/inference ( 58.4 FPS; 14.4 tensor MPx/sec) for 6 TPUs using 2 segments: efficientdet_lite3_512_ptq +6: ['2x_last_seg_efficientdet_lite3_512_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_efficientdet_lite3_512_ptq_segment_1_of_2_edgetpu.tflite'], }, 'efficientdet_lite3x_640_ptq': { - # 181.6 ms per inference (5.5 FPS) for 1 TPUs using 1 segment - # 91.5 ms per inference (10.9 FPS) for 2 TPUs using 1 segment - # 62.9 ms per inference (15.9 FPS) for 3 TPUs using 2 segments - # 49.6 ms per inference (20.2 FPS) for 4 TPUs using 1 segment - # 40.4 ms per inference (24.7 FPS) for 5 TPUs using 2 segments - 3: ['2x_last_seg_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], - 5: ['15x_first_seg_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], +# 180.8 ms/inference ( 5.5 FPS; 2.2 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite3x_640_ptq +# 93.6 ms/inference ( 10.7 FPS; 4.2 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite3x_640_ptq +# 63.9 ms/inference ( 15.7 FPS; 6.1 tensor MPx/sec) for 3 TPUs using 2 segments: efficientdet_lite3x_640_ptq +# 48.1 ms/inference ( 20.8 FPS; 8.1 tensor MPx/sec) for 4 TPUs using 2 segments: efficientdet_lite3x_640_ptq +# 40.9 ms/inference ( 24.5 FPS; 9.6 tensor MPx/sec) for 5 TPUs using 2 segments: efficientdet_lite3x_640_ptq +# 35.5 ms/inference ( 28.2 FPS; 11.0 tensor MPx/sec) for 6 TPUs using 2 segments: efficientdet_lite3x_640_ptq +3: ['2x_last_seg_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], +4: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], +5: ['15x_first_seg_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], +6: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], + }, + 'yolov5n-int8': { +# 27.2 ms/inference ( 36.7 FPS; 6.9 tensor MPx/sec) for 1 TPUs using 1 segments: yolov5n-int8 +# 17.2 ms/inference ( 58.2 FPS; 10.9 tensor MPx/sec) for 2 TPUs using 1 segments: yolov5n-int8 +# 12.4 ms/inference ( 80.3 FPS; 15.1 tensor MPx/sec) for 3 TPUs using 2 segments: yolov5n-int8 +# 12.0 ms/inference ( 83.1 FPS; 15.6 tensor MPx/sec) for 4 TPUs using 2 segments: yolov5n-int8 +# 10.9 ms/inference ( 91.4 FPS; 17.1 tensor MPx/sec) for 5 TPUs using 2 segments: yolov5n-int8 +# 10.9 ms/inference ( 91.4 FPS; 17.1 tensor MPx/sec) for 6 TPUs using 2 segments: yolov5n-int8 +3: ['4x_first_seg_yolov5n-int8_segment_0_of_2_edgetpu.tflite', 
'4x_first_seg_yolov5n-int8_segment_1_of_2_edgetpu.tflite'], +4: ['133x_first_seg_yolov5n-int8_segment_0_of_2_edgetpu.tflite', '133x_first_seg_yolov5n-int8_segment_1_of_2_edgetpu.tflite'], +5: ['4x_first_seg_yolov5n-int8_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov5n-int8_segment_1_of_2_edgetpu.tflite'], +6: ['4x_first_seg_yolov5n-int8_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov5n-int8_segment_1_of_2_edgetpu.tflite'], }, 'yolov5s-int8': { - # 36.4 ms per inference (27.5 FPS) for 1 TPUs using 1 segment - # 18.9 ms per inference (53.0 FPS) for 2 TPUs using 1 segment - # 14.4 ms per inference (69.7 FPS) for 3 TPUs using 1 segment - # 11.7 ms per inference (85.4 FPS) for 4 TPUs using 1 segment - # 10.8 ms per inference (92.6 FPS) for 5 TPUs using 1 segment +# 39.9 ms/inference ( 25.1 FPS; 4.7 tensor MPx/sec) for 1 TPUs using 1 segments: yolov5s-int8 +# 22.3 ms/inference ( 44.9 FPS; 8.4 tensor MPx/sec) for 2 TPUs using 1 segments: yolov5s-int8 +# 15.0 ms/inference ( 66.6 FPS; 12.5 tensor MPx/sec) for 3 TPUs using 2 segments: yolov5s-int8 +# 11.7 ms/inference ( 85.5 FPS; 16.0 tensor MPx/sec) for 4 TPUs using 2 segments: yolov5s-int8 +# 11.3 ms/inference ( 88.7 FPS; 16.6 tensor MPx/sec) for 5 TPUs using 2 segments: yolov5s-int8 +# 10.0 ms/inference (100.4 FPS; 18.8 tensor MPx/sec) for 6 TPUs using 2 segments: yolov5s-int8 +3: ['166x_first_seg_yolov5s-int8_segment_0_of_2_edgetpu.tflite', '166x_first_seg_yolov5s-int8_segment_1_of_2_edgetpu.tflite'], +4: ['4x_first_seg_yolov5s-int8_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov5s-int8_segment_1_of_2_edgetpu.tflite'], +5: ['166x_first_seg_yolov5s-int8_segment_0_of_2_edgetpu.tflite', '166x_first_seg_yolov5s-int8_segment_1_of_2_edgetpu.tflite'], +6: ['4x_first_seg_yolov5s-int8_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov5s-int8_segment_1_of_2_edgetpu.tflite'], }, 'yolov5m-int8': { - # 100.3 ms per inference (10.0 FPS) for 1 TPUs using 1 segment - # 50.5 ms per inference (19.8 FPS) for 2 TPUs using 1 segment - # 31.7 ms per inference (31.5 FPS) for 3 TPUs using 2 segments - # 26.0 ms per inference (38.5 FPS) for 4 TPUs using 2 segments - # 20.1 ms per inference (49.9 FPS) for 5 TPUs using 2 segments - 3: ['15x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], - 4: ['4x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], - 5: ['4x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], +# 100.9 ms/inference ( 9.9 FPS; 1.9 tensor MPx/sec) for 1 TPUs using 1 segments: yolov5m-int8 +# 50.6 ms/inference ( 19.8 FPS; 3.7 tensor MPx/sec) for 2 TPUs using 1 segments: yolov5m-int8 +# 31.7 ms/inference ( 31.6 FPS; 5.9 tensor MPx/sec) for 3 TPUs using 2 segments: yolov5m-int8 +# 25.8 ms/inference ( 38.8 FPS; 7.3 tensor MPx/sec) for 4 TPUs using 2 segments: yolov5m-int8 +# 19.9 ms/inference ( 50.1 FPS; 9.4 tensor MPx/sec) for 5 TPUs using 2 segments: yolov5m-int8 +# 16.9 ms/inference ( 59.1 FPS; 11.1 tensor MPx/sec) for 6 TPUs using 2 segments: yolov5m-int8 +3: ['15x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], +4: ['166x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '166x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], +5: ['3x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], +6: 
['15x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], }, 'yolov5l-int8': { - # 182.8 ms per inference (5.5 FPS) for 1 TPUs using 1 segment - # 85.6 ms per inference (11.7 FPS) for 2 TPUs using 2 segments - # 56.5 ms per inference (17.7 FPS) for 3 TPUs using 2 segments - # 43.8 ms per inference (22.8 FPS) for 4 TPUs using 2 segments - # 34.0 ms per inference (29.4 FPS) for 5 TPUs using 3 segments - 2: ['dumb_yolov5l-int8_segment_0_of_2_edgetpu.tflite', 'dumb_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], - 3: ['2x_last_seg_yolov5l-int8_segment_0_of_2_edgetpu.tflite', '2x_last_seg_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], - 4: ['dumb_yolov5l-int8_segment_0_of_2_edgetpu.tflite', 'dumb_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], - 5: ['3x_first_seg_yolov5l-int8_segment_0_of_3_edgetpu.tflite', '3x_first_seg_yolov5l-int8_segment_1_of_3_edgetpu.tflite', '3x_first_seg_yolov5l-int8_segment_2_of_3_edgetpu.tflite'], +# 183.5 ms/inference ( 5.4 FPS; 1.0 tensor MPx/sec) for 1 TPUs using 1 segments: yolov5l-int8 +# 85.5 ms/inference ( 11.7 FPS; 2.2 tensor MPx/sec) for 2 TPUs using 2 segments: yolov5l-int8 +# 55.2 ms/inference ( 18.1 FPS; 3.4 tensor MPx/sec) for 3 TPUs using 2 segments: yolov5l-int8 +# 43.6 ms/inference ( 22.9 FPS; 4.3 tensor MPx/sec) for 4 TPUs using 2 segments: yolov5l-int8 +# 34.2 ms/inference ( 29.2 FPS; 5.5 tensor MPx/sec) for 5 TPUs using 2 segments: yolov5l-int8 +# 30.0 ms/inference ( 33.3 FPS; 6.2 tensor MPx/sec) for 6 TPUs using 3 segments: yolov5l-int8 +2: ['dumb_yolov5l-int8_segment_0_of_2_edgetpu.tflite', 'dumb_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], +3: ['2x_last_seg_yolov5l-int8_segment_0_of_2_edgetpu.tflite', '2x_last_seg_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], +4: ['3x_first_seg_yolov5l-int8_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], +5: ['3x_first_seg_yolov5l-int8_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], +6: ['4x_first_seg_yolov5l-int8_segment_0_of_3_edgetpu.tflite', '4x_first_seg_yolov5l-int8_segment_1_of_3_edgetpu.tflite', '4x_first_seg_yolov5l-int8_segment_2_of_3_edgetpu.tflite'], + }, + 'yolov8n_416_640px': { +# 23.7 ms/inference ( 42.1 FPS; 9.7 tensor MPx/sec) for 1 TPUs using 1 segments: yolov8n_384_640px +# 12.1 ms/inference ( 82.9 FPS; 19.1 tensor MPx/sec) for 2 TPUs using 1 segments: yolov8n_384_640px +# 9.1 ms/inference (109.8 FPS; 25.3 tensor MPx/sec) for 3 TPUs using 1 segments: yolov8n_384_640px +# 7.6 ms/inference (131.6 FPS; 30.3 tensor MPx/sec) for 4 TPUs using 1 segments: yolov8n_384_640px +# 7.0 ms/inference (142.2 FPS; 32.8 tensor MPx/sec) for 5 TPUs using 1 segments: yolov8n_384_640px +# 6.5 ms/inference (154.3 FPS; 35.6 tensor MPx/sec) for 6 TPUs using 1 segments: yolov8n_384_640px }, 'yolov8s_416_640px': { - # 67.5 ms per inference (14.8 FPS) for 1 TPUs using 1 segment - # 34.5 ms per inference (29.0 FPS) for 2 TPUs using 1 segment - # 22.8 ms per inference (43.8 FPS) for 3 TPUs using 1 segment - # 17.0 ms per inference (58.9 FPS) for 4 TPUs using 2 segments - # 13.1 ms per inference (76.1 FPS) for 5 TPUs using 2 segments - 4: ['3x_first_seg_yolov8s_416_640px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov8s_416_640px_segment_1_of_2_edgetpu.tflite'], - 5: ['3x_first_seg_yolov8s_416_640px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov8s_416_640px_segment_1_of_2_edgetpu.tflite'], +# 46.5 ms/inference ( 21.5 FPS; 4.7 tensor MPx/sec) for 1 TPUs using 1 segments: 
yolov8s_384_608px +# 23.5 ms/inference ( 42.5 FPS; 9.3 tensor MPx/sec) for 2 TPUs using 1 segments: yolov8s_384_608px +# 16.2 ms/inference ( 61.7 FPS; 13.5 tensor MPx/sec) for 3 TPUs using 1 segments: yolov8s_384_608px +# 10.9 ms/inference ( 91.8 FPS; 18.4 tensor MPx/sec) for 4 TPUs using 2 segments: yolov8s_352_608px +# 9.8 ms/inference (102.0 FPS; 22.3 tensor MPx/sec) for 5 TPUs using 2 segments: yolov8s_384_608px +# 8.9 ms/inference (112.0 FPS; 24.5 tensor MPx/sec) for 6 TPUs using 1 segments: yolov8s_384_608px +4: ['4x_first_seg_yolov8s_352_608px_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov8s_352_608px_segment_1_of_2_edgetpu.tflite'], +5: ['3x_first_seg_yolov8s_384_608px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov8s_384_608px_segment_1_of_2_edgetpu.tflite'], }, 'yolov8m_416_640px': { - # 272.3 ms per inference (3.7 FPS) for 1 TPUs using 1 segment - # 95.6 ms per inference (10.5 FPS) for 2 TPUs using 2 segments - # 59.5 ms per inference (16.8 FPS) for 3 TPUs using 3 segments - # 43.8 ms per inference (22.8 FPS) for 4 TPUs using 2 segments - # 35.5 ms per inference (28.2 FPS) for 5 TPUs using 2 segments - 2: ['all_segments_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], - 3: ['all_segments_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'], - 4: ['2x_first_seg_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], - 5: ['3x_first_seg_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], +# 188.7 ms/inference ( 5.3 FPS; 1.1 tensor MPx/sec) for 1 TPUs using 1 segments: yolov8m_352_608px +# 95.1 ms/inference ( 10.5 FPS; 2.6 tensor MPx/sec) for 2 TPUs using 2 segments: yolov8m_416_640px +# 58.7 ms/inference ( 17.0 FPS; 4.3 tensor MPx/sec) for 3 TPUs using 2 segments: yolov8m_416_640px +# 44.0 ms/inference ( 22.7 FPS; 5.7 tensor MPx/sec) for 4 TPUs using 2 segments: yolov8m_416_640px +# 35.5 ms/inference ( 28.1 FPS; 7.0 tensor MPx/sec) for 5 TPUs using 2 segments: yolov8m_416_640px +# 30.9 ms/inference ( 32.4 FPS; 8.1 tensor MPx/sec) for 6 TPUs using 3 segments: yolov8m_416_640px +2: ['all_segments_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], +3: ['2x_last_seg_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '2x_last_seg_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], +4: ['2x_first_seg_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], +5: ['4x_first_seg_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], +6: ['133x_first_seg_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', '133x_first_seg_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', '133x_first_seg_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'], }, 'yolov8l_416_640px': { - # 1053.4 ms per inference (0.9 FPS) for 1 TPUs using 1 segment - # 155.1 ms per inference (6.4 FPS) for 2 TPUs using 2 segments - # 98.1 ms per inference (10.2 FPS) for 3 TPUs using 2 segments - # 78.3 ms per inference (12.8 FPS) for 4 TPUs using 2 segments - # 61.4 ms per inference (16.3 FPS) for 5 TPUs using 2 segments - 2: ['all_segments_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', 
'all_segments_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], - 3: ['15x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], - 4: ['all_segments_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], - 5: ['4x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], +# 236.2 ms/inference ( 4.2 FPS; 0.8 tensor MPx/sec) for 1 TPUs using 1 segments: yolov8l_352_608px +# 118.1 ms/inference ( 8.5 FPS; 1.7 tensor MPx/sec) for 2 TPUs using 1 segments: yolov8l_352_608px +# 85.2 ms/inference ( 11.7 FPS; 2.6 tensor MPx/sec) for 3 TPUs using 2 segments: yolov8l_384_608px +# 59.8 ms/inference ( 16.7 FPS; 3.3 tensor MPx/sec) for 4 TPUs using 1 segments: yolov8l_352_608px +# 46.1 ms/inference ( 21.7 FPS; 4.3 tensor MPx/sec) for 5 TPUs using 2 segments: yolov8l_352_608px +# 51.1 ms/inference ( 19.6 FPS; 4.9 tensor MPx/sec) for 6 TPUs using 3 segments: yolov8l_416_640px +3: ['2x_first_seg_yolov8l_384_608px_segment_0_of_2_edgetpu.tflite', '2x_first_seg_yolov8l_384_608px_segment_1_of_2_edgetpu.tflite'], +5: ['4x_first_seg_yolov8l_352_608px_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov8l_352_608px_segment_1_of_2_edgetpu.tflite'], +6: ['4x_first_seg_yolov8l_416_640px_segment_0_of_3_edgetpu.tflite', '4x_first_seg_yolov8l_416_640px_segment_1_of_3_edgetpu.tflite', '4x_first_seg_yolov8l_416_640px_segment_2_of_3_edgetpu.tflite'], }, - 'yolov8s_640px': { - # 541.0 ms per inference (1.8 FPS) for 1 TPUs using 1 segment - # 83.7 ms per inference (11.9 FPS) for 2 TPUs using 2 segments - # 54.1 ms per inference (18.5 FPS) for 3 TPUs using 3 segments - # 40.8 ms per inference (24.5 FPS) for 4 TPUs using 3 segments - # 32.9 ms per inference (30.4 FPS) for 5 TPUs using 3 segments - 2: ['15x_last_seg_yolov8s_640px_segment_0_of_2_edgetpu.tflite', '15x_last_seg_yolov8s_640px_segment_1_of_2_edgetpu.tflite'], - 3: ['all_segments_yolov8s_640px_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov8s_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8s_640px_segment_2_of_3_edgetpu.tflite'], - 4: ['all_segments_yolov8s_640px_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov8s_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8s_640px_segment_2_of_3_edgetpu.tflite'], - 5: ['all_segments_yolov8s_640px_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov8s_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8s_640px_segment_2_of_3_edgetpu.tflite'], + 'yolov9t_416_640px': { +# 29.3 ms/inference ( 34.1 FPS; 7.9 tensor MPx/sec) for 1 TPUs using 1 segments: yolov9t_384_640px +# 14.6 ms/inference ( 68.6 FPS; 15.8 tensor MPx/sec) for 2 TPUs using 1 segments: yolov9t_384_640px +# 10.3 ms/inference ( 96.7 FPS; 22.3 tensor MPx/sec) for 3 TPUs using 1 segments: yolov9t_384_640px +# 8.3 ms/inference (120.2 FPS; 27.7 tensor MPx/sec) for 4 TPUs using 1 segments: yolov9t_384_640px +# 7.3 ms/inference (137.7 FPS; 31.8 tensor MPx/sec) for 5 TPUs using 1 segments: yolov9t_384_640px +# 6.6 ms/inference (151.1 FPS; 34.8 tensor MPx/sec) for 6 TPUs using 1 segments: yolov9t_384_640px }, - 'yolov8m_640px': { - # 353.8 ms per inference (2.8 FPS) for 1 TPUs using 1 segment - # 165.9 ms per inference (6.0 FPS) for 2 TPUs using 2 segments - # 95.4 ms per inference (10.5 FPS) for 3 TPUs using 2 segments - # 71.9 ms per inference (13.9 FPS) for 4 TPUs using 2 segments - # 56.6 ms per inference (17.7 FPS) for 5 TPUs using 2 segments - 2: 
['all_segments_yolov8m_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8m_640px_segment_1_of_2_edgetpu.tflite'], - 3: ['2x_first_seg_yolov8m_640px_segment_0_of_2_edgetpu.tflite', '2x_first_seg_yolov8m_640px_segment_1_of_2_edgetpu.tflite'], - 4: ['3x_first_seg_yolov8m_640px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov8m_640px_segment_1_of_2_edgetpu.tflite'], - 5: ['4x_first_seg_yolov8m_640px_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov8m_640px_segment_1_of_2_edgetpu.tflite'], + 'yolov9s_416_640px': { +# 45.9 ms/inference ( 21.8 FPS; 4.1 tensor MPx/sec) for 1 TPUs using 1 segments: yolov9s_352_576px +# 22.8 ms/inference ( 43.9 FPS; 8.3 tensor MPx/sec) for 2 TPUs using 1 segments: yolov9s_352_576px +# 15.3 ms/inference ( 65.1 FPS; 12.3 tensor MPx/sec) for 3 TPUs using 1 segments: yolov9s_352_576px +# 11.7 ms/inference ( 85.4 FPS; 16.1 tensor MPx/sec) for 4 TPUs using 1 segments: yolov9s_352_576px +# 10.3 ms/inference ( 97.3 FPS; 19.4 tensor MPx/sec) for 5 TPUs using 1 segments: yolov9s_352_608px +# 8.3 ms/inference (120.5 FPS; 22.8 tensor MPx/sec) for 6 TPUs using 2 segments: yolov9s_352_576px +6: ['3x_first_seg_yolov9s_352_576px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov9s_352_576px_segment_1_of_2_edgetpu.tflite'], }, - 'yolov8l_640px': { - # 1517.3 ms per inference (0.7 FPS) for 1 TPUs using 1 segment - # 389.8 ms per inference (2.6 FPS) for 2 TPUs using 2 segments - # 206.5 ms per inference (4.8 FPS) for 3 TPUs using 2 segments - # 149.0 ms per inference (6.7 FPS) for 4 TPUs using 2 segments - # 132.4 ms per inference (7.6 FPS) for 5 TPUs using 2 segments - 2: ['15x_first_seg_yolov8l_640px_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov8l_640px_segment_1_of_2_edgetpu.tflite'], - 3: ['15x_first_seg_yolov8l_640px_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov8l_640px_segment_1_of_2_edgetpu.tflite'], - 4: ['2x_last_seg_yolov8l_640px_segment_0_of_2_edgetpu.tflite', '2x_last_seg_yolov8l_640px_segment_1_of_2_edgetpu.tflite'], - 5: ['2x_first_seg_yolov8l_640px_segment_0_of_2_edgetpu.tflite', '2x_first_seg_yolov8l_640px_segment_1_of_2_edgetpu.tflite'], + 'yolov9m_416_640px': { +# 148.0 ms/inference ( 6.8 FPS; 1.3 tensor MPx/sec) for 1 TPUs using 1 segments: yolov9m_352_576px +# 73.8 ms/inference ( 13.5 FPS; 2.6 tensor MPx/sec) for 2 TPUs using 1 segments: yolov9m_352_576px +# 49.6 ms/inference ( 20.2 FPS; 3.8 tensor MPx/sec) for 3 TPUs using 1 segments: yolov9m_352_576px +# 37.1 ms/inference ( 26.9 FPS; 5.1 tensor MPx/sec) for 4 TPUs using 1 segments: yolov9m_352_576px +# 35.4 ms/inference ( 28.3 FPS; 6.2 tensor MPx/sec) for 5 TPUs using 1 segments: yolov9m_384_608px +# 33.5 ms/inference ( 29.9 FPS; 7.5 tensor MPx/sec) for 6 TPUs using 2 segments: yolov9m_416_640px +6: ['3x_first_seg_yolov9m_416_640px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov9m_416_640px_segment_1_of_2_edgetpu.tflite'], }, - 'ipcam-general-v8': { - # 241.2 ms per inference (4.1 FPS) for 1 TPUs using 1 segment - # 44.7 ms per inference (22.4 FPS) for 2 TPUs using 2 segments - # 22.5 ms per inference (44.4 FPS) for 3 TPUs using 2 segments - # 16.1 ms per inference (62.0 FPS) for 4 TPUs using 2 segments - # 12.2 ms per inference (82.2 FPS) for 5 TPUs using 2 segments - 2: ['15x_last_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], - 3: ['166x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '166x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], - 4: 
['2x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '2x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], - 5: ['2x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '2x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], - + 'yolov9c_416_640px': { +# 306.7 ms/inference ( 3.3 FPS; 0.8 tensor MPx/sec) for 1 TPUs using 1 segments: yolov9c_416_640px +# 153.2 ms/inference ( 6.5 FPS; 1.6 tensor MPx/sec) for 2 TPUs using 1 segments: yolov9c_416_640px +# 103.0 ms/inference ( 9.7 FPS; 2.4 tensor MPx/sec) for 3 TPUs using 1 segments: yolov9c_416_640px +# 74.6 ms/inference ( 13.4 FPS; 3.4 tensor MPx/sec) for 4 TPUs using 2 segments: yolov9c_416_640px +# 59.5 ms/inference ( 16.8 FPS; 4.2 tensor MPx/sec) for 5 TPUs using 2 segments: yolov9c_416_640px +# 47.1 ms/inference ( 21.2 FPS; 4.6 tensor MPx/sec) for 6 TPUs using 2 segments: yolov9c_384_608px +4: ['dumb_yolov9c_416_640px_segment_0_of_2_edgetpu.tflite', 'dumb_yolov9c_416_640px_segment_1_of_2_edgetpu.tflite'], +5: ['15x_last_seg_yolov9c_416_640px_segment_0_of_2_edgetpu.tflite', '15x_last_seg_yolov9c_416_640px_segment_1_of_2_edgetpu.tflite'], +6: ['all_segments_yolov9c_384_608px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov9c_384_608px_segment_1_of_2_edgetpu.tflite'], }, + 'ipcam-general-v8': { +# 233.2 ms/inference ( 4.3 FPS; 1.1 tensor MPx/sec) for 1 TPUs using 1 segments: ipcam-general-v8 +# 44.6 ms/inference ( 22.4 FPS; 5.6 tensor MPx/sec) for 2 TPUs using 2 segments: ipcam-general-v8 +# 22.7 ms/inference ( 44.1 FPS; 11.1 tensor MPx/sec) for 3 TPUs using 2 segments: ipcam-general-v8 +# 16.1 ms/inference ( 62.0 FPS; 15.6 tensor MPx/sec) for 4 TPUs using 2 segments: ipcam-general-v8 +# 12.4 ms/inference ( 80.8 FPS; 20.3 tensor MPx/sec) for 5 TPUs using 2 segments: ipcam-general-v8 +# 10.5 ms/inference ( 95.5 FPS; 23.9 tensor MPx/sec) for 6 TPUs using 2 segments: ipcam-general-v8 +2: ['inc_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', 'inc_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], +3: ['all_segments_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', 'all_segments_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], +4: ['2x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '2x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], +5: ['3x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '3x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], +6: ['2x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '2x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], + } } self.tpu_segments_lists = {} @@ -183,26 +287,44 @@ def __init__(self): # # YOLOv8 benchmarked with 3 CPU cores and 6 PCIe TPUs self.MODEL_SETTINGS = { + "yolov9": { + "large": Settings('YOLOv9', 'yolov9c_416_640px', + 'yolov9c_416_640px.tflite', # 46Mb CPU + 'yolov9c_416_640px_edgetpu.tflite', # 48Mb TPU + 'coco_labels.txt'), + "medium": Settings('YOLOv9', 'yolov9m_416_640px', \ + 'yolov9m_352_576px.tflite', # 21Mb CPU + 'yolov9m_352_576px_edgetpu.tflite', # 22Mb TPU + 'coco_labels.txt'), + "small": Settings('YOLOv9', 'yolov9s_416_640px', + 'yolov9s_352_576px.tflite', # 11Mb CPU + 'yolov9s_352_576px_edgetpu.tflite', # 12Mb TPU + 'coco_labels.txt'), + "tiny": Settings('YOLOv9', 'yolov9t_416_640px', + 'yolov9t_384_640px.tflite', # 4Mb CPU + 'yolov9t_384_640px_edgetpu.tflite', # 3Mb TPU + 'coco_labels.txt') + }, "yolov8": { # 59.88 ms throughput / 855.40 ms inference "large": Settings('YOLOv8', 'yolov8l_416_640px', - 'yolov8l_416_640px.tflite', # 46Mb CPU - 
'yolov8l_416_640px_edgetpu.tflite', # 48Mb TPU + 'yolov8l_352_608px.tflite', # 46Mb CPU + 'yolov8l_352_608px_edgetpu.tflite', # 48Mb TPU 'coco_labels.txt'), # 53.72 ms throughput / 762.86 ms inference "medium": Settings('YOLOv8', 'yolov8m_416_640px', \ - 'yolov8m_416_640px.tflite', # 21Mb CPU - 'yolov8m_416_640px_edgetpu.tflite', # 22Mb TPU + 'yolov8m_352_608px.tflite', # 21Mb CPU + 'yolov8m_352_608px_edgetpu.tflite', # 22Mb TPU 'coco_labels.txt'), # 21.52 ms throughput / 291.35 ms inference "small": Settings('YOLOv8', 'yolov8s_416_640px', - 'yolov8s_416_640px.tflite', # 11Mb CPU - 'yolov8s_416_640px_edgetpu.tflite', # 12Mb TPU + 'yolov8s_384_608px.tflite', # 11Mb CPU + 'yolov8s_384_608px_edgetpu.tflite', # 12Mb TPU 'coco_labels.txt'), # 10.35 ms throughput / 123.35 ms inference "tiny": Settings('YOLOv8', 'yolov8n_416_640px', - 'yolov8n_416_640px.tflite', # 4Mb CPU - 'yolov8n_416_640px_edgetpu.tflite', # 3Mb TPU + 'yolov8n_384_640px.tflite', # 4Mb CPU + 'yolov8n_384_640px_edgetpu.tflite', # 3Mb TPU 'coco_labels.txt') }, "yolov5": { diff --git a/tpu_runner.py b/tpu_runner.py index 4abae60..ed73471 100644 --- a/tpu_runner.py +++ b/tpu_runner.py @@ -571,9 +571,9 @@ def _get_model_filenames(self, options: Options, tpu_list: list) -> list: # We have a list of segment files if isinstance(options.tpu_segments_lists, dict): # Look for a good match between available TPUs and segment counts - # Prioritize first match. Note we have only tested up to 8 TPUs, + # Prioritize first match. Note we have only tested up to 6 TPUs, # so best performance above that can probably be had by extrapolation. - device_count = min(device_count, 8) + device_count = min(device_count, 6) if device_count in options.tpu_segments_lists: seg_fnames = options.tpu_segments_lists[device_count] for fn in seg_fnames: From d0bad4875302375582c6a52a065d8cb92e06ed81 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Mon, 28 Oct 2024 21:07:34 -0700 Subject: [PATCH 4/5] Updated timings --- options.py | 468 +++++++++++++++++++++++++++++--------------------- tpu_runner.py | 78 ++++++--- 2 files changed, 328 insertions(+), 218 deletions(-) diff --git a/options.py b/options.py index c3847bd..b193b01 100644 --- a/options.py +++ b/options.py @@ -21,250 +21,326 @@ def __init__(self, model_name: str, model_name_pattern: str, std_model_name: str self.tpu_model_name = tpu_model_name self.labels_name = labels_name + # Tested on a HP ElietDesk G4 800 SFF i5-8500 3.0 GHz self.MODEL_SEGMENTS = { 'tf2_ssd_mobilenet_v2_coco17_ptq': { -# 6.9 ms/inference (144.3 FPS; 11.7 tensor MPx/sec) for 1 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq -# 3.9 ms/inference (255.1 FPS; 20.8 tensor MPx/sec) for 2 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq -# 2.8 ms/inference (354.6 FPS; 28.8 tensor MPx/sec) for 3 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq -# 2.3 ms/inference (434.8 FPS; 35.4 tensor MPx/sec) for 4 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq -# 2.2 ms/inference (452.5 FPS; 36.7 tensor MPx/sec) for 5 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq -# 2.2 ms/inference (452.5 FPS; 36.7 tensor MPx/sec) for 6 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq +# 6.6 ms/inference (151.1 FPS; 12.3 tensor MPx/sec) for 1 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq +# 3.4 ms/inference (295.0 FPS; 24.0 tensor MPx/sec) for 2 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq +# 2.4 ms/inference (416.7 FPS; 33.9 tensor MPx/sec) for 3 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq +# 1.9 
ms/inference (540.5 FPS; 43.8 tensor MPx/sec) for 4 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq +# 1.8 ms/inference (565.0 FPS; 46.0 tensor MPx/sec) for 5 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq +# 1.6 ms/inference (609.8 FPS; 49.6 tensor MPx/sec) for 6 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq +# 1.6 ms/inference (645.2 FPS; 52.6 tensor MPx/sec) for 7 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq +# 1.5 ms/inference (666.7 FPS; 54.1 tensor MPx/sec) for 8 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq + '_tflite': ('all_segments_tf2_ssd_mobilenet_v2_coco17_ptq_edgetpu.tflite', '2e4d39bd76ccbf6fa3b7400a2fb0b8e0') }, 'ssd_mobilenet_v2_coco_quant_postprocess': { -# 7.1 ms/inference (140.1 FPS; 11.4 tensor MPx/sec) for 1 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess -# 3.9 ms/inference (259.1 FPS; 21.0 tensor MPx/sec) for 2 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess -# 2.7 ms/inference (366.3 FPS; 29.8 tensor MPx/sec) for 3 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess -# 2.2 ms/inference (444.4 FPS; 36.1 tensor MPx/sec) for 4 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess -# 2.1 ms/inference (478.5 FPS; 38.9 tensor MPx/sec) for 5 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess -# 2.1 ms/inference (478.5 FPS; 38.9 tensor MPx/sec) for 6 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess +# 6.2 ms/inference (161.6 FPS; 13.1 tensor MPx/sec) for 1 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess +# 3.1 ms/inference (317.5 FPS; 25.8 tensor MPx/sec) for 2 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess +# 2.2 ms/inference (454.5 FPS; 36.9 tensor MPx/sec) for 3 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess +# 1.8 ms/inference (552.5 FPS; 44.8 tensor MPx/sec) for 4 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess +# 1.7 ms/inference (591.7 FPS; 48.1 tensor MPx/sec) for 5 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess +# 1.4 ms/inference (740.7 FPS; 60.3 tensor MPx/sec) for 6 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess +# 1.3 ms/inference (775.2 FPS; 63.0 tensor MPx/sec) for 7 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess +# 1.3 ms/inference (775.2 FPS; 63.0 tensor MPx/sec) for 8 TPUs using 1 segments: ssd_mobilenet_v2_coco_quant_postprocess + '_tflite': ('all_segments_ssd_mobilenet_v2_coco_quant_postprocess_edgetpu.tflite', '02baf9c3bb521f6555cdecabea32cbb0') }, 'ssdlite_mobiledet_coco_qat_postprocess': { -# 8.8 ms/inference (113.8 FPS; 10.6 tensor MPx/sec) for 1 TPUs using 1 segments: ssdlite_mobiledet_coco_qat_postprocess -# 4.6 ms/inference (217.9 FPS; 20.2 tensor MPx/sec) for 2 TPUs using 1 segments: ssdlite_mobiledet_coco_qat_postprocess -# 3.3 ms/inference (305.8 FPS; 28.4 tensor MPx/sec) for 3 TPUs using 2 segments: ssdlite_mobiledet_coco_qat_postprocess -# 2.8 ms/inference (363.6 FPS; 33.9 tensor MPx/sec) for 4 TPUs using 2 segments: ssdlite_mobiledet_coco_qat_postprocess -# 2.8 ms/inference (363.6 FPS; 33.9 tensor MPx/sec) for 5 TPUs using 2 segments: ssdlite_mobiledet_coco_qat_postprocess -# 2.8 ms/inference (363.6 FPS; 33.9 tensor MPx/sec) for 6 TPUs using 2 segments: ssdlite_mobiledet_coco_qat_postprocess -3: ['15x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_0_of_2_edgetpu.tflite', '15x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_1_of_2_edgetpu.tflite'], -4: 
['4x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_0_of_2_edgetpu.tflite', '4x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_1_of_2_edgetpu.tflite'], -5: ['4x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_0_of_2_edgetpu.tflite', '4x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_1_of_2_edgetpu.tflite'], -6: ['4x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_0_of_2_edgetpu.tflite', '4x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_1_of_2_edgetpu.tflite'], +# 7.4 ms/inference (135.5 FPS; 12.6 tensor MPx/sec) for 1 TPUs using 1 segments: ssdlite_mobiledet_coco_qat_postprocess +# 3.8 ms/inference (266.7 FPS; 24.8 tensor MPx/sec) for 2 TPUs using 1 segments: ssdlite_mobiledet_coco_qat_postprocess +# 2.7 ms/inference (375.9 FPS; 35.0 tensor MPx/sec) for 3 TPUs using 1 segments: ssdlite_mobiledet_coco_qat_postprocess +# 2.3 ms/inference (434.8 FPS; 40.5 tensor MPx/sec) for 4 TPUs using 1 segments: ssdlite_mobiledet_coco_qat_postprocess +# 2.2 ms/inference (458.7 FPS; 42.7 tensor MPx/sec) for 5 TPUs using 2 segments: ssdlite_mobiledet_coco_qat_postprocess +# 2.1 ms/inference (483.1 FPS; 44.9 tensor MPx/sec) for 6 TPUs using 2 segments: ssdlite_mobiledet_coco_qat_postprocess +# 2.0 ms/inference (490.2 FPS; 45.6 tensor MPx/sec) for 7 TPUs using 1 segments: ssdlite_mobiledet_coco_qat_postprocess +# 1.9 ms/inference (518.1 FPS; 48.3 tensor MPx/sec) for 8 TPUs using 1 segments: ssdlite_mobiledet_coco_qat_postprocess +5: [('166x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_0_of_2_edgetpu.tflite', 'de0060c0d5bd8e4d24fc7ea6515335e6'), ('166x_first_seg_ssdlite_mobiledet_coco_qat_postprocess_segment_1_of_2_edgetpu.tflite', '2c6e6a85e9e2a4d5db8d690904a4488d')], +6: [('dumb_ssdlite_mobiledet_coco_qat_postprocess_segment_0_of_2_edgetpu.tflite', 'cd34598bdfae8f0d1af0b4e2161941b8'), ('dumb_ssdlite_mobiledet_coco_qat_postprocess_segment_1_of_2_edgetpu.tflite', '38af20020bc363eeb8f68e70cd951e46')], + '_tflite': ('all_segments_ssdlite_mobiledet_coco_qat_postprocess_edgetpu.tflite', '6d3fa7e552b9c6f58b31237772d83389') }, 'ssd_mobilenet_v1_coco_quant_postprocess': { -# 6.7 ms/inference (149.7 FPS; 12.2 tensor MPx/sec) for 1 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess -# 3.5 ms/inference (289.0 FPS; 23.5 tensor MPx/sec) for 2 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess -# 2.4 ms/inference (411.5 FPS; 33.5 tensor MPx/sec) for 3 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess -# 2.0 ms/inference (490.2 FPS; 39.8 tensor MPx/sec) for 4 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess -# 2.0 ms/inference (502.5 FPS; 40.8 tensor MPx/sec) for 5 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess -# 2.0 ms/inference (505.1 FPS; 41.1 tensor MPx/sec) for 6 TPUs using 2 segments: ssd_mobilenet_v1_coco_quant_postprocess -6: ['dumb_ssd_mobilenet_v1_coco_quant_postprocess_segment_0_of_2_edgetpu.tflite', 'dumb_ssd_mobilenet_v1_coco_quant_postprocess_segment_1_of_2_edgetpu.tflite'], +# 5.6 ms/inference (178.6 FPS; 14.5 tensor MPx/sec) for 1 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess +# 2.8 ms/inference (352.1 FPS; 28.6 tensor MPx/sec) for 2 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess +# 2.0 ms/inference (490.2 FPS; 39.9 tensor MPx/sec) for 3 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess +# 1.7 ms/inference (578.0 FPS; 47.1 tensor MPx/sec) for 4 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess +# 1.6 ms/inference 
(621.1 FPS; 50.3 tensor MPx/sec) for 5 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess +# 1.4 ms/inference (694.4 FPS; 56.3 tensor MPx/sec) for 6 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess +# 1.4 ms/inference (714.3 FPS; 58.0 tensor MPx/sec) for 7 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess +# 1.4 ms/inference (714.3 FPS; 58.0 tensor MPx/sec) for 8 TPUs using 1 segments: ssd_mobilenet_v1_coco_quant_postprocess + '_tflite': ('all_segments_ssd_mobilenet_v1_coco_quant_postprocess_edgetpu.tflite', '9cf9b99a2ebaf703ca598f2d5a9b1cdf') }, 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq': { -# 175.7 ms/inference ( 5.7 FPS; 2.2 tensor MPx/sec) for 1 TPUs using 1 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq -# 89.4 ms/inference ( 11.2 FPS; 4.4 tensor MPx/sec) for 2 TPUs using 1 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq -# 60.9 ms/inference ( 16.4 FPS; 6.4 tensor MPx/sec) for 3 TPUs using 1 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq -# 46.3 ms/inference ( 21.6 FPS; 8.4 tensor MPx/sec) for 4 TPUs using 1 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq -# 38.6 ms/inference ( 25.9 FPS; 10.1 tensor MPx/sec) for 5 TPUs using 2 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq -# 34.5 ms/inference ( 29.0 FPS; 11.3 tensor MPx/sec) for 6 TPUs using 2 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq -5: ['2x_last_seg_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], -6: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], +# 175.0 ms/inference ( 5.7 FPS; 2.2 tensor MPx/sec) for 1 TPUs using 1 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +# 88.3 ms/inference ( 11.3 FPS; 4.4 tensor MPx/sec) for 2 TPUs using 1 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +# 59.7 ms/inference ( 16.8 FPS; 6.5 tensor MPx/sec) for 3 TPUs using 1 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +# 45.2 ms/inference ( 22.1 FPS; 8.6 tensor MPx/sec) for 4 TPUs using 1 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +# 35.8 ms/inference ( 27.9 FPS; 10.9 tensor MPx/sec) for 5 TPUs using 2 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +# 33.1 ms/inference ( 30.2 FPS; 11.8 tensor MPx/sec) for 6 TPUs using 2 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +# 29.5 ms/inference ( 33.9 FPS; 13.2 tensor MPx/sec) for 7 TPUs using 2 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +# 25.4 ms/inference ( 39.4 FPS; 15.4 tensor MPx/sec) for 8 TPUs using 3 segments: tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq +5: [('2x_last_seg_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', '6d131c01fd57097c484dfa3c9c98bd2f'), ('2x_last_seg_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite', '9034af141c83e438fc8ebc5a22d5aa94')], +6: [('all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'e0fd603108e96ecf24cae51e28895e60'), ('all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite', 'ca859ab42ab6fc81efee5665b32db394')], +7: [('all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'e0fd603108e96ecf24cae51e28895e60'), ('all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite', 
'ca859ab42ab6fc81efee5665b32db394')], +8: [('dumb_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', '3148d81cdc86d50c62368afd9f882df0'), ('dumb_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', '6bb6a6bbaa604020deb9c1eb8e545c8a'), ('dumb_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite', 'cc4b255b2e50189d25c65867a4fadd6b')], + '_tflite': ('all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_edgetpu.tflite', '17d1ee62d6975099ba957ed5f7472ced') }, 'efficientdet_lite0_320_ptq': { -# 23.2 ms/inference ( 43.2 FPS; 4.0 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite0_320_ptq -# 13.9 ms/inference ( 71.7 FPS; 6.7 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite0_320_ptq -# 9.8 ms/inference (102.4 FPS; 9.5 tensor MPx/sec) for 3 TPUs using 2 segments: efficientdet_lite0_320_ptq -# 9.2 ms/inference (109.3 FPS; 10.2 tensor MPx/sec) for 4 TPUs using 2 segments: efficientdet_lite0_320_ptq -# 7.7 ms/inference (129.5 FPS; 12.1 tensor MPx/sec) for 5 TPUs using 2 segments: efficientdet_lite0_320_ptq -# 7.5 ms/inference (134.0 FPS; 12.5 tensor MPx/sec) for 6 TPUs using 2 segments: efficientdet_lite0_320_ptq -3: ['3x_first_seg_efficientdet_lite0_320_ptq_segment_0_of_2_edgetpu.tflite', '3x_first_seg_efficientdet_lite0_320_ptq_segment_1_of_2_edgetpu.tflite'], -4: ['133x_first_seg_efficientdet_lite0_320_ptq_segment_0_of_2_edgetpu.tflite', '133x_first_seg_efficientdet_lite0_320_ptq_segment_1_of_2_edgetpu.tflite'], -5: ['133x_first_seg_efficientdet_lite0_320_ptq_segment_0_of_2_edgetpu.tflite', '133x_first_seg_efficientdet_lite0_320_ptq_segment_1_of_2_edgetpu.tflite'], -6: ['166x_first_seg_efficientdet_lite0_320_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite0_320_ptq_segment_1_of_2_edgetpu.tflite'], +# 23.2 ms/inference ( 43.1 FPS; 4.0 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite0_320_ptq +# 11.9 ms/inference ( 84.0 FPS; 7.8 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite0_320_ptq +# 8.3 ms/inference (119.9 FPS; 11.2 tensor MPx/sec) for 3 TPUs using 1 segments: efficientdet_lite0_320_ptq +# 6.5 ms/inference (153.6 FPS; 14.3 tensor MPx/sec) for 4 TPUs using 1 segments: efficientdet_lite0_320_ptq +# 6.0 ms/inference (166.7 FPS; 15.5 tensor MPx/sec) for 5 TPUs using 1 segments: efficientdet_lite0_320_ptq +# 5.5 ms/inference (181.2 FPS; 16.9 tensor MPx/sec) for 6 TPUs using 1 segments: efficientdet_lite0_320_ptq +# 5.3 ms/inference (188.0 FPS; 17.5 tensor MPx/sec) for 7 TPUs using 1 segments: efficientdet_lite0_320_ptq +# 4.9 ms/inference (203.3 FPS; 18.9 tensor MPx/sec) for 8 TPUs using 1 segments: efficientdet_lite0_320_ptq + '_tflite': ('all_segments_efficientdet_lite0_320_ptq_edgetpu.tflite', '6e4e281e51b5f8b4ca335c32ac86b072') }, 'efficientdet_lite1_384_ptq': { -# 34.7 ms/inference ( 28.8 FPS; 3.9 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite1_384_ptq -# 21.5 ms/inference ( 46.5 FPS; 6.3 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite1_384_ptq -# 14.0 ms/inference ( 71.5 FPS; 9.7 tensor MPx/sec) for 3 TPUs using 1 segments: efficientdet_lite1_384_ptq -# 12.3 ms/inference ( 81.5 FPS; 11.1 tensor MPx/sec) for 4 TPUs using 2 segments: efficientdet_lite1_384_ptq -# 11.2 ms/inference ( 89.0 FPS; 12.1 tensor MPx/sec) for 5 TPUs using 2 segments: efficientdet_lite1_384_ptq -# 10.6 ms/inference ( 94.6 FPS; 12.9 tensor MPx/sec) for 6 TPUs using 2 segments: efficientdet_lite1_384_ptq -4: 
['dumb_efficientdet_lite1_384_ptq_segment_0_of_2_edgetpu.tflite', 'dumb_efficientdet_lite1_384_ptq_segment_1_of_2_edgetpu.tflite'], -5: ['133x_first_seg_efficientdet_lite1_384_ptq_segment_0_of_2_edgetpu.tflite', '133x_first_seg_efficientdet_lite1_384_ptq_segment_1_of_2_edgetpu.tflite'], -6: ['15x_first_seg_efficientdet_lite1_384_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite1_384_ptq_segment_1_of_2_edgetpu.tflite'], +# 34.6 ms/inference ( 28.9 FPS; 3.9 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite1_384_ptq +# 17.7 ms/inference ( 56.4 FPS; 7.7 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite1_384_ptq +# 12.3 ms/inference ( 81.6 FPS; 11.1 tensor MPx/sec) for 3 TPUs using 1 segments: efficientdet_lite1_384_ptq +# 10.1 ms/inference ( 99.3 FPS; 13.5 tensor MPx/sec) for 4 TPUs using 1 segments: efficientdet_lite1_384_ptq +# 8.7 ms/inference (114.9 FPS; 15.7 tensor MPx/sec) for 5 TPUs using 1 segments: efficientdet_lite1_384_ptq +# 7.7 ms/inference (130.5 FPS; 17.8 tensor MPx/sec) for 6 TPUs using 1 segments: efficientdet_lite1_384_ptq +# 7.2 ms/inference (138.9 FPS; 18.9 tensor MPx/sec) for 7 TPUs using 1 segments: efficientdet_lite1_384_ptq +# 6.8 ms/inference (146.0 FPS; 19.9 tensor MPx/sec) for 8 TPUs using 1 segments: efficientdet_lite1_384_ptq + '_tflite': ('all_segments_efficientdet_lite1_384_ptq_edgetpu.tflite', 'cba6ce06f67d94bb388e26786356f99f') }, 'efficientdet_lite2_448_ptq': { -# 60.6 ms/inference ( 16.5 FPS; 3.1 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite2_448_ptq -# 31.4 ms/inference ( 31.9 FPS; 6.0 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite2_448_ptq -# 20.6 ms/inference ( 48.7 FPS; 9.1 tensor MPx/sec) for 3 TPUs using 2 segments: efficientdet_lite2_448_ptq -# 18.1 ms/inference ( 55.2 FPS; 10.4 tensor MPx/sec) for 4 TPUs using 2 segments: efficientdet_lite2_448_ptq -# 15.3 ms/inference ( 65.4 FPS; 12.3 tensor MPx/sec) for 5 TPUs using 2 segments: efficientdet_lite2_448_ptq -# 14.4 ms/inference ( 69.3 FPS; 13.0 tensor MPx/sec) for 6 TPUs using 3 segments: efficientdet_lite2_448_ptq -3: ['2x_last_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], -4: ['4x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '4x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], -5: ['166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], -6: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_3_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_3_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_2_of_3_edgetpu.tflite'], +# 59.8 ms/inference ( 16.7 FPS; 3.1 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite2_448_ptq +# 30.4 ms/inference ( 32.9 FPS; 6.2 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite2_448_ptq +# 19.3 ms/inference ( 51.8 FPS; 9.7 tensor MPx/sec) for 3 TPUs using 2 segments: efficientdet_lite2_448_ptq +# 15.9 ms/inference ( 62.8 FPS; 11.8 tensor MPx/sec) for 4 TPUs using 1 segments: efficientdet_lite2_448_ptq +# 13.7 ms/inference ( 73.2 FPS; 13.7 tensor MPx/sec) for 5 TPUs using 2 segments: efficientdet_lite2_448_ptq +# 11.7 ms/inference ( 85.3 FPS; 16.0 tensor MPx/sec) for 6 TPUs using 2 segments: efficientdet_lite2_448_ptq +# 10.3 ms/inference ( 96.7 FPS; 18.1 tensor MPx/sec) for 7 TPUs using 2 segments: 
efficientdet_lite2_448_ptq +# 9.2 ms/inference (108.1 FPS; 20.3 tensor MPx/sec) for 8 TPUs using 2 segments: efficientdet_lite2_448_ptq +3: [('166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', 'a1ba35d13759804521475106aebdb778'), ('166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite', 'd4397a5fefe80b63b26a966f05bfbc0a')], +5: [('133x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '03dc0f13ab3588cce6d2a83949e1d05d'), ('133x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite', 'ea708ee1c538b5d9dd6c21c2397b2a95')], +6: [('2x_last_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '7a28effcf51b0aa8708820a7a8e34bba'), ('2x_last_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite', 'd2411db586cc7c0add434a864ba7b25e')], +7: [('166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', 'a1ba35d13759804521475106aebdb778'), ('166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite', 'd4397a5fefe80b63b26a966f05bfbc0a')], +8: [('166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', 'a1ba35d13759804521475106aebdb778'), ('166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite', 'd4397a5fefe80b63b26a966f05bfbc0a')], + '_tflite': ('all_segments_efficientdet_lite2_448_ptq_edgetpu.tflite', 'a91e70b4f4785551c5c03791c284ddb0') }, 'efficientdet_lite3_512_ptq': { -# 76.7 ms/inference ( 13.0 FPS; 3.2 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite3_512_ptq -# 38.5 ms/inference ( 25.9 FPS; 6.4 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite3_512_ptq -# 26.9 ms/inference ( 37.2 FPS; 9.2 tensor MPx/sec) for 3 TPUs using 1 segments: efficientdet_lite3_512_ptq -# 21.1 ms/inference ( 47.3 FPS; 11.7 tensor MPx/sec) for 4 TPUs using 1 segments: efficientdet_lite3_512_ptq -# 17.5 ms/inference ( 57.3 FPS; 14.2 tensor MPx/sec) for 5 TPUs using 1 segments: efficientdet_lite3_512_ptq -# 17.1 ms/inference ( 58.4 FPS; 14.4 tensor MPx/sec) for 6 TPUs using 2 segments: efficientdet_lite3_512_ptq -6: ['2x_last_seg_efficientdet_lite3_512_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_efficientdet_lite3_512_ptq_segment_1_of_2_edgetpu.tflite'], +# 75.3 ms/inference ( 13.3 FPS; 3.3 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite3_512_ptq +# 38.3 ms/inference ( 26.1 FPS; 6.5 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite3_512_ptq +# 26.0 ms/inference ( 38.5 FPS; 9.5 tensor MPx/sec) for 3 TPUs using 1 segments: efficientdet_lite3_512_ptq +# 19.9 ms/inference ( 50.2 FPS; 12.4 tensor MPx/sec) for 4 TPUs using 1 segments: efficientdet_lite3_512_ptq +# 16.6 ms/inference ( 60.1 FPS; 14.8 tensor MPx/sec) for 5 TPUs using 1 segments: efficientdet_lite3_512_ptq +# 14.0 ms/inference ( 71.4 FPS; 17.6 tensor MPx/sec) for 6 TPUs using 1 segments: efficientdet_lite3_512_ptq +# 12.8 ms/inference ( 78.4 FPS; 19.4 tensor MPx/sec) for 7 TPUs using 1 segments: efficientdet_lite3_512_ptq +# 11.7 ms/inference ( 85.3 FPS; 21.1 tensor MPx/sec) for 8 TPUs using 1 segments: efficientdet_lite3_512_ptq + '_tflite': ('all_segments_efficientdet_lite3_512_ptq_edgetpu.tflite', '1e17272603f34514da09472f92272be3') }, 'efficientdet_lite3x_640_ptq': { -# 180.8 ms/inference ( 5.5 FPS; 2.2 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite3x_640_ptq -# 93.6 ms/inference ( 10.7 FPS; 4.2 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite3x_640_ptq -# 63.9 ms/inference ( 15.7 FPS; 6.1 tensor MPx/sec) for 3 TPUs using 2 
segments: efficientdet_lite3x_640_ptq -# 48.1 ms/inference ( 20.8 FPS; 8.1 tensor MPx/sec) for 4 TPUs using 2 segments: efficientdet_lite3x_640_ptq -# 40.9 ms/inference ( 24.5 FPS; 9.6 tensor MPx/sec) for 5 TPUs using 2 segments: efficientdet_lite3x_640_ptq -# 35.5 ms/inference ( 28.2 FPS; 11.0 tensor MPx/sec) for 6 TPUs using 2 segments: efficientdet_lite3x_640_ptq -3: ['2x_last_seg_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '2x_last_seg_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], -4: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], -5: ['15x_first_seg_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], -6: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], +# 182.4 ms/inference ( 5.5 FPS; 2.1 tensor MPx/sec) for 1 TPUs using 1 segments: efficientdet_lite3x_640_ptq +# 91.9 ms/inference ( 10.9 FPS; 4.2 tensor MPx/sec) for 2 TPUs using 1 segments: efficientdet_lite3x_640_ptq +# 61.8 ms/inference ( 16.2 FPS; 6.3 tensor MPx/sec) for 3 TPUs using 2 segments: efficientdet_lite3x_640_ptq +# 47.5 ms/inference ( 21.0 FPS; 8.2 tensor MPx/sec) for 4 TPUs using 1 segments: efficientdet_lite3x_640_ptq +# 38.7 ms/inference ( 25.9 FPS; 10.1 tensor MPx/sec) for 5 TPUs using 1 segments: efficientdet_lite3x_640_ptq +# 32.2 ms/inference ( 31.0 FPS; 12.1 tensor MPx/sec) for 6 TPUs using 2 segments: efficientdet_lite3x_640_ptq +# 28.4 ms/inference ( 35.2 FPS; 13.8 tensor MPx/sec) for 7 TPUs using 2 segments: efficientdet_lite3x_640_ptq +# 26.4 ms/inference ( 37.9 FPS; 14.8 tensor MPx/sec) for 8 TPUs using 2 segments: efficientdet_lite3x_640_ptq +3: [('2x_last_seg_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'bd050997882668f4915f2a6df3267531'), ('2x_last_seg_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite', '959862e6afcc306010ce67fdeda15375')], +6: [('all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '0c4e89b488f5cff8b29d0de23d231f0d'), ('all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite', 'd7a5a707e0a2b4410dcdc3b5d9d875f0')], +7: [('15x_first_seg_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '1aee305ef779b43f3b0ffb37b2ac98dd'), ('15x_first_seg_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite', '8eac175bd570f3b5411af3f4368b88de')], +8: [('all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '0c4e89b488f5cff8b29d0de23d231f0d'), ('all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite', 'd7a5a707e0a2b4410dcdc3b5d9d875f0')], + '_tflite': ('all_segments_efficientdet_lite3x_640_ptq_edgetpu.tflite', 'a79af4afcf144002cc5b82691e3938f9') }, 'yolov5n-int8': { -# 27.2 ms/inference ( 36.7 FPS; 6.9 tensor MPx/sec) for 1 TPUs using 1 segments: yolov5n-int8 -# 17.2 ms/inference ( 58.2 FPS; 10.9 tensor MPx/sec) for 2 TPUs using 1 segments: yolov5n-int8 -# 12.4 ms/inference ( 80.3 FPS; 15.1 tensor MPx/sec) for 3 TPUs using 2 segments: yolov5n-int8 -# 12.0 ms/inference ( 83.1 FPS; 15.6 tensor MPx/sec) for 4 TPUs using 2 segments: yolov5n-int8 -# 10.9 ms/inference ( 91.4 FPS; 17.1 tensor MPx/sec) for 5 TPUs using 2 segments: yolov5n-int8 -# 10.9 ms/inference ( 91.4 FPS; 17.1 tensor MPx/sec) for 6 TPUs using 2 segments: yolov5n-int8 -3: 
['4x_first_seg_yolov5n-int8_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov5n-int8_segment_1_of_2_edgetpu.tflite'], -4: ['133x_first_seg_yolov5n-int8_segment_0_of_2_edgetpu.tflite', '133x_first_seg_yolov5n-int8_segment_1_of_2_edgetpu.tflite'], -5: ['4x_first_seg_yolov5n-int8_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov5n-int8_segment_1_of_2_edgetpu.tflite'], -6: ['4x_first_seg_yolov5n-int8_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov5n-int8_segment_1_of_2_edgetpu.tflite'], +# 26.5 ms/inference ( 37.7 FPS; 7.1 tensor MPx/sec) for 1 TPUs using 1 segments: yolov5n-int8 +# 13.8 ms/inference ( 72.5 FPS; 13.6 tensor MPx/sec) for 2 TPUs using 1 segments: yolov5n-int8 +# 10.2 ms/inference ( 98.2 FPS; 18.4 tensor MPx/sec) for 3 TPUs using 1 segments: yolov5n-int8 +# 8.2 ms/inference (121.2 FPS; 22.7 tensor MPx/sec) for 4 TPUs using 1 segments: yolov5n-int8 +# 7.4 ms/inference (135.0 FPS; 25.3 tensor MPx/sec) for 5 TPUs using 1 segments: yolov5n-int8 +# 6.3 ms/inference (159.2 FPS; 29.8 tensor MPx/sec) for 6 TPUs using 1 segments: yolov5n-int8 +# 6.3 ms/inference (159.2 FPS; 29.8 tensor MPx/sec) for 7 TPUs using 1 segments: yolov5n-int8 +# 6.1 ms/inference (163.4 FPS; 30.6 tensor MPx/sec) for 8 TPUs using 2 segments: yolov5n-int8 +8: [('all_segments_yolov5n-int8_segment_0_of_2_edgetpu.tflite', 'bdd58b4bfeccddf533d439643869083d'), ('all_segments_yolov5n-int8_segment_1_of_2_edgetpu.tflite', '076e13c75bf164ba91d747c615386a63')], + '_tflite': ('all_segments_yolov5n-int8_edgetpu.tflite', 'dcd616c860324d1e653d4e32cf6ebea1') }, 'yolov5s-int8': { -# 39.9 ms/inference ( 25.1 FPS; 4.7 tensor MPx/sec) for 1 TPUs using 1 segments: yolov5s-int8 -# 22.3 ms/inference ( 44.9 FPS; 8.4 tensor MPx/sec) for 2 TPUs using 1 segments: yolov5s-int8 -# 15.0 ms/inference ( 66.6 FPS; 12.5 tensor MPx/sec) for 3 TPUs using 2 segments: yolov5s-int8 -# 11.7 ms/inference ( 85.5 FPS; 16.0 tensor MPx/sec) for 4 TPUs using 2 segments: yolov5s-int8 -# 11.3 ms/inference ( 88.7 FPS; 16.6 tensor MPx/sec) for 5 TPUs using 2 segments: yolov5s-int8 -# 10.0 ms/inference (100.4 FPS; 18.8 tensor MPx/sec) for 6 TPUs using 2 segments: yolov5s-int8 -3: ['166x_first_seg_yolov5s-int8_segment_0_of_2_edgetpu.tflite', '166x_first_seg_yolov5s-int8_segment_1_of_2_edgetpu.tflite'], -4: ['4x_first_seg_yolov5s-int8_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov5s-int8_segment_1_of_2_edgetpu.tflite'], -5: ['166x_first_seg_yolov5s-int8_segment_0_of_2_edgetpu.tflite', '166x_first_seg_yolov5s-int8_segment_1_of_2_edgetpu.tflite'], -6: ['4x_first_seg_yolov5s-int8_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov5s-int8_segment_1_of_2_edgetpu.tflite'], +# 37.1 ms/inference ( 27.0 FPS; 5.0 tensor MPx/sec) for 1 TPUs using 1 segments: yolov5s-int8 +# 18.8 ms/inference ( 53.1 FPS; 10.0 tensor MPx/sec) for 2 TPUs using 1 segments: yolov5s-int8 +# 13.0 ms/inference ( 77.0 FPS; 14.4 tensor MPx/sec) for 3 TPUs using 1 segments: yolov5s-int8 +# 10.5 ms/inference ( 95.3 FPS; 17.9 tensor MPx/sec) for 4 TPUs using 1 segments: yolov5s-int8 +# 8.5 ms/inference (117.6 FPS; 22.1 tensor MPx/sec) for 5 TPUs using 1 segments: yolov5s-int8 +# 7.2 ms/inference (138.9 FPS; 26.1 tensor MPx/sec) for 6 TPUs using 1 segments: yolov5s-int8 +# 6.7 ms/inference (149.0 FPS; 27.9 tensor MPx/sec) for 7 TPUs using 1 segments: yolov5s-int8 +# 6.6 ms/inference (152.4 FPS; 28.6 tensor MPx/sec) for 8 TPUs using 1 segments: yolov5s-int8 + '_tflite': ('all_segments_yolov5s-int8_edgetpu.tflite', '05ade3a89b783c3118a76a4370c1a3b5') }, 'yolov5m-int8': { -# 100.9 ms/inference ( 
9.9 FPS; 1.9 tensor MPx/sec) for 1 TPUs using 1 segments: yolov5m-int8 -# 50.6 ms/inference ( 19.8 FPS; 3.7 tensor MPx/sec) for 2 TPUs using 1 segments: yolov5m-int8 -# 31.7 ms/inference ( 31.6 FPS; 5.9 tensor MPx/sec) for 3 TPUs using 2 segments: yolov5m-int8 -# 25.8 ms/inference ( 38.8 FPS; 7.3 tensor MPx/sec) for 4 TPUs using 2 segments: yolov5m-int8 -# 19.9 ms/inference ( 50.1 FPS; 9.4 tensor MPx/sec) for 5 TPUs using 2 segments: yolov5m-int8 -# 16.9 ms/inference ( 59.1 FPS; 11.1 tensor MPx/sec) for 6 TPUs using 2 segments: yolov5m-int8 -3: ['15x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], -4: ['166x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '166x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], -5: ['3x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], -6: ['15x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], +# 100.7 ms/inference ( 9.9 FPS; 1.9 tensor MPx/sec) for 1 TPUs using 1 segments: yolov5m-int8 +# 50.7 ms/inference ( 19.7 FPS; 3.7 tensor MPx/sec) for 2 TPUs using 1 segments: yolov5m-int8 +# 31.2 ms/inference ( 32.1 FPS; 6.0 tensor MPx/sec) for 3 TPUs using 2 segments: yolov5m-int8 +# 25.6 ms/inference ( 39.1 FPS; 7.3 tensor MPx/sec) for 4 TPUs using 2 segments: yolov5m-int8 +# 19.4 ms/inference ( 51.4 FPS; 9.7 tensor MPx/sec) for 5 TPUs using 2 segments: yolov5m-int8 +# 15.9 ms/inference ( 62.8 FPS; 11.8 tensor MPx/sec) for 6 TPUs using 3 segments: yolov5m-int8 +# 14.4 ms/inference ( 69.6 FPS; 13.0 tensor MPx/sec) for 7 TPUs using 2 segments: yolov5m-int8 +# 14.0 ms/inference ( 71.5 FPS; 13.4 tensor MPx/sec) for 8 TPUs using 2 segments: yolov5m-int8 +3: [('15x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', 'e10855818f6265fa86b123a8a5442107'), ('15x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite', 'eb1dac2880acc20338306fe0799ba3ab')], +4: [('166x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '978fd75ec3c100d795de81e1637466a0'), ('166x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite', 'ddee56fa1804b857731cbf065ccefe1f')], +5: [('4x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '30649f7906a2196f30ba279201eeb892'), ('4x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite', 'c57b1cb5d50bac71c42f80cfb9064e6d')], +6: [('166x_first_seg_yolov5m-int8_segment_0_of_3_edgetpu.tflite', '20e354cfa4f2c5c0ce14d929e97a6ba4'), ('166x_first_seg_yolov5m-int8_segment_1_of_3_edgetpu.tflite', 'b8fb00331d6952510a38ac11c546b5a4'), ('166x_first_seg_yolov5m-int8_segment_2_of_3_edgetpu.tflite', 'c85782e9e46bb33d8d663183e9f3cb39')], +7: [('4x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '30649f7906a2196f30ba279201eeb892'), ('4x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite', 'c57b1cb5d50bac71c42f80cfb9064e6d')], +8: [('166x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '978fd75ec3c100d795de81e1637466a0'), ('166x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite', 'ddee56fa1804b857731cbf065ccefe1f')], + '_tflite': ('all_segments_yolov5m-int8_edgetpu.tflite', '0c7ee152856677f94e7cd9ac507c3922') }, 'yolov5l-int8': { -# 183.5 ms/inference ( 5.4 FPS; 1.0 tensor MPx/sec) for 1 TPUs using 1 segments: yolov5l-int8 -# 85.5 ms/inference ( 11.7 FPS; 2.2 tensor MPx/sec) for 2 TPUs using 2 segments: yolov5l-int8 -# 55.2 ms/inference ( 18.1 FPS; 3.4 tensor MPx/sec) for 3 TPUs using 2 segments: yolov5l-int8 -# 43.6 ms/inference ( 22.9 
FPS; 4.3 tensor MPx/sec) for 4 TPUs using 2 segments: yolov5l-int8 -# 34.2 ms/inference ( 29.2 FPS; 5.5 tensor MPx/sec) for 5 TPUs using 2 segments: yolov5l-int8 -# 30.0 ms/inference ( 33.3 FPS; 6.2 tensor MPx/sec) for 6 TPUs using 3 segments: yolov5l-int8 -2: ['dumb_yolov5l-int8_segment_0_of_2_edgetpu.tflite', 'dumb_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], -3: ['2x_last_seg_yolov5l-int8_segment_0_of_2_edgetpu.tflite', '2x_last_seg_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], -4: ['3x_first_seg_yolov5l-int8_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], -5: ['3x_first_seg_yolov5l-int8_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov5l-int8_segment_1_of_2_edgetpu.tflite'], -6: ['4x_first_seg_yolov5l-int8_segment_0_of_3_edgetpu.tflite', '4x_first_seg_yolov5l-int8_segment_1_of_3_edgetpu.tflite', '4x_first_seg_yolov5l-int8_segment_2_of_3_edgetpu.tflite'], +# 182.9 ms/inference ( 5.5 FPS; 1.0 tensor MPx/sec) for 1 TPUs using 1 segments: yolov5l-int8 +# 84.9 ms/inference ( 11.8 FPS; 2.2 tensor MPx/sec) for 2 TPUs using 2 segments: yolov5l-int8 +# 55.0 ms/inference ( 18.2 FPS; 3.4 tensor MPx/sec) for 3 TPUs using 3 segments: yolov5l-int8 +# 43.3 ms/inference ( 23.1 FPS; 4.3 tensor MPx/sec) for 4 TPUs using 2 segments: yolov5l-int8 +# 33.2 ms/inference ( 30.2 FPS; 5.7 tensor MPx/sec) for 5 TPUs using 2 segments: yolov5l-int8 +# 29.3 ms/inference ( 34.1 FPS; 6.4 tensor MPx/sec) for 6 TPUs using 3 segments: yolov5l-int8 +# 25.8 ms/inference ( 38.7 FPS; 7.3 tensor MPx/sec) for 7 TPUs using 6 segments: yolov5l-int8 +# 20.8 ms/inference ( 48.0 FPS; 9.0 tensor MPx/sec) for 8 TPUs using 5 segments: yolov5l-int8 +2: [('dumb_yolov5l-int8_segment_0_of_2_edgetpu.tflite', '14ff8889dc057808c706b7a9a27d89f7'), ('dumb_yolov5l-int8_segment_1_of_2_edgetpu.tflite', '57d11f35ee8bb0bc019cdd321f8590be')], +3: [('all_segments_yolov5l-int8_segment_0_of_3_edgetpu.tflite', '5bf908acc1b4e71316b90b14714c4575'), ('all_segments_yolov5l-int8_segment_1_of_3_edgetpu.tflite', '346a1cdaacec7caeaee768cdcf0dcc36'), ('all_segments_yolov5l-int8_segment_2_of_3_edgetpu.tflite', '2a879b9104fc2a10bab85fb6f0805cb7')], +4: [('3x_first_seg_yolov5l-int8_segment_0_of_2_edgetpu.tflite', '107766470c5165067fd4055edfcdb18c'), ('3x_first_seg_yolov5l-int8_segment_1_of_2_edgetpu.tflite', 'fd8a6c1f4e3a3dc0bc91d36745e89520')], +5: [('3x_first_seg_yolov5l-int8_segment_0_of_2_edgetpu.tflite', '107766470c5165067fd4055edfcdb18c'), ('3x_first_seg_yolov5l-int8_segment_1_of_2_edgetpu.tflite', 'fd8a6c1f4e3a3dc0bc91d36745e89520')], +6: [('all_segments_yolov5l-int8_segment_0_of_3_edgetpu.tflite', '5bf908acc1b4e71316b90b14714c4575'), ('all_segments_yolov5l-int8_segment_1_of_3_edgetpu.tflite', '346a1cdaacec7caeaee768cdcf0dcc36'), ('all_segments_yolov5l-int8_segment_2_of_3_edgetpu.tflite', '2a879b9104fc2a10bab85fb6f0805cb7')], +7: [('dumb_yolov5l-int8_segment_0_of_6_edgetpu.tflite', '790b1a914dcf4279c7529670953b8707'), ('dumb_yolov5l-int8_segment_1_of_6_edgetpu.tflite', '501b3556acf1c9ca433b8d952820ea82'), ('dumb_yolov5l-int8_segment_2_of_6_edgetpu.tflite', '871efae1313898b6bb3f5aa4060fb540'), ('dumb_yolov5l-int8_segment_3_of_6_edgetpu.tflite', '5e780f45bb79c929fe671e0a1c730e1a'), ('dumb_yolov5l-int8_segment_4_of_6_edgetpu.tflite', 'b3819a3d6952636b6c478eb531b36d87'), ('dumb_yolov5l-int8_segment_5_of_6_edgetpu.tflite', 'bbd5a9abadc9c254d22b403d53d5d6bc')], +8: [('dumb_yolov5l-int8_segment_0_of_5_edgetpu.tflite', '138febc49003cbf3637516d4a54ae43e'), ('dumb_yolov5l-int8_segment_1_of_5_edgetpu.tflite', 
'7fc233d166278ea21c03dad2f39d57c6'), ('dumb_yolov5l-int8_segment_2_of_5_edgetpu.tflite', '075cd583ed57286302235428be273557'), ('dumb_yolov5l-int8_segment_3_of_5_edgetpu.tflite', '9442d7c7f007278cb0a2f392968db267'), ('dumb_yolov5l-int8_segment_4_of_5_edgetpu.tflite', 'bb6320aa3bc7363518d5d544b335c8c9')], + '_tflite': ('all_segments_yolov5l-int8_edgetpu.tflite', '947a3ee13a2e6fca0ed52c6108413353') }, 'yolov8n_416_640px': { -# 23.7 ms/inference ( 42.1 FPS; 9.7 tensor MPx/sec) for 1 TPUs using 1 segments: yolov8n_384_640px -# 12.1 ms/inference ( 82.9 FPS; 19.1 tensor MPx/sec) for 2 TPUs using 1 segments: yolov8n_384_640px -# 9.1 ms/inference (109.8 FPS; 25.3 tensor MPx/sec) for 3 TPUs using 1 segments: yolov8n_384_640px -# 7.6 ms/inference (131.6 FPS; 30.3 tensor MPx/sec) for 4 TPUs using 1 segments: yolov8n_384_640px -# 7.0 ms/inference (142.2 FPS; 32.8 tensor MPx/sec) for 5 TPUs using 1 segments: yolov8n_384_640px -# 6.5 ms/inference (154.3 FPS; 35.6 tensor MPx/sec) for 6 TPUs using 1 segments: yolov8n_384_640px +# 23.6 ms/inference ( 42.3 FPS; 9.8 tensor MPx/sec) for 1 TPUs using 1 segments: yolov8n_384_640px +# 11.9 ms/inference ( 84.1 FPS; 19.4 tensor MPx/sec) for 2 TPUs using 1 segments: yolov8n_384_640px +# 8.1 ms/inference (123.2 FPS; 28.4 tensor MPx/sec) for 3 TPUs using 1 segments: yolov8n_384_640px +# 6.4 ms/inference (156.2 FPS; 36.0 tensor MPx/sec) for 4 TPUs using 1 segments: yolov8n_384_640px +# 4.8 ms/inference (210.1 FPS; 42.0 tensor MPx/sec) for 5 TPUs using 1 segments: yolov8n_352_608px +# 4.7 ms/inference (212.8 FPS; 49.1 tensor MPx/sec) for 6 TPUs using 1 segments: yolov8n_384_640px +# 4.7 ms/inference (212.8 FPS; 49.1 tensor MPx/sec) for 7 TPUs using 1 segments: yolov8n_384_640px +# 4.9 ms/inference (204.1 FPS; 51.1 tensor MPx/sec) for 8 TPUs using 3 segments: yolov8n_416_640px +8: [('3x_first_seg_yolov8n_416_640px_segment_0_of_3_edgetpu.tflite', '949cc2bf3815d5d870aeec595181541e'), ('3x_first_seg_yolov8n_416_640px_segment_1_of_3_edgetpu.tflite', 'd8fc08a86a1a82c420ed33c48f82d3af'), ('3x_first_seg_yolov8n_416_640px_segment_2_of_3_edgetpu.tflite', '31bdde82f94b4554012d6ecb7c201db1')], + '_tflite': ('all_segments_yolov8n_384_640px_edgetpu.tflite', 'dbc5eb9f775b8cec51492ed80454b154') }, 'yolov8s_416_640px': { -# 46.5 ms/inference ( 21.5 FPS; 4.7 tensor MPx/sec) for 1 TPUs using 1 segments: yolov8s_384_608px +# 46.7 ms/inference ( 21.4 FPS; 4.7 tensor MPx/sec) for 1 TPUs using 1 segments: yolov8s_384_608px # 23.5 ms/inference ( 42.5 FPS; 9.3 tensor MPx/sec) for 2 TPUs using 1 segments: yolov8s_384_608px -# 16.2 ms/inference ( 61.7 FPS; 13.5 tensor MPx/sec) for 3 TPUs using 1 segments: yolov8s_384_608px -# 10.9 ms/inference ( 91.8 FPS; 18.4 tensor MPx/sec) for 4 TPUs using 2 segments: yolov8s_352_608px -# 9.8 ms/inference (102.0 FPS; 22.3 tensor MPx/sec) for 5 TPUs using 2 segments: yolov8s_384_608px -# 8.9 ms/inference (112.0 FPS; 24.5 tensor MPx/sec) for 6 TPUs using 1 segments: yolov8s_384_608px -4: ['4x_first_seg_yolov8s_352_608px_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov8s_352_608px_segment_1_of_2_edgetpu.tflite'], -5: ['3x_first_seg_yolov8s_384_608px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov8s_384_608px_segment_1_of_2_edgetpu.tflite'], +# 15.9 ms/inference ( 63.1 FPS; 13.8 tensor MPx/sec) for 3 TPUs using 1 segments: yolov8s_384_608px +# 10.7 ms/inference ( 93.5 FPS; 18.7 tensor MPx/sec) for 4 TPUs using 2 segments: yolov8s_352_608px +# 9.4 ms/inference (106.8 FPS; 23.4 tensor MPx/sec) for 5 TPUs using 2 segments: yolov8s_384_608px +# 8.1 ms/inference 
(124.2 FPS; 27.2 tensor MPx/sec) for 6 TPUs using 2 segments: yolov8s_384_608px +# 6.9 ms/inference (144.7 FPS; 31.7 tensor MPx/sec) for 7 TPUs using 2 segments: yolov8s_384_608px +# 5.6 ms/inference (177.3 FPS; 35.5 tensor MPx/sec) for 8 TPUs using 2 segments: yolov8s_352_608px +4: [('4x_first_seg_yolov8s_352_608px_segment_0_of_2_edgetpu.tflite', '6ecc47f6d9ef6b2a45ff4393105fe310'), ('4x_first_seg_yolov8s_352_608px_segment_1_of_2_edgetpu.tflite', 'a14cc49567acdc15d27173e3410b1e1e')], +5: [('3x_first_seg_yolov8s_384_608px_segment_0_of_2_edgetpu.tflite', '4b509292971a9a1da8da62f0b59fa8d4'), ('3x_first_seg_yolov8s_384_608px_segment_1_of_2_edgetpu.tflite', '4d95f76fbf94402b992bdd14b8b0ab09')], +6: [('3x_first_seg_yolov8s_384_608px_segment_0_of_2_edgetpu.tflite', '4b509292971a9a1da8da62f0b59fa8d4'), ('3x_first_seg_yolov8s_384_608px_segment_1_of_2_edgetpu.tflite', '4d95f76fbf94402b992bdd14b8b0ab09')], +7: [('2x_first_seg_yolov8s_384_608px_segment_0_of_2_edgetpu.tflite', '61057df9063a0940c61fb80d44c87d07'), ('2x_first_seg_yolov8s_384_608px_segment_1_of_2_edgetpu.tflite', '35286b2059e4a9cdbd0964f653476c97')], +8: [('4x_first_seg_yolov8s_352_608px_segment_0_of_2_edgetpu.tflite', '6ecc47f6d9ef6b2a45ff4393105fe310'), ('4x_first_seg_yolov8s_352_608px_segment_1_of_2_edgetpu.tflite', 'a14cc49567acdc15d27173e3410b1e1e')], + '_tflite': ('all_segments_yolov8s_384_608px_edgetpu.tflite', 'a976571aa7b851c01934f19fbd954837') }, 'yolov8m_416_640px': { -# 188.7 ms/inference ( 5.3 FPS; 1.1 tensor MPx/sec) for 1 TPUs using 1 segments: yolov8m_352_608px -# 95.1 ms/inference ( 10.5 FPS; 2.6 tensor MPx/sec) for 2 TPUs using 2 segments: yolov8m_416_640px -# 58.7 ms/inference ( 17.0 FPS; 4.3 tensor MPx/sec) for 3 TPUs using 2 segments: yolov8m_416_640px -# 44.0 ms/inference ( 22.7 FPS; 5.7 tensor MPx/sec) for 4 TPUs using 2 segments: yolov8m_416_640px -# 35.5 ms/inference ( 28.1 FPS; 7.0 tensor MPx/sec) for 5 TPUs using 2 segments: yolov8m_416_640px -# 30.9 ms/inference ( 32.4 FPS; 8.1 tensor MPx/sec) for 6 TPUs using 3 segments: yolov8m_416_640px -2: ['all_segments_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], -3: ['2x_last_seg_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '2x_last_seg_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], -4: ['2x_first_seg_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], -5: ['4x_first_seg_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], -6: ['133x_first_seg_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', '133x_first_seg_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', '133x_first_seg_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'], +# 187.6 ms/inference ( 5.3 FPS; 1.1 tensor MPx/sec) for 1 TPUs using 1 segments: yolov8m_352_608px +# 94.8 ms/inference ( 10.6 FPS; 2.6 tensor MPx/sec) for 2 TPUs using 2 segments: yolov8m_416_640px +# 57.2 ms/inference ( 17.5 FPS; 4.4 tensor MPx/sec) for 3 TPUs using 2 segments: yolov8m_416_640px +# 43.7 ms/inference ( 22.9 FPS; 5.7 tensor MPx/sec) for 4 TPUs using 2 segments: yolov8m_416_640px +# 35.1 ms/inference ( 28.5 FPS; 7.1 tensor MPx/sec) for 5 TPUs using 2 segments: yolov8m_416_640px +# 30.6 ms/inference ( 32.7 FPS; 8.2 tensor MPx/sec) for 6 TPUs using 3 segments: yolov8m_416_640px +# 27.4 ms/inference ( 36.5 FPS; 9.2 tensor MPx/sec) for 7 TPUs using 4 segments: yolov8m_416_640px +# 26.6 ms/inference ( 37.6 FPS; 9.4 tensor MPx/sec) for 8 
TPUs using 3 segments: yolov8m_416_640px +2: [('all_segments_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '2538c274186d50fd00f9135029036efe'), ('all_segments_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite', 'f70e3a43ae069e4499b1556aa26ab979')], +3: [('2x_last_seg_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '645b50901151530a030c941067896619'), ('2x_last_seg_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite', '25a69dbf0da3af9abb6150a2f1b8a2ae')], +4: [('2x_first_seg_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '91bc0025498094926aca5248dfa30998'), ('2x_first_seg_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite', 'e24331572eb782077841467c48b16d39')], +5: [('3x_first_seg_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', '06b5de52b30d69241c3e5522a87cbacb'), ('3x_first_seg_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite', '7aca2df7c03ff3d599904c97e6788bb5')], +6: [('all_segments_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', '44553d512aa793ede73b027a0c489af4'), ('all_segments_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', '2c4cdf70060224436e4d9cef5f6d0096'), ('all_segments_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite', '2d79b825a3eaee20fc0c43c200f5f988')], +7: [('all_segments_yolov8m_416_640px_segment_0_of_4_edgetpu.tflite', '6ccca16e93be61d56a04f9538fb21190'), ('all_segments_yolov8m_416_640px_segment_1_of_4_edgetpu.tflite', 'c58f17a741d30f77f78dc6c1bc0ecdee'), ('all_segments_yolov8m_416_640px_segment_2_of_4_edgetpu.tflite', '7d428ca816620e36965d3b442bd8aa27'), ('all_segments_yolov8m_416_640px_segment_3_of_4_edgetpu.tflite', 'a3b1ecbe3e47fbb0ae7843a387f8b66a')], +8: [('all_segments_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', '44553d512aa793ede73b027a0c489af4'), ('all_segments_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', '2c4cdf70060224436e4d9cef5f6d0096'), ('all_segments_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite', '2d79b825a3eaee20fc0c43c200f5f988')], + '_tflite': ('all_segments_yolov8m_352_608px_edgetpu.tflite', 'a59142cec40d2699898bbd82e90f67d2') }, 'yolov8l_416_640px': { -# 236.2 ms/inference ( 4.2 FPS; 0.8 tensor MPx/sec) for 1 TPUs using 1 segments: yolov8l_352_608px -# 118.1 ms/inference ( 8.5 FPS; 1.7 tensor MPx/sec) for 2 TPUs using 1 segments: yolov8l_352_608px -# 85.2 ms/inference ( 11.7 FPS; 2.6 tensor MPx/sec) for 3 TPUs using 2 segments: yolov8l_384_608px -# 59.8 ms/inference ( 16.7 FPS; 3.3 tensor MPx/sec) for 4 TPUs using 1 segments: yolov8l_352_608px -# 46.1 ms/inference ( 21.7 FPS; 4.3 tensor MPx/sec) for 5 TPUs using 2 segments: yolov8l_352_608px -# 51.1 ms/inference ( 19.6 FPS; 4.9 tensor MPx/sec) for 6 TPUs using 3 segments: yolov8l_416_640px -3: ['2x_first_seg_yolov8l_384_608px_segment_0_of_2_edgetpu.tflite', '2x_first_seg_yolov8l_384_608px_segment_1_of_2_edgetpu.tflite'], -5: ['4x_first_seg_yolov8l_352_608px_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov8l_352_608px_segment_1_of_2_edgetpu.tflite'], -6: ['4x_first_seg_yolov8l_416_640px_segment_0_of_3_edgetpu.tflite', '4x_first_seg_yolov8l_416_640px_segment_1_of_3_edgetpu.tflite', '4x_first_seg_yolov8l_416_640px_segment_2_of_3_edgetpu.tflite'], +# 234.7 ms/inference ( 4.3 FPS; 0.8 tensor MPx/sec) for 1 TPUs using 1 segments: yolov8l_352_608px +# 117.7 ms/inference ( 8.5 FPS; 1.7 tensor MPx/sec) for 2 TPUs using 1 segments: yolov8l_352_608px +# 84.0 ms/inference ( 11.9 FPS; 2.6 tensor MPx/sec) for 3 TPUs using 2 segments: yolov8l_384_608px +# 58.9 ms/inference ( 17.0 FPS; 3.4 tensor MPx/sec) for 4 TPUs using 1 segments: yolov8l_352_608px +# 46.0 ms/inference ( 21.7 FPS; 4.3 tensor 
MPx/sec) for 5 TPUs using 2 segments: yolov8l_352_608px +# 50.6 ms/inference ( 19.7 FPS; 5.0 tensor MPx/sec) for 6 TPUs using 3 segments: yolov8l_416_640px +# 49.2 ms/inference ( 20.3 FPS; 5.1 tensor MPx/sec) for 7 TPUs using 2 segments: yolov8l_416_640px +# 44.6 ms/inference ( 22.4 FPS; 5.6 tensor MPx/sec) for 8 TPUs using 2 segments: yolov8l_416_640px +3: [('2x_first_seg_yolov8l_384_608px_segment_0_of_2_edgetpu.tflite', 'b0c271cc7593b1f9e80af31f00a2ca99'), ('2x_first_seg_yolov8l_384_608px_segment_1_of_2_edgetpu.tflite', '389adfc3a5b8a2fc2bbb45a3d81b8ef5')], +5: [('3x_first_seg_yolov8l_352_608px_segment_0_of_2_edgetpu.tflite', '35aaed3e61e1107767aee408f3b89865'), ('3x_first_seg_yolov8l_352_608px_segment_1_of_2_edgetpu.tflite', '1bf53847af16c9ae9a9b225020742610')], +6: [('4x_first_seg_yolov8l_416_640px_segment_0_of_3_edgetpu.tflite', '146f8cd733e3ed03ccb0b01f1b2f6e73'), ('4x_first_seg_yolov8l_416_640px_segment_1_of_3_edgetpu.tflite', '0c3ebe468b46142aae8d6a358cde97dd'), ('4x_first_seg_yolov8l_416_640px_segment_2_of_3_edgetpu.tflite', '694c46bc795510235355efcc23f50b3d')], +7: [('2x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '3ac992275b06e8d32f5a84daec41889f'), ('2x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite', '2b103ec1a0f9812dd9545a94fbd2de9b')], +8: [('15x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', 'b9b49ba86b453a5e3365535b241245b3'), ('15x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite', '1f81c69cda86b22244bf83599b77274a')], + '_tflite': ('all_segments_yolov8l_352_608px_edgetpu.tflite', 'f4e5f929370752ade565d5b048b214a7') }, 'yolov9t_416_640px': { -# 29.3 ms/inference ( 34.1 FPS; 7.9 tensor MPx/sec) for 1 TPUs using 1 segments: yolov9t_384_640px -# 14.6 ms/inference ( 68.6 FPS; 15.8 tensor MPx/sec) for 2 TPUs using 1 segments: yolov9t_384_640px -# 10.3 ms/inference ( 96.7 FPS; 22.3 tensor MPx/sec) for 3 TPUs using 1 segments: yolov9t_384_640px -# 8.3 ms/inference (120.2 FPS; 27.7 tensor MPx/sec) for 4 TPUs using 1 segments: yolov9t_384_640px -# 7.3 ms/inference (137.7 FPS; 31.8 tensor MPx/sec) for 5 TPUs using 1 segments: yolov9t_384_640px -# 6.6 ms/inference (151.1 FPS; 34.8 tensor MPx/sec) for 6 TPUs using 1 segments: yolov9t_384_640px +# 28.7 ms/inference ( 34.9 FPS; 8.0 tensor MPx/sec) for 1 TPUs using 1 segments: yolov9t_384_640px +# 14.5 ms/inference ( 69.1 FPS; 15.9 tensor MPx/sec) for 2 TPUs using 1 segments: yolov9t_384_640px +# 9.8 ms/inference (101.7 FPS; 23.5 tensor MPx/sec) for 3 TPUs using 1 segments: yolov9t_384_640px +# 7.6 ms/inference (131.8 FPS; 30.4 tensor MPx/sec) for 4 TPUs using 1 segments: yolov9t_384_640px +# 6.3 ms/inference (158.5 FPS; 36.5 tensor MPx/sec) for 5 TPUs using 1 segments: yolov9t_384_640px +# 5.3 ms/inference (190.1 FPS; 43.9 tensor MPx/sec) for 6 TPUs using 1 segments: yolov9t_384_640px +# 4.6 ms/inference (217.9 FPS; 50.3 tensor MPx/sec) for 7 TPUs using 1 segments: yolov9t_384_640px +# 4.5 ms/inference (223.7 FPS; 51.6 tensor MPx/sec) for 8 TPUs using 1 segments: yolov9t_384_640px + '_tflite': ('all_segments_yolov9t_384_640px_edgetpu.tflite', '95f95ee11fd25ade78ebc70168a1ec3f') }, 'yolov9s_416_640px': { -# 45.9 ms/inference ( 21.8 FPS; 4.1 tensor MPx/sec) for 1 TPUs using 1 segments: yolov9s_352_576px -# 22.8 ms/inference ( 43.9 FPS; 8.3 tensor MPx/sec) for 2 TPUs using 1 segments: yolov9s_352_576px -# 15.3 ms/inference ( 65.1 FPS; 12.3 tensor MPx/sec) for 3 TPUs using 1 segments: yolov9s_352_576px -# 11.7 ms/inference ( 85.4 FPS; 16.1 tensor MPx/sec) for 4 TPUs using 1 segments: 
yolov9s_352_576px -# 10.3 ms/inference ( 97.3 FPS; 19.4 tensor MPx/sec) for 5 TPUs using 1 segments: yolov9s_352_608px -# 8.3 ms/inference (120.5 FPS; 22.8 tensor MPx/sec) for 6 TPUs using 2 segments: yolov9s_352_576px -6: ['3x_first_seg_yolov9s_352_576px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov9s_352_576px_segment_1_of_2_edgetpu.tflite'], +# 45.1 ms/inference ( 22.2 FPS; 4.2 tensor MPx/sec) for 1 TPUs using 1 segments: yolov9s_352_576px +# 22.7 ms/inference ( 44.1 FPS; 8.3 tensor MPx/sec) for 2 TPUs using 1 segments: yolov9s_352_576px +# 15.3 ms/inference ( 65.5 FPS; 12.4 tensor MPx/sec) for 3 TPUs using 1 segments: yolov9s_352_576px +# 11.5 ms/inference ( 87.0 FPS; 16.5 tensor MPx/sec) for 4 TPUs using 1 segments: yolov9s_352_576px +# 9.3 ms/inference (107.2 FPS; 20.3 tensor MPx/sec) for 5 TPUs using 1 segments: yolov9s_352_576px +# 8.0 ms/inference (125.6 FPS; 23.8 tensor MPx/sec) for 6 TPUs using 1 segments: yolov9s_352_576px +# 6.9 ms/inference (144.3 FPS; 27.3 tensor MPx/sec) for 7 TPUs using 1 segments: yolov9s_352_576px +# 6.2 ms/inference (161.8 FPS; 30.6 tensor MPx/sec) for 8 TPUs using 1 segments: yolov9s_352_576px + '_tflite': ('all_segments_yolov9s_352_576px_edgetpu.tflite', 'da983c998888f7f152dd59c4001e6465') }, 'yolov9m_416_640px': { -# 148.0 ms/inference ( 6.8 FPS; 1.3 tensor MPx/sec) for 1 TPUs using 1 segments: yolov9m_352_576px -# 73.8 ms/inference ( 13.5 FPS; 2.6 tensor MPx/sec) for 2 TPUs using 1 segments: yolov9m_352_576px -# 49.6 ms/inference ( 20.2 FPS; 3.8 tensor MPx/sec) for 3 TPUs using 1 segments: yolov9m_352_576px -# 37.1 ms/inference ( 26.9 FPS; 5.1 tensor MPx/sec) for 4 TPUs using 1 segments: yolov9m_352_576px -# 35.4 ms/inference ( 28.3 FPS; 6.2 tensor MPx/sec) for 5 TPUs using 1 segments: yolov9m_384_608px -# 33.5 ms/inference ( 29.9 FPS; 7.5 tensor MPx/sec) for 6 TPUs using 2 segments: yolov9m_416_640px -6: ['3x_first_seg_yolov9m_416_640px_segment_0_of_2_edgetpu.tflite', '3x_first_seg_yolov9m_416_640px_segment_1_of_2_edgetpu.tflite'], +# 146.9 ms/inference ( 6.8 FPS; 1.3 tensor MPx/sec) for 1 TPUs using 1 segments: yolov9m_352_576px +# 73.6 ms/inference ( 13.6 FPS; 2.6 tensor MPx/sec) for 2 TPUs using 1 segments: yolov9m_352_576px +# 46.9 ms/inference ( 21.3 FPS; 4.0 tensor MPx/sec) for 3 TPUs using 2 segments: yolov9m_352_576px +# 35.4 ms/inference ( 28.2 FPS; 5.3 tensor MPx/sec) for 4 TPUs using 2 segments: yolov9m_352_576px +# 29.5 ms/inference ( 33.9 FPS; 6.4 tensor MPx/sec) for 5 TPUs using 2 segments: yolov9m_352_576px +# 33.2 ms/inference ( 30.1 FPS; 7.6 tensor MPx/sec) for 6 TPUs using 2 segments: yolov9m_416_640px +# 21.9 ms/inference ( 45.7 FPS; 8.6 tensor MPx/sec) for 7 TPUs using 2 segments: yolov9m_352_576px +# 19.8 ms/inference ( 50.4 FPS; 9.5 tensor MPx/sec) for 8 TPUs using 2 segments: yolov9m_352_576px +3: [('15x_first_seg_yolov9m_352_576px_segment_0_of_2_edgetpu.tflite', '5821e54b26d69e4cb1c2d2353b2a32b8'), ('15x_first_seg_yolov9m_352_576px_segment_1_of_2_edgetpu.tflite', 'af1a3b119110c2f13f19c8e9ed39a348')], +4: [('2x_first_seg_yolov9m_352_576px_segment_0_of_2_edgetpu.tflite', 'ca8e62206d0998521cdcdf482a8aafa7'), ('2x_first_seg_yolov9m_352_576px_segment_1_of_2_edgetpu.tflite', '8722298ca656a9aa905b47eebd99f143')], +5: [('4x_first_seg_yolov9m_352_576px_segment_0_of_2_edgetpu.tflite', 'd13d791a033421664d314cd8fe7159a5'), ('4x_first_seg_yolov9m_352_576px_segment_1_of_2_edgetpu.tflite', '23f0abd7c8fc2189afe824307c7e764e')], +6: [('4x_first_seg_yolov9m_416_640px_segment_0_of_2_edgetpu.tflite', '3575d250197920b946d9a007e3175138'), 
('4x_first_seg_yolov9m_416_640px_segment_1_of_2_edgetpu.tflite', '28d49a444bdf477f001bf084adcd2d65')], +7: [('3x_first_seg_yolov9m_352_576px_segment_0_of_2_edgetpu.tflite', 'c037de1514a795711ff2fcf9207a9d9d'), ('3x_first_seg_yolov9m_352_576px_segment_1_of_2_edgetpu.tflite', 'fa2f5522c0244aaab5be4ba03e829319')], +8: [('4x_first_seg_yolov9m_352_576px_segment_0_of_2_edgetpu.tflite', 'd13d791a033421664d314cd8fe7159a5'), ('4x_first_seg_yolov9m_352_576px_segment_1_of_2_edgetpu.tflite', '23f0abd7c8fc2189afe824307c7e764e')], + '_tflite': ('all_segments_yolov9m_352_576px_edgetpu.tflite', '1bdb7e615eb275b74f0791448011a70f') }, 'yolov9c_416_640px': { -# 306.7 ms/inference ( 3.3 FPS; 0.8 tensor MPx/sec) for 1 TPUs using 1 segments: yolov9c_416_640px -# 153.2 ms/inference ( 6.5 FPS; 1.6 tensor MPx/sec) for 2 TPUs using 1 segments: yolov9c_416_640px -# 103.0 ms/inference ( 9.7 FPS; 2.4 tensor MPx/sec) for 3 TPUs using 1 segments: yolov9c_416_640px -# 74.6 ms/inference ( 13.4 FPS; 3.4 tensor MPx/sec) for 4 TPUs using 2 segments: yolov9c_416_640px -# 59.5 ms/inference ( 16.8 FPS; 4.2 tensor MPx/sec) for 5 TPUs using 2 segments: yolov9c_416_640px -# 47.1 ms/inference ( 21.2 FPS; 4.6 tensor MPx/sec) for 6 TPUs using 2 segments: yolov9c_384_608px -4: ['dumb_yolov9c_416_640px_segment_0_of_2_edgetpu.tflite', 'dumb_yolov9c_416_640px_segment_1_of_2_edgetpu.tflite'], -5: ['15x_last_seg_yolov9c_416_640px_segment_0_of_2_edgetpu.tflite', '15x_last_seg_yolov9c_416_640px_segment_1_of_2_edgetpu.tflite'], -6: ['all_segments_yolov9c_384_608px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov9c_384_608px_segment_1_of_2_edgetpu.tflite'], +# 304.4 ms/inference ( 3.3 FPS; 0.8 tensor MPx/sec) for 1 TPUs using 1 segments: yolov9c_416_640px +# 112.2 ms/inference ( 8.9 FPS; 1.7 tensor MPx/sec) for 2 TPUs using 2 segments: yolov9c_352_576px +# 102.0 ms/inference ( 9.8 FPS; 2.5 tensor MPx/sec) for 3 TPUs using 1 segments: yolov9c_416_640px +# 72.6 ms/inference ( 13.8 FPS; 3.5 tensor MPx/sec) for 4 TPUs using 2 segments: yolov9c_416_640px +# 59.2 ms/inference ( 16.9 FPS; 4.2 tensor MPx/sec) for 5 TPUs using 2 segments: yolov9c_416_640px +# 38.6 ms/inference ( 25.9 FPS; 4.9 tensor MPx/sec) for 6 TPUs using 2 segments: yolov9c_352_576px +# 35.1 ms/inference ( 28.5 FPS; 5.4 tensor MPx/sec) for 7 TPUs using 2 segments: yolov9c_352_576px +# 32.8 ms/inference ( 30.5 FPS; 5.8 tensor MPx/sec) for 8 TPUs using 2 segments: yolov9c_352_576px +2: [('all_segments_yolov9c_352_576px_segment_0_of_2_edgetpu.tflite', '8c222f8b68ac3409c911e94d9fe238b5'), ('all_segments_yolov9c_352_576px_segment_1_of_2_edgetpu.tflite', '19cad259fa12078a9c578edc75237039')], +4: [('dumb_yolov9c_416_640px_segment_0_of_2_edgetpu.tflite', '9a12743271de03b8cbc418010ca49dd7'), ('dumb_yolov9c_416_640px_segment_1_of_2_edgetpu.tflite', '9c29968741d13145b452f0d5e78f72af')], +5: [('15x_last_seg_yolov9c_416_640px_segment_0_of_2_edgetpu.tflite', 'c3db29ffe0eea32d18d3f1ce00c67977'), ('15x_last_seg_yolov9c_416_640px_segment_1_of_2_edgetpu.tflite', '98798da3713530e359ff3045ddc02cfe')], +6: [('2x_last_seg_yolov9c_352_576px_segment_0_of_2_edgetpu.tflite', '6411b054b7e36a781b533a47314e1f43'), ('2x_last_seg_yolov9c_352_576px_segment_1_of_2_edgetpu.tflite', '5b74321e75daab50ec3708bd04dac846')], +7: [('15x_last_seg_yolov9c_352_576px_segment_0_of_2_edgetpu.tflite', '37b01ef791aae70d97877949c0b40421'), ('15x_last_seg_yolov9c_352_576px_segment_1_of_2_edgetpu.tflite', '3732f1586b0798ae6a078a13a79c0832')], +8: [('2x_last_seg_yolov9c_352_576px_segment_0_of_2_edgetpu.tflite', 
'6411b054b7e36a781b533a47314e1f43'), ('2x_last_seg_yolov9c_352_576px_segment_1_of_2_edgetpu.tflite', '5b74321e75daab50ec3708bd04dac846')], + '_tflite': ('all_segments_yolov9c_416_640px_edgetpu.tflite', 'c02b9fec754b84c4e4c1757fd2122ddf') }, 'ipcam-general-v8': { -# 233.2 ms/inference ( 4.3 FPS; 1.1 tensor MPx/sec) for 1 TPUs using 1 segments: ipcam-general-v8 +# 233.5 ms/inference ( 4.3 FPS; 1.1 tensor MPx/sec) for 1 TPUs using 1 segments: ipcam-general-v8 # 44.6 ms/inference ( 22.4 FPS; 5.6 tensor MPx/sec) for 2 TPUs using 2 segments: ipcam-general-v8 -# 22.7 ms/inference ( 44.1 FPS; 11.1 tensor MPx/sec) for 3 TPUs using 2 segments: ipcam-general-v8 -# 16.1 ms/inference ( 62.0 FPS; 15.6 tensor MPx/sec) for 4 TPUs using 2 segments: ipcam-general-v8 -# 12.4 ms/inference ( 80.8 FPS; 20.3 tensor MPx/sec) for 5 TPUs using 2 segments: ipcam-general-v8 -# 10.5 ms/inference ( 95.5 FPS; 23.9 tensor MPx/sec) for 6 TPUs using 2 segments: ipcam-general-v8 -2: ['inc_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', 'inc_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], -3: ['all_segments_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', 'all_segments_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], -4: ['2x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '2x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], -5: ['3x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '3x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], -6: ['2x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '2x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], +# 22.4 ms/inference ( 44.6 FPS; 11.2 tensor MPx/sec) for 3 TPUs using 2 segments: ipcam-general-v8 +# 16.0 ms/inference ( 62.5 FPS; 15.7 tensor MPx/sec) for 4 TPUs using 2 segments: ipcam-general-v8 +# 12.1 ms/inference ( 82.9 FPS; 20.8 tensor MPx/sec) for 5 TPUs using 2 segments: ipcam-general-v8 +# 9.9 ms/inference (101.1 FPS; 25.3 tensor MPx/sec) for 6 TPUs using 2 segments: ipcam-general-v8 +# 8.9 ms/inference (112.7 FPS; 28.2 tensor MPx/sec) for 7 TPUs using 2 segments: ipcam-general-v8 +# 8.7 ms/inference (114.7 FPS; 28.8 tensor MPx/sec) for 8 TPUs using 2 segments: ipcam-general-v8 +2: [('all_segments_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', 'b9e472613162f8c16eb809ed57132741'), ('all_segments_ipcam-general-v8_segment_1_of_2_edgetpu.tflite', 'b1ed179f61c50bfd97fbfbd3b3acacbc')], +3: [('166x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '075b6f8c13ff4df6be11f9fb5199f974'), ('166x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite', '2f12424f4695b9b555a5fc7ffd6b8b97')], +4: [('2x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '8c87ba2ac71450df22e0d4bfe6f2321e'), ('2x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite', '85b38d05ef94b39f409d7de7688ba053')], +5: [('2x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '8c87ba2ac71450df22e0d4bfe6f2321e'), ('2x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite', '85b38d05ef94b39f409d7de7688ba053')], +6: [('2x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '8c87ba2ac71450df22e0d4bfe6f2321e'), ('2x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite', '85b38d05ef94b39f409d7de7688ba053')], +7: [('4x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', 'ace0bc3dce916f0b739ee1f9350e6c0d'), ('4x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite', 'a89dceb8ced5134078d15922ce77e560')], +8: [('2x_first_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', 
'8c87ba2ac71450df22e0d4bfe6f2321e'), ('2x_first_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite', '85b38d05ef94b39f409d7de7688ba053')], + '_tflite': ('all_segments_ipcam-general-v8_edgetpu.tflite', '89ded45c3c3d9d7a3e12100ccdd9627b') } } diff --git a/tpu_runner.py b/tpu_runner.py index ed73471..adafcae 100644 --- a/tpu_runner.py +++ b/tpu_runner.py @@ -21,6 +21,7 @@ import logging import queue import math +from numba import jit try: import cv2 @@ -912,7 +913,9 @@ def _decode_result(self, result_list, score_threshold: float): return ([boxes], [class_ids], [scores], [len(scores)]) - def _xywh2xyxy(self, xywh): + @staticmethod + @jit(nopython=True, fastmath=True) + def _xywh2xyxy(xywh): # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right xyxy = np.copy(xywh) xyxy[:, 1] = xywh[:, 0] - xywh[:, 2] * 0.5 # top left x @@ -921,8 +924,9 @@ def _xywh2xyxy(self, xywh): xyxy[:, 2] = xywh[:, 1] + xywh[:, 3] * 0.5 # bottom right y return xyxy - - def _nms(self, dets, scores, thresh): + @staticmethod + @jit(nopython=True, fastmath=True) + def _nms(dets, scores, thresh): ''' dets is a numpy array : num_dets, 4 scores is a numpy array : num_dets, @@ -1202,26 +1206,23 @@ def _resize_and_chop_tiles(self, for y_off in range(0, max(img_h - m_height, 0) + tiles_y, step_y): # Adjust contrast on a per-chunk basis; we will likely be quantizing the image during scaling if isinstance(image, np.ndarray): - cropped_arr = self._autocontrast_scale_np(image, (x_off, y_off, - x_off + m_width, - y_off + m_height)) + cropped_arr, input_scale, input_zero \ + = self._autocontrast_scale_np(image, (x_off, y_off, + x_off + m_width, + y_off + m_height)) else: - cropped_arr = self._pil_autocontrast_scale_np(image, (x_off, y_off, - x_off + m_width, - y_off + m_height)) + cropped_arr, input_scale, input_zero \ + = self._pil_autocontrast_scale_np(image, (x_off, y_off, + x_off + m_width, + y_off + m_height)) logging.debug("Resampled image tile {} at offset {}, {}".format(cropped_arr.shape, x_off, y_off)) resamp_info = (x_off, y_off, i_width/img_w, i_height/img_h) - # Cast and clip - tile_arr = np.zeros(cropped_arr.shape, dtype=self.input_details['dtype']) dinfo = np.iinfo(self.input_details['dtype']) - np.clip(cropped_arr, dinfo.min, dinfo.max, out=tile_arr, casting='unsafe') - - # Ensure this is the tensor-ready size - tile_height, tile_width, tile_c = tile_arr.shape - if tile_width != m_width or tile_height != m_height: - tile_arr = np.pad(tile_arr, ((0, m_height - tile_height), (0, m_width - tile_width), (0, 0))) + tile_arr = np.empty((m_height, m_width, cropped_arr.shape[2]), dtype=dinfo.dtype) + self._jit_rescale_to_tensor(cropped_arr, input_scale, input_zero, + dinfo.min, dinfo.max, tile_arr) tiles.append((tile_arr, resamp_info)) @@ -1230,14 +1231,49 @@ def _resize_and_chop_tiles(self, return tiles + @staticmethod + @jit(nopython=True, fastmath=True) + def _jit_rescale_to_tensor(cropped_arr, input_scale, input_zero, dinfo_min, dinfo_max, tile_arr): + # Image data in the upper left quad + for y in range(cropped_arr.shape[0]): + for x in range(cropped_arr.shape[1]): + for z in range(cropped_arr.shape[2]): + cropped_val = cropped_arr[y,x,z] * input_scale + input_zero + + if cropped_val < dinfo_min: + cropped_val = dinfo_min + elif cropped_val > dinfo_max: + cropped_val = dinfo_max + + tile_arr[y,x,z] = cropped_val + + # Upper right quad padding + for y in range(cropped_arr.shape[0]): + for x in range(cropped_arr.shape[1], tile_arr.shape[1]): + for z in range(cropped_arr.shape[2]): + 
tile_arr[y,x,z] = input_zero
+
+        # Lower left quad padding
+        for y in range(cropped_arr.shape[0], tile_arr.shape[0]):
+            for x in range(cropped_arr.shape[1]):
+                for z in range(cropped_arr.shape[2]):
+                    tile_arr[y,x,z] = input_zero
+
+        # Lower right quad padding
+        for y in range(cropped_arr.shape[0], tile_arr.shape[0]):
+            for x in range(cropped_arr.shape[1], tile_arr.shape[1]):
+                for z in range(cropped_arr.shape[2]):
+                    tile_arr[y,x,z] = input_zero
+
+
     def _pil_autocontrast_scale_np(self, image, crop_dim):
         image_chunk = ImageOps.autocontrast(image.crop(crop_dim), 1)
-        return np.asarray(image_chunk, np.float32) * self.input_scale + self.input_zero
+        return np.asarray(image_chunk), self.input_scale, self.input_zero  # ndarray, not PIL Image, so the jit tile kernel can use .shape
 
     def _autocontrast_scale_np(self, image, crop_dim):
         cropped_img = image[crop_dim[1]:crop_dim[3],crop_dim[0]:crop_dim[2]]
 
-        # Convert to gret for histogram
+        # Convert to grey for histogram
         gray = cv2.cvtColor(cropped_img, cv2.COLOR_RGB2GRAY)
 
         # Calculate grayscale histogram
@@ -1275,9 +1311,7 @@ def _autocontrast_scale_np(self, image, crop_dim):
         #  - Reducing quantization error.
         #  - Not clamping dynamic range to uint8 before scaling to the input tensor.
         #  - Fewer operations per pixel.
-        return np.asarray(cropped_img, np.float32) \
-               * (alpha * self.input_scale) \
-               + (beta * self.input_scale + self.input_zero)
+        return cropped_img, (alpha * self.input_scale), (beta * self.input_scale + self.input_zero)
 
 
     def _get_tiles(self, options: Options, image: Image):

From 13cc413ee2b5ecf85e114b9317e701b153609d42 Mon Sep 17 00:00:00 2001
From: Seth Price
Date: Mon, 28 Oct 2024 21:10:51 -0700
Subject: [PATCH 5/5] Speling

---
 options.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/options.py b/options.py
index b193b01..580aa36 100644
--- a/options.py
+++ b/options.py
@@ -21,7 +21,7 @@ def __init__(self, model_name: str, model_name_pattern: str, std_model_name: str
         self.tpu_model_name = tpu_model_name
         self.labels_name = labels_name
 
-        # Tested on a HP ElietDesk G4 800 SFF i5-8500 3.0 GHz
+        # Tested on a HP EliteDesk G4 800 SFF i5-8500 3.0 GHz
         self.MODEL_SEGMENTS = {
             'tf2_ssd_mobilenet_v2_coco17_ptq': {
 # 6.6 ms/inference (151.1 FPS; 12.3 tensor MPx/sec) for 1 TPUs using 1 segments: tf2_ssd_mobilenet_v2_coco17_ptq