cp add_cls_id_layout_ratio (#3621)

Sunting78 · web-flow · commit f67e70365f33 · 2025-03-17T10:38:37.000+08:00
diff --git a/docs/module_usage/tutorials/ocr_modules/layout_detection.en.md b/docs/module_usage/tutorials/ocr_modules/layout_detection.en.md
@@ -308,23 +308,25 @@ Relevant methods, parameters, and explanations are as follows:
 <tr>
 <td><code>layout_unclip_ratio</code></td>
 <td>Scaling factor for the side length of the detection box; if not specified, the default PaddleX official model configuration will be used</td>
-<td><code>float/list/None</code></td>
+<td><code>float/list/dict/None</code></td>
 <td>
 <ul>
 <li><b>float</b>, a positive float number, e.g., 1.1, means expanding the width and height of the detection box by 1.1 times while keeping the center unchanged</li>
 <li><b>List</b>, e.g., [1.2, 1.5], means expanding the width by 1.2 times and the height by 1.5 times while keeping the center unchanged</li>
+<li><b>dict</b>, keys as <b>int</b> representing <code>cls_id</code>, values as tuples of two floats giving the width and height scaling factors, e.g., <code>{0: (1.1, 2.0)}</code> means expanding the width of cls_id 0 detection boxes by 1.1 times and the height by 2.0 times while keeping the center unchanged</li>
 <li><b>None</b>, not specified, will use the default PaddleX official model configuration</li>
 </ul>
 </td>
 <tr>
 <td><code>layout_merge_bboxes_mode</code></td>
 <td>Merging mode for the detection boxes output by the model; if not specified, the default PaddleX official model configuration will be used</td>
-<td><code>string/None</code></td>
+<td><code>string/dict/None</code></td>
 <td>
 <ul>
 <li><b>large</b>, when set to large, only the largest external box will be retained for overlapping detection boxes, and the internal overlapping boxes will be deleted</li>
 <li><b>small</b>, when set to small, only the smallest internal box will be retained for overlapping detection boxes, and the external overlapping boxes will be deleted</li>
 <li><b>union</b>, no filtering of boxes will be performed, and both internal and external boxes will be retained</li>
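+<li><b>dict</b>, keys as <b>int</b> representing <code>cls_id</code> and values as merging modes, e.g., <code>{0: "large", 2: "small"}</code> means using the large mode for cls_id 0 detection boxes and the small mode for cls_id 2 detection boxes</li>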
 <li><b>None</b>, not specified, will use the default PaddleX official model configuration</li>
 </ul>
 </td>
@@ -395,23 +397,25 @@ Relevant methods, parameters, and explanations are as follows:
 <tr>
 <td><code>layout_unclip_ratio</code></td>
 <td>Scaling factor for the side length of the detection box; if not specified, the default PaddleX official model configuration will be used</td>
-<td><code>float/list/None</code></td>
+<td><code>float/list/dict/None</code></td>
 <td>
 <ul>
 <li><b>float</b>, a positive float number, e.g., 1.1, means expanding the width and height of the detection box by 1.1 times while keeping the center unchanged</li>
 <li><b>List</b>, e.g., [1.2, 1.5], means expanding the width by 1.2 times and the height by 1.5 times while keeping the center unchanged</li>
+<li><b>dict</b>, keys as <b>int</b> representing <code>cls_id</code>, values as tuples of two floats giving the width and height scaling factors, e.g., <code>{0: (1.1, 2.0)}</code> means expanding the width of cls_id 0 detection boxes by 1.1 times and the height by 2.0 times while keeping the center unchanged</li>
 <li><b>None</b>, not specified, will use the <code>layout_unclip_ratio</code> parameter specified in <code>create_model</code>. If not specified in <code>create_model</code>, the default PaddleX official model configuration will be used</li>
 </ul>
 </td>
 <tr>
 <td><code>layout_merge_bboxes_mode</code></td>
 <td>Merging mode for the detection boxes output by the model; if not specified, the default PaddleX official model configuration will be used</td>
-<td><code>string/None</code></td>
+<td><code>string/dict/None</code></td>
 <td>
 <ul>
 <li><b>large</b>, when set to large, only the largest external box will be retained for overlapping detection boxes, and the internal overlapping boxes will be deleted</li>
 <li><b>small</b>, when set to small, only the smallest internal box will be retained for overlapping detection boxes, and the external overlapping boxes will be deleted</li>
 <li><b>union</b>, no filtering of boxes will be performed, and both internal and external boxes will be retained</li>
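+<li><b>dict</b>, keys as <b>int</b> representing <code>cls_id</code> and values as merging modes, e.g., <code>{0: "large", 2: "small"}</code> means using the large mode for cls_id 0 detection boxes and the small mode for cls_id 2 detection boxes</li>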
 <li><b>None</b>, not specified, will use the <code>layout_merge_bboxes_mode</code> parameter specified in <code>create_model</code>. If not specified in <code>create_model</code>, the default PaddleX official model configuration will be used</li>
 </ul>
 </td>
diff --git a/docs/module_usage/tutorials/ocr_modules/layout_detection.md b/docs/module_usage/tutorials/ocr_modules/layout_detection.md
@@ -308,23 +308,25 @@ for res in output:
 <tr>
 <td><code>layout_unclip_ratio</code></td>
 <td>检测框的边长缩放倍数；如果不指定，将默认使用PaddleX官方模型配置</td>
-<td><code>float/list/None</code></td>
+<td><code>float/list/dict/None</code></td>
 <td>
 <ul>
 <li><b>float</b>, 大于0的浮点数，如 1.1 , 表示将模型输出的检测框中心不变，宽和高都扩张1.1倍</li>
 <li><b>列表</b>, 如 [1.2, 1.5] , 表示将模型输出的检测框中心不变，宽度扩张1.2倍，高度扩张1.5倍</li>
+<li><b>字典</b>, 字典的key为<b>int</b>类型，代表<code>cls_id</code>, value为<b>tuple</b>类型，如<code>{0: (1.1, 2.0)}</code>, 表示将模型输出的第0类别检测框中心不变，宽度扩张1.1倍，高度扩张2.0倍</li>
 <li><b>None</b>, 不指定，将默认使用PaddleX官方模型配置</li>
 </ul>
 </td>
 <tr>
 <td><code>layout_merge_bboxes_mode</code></td>
 <td>模型输出的检测框的合并处理模式；如果不指定，将默认使用PaddleX官方模型配置</td>
-<td><code>string/None</code></td>
+<td><code>string/dict/None</code></td>
 <td>
 <ul>
 <li><b>large</b>, 设置为large时，表示在模型输出的检测框中，对于互相重叠包含的检测框，只保留外部最大的框，删除重叠的内部框。</li>
 <li><b>small</b>, 设置为small，表示在模型输出的检测框中，对于互相重叠包含的检测框，只保留内部被包含的小框，删除重叠的外部框。</li>
 <li><b>union</b>, 不进行框的过滤处理，内外框都保留</li>
+<li><b>dict</b>, 字典的key为<b>int</b>类型，代表<code>cls_id</code>, value为<b>str</b>类型, 如<code>{0: "large", 2: "small"}</code>, 表示对第0类别检测框使用large模式，对第2类别检测框使用small模式</li>  
 <li><b>None</b>, 不指定，将默认使用PaddleX官方模型配置</li>
 </ul>
 </td>
@@ -402,23 +404,25 @@ for res in output:
 <tr>
 <td><code>layout_unclip_ratio</code></td>
 <td>检测框的边长缩放倍数；如果不指定，将默认使用PaddleX官方模型配置</td>
-<td><code>float/list/None</code></td>
+<td><code>float/list/dict/None</code></td>
 <td>
 <ul>
 <li><b>float</b>, 大于0的浮点数，如 1.1 , 表示将模型输出的检测框中心不变，宽和高都扩张1.1倍</li>
 <li><b>列表</b>, 如 [1.2, 1.5] , 表示将模型输出的检测框中心不变，宽度扩张1.2倍，高度扩张1.5倍</li>
+<li><b>字典</b>, 字典的key为<b>int</b>类型，代表<code>cls_id</code>, value为<b>tuple</b>类型，如<code>{0: (1.1, 2.0)}</code>, 表示将模型输出的第0类别检测框中心不变，宽度扩张1.1倍，高度扩张2.0倍</li>
 <li><b>None</b>, 不指定，将默认使用 <code>creat_model</code> 指定的 <code>layout_unclip_ratio</code> 参数，如果 <code>creat_model</code> 也没有指定，则默认使用PaddleX官方模型配置</li>
 </ul>
 </td>
 <tr>
 <td><code>layout_merge_bboxes_mode</code></td>
 <td>模型输出的检测框的合并处理模式；如果不指定，将默认使用PaddleX官方模型配置</td>
-<td><code>string/None</code></td>
+<td><code>string/dict/None</code></td>
 <td>
 <ul>
 <li><b>large</b>, 设置为large时，表示在模型输出的检测框中，对于互相重叠包含的检测框，只保留外部最大的框，删除重叠的内部框。</li>
 <li><b>small</b>, 设置为small，表示在模型输出的检测框中，对于互相重叠包含的检测框，只保留内部被包含的小框，删除重叠的外部框。</li>
 <li><b>union</b>, 不进行框的过滤处理，内外框都保留</li>
+<li><b>dict</b>, 字典的key为<b>int</b>类型，代表<code>cls_id</code>, value为<b>str</b>类型, 如<code>{0: "large", 2: "small"}</code>, 表示对第0类别检测框使用large模式，对第2类别检测框使用small模式</li>  
 <li><b>None</b>, 不指定，将默认使用 <code>creat_model</code> 指定的 <code>layout_merge_bboxes_mode</code> 参数，如果 <code>creat_model</code> 也没有指定，则默认使用PaddleX官方模型配置</li>
 </ul>
 </td>
diff --git a/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v3.en.md b/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v3.en.md
@@ -595,11 +595,12 @@ The following are the parameters and their descriptions for the `visual_predict(
 <tr>
 <td><code>layout_unclip_ratio</code></td>
 <td>The expansion coefficient for layout detection.</td>
-<td><code>float|Tuple[float,float]|None</code></td>
+<td><code>float|Tuple[float,float]|dict|None</code></td>
 <td>
 <ul>
 <li><b>float</b>: Any floating-point number greater than <code>0</code>;</li>
 <li><b>Tuple[float,float]</b>: The expansion coefficients in the horizontal and vertical directions, respectively;</li>
+<li><b>dict</b>: keys as <b>int</b> representing <code>cls_id</code>, values as tuples of two floats giving the width and height scaling factors, e.g., <code>{0: (1.1, 2.0)}</code> means expanding the width of cls_id 0 detection boxes by 1.1 times and the height by 2.0 times while keeping the center unchanged;</li>
 <li><b>None</b>: If set to <code>None</code>, it will default to the value initialized by the pipeline, initialized to <code>1.0</code>;</li>
 </ul>
 </td>
@@ -608,10 +609,11 @@ The following are the parameters and their descriptions for the `visual_predict(
 <tr>
 <td><code>layout_merge_bboxes_mode</code></td>
 <td>The overlapping box filtering method.</td>
-<td><code>str|None</code></td>
+<td><code>str|dict|None</code></td>
 <td>
 <ul>
 <li><b>str</b>: large, small, union. Respectively representing retaining the large box, small box, or both when filtering overlapping boxes.</li>
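+<li><b>dict</b>: keys as <b>int</b> representing <code>cls_id</code> and values as merging modes, e.g., <code>{0: "large", 2: "small"}</code> means using the large mode for cls_id 0 detection boxes and the small mode for cls_id 2 detection boxes;</li>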
 <li><b>None</b>: If set to <code>None</code>, it will default to the value initialized by the pipeline, initialized to <code>large</code>;</li>
 </ul>
 </td>
diff --git a/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v3.md b/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v3.md
@@ -593,11 +593,12 @@ PP-ChatOCRv3-doc 预测的流程、API说明、产出说明如下：
 <tr>
 <td><code>layout_unclip_ratio</code></td>
 <td>版面检测扩张系数</td>
-<td><code>float|Tuple[float,float]|None</code></td>
+<td><code>float|Tuple[float,float]|dict|None</code></td>
 <td>
 <ul>
 <li><b>float</b>：任意大于 <code>0</code>  浮点数；</li>
 <li><b>Tuple[float,float]</b>：在横纵两个方向各自的扩张系数；</li>
+<li><b>字典</b>, 字典的key为<b>int</b>类型，代表<code>cls_id</code>, value为<b>tuple</b>类型，如<code>{0: (1.1, 2.0)}</code>, 表示将模型输出的第0类别检测框中心不变，宽度扩张1.1倍，高度扩张2.0倍</li>
 <li><b>None</b>：如果设置为 <code>None</code>, 将默认使用产线初始化的该参数值，初始化为 <code>1.0</code>；</li>
 </ul>
 </td>
@@ -606,10 +607,11 @@ PP-ChatOCRv3-doc 预测的流程、API说明、产出说明如下：
 <tr>
 <td><code>layout_merge_bboxes_mode</code></td>
 <td>重叠框过滤方式</td>
-<td><code>str|None</code></td>
+<td><code>str|dict|None</code></td>
 <td>
 <ul>
 <li><b>str</b>：large，small, union.分别表示重叠框过滤时选择保留大框，小框还是同时保留</li>
+<li><b>dict</b>, 字典的key为<b>int</b>类型，代表<code>cls_id</code>, value为<b>str</b>类型, 如<code>{0: "large", 2: "small"}</code>, 表示对第0类别检测框使用large模式，对第2类别检测框使用small模式</li>  
 <li><b>None</b>：如果设置为 <code>None</code>, 将默认使用产线初始化的该参数值，初始化为 <code>large</code>；</li>
 </ul>
 </td>
diff --git a/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v4.en.md b/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v4.en.md
@@ -672,11 +672,12 @@ The following are the parameters and descriptions of the `visual_predict()` meth
 <tr>
 <td><code>layout_unclip_ratio</code></td>
 <td>The expansion coefficient for layout detection.</td>
-<td><code>float|Tuple[float,float]|None</code></td>
+<td><code>float|Tuple[float,float]|dict|None</code></td>
 <td>
 <ul>
   <li><b>float</b>: Any floating-point number greater than <code>0</code>;</li>
   <li><b>Tuple[float,float]</b>: The expansion coefficients in the horizontal and vertical directions, respectively;</li>
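+  <li><b>dict</b>: keys as <b>int</b> representing <code>cls_id</code>, values as tuples of two floats giving the width and height scaling factors for each category, e.g., <code>{0: (1.1, 2.0)}</code> means expanding the width of cls_id 0 detection boxes by 1.1 times and the height by 2.0 times while keeping the center unchanged;</li>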
   <li><b>None</b>: If set to <code>None</code>, it will default to the value initialized by the pipeline, initialized to <code>1.0</code>;</li>
 </ul>
 </td>
@@ -685,10 +686,11 @@ The following are the parameters and descriptions of the `visual_predict()` meth
 <tr>
 <td><code>layout_merge_bboxes_mode</code></td>
 <td>The method for filtering overlapping bounding boxes.</td>
-<td><code>str|None</code></td>
+<td><code>str|dict|None</code></td>
 <td>
 <ul>
   <li><b>str</b>: large, small, union. Respectively representing retaining the larger box, smaller box, or both when overlapping boxes are filtered.</li>
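+  <li><b>dict</b>: keys as <b>int</b> representing <code>cls_id</code> and values as merging modes for each category, e.g., <code>{0: "large", 2: "small"}</code> means using the large mode for cls_id 0 detection boxes and the small mode for cls_id 2 detection boxes;</li>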
   <li><b>None</b>: If set to <code>None</code>, it will default to the value initialized by the pipeline, initialized to <code>large</code>;</li>
 </ul>
 </td>
diff --git a/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v4.md b/docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v4.md
@@ -841,11 +841,12 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下：
 <tr>
 <td><code>layout_unclip_ratio</code></td>
 <td>版面检测扩张系数</td>
-<td><code>float|Tuple[float,float]|None</code></td>
+<td><code>float|Tuple[float,float]|dict|None</code></td>
 <td>
 <ul>
   <li><b>float</b>：任意大于 <code>0</code>  浮点数；</li>
   <li><b>Tuple[float,float]</b>：在横纵两个方向各自的扩张系数；</li>
+  <li><b>字典</b>, 字典的key为<b>int</b>类型，代表<code>cls_id</code>, value为<b>tuple</b>类型，如<code>{0: (1.1, 2.0)}</code>, 表示将模型输出的第0类别检测框中心不变，宽度扩张1.1倍，高度扩张2.0倍</li>
   <li><b>None</b>：如果设置为 <code>None</code>, 将默认使用产线初始化的该参数值，初始化为 <code>1.0</code>；</li>
 </ul>
 </td>
@@ -854,10 +855,11 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下：
 <tr>
 <td><code>layout_merge_bboxes_mode</code></td>
 <td>重叠框过滤方式</td>
-<td><code>str|None</code></td>
+<td><code>str|dict|None</code></td>
 <td>
 <ul>
   <li><b>str</b>：large，small, union.分别表示重叠框过滤时选择保留大框，小框还是同时保留</li>
+  <li><b>dict</b>, 字典的key为<b>int</b>类型，代表<code>cls_id</code>, value为<b>str</b>类型, 如<code>{0: "large", 2: "small"}</code>, 表示对第0类别检测框使用large模式，对第2类别检测框使用small模式</li>
   <li><b>None</b>：如果设置为 <code>None</code>, 将默认使用产线初始化的该参数值，初始化为 <code>large</code>；</li>
 </ul>
 </td>
diff --git a/paddlex/inference/models/object_detection/predictor.py b/paddlex/inference/models/object_detection/predictor.py
@@ -50,7 +50,7 @@ def __init__(
         img_size: Optional[Union[int, Tuple[int, int]]] = None,
         threshold: Optional[Union[float, dict]] = None,
         layout_nms: Optional[bool] = None,
-        layout_unclip_ratio: Optional[Union[float, Tuple[float, float]]] = None,
+        layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None,
         layout_merge_bboxes_mode: Optional[Union[str, dict]] = None,
         **kwargs,
     ):
@@ -91,9 +91,11 @@ def __init__(
                 assert (
                     len(layout_unclip_ratio) == 2
                 ), f"The length of `layout_unclip_ratio` should be 2."
+            elif isinstance(layout_unclip_ratio, dict):
+                pass
             else:
                 raise ValueError(
-                    f"The type of `layout_unclip_ratio` must be float or Tuple[float, float], but got {type(layout_unclip_ratio)}."
+                    f"The type of `layout_unclip_ratio` must be float, Tuple[float, float] or Dict, but got {type(layout_unclip_ratio)}."
                 )
 
         if layout_merge_bboxes_mode is not None:
@@ -209,7 +211,7 @@ def process(
         batch_data: List[Any],
         threshold: Optional[Union[float, dict]] = None,
         layout_nms: bool = False,
-        layout_unclip_ratio: Optional[Union[float, Tuple[float, float]]] = None,
+        layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None,
         layout_merge_bboxes_mode: Optional[Union[str, dict]] = None,
     ):
         """
diff --git a/paddlex/inference/models/object_detection/processors.py b/paddlex/inference/models/object_detection/processors.py
@@ -540,23 +540,48 @@ def unclip_boxes(boxes, unclip_ratio=None):
     if unclip_ratio is None:
         return boxes
 
-    widths = boxes[:, 4] - boxes[:, 2]
-    heights = boxes[:, 5] - boxes[:, 3]
-
-    new_w = widths * unclip_ratio[0]
-    new_h = heights * unclip_ratio[1]
-    center_x = boxes[:, 2] + widths / 2
-    center_y = boxes[:, 3] + heights / 2
-
-    new_x1 = center_x - new_w / 2
-    new_y1 = center_y - new_h / 2
-    new_x2 = center_x + new_w / 2
-    new_y2 = center_y + new_h / 2
-    expanded_boxes = np.column_stack(
-        (boxes[:, 0], boxes[:, 1], new_x1, new_y1, new_x2, new_y2)
-    )
+    if isinstance(unclip_ratio, dict):
+        expanded_boxes = []
+        for box in boxes:
+            class_id, score, x1, y1, x2, y2 = box
+            if class_id in unclip_ratio:
+                width_ratio, height_ratio = unclip_ratio[class_id]
+
+                width = x2 - x1
+                height = y2 - y1
+
+                new_w = width * width_ratio
+                new_h = height * height_ratio
+                center_x = x1 + width / 2
+                center_y = y1 + height / 2
+
+                new_x1 = center_x - new_w / 2
+                new_y1 = center_y - new_h / 2
+                new_x2 = center_x + new_w / 2
+                new_y2 = center_y + new_h / 2
+
+                expanded_boxes.append([class_id, score, new_x1, new_y1, new_x2, new_y2])
+            else:
+                expanded_boxes.append(box)
+        return np.array(expanded_boxes)
 
-    return expanded_boxes
+    else:
+        widths = boxes[:, 4] - boxes[:, 2]
+        heights = boxes[:, 5] - boxes[:, 3]
+
+        new_w = widths * unclip_ratio[0]
+        new_h = heights * unclip_ratio[1]
+        center_x = boxes[:, 2] + widths / 2
+        center_y = boxes[:, 3] + heights / 2
+
+        new_x1 = center_x - new_w / 2
+        new_y1 = center_y - new_h / 2
+        new_x2 = center_x + new_w / 2
+        new_y2 = center_y + new_h / 2
+        expanded_boxes = np.column_stack(
+            (boxes[:, 0], boxes[:, 1], new_x1, new_y1, new_x2, new_y2)
+        )
+        return expanded_boxes
 
 
 def iou(box1, box2):
@@ -687,8 +712,8 @@ def apply(
         img_size: Tuple[int, int],
         threshold: Union[float, dict],
         layout_nms: Optional[bool],
-        layout_unclip_ratio: Optional[Union[float, Tuple[float, float]]],
-        layout_merge_bboxes_mode: Optional[str],
+        layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]],
+        layout_merge_bboxes_mode: Optional[Union[str, dict]],
     ) -> Boxes:
         """Apply post-processing to the detection boxes.
 
@@ -774,9 +799,11 @@ def apply(
                 assert (
                     len(layout_unclip_ratio) == 2
                 ), f"The length of `layout_unclip_ratio` should be 2."
+            elif isinstance(layout_unclip_ratio, dict):
+                pass 
             else:
                 raise ValueError(
-                    f"The type of `layout_unclip_ratio` must be float or Tuple[float, float], but got {type(layout_unclip_ratio)}."
+                    f"The type of `layout_unclip_ratio` must be float, Tuple[float, float] or  Dict[int, Tuple[float, float]], but got {type(layout_unclip_ratio)}."
                 )
             boxes = unclip_boxes(boxes, layout_unclip_ratio)