Commit a96ba4f

Optimize StructureSystem for better OCR accuracy
1 parent e7c0fb9 · commit a96ba4f

File tree

2 files changed: +97, -135 lines


paddleocr.py (+5 -93)

@@ -634,10 +634,10 @@ def __init__(self, **kwargs):
         super().__init__(params)
         self.page_num = params.page_num
 
-    def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, alpha_color=(255, 255, 255), dt_boxes=None):
+    def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, alpha_color=(255, 255, 255)):
         """
         OCR with PaddleOCR
-
+
         args:
             img: img for OCR, support ndarray, img_path and list or ndarray
             det: use text detection or not. If False, only rec will be exec. Default is True
@@ -646,7 +646,6 @@ def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, alpha_col
             bin: binarize image to black and white. Default is False.
             inv: invert image colors. Default is False.
             alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
-            dt_boxes: user-specified bounding boxes for OCR. If None, the boxes will be detected automatically.
         """
         assert isinstance(img, (np.ndarray, list, str, bytes))
         if isinstance(img, list) and det == True:
@@ -679,7 +678,7 @@ def preprocess_image(_image):
         ocr_res = []
         for idx, img in enumerate(imgs):
             img = preprocess_image(img)
-            dt_boxes, rec_res, _ = self.__call__(img, cls, dt_boxes=dt_boxes)
+            dt_boxes, rec_res, _ = self.__call__(img, cls)
             if not dt_boxes and not rec_res:
                 ocr_res.append(None)
                 continue
@@ -720,27 +719,6 @@ class PPStructure(StructureSystem):
     def __init__(self, **kwargs):
         params = parse_args(mMain=False)
         params.__dict__.update(**kwargs)
-
-        # As reported in issues such as #10270 and #11665, the current
-        # implementation has problems with the precision of OCR recognition.
-        #
-        # To address this issue, here we implement a patch fix by employing a
-        # combination of PaddleOCR (TextSystem) and StructureSystem.
-        self._args = params
-
-        if self._args.ocr:
-            # If OCR is enabled, we first initialize the structure engine without
-            # enabling OCR, and then initialize a standalone OCR engine.
-            kwargs.pop('ocr', None)
-            self._init_structure(ocr=False, **kwargs)
-            self._ocr_engine = PaddleOCR(**kwargs)
-        else:
-            # Init the structure engine with the raw parameters.
-            self._init_structure(**kwargs)
-
-    def _init_structure(self, **kwargs):
-        params = parse_args(mMain=False)
-        params.__dict__.update(**kwargs)
         assert params.structure_version in SUPPORT_STRUCTURE_MODEL_VERSION, "structure_version must in {}, but get {}".format(
             SUPPORT_STRUCTURE_MODEL_VERSION, params.structure_version)
         params.use_gpu = check_gpu(params.use_gpu)
@@ -796,78 +774,12 @@ def _init_structure(self, **kwargs):
         logger.debug(params)
         super().__init__(params)
 
-    def __call__(self, img, return_ocr_result_in_table=False, img_idx=0):
-        if not self._args.ocr:
-            return self._predict_structure(img, return_ocr_result_in_table, img_idx, self._args.alphacolor)
-
-        # We first detect all text regions by using the OCR engine.
-        dt_boxes, elapse = self._ocr_engine.text_detector(img)
-
-        # Then do layout analysis by using the structure engine.
-        result = self._predict_structure(img, return_ocr_result_in_table, img_idx, self._args.alphacolor)
-        for r in result:
-            # Ignore tables since they are parsed separately by the internal table model.
-            if r['type'] == 'table':
-                continue
-
-            # Keep only the regions that intersect with the current bbox.
-            r_dt_boxes = self._filter_boxes(dt_boxes, r['bbox'])
-
-            # Perform OCR recognition on texts within these regions.
-            ocr_result = self._ocr_engine.ocr(img,
-                                              det=self._args.det,
-                                              rec=self._args.rec,
-                                              cls=self._args.use_angle_cls,
-                                              bin=self._args.binarize,
-                                              inv=self._args.invert,
-                                              alpha_color=self._args.alphacolor,
-                                              dt_boxes=r_dt_boxes)
-            if ocr_result:
-                ocr_r = ocr_result[0]
-                if ocr_r:  # Sometimes ocr_r might be None.
-                    r['res'] = [
-                        dict(
-                            text_region=x[0],
-                            text=x[1][0],
-                            confidence=x[1][1],
-                        )
-                        for x in ocr_r
-                    ]
-
-        # Sort the text boxes in order from top to bottom and from left to right.
-        from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes
-        h, w, _ = img.shape
-        sorted_result = sorted_layout_boxes(result, w)
-
-        return sorted_result
-
-    def _predict_structure(self, img, return_ocr_result_in_table=False, img_idx=0, alpha_color=(255, 255, 255)):
+    def __call__(self, img, return_ocr_result_in_table=False, img_idx=0, alpha_color=(255, 255, 255)):
         img = check_img(img, alpha_color)
         res, _ = super().__call__(
             img, return_ocr_result_in_table, img_idx=img_idx)
         return res
 
-    def _filter_boxes(self, dt_boxes, bbox):
-        # TODO(RussellLuo): Performance needs improvement?
-        boxes = []
-
-        for idx in range(len(dt_boxes)):
-            box = dt_boxes[idx]
-            rect = box[0][0], box[0][1], box[2][0], box[2][1]
-            if self._has_intersection(bbox, rect):
-                boxes.append(box.tolist())
-
-        return np.array(boxes, np.float32).reshape((len(boxes), 4, 2))
-
-    def _has_intersection(self, rect1, rect2):
-        x_min1, y_min1, x_max1, y_max1 = rect1
-        x_min2, y_min2, x_max2, y_max2 = rect2
-        if x_min1 > x_max2 or x_max1 < x_min2:
-            return False
-        if y_min1 > y_max2 or y_max1 < y_min2:
-            return False
-        return True
-
 
 def main():
     # for cmd
@@ -920,7 +832,7 @@ def main():
            outfile = args.output + '/' + img_name + '.txt'
            with open(outfile,'w',encoding='utf-8') as f:
                f.writelines(lines)
-
+
        elif args.type == 'structure':
            img, flag_gif, flag_pdf = check_and_read(img_path)
            if not flag_gif and not flag_pdf:

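Net effect on the public API: dt_boxes is gone from ocr(), and PPStructure.__call__ absorbs the alpha_color parameter from the removed _predict_structure helper. A minimal usage sketch of the API after this commit; the file name, constructor options, and prints are illustrative assumptions, not part of the diff:

# Minimal usage sketch (illustrative inputs, not from the diff).
from paddleocr import PaddleOCR, PPStructure

ocr = PaddleOCR(use_angle_cls=True, lang='en')
# ocr() no longer accepts dt_boxes; detection always runs internally.
result = ocr.ocr('page.png', det=True, rec=True, cls=True)
if result and result[0]:
    for box, (text, confidence) in result[0]:
        print(text, confidence)

# PPStructure.__call__ now takes alpha_color directly,
# replacing the removed _predict_structure helper.
structure_engine = PPStructure(ocr=True)
for region in structure_engine('page.png', alpha_color=(255, 255, 255)):
    print(region['type'], region['bbox'])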
ppstructure/predict_system.py (+92 -42)

@@ -58,7 +58,6 @@ def __init__(self, args):
                logger.warning(
                    "When args.layout is false, args.ocr is automatically set to false"
                )
-            args.drop_score = 0
            # init model
            self.layout_predictor = None
            self.text_system = None
@@ -93,6 +92,7 @@ def __call__(self, img, return_ocr_result_in_table=False, img_idx=0):
            'all': 0
        }
        start = time.time()
+
        if self.image_orientation_predictor is not None:
            tic = time.time()
            cls_result = self.image_orientation_predictor.predict(
@@ -108,6 +108,7 @@ def __call__(self, img, return_ocr_result_in_table=False, img_idx=0):
                img = cv2.rotate(img, cv_rotate_code[angle])
            toc = time.time()
            time_dict['image_orientation'] = toc - tic
+
        if self.mode == 'structure':
            ori_im = img.copy()
            if self.layout_predictor is not None:
@@ -116,6 +117,20 @@ def __call__(self, img, return_ocr_result_in_table=False, img_idx=0):
            else:
                h, w = ori_im.shape[:2]
                layout_res = [dict(bbox=None, label='table')]
+
+            # As reported in issues such as #10270 and #11665, the old
+            # implementation, which recognizes texts from the layout regions,
+            # has problems with OCR recognition accuracy.
+            #
+            # To enhance the OCR recognition accuracy, we implement a patch fix
+            # that first detects all text regions by using the text_detector
+            # and then recognizes the texts from the text regions (intersecting
+            # with the layout regions) by using the text_recognizer.
+            dt_boxes = []
+            if self.text_system is not None:
+                dt_boxes, elapse = self.text_system.text_detector(img)
+                time_dict['det'] = elapse
+
            res_list = []
            for region in layout_res:
                res = ''
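The hunk above is the core of the change: text detection now runs once over the whole page before the region loop, and each non-table region later keeps only the detected boxes that intersect it. A self-contained sketch of that control flow; run_layout and run_text_detector are stand-in stubs rather than PaddleOCR APIs, and only the detect-once-then-filter orchestration mirrors the commit:

import numpy as np

# Stubs standing in for the real layout predictor and text detector.
def run_layout(img):
    return [{'label': 'text', 'bbox': [0, 0, 100, 40]},
            {'label': 'table', 'bbox': [0, 50, 100, 90]}]

def run_text_detector(img):
    # Quadrilaterals in the (N, 4, 2) layout the text detector emits.
    return np.array([[[5, 5], [60, 5], [60, 20], [5, 20]],
                     [[5, 60], [60, 60], [60, 75], [5, 75]]], np.float32)

def has_intersection(bbox, rect):
    x_min1, y_min1, x_max1, y_max1 = bbox
    x_min2, y_min2, x_max2, y_max2 = rect
    return not (x_min1 > x_max2 or x_max1 < x_min2 or
                y_min1 > y_max2 or y_max1 < y_min2)

img = np.zeros((100, 100, 3), np.uint8)
dt_boxes = run_text_detector(img)   # detection runs once, on the full page
for region in run_layout(img):
    if region['label'] == 'table':
        continue                    # tables keep their own table pipeline
    hits = [b for b in dt_boxes
            if has_intersection(region['bbox'],
                                (b[0][0], b[0][1], b[2][0], b[2][1]))]
    print(region['label'], len(hits))   # -> text 1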
@@ -126,6 +141,8 @@ def __call__(self, img, return_ocr_result_in_table=False, img_idx=0)
                else:
                    x1, y1, x2, y2 = 0, 0, w, h
                    roi_img = ori_im
+                bbox = [x1, y1, x2, y2]
+
                if region['label'] == 'table':
                    if self.table_system is not None:
                        res, table_time_dict = self.table_system(
@@ -136,66 +153,99 @@ def __call__(self, img, return_ocr_result_in_table=False, img_idx=0):
                        time_dict['rec'] += table_time_dict['rec']
                else:
                    if self.text_system is not None:
-                        if self.recovery:
-                            wht_im = np.ones(ori_im.shape, dtype=ori_im.dtype)
-                            wht_im[y1:y2, x1:x2, :] = roi_img
-                            filter_boxes, filter_rec_res, ocr_time_dict = self.text_system(
-                                wht_im)
-                        else:
-                            filter_boxes, filter_rec_res, ocr_time_dict = self.text_system(
-                                roi_img)
+                        res, ocr_time_dict = self._predict_text(ori_im, roi_img, bbox, dt_boxes)
                        time_dict['det'] += ocr_time_dict['det']
                        time_dict['rec'] += ocr_time_dict['rec']
 
-                        # remove style char,
-                        # when using the recognition model trained on the PubtabNet dataset,
-                        # it will recognize the text format in the table, such as <b>
-                        style_token = [
-                            '<strike>', '<strike>', '<sup>', '</sub>', '<b>',
-                            '</b>', '<sub>', '</sup>', '<overline>',
-                            '</overline>', '<underline>', '</underline>', '<i>',
-                            '</i>'
-                        ]
-                        res = []
-                        for box, rec_res in zip(filter_boxes, filter_rec_res):
-                            rec_str, rec_conf = rec_res[0], rec_res[1]
-                            for token in style_token:
-                                if token in rec_str:
-                                    rec_str = rec_str.replace(token, '')
-                            if not self.recovery:
-                                box += [x1, y1]
-                            if self.return_word_box:
-                                word_box_content_list, word_box_list = cal_ocr_word_box(rec_str, box, rec_res[2])
-                                res.append({
-                                    'text': rec_str,
-                                    'confidence': float(rec_conf),
-                                    'text_region': box.tolist(),
-                                    'text_word': word_box_content_list,
-                                    'text_word_region': word_box_list
-                                })
-                            else:
-                                res.append({
-                                    'text': rec_str,
-                                    'confidence': float(rec_conf),
-                                    'text_region': box.tolist()
-                                })
                res_list.append({
                    'type': region['label'].lower(),
-                    'bbox': [x1, y1, x2, y2],
+                    'bbox': bbox,
                    'img': roi_img,
                    'res': res,
                    'img_idx': img_idx
                })
+
            end = time.time()
            time_dict['all'] = end - start
            return res_list, time_dict
+
        elif self.mode == 'kie':
            re_res, elapse = self.kie_predictor(img)
            time_dict['kie'] = elapse
            time_dict['all'] = elapse
            return re_res[0], time_dict
+
        return None, None
 
+    def _predict_text(self, ori_im, roi_img, bbox, dt_boxes):
+        x1, y1, x2, y2 = bbox
+
+        if self.recovery:
+            wht_im = np.ones(ori_im.shape, dtype=ori_im.dtype)
+            wht_im[y1:y2, x1:x2, :] = roi_img
+            filter_boxes, filter_rec_res, ocr_time_dict = self.text_system(
+                wht_im)
+        else:
+            # Filter the text regions that intersect with the current bbox.
+            intersecting_dt_boxes = self._filter_boxes(dt_boxes, bbox)
+            # Recognize texts from these intersecting text regions.
+            filter_boxes, filter_rec_res, ocr_time_dict = self.text_system(
+                ori_im, dt_boxes=intersecting_dt_boxes)
+
+        # remove style char,
+        # when using the recognition model trained on the PubtabNet dataset,
+        # it will recognize the text format in the table, such as <b>
+        style_token = [
+            '<strike>', '<strike>', '<sup>', '</sub>', '<b>',
+            '</b>', '<sub>', '</sup>', '<overline>',
+            '</overline>', '<underline>', '</underline>', '<i>',
+            '</i>'
+        ]
+        res = []
+        for box, rec_res in zip(filter_boxes, filter_rec_res):
+            rec_str, rec_conf = rec_res[0], rec_res[1]
+            for token in style_token:
+                if token in rec_str:
+                    rec_str = rec_str.replace(token, '')
+            # if not self.recovery:
+            #     box += [x1, y1]
+            if self.return_word_box:
+                word_box_content_list, word_box_list = cal_ocr_word_box(rec_str, box, rec_res[2])
+                res.append({
+                    'text': rec_str,
+                    'confidence': float(rec_conf),
+                    'text_region': box.tolist(),
+                    'text_word': word_box_content_list,
+                    'text_word_region': word_box_list
+                })
+            else:
+                res.append({
+                    'text': rec_str,
+                    'confidence': float(rec_conf),
+                    'text_region': box.tolist()
+                })
+        return res, ocr_time_dict
+
+    def _filter_boxes(self, dt_boxes, bbox):
+        boxes = []
+
+        for idx in range(len(dt_boxes)):
+            box = dt_boxes[idx]
+            rect = box[0][0], box[0][1], box[2][0], box[2][1]
+            if self._has_intersection(bbox, rect):
+                boxes.append(box.tolist())
+
+        return np.array(boxes, np.float32).reshape((len(boxes), 4, 2))
+
+    def _has_intersection(self, rect1, rect2):
+        x_min1, y_min1, x_max1, y_max1 = rect1
+        x_min2, y_min2, x_max2, y_max2 = rect2
+        if x_min1 > x_max2 or x_max1 < x_min2:
+            return False
+        if y_min1 > y_max2 or y_max1 < y_min2:
+            return False
+        return True
+
 
 def save_structure_res(res, save_folder, img_name, img_idx=0):
    excel_save_folder = os.path.join(save_folder, img_name)

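One detail worth noting in the new _filter_boxes: the final reshape((len(boxes), 4, 2)) keeps the result well-formed even when no detected box intersects the region, so callers can treat the empty case uniformly. A quick standalone check with plain numpy (nothing PaddleOCR-specific):

import numpy as np

# With no intersecting boxes, the reshape still yields a (0, 4, 2) array,
# so callers can iterate or index it without special-casing emptiness.
boxes = []
empty = np.array(boxes, np.float32).reshape((len(boxes), 4, 2))
print(empty.shape)   # (0, 4, 2)
for box in empty:    # loop body simply never runs
    pass

one = [[[0, 0], [10, 0], [10, 5], [0, 5]]]
print(np.array(one, np.float32).reshape((len(one), 4, 2)).shape)   # (1, 4, 2)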