Skip to content

Commit 706c864

Browse files
kba, bertsky
authored and committed
OCR-D processor: properly handle missing or non-downloaded GT/OCR file
Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
1 parent de4db88 commit 706c864

File tree

1 file changed

+13
-8
lines changed

1 file changed

+13
-8
lines changed

src/dinglehopper/ocrd_cli.py

Lines changed: 13 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -27,14 +27,19 @@ def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
2727
metrics = self.parameter["metrics"]
2828
textequiv_level = self.parameter["textequiv_level"]
2929

30-
try:
31-
gt_file, ocr_file = input_files
32-
assert gt_file, 'missing GT file'
33-
assert ocr_file, 'missing OCR file'
34-
assert gt_file.local_filename
35-
assert ocr_file.local_filename
36-
except (ValueError, AssertionError) as err:
37-
self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page?
30+
# wrong number of inputs: let fail
31+
gt_file, ocr_file = input_files
32+
# missing on either side: skip (zip_input_files already warned)
33+
if not gt_file or not ocr_file:
34+
return
35+
# missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
36+
if not gt_file.local_filename:
37+
if config.OCRD_MISSING_INPUT == 'ABORT':
38+
raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype)
39+
return
40+
if not ocr_file.local_filename:
41+
if config.OCRD_MISSING_INPUT == 'ABORT':
42+
raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype)
3843
return
3944

4045
page_id = gt_file.pageId

0 commit comments

Comments
 (0)