@@ -143,15 +143,13 @@ def ocr(attachment: UploadFile):
143
143
path_hash = hashlib .sha256 (output_filename .encode ('utf-8' )).hexdigest ()
144
144
# Check the content-type, if image, then extract text using Tesseract.
145
145
if type_details .mime_type .startswith ('image' ):
146
- extraction_function = extract_image_text_and_set_db
147
- # processed_file_path = preprocess_image_opencv(output_filename)
146
+ # Attempt extraction through Tesseract
148
147
set_object (key = path_hash , field = "type" , value = "image" )
149
- enqueue_extraction (extraction_function = extract_image_text_and_set_db , file_path = output_filename , key = path_hash , field = "content" )
148
+ enqueue_extraction (extraction_function = extract_image_text_and_set_db , file_path = output_filename , key = path_hash )
150
149
elif type_details .mime_type .startswith ('application/pdf' ):
151
150
# Attempt extracting text using pdfminer.six or else through the image conversion -> OCR pipeline.
152
- extraction_function = extract_pdf_text_and_set_db
153
151
set_object (key = path_hash , field = "type" , value = "pdf" )
154
- enqueue_extraction (extraction_function = extraction_function , file_path = output_filename , key = path_hash , field = "content" )
152
+ enqueue_extraction (extraction_function = extract_pdf_text_and_set_db , file_path = output_filename , key = path_hash )
155
153
# Add it to a queue.
156
154
BASE_URL = os .environ .get ("BASE_URL" , "http://localhost:8000" )
157
155
link = f"{ BASE_URL } /ocr-result/{ path_hash } "
0 commit comments