4
4
from typing import List
5
5
6
6
from fastapi import FastAPI
7
- from fastapi import UploadFile , Form
7
+ from fastapi import UploadFile
8
8
from fastapi .exceptions import HTTPException
9
9
from fastapi .middleware .cors import CORSMiddleware
10
10
11
- from services import identify_file_type , merge_pdfs , save_file , extract_pdf_text_searchable , get_file_size , extract_image_text , extract_pdf_text_all
11
+ from services import identify_file_type , merge_pdfs , save_file , extract_pdf_text_searchable , get_file_size , extract_image_text
12
+ from service_wrappers import extract_image_text_and_set_db , extract_pdf_text_and_set_db
13
+ from textract_wrapper import detect_text_and_set_db
12
14
from text_analysis import analyze
13
15
from tasks import enqueue_extraction
14
- from textract import detect_text
15
- from db import get_value
16
+ from db import set_object , get_object
16
17
17
18
18
19
app = FastAPI ()
@@ -116,7 +117,7 @@ def extract_img_text(attachment: UploadFile):
116
117
117
118
118
119
@app .post ("/ocr" )
119
- def ocr (attachment : UploadFile , synchronous : bool = Form ( True ) ):
120
+ def ocr (attachment : UploadFile ):
120
121
"""
121
122
TODO: Support multiple attachments
122
123
It could pass a PDF or an image.
@@ -129,7 +130,8 @@ def ocr(attachment: UploadFile, synchronous: bool = Form(True)):
129
130
Searchable: We can use pdfminer.six as being used.
130
131
Non-Searchable: Convert the PDF to an image and then extract the text
131
132
132
- In all of the above cases, the text should be extracted and returned from here.
133
+ In all of the above cases, the processing happens asynchronously.
134
+ The task is queued and a link to the result is returned to the user.
133
135
"""
134
136
type_details = identify_file_type (attachment .file )
135
137
if not type_details .mime_type .startswith ('image' ) and not type_details .mime_type .startswith ('application/pdf' ):
@@ -141,52 +143,45 @@ def ocr(attachment: UploadFile, synchronous: bool = Form(True)):
141
143
path_hash = hashlib .sha256 (output_filename .encode ('utf-8' )).hexdigest ()
142
144
# Check the content-type, if image, then extract text using Tesseract.
143
145
if type_details .mime_type .startswith ('image' ):
144
- extraction_function = extract_image_text
146
+ extraction_function = extract_image_text_and_set_db
147
+ # processed_file_path = preprocess_image_opencv(output_filename)
148
+ set_object (key = path_hash , field = "type" , value = "image" )
149
+ enqueue_extraction (extraction_function = extract_image_text_and_set_db , file_path = output_filename , key = path_hash , field = "content" )
145
150
elif type_details .mime_type .startswith ('application/pdf' ):
146
151
# Attempt extracting text using pdfminer.six or else through the image conversion -> OCR pipeline.
147
- extraction_function = extract_pdf_text_all
148
- if synchronous is True :
149
- is_success , content = extraction_function (file_path = output_filename )
150
- if is_success is True :
151
- # Add one more step.
152
- # Perform text analysis on the extracted text.
153
- # If the extracted text is gibberish, then probably it was a low quality/skewed/noisy input.
154
- # Hence perform text detection using Amazon Textract for better accuracy.
155
- return {"content" : content }
156
- else :
157
- raise HTTPException (400 , detail = content )
158
- else :
159
- # Add it to a queue.
160
- enqueue_extraction (extraction_function = extraction_function , file_path = output_filename )
161
- BASE_URL = os .environ .get ("BASE_URL" , "http://localhost:8000" )
162
- link = f"{ BASE_URL } /ocr-result/{ path_hash } "
163
- return {"link" : link }
152
+ extraction_function = extract_pdf_text_and_set_db
153
+ set_object (key = path_hash , field = "type" , value = "pdf" )
154
+ enqueue_extraction (extraction_function = extraction_function , file_path = output_filename , key = path_hash , field = "content" )
155
+ # Add it to a queue.
156
+ BASE_URL = os .environ .get ("BASE_URL" , "http://localhost:8000" )
157
+ link = f"{ BASE_URL } /ocr-result/{ path_hash } "
158
+ return {"link" : link }
164
159
165
160
166
161
@app .get ("/ocr-result/{key}" )
167
162
def ocr_result (key : str ):
168
- value = get_value (key )
169
- return {"content" : value }
163
+ content = get_object (key , "content" )
164
+ if content is None :
165
+ return {"content" : content }
166
+ # Remove empty lines
167
+ lines = content .splitlines ()
168
+ non_blank_lines = [line for line in lines if line .strip () != '' ]
169
+ content = '\n ' .join (non_blank_lines )
170
+ return {"content" : content }
170
171
171
172
172
173
@app .post ("/textract-ocr" )
173
- def textract_ocr (attachment : UploadFile , synchronous : bool = Form ( True ) ):
174
+ def textract_ocr (attachment : UploadFile ):
174
175
type_details = identify_file_type (attachment .file )
175
176
if not type_details .mime_type .startswith ('image' ):
176
177
raise HTTPException (status_code = 400 , detail = "Provide an image" )
177
178
output_filename = f"/media/textract-ocr-files/{ attachment .filename } "
178
179
save_file (attachment .file , output_filename )
179
180
attachment .file .seek (0 )
180
- if synchronous is True :
181
- is_success , content = detect_text (output_filename )
182
- if is_success is True :
183
- return {"content" : content }
184
- else :
185
- raise HTTPException (400 , detail = content )
186
- else :
187
- # Add it to a queue.
188
- enqueue_extraction (extraction_function = detect_text , file_path = output_filename )
189
- path_hash = hashlib .sha256 (output_filename .encode ('utf-8' )).hexdigest ()
190
- BASE_URL = os .environ .get ("BASE_URL" , "http://localhost:8000" )
191
- link = f"{ BASE_URL } /ocr-result/{ path_hash } "
192
- return {"link" : link }
181
+ path_hash = hashlib .sha256 (output_filename .encode ('utf-8' )).hexdigest ()
182
+ set_object (key = path_hash , field = "type" , value = "pdf" )
183
+ # Add it to a queue.
184
+ enqueue_extraction (extraction_function = detect_text_and_set_db , file_path = output_filename , key = path_hash , field = "content" )
185
+ BASE_URL = os .environ .get ("BASE_URL" , "http://localhost:8000" )
186
+ link = f"{ BASE_URL } /ocr-result/{ path_hash } "
187
+ return {"link" : link }
0 commit comments