Skip to content

Commit 07461fb

Browse files
committed
Get rid of synchronous mode and on success result mess. OCR is computationally expensive, hence doesn't make sense to support synchronous mode.
1 parent d1b4e7c commit 07461fb

File tree

6 files changed

+124
-64
lines changed

6 files changed

+124
-64
lines changed

html/index.html

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,39 @@ <h1 class="text-center mb-4">Extract text from documents</h1>
1818
<label for="fileInput" class="form-label">Upload an image, a scanned document or a PDF.</label>
1919
<input class="form-control" type="file" id="fileInput" accept="image/*,.pdf" required>
2020
</div>
21+
<div class="accordion mb-3" id="processingOptionsAccordion">
22+
<div class="accordion-item border-0">
23+
<h2 class="accordion-header" id="headingOptions">
24+
<button class="accordion-button collapsed py-2 small" type="button" data-bs-toggle="collapse" data-bs-target="#collapseOptions" aria-expanded="false" aria-controls="collapseOptions">
25+
Advanced Options <span class="text-muted ms-2 small">(optional)</span>
26+
</button>
27+
</h2>
28+
<div id="collapseOptions" class="accordion-collapse collapse" aria-labelledby="headingOptions" data-bs-parent="#processingOptionsAccordion">
29+
<div class="accordion-body py-2">
30+
<div class="form-check small mb-1">
31+
<input class="form-check-input" type="checkbox" value="true" id="removeNoise" name="remove_noise">
32+
<label class="form-check-label" for="removeNoise">Remove noise and specks (Uses Bilateral Filter)</label>
33+
</div>
34+
<div class="form-check small mb-1">
35+
<input class="form-check-input" type="checkbox" value="true" id="binarization" name="binarize">
36+
<label class="form-check-label" for="binarization">Perform binarization (Uses Adaptive Thresholding)</label>
37+
</div>
38+
<div class="form-check small">
39+
<input class="form-check-input" type="checkbox" value="true" id="autoRotate" name="auto_rotate">
40+
<label class="form-check-label" for="autoRotate">Auto rotate if needed (Contour Detection and Alignment)</label>
41+
</div>
42+
</div>
43+
</div>
44+
</div>
45+
</div>
2146
<div class="d-flex justify-content-between">
22-
<button type="submit" class="btn btn-outline-primary w-50 me-2 d-flex align-items-center justify-content-center gap-2" data-mode="basic">
23-
Basic OCR
24-
<i class="bi bi-info-circle" data-bs-toggle="tooltip" title="Works for high quality documents."></i>
47+
<button type="submit" class="btn btn-outline-success w-50 me-2 d-flex align-items-center justify-content-center gap-2" data-mode="basic">
48+
Basic OCR (Faster and Recommended)
49+
<i class="bi bi-info-circle" data-bs-toggle="tooltip" title="For scanned documents, PDF and high quality images."></i>
2550
</button>
26-
<button type="submit" class="btn btn-outline-success w-50 d-flex align-items-center justify-content-center gap-2" data-mode="advanced">
27-
Advanced OCR
28-
<i class="bi bi-info-circle" data-bs-toggle="tooltip" title="Works better for Handwritten text and low quality documents."></i>
51+
<button type="submit" class="btn btn-outline-primary w-50 d-flex align-items-center justify-content-center gap-2" data-mode="advanced">
52+
Advanced OCR (Slower but more accurate)
53+
<i class="bi bi-info-circle" data-bs-toggle="tooltip" title="For Handwritten text and low quality documents."></i>
2954
</button>
3055
</div>
3156
</form>
@@ -65,7 +90,7 @@ <h4>Detected Text:</h4>
6590
// Reset, hide previous alert
6691
errorAlert.classList.add('d-none');
6792
errorAlert.textContent = '';
68-
resultText.textContent = 'Processing.....';
93+
resultText.textContent = 'OCR and Recognition is computationally expensive, and takes time. Please wait...';
6994

7095
const file = fileInput.files[0];
7196
if (!file) {
@@ -77,7 +102,7 @@ <h4>Detected Text:</h4>
77102
formData.append('attachment', file);
78103
formData.append('synchronous', false);
79104

80-
resultText.textContent = 'Processing...';
105+
resultText.textContent = 'OCR and Recognition is computationally expensive, and takes time. Please wait...';
81106
const BASE_URL = 'http://localhost:8000';
82107
let pollUrl;
83108
let response;

main.py

Lines changed: 35 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,16 @@
44
from typing import List
55

66
from fastapi import FastAPI
7-
from fastapi import UploadFile, Form
7+
from fastapi import UploadFile
88
from fastapi.exceptions import HTTPException
99
from fastapi.middleware.cors import CORSMiddleware
1010

11-
from services import identify_file_type, merge_pdfs, save_file, extract_pdf_text_searchable, get_file_size, extract_image_text, extract_pdf_text_all
11+
from services import identify_file_type, merge_pdfs, save_file, extract_pdf_text_searchable, get_file_size, extract_image_text
12+
from service_wrappers import extract_image_text_and_set_db, extract_pdf_text_and_set_db
13+
from textract_wrapper import detect_text_and_set_db
1214
from text_analysis import analyze
1315
from tasks import enqueue_extraction
14-
from textract import detect_text
15-
from db import get_value
16+
from db import set_object, get_object
1617

1718

1819
app = FastAPI()
@@ -116,7 +117,7 @@ def extract_img_text(attachment: UploadFile):
116117

117118

118119
@app.post("/ocr")
119-
def ocr(attachment: UploadFile, synchronous: bool = Form(True)):
120+
def ocr(attachment: UploadFile):
120121
"""
121122
TODO: Support multiple attachments
122123
It could pass a PDF or an image.
@@ -129,7 +130,8 @@ def ocr(attachment: UploadFile, synchronous: bool = Form(True)):
129130
Searchable: We can use pdfminer.six as being used.
130131
Non-Searchable: Convert the PDF to an image and then extract the text
131132
132-
In all of the above cases, the text should be extracted and returned from here.
133+
In all of the above cases, the processing would happen asynchronously.
134+
The task would be queued and a link would be returned to the user.
133135
"""
134136
type_details = identify_file_type(attachment.file)
135137
if not type_details.mime_type.startswith('image') and not type_details.mime_type.startswith('application/pdf'):
@@ -141,52 +143,45 @@ def ocr(attachment: UploadFile, synchronous: bool = Form(True)):
141143
path_hash = hashlib.sha256(output_filename.encode('utf-8')).hexdigest()
142144
# Check the content-type, if image, then extract text using Tesseract.
143145
if type_details.mime_type.startswith('image'):
144-
extraction_function = extract_image_text
146+
extraction_function = extract_image_text_and_set_db
147+
# processed_file_path = preprocess_image_opencv(output_filename)
148+
set_object(key=path_hash, field="type", value="image")
149+
enqueue_extraction(extraction_function=extract_image_text_and_set_db, file_path=output_filename, key=path_hash, field="content")
145150
elif type_details.mime_type.startswith('application/pdf'):
146151
# Attempt extracting text using pdfminer.six or else through the image conversion -> OCR pipeline.
147-
extraction_function = extract_pdf_text_all
148-
if synchronous is True:
149-
is_success, content = extraction_function(file_path=output_filename)
150-
if is_success is True:
151-
# Add one more step.
152-
# Perform text analysis on the extracted text.
153-
# If the extracted text is gibberish, then probably it was a low quality/skewed/noisy input.
154-
# Hence perform text detection using Amazon Textract for better accuracy.
155-
return {"content": content}
156-
else:
157-
raise HTTPException(400, detail=content)
158-
else:
159-
# Add it to a queue.
160-
enqueue_extraction(extraction_function=extraction_function, file_path=output_filename)
161-
BASE_URL = os.environ.get("BASE_URL", "http://localhost:8000")
162-
link = f"{BASE_URL}/ocr-result/{path_hash}"
163-
return {"link": link}
152+
extraction_function = extract_pdf_text_and_set_db
153+
set_object(key=path_hash, field="type", value="pdf")
154+
enqueue_extraction(extraction_function=extraction_function, file_path=output_filename, key=path_hash, field="content")
155+
# Add it to a queue.
156+
BASE_URL = os.environ.get("BASE_URL", "http://localhost:8000")
157+
link = f"{BASE_URL}/ocr-result/{path_hash}"
158+
return {"link": link}
164159

165160

166161
@app.get("/ocr-result/{key}")
167162
def ocr_result(key: str):
168-
value = get_value(key)
169-
return {"content": value}
163+
content = get_object(key, "content")
164+
if content is None:
165+
return {"content": content}
166+
# Remove empty lines
167+
lines = content.splitlines()
168+
non_blank_lines = [line for line in lines if line.strip() != '']
169+
content = '\n'.join(non_blank_lines)
170+
return {"content": content}
170171

171172

172173
@app.post("/textract-ocr")
173-
def textract_ocr(attachment: UploadFile, synchronous: bool = Form(True)):
174+
def textract_ocr(attachment: UploadFile):
174175
type_details = identify_file_type(attachment.file)
175176
if not type_details.mime_type.startswith('image'):
176177
raise HTTPException(status_code=400, detail="Provide an image")
177178
output_filename = f"/media/textract-ocr-files/{attachment.filename}"
178179
save_file(attachment.file, output_filename)
179180
attachment.file.seek(0)
180-
if synchronous is True:
181-
is_success, content = detect_text(output_filename)
182-
if is_success is True:
183-
return {"content": content}
184-
else:
185-
raise HTTPException(400, detail=content)
186-
else:
187-
# Add it to a queue.
188-
enqueue_extraction(extraction_function=detect_text, file_path=output_filename)
189-
path_hash = hashlib.sha256(output_filename.encode('utf-8')).hexdigest()
190-
BASE_URL = os.environ.get("BASE_URL", "http://localhost:8000")
191-
link = f"{BASE_URL}/ocr-result/{path_hash}"
192-
return {"link": link}
181+
path_hash = hashlib.sha256(output_filename.encode('utf-8')).hexdigest()
182+
set_object(key=path_hash, field="type", value="pdf")
183+
# Add it to a queue.
184+
enqueue_extraction(extraction_function=detect_text_and_set_db, file_path=output_filename, key=path_hash, field="content")
185+
BASE_URL = os.environ.get("BASE_URL", "http://localhost:8000")
186+
link = f"{BASE_URL}/ocr-result/{path_hash}"
187+
return {"link": link}

service_wrappers.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from services import extract_image_text, extract_pdf_text_all
2+
from image_preprocessing import preprocess_image_opencv
3+
4+
from db import set_object
5+
6+
7+
def extract_image_text_and_set_db(file_path: str, key: str, field: str, options=None):
8+
if options is None:
9+
options = {
10+
"gray": True,
11+
"denoise": True,
12+
"binarize": True
13+
}
14+
processed_image_path = preprocess_image_opencv(file_path, options)
15+
is_success, content = extract_image_text(processed_image_path)
16+
if is_success is True:
17+
set_object(key, field, content)
18+
return True, content
19+
else:
20+
return False, content
21+
22+
23+
def extract_pdf_text_and_set_db(file_path: str, key: str, field: str):
24+
is_success, content = extract_pdf_text_all(file_path)
25+
if is_success is True:
26+
set_object(key, field, content)
27+
return True, content
28+
else:
29+
return False, content

services.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
"""
2+
This module is supposed to only import Python built-ins or third-party libraries.
3+
4+
It can import an application module only if that module also adheres to the above policy.
5+
"""
6+
17
import os
28
import glob
39
import logging
@@ -24,9 +30,6 @@
2430

2531
from fastapi import UploadFile
2632

27-
# Perform image preprocessing to improve image quality, crispness and readability
28-
from image_preprocessing import preprocess_image
29-
3033
from text_analysis import is_meaningful_content
3134

3235

tasks.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,19 @@
22
rq integration
33
44
To use Redis Queue to handle the extraction tasks.
5+
6+
A guiding principle for tasks is that they should be self-contained.
7+
Hence, the task itself should do any processing whether computational or I/O bound.
8+
And once the task has completed, it is responsible for writing its result to the database.
9+
result_callbacks are messy, and we want to avoid them.
510
"""
6-
import hashlib
711
from rq import Queue
812

9-
from db import get_connection, set_value
10-
11-
12-
def report_success(job, connection, result, *args, **kwargs):
13-
is_success, extracted_text = result
14-
file_path = job.args[0]
15-
path_hash = hashlib.sha256(file_path.encode('utf-8')).hexdigest()
16-
# Write this result to the data store.
17-
set_value(path_hash, extracted_text)
13+
# Only required to get the Redis connection which orchestrates the queue.
14+
from db import get_connection
1815

1916

20-
def enqueue_extraction(extraction_function, file_path):
17+
def enqueue_extraction(extraction_function, **kwargs):
2118
connection = get_connection()
2219
q = Queue(connection=connection)
23-
# Extraction should be performed on the raw image as well as processed image.
24-
q.enqueue(extraction_function, file_path, on_success=report_success)
20+
q.enqueue(extraction_function, **kwargs)

textract_wrapper.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from textract import detect_text
2+
3+
from db import set_object
4+
5+
6+
def detect_text_and_set_db(file_path: str, key: str, field: str, options=None):
7+
is_success, content = detect_text(file_path)
8+
if is_success is True:
9+
set_object(key, field, content)
10+
return True, content
11+
else:
12+
return False, content

0 commit comments

Comments
 (0)