Remove synchronous mode and the messy on_success result callback. OCR is computationally expensive, so it doesn't make sense to support a synchronous mode.

raaj-akshar · raaj-akshar · commit 07461fb063d0 · 2025-06-02T17:45:35.000+05:30
diff --git a/html/index.html b/html/index.html
@@ -18,14 +18,39 @@ <h1 class="text-center mb-4">Extract text from documents</h1>
             <label for="fileInput" class="form-label">Upload an image, a scanned document or a PDF.</label>
             <input class="form-control" type="file" id="fileInput" accept="image/*,.pdf" required>
           </div>
+<div class="accordion mb-3" id="processingOptionsAccordion">
+  <div class="accordion-item border-0">
+    <h2 class="accordion-header" id="headingOptions">
+      <button class="accordion-button collapsed py-2 small" type="button" data-bs-toggle="collapse" data-bs-target="#collapseOptions" aria-expanded="false" aria-controls="collapseOptions">
+        Advanced Options <span class="text-muted ms-2 small">(optional)</span>
+      </button>
+    </h2>
+    <div id="collapseOptions" class="accordion-collapse collapse" aria-labelledby="headingOptions" data-bs-parent="#processingOptionsAccordion">
+      <div class="accordion-body py-2">
+        <div class="form-check small mb-1">
+          <input class="form-check-input" type="checkbox" value="true" id="removeNoise" name="remove_noise">
+          <label class="form-check-label" for="removeNoise">Remove noise and specks (Uses Bilateral Filter)</label>
+        </div>
+        <div class="form-check small mb-1">
+          <input class="form-check-input" type="checkbox" value="true" id="binarization" name="binarize">
+          <label class="form-check-label" for="binarization">Perform binarization (Uses Adaptive Thresholding)</label>
+        </div>
+        <div class="form-check small">
+          <input class="form-check-input" type="checkbox" value="true" id="autoRotate" name="auto_rotate">
+          <label class="form-check-label" for="autoRotate">Auto rotate if needed (Contour Detection and Alignment)</label>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
           <div class="d-flex justify-content-between">
-            <button type="submit" class="btn btn-outline-primary w-50 me-2 d-flex align-items-center justify-content-center gap-2" data-mode="basic">
-              Basic OCR
-              <i class="bi bi-info-circle" data-bs-toggle="tooltip" title="Works for high quality documents."></i>
+            <button type="submit" class="btn btn-outline-success w-50 me-2 d-flex align-items-center justify-content-center gap-2" data-mode="basic">
+              Basic OCR (Faster and Recommended)
+              <i class="bi bi-info-circle" data-bs-toggle="tooltip" title="For scanned documents, PDF and high quality images."></i>
             </button>
-            <button type="submit" class="btn btn-outline-success w-50 d-flex align-items-center justify-content-center gap-2" data-mode="advanced">
-              Advanced OCR
-              <i class="bi bi-info-circle" data-bs-toggle="tooltip" title="Works better for Handwritten text and low quality documents."></i>
+            <button type="submit" class="btn btn-outline-primary w-50 d-flex align-items-center justify-content-center gap-2" data-mode="advanced">
+              Advanced OCR (Slower but more accurate)
+              <i class="bi bi-info-circle" data-bs-toggle="tooltip" title="For Handwritten text and low quality documents."></i>
             </button>
           </div>
         </form>
@@ -65,7 +90,7 @@ <h4>Detected Text:</h4>
       // Reset, hide previous alert
       errorAlert.classList.add('d-none');
       errorAlert.textContent = '';
-      resultText.textContent = 'Processing.....';
+      resultText.textContent = 'OCR and Recognition is computationally expensive, and takes time. Please wait...';
 
       const file = fileInput.files[0];
       if (!file) {
@@ -77,7 +102,7 @@ <h4>Detected Text:</h4>
       formData.append('attachment', file);
       formData.append('synchronous', false);
 
-      resultText.textContent = 'Processing...';
+      resultText.textContent = 'OCR and Recognition is computationally expensive, and takes time. Please wait...';
       const BASE_URL = 'http://localhost:8000';
       let pollUrl;
       let response;
diff --git a/main.py b/main.py
@@ -4,15 +4,16 @@
 from typing import List
 
 from fastapi import FastAPI
-from fastapi import UploadFile, Form
+from fastapi import UploadFile
 from fastapi.exceptions import HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 
-from services import identify_file_type, merge_pdfs, save_file, extract_pdf_text_searchable, get_file_size, extract_image_text, extract_pdf_text_all
+from services import identify_file_type, merge_pdfs, save_file, extract_pdf_text_searchable, get_file_size, extract_image_text
+from service_wrappers import extract_image_text_and_set_db, extract_pdf_text_and_set_db
+from textract_wrapper import detect_text_and_set_db
 from text_analysis import analyze
 from tasks import enqueue_extraction
-from textract import detect_text
-from db import get_value
+from db import set_object, get_object
 
 
 app = FastAPI()
@@ -116,7 +117,7 @@ def extract_img_text(attachment: UploadFile):
 
 
 @app.post("/ocr")
-def ocr(attachment: UploadFile, synchronous: bool = Form(True)):
+def ocr(attachment: UploadFile):
     """
     TODO: Support multiple attachments
     It could pass a PDF or an image.
@@ -129,7 +130,8 @@ def ocr(attachment: UploadFile, synchronous: bool = Form(True)):
     Searchable: We can use pdfminer.six as being used.
     Non-Searchable: Covert the PDF to an image and then extract the text
 
-    In all of the above cases, the text should be extracted and returned from here.
+    In all of the above cases, the processing happens asynchronously.
+    The task is queued and a link to the result is returned to the user.
     """
     type_details = identify_file_type(attachment.file)
     if not type_details.mime_type.startswith('image') and not type_details.mime_type.startswith('application/pdf'):
@@ -141,52 +143,45 @@ def ocr(attachment: UploadFile, synchronous: bool = Form(True)):
     path_hash = hashlib.sha256(output_filename.encode('utf-8')).hexdigest()
     # Check the content-type, if image, then extract text using Tesseract.
     if type_details.mime_type.startswith('image'):
-        extraction_function = extract_image_text
+        extraction_function = extract_image_text_and_set_db
+        # processed_file_path = preprocess_image_opencv(output_filename)
+        set_object(key=path_hash, field="type", value="image")
+        enqueue_extraction(extraction_function=extract_image_text_and_set_db, file_path=output_filename, key=path_hash, field="content")
     elif type_details.mime_type.startswith('application/pdf'):
         # Attempt extracting text using pdfminer.six or else through the image conversion -> OCR pipeline.
-        extraction_function = extract_pdf_text_all
-    if synchronous is True:
-        is_success, content = extraction_function(file_path=output_filename)
-        if is_success is True:
-            # Add one more step.
-            # Perform text analysis on the extracted text.
-            # If the extracted text is gibberish, then probably it was a low quality/skewed/noisy input.
-            # Hence perform text detection using Amazon Textract for better accuracy.
-            return {"content": content}
-        else:
-            raise HTTPException(400, detail=content)
-    else:
-        # Add it to a queue.
-        enqueue_extraction(extraction_function=extraction_function, file_path=output_filename)
-        BASE_URL = os.environ.get("BASE_URL", "http://localhost:8000")
-        link = f"{BASE_URL}/ocr-result/{path_hash}"
-        return {"link": link}
+        extraction_function = extract_pdf_text_and_set_db
+        set_object(key=path_hash, field="type", value="pdf")
+        enqueue_extraction(extraction_function=extraction_function, file_path=output_filename, key=path_hash, field="content")
+    # The extraction task has been queued above; return the link where its result can be fetched.
+    BASE_URL = os.environ.get("BASE_URL", "http://localhost:8000")
+    link = f"{BASE_URL}/ocr-result/{path_hash}"
+    return {"link": link}
 
 
 @app.get("/ocr-result/{key}")
 def ocr_result(key: str):
-    value = get_value(key)
-    return {"content": value}
+    content = get_object(key, "content")
+    if content is None:
+        return {"content": content}
+    # Remove empty lines
+    lines = content.splitlines()
+    non_blank_lines = [line for line in lines if line.strip() != '']
+    content = '\n'.join(non_blank_lines)
+    return {"content": content}
 
 
 @app.post("/textract-ocr")
-def textract_ocr(attachment: UploadFile, synchronous: bool = Form(True)):
+def textract_ocr(attachment: UploadFile):
     type_details = identify_file_type(attachment.file)
     if not type_details.mime_type.startswith('image'):
         raise HTTPException(status_code=400, detail="Provide an image")
     output_filename = f"/media/textract-ocr-files/{attachment.filename}"
     save_file(attachment.file, output_filename)
     attachment.file.seek(0)
-    if synchronous is True:
-        is_success, content = detect_text(output_filename)
-        if is_success is True:
-            return {"content": content}
-        else:
-            raise HTTPException(400, detail=content)
-    else:
-        # Add it to a queue.
-        enqueue_extraction(extraction_function=detect_text, file_path=output_filename)
-        path_hash = hashlib.sha256(output_filename.encode('utf-8')).hexdigest()
-        BASE_URL = os.environ.get("BASE_URL", "http://localhost:8000")
-        link = f"{BASE_URL}/ocr-result/{path_hash}"
-        return {"link": link}
+    path_hash = hashlib.sha256(output_filename.encode('utf-8')).hexdigest()
+    set_object(key=path_hash, field="type", value="pdf")
+    # Add it to a queue.
+    enqueue_extraction(extraction_function=detect_text_and_set_db, file_path=output_filename, key=path_hash, field="content")
+    BASE_URL = os.environ.get("BASE_URL", "http://localhost:8000")
+    link = f"{BASE_URL}/ocr-result/{path_hash}"
+    return {"link": link}
diff --git a/service_wrappers.py b/service_wrappers.py
@@ -0,0 +1,29 @@
+from services import extract_image_text, extract_pdf_text_all
+from image_preprocessing import preprocess_image_opencv
+
+from db import set_object
+
+
+def extract_image_text_and_set_db(file_path: str, key: str, field: str, options=None):
+    if options is None:
+        options = {
+            "gray": True,
+            "denoise": True,
+            "binarize": True
+        }
+    processed_image_path = preprocess_image_opencv(file_path, options)
+    is_success, content = extract_image_text(processed_image_path)
+    if is_success is True:
+        set_object(key, field, content)
+        return True, content
+    else:
+        return False, content
+
+
+def extract_pdf_text_and_set_db(file_path: str, key: str, field: str):
+    is_success, content = extract_pdf_text_all(file_path)
+    if is_success is True:
+        set_object(key, field, content)
+        return True, content
+    else:
+        return False, content
diff --git a/services.py b/services.py
@@ -1,3 +1,9 @@
+"""
+This module is supposed to import only Python built-ins or third-party libraries.
+
+It may import an application module only if that module also adheres to the above policy.
+"""
+
 import os
 import glob
 import logging
@@ -24,9 +30,6 @@
 
 from fastapi import UploadFile
 
-# Perform image preprocessing to improve image quality, crispness and readability
-from image_preprocessing import preprocess_image
-
 from text_analysis import is_meaningful_content
 
 
diff --git a/tasks.py b/tasks.py
@@ -2,23 +2,19 @@
 rq integration
 
 To use Redis Queue to handle the extraction tasks.
+
+A guiding principle for tasks is that they should be self-contained.
+Hence, the task itself should do all of its processing, whether computational or I/O bound.
+Once the task has completed, it is responsible for writing its result to the database.
+Result callbacks (e.g. on_success) are messy, and we want to avoid them.
 """
-import hashlib
 from rq import Queue
 
-from db import get_connection, set_value
-
-
-def report_success(job, connection, result, *args, **kwargs):
-    is_success, extracted_text = result
-    file_path = job.args[0]
-    path_hash = hashlib.sha256(file_path.encode('utf-8')).hexdigest()
-    # Write this result to the data store.
-    set_value(path_hash, extracted_text)
+# Only required to get the Redis connection which orchestrates the queue.
+from db import get_connection
 
 
-def enqueue_extraction(extraction_function, file_path):
+def enqueue_extraction(extraction_function, **kwargs):
     connection = get_connection()
     q = Queue(connection=connection)
-    # Extraction should be performed on the raw image as well as processed image.
-    q.enqueue(extraction_function, file_path, on_success=report_success)
+    q.enqueue(extraction_function, **kwargs)
diff --git a/textract_wrapper.py b/textract_wrapper.py
@@ -0,0 +1,12 @@
+from textract import detect_text
+
+from db import set_object
+
+
+def detect_text_and_set_db(file_path: str, key: str, field: str, options=None):
+    is_success, content = detect_text(file_path)
+    if is_success is True:
+        set_object(key, field, content)
+        return True, content
+    else:
+        return False, content