Skip to content

Commit 4395eba

Browse files
committed
Added ability to extract structured data for PAN and Passport.
1 parent 4f30826 commit 4395eba

File tree

3 files changed

+52
-1
lines changed

3 files changed

+52
-1
lines changed

html/index.html

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,11 @@ <h2 class="accordion-header" id="headingOptions">
6666
<h4>Detected Text:</h4>
6767
<pre id="resultText" class="bg-light p-3 rounded" style="min-height: 100px;"></pre>
6868
</div>
69+
70+
<div class="mt-4">
71+
<h4>Extracted Fields (Applicable for PAN, Passport and Aadhaar):</h4>
72+
<pre id="extractedFields" class="bg-light p-3 rounded" style="min-height: 100px;"></pre>
73+
</div>
6974
</div>
7075

7176
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
@@ -77,6 +82,7 @@ <h4>Detected Text:</h4>
7782
const form = document.getElementById('ocrForm');
7883
const fileInput = document.getElementById('fileInput');
7984
const resultText = document.getElementById('resultText');
85+
const extractedFields = document.getElementById('extractedFields');
8086
const errorAlert = document.getElementById('errorAlert');
8187

8288
let ocrMode = 'basic'; // default
@@ -106,6 +112,7 @@ <h4>Detected Text:</h4>
106112
errorAlert.classList.add('d-none');
107113
errorAlert.textContent = '';
108114
resultText.textContent = 'OCR and Recognition is computationally expensive, and takes time. Please wait...';
115+
extractedFields.textContent = '';
109116

110117
const formData = new FormData();
111118
formData.append('attachment', file);
@@ -157,6 +164,12 @@ <h4>Detected Text:</h4>
157164
if (result.content !== null) {
158165
clearInterval(poll);
159166
resultText.textContent = result.content;
167+
if (result.category && result.category == 'pan') {
168+
extractedFields.textContent = JSON.stringify(result.pan_data, null, 2);
169+
}
170+
if (result.category && result.category == 'passport') {
171+
extractedFields.textContent = JSON.stringify(result.passport_data, null, 2);
172+
}
160173
}
161174
// If status is "pending", keep polling
162175

main.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
import logging
33
import hashlib
4+
import json
45
from typing import List
56

67
from fastapi import FastAPI
@@ -166,11 +167,25 @@ def ocr_result(key: str):
166167
content = get_object(key, "content")
167168
if content is None:
168169
return {"content": content}
170+
response_data = {}
171+
category = get_object(key, "category")
172+
if category is not None:
173+
# Include the category in the response only when it is not None
174+
response_data["category"] = category
175+
if category == 'passport':
176+
passport_data = get_object(key, "passport_data")
177+
passport_data = json.loads(passport_data)
178+
response_data["passport_data"] = passport_data
179+
elif category == 'pan':
180+
pan_data = get_object(key, "pan_data")
181+
pan_data = json.loads(pan_data)
182+
response_data["pan_data"] = pan_data
169183
# Remove empty lines
170184
lines = content.splitlines()
171185
non_blank_lines = [line for line in lines if line.strip() != '']
172186
content = '\n'.join(non_blank_lines)
173-
return {"content": content}
187+
response_data["content"] = content
188+
return response_data
174189

175190

176191
@app.post("/textract-ocr")

service_wrappers.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
1+
import json
2+
import logging
3+
14
from services import extract_image_text, extract_pdf_text_all
25
from image_preprocessing import preprocess_image_opencv
36

47
from db import set_object
8+
from text_analysis import classify, analyze_passport, analyze_pan
9+
10+
11+
logger = logging.getLogger(__name__)
512

613

714
def extract_image_text_and_set_db(file_path: str, key: str, field: str = 'content', options=None):
@@ -13,8 +20,24 @@ def extract_image_text_and_set_db(file_path: str, key: str, field: str = 'conten
1320
}
1421
processed_image_path = preprocess_image_opencv(file_path, options)
1522
is_success, content = extract_image_text(processed_image_path)
23+
# TODO: Move text analysis to a separate queue so it does not stall this one
1624
if is_success is True:
1725
set_object(key, field, content)
26+
# Perform classification
27+
category = classify(content)
28+
logger.info(f"Category: {category}")
29+
if category is not None:
30+
set_object(key, "category", category)
31+
if category == 'passport':
32+
passport_data = analyze_passport(content)
33+
passport_data = json.dumps(passport_data)
34+
set_object(key, "passport_data", passport_data)
35+
elif category == 'pan':
36+
pan_data = analyze_pan(content)
37+
pan_data = json.dumps(pan_data)
38+
set_object(key, "pan_data", pan_data)
39+
# Extract structured data
40+
# Store structured data in DB
1841
return True, content
1942
else:
2043
return False, content

0 commit comments

Comments
 (0)