Skip to content

Commit bc5adcf

Browse files
committed
Better code modularity and decision checks for text detection.
1 parent 8c68f5c commit bc5adcf

File tree

4 files changed

+49
-25
lines changed

4 files changed

+49
-25
lines changed

main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from fastapi.exceptions import HTTPException
99
from fastapi.middleware.cors import CORSMiddleware
1010

11-
from services import identify_file_type, merge_pdfs, save_file, extract_pdf_text, get_file_size, extract_image_text, extract_pdf_text_all
11+
from services import identify_file_type, merge_pdfs, save_file, extract_pdf_text_searchable, get_file_size, extract_image_text, extract_pdf_text_all
1212
from text_analysis import analyze
1313
from tasks import enqueue_extraction
1414
from textract import detect_text
@@ -85,7 +85,7 @@ def extract_text(attachment: UploadFile):
8585
output_filename = f"/media/extraction-pdfs/{attachment_name}"
8686
save_file(attachment.file, output_filename)
8787
attachment.file.seek(0)
88-
is_success, content = extract_pdf_text(attachment.file)
88+
is_success, content = extract_pdf_text_searchable(attachment.file)
8989
if is_success is False:
9090
raise HTTPException(status_code=400, detail=content)
9191
analysis_result = analyze(content)

services.py

Lines changed: 36 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,32 @@
33
import logging
44
from typing import List, BinaryIO
55

6+
# File mime-type detection
67
import magic
78
from magic.compat import FileMagic
89

10+
# PDF manipulation
911
from pikepdf import Pdf
1012

13+
# PDF text extraction
1114
from pdfminer.high_level import extract_text
1215
from pdfminer.pdfparser import PDFSyntaxError
1316

17+
# Image text extraction
18+
# OCR can only be performed on images; it does not work directly on PDFs
1419
import pytesseract
1520
from pytesseract.pytesseract import TesseractError
1621

22+
# Convert non-searchable PDFs to images before performing OCR
1723
from pdf2image import convert_from_path
1824

1925
from fastapi import UploadFile
2026

27+
# Perform image preprocessing to improve image quality, crispness and readability
2128
from image_preprocessing import preprocess_image
2229

30+
from text_analysis import is_meaningful_content
31+
2332

2433
logger = logging.getLogger(__name__)
2534

@@ -90,7 +99,7 @@ def save_file(file: BinaryIO, path: str):
9099
logger.info(f"Saved file to {path}")
91100

92101

93-
def extract_pdf_text(file: BinaryIO):
102+
def extract_pdf_text_searchable(file: BinaryIO):
94103
"""
95104
:param: A file-like object, opened in binary mode.
96105
Extracts text from a PDF containing embedded text using pdfminer.six library.
@@ -105,31 +114,20 @@ def extract_pdf_text(file: BinaryIO):
105114
return False, "An invalid or corrupted PDF"
106115

107116

108-
def extract_pdf_text_all(file_path: str):
117+
def extract_pdf_text_non_searchable(file_path: str):
109118
"""
110-
Attempts extraction for both searchable and non-searchable PDFs.
111-
112-
1. For searchable_pdfs, delegate to extract_pdf_text which uses pdfminer.six
113-
2. For non-searchable PDFs, convert to an image and then extract text
119+
:param: A PDF file path.
120+
Extracts text from non-searchable PDFs, i.e. scanned PDFs that don't have embedded text.
121+
Converts a PDF to an image and then extracts text from it. Delegates to extract_image_text which
122+
performs OCR using Pytesseract.
114123
"""
115-
f = open(file_path, "rb")
116-
is_success, content = extract_pdf_text(f)
117-
f.close()
118-
if is_success is False:
119-
# It's not even a PDF probably
120-
return False, content
121-
if len(content) > 10:
122-
return True, content
123-
# Probably it's a non-searchable PDF, that's why we were able to get less than 10 characters.
124-
# Convert it to an image first
125124
output_folder = "/media/pdf-to-image" # Directory name -> /media/pdf-to-image
126125
basename = os.path.basename(file_path) # File name -> sample.pdf
127126
if '.pdf' in basename:
128127
basename = basename.replace('.pdf', '')
129128
convert_from_path(file_path, output_folder=output_folder, fmt="png", output_file=basename)
130129
# The converted images have been saved now.
131130
converted_images_paths = sorted(glob.glob(f"{output_folder}/{basename}*.png"))
132-
# We will extend it for all images later.
133131
is_successes = []
134132
contents = []
135133
for converted_image_path in converted_images_paths:
@@ -143,6 +141,25 @@ def extract_pdf_text_all(file_path: str):
143141
return any(is_successes), "\n".join(contents)
144142

145143

144+
def extract_pdf_text_all(file_path: str):
145+
"""
146+
Attempts extraction for both searchable and non-searchable PDFs.
147+
148+
1. For searchable PDFs, delegate to extract_pdf_text_searchable, which uses pdfminer.six
149+
2. For non-searchable PDFs, convert to an image and then extract text
150+
"""
151+
f = open(file_path, "rb")
152+
is_success, content = extract_pdf_text_searchable(f)
153+
f.close()
154+
if is_success is False:
155+
# It's not even a PDF probably
156+
return False, content
157+
if is_meaningful_content(content):
158+
return True, content
159+
is_success, content = extract_pdf_text_non_searchable(file_path)
160+
return is_success, content
161+
162+
146163
def get_file_size(file):
147164
file.seek(0, 2) # Move to the end of file
148165
size = file.tell()
@@ -156,11 +173,7 @@ def extract_image_text(file_path: str):
156173
A TesseractError would happen, and will be handled, if the file is non-image.
157174
"""
158175
try:
159-
# Raw image OCR
160-
raw_image_text = pytesseract.image_to_string(file_path)
161-
# Preprocessed image OCR
162-
processed_image_path = preprocess_image(file_path)
163-
pytesseract.image_to_string(processed_image_path)
164-
return True, raw_image_text
176+
text = pytesseract.image_to_string(file_path)
177+
return True, text
165178
except TesseractError:
166179
return False, "An invalid or corrupted image"

tasks.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
"""
2+
rq integration
3+
4+
Uses Redis Queue to handle the extraction tasks.
5+
"""
16
import hashlib
27
from rq import Queue
38

text_analysis.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ def is_meaningful_content(text: str):
4444
# If there are a lot of single-character words that aren't even 'a' or 'i'.
4545
SINGLE_CHARACTER_PERCENTAGE_THRESHOLD = 0.5
4646
words = nltk.word_tokenize(text)
47+
# No word could be extracted
48+
if len(words) == 0:
49+
return False
4750
single_character_count = 0
4851
for word in words:
4952
if len(word) == 1 and word.lower() not in ['a', 'i']:
@@ -52,4 +55,7 @@ def is_meaningful_content(text: str):
5255
return False
5356
# If we are able to extract only page-end markers, then it's not meaningful content.
5457
# \x0c is the page end marker.
58+
PAGE_MARGER_PERCENTAGE_THRESHOLD = 0.5
59+
if text.count("\x0c") / len(text) > PAGE_MARGER_PERCENTAGE_THRESHOLD:
60+
return False
5561
return True

0 commit comments

Comments
 (0)