3
3
import logging
4
4
from typing import List , BinaryIO
5
5
6
+ # File mime-type detection
6
7
import magic
7
8
from magic .compat import FileMagic
8
9
10
+ # PDF manipulation
9
11
from pikepdf import Pdf
10
12
13
+ # PDF text extraction
11
14
from pdfminer .high_level import extract_text
12
15
from pdfminer .pdfparser import PDFSyntaxError
13
16
17
+ # Image text extraction
18
+ # OCR can only be performed on images; it does not work directly on PDFs
14
19
import pytesseract
15
20
from pytesseract .pytesseract import TesseractError
16
21
22
+ # Convert non-searchable PDFs to images before performing OCR
17
23
from pdf2image import convert_from_path
18
24
19
25
from fastapi import UploadFile
20
26
27
+ # Perform image preprocessing to improve image quality, crispness and readability
21
28
from image_preprocessing import preprocess_image
22
29
30
+ from text_analysis import is_meaningful_content
31
+
23
32
24
33
logger = logging .getLogger (__name__ )
25
34
@@ -90,7 +99,7 @@ def save_file(file: BinaryIO, path: str):
90
99
logger .info (f"Saved file to { path } " )
91
100
92
101
93
- def extract_pdf_text (file : BinaryIO ):
102
+ def extract_pdf_text_searchable (file : BinaryIO ):
94
103
"""
95
104
:param: A file like object, opened in binary mode.
96
105
Extracts text from a PDF containing embedded text using pdfminer.six library.
@@ -105,31 +114,20 @@ def extract_pdf_text(file: BinaryIO):
105
114
return False , "An invalid or corrupted PDF"
106
115
107
116
108
- def extract_pdf_text_all (file_path : str ):
117
+ def extract_pdf_text_non_searchable (file_path : str ):
109
118
"""
110
- Attempts extraction for both searchable and non-searchable PDFs .
111
-
112
- 1. For searchable_pdfs, delegate to extract_pdf_text which uses pdfminer.six
113
- 2. For non-searchable PDFs, convert to an image and then extract text
119
+ :param: A PDF file path .
120
+ Extracts text from non-searchable PDFs, i.e. scanned PDFs that don't have embedded text.
121
+ Converts a PDF to an image and then extracts text from it. Delegates to extract_image_text which
122
+ performs OCR using Pytesseract.
114
123
"""
115
- f = open (file_path , "rb" )
116
- is_success , content = extract_pdf_text (f )
117
- f .close ()
118
- if is_success is False :
119
- # It's not even a PDF probably
120
- return False , content
121
- if len (content ) > 10 :
122
- return True , content
123
- # Probably it's a non-searchable PDF, that's why we were able to get less than 10 characters.
124
- # Convert it to an image first
125
124
output_folder = "/media/pdf-to-image" # Directory name -> /media/pdf-to-image
126
125
basename = os .path .basename (file_path ) # File name -> sample.pdf
127
126
if '.pdf' in basename :
128
127
basename = basename .replace ('.pdf' , '' )
129
128
convert_from_path (file_path , output_folder = output_folder , fmt = "png" , output_file = basename )
130
129
# The converted images have been saved now.
131
130
converted_images_paths = sorted (glob .glob (f"{ output_folder } /{ basename } *.png" ))
132
- # We will extend it for all images later.
133
131
is_successes = []
134
132
contents = []
135
133
for converted_image_path in converted_images_paths :
@@ -143,6 +141,25 @@ def extract_pdf_text_all(file_path: str):
143
141
return any (is_successes ), "\n " .join (contents )
144
142
145
143
144
def extract_pdf_text_all(file_path: str):
    """
    Attempts extraction for both searchable and non-searchable PDFs.

    1. First delegate to extract_pdf_text_searchable, which reads embedded
       text using pdfminer.six.
    2. If that succeeds but the text is not judged meaningful by
       is_meaningful_content, fall back to extract_pdf_text_non_searchable,
       which converts the PDF to images and extracts text from them.

    :param file_path: Path of the PDF file to read.
    :return: A tuple ``(is_success, content)``. When the searchable pass
        fails, ``is_success`` is False and ``content`` is whatever message
        extract_pdf_text_searchable returned.
    """
    # Use a context manager so the handle is closed even if extraction raises.
    with open(file_path, "rb") as f:
        is_success, content = extract_pdf_text_searchable(f)

    if not is_success:
        # It's not even a PDF probably
        return False, content

    if is_meaningful_content(content):
        return True, content

    # Embedded text was missing or not meaningful: fall back to the OCR path.
    return extract_pdf_text_non_searchable(file_path)
161
+
162
+
146
163
def get_file_size (file ):
147
164
file .seek (0 , 2 ) # Move to the end of file
148
165
size = file .tell ()
@@ -156,11 +173,7 @@ def extract_image_text(file_path: str):
156
173
A TesseractError would happen, and will be handled, if the file is non-image.
157
174
"""
158
175
try :
159
- # Raw image OCR
160
- raw_image_text = pytesseract .image_to_string (file_path )
161
- # Preprocessed image OCR
162
- processed_image_path = preprocess_image (file_path )
163
- pytesseract .image_to_string (processed_image_path )
164
- return True , raw_image_text
176
+ text = pytesseract .image_to_string (file_path )
177
+ return True , text
165
178
except TesseractError :
166
179
return False , "An invalid or corrupted image"
0 commit comments