@@ -182,6 +182,22 @@ def pypdf2_image_extraction(data: bytes) -> List[Tuple[str, bytes]]:
182
182
return images
183
183
184
184
185
def pymupdf_image_extraction(data: bytes) -> List[Tuple[str, bytes]]:
    """Extract the images listed on each page of a PDF using PyMuPDF.

    Args:
        data: Raw bytes of the PDF document.

    Returns:
        A list of ``(filename, image_bytes)`` tuples in page order, then in
        the order the page lists its images. Filenames have the form
        ``image<page>_<index>.<ext>``; both numbers are 1-based and ``<ext>``
        is the extension PyMuPDF reports for the extracted image. Images are
        not de-duplicated: one listed on several pages is returned once per
        page.
    """
    images = []
    with PyMuPDF.open(stream=data, filetype="pdf") as pdf_file:
        # Iterate the pages directly instead of indexing via range(len(...)).
        for page_number, page in enumerate(pdf_file, start=1):
            for image_index, img in enumerate(page.get_images(), start=1):
                # The first item of each get_images() entry is the xref that
                # extract_image() needs to locate the image object.
                xref = img[0]
                base_image = pdf_file.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                # No spaces in the name: "image1_1.png", not "image1 _1 .png ".
                filename = f"image{page_number}_{image_index}.{image_ext}"
                images.append((filename, image_bytes))
    return images
199
+
200
+
185
201
def pdfminer_image_extraction (data : bytes ) -> List [Tuple [str , bytes ]]:
186
202
from PIL import Image
187
203
@@ -577,6 +593,7 @@ def get_text_extraction_score(doc: Document, library_name: str):
577
593
lambda n : pymupdf_get_text (n ),
578
594
version = PyMuPDF .version [0 ],
579
595
watermarking_function = None ,
596
+ image_extraction_function = pymupdf_image_extraction ,
580
597
dependencies = "MuPDF" ,
581
598
license = "GNU AFFERO GPL 3.0 / Commerical" ,
582
599
last_release_date = "2022-08-31" ,
0 commit comments