Skip to content

Commit b375203

Browse files
committed
MAINT: Move from PyPDF2 to pypdf
1 parent 9fe914d commit b375203

23 files changed

+152
-194
lines changed

.isort.cfg

-2
This file was deleted.

.pre-commit-config.yaml

+7-10
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
# pre-commit run --all-files
22
repos:
33
- repo: https://github.com/pre-commit/pre-commit-hooks
4-
rev: v4.1.0
4+
rev: v4.4.0
55
hooks:
66
- id: check-ast
77
- id: check-byte-order-marker
@@ -17,25 +17,22 @@ repos:
1717
exclude: results/
1818
- id: check-added-large-files
1919
args: ['--maxkb=1000']
20-
- repo: https://gitlab.com/pycqa/flake8
21-
rev: 3.9.2
20+
- repo: https://github.com/PyCQA/flake8
21+
rev: 6.0.0
2222
hooks:
2323
- id: flake8
2424
args: ["--ignore", "E,W,F"]
2525
# - repo: https://github.com/pre-commit/mirrors-mypy
2626
# rev: v0.942
2727
# hooks:
2828
# - id: mypy
29-
- repo: https://github.com/asottile/seed-isort-config
30-
rev: v2.2.0
31-
hooks:
32-
- id: seed-isort-config
33-
- repo: https://github.com/pre-commit/mirrors-isort
34-
rev: v5.10.1
29+
- repo: https://github.com/pycqa/isort
30+
rev: 5.11.4
3531
hooks:
3632
- id: isort
33+
name: isort (python)
3734
- repo: https://github.com/psf/black
38-
rev: 22.3.0
35+
rev: 22.12.0
3936
hooks:
4037
- id: black
4138
# - repo: https://github.com/asottile/pyupgrade

README.md

+7-7
Large diffs are not rendered by default.

benchmark.py

+19-19
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919
import numpy as np
2020
import pdfminer
2121
import pdfplumber
22-
import PyPDF2
22+
import pypdf
2323
import pypdfium2 as pdfium
2424
import requests
2525
import tika
@@ -135,9 +135,9 @@ def pymupdf_get_text(data: bytes) -> str:
135135
return text
136136

137137

138-
def pypdf2_get_text(data: bytes) -> str:
138+
def pypdf_get_text(data: bytes) -> str:
139139
text = ""
140-
reader = PyPDF2.PdfFileReader(BytesIO(data))
140+
reader = pypdf.PdfReader(BytesIO(data))
141141
for page in reader.pages:
142142
text += page.extract_text() + "\n"
143143
return text
@@ -153,11 +153,11 @@ def pdfium_get_text(data: bytes) -> str:
153153
return text
154154

155155

156-
def pypdf2_watermarking(watermark_data: bytes, data: bytes) -> bytes:
157-
watermark_pdf = PyPDF2.PdfReader(BytesIO(watermark_data))
156+
def pypdf_watermarking(watermark_data: bytes, data: bytes) -> bytes:
157+
watermark_pdf = pypdf.PdfReader(BytesIO(watermark_data))
158158
watermark_page = watermark_pdf.pages[0]
159-
reader = PyPDF2.PdfReader(BytesIO(data))
160-
writer = PyPDF2.PdfWriter()
159+
reader = pypdf.PdfReader(BytesIO(data))
160+
writer = pypdf.PdfWriter()
161161
for page in reader.pages:
162162
page.merge_page(watermark_page)
163163
writer.add_page(page)
@@ -167,15 +167,15 @@ def pypdf2_watermarking(watermark_data: bytes, data: bytes) -> bytes:
167167
return bytes_stream.read()
168168

169169

170-
def pypdf2_image_extraction(data: bytes) -> List[Tuple[str, bytes]]:
170+
def pypdf_image_extraction(data: bytes) -> List[Tuple[str, bytes]]:
171171
images = []
172172
try:
173-
reader = PyPDF2.PdfReader(BytesIO(data))
173+
reader = pypdf.PdfReader(BytesIO(data))
174174
for page in reader.pages:
175175
for image in page.images:
176176
images.append((image.name, image.data))
177177
except Exception as exc:
178-
print(f"PyPDF2 Image extraction failure: {exc}")
178+
print(f"pypdf Image extraction failure: {exc}")
179179
return images
180180

181181

@@ -444,7 +444,7 @@ def write_benchmark_report(
444444
if len([el for el in image_extraction_times[name] if el is not None]) > 0
445445
]
446446
print(names)
447-
print(image_extraction_times["pypdf2"])
447+
print(image_extraction_times["pypdf"])
448448
averages = [np.mean(image_extraction_times[name]) for name in names]
449449
sort_order = np.argsort([avg for avg in averages])
450450
for place, index in enumerate(sort_order, start=1):
@@ -553,16 +553,16 @@ def get_text_extraction_score(doc: Document, library_name: str):
553553
license="Apache v2",
554554
last_release_date="2020-03-21",
555555
),
556-
"pypdf2": Library(
557-
"PyPDF2",
558-
"pypdf2",
559-
"https://pypi.org/project/PyPDF2/",
560-
pypdf2_get_text,
561-
version=PyPDF2.__version__,
562-
watermarking_function=pypdf2_watermarking,
556+
"pypdf": Library(
557+
"pypdf",
558+
"pypdf",
559+
"https://pypi.org/project/pypdf/",
560+
pypdf_get_text,
561+
version=pypdf.__version__,
562+
watermarking_function=pypdf_watermarking,
563563
license="BSD 3-Clause",
564564
last_release_date="2022-09-25",
565-
image_extraction_function=pypdf2_image_extraction,
565+
image_extraction_function=pypdf_image_extraction,
566566
),
567567
"pdfminer": Library(
568568
"pdfminer.six",

cache.json

+48-48
Original file line number | Diff line number | Diff line change
@@ -292,76 +292,76 @@
292292
"read": 0.24839401245117188
293293
}
294294
},
295-
"pypdf2": {
295+
"pypdf": {
296296
"1601.03642": {
297-
"image_extraction": 0.0035588741302490234,
298-
"read": 0.1545581817626953,
299-
"watermark": 0.4532604217529297
297+
"image_extraction": 0.003720998764038086,
298+
"read": 0.16930723190307617,
299+
"watermark": 0.40448474884033203
300300
},
301301
"1602.06541": {
302-
"image_extraction": 0.2223806381225586,
303-
"read": 0.5582091808319092,
304-
"watermark": 1.2958548069000244
302+
"image_extraction": 0.21662640571594238,
303+
"read": 0.5400035381317139,
304+
"watermark": 1.1498773097991943
305305
},
306306
"1707.09725": {
307-
"image_extraction": 3.600585699081421,
308-
"read": 2.0959312915802,
309-
"watermark": 5.662652492523193
307+
"image_extraction": 3.632526397705078,
308+
"read": 1.9727542400360107,
309+
"watermark": 5.599521636962891
310310
},
311311
"2201.00021": {
312-
"image_extraction": 0.899993896484375,
313-
"read": 0.5941448211669922,
314-
"watermark": 0.8802309036254883
312+
"image_extraction": 0.9102218151092529,
313+
"read": 0.561309814453125,
314+
"watermark": 1.030742883682251
315315
},
316316
"2201.00022": {
317-
"image_extraction": 0.5003249645233154,
318-
"read": 0.2769145965576172,
319-
"watermark": 0.9552063941955566
317+
"image_extraction": 0.49163293838500977,
318+
"read": 0.35618138313293457,
319+
"watermark": 0.7521810531616211
320320
},
321321
"2201.00029": {
322-
"image_extraction": 0.003554821014404297,
323-
"read": 0.38430094718933105,
324-
"watermark": 0.09974980354309082
322+
"image_extraction": 0.0033931732177734375,
323+
"read": 0.41935181617736816,
324+
"watermark": 0.08749532699584961
325325
},
326326
"2201.00037": {
327-
"image_extraction": 0.010822772979736328,
328-
"read": 0.957435131072998,
329-
"watermark": 1.8120055198669434
327+
"image_extraction": 0.010039329528808594,
328+
"read": 1.0310795307159424,
329+
"watermark": 1.6997003555297852
330330
},
331331
"2201.00069": {
332-
"image_extraction": 5.672004461288452,
333-
"read": 0.35195350646972656,
334-
"watermark": 1.1070070266723633
332+
"image_extraction": 5.814162731170654,
333+
"read": 0.4030001163482666,
334+
"watermark": 0.9838786125183105
335335
},
336336
"2201.00151": {
337-
"image_extraction": 0.003854990005493164,
338-
"read": 5.403608083724976,
339-
"watermark": 13.4715895652771
337+
"image_extraction": 0.003250598907470703,
338+
"read": 5.729317903518677,
339+
"watermark": 12.279357194900513
340340
},
341341
"2201.00178": {
342-
"image_extraction": 0.7386953830718994,
343-
"read": 0.443634033203125,
344-
"watermark": 1.1078128814697266
342+
"image_extraction": 0.7503547668457031,
343+
"read": 0.48607897758483887,
344+
"watermark": 1.0085842609405518
345345
},
346346
"2201.00200": {
347-
"image_extraction": 0.004326581954956055,
348-
"read": 0.5157289505004883,
349-
"watermark": 0.4838368892669678
347+
"image_extraction": 0.0038993358612060547,
348+
"read": 0.5258574485778809,
349+
"watermark": 0.4673922061920166
350350
},
351351
"2201.00201": {
352-
"image_extraction": 0.7092070579528809,
353-
"read": 0.27915358543395996,
354-
"watermark": 0.793565034866333
352+
"image_extraction": 0.7103419303894043,
353+
"read": 0.30803465843200684,
354+
"watermark": 0.7354044914245605
355355
},
356356
"2201.00214": {
357-
"image_extraction": 0.4208667278289795,
358-
"read": 19.764751195907593,
359-
"watermark": 45.63583517074585
357+
"image_extraction": 0.3845233917236328,
358+
"read": 18.78579568862915,
359+
"watermark": 43.98939323425293
360360
},
361361
"GeoTopo-book": {
362-
"image_extraction": 1.4834551811218262,
363-
"read": 4.467116117477417,
364-
"watermark": 11.597773551940918
362+
"image_extraction": 1.5176374912261963,
363+
"read": 5.871168851852417,
364+
"watermark": 10.380576372146606
365365
}
366366
},
367367
"tika": {
@@ -506,17 +506,17 @@
506506
"2201.00214": 0.9784680209521233,
507507
"GeoTopo-book": 0.9658770842721947
508508
},
509-
"pypdf2": {
509+
"pypdf": {
510510
"1601.03642": 0.9897718873016692,
511-
"1602.06541": 0.9802022141888839,
511+
"1602.06541": 0.9802517774812367,
512512
"1707.09725": 0.942859219351859,
513513
"2201.00021": 0.9665893615444081,
514514
"2201.00022": 0.9730826246391511,
515515
"2201.00029": 0.9766931724705237,
516-
"2201.00037": 0.9406599334671981,
516+
"2201.00037": 0.9402707115289559,
517517
"2201.00069": 0.9644961587230552,
518-
"2201.00151": 0.9393172305985578,
519-
"2201.00178": 0.9316599563831572,
518+
"2201.00151": 0.9395715828027728,
519+
"2201.00178": 0.9316599563831571,
520520
"2201.00200": 0.974868026918479,
521521
"2201.00201": 0.9830162051681347,
522522
"2201.00214": 0.9761955366631243,

pyproject.toml

+2
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
[tool.isort]
2+
profile = "black"

0 commit comments

Comments
 (0)