Skip to content

Commit e24743a

Browse files
authored
Fix parsing of objects with very large dictionaries (#999)
* pdfcpu: increase the parsing buffer size while parsing continues. This reduces the overhead of copying data when converting "[]byte" to "string" while keywords are still being searched for.
* Add test to validate the improvements in object parsing.
1 parent b9b3d0f commit e24743a

File tree

2 files changed

+155
-3
lines changed

2 files changed

+155
-3
lines changed

pkg/pdfcpu/read.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ import (
3636

3737
const (
3838
// defaultBufSize is the initial grow size (in bytes) handed to growBufBy
// when reading object or stream data.
defaultBufSize = 1024
39+
// maximumBufSize is the upper bound (1 MiB) for the grow size, which is
// doubled after each growBufBy call while keywords are still being searched.
maximumBufSize = 1024 * 1024
3940
)
4041

4142
var (
@@ -1699,16 +1700,18 @@ func buffer(c context.Context, rd io.Reader) (buf []byte, endInd int, streamInd
16991700
//log.Read.Println("buffer: begin")
17001701

17011702
endInd, streamInd = -1, -1
1703+
growSize := defaultBufSize
17021704

17031705
for endInd < 0 && streamInd < 0 {
17041706
if err := c.Err(); err != nil {
17051707
return nil, 0, 0, 0, err
17061708
}
17071709

1708-
if buf, err = growBufBy(buf, defaultBufSize, rd); err != nil {
1710+
if buf, err = growBufBy(buf, growSize, rd); err != nil {
17091711
return nil, 0, 0, 0, err
17101712
}
17111713

1714+
growSize = min(growSize*2, maximumBufSize)
17121715
line := string(buf)
17131716

17141717
endInd, streamInd, err = model.DetectKeywords(line)
@@ -2205,14 +2208,16 @@ func readStreamContentBlindly(rd io.Reader) (buf []byte, err error) {
22052208
// Weak heuristic for reading in stream data for cases where stream length is unknown.
22062209
// ...data...{eol}endstream{eol}endobj
22072210

2208-
if buf, err = growBufBy(buf, defaultBufSize, rd); err != nil {
2211+
growSize := defaultBufSize
2212+
if buf, err = growBufBy(buf, growSize, rd); err != nil {
22092213
return nil, err
22102214
}
22112215

22122216
i := bytes.Index(buf, []byte("endstream"))
22132217
if i < 0 {
22142218
for i = -1; i < 0; i = bytes.Index(buf, []byte("endstream")) {
2215-
buf, err = growBufBy(buf, defaultBufSize, rd)
2219+
growSize = min(growSize*2, maximumBufSize)
2220+
buf, err = growBufBy(buf, growSize, rd)
22162221
if err != nil {
22172222
return nil, err
22182223
}

pkg/pdfcpu/read_test.go

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,17 @@ limitations under the License.
1717
package pdfcpu
1818

1919
import (
20+
"bytes"
2021
"context"
22+
"encoding/hex"
2123
"errors"
2224
"os"
2325
"path/filepath"
2426
"testing"
27+
"time"
28+
29+
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
30+
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types"
2531
)
2632

2733
func TestReadFileContext(t *testing.T) {
@@ -55,3 +61,144 @@ func TestReadContext(t *testing.T) {
5561
t.Errorf("should have failed with timeout, got %s", err)
5662
}
5763
}
64+
65+
func TestReadLargeDictObject(t *testing.T) {
66+
// Test with "stream" and "endobj" inside the dictionary.
67+
var fp bytes.Buffer
68+
fp.WriteString("123 0 obj\n")
69+
data := make([]byte, 10*1024*1024)
70+
fp.WriteString("<<")
71+
fp.WriteString("/Foo <")
72+
fp.WriteString(hex.EncodeToString(data))
73+
fp.WriteString(">\n")
74+
fp.WriteString("/Bar (stream)\n")
75+
fp.WriteString("/Baz (endobj)\n")
76+
fp.WriteString("/Test <")
77+
fp.WriteString(hex.EncodeToString(data))
78+
fp.WriteString(">\n")
79+
fp.WriteString(">>\n")
80+
fp.WriteString("stream\n")
81+
fp.WriteString("Hello world!\n")
82+
fp.WriteString("endstream\n")
83+
fp.WriteString("endobj\n")
84+
85+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
86+
defer cancel()
87+
88+
// Dummy pdfcpu context to be used for parsing a single object.
89+
c := &model.Context{
90+
Read: &model.ReadContext{
91+
RS: bytes.NewReader(fp.Bytes()),
92+
},
93+
XRefTable: &model.XRefTable{},
94+
}
95+
o, err := ParseObjectWithContext(ctx, c, 0, 123, 0)
96+
if err != nil {
97+
t.Fatal(err)
98+
}
99+
100+
d, ok := o.(types.StreamDict)
101+
if !ok {
102+
t.Fatalf("expected StreamDict, got %T", o)
103+
}
104+
105+
if err := loadEncodedStreamContent(ctx, c, &d, true); err != nil {
106+
t.Fatal(err)
107+
}
108+
109+
if foo := d.HexLiteralEntry("Foo"); foo == nil {
110+
t.Error("expected Foo entry")
111+
} else if expected := hex.EncodeToString(data); foo.Value() != expected {
112+
t.Errorf("Foo value mismatch, expected %d bytes, got %d", len(expected), len(foo.Value()))
113+
}
114+
115+
if bar := d.StringEntry("Bar"); bar == nil {
116+
t.Error("expected Bar entry")
117+
} else if expected := "stream"; *bar != expected {
118+
t.Errorf("expected %s for Bar, got %s", expected, *bar)
119+
}
120+
121+
if baz := d.StringEntry("Baz"); baz == nil {
122+
t.Error("expected Baz entry")
123+
} else if expected := "endobj"; *baz != expected {
124+
t.Errorf("expected %s for Baz, got %s", expected, *baz)
125+
}
126+
127+
if err := d.Decode(); err != nil {
128+
t.Fatal(err)
129+
}
130+
131+
if expected := "Hello world!"; string(d.Content) != expected {
132+
t.Errorf("expected stream content %s, got %s", expected, string(d.Content))
133+
}
134+
}
135+
136+
func TestReadLargeDictObjectStream(t *testing.T) {
137+
// Test without "stream" and "endobj" inside the dictionary.
138+
var fp bytes.Buffer
139+
fp.WriteString("123 0 obj\n")
140+
data := make([]byte, 10*1024*1024)
141+
fp.WriteString("<<")
142+
fp.WriteString("/Foo <")
143+
fp.WriteString(hex.EncodeToString(data))
144+
fp.WriteString(">\n")
145+
fp.WriteString("/Bar (Test)\n")
146+
fp.WriteString("/Baz <")
147+
fp.WriteString(hex.EncodeToString(data))
148+
fp.WriteString(">\n")
149+
fp.WriteString(">>\n")
150+
fp.WriteString("stream\n")
151+
fp.WriteString("Hello world!\n")
152+
fp.WriteString("endstream\n")
153+
fp.WriteString("endobj\n")
154+
155+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
156+
defer cancel()
157+
158+
// Dummy pdfcpu context to be used for parsing a single object.
159+
c := &model.Context{
160+
Read: &model.ReadContext{
161+
RS: bytes.NewReader(fp.Bytes()),
162+
},
163+
XRefTable: &model.XRefTable{},
164+
}
165+
o, err := ParseObjectWithContext(ctx, c, 0, 123, 0)
166+
if err != nil {
167+
t.Fatal(err)
168+
}
169+
170+
d, ok := o.(types.StreamDict)
171+
if !ok {
172+
t.Fatalf("expected StreamDict, got %T", o)
173+
}
174+
175+
if err := loadEncodedStreamContent(ctx, c, &d, true); err != nil {
176+
t.Fatal(err)
177+
}
178+
179+
if foo := d.HexLiteralEntry("Foo"); foo == nil {
180+
t.Error("expected Foo entry")
181+
} else if expected := hex.EncodeToString(data); foo.Value() != expected {
182+
t.Errorf("Foo value mismatch, expected %d bytes, got %d", len(expected), len(foo.Value()))
183+
}
184+
185+
if bar := d.StringEntry("Bar"); bar == nil {
186+
t.Error("expected Bar entry")
187+
} else if expected := "Test"; *bar != expected {
188+
t.Errorf("expected %s for Bar, got %s", expected, *bar)
189+
}
190+
191+
if baz := d.HexLiteralEntry("Baz"); baz == nil {
192+
t.Error("expected Baz entry")
193+
} else if expected := hex.EncodeToString(data); baz.Value() != expected {
194+
t.Errorf("Foo value mismatch, expected %d bytes, got %d", len(expected), len(baz.Value()))
195+
}
196+
197+
if err := d.Decode(); err != nil {
198+
t.Fatal(err)
199+
}
200+
201+
if expected := "Hello world!"; string(d.Content) != expected {
202+
t.Errorf("expected stream content %s, got %s", expected, string(d.Content))
203+
}
204+
}

0 commit comments

Comments
 (0)