Skip to content

Commit 342e1ec

Browse files
jvallesmpinglin
authored andcommitted
feat(document): add TASK_SPLIT_IN_PAGES (#1035)
Because - Sometimes we need to split a PDF document into pages, e.g., to pick a subset of the document or to perform work in batches. This commit - Adds `TASK_SPLIT_IN_PAGES` to the `document` operator. - The task works only for PDF documents for now.
1 parent aa4c1c5 commit 342e1ec

14 files changed

+342
-2
lines changed

go.mod

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ require (
1616
github.com/belong-inc/go-hubspot v0.9.0
1717
github.com/chromedp/chromedp v0.13.6
1818
github.com/cohere-ai/cohere-go/v2 v2.14.1
19+
github.com/dslipak/pdf v0.0.2
1920
github.com/elastic/go-elasticsearch/v8 v8.18.0
2021
github.com/emersion/go-imap/v2 v2.0.0-beta.5
2122
github.com/emersion/go-message v0.18.2
@@ -65,6 +66,7 @@ require (
6566
github.com/nakagami/firebirdsql v0.9.15
6667
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646
6768
github.com/openfga/api/proto v0.0.0-20240318145204-66b9e5cb403c
69+
github.com/pdfcpu/pdfcpu v0.11.0
6870
github.com/pkoukk/tiktoken-go v0.1.7
6971
github.com/redis/go-redis/v9 v9.9.0
7072
github.com/sijms/go-ora v1.3.2
@@ -125,6 +127,9 @@ require (
125127
github.com/envoyproxy/go-control-plane/envoy v1.32.4 // indirect
126128
github.com/go-jose/go-jose/v4 v4.1.0 // indirect
127129
github.com/go-json-experiment/json v0.0.0-20250517221953-25912455fbc8 // indirect
130+
github.com/hhrutter/lzw v1.0.0 // indirect
131+
github.com/hhrutter/pkcs7 v0.2.0 // indirect
132+
github.com/hhrutter/tiff v1.0.2 // indirect
128133
github.com/minio/crc64nvme v1.0.2 // indirect
129134
github.com/minio/minio-go/v7 v7.0.92 // indirect
130135
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
@@ -146,6 +151,7 @@ require (
146151
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
147152
go.opentelemetry.io/contrib/detectors/gcp v1.36.0 // indirect
148153
go.temporal.io/sdk/contrib/tally v0.2.0 // indirect
154+
gopkg.in/yaml.v2 v2.4.0 // indirect
149155
)
150156

151157
require (

go.sum

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj
178178
github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc=
179179
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
180180
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
181+
github.com/dslipak/pdf v0.0.2 h1:djAvcM5neg9Ush+zR6QXB+VMJzR6TdnX766HPIg1JmI=
182+
github.com/dslipak/pdf v0.0.2/go.mod h1:2L3SnkI9cQwnAS9gfPz2iUoLC0rUZwbucpbKi5R1mUo=
181183
github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
182184
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
183185
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
@@ -444,6 +446,12 @@ github.com/hashicorp/vault/api v1.0.4/go.mod h1:gDcqh3WGcR1cpF5AJz/B1UFheUEneMoI
444446
github.com/hashicorp/vault/sdk v0.1.13/go.mod h1:B+hVj7TpuQY1Y/GPbCpffmgd+tSEwvhkWnjtSYCaS2M=
445447
github.com/hashicorp/yamux v0.0.0-20180604194846-3520598351bb/go.mod h1:+NfK9FKeTrX5uv1uIXGdwYDTeHna2qgaIlx54MXqjAM=
446448
github.com/hashicorp/yamux v0.0.0-20181012175058-2f1d1f20f75d/go.mod h1:+NfK9FKeTrX5uv1uIXGdwYDTeHna2qgaIlx54MXqjAM=
449+
github.com/hhrutter/lzw v1.0.0 h1:laL89Llp86W3rRs83LvKbwYRx6INE8gDn0XNb1oXtm0=
450+
github.com/hhrutter/lzw v1.0.0/go.mod h1:2HC6DJSn/n6iAZfgM3Pg+cP1KxeWc3ezG8bBqW5+WEo=
451+
github.com/hhrutter/pkcs7 v0.2.0 h1:i4HN2XMbGQpZRnKBLsUwO3dSckzgX142TNqY/KfXg+I=
452+
github.com/hhrutter/pkcs7 v0.2.0/go.mod h1:aEzKz0+ZAlz7YaEMY47jDHL14hVWD6iXt0AgqgAvWgE=
453+
github.com/hhrutter/tiff v1.0.2 h1:7H3FQQpKu/i5WaSChoD1nnJbGx4MxU5TlNqqpxw55z8=
454+
github.com/hhrutter/tiff v1.0.2/go.mod h1:pcOeuK5loFUE7Y/WnzGw20YxUdnqjY1P0Jlcieb/cCw=
447455
github.com/hjson/hjson-go/v4 v4.0.0 h1:wlm6IYYqHjOdXH1gHev4VoXCaW20HdQAGCxdOEEg2cs=
448456
github.com/hjson/hjson-go/v4 v4.0.0/go.mod h1:KaYt3bTw3zhBjYqnXkYywcYctk0A2nxeEFTse3rH13E=
449457
github.com/iancoleman/strcase v0.3.0 h1:nTXanmYxhfFAMjZL34Ov6gkzEsSJZ5DbhxWjvSASxEI=
@@ -670,6 +678,8 @@ github.com/panjf2000/ants/v2 v2.4.2/go.mod h1:f6F0NZVFsGCp5A7QW/Zj/m92atWwOkY0OI
670678
github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
671679
github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
672680
github.com/pborman/uuid v1.2.1/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k=
681+
github.com/pdfcpu/pdfcpu v0.11.0 h1:mL18Y3hSHzSezmnrzA21TqlayBOXuAx7BUzzZyroLGM=
682+
github.com/pdfcpu/pdfcpu v0.11.0/go.mod h1:F1ca4GIVFdPtmgvIdvXAycAm88noyNxZwzr9CpTy+Mw=
673683
github.com/pelletier/go-toml v1.7.0 h1:7utD74fnzVc/cpcyy8sjrlFr5vYpypUixARcHIMIGuI=
674684
github.com/pelletier/go-toml v1.7.0/go.mod h1:vwGMzjaWMwyfHwgIBhI2YUM4fB6nL6lVAvS1LBMMhTE=
675685
github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c h1:dAMKvw0MlJT1GshSTtih8C2gDs04w8dReiOGXrGLNoY=

pkg/component/operator/document/v0/README.mdx

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ It can carry out the following tasks:
1010
- [Convert to Markdown](#convert-to-markdown)
1111
- [Convert to Text](#convert-to-text)
1212
- [Convert to Images](#convert-to-images)
13+
- [Split in Pages](#split-in-pages)
1314

1415

1516

@@ -124,6 +125,32 @@ Convert Document to images.
124125
</div>
125126

126127

128+
### Split in Pages
129+
130+
Divide a document into batches of N pages.
131+
132+
<div class="markdown-col-no-wrap" data-col-1 data-col-2>
133+
134+
| Input | Field ID | Type | Description |
135+
| :--- | :--- | :--- | :--- |
136+
| Task ID (required) | `task` | string | `TASK_SPLIT_IN_PAGES` |
137+
| Document (required) | `document` | string | Document encoded in Base64. For now, only PDF documents are accepted. |
138+
| Batch Size | `batch-size` | number | Pages in each batch. |
139+
</div>
140+
141+
142+
143+
144+
145+
146+
<div class="markdown-col-no-wrap" data-col-1 data-col-2>
147+
148+
| Output | Field ID | Type | Description |
149+
| :--- | :--- | :--- | :--- |
150+
| Batches | `batches` | array[string] | An ordered list of Base64-encoded documents, each one containing N pages of the input document. Page order in the input document is preserved both in the batch array elements and in the pages within each batch. |
151+
</div>
152+
153+
127154

128155
## Example Recipes
129156

pkg/component/operator/document/v0/config/definition.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ availableTasks:
22
- TASK_CONVERT_TO_MARKDOWN
33
- TASK_CONVERT_TO_TEXT
44
- TASK_CONVERT_TO_IMAGES
5+
- TASK_SPLIT_IN_PAGES
56
custom: false
67
documentationUrl: https://instill-ai.dev/docs/component/operator/document
78
icon: assets/document.svg
@@ -11,7 +12,7 @@ spec: {}
1112
title: Document
1213
type: COMPONENT_TYPE_OPERATOR
1314
uid: e5b290ae-ad53-47c9-a64e-efbc5358520b
14-
version: 0.1.3
15+
version: 0.2.0
1516
sourceUrl: https://github.com/instill-ai/pipeline-backend/blob/main/pkg/component/operator/document/v0
1617
description: Manipulate Document files.
1718
releaseStage: RELEASE_STAGE_ALPHA

pkg/component/operator/document/v0/config/tasks.yaml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,3 +199,43 @@ TASK_CONVERT_TO_IMAGES:
199199
- images
200200
title: Output
201201
type: object
202+
TASK_SPLIT_IN_PAGES:
203+
shortDescription: Divide a document into batches of N pages.
204+
input:
205+
title: Input
206+
description: Input.
207+
uiOrder: 0
208+
type: object
209+
properties:
210+
document:
211+
title: Document
212+
description: Document encoded in Base64. For now, only PDF documents are accepted.
213+
type: string
214+
uiOrder: 0
215+
batch-size:
216+
title: Batch size
217+
description: Pages in each batch.
218+
uiOrder: 1
219+
type: integer
220+
default: 1
221+
required:
222+
- document
223+
output:
224+
title: Output
225+
description: Output.
226+
uiOrder: 0
227+
type: object
228+
properties:
229+
batches:
230+
title: Batches
231+
description: |-
232+
An ordered list of Base64-encoded documents, each one containing N
233+
pages of the input document. Page order in the input document is
234+
preserved both in the batch array elements and in the pages within
235+
each batch.
236+
uiOrder: 0
237+
type: array
238+
items:
239+
type: string
240+
required:
241+
- batches

pkg/component/operator/document/v0/io.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,14 @@ type ConvertToTextOutput struct {
5252
Error string `instill:"error"`
5353
Filename string `instill:"filename"`
5454
}
55+
56+
// SplitInPagesInput defines the input for the page split task.
57+
type SplitInPagesInput struct {
58+
// BatchSize is the number of pages in each batch. splitInPages treats a
// zero value as 1.
BatchSize uint32 `instill:"batch-size"`
59+
// Document is the document to split. splitInPages rejects documents whose
// content type does not map to the "pdf" file extension.
Document format.Document `instill:"document"`
60+
}
61+
62+
// SplitInPagesOutput defines the output for the page split task.
63+
type SplitInPagesOutput struct {
64+
// Batches holds one document per batch produced by the split, in the
// order the splitter returned them.
Batches []format.Document `instill:"batches"`
65+
}

pkg/component/operator/document/v0/main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ const (
1919
taskConvertToMarkdown string = "TASK_CONVERT_TO_MARKDOWN"
2020
taskConvertToText string = "TASK_CONVERT_TO_TEXT"
2121
taskConvertToImages string = "TASK_CONVERT_TO_IMAGES"
22-
pythonInterpreter string = "/opt/venv/bin/python"
22+
taskSplitInPages string = "TASK_SPLIT_IN_PAGES"
2323
)
2424

2525
var (
@@ -105,6 +105,8 @@ func (c *component) CreateExecution(x base.ComponentExecution) (base.IExecution,
105105
e.execute = e.convertToText
106106
case taskConvertToImages:
107107
e.execute = e.convertDocumentToImages
108+
case taskSplitInPages:
109+
e.execute = e.splitInPages
108110
default:
109111
return nil, fmt.Errorf("%s task is not supported", x.Task)
110112
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package document
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"fmt"
7+
8+
pdfcpu "github.com/pdfcpu/pdfcpu/pkg/api"
9+
10+
"github.com/instill-ai/pipeline-backend/pkg/component/base"
11+
"github.com/instill-ai/pipeline-backend/pkg/component/internal/util"
12+
"github.com/instill-ai/pipeline-backend/pkg/data"
13+
"github.com/instill-ai/pipeline-backend/pkg/data/format"
14+
"github.com/instill-ai/x/errmsg"
15+
)
16+
17+
func (e *execution) splitInPages(ctx context.Context, job *base.Job) error {
18+
in := SplitInPagesInput{}
19+
if err := job.Input.ReadData(ctx, &in); err != nil {
20+
return fmt.Errorf("reading input data: %w", err)
21+
}
22+
23+
ct := in.Document.ContentType().String()
24+
fe := util.TransformContentTypeToFileExtension(ct)
25+
if fe != "pdf" {
26+
// return fmt.Errorf("invalid file extension: %s", fe)
27+
return errmsg.AddMessage(
28+
fmt.Errorf("invalid file extension"),
29+
"Page split task takes only PDF documents.",
30+
)
31+
}
32+
33+
b, err := in.Document.Binary()
34+
if err != nil {
35+
return fmt.Errorf("converting document to byte array: %w", err)
36+
}
37+
38+
batchSize := int(in.BatchSize)
39+
if batchSize == 0 {
40+
batchSize = 1
41+
}
42+
43+
rs := bytes.NewReader(b.ByteArray())
44+
rawPages, err := pdfcpu.SplitRaw(rs, batchSize, nil)
45+
if err != nil {
46+
return fmt.Errorf("splitting PDF: %w", err)
47+
}
48+
49+
pages := make([]format.Document, len(rawPages))
50+
for i, rawPage := range rawPages {
51+
var buf bytes.Buffer
52+
if _, err := buf.ReadFrom(rawPage.Reader); err != nil {
53+
return fmt.Errorf("reading bytes from split page: %w", err)
54+
}
55+
56+
page, err := data.NewDocumentFromBytes(buf.Bytes(), in.Document.ContentType().String(), "")
57+
if err != nil {
58+
return fmt.Errorf("creating document from split page: %w", err)
59+
}
60+
61+
pages[i] = page
62+
}
63+
64+
out := SplitInPagesOutput{Batches: pages}
65+
return job.Output.WriteData(ctx, out)
66+
}

0 commit comments

Comments
 (0)