Skip to content

Commit 6386958

Browse files
committed
[RELEASE] iText 7 pdfOcr - 1.0.2
https://git.itextsupport.com/
* release/1.0.2:
  - [RELEASE] 1.0.2-SNAPSHOT -> 1.0.2
  - Update port-hash
  - thai_03 test fails in .NET. Might be related to reading UTF-8 files issue
  - Combine HOCR and TXT outputs for more precise text recognition
  - Deprecate unused log message constant
  - Stabilize test on different Tesseract versions
  - Add possibility to set image preprocessing properties
  - Tesseract does not respect image rotation when doing OCR
  - Update port-hash
  - Use tesseract executable from path instead of tesseractDir in tests
  - If path to tessdata contains non ASCII characters, code unexpectedly fails
  - TextInfo: move from List<Float> to Rectangle
  - Use generalized Jenkinsfile in the pipeline-library
  - Deprecate Tesseract4LogMessageConstant#CANNOT_CONVERT_IMAGE_TO_GRAYSCALE
  - Update autoported files
  - Non-Ascii characters support for the output file
  - Use ImageTypeDetector from io module to detect image types
  - Use new SystemUtil#runProcessAndWait overload from 7.1.12-SNAPSHOT accepting working directory
  - Update port-hash
  - Update port-hash after release
  - [RELEASE] Update dependency versions
2 parents 0400706 + 6c3e305 commit 6386958

File tree

72 files changed

+2227
-727
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

72 files changed

+2227
-727
lines changed

Jenkinsfile

Lines changed: 4 additions & 215 deletions
Original file line numberDiff line numberDiff line change
@@ -1,219 +1,8 @@
11
#!/usr/bin/env groovy
22
@Library('pipeline-library')_
33

4-
def schedule = env.BRANCH_NAME.contains('master') ? '@monthly' : env.BRANCH_NAME == 'develop' ? '@midnight' : ''
4+
def repoName = "pdfOcr"
5+
def dependencyRegex = "itextcore"
6+
def solutionFile = "i7n-ocr.sln"
57

6-
pipeline {
7-
8-
agent { label 'windows' }
9-
10-
environment {
11-
tesseractDir = tool name: 'Tesseract', type: 'com.cloudbees.jenkins.plugins.customtools.CustomTool'
12-
}
13-
14-
options {
15-
ansiColor('xterm')
16-
buildDiscarder(logRotator(artifactNumToKeepStr: '1'))
17-
parallelsAlwaysFailFast()
18-
retry(1)
19-
skipStagesAfterUnstable()
20-
timeout(time: 60, unit: 'MINUTES')
21-
timestamps()
22-
}
23-
24-
triggers {
25-
cron(schedule)
26-
}
27-
28-
stages {
29-
stage('Abort possible previous builds') {
30-
steps {
31-
script {
32-
abortPreviousBuilds()
33-
}
34-
}
35-
}
36-
stage('Wait for blocking jobs') {
37-
steps {
38-
script {
39-
properties[[
40-
$class : 'BuildBlockerProperty',
41-
blockLevel : 'GLOBAL',
42-
blockingJobs : "^iText_7_Java/itextcore/$env.JOB_BASE_NAME\$",
43-
scanQueueFor : 'ALL',
44-
useBuildBlocker: true
45-
]]
46-
}
47-
}
48-
}
49-
stage('Clean workspace') {
50-
options {
51-
timeout(time: 5, unit: 'MINUTES')
52-
}
53-
steps {
54-
cleanWs deleteDirs: true, patterns: [
55-
[pattern: 'packages', type: 'INCLUDE'],
56-
[pattern: 'global-packages', type: 'INCLUDE'],
57-
[pattern: 'tmp/NuGetScratch', type: 'INCLUDE'],
58-
[pattern: 'http-cache', type: 'INCLUDE'],
59-
[pattern: 'plugins-cache', type: 'INCLUDE'],
60-
[pattern: '**/obj', type: 'INCLUDE'],
61-
[pattern: '**/bin', type: 'INCLUDE'],
62-
[pattern: '**/*.nupkg', type: 'INCLUDE']
63-
]
64-
}
65-
}
66-
stage('Compile') {
67-
options {
68-
timeout(time: 20, unit: 'MINUTES')
69-
}
70-
steps {
71-
echo "Tesseract directory is ${tesseractDir}"
72-
73-
withEnv(["NUGET_PACKAGES=${env.WORKSPACE}/global-packages",
74-
"temp=${env.WORKSPACE}/tmp/NuGetScratch",
75-
"NUGET_HTTP_CACHE_PATH=${env.WORKSPACE}/http-cache", "NUGET_PLUGINS_CACHE_PATH=${env.WORKSPACE}/plugins-cache", "gsExec=${gsExec}", "compareExec=${compareExec}", "tesseractDir=${tesseractDir}"]) {
76-
bat "\"${env.NuGet}\" restore i7n-ocr.sln"
77-
bat "dotnet restore i7n-ocr.sln"
78-
bat "dotnet build i7n-ocr.sln --configuration Release --source ${env.WORKSPACE}/packages"
79-
script {
80-
createPackAllFile(findFiles(glob: '**/*.nuspec'))
81-
load 'packAll.groovy'
82-
}
83-
}
84-
}
85-
}
86-
stage('Run Tests') {
87-
options {
88-
timeout(time: 60, unit: 'MINUTES')
89-
}
90-
steps {
91-
echo "Tesseract directory is ${tesseractDir}"
92-
93-
withEnv(["NUGET_PACKAGES=${env.WORKSPACE}/global-packages",
94-
"temp=${env.WORKSPACE}/tmp/NuGetScratch",
95-
"NUGET_HTTP_CACHE_PATH=${env.WORKSPACE}/http-cache", "NUGET_PLUGINS_CACHE_PATH=${env.WORKSPACE}/plugins-cache", "gsExec=${gsExec}", "compareExec=${compareExec}", "tesseractDir=${tesseractDir}"]) {
96-
script {
97-
createRunTestDllsFile(findFiles(glob: '**/itext.*.tests.dll'))
98-
load 'runTestDlls.groovy'
99-
}
100-
}
101-
}
102-
}
103-
stage('Artifactory Deploy') {
104-
options {
105-
timeout(time: 5, unit: 'MINUTES')
106-
}
107-
when {
108-
anyOf {
109-
branch "master"
110-
branch "develop"
111-
}
112-
}
113-
steps {
114-
script {
115-
getAndConfigureJFrogCLI()
116-
findFiles(glob: '*.nupkg').each { item ->
117-
upload(item)
118-
}
119-
}
120-
}
121-
}
122-
stage('Branch Artifactory Deploy') {
123-
options {
124-
timeout time: 5, unit: 'MINUTES'
125-
}
126-
when {
127-
not {
128-
anyOf {
129-
branch "master"
130-
branch "develop"
131-
}
132-
}
133-
}
134-
steps {
135-
script {
136-
getAndConfigureJFrogCLI()
137-
if (env.GIT_URL) {
138-
repoName = ("$env.GIT_URL" =~ /(.*\/)(.*)(\.git)/)[0][2]
139-
findFiles(glob: '*.nupkg').each { item ->
140-
sh "./jfrog rt u \"$item.path\" branch-artifacts/$env.BRANCH_NAME/$repoName/dotnet/ --recursive=false --build-name $env.BRANCH_NAME --build-number $env.BUILD_NUMBER --props \"vcs.revision=$env.GIT_COMMIT;repo.name=$repoName\""
141-
}
142-
}
143-
}
144-
}
145-
}
146-
stage('Archive Artifacts') {
147-
options {
148-
timeout(time: 5, unit: 'MINUTES')
149-
}
150-
steps {
151-
archiveArtifacts allowEmptyArchive: true, artifacts: '*.nupkg'
152-
}
153-
}
154-
}
155-
156-
post {
157-
always {
158-
echo 'One way or another, I have finished \uD83E\uDD16'
159-
}
160-
success {
161-
echo 'I succeeeded! \u263A'
162-
cleanWs deleteDirs: true
163-
}
164-
unstable {
165-
echo 'I am unstable \uD83D\uDE2E'
166-
}
167-
failure {
168-
echo 'I failed \uD83D\uDCA9'
169-
}
170-
changed {
171-
echo 'Things were different before... \uD83E\uDD14'
172-
}
173-
}
174-
175-
}
176-
177-
@NonCPS // has to be NonCPS or the build breaks on the call to .each
178-
def createPackAllFile(list) {
179-
// creates file because the bat command brakes the loop
180-
def cmd = ''
181-
list.each { item ->
182-
if (!item.path.contains("packages")) {
183-
cmd = cmd + "bat '\"${env.NuGet.replace('\\','\\\\')}\" pack \"${item.path.replace('\\','\\\\')}\"'\n"
184-
}
185-
}
186-
writeFile file: 'packAll.groovy', text: cmd
187-
}
188-
189-
@NonCPS // has to be NonCPS or the build breaks on the call to .each
190-
def createRunTestDllsFile(list) {
191-
// creates file because the bat command brakes the loop
192-
def ws = "${env.WORKSPACE.replace('\\','\\\\')}"
193-
def nunit = "${env.'Nunit3-console'.replace('\\','\\\\')}"
194-
def cmd = ''
195-
list.each { item ->
196-
if (!item.path.contains("netcoreapp1.0") && !item.path.contains("obj")) {
197-
cmd = cmd + "bat '\"${nunit}\" \"${ws}\\\\${item.path.replace('\\','\\\\')}\" --result=${item.name}-TestResult.xml'\n"
198-
}
199-
}
200-
writeFile file: 'runTestDlls.groovy', text: cmd
201-
}
202-
203-
@NonCPS // has to be NonCPS or the build breaks on the call to .each
204-
def createRunTestCsProjsFile(list) {
205-
// creates file because the bat command brakes the loop
206-
def ws = "${env.WORKSPACE.replace('\\','\\\\')}"
207-
def cmd = ''
208-
list.each { item ->
209-
cmd = cmd + "bat 'dotnet test ${ws}\\\\${item.path.replace('\\','\\\\')} --configuration Release --no-build --logger \"trx;LogFileName=results.trx\"'\n"
210-
}
211-
writeFile file: 'runTestCsProjs.groovy', text: cmd
212-
}
213-
214-
@NonCPS
215-
def upload(item) {
216-
def itemArray = (item =~ /(.*?)(\.[0-9]*\.[0-9]*\.[0-9]*(-SNAPSHOT)?\.nupkg)/)
217-
def dir = itemArray[ 0 ][ 1 ]
218-
sh "./jfrog rt u \"${item.path}\" nuget/${dir}/ --flat=false --build-name="${env.BRANCH_NAME}" --build-number=${env.BUILD_NUMBER}"
219-
}
8+
automaticDotnetBuild(repoName, dependencyRegex, solutionFile)

doxyfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
3232
# title of most generated pages and in a few other places.
3333
# The default value is: My Project.
3434

35-
PROJECT_NAME = "pdfOCR 1.0.1 API"
35+
PROJECT_NAME = "pdfOCR 1.0.2 API"
3636

3737
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
3838
# could be handy for archiving the generated documentation or if some version

itext.tests/itext.pdfocr.api.tests/Properties/AssemblyInfo.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,6 @@
1515

1616
[assembly: Guid("d6a6ea97-1f23-448f-b700-eff62971d234")]
1717

18-
[assembly: AssemblyVersion("1.0.1.0")]
19-
[assembly: AssemblyFileVersion("1.0.1.0")]
20-
[assembly: AssemblyInformationalVersion("1.0.1")]
18+
[assembly: AssemblyVersion("1.0.2.0")]
19+
[assembly: AssemblyFileVersion("1.0.2.0")]
20+
[assembly: AssemblyInformationalVersion("1.0.2")]

itext.tests/itext.pdfocr.api.tests/itext.pdfocr.api.tests.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
</ItemGroup>
2828
<ItemGroup>
2929
<ProjectReference Include="..\..\..\itextcore\itext\itext.pdftest\itext.pdftest.netstandard.csproj" Condition="Exists('..\..\..\itextcore\itext\itext.pdftest\itext.pdftest.netstandard.csproj')" />
30-
<PackageReference Include="itext7.pdftest" Version="7.1.12" Condition="!Exists('..\..\..\itextcore\itext\itext.pdftest\itext.pdftest.netstandard.csproj')" />
30+
<PackageReference Include="itext7.pdftest" Version="7.1.13" Condition="!Exists('..\..\..\itextcore\itext\itext.pdftest\itext.pdftest.netstandard.csproj')" />
3131
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="16.6.0" />
3232
<PackageReference Include="NUnit" Version="3.12.0" />
3333
<PackageReference Include="NUnit3TestAdapter" Version="3.16.1">

itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ApiTest.cs

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,11 @@ You should have received a copy of the GNU Affero General Public License
2323
using System;
2424
using System.Collections.Generic;
2525
using System.IO;
26+
using iText.IO.Image;
2627
using iText.IO.Util;
2728
using iText.Kernel.Colors;
2829
using iText.Kernel.Font;
30+
using iText.Kernel.Geom;
2931
using iText.Pdfocr.Helpers;
3032
using iText.Test;
3133
using iText.Test.Attributes;
@@ -39,6 +41,20 @@ public virtual void TestTextInfo() {
3941
NUnit.Framework.Assert.AreEqual(1, result.Count);
4042
TextInfo textInfo = new TextInfo();
4143
textInfo.SetText("text");
44+
textInfo.SetBboxRect(new Rectangle(204.0f, 158.0f, 538.0f, 136.0f));
45+
int page = 2;
46+
result.Put(page, JavaCollectionsUtil.SingletonList<TextInfo>(textInfo));
47+
NUnit.Framework.Assert.AreEqual(2, result.Count);
48+
NUnit.Framework.Assert.AreEqual(textInfo.GetText(), result.Get(page)[0].GetText());
49+
}
50+
51+
[NUnit.Framework.Test]
52+
public virtual void TestTextInfoDeprecationMode() {
53+
String path = PdfHelper.GetDefaultImagePath();
54+
IDictionary<int, IList<TextInfo>> result = new CustomOcrEngine(true).DoImageOcr(new FileInfo(path));
55+
NUnit.Framework.Assert.AreEqual(1, result.Count);
56+
TextInfo textInfo = new TextInfo();
57+
textInfo.SetText("text");
4258
textInfo.SetBbox(JavaUtil.ArraysAsList(204.0f, 158.0f, 742.0f, 294.0f));
4359
int page = 2;
4460
result.Put(page, JavaCollectionsUtil.SingletonList<TextInfo>(textInfo));
@@ -60,5 +76,55 @@ public virtual void TestThaiImageWithNotDefGlyphs() {
6076
String fontName = font.GetFontProgram().GetFontNames().GetFontName();
6177
NUnit.Framework.Assert.IsTrue(fontName.Contains("LiberationSans"));
6278
}
79+
80+
[NUnit.Framework.Test]
81+
public virtual void TestImageRotationHandler() {
82+
NUnit.Framework.Assert.That(() => {
83+
OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties();
84+
properties.SetImageRotationHandler(new ApiTest.NotImplementedImageRotationHandler());
85+
String testName = "testSetAndGetImageRotationHandler";
86+
String path = PdfHelper.GetImagesTestDirectory() + "90_degrees_rotated.jpg";
87+
String pdfPath = PdfHelper.GetTargetDirectory() + testName + ".pdf";
88+
PdfHelper.CreatePdf(pdfPath, new FileInfo(path), properties);
89+
NUnit.Framework.Assert.IsNotNull(properties.GetImageRotationHandler());
90+
}
91+
, NUnit.Framework.Throws.InstanceOf<Exception>().With.Message.EqualTo("applyRotation is not implemented"))
92+
;
93+
}
94+
95+
[NUnit.Framework.Test]
96+
public virtual void TestImageRotationHandlerForTiff() {
97+
NUnit.Framework.Assert.That(() => {
98+
OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties();
99+
properties.SetImageRotationHandler(new ApiTest.NotImplementedImageRotationHandler());
100+
String testName = "testSetAndGetImageRotationHandler";
101+
String path = PdfHelper.GetImagesTestDirectory() + "multipage.tiff";
102+
String pdfPath = PdfHelper.GetTargetDirectory() + testName + ".pdf";
103+
PdfHelper.CreatePdf(pdfPath, new FileInfo(path), properties);
104+
NUnit.Framework.Assert.IsNotNull(properties.GetImageRotationHandler());
105+
}
106+
, NUnit.Framework.Throws.InstanceOf<Exception>().With.Message.EqualTo("applyRotation is not implemented"))
107+
;
108+
}
109+
110+
internal class NotImplementedImageRotationHandler : IImageRotationHandler {
111+
public virtual ImageData ApplyRotation(ImageData imageData) {
112+
throw new Exception("applyRotation is not implemented");
113+
}
114+
}
115+
116+
[LogMessage(PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, Count = 7)]
117+
[NUnit.Framework.Test]
118+
public virtual void TestThaiImageWithNotDefGlyphsDeprecationMode() {
119+
String testName = "testThaiImageWithNotdefGlyphs";
120+
String path = PdfHelper.GetThaiImagePath();
121+
String pdfPath = PdfHelper.GetTargetDirectory() + testName + ".pdf";
122+
PdfHelper.CreatePdf(pdfPath, new FileInfo(path), new OcrPdfCreatorProperties().SetTextColor(DeviceRgb.BLACK
123+
), true);
124+
ExtractionStrategy strategy = PdfHelper.GetExtractionStrategy(pdfPath);
125+
PdfFont font = strategy.GetPdfFont();
126+
String fontName = font.GetFontProgram().GetFontNames().GetFontName();
127+
NUnit.Framework.Assert.IsTrue(fontName.Contains("LiberationSans"));
128+
}
63129
}
64130
}

itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomOcrEngine.cs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ You should have received a copy of the GNU Affero General Public License
2525
using System.IO;
2626
using iText.IO.Util;
2727
using iText.Kernel.Counter.Event;
28+
using iText.Kernel.Geom;
2829
using iText.Pdfocr;
2930
using iText.Pdfocr.Events;
3031

@@ -34,7 +35,14 @@ public class CustomOcrEngine : IOcrEngine, IThreadLocalMetaInfoAware {
3435

3536
private IMetaInfo threadLocalMetaInfo;
3637

37-
public CustomOcrEngine() {
38+
private bool textInfoDeprecationMode = false;
39+
40+
public CustomOcrEngine()
41+
: this(false) {
42+
}
43+
44+
public CustomOcrEngine(bool textInfoDeprecationMode) {
45+
this.textInfoDeprecationMode = textInfoDeprecationMode;
3846
}
3947

4048
public CustomOcrEngine(OcrEngineProperties ocrEngineProperties) {
@@ -47,7 +55,8 @@ public virtual IDictionary<int, IList<TextInfo>> DoImageOcr(FileInfo input) {
4755
if (input.FullName.Contains(PdfHelper.THAI_IMAGE_NAME)) {
4856
text = PdfHelper.THAI_TEXT;
4957
}
50-
TextInfo textInfo = new TextInfo(text, JavaUtil.ArraysAsList(204.0f, 158.0f, 742.0f, 294.0f));
58+
TextInfo textInfo = this.textInfoDeprecationMode ? new TextInfo(text, JavaUtil.ArraysAsList(204.0f, 158.0f
59+
, 742.0f, 294.0f)) : new TextInfo(text, new Rectangle(204.0f, 158.0f, 538.0f, 136.0f));
5160
result.Put(1, JavaCollectionsUtil.SingletonList<TextInfo>(textInfo));
5261
return result;
5362
}

itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/PdfHelper.cs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,16 @@ public static String GetTextFromPdfLayerUseActualText(String pdfPath, String lay
123123
/// of properties and save to the given path.
124124
/// </summary>
125125
public static void CreatePdf(String pdfPath, FileInfo inputFile, OcrPdfCreatorProperties properties) {
126-
OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(new CustomOcrEngine(), properties);
126+
CreatePdf(pdfPath, inputFile, properties, false);
127+
}
128+
129+
/// <summary>
130+
/// Perform OCR with custom ocr engine using provided input image and set
131+
/// of properties and save to the given path.
132+
/// </summary>
133+
public static void CreatePdf(String pdfPath, FileInfo inputFile, OcrPdfCreatorProperties properties, bool
134+
textInfoDeprecationMode) {
135+
OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(new CustomOcrEngine(textInfoDeprecationMode), properties);
127136
try {
128137
using (PdfWriter pdfWriter = GetPdfWriter(pdfPath)) {
129138
ocrPdfCreator.CreatePdf(JavaCollectionsUtil.SingletonList<FileInfo>(inputFile), pdfWriter).Close();
61.5 KB
Loading

0 commit comments

Comments
 (0)