Skip to content

Commit 225a486

Browse files
committed
output_format_flag instead of format_type; update model source handling (to allow testing of ml model pulled from GitHub) and environment variables
1 parent 0b5ec8e commit 225a486

8 files changed

Lines changed: 102 additions & 51 deletions

tools/kraken_ocr/kraken_binarize.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,9 @@
66
<expand macro="requirements"/>
77
<expand macro="creators"/>
88
<command detect_errors="exit_code"><![CDATA[
9-
#set $format_type = '-n'
9+
#set $output_format_flag = '-n'
1010
@REJECT_MULTIPAGE_TIFF_BASED_ON_FILE_METADATA@
1111
@CREATE_SYMLINKS@
12-
@TORCH_TMP_DIR@
1312
@RUN_KRAKEN@ binarize
1413
--threshold '$threshold'
1514
--zoom '$zoom'
@@ -20,6 +19,7 @@
2019
--low '$low'
2120
--high '$high'
2221
]]></command>
22+
<expand macro="env_vars"/>
2323
<inputs>
2424
<expand macro="input_param"/>
2525
<param name="out_ext" type="select" label="Output format">

tools/kraken_ocr/kraken_segment.xml

Lines changed: 53 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,19 @@
66
<expand macro="requirements"/>
77
<expand macro="creators"/>
88
<command detect_errors="exit_code"><![CDATA[
9-
#if $format_type == '-h'
9+
#if $output_format_flag == '-h'
1010
#set $out_ext = 'html'
11-
#elif $format_type == '-n'
11+
#elif $output_format_flag == '-n'
1212
#set $out_ext = 'json'
1313
#else
1414
#set $out_ext = 'xml'
1515
#end if
1616
@REJECT_MULTIPAGE_TIFF_BASED_ON_FILE_METADATA@
1717
@CREATE_SYMLINKS@
18+
#if $segmenter.type == 'baseline' and $segmenter.model_source.source == "history"
19+
#set $model_name = str($segmenter.model_source.model.name)
20+
ln -s '$segmenter.model_source.model' '$model_name' &&
21+
#end if
1822
#if $segmenter.type == 'boxes'
1923
#if $image.ext == 'pdf'
2024
echo "The legacy box segmenter requires a binarized image input. PDF input seems to be not supported with the legacy box segmenter since its fails even for 1-bit bilevel PDFs. Please extract the PDF pages to binarized PNG/TIFF images first, or use the neural baseline segmenter." >&2;
@@ -27,12 +31,13 @@
2731
fi &&
2832
#end if
2933
#end if
30-
@TORCH_TMP_DIR@
3134
@RUN_KRAKEN@ segment
3235
#if $segmenter.type == 'baseline'
3336
--baseline
34-
#if $segmenter.model.fields.sort_key != '00-default'
35-
--model '$segmenter.model.fields.path'
37+
#if $segmenter.model_source.source == "datatable"
38+
--model '${segmenter.model_source.model.fields.path}'
39+
#elif $segmenter.model_source.source == "history"
40+
--model '$model_name'
3641
#end if
3742
#else
3843
--boxes
@@ -47,9 +52,10 @@
4752
--text-direction '$text_direction'
4853
--input-pad '$input_pad'
4954
]]></command>
55+
<expand macro="env_vars"/>
5056
<inputs>
5157
<expand macro="input_param"/>
52-
<param name="format_type" type="select" label="Output format">
58+
<param name="output_format_flag" type="select" label="Output format">
5359
<option value="-a" selected="true">ALTO XML</option>
5460
<option value="-x">PageXML</option>
5561
<option value="-h">hOCR</option>
@@ -62,16 +68,29 @@
6268
<option value="boxes">Legacy box segmenter (requires binarized input image)</option>
6369
</param>
6470
<when value="baseline">
65-
<param name="model" type="select" label="Segmentation model" help="Select a segmentation model to use. The default is kraken's internal stock model">
66-
<options from_data_table="ml_models">
67-
<column name="value" index="0"/>
68-
<column name="name" index="1"/>
69-
<column name="version" index="2"/>
70-
<column name="path" index="3"/>
71-
<column name="sort_key" index="4"/>
72-
<filter type="sort_by" column="4"/>
73-
</options>
74-
</param>
71+
<conditional name="model_source">
72+
<param name="source" type="select" label="Model source">
73+
<option value="default" selected="true">Kraken internal stock model (BLLA base)</option>
74+
<option value="datatable">Models served by Galaxy</option>
75+
<option value="history">Model from Galaxy history</option>
76+
</param>
77+
<when value="default"/>
78+
<when value="datatable">
79+
<param name="model" type="select" label="Segmentation model" help="Select a segmentation model to use. The default is kraken's internal stock model">
80+
<options from_data_table="ml_models">
81+
<column name="value" index="0"/>
82+
<column name="name" index="1"/>
83+
<column name="version" index="2"/>
84+
<column name="path" index="3"/>
85+
<column name="compatibility" index="4"/>
86+
<filter type="sort_by" column="2"/>
87+
</options>
88+
</param>
89+
</when>
90+
<when value="history">
91+
<param name="model" type="data" format="binary" label="Segmentation model from history"/>
92+
</when>
93+
</conditional>
7594
</when>
7695
<when value="boxes">
7796
<!-- Unbounded in kraken itself; manual testing showed that for values <1 or >100 the lines output is empty or the processing time is very long -->
@@ -106,7 +125,7 @@
106125
<data name="output" format="json" label="${tool.name} on ${on_string}">
107126
<filter>image.ext != "pdf"</filter>
108127
<actions>
109-
<conditional name="format_type">
128+
<conditional name="output_format_flag">
110129
<when value="-a">
111130
<action type="format" default="xml"/>
112131
</when>
@@ -133,34 +152,34 @@
133152
<tests>
134153
<test expect_num_outputs="1">
135154
<param name="image" value="input.jpg" ftype="jpg"/>
136-
<param name="format_type" value="-n"/>
137-
<conditional name="segmenter">
138-
<param name="type" value="baseline"/>
139-
</conditional>
155+
<param name="output_format_flag" value="-n"/>
156+
<param name="segmenter|type" value="baseline"/>
140157
<output name="output" ftype="json">
141158
<assert_contents>
142159
<has_text text="&quot;type&quot;: &quot;baselines&quot;"/>
143160
<has_text text="&quot;imagename&quot;: &quot;input.jpg&quot;"/>
144161
</assert_contents>
145162
</output>
146163
</test>
147-
<!-- <test expect_num_outputs="1">
164+
<test expect_num_outputs="1">
148165
<param name="image" value="input.jpg" ftype="jpg"/>
149-
<param name="format_type" value="-a"/>
150-
<conditional name="segmenter">
151-
<param name="type" value="baseline"/>
152-
<param name="model" value="test_ml_model"/>
153-
</conditional>
166+
<param name="output_format_flag" value="-a"/>
167+
<param name="segmenter|type" value="baseline"/>
168+
<param name="segmenter|model_source|source" value="history"/>
169+
<!-- The test model is too large (4.3 MB) to include in the repository.
170+
It is therefore pulled from a commit-pinned GitHub URL. Zenodo cannot be used here
171+
because the model is only available inside a ZIP archive. -->
172+
<param name="segmenter|model_source|model" value="bdd-segmentation-regions-1.0.mlmodel" location="https://raw.githubusercontent.com/michaelscho/bdd-segmentation-regions/b818488376bc635f1a121427979b33dfc11638df/bdd-segmentation-regions-1.0.mlmodel" ftype="binary"/>
154173
<output name="output" ftype="xml">
155174
<assert_contents>
156175
<has_text text="alto"/>
157176
<has_text text="bdd-segmentation-regions-1.0.mlmodel"/>
158177
</assert_contents>
159178
</output>
160-
</test> -->
179+
</test>
161180
<test expect_num_outputs="1">
162181
<param name="image" value="input_2pages.pdf" ftype="pdf"/>
163-
<param name="format_type" value="-n"/>
182+
<param name="output_format_flag" value="-n"/>
164183
<conditional name="segmenter">
165184
<param name="type" value="baseline"/>
166185
</conditional>
@@ -179,7 +198,7 @@
179198
</test>
180199
<test expect_num_outputs="1">
181200
<param name="image" value="input_binarized.png" ftype="png"/>
182-
<param name="format_type" value="-n"/>
201+
<param name="output_format_flag" value="-n"/>
183202
<conditional name="segmenter">
184203
<param name="type" value="boxes"/>
185204
<param name="scale" value="10"/>
@@ -199,7 +218,7 @@
199218
</test>
200219
<test expect_num_outputs="1">
201220
<param name="image" value="input_binarized.png" ftype="png"/>
202-
<param name="format_type" value="-a"/>
221+
<param name="output_format_flag" value="-a"/>
203222
<conditional name="segmenter">
204223
<param name="type" value="boxes"/>
205224
</conditional>
@@ -211,7 +230,7 @@
211230
</test>
212231
<test expect_num_outputs="1">
213232
<param name="image" value="input_binarized.png" ftype="png"/>
214-
<param name="format_type" value="-h"/>
233+
<param name="output_format_flag" value="-h"/>
215234
<conditional name="segmenter">
216235
<param name="type" value="boxes"/>
217236
</conditional>
@@ -223,7 +242,7 @@
223242
</test>
224243
<test expect_failure="true">
225244
<param name="image" value="input.jpg" ftype="jpg"/>
226-
<param name="format_type" value="-n"/>
245+
<param name="output_format_flag" value="-n"/>
227246
<conditional name="segmenter">
228247
<param name="type" value="boxes"/>
229248
</conditional>
@@ -235,7 +254,7 @@
235254
</test>
236255
<test expect_failure="true">
237256
<param name="image" value="input_2pages.pdf" ftype="pdf"/>
238-
<param name="format_type" value="-n"/>
257+
<param name="output_format_flag" value="-n"/>
239258
<conditional name="segmenter">
240259
<param name="type" value="boxes"/>
241260
</conditional>

tools/kraken_ocr/macros.xml

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,6 @@
22
<token name="@TOOL_VERSION@">7.0.2</token>
33
<token name="@VERSION_SUFFIX@">0</token>
44
<token name="@PROFILE@">25.1</token>
5-
<!-- Torch cache dir macro taken from tools/flexynesis -->
6-
<token name="@TORCH_TMP_DIR@"><![CDATA[
7-
export TORCHINDUCTOR_CACHE_DIR=\$_GALAXY_JOB_TMP_DIR &&
8-
]]></token>
95
<token name="@CREATE_SYMLINKS@"><![CDATA[
106
ln -s '$image' 'input.${image.ext}' &&
117
@@ -27,17 +23,38 @@
2723
]]></token>
2824
<token name="@RUN_KRAKEN@"><![CDATA[
2925
kraken
30-
$format_type
26+
$output_format_flag
3127
--raise-on-error
3228
3329
#if $image.ext == 'pdf'
34-
-f pdf
35-
-I 'input.${image.ext}'
36-
-o .${out_ext}
30+
--format-type pdf
31+
--batch-input 'input.${image.ext}'
32+
--suffix .${out_ext}
3733
#else
3834
--input 'input.${image.ext}' 'kraken_output.${out_ext}'
3935
#end if
4036
]]></token>
37+
<token name="@RUN_KRAKEN_OCR@"><![CDATA[
38+
kraken
39+
-vvv
40+
$output_format_flag
41+
--raise-on-error
42+
43+
#if $input_type.source == 'segmented'
44+
--format-type '${segmentation_ext}'
45+
--input 'segmentation.${segmentation_ext}' 'kraken_output.${out_ext}'
46+
#else
47+
#if $input_type.image.ext == 'pdf'
48+
--format-type pdf
49+
--batch-input 'input.${input_type.image.ext}'
50+
--suffix .${out_ext}
51+
#else
52+
--input 'input.${input_type.image.ext}' 'kraken_output.${out_ext}'
53+
#end if
54+
#end if
55+
56+
ocr
57+
]]></token>
4158
<xml name="requirements">
4259
<requirements>
4360
<requirement type="package" version="@TOOL_VERSION@">kraken-ocr</requirement>
@@ -50,6 +67,14 @@
5067
<organization name="AI4SOCIAL+" url="https://cordis.europa.eu/project/id/101292886"/>
5168
</creator>
5269
</xml>
70+
<xml name="env_vars">
71+
<environment_variables>
72+
<!-- kraken uses RichHandler for logging; "dumb" prevents rich terminal
73+
formatting/control sequences from breaking Galaxy's stdout/stderr rendering. -->
74+
<environment_variable name="TERM">dumb</environment_variable>
75+
<environment_variable name="TORCH_HOME">@TORCH_CACHE_DIR@</environment_variable>
76+
</environment_variables>
77+
</xml>
5378
<xml name="citations">
5479
<citations>
5580
<citation type="bibtex">

tools/kraken_ocr/test-data/ml_models.loc

Lines changed: 0 additions & 2 deletions
This file was deleted.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
test_ml_model BDD Segmentation Data v1 ${__HERE__}/ml_models/bdd-segmentation-regions-1.0.mlmodel kraken
File renamed without changes.
Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
<tables>
22
<!-- Location of ml-models files -->
3-
<table name="ml_models" comment_char="#">
4-
<columns>value, name, version, path, sort_key</columns>
3+
<table name="segmentation_ml_models" comment_char="#">
4+
<columns>value, name, version, path, compatibility</columns>
55
<file path="tool-data/ml_models.loc" />
66
</table>
7+
<table name="ocr_ml_models" comment_char="#">
8+
<columns>value, name, version, path, compatibility </columns>
9+
<file path="tool-data/ocr_ml_models.loc" />
10+
</table>
711
</tables>
Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
<tables>
22
<!-- Location of ml-models files -->
3-
<table name="ml_models" comment_char="#">
4-
<columns>value, name, version, path, sort_key</columns>
5-
<file path="${__HERE__}/test-data/ml_models.loc" />
3+
<table name="segmentation_ml_models" comment_char="#">
4+
<columns>value, name, version, path, compatibility</columns>
5+
<file path="${__HERE__}/test-data/segmentation_ml_models.loc" />
6+
</table>
7+
<table name="ocr_ml_models" comment_char="#">
8+
<columns>value, name, version, path, compatibility</columns>
9+
<file path="${__HERE__}/test-data/ocr_ml_models.loc" />
610
</table>
711
</tables>

0 commit comments

Comments
 (0)