Skip to content

Commit 35ef45a

Browse files
committed
fix: read returnChars from configuration and correctly set it
1 parent 932f688 commit 35ef45a

File tree

1 file changed

+35
-3
lines changed

1 file changed

+35
-3
lines changed

grobid-core/src/main/java/org/grobid/core/engines/tagging/delft/Preprocessor.java

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,18 +38,33 @@ public class Preprocessor {
3838
private final Map<Integer, Map<String, Integer>> featuresMapToIndex;
3939
private final boolean hasFeatures;
4040

41+
// Character encoding support - when false, return all-zero char indices
42+
// This matches Python's DataLoader behavior when return_chars=False
43+
private final boolean returnChars;
44+
4145
/**
42-
* Create preprocessor with vocabularies (no features).
46+
* Create preprocessor with vocabularies (no features, with char encoding).
4347
*/
4448
public Preprocessor(Map<String, Integer> charVocab, Map<Integer, String> tagIndex, int maxCharLength) {
45-
this(charVocab, tagIndex, maxCharLength, null, null);
49+
this(charVocab, tagIndex, maxCharLength, null, null, true); // no features; returnChars=true keeps char encoding on
4650
}
4751

4852
/**
4953
* Create preprocessor with vocabularies and features support (with char encoding).
5054
*/
5155
public Preprocessor(Map<String, Integer> charVocab, Map<Integer, String> tagIndex, int maxCharLength,
5256
List<Integer> featuresIndices, Map<Integer, Map<String, Integer>> featuresMapToIndex) {
57+
this(charVocab, tagIndex, maxCharLength, featuresIndices, featuresMapToIndex, true); // returnChars=true keeps char encoding on
58+
}
59+
60+
/**
61+
* Create preprocessor with vocabularies, optional features support and an explicit character-encoding flag.
62+
*
63+
* @param returnChars If false, tokensToCharIndices returns all zeros (matching
64+
* Python's return_chars=False)
65+
*/
66+
public Preprocessor(Map<String, Integer> charVocab, Map<Integer, String> tagIndex, int maxCharLength,
67+
List<Integer> featuresIndices, Map<Integer, Map<String, Integer>> featuresMapToIndex, boolean returnChars) {
5368
this.charVocab = charVocab;
5469
this.tagIndex = tagIndex;
5570
this.maxCharLength = maxCharLength;
@@ -58,6 +73,7 @@ public Preprocessor(Map<String, Integer> charVocab, Map<Integer, String> tagInde
5873
this.featuresIndices = featuresIndices;
5974
this.featuresMapToIndex = featuresMapToIndex;
6075
this.hasFeatures = featuresIndices != null && !featuresIndices.isEmpty();
76+
this.returnChars = returnChars;
6177
}
6278

6379
/**
@@ -117,7 +133,15 @@ public static Preprocessor fromJson(Path vocabPath) throws IOException {
117133
LOGGER.info("Loaded feature vocabulary with {} feature columns", featuresMapToIndex.size());
118134
}
119135

120-
return new Preprocessor(charVocab, tagIndex, maxCharLength, featuresIndices, featuresMapToIndex);
136+
// Parse returnChars flag (defaults to true for backward compatibility)
137+
boolean returnChars = true;
138+
if (json.has("returnChars") && !json.get("returnChars").isJsonNull()) {
139+
returnChars = json.get("returnChars").getAsBoolean();
140+
}
141+
LOGGER.info("Loaded returnChars={}", returnChars);
142+
143+
return new Preprocessor(charVocab, tagIndex, maxCharLength, featuresIndices, featuresMapToIndex,
144+
returnChars);
121145
}
122146
}
123147

@@ -144,13 +168,21 @@ public List<LayoutToken> tokenize(String text) {
144168
/**
145169
* Convert tokens to character indices.
146170
*
171+
* If returnChars is false, returns all-zero array to match Python's
172+
* DataLoader behavior when return_chars=False.
173+
*
147174
* @param tokens List of tokens
148175
* @param seqLength Padded sequence length
149176
* @return Character indices [seq_len][max_char_length]
150177
*/
151178
public long[][] tokensToCharIndices(List<LayoutToken> tokens, int seqLength) {
152179
long[][] charIndices = new long[seqLength][maxCharLength];
153180

181+
// If returnChars is false, return all-zero array (matches Python's DataLoader)
182+
if (!returnChars) {
183+
return charIndices;
184+
}
185+
154186
for (int i = 0; i < Math.min(tokens.size(), seqLength); i++) {
155187
String word = tokens.get(i).getText();
156188
for (int j = 0; j < Math.min(word.length(), maxCharLength); j++) {

0 commit comments

Comments
 (0)