@@ -38,18 +38,33 @@ public class Preprocessor {
3838 private final Map <Integer , Map <String , Integer >> featuresMapToIndex ;
3939 private final boolean hasFeatures ;
4040
41+ // Character encoding support - when false, return all-zero char indices
42+ // This matches Python's DataLoader behavior when return_chars=False
43+ private final boolean returnChars ;
44+
4145 /**
42- * Create preprocessor with vocabularies (no features).
46+ * Create preprocessor with vocabularies (no features, with char encoding).
4347 */
4448 public Preprocessor (Map <String , Integer > charVocab , Map <Integer , String > tagIndex , int maxCharLength ) {
45- this (charVocab , tagIndex , maxCharLength , null , null );
49+ this (charVocab , tagIndex , maxCharLength , null , null , true ); // null feature arguments = no feature columns; trailing true = returnChars enabled
4650 }
4751
4852 /**
4953 * Create preprocessor with vocabularies and features support.
5054 */
5155 public Preprocessor (Map <String , Integer > charVocab , Map <Integer , String > tagIndex , int maxCharLength ,
5256 List <Integer > featuresIndices , Map <Integer , Map <String , Integer >> featuresMapToIndex ) {
57+ this (charVocab , tagIndex , maxCharLength , featuresIndices , featuresMapToIndex , true ); // delegate to the all-options constructor with returnChars = true
58+ }
59+
60+ /**
61+ * Create preprocessor with all options.
62+ *
63+ * @param returnChars If false, tokensToCharIndices returns all zeros (matching
64+ * Python's return_chars=False)
65+ */
66+ public Preprocessor (Map <String , Integer > charVocab , Map <Integer , String > tagIndex , int maxCharLength ,
67+ List <Integer > featuresIndices , Map <Integer , Map <String , Integer >> featuresMapToIndex , boolean returnChars ) {
5368 this .charVocab = charVocab ;
5469 this .tagIndex = tagIndex ;
5570 this .maxCharLength = maxCharLength ;
@@ -58,6 +73,7 @@ public Preprocessor(Map<String, Integer> charVocab, Map<Integer, String> tagInde
5873 this .featuresIndices = featuresIndices ;
5974 this .featuresMapToIndex = featuresMapToIndex ;
6075 this .hasFeatures = featuresIndices != null && !featuresIndices .isEmpty (); // features are active only for a non-null, non-empty index list
76+ this .returnChars = returnChars ; // when false, tokensToCharIndices returns an all-zero array
6177 }
6278
6379 /**
@@ -117,7 +133,15 @@ public static Preprocessor fromJson(Path vocabPath) throws IOException {
117133 LOGGER .info ("Loaded feature vocabulary with {} feature columns" , featuresMapToIndex .size ());
118134 }
119135
120- return new Preprocessor (charVocab , tagIndex , maxCharLength , featuresIndices , featuresMapToIndex );
136+ // Parse returnChars flag (defaults to true for backward compatibility)
137+ boolean returnChars = true ;
138+ if (json .has ("returnChars" ) && !json .get ("returnChars" ).isJsonNull ()) {
139+ returnChars = json .get ("returnChars" ).getAsBoolean ();
140+ }
141+ LOGGER .info ("Loaded returnChars={}" , returnChars );
142+
143+ return new Preprocessor (charVocab , tagIndex , maxCharLength , featuresIndices , featuresMapToIndex ,
144+ returnChars );
121145 }
122146 }
123147
@@ -144,13 +168,21 @@ public List<LayoutToken> tokenize(String text) {
144168 /**
145169 * Convert tokens to character indices.
146170 *
171+ * If returnChars is false, returns an all-zero array to match Python's
172+ * DataLoader behavior when return_chars=False.
173+ *
147174 * @param tokens List of tokens
148175 * @param seqLength Padded sequence length
149176 * @return Character indices [seq_len][max_char_length]
150177 */
151178 public long [][] tokensToCharIndices (List <LayoutToken > tokens , int seqLength ) {
152179 long [][] charIndices = new long [seqLength ][maxCharLength ];
153180
181+ // If returnChars is false, return all-zero array (matches Python's DataLoader)
182+ if (!returnChars ) {
183+ return charIndices ;
184+ }
185+
154186 for (int i = 0 ; i < Math .min (tokens .size (), seqLength ); i ++) {
155187 String word = tokens .get (i ).getText ();
156188 for (int j = 0 ; j < Math .min (word .length (), maxCharLength ); j ++) {
0 commit comments