1515 * <p>This class is stateful, reusable, and not thread-safe. Returned words are
1616 * transient {@link CharSequence} values backed by internal {@link StringBuilder}
1717 * buffers. Callers must copy {@link #word()} if they need to retain it.</p>
18+ *
19+ * <p>The hot path is allocation-free: clitic lookups go through
20+ * {@link CharsMap#valueOrd(CharSequence, int, int)} and
21+ * {@link CharsMap#copy(int, char[], int)} into a reusable {@code scratch}
22+ * buffer rather than {@link CharsMap#get(CharSequence, int, int)}, which
23+ * allocates a {@link String} per call.</p>
1824 */
1925public final class FrenchCliticTokenizer implements WordTokenizer {
2026 private static final int MAX_SPLITS = 8 ;
@@ -42,55 +48,56 @@ public final class FrenchCliticTokenizer implements WordTokenizer {
4248
4349 private static final CharsMap PREFIX = new CharsMap (15 );
4450 static {
45- PREFIX .put ("c'" , "ce" );
46- PREFIX .put ("d'" , "de" );
47- PREFIX .put ("j'" , "je" );
48- PREFIX .put ("jusqu'" , "jusque" );
49- PREFIX .put ("l'" , "l'" );
51+ PREFIX .put ("c'" , "ce" );
52+ PREFIX .put ("d'" , "de" );
53+ PREFIX .put ("j'" , "je" );
54+ PREFIX .put ("jusqu'" , "jusque" );
55+ PREFIX .put ("l'" , "l'" );
5056 PREFIX .put ("lorsqu'" , "lorsque" );
51- PREFIX .put ("m'" , "me" );
52- PREFIX .put ("n'" , "ne" );
57+ PREFIX .put ("m'" , "me" );
58+ PREFIX .put ("n'" , "ne" );
5359 PREFIX .put ("puisqu'" , "puisque" );
54- PREFIX .put ("qu'" , "que" );
60+ PREFIX .put ("qu'" , "que" );
5561 PREFIX .put ("quoiqu'" , "quoique" );
56- PREFIX .put ("s'" , "se" );
57- PREFIX .put ("t'" , "te" );
62+ PREFIX .put ("s'" , "se" );
63+ PREFIX .put ("t'" , "te" );
5864 }
5965
6066 private static final CharsMap SUFFIX = new CharsMap (25 );
6167 static {
62- SUFFIX .put ("-ce" , "ce" );
63- SUFFIX .put ("-ci" , "" );
64- SUFFIX .put ("-elle" , "elle" );
68+ SUFFIX .put ("-ce" , "ce" );
69+ SUFFIX .put ("-ci" , "" );
70+ SUFFIX .put ("-elle" , "elle" );
6571 SUFFIX .put ("-elles" , "elles" );
66- SUFFIX .put ("-en" , "en" );
67- SUFFIX .put ("-eux" , "eux" );
68- SUFFIX .put ("-il" , "il" );
69- SUFFIX .put ("-ils" , "ils" );
70- SUFFIX .put ("-je" , "je" );
71- SUFFIX .put ("-la" , "la" );
72- SUFFIX .put ("-là" , "" );
73- SUFFIX .put ("-le" , "le" );
74- SUFFIX .put ("-les" , "les" );
75- SUFFIX .put ("-leur" , "leur" );
76- SUFFIX .put ("-lui" , "lui" );
77- SUFFIX .put ("-me" , "me" );
78- SUFFIX .put ("-moi" , "moi" );
79- SUFFIX .put ("-nous" , "nous" );
80- SUFFIX .put ("-on" , "on" );
81- SUFFIX .put ("-t" , "" );
82- SUFFIX .put ("-te" , "te" );
83- SUFFIX .put ("-toi" , "toi" );
84- SUFFIX .put ("-tu" , "tu" );
85- SUFFIX .put ("-vous" , "vous" );
86- SUFFIX .put ("-y" , "y" );
72+ SUFFIX .put ("-en" , "en" );
73+ SUFFIX .put ("-eux" , "eux" );
74+ SUFFIX .put ("-il" , "il" );
75+ SUFFIX .put ("-ils" , "ils" );
76+ SUFFIX .put ("-je" , "je" );
77+ SUFFIX .put ("-la" , "la" );
78+ SUFFIX .put ("-là" , "" );
79+ SUFFIX .put ("-le" , "le" );
80+ SUFFIX .put ("-les" , "les" );
81+ SUFFIX .put ("-leur" , "leur" );
82+ SUFFIX .put ("-lui" , "lui" );
83+ SUFFIX .put ("-me" , "me" );
84+ SUFFIX .put ("-moi" , "moi" );
85+ SUFFIX .put ("-nous" , "nous" );
86+ SUFFIX .put ("-on" , "on" );
87+ SUFFIX .put ("-t" , "" );
88+ SUFFIX .put ("-te" , "te" );
89+ SUFFIX .put ("-toi" , "toi" );
90+ SUFFIX .put ("-tu" , "tu" );
91+ SUFFIX .put ("-vous" , "vous" );
92+ SUFFIX .put ("-y" , "y" );
8793 }
8894
8995 private CharSequence text ;
9096 private int offset ;
9197
9298 private final StringBuilder raw = new StringBuilder (32 );
9399 private final StringBuilder key = new StringBuilder (16 );
100+ private final char [] scratch ;
94101 private final StringBuilder [] pending = new StringBuilder [MAX_SPLITS + 1 ];
95102
96103 private int pendingEnd ;
@@ -100,8 +107,13 @@ public final class FrenchCliticTokenizer implements WordTokenizer {
100107
101108 /**
102109 * Constructs a reusable French clitic tokenizer.
110+ *
111+ * <p>The {@code scratch} buffer is sized once to the larger {@code maxLen()}
112+ * of {@code PREFIX} and {@code SUFFIX}, and never to fewer than 16 chars.</p>
103113 */
104114 public FrenchCliticTokenizer () {
115+ final int max = Math .max (PREFIX .maxLen (), SUFFIX .maxLen ());
116+ scratch = new char [Math .max (16 , max )];
105117 for (int i = 0 ; i < pending .length ; i ++) {
106118 pending [i ] = new StringBuilder (16 );
107119 }
@@ -170,19 +182,26 @@ public CharSequence word() {
170182 }
171183
172184 /**
173- * Appends a literal replacement word to the pending queue.
185+ * Appends a dictionary-resident replacement word to the pending queue.
186+ *
187+ * <p>Copies the value identified by {@code valueOrd} out of {@code map} into
188+ * {@link #scratch}, then into the next pending slot, without allocating a String.</p>
174189 *
175- * @param value the literal replacement
190+ * @param map the dictionary that owns {@code valueOrd}
191+ * @param valueOrd the value ord to materialize
176192 * @return true if the word was appended
177193 */
178- private boolean appendLiteral (final String value ) {
194+ private boolean appendLiteral (final CharsMap map , final int valueOrd ) {
179195 if (pendingEnd >= pending .length ) {
180196 return false ;
181197 }
182198
199+ final int vLen = map .len (valueOrd );
200+ map .copy (valueOrd , scratch , 0 );
201+
183202 final StringBuilder builder = pending [pendingEnd ++];
184203 builder .setLength (0 );
185- builder .append (value );
204+ builder .append (scratch , 0 , vLen );
186205 return true ;
187206 }
188207
@@ -314,8 +333,6 @@ private static char normalizeChar(final char c) {
314333 };
315334 }
316335
317-
318-
319336 /**
320337 * Reads the next raw token into the reusable raw buffer.
321338 *
@@ -354,6 +371,12 @@ private boolean readRawToken() {
354371 /**
355372 * Splits one raw-buffer range into pending words.
356373 *
374+ * <p>The apostrophe branch builds a lowercase lookup key in {@link #key}
375+ * rather than mutating {@link #raw}; this preserves the original case of
376+ * the first character so the proper-name guard ({@code D'Artagnan},
377+ * {@code L'Hôpital}) operates correctly and the no-split fall-through
378+ * emits the original token unchanged.</p>
379+ *
357380 * @param start the inclusive raw-buffer start offset
358381 * @param end the exclusive raw-buffer end offset
359382 * @param depth the current split depth
@@ -383,15 +406,20 @@ private boolean splitRange(final int start, final int end, final int depth) {
383406
384407 if (apostrophe > start ) {
385408 final int prefixEnd = apostrophe + 1 ;
386- // lower case prefix
387- raw .setCharAt (start , Character .toLowerCase (raw .charAt (start )));
388- final String value = PREFIX .get (raw , start , prefixEnd - start );
389409
390- if (value != null && prefixEnd < end ) {
410+ key .setLength (0 );
411+ key .append (Character .toLowerCase (raw .charAt (start )));
412+ for (int i = start + 1 ; i < prefixEnd ; i ++) {
413+ key .append (raw .charAt (i ));
414+ }
415+
416+ final int valueOrd = PREFIX .valueOrd (key , 0 , key .length ());
417+
418+ if (valueOrd >= 0 && prefixEnd < end ) {
391419 final char next = raw .charAt (prefixEnd );
392420
393421 if (!(isUpperOrTitle (next ) && isUpperOrTitle (raw .charAt (start )))) {
394- if (!appendLiteral (value )) {
422+ if (!appendLiteral (PREFIX , valueOrd )) {
395423 return false ;
396424 }
397425 return splitRange (prefixEnd , end , depth + 1 );
@@ -400,15 +428,15 @@ private boolean splitRange(final int start, final int end, final int depth) {
400428 }
401429
402430 if (hyphen > start ) {
403- final String value = SUFFIX .get (raw , hyphen , end - hyphen );
431+ final int valueOrd = SUFFIX .valueOrd (raw , hyphen , end - hyphen );
404432
405- if (value != null ) {
433+ if (valueOrd >= 0 ) {
406434 if (!splitRange (start , hyphen , depth + 1 )) {
407435 return false ;
408436 }
409437
410- if (! value . isEmpty () ) {
411- return appendLiteral (value );
438+ if (SUFFIX . len ( valueOrd ) > 0 ) {
439+ return appendLiteral (SUFFIX , valueOrd );
412440 }
413441
414442 return true ;
@@ -436,4 +464,4 @@ private boolean tooManyHyphens(final int start, final int end) {
436464
437465 return false ;
438466 }
439- }
467+ }
0 commit comments