Skip to content

Commit 140c616

Browse files
committed
Better use of CharsMap
1 parent f63c61c commit 140c616

1 file changed

Lines changed: 78 additions & 50 deletions

File tree

util/src/java/com/github/oeuvres/alix/util/fr/FrenchCliticTokenizer.java

Lines changed: 78 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@
1515
* <p>This class is stateful, reusable, and not thread-safe. Returned words are
1616
* transient {@link CharSequence} values backed by internal {@link StringBuilder}
1717
* buffers. Callers must copy {@link #word()} if they need to retain it.</p>
18+
*
19+
* <p>The hot path is allocation-free: clitic lookups go through
20+
* {@link CharsMap#valueOrd(CharSequence, int, int)} and
21+
* {@link CharsMap#copy(int, char[], int)} into a reusable {@link #scratch}
22+
* buffer rather than {@link CharsMap#get(CharSequence, int, int)}, which
23+
* allocates a {@link String} per call.</p>
1824
*/
1925
public final class FrenchCliticTokenizer implements WordTokenizer {
2026
private static final int MAX_SPLITS = 8;
@@ -42,55 +48,56 @@ public final class FrenchCliticTokenizer implements WordTokenizer {
4248

4349
private static final CharsMap PREFIX = new CharsMap(15);
4450
static {
45-
PREFIX.put("c'", "ce");
46-
PREFIX.put("d'", "de");
47-
PREFIX.put("j'", "je");
48-
PREFIX.put("jusqu'", "jusque");
49-
PREFIX.put("l'", "l'");
51+
PREFIX.put("c'", "ce");
52+
PREFIX.put("d'", "de");
53+
PREFIX.put("j'", "je");
54+
PREFIX.put("jusqu'", "jusque");
55+
PREFIX.put("l'", "l'");
5056
PREFIX.put("lorsqu'", "lorsque");
51-
PREFIX.put("m'", "me");
52-
PREFIX.put("n'", "ne");
57+
PREFIX.put("m'", "me");
58+
PREFIX.put("n'", "ne");
5359
PREFIX.put("puisqu'", "puisque");
54-
PREFIX.put("qu'", "que");
60+
PREFIX.put("qu'", "que");
5561
PREFIX.put("quoiqu'", "quoique");
56-
PREFIX.put("s'", "se");
57-
PREFIX.put("t'", "te");
62+
PREFIX.put("s'", "se");
63+
PREFIX.put("t'", "te");
5864
}
5965

6066
private static final CharsMap SUFFIX = new CharsMap(25);
6167
static {
62-
SUFFIX.put("-ce", "ce");
63-
SUFFIX.put("-ci", "");
64-
SUFFIX.put("-elle", "elle");
68+
SUFFIX.put("-ce", "ce");
69+
SUFFIX.put("-ci", "");
70+
SUFFIX.put("-elle", "elle");
6571
SUFFIX.put("-elles", "elles");
66-
SUFFIX.put("-en", "en");
67-
SUFFIX.put("-eux", "eux");
68-
SUFFIX.put("-il", "il");
69-
SUFFIX.put("-ils", "ils");
70-
SUFFIX.put("-je", "je");
71-
SUFFIX.put("-la", "la");
72-
SUFFIX.put("-là", "");
73-
SUFFIX.put("-le", "le");
74-
SUFFIX.put("-les", "les");
75-
SUFFIX.put("-leur", "leur");
76-
SUFFIX.put("-lui", "lui");
77-
SUFFIX.put("-me", "me");
78-
SUFFIX.put("-moi", "moi");
79-
SUFFIX.put("-nous", "nous");
80-
SUFFIX.put("-on", "on");
81-
SUFFIX.put("-t", "");
82-
SUFFIX.put("-te", "te");
83-
SUFFIX.put("-toi", "toi");
84-
SUFFIX.put("-tu", "tu");
85-
SUFFIX.put("-vous", "vous");
86-
SUFFIX.put("-y", "y");
72+
SUFFIX.put("-en", "en");
73+
SUFFIX.put("-eux", "eux");
74+
SUFFIX.put("-il", "il");
75+
SUFFIX.put("-ils", "ils");
76+
SUFFIX.put("-je", "je");
77+
SUFFIX.put("-la", "la");
78+
SUFFIX.put("-là", "");
79+
SUFFIX.put("-le", "le");
80+
SUFFIX.put("-les", "les");
81+
SUFFIX.put("-leur", "leur");
82+
SUFFIX.put("-lui", "lui");
83+
SUFFIX.put("-me", "me");
84+
SUFFIX.put("-moi", "moi");
85+
SUFFIX.put("-nous", "nous");
86+
SUFFIX.put("-on", "on");
87+
SUFFIX.put("-t", "");
88+
SUFFIX.put("-te", "te");
89+
SUFFIX.put("-toi", "toi");
90+
SUFFIX.put("-tu", "tu");
91+
SUFFIX.put("-vous", "vous");
92+
SUFFIX.put("-y", "y");
8793
}
8894

8995
private CharSequence text;
9096
private int offset;
9197

9298
private final StringBuilder raw = new StringBuilder(32);
9399
private final StringBuilder key = new StringBuilder(16);
100+
private final char[] scratch;
94101
private final StringBuilder[] pending = new StringBuilder[MAX_SPLITS + 1];
95102

96103
private int pendingEnd;
@@ -100,8 +107,13 @@ public final class FrenchCliticTokenizer implements WordTokenizer {
100107

101108
/**
102109
* Constructs a reusable French clitic tokenizer.
110+
*
111+
* <p>The {@link #scratch} buffer is sized once from the longest sequence
112+
* interned by {@link #PREFIX} or {@link #SUFFIX}.</p>
103113
*/
104114
public FrenchCliticTokenizer() {
115+
final int max = Math.max(PREFIX.maxLen(), SUFFIX.maxLen());
116+
scratch = new char[Math.max(16, max)];
105117
for (int i = 0; i < pending.length; i++) {
106118
pending[i] = new StringBuilder(16);
107119
}
@@ -170,19 +182,26 @@ public CharSequence word() {
170182
}
171183

172184
/**
173-
* Appends a literal replacement word to the pending queue.
185+
* Appends a dictionary-resident replacement word to the pending queue.
186+
*
187+
* <p>Materializes the value identified by {@code valueOrd} from {@code map} into {@link #scratch} and
188+
* copies it into the next pending slot, without allocating a String.</p>
174189
*
175-
* @param value the literal replacement
190+
* @param map the dictionary that owns {@code valueOrd}
191+
* @param valueOrd the value ord to materialize
176192
* @return true if the word was appended
177193
*/
178-
private boolean appendLiteral(final String value) {
194+
private boolean appendLiteral(final CharsMap map, final int valueOrd) {
179195
if (pendingEnd >= pending.length) {
180196
return false;
181197
}
182198

199+
final int vLen = map.len(valueOrd);
200+
map.copy(valueOrd, scratch, 0);
201+
183202
final StringBuilder builder = pending[pendingEnd++];
184203
builder.setLength(0);
185-
builder.append(value);
204+
builder.append(scratch, 0, vLen);
186205
return true;
187206
}
188207

@@ -314,8 +333,6 @@ private static char normalizeChar(final char c) {
314333
};
315334
}
316335

317-
318-
319336
/**
320337
* Reads the next raw token into the reusable raw buffer.
321338
*
@@ -354,6 +371,12 @@ private boolean readRawToken() {
354371
/**
355372
* Splits one raw-buffer range into pending words.
356373
*
374+
* <p>The apostrophe branch builds a lookup key in {@link #key}, lowercasing only the first character,
375+
* rather than mutating {@link #raw}; this preserves the original case of
376+
* the first character so the proper-name guard ({@code D'Artagnan},
377+
* {@code L'Hôpital}) operates correctly and the no-split fall-through
378+
* emits the original token unchanged.</p>
379+
*
357380
* @param start the inclusive raw-buffer start offset
358381
* @param end the exclusive raw-buffer end offset
359382
* @param depth the current split depth
@@ -383,15 +406,20 @@ private boolean splitRange(final int start, final int end, final int depth) {
383406

384407
if (apostrophe > start) {
385408
final int prefixEnd = apostrophe + 1;
386-
// lower case prefix
387-
raw.setCharAt(start, Character.toLowerCase(raw.charAt(start)));
388-
final String value = PREFIX.get(raw, start, prefixEnd - start);
389409

390-
if (value != null && prefixEnd < end) {
410+
key.setLength(0);
411+
key.append(Character.toLowerCase(raw.charAt(start)));
412+
for (int i = start + 1; i < prefixEnd; i++) {
413+
key.append(raw.charAt(i));
414+
}
415+
416+
final int valueOrd = PREFIX.valueOrd(key, 0, key.length());
417+
418+
if (valueOrd >= 0 && prefixEnd < end) {
391419
final char next = raw.charAt(prefixEnd);
392420

393421
if (!(isUpperOrTitle(next) && isUpperOrTitle(raw.charAt(start)))) {
394-
if (!appendLiteral(value)) {
422+
if (!appendLiteral(PREFIX, valueOrd)) {
395423
return false;
396424
}
397425
return splitRange(prefixEnd, end, depth + 1);
@@ -400,15 +428,15 @@ private boolean splitRange(final int start, final int end, final int depth) {
400428
}
401429

402430
if (hyphen > start) {
403-
final String value = SUFFIX.get(raw, hyphen, end - hyphen);
431+
final int valueOrd = SUFFIX.valueOrd(raw, hyphen, end - hyphen);
404432

405-
if (value != null) {
433+
if (valueOrd >= 0) {
406434
if (!splitRange(start, hyphen, depth + 1)) {
407435
return false;
408436
}
409437

410-
if (!value.isEmpty()) {
411-
return appendLiteral(value);
438+
if (SUFFIX.len(valueOrd) > 0) {
439+
return appendLiteral(SUFFIX, valueOrd);
412440
}
413441

414442
return true;
@@ -436,4 +464,4 @@ private boolean tooManyHyphens(final int start, final int end) {
436464

437465
return false;
438466
}
439-
}
467+
}

0 commit comments

Comments
 (0)