Skip to content

Commit f120aaf

Browse files
committed
Configuration of words with ending dots
1 parent c2fedb0 commit f120aaf

18 files changed

Lines changed: 443 additions & 234 deletions

File tree

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/AnalyzerCloud.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ public AnalyzerCloud()
5454
@Override
5555
public TokenStreamComponents createComponents(String field)
5656
{
57-
final Tokenizer tokenizer = new TokenizerML();
57+
final Tokenizer tokenizer = new MLTokenizer();
5858
TokenStream ts = tokenizer; // segment words
5959
// interpret html tags as token events like para or section
60-
ts = new FilterHTML(ts);
60+
ts = new MLFilter(ts);
6161
// fr split on ’ and -
6262
ts = new FilterAposHyphenFr(ts);
6363
// pos tagging before lemmatize

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/AnalyzerFind.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,9 @@ public AnalyzerFind()
5555
@Override
5656
public TokenStreamComponents createComponents(String field)
5757
{
58-
final Tokenizer tokenizer = new TokenizerML(); // segment words
58+
final Tokenizer tokenizer = new MLTokenizer(); // segment words
5959
TokenStream ts = tokenizer;
60-
ts = new FilterHTML(ts); // interpret tags
60+
ts = new MLFilter(ts); // interpret tags
6161
ts = new FilterAposHyphenFr(ts); // fr split on ’ and -
6262
ts = new FilterLemmatize(ts); // provide lemma+pos
6363
ts = new FilterFind(ts); // orthographic form and lemma as term to index

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/AnalyzerMeta.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,9 @@ public AnalyzerMeta()
5454
@Override
5555
protected TokenStreamComponents createComponents(String fieldName)
5656
{
57-
final Tokenizer tokenizer = new TokenizerML(); // segment words
57+
final Tokenizer tokenizer = new MLTokenizer(); // segment words
5858
TokenStream ts = tokenizer;
59-
ts = new FilterHTML(ts); // strip tags
59+
ts = new MLFilter(ts); // strip tags
6060
ts = new FilterAposHyphenFr(ts); // fr split on ’ and -
6161
ts = new ASCIIFoldingFilter(ts); // no accents
6262
return new TokenStreamComponents(tokenizer, ts);

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/AnalyzerOrth.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,11 @@ public AnalyzerOrth()
5454
@Override
5555
public TokenStreamComponents createComponents(String field)
5656
{
57-
final Tokenizer tokenizer = new TokenizerML();
57+
final Tokenizer tokenizer = new MLTokenizer();
5858
// segment words
5959
TokenStream ts = tokenizer;
6060
// interpret html tags as token events like para or section
61-
ts = new FilterHTML(ts);
61+
ts = new MLFilter(ts);
6262
// fr split on ’ and -
6363
ts = new FilterAposHyphenFr(ts);
6464
// provide lemma+pos

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/AnalyzerPos.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ public AnalyzerPos()
5454
@Override
5555
public TokenStreamComponents createComponents(String field)
5656
{
57-
final Tokenizer tokenizer = new TokenizerML();
57+
final Tokenizer tokenizer = new MLTokenizer();
5858
TokenStream ts = tokenizer; // segment words
5959
// interpret html tags as token events like para or section
60-
ts = new FilterHTML(ts);
60+
ts = new MLFilter(ts);
6161
// fr split on ’ and -
6262
ts = new FilterAposHyphenFr(ts);
6363
// pos tagging before lemmatize

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/AnalyzerQuery.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ public AnalyzerQuery()
5555
@Override
5656
public TokenStreamComponents createComponents(String field)
5757
{
58-
final Tokenizer tokenizer = new TokenizerML(); // segment words, keep '*'
58+
final Tokenizer tokenizer = new MLTokenizer(); // segment words, keep '*'
5959
TokenStream result = new FilterLemmatize(tokenizer); // provide lemma+pos
6060
result = new FilterFind(result); // orthographic form (not lemma) as term to index
6161
result = new ASCIIFoldingFilter(result); // no accents

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/Lexicons.java

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import java.nio.file.Path;
55

66
import org.apache.lucene.analysis.CharArrayMap;
7+
import org.apache.lucene.analysis.CharArraySet;
78
import org.slf4j.Logger;
89
import org.slf4j.LoggerFactory;
910

@@ -18,22 +19,22 @@ public abstract class Lexicons
1819
protected Lexicons() {}
1920

2021

21-
static public void fillPairs(CharArrayMap<char[]> map, final Class<?> anchor,
22+
static public void fillMap(CharArrayMap<char[]> map, final Class<?> anchor,
2223
final String resourcePath, boolean replace) throws IOException
2324
{
2425
try (CSVReader csv = new CSVReader(anchor, resourcePath, ',', 2)) {
25-
fillPairs(map, csv, replace);
26+
fillMap(map, csv, replace);
2627
}
2728
}
2829

29-
static public void fillPairs(CharArrayMap<char[]> map, final Path file, final boolean replace) throws IOException
30+
static public void fillMap(CharArrayMap<char[]> map, final Path file, final boolean replace) throws IOException
3031
{
3132
try (CSVReader csv = new CSVReader(file, ',', 2)) {
32-
fillPairs(map, csv, replace);
33+
fillMap(map, csv, replace);
3334
}
3435
}
3536

36-
static public void fillPairs(CharArrayMap<char[]> map, final CSVReader csv, final boolean replace) throws IOException
37+
static public void fillMap(CharArrayMap<char[]> map, final CSVReader csv, final boolean replace) throws IOException
3738
{
3839
final int cols = 2;
3940
// what Exception to send if map is null?
@@ -49,5 +50,47 @@ static public void fillPairs(CharArrayMap<char[]> map, final CSVReader csv, fin
4950
map.put(key, csv.getCellToCharArray(1));
5051
}
5152
}
53+
54+
static public void fillSet(CharArraySet set, final Class<?> anchor,
55+
final String resourcePath, final int col, final String rtrim) throws IOException
56+
{
57+
try (CSVReader csv = new CSVReader(anchor, resourcePath, ',', 2)) {
58+
fillSet(set, csv, col, rtrim);
59+
}
60+
}
61+
62+
63+
static public void fillSet(CharArraySet set, final Path file, final int col, final String rtrim) throws IOException
64+
{
65+
try (CSVReader csv = new CSVReader(file, ',', 2)) {
66+
fillSet(set, csv, col, rtrim);
67+
}
68+
}
69+
70+
static public void fillSet(CharArraySet set, final CSVReader csv, final int col, final String rtrim) throws IOException
71+
{
72+
// pass first line
73+
if(!csv.readRow()) return;
74+
while (csv.readRow()) {
75+
if (csv.getCellCount() < col + 1)
76+
continue;
77+
StringBuilder word = csv.getCell(col);
78+
if (word.length() < 1) continue;
79+
if (word.charAt(0) == '#') continue;
80+
rtrim(word, rtrim);
81+
set.add(word);
82+
}
83+
}
84+
85+
public static void rtrim(StringBuilder sb, String stripChars) {
86+
if (stripChars == null || stripChars.length() < 1) return;
87+
int len = sb.length();
88+
while (len > 0) {
89+
char c = sb.charAt(len - 1);
90+
if (stripChars.indexOf(c) < 0) break;
91+
len--;
92+
}
93+
sb.setLength(len);
94+
}
5295

5396
}

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/FilterHTML.java renamed to analysis/src/java/com/github/oeuvres/alix/lucene/analysis/MLFilter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
* tokens are deleted. This allows simple computation of a token context (ex:
5252
* span queries, co-occurrences).
5353
*/
54-
public class FilterHTML extends TokenFilter
54+
public class MLFilter extends TokenFilter
5555
{
5656
/** The term provided by the Tokenizer */
5757
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
@@ -98,7 +98,7 @@ public class FilterHTML extends TokenFilter
9898
* Default constructor.
9999
* @param input previous filter.
100100
*/
101-
public FilterHTML(TokenStream input) {
101+
public MLFilter(TokenStream input) {
102102
super(input);
103103
skip = 0;
104104
}

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/TokenizerML.java renamed to analysis/src/java/com/github/oeuvres/alix/lucene/analysis/MLTokenizer.java

Lines changed: 78 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
* - No buffer backtracking; uses a one-char pushback slot.
2727
* - Sentence punctuation token cannot absorb following letters (e.g., "!Word" no longer possible).
2828
*/
29-
public class TokenizerML extends Tokenizer
29+
public class MLTokenizer extends Tokenizer
3030
{
3131
/** Max size of a word-like token (not tags). */
3232
private static final int TOKEN_MAX_SIZE = 256;
@@ -54,11 +54,11 @@ public class TokenizerML extends Tokenizer
5454
private int pendingChar = -1; // 0..65535, or -1
5555
private int pendingCharOffset = -1; // offset where pendingChar occurs
5656

57-
public TokenizerML() {
57+
public MLTokenizer() {
5858
this(CharArraySet.EMPTY_SET);
5959
}
6060

61-
public TokenizerML(final CharArraySet keepTrailingDot) {
61+
public MLTokenizer(final CharArraySet keepTrailingDot) {
6262
super();
6363
// Lucene-style: accept null as “no config”
6464
this.keepTrailingDot = (keepTrailingDot == null) ? CharArraySet.EMPTY_SET : keepTrailingDot;
@@ -135,35 +135,38 @@ public final boolean incrementToken() throws IOException
135135
continue;
136136
}
137137

138-
// Abbrev-dot resolution: previous char appended '.' after a letter.
139-
// Decide whether '.' stays with the token (internal) or becomes punctuation.
140-
if (abbrevDot) {
141-
if (!Char.isLetter(c)) {
142-
143-
// 1) keep dot for 1-letter abbreviation ("M.") — existing policy
144-
final boolean oneLetterAbbrev = (termLen == 2 && Char.isLetter(termBuf[0]));
145-
146-
// 2) optional: keep dot for configured abbreviations (termBuf ends with '.')
147-
// test WITHOUT the final dot: [0, termLen-1)
148-
final boolean listedAbbrev =
149-
!oneLetterAbbrev
150-
&& keepTrailingDot != CharArraySet.EMPTY_SET
151-
&& keepTrailingDot.contains(termBuf, 0, termLen - 1);
152-
153-
if (!oneLetterAbbrev && !listedAbbrev) {
154-
// detach '.' and re-emit as punctuation
155-
termLen--;
156-
pendingChar = '.';
157-
pendingCharOffset = off - 1; // '.' already consumed
158-
tokenEndOff = off - 1; // token ends before '.'
159-
abbrevDot = false;
160-
break;
161-
}
162-
// else: keep the dot in the token; the delimiter will end the token naturally
163-
}
164-
// internal dot case: next char is a letter => keep dot, continue normally
165-
abbrevDot = false;
166-
}
138+
// Abbrev-dot resolution: previous char appended '.' after a letter.
139+
// Decide whether '.' stays with the token (internal) or becomes punctuation.
140+
if (abbrevDot) {
141+
if (!Char.isLetter(c)) {
142+
143+
// 1) keep dot for 1-letter abbreviation ("M.")
144+
final boolean oneLetterAbbrev = (termLen == 2 && Char.isLetter(termBuf[0]));
145+
146+
// 2) keep dot for dotted abbreviations/initialisms ("U.S.A.", "Ph.D.")
147+
final boolean dottedAbbrev = !oneLetterAbbrev && looksLikeDottedAbbrev(termBuf, termLen);
148+
149+
// 3) optional: keep dot for configured abbreviations (termBuf ends with '.')
150+
// test WITHOUT the final dot: [0, termLen-1)
151+
final boolean listedAbbrev =
152+
!oneLetterAbbrev
153+
&& keepTrailingDot != CharArraySet.EMPTY_SET
154+
&& keepTrailingDot.contains(termBuf, 0, termLen - 1);
155+
156+
if (!oneLetterAbbrev && !dottedAbbrev && !listedAbbrev) {
157+
// detach '.' and re-emit as punctuation
158+
termLen--;
159+
pendingChar = '.';
160+
pendingCharOffset = off - 1; // '.' already consumed
161+
tokenEndOff = off - 1; // token ends before '.'
162+
abbrevDot = false;
163+
break;
164+
}
165+
// else: keep the dot in the token; the delimiter will end the token naturally
166+
}
167+
// internal dot case: next char is a letter => keep dot, continue normally
168+
abbrevDot = false;
169+
}
167170

168171
// Start of tag '<'
169172
if (c == '<') {
@@ -277,15 +280,15 @@ else if (nameLen == 3) {
277280
return true;
278281
}
279282

280-
// Dot after a letter: may be abbrev/internal dot. Append now; decide next char whether to detach.
281-
if (c == '.' && termLen > 0 && Char.isLetter(c)) {
283+
// Dot after a letter: may be abbrev/internal dot. Append now; decide next char whether to detach.
284+
if (c == '.' && termLen > 0 && Char.isLetter(termBuf[termLen - 1])) {
282285
if (termLen == termBuf.length) termBuf = termAtt.resizeBuffer(termLen + 1);
283286
termBuf[termLen++] = '.';
284287
bi++; off++; lastChar = '.';
285288
abbrevDot = true;
286289
continue;
287290
}
288-
291+
289292
// Sentence punctuation: standalone run token
290293
if (isSentencePunct(c)) {
291294
if (termLen > 0) break; // emit pending token; punctuation next call
@@ -333,6 +336,22 @@ else if (nameLen == 3) {
333336
if (termLen > 0) break; // emit current token; do not consume delimiter
334337
bi++; off++; lastChar = c; // skip delimiter and continue
335338
}
339+
340+
// EOF-safe abbrev-dot resolution: the loop may end without peeking the next char.
341+
if (abbrevDot) {
342+
final boolean oneLetterAbbrev = (termLen == 2 && Char.isLetter(termBuf[0]));
343+
final boolean dottedAbbrev = !oneLetterAbbrev && looksLikeDottedAbbrev(termBuf, termLen);
344+
final boolean listedAbbrev =
345+
!oneLetterAbbrev
346+
&& keepTrailingDot != CharArraySet.EMPTY_SET
347+
&& keepTrailingDot.contains(termBuf, 0, termLen - 1);
348+
if (!oneLetterAbbrev && !dottedAbbrev && !listedAbbrev) {
349+
termLen--;
350+
pendingChar = '.';
351+
pendingCharOffset = off - 1;
352+
tokenEndOff = off - 1;
353+
}
354+
}
336355

337356
// Finalize token built in this call
338357
termAtt.setLength(termLen);
@@ -347,6 +366,30 @@ else if (nameLen == 3) {
347366

348367
return true;
349368
}
369+
370+
/**
371+
* Heuristic: token currently ends with '.' and also contains internal dots separating short letter-only segments.
372+
* Examples: "U.S.A.", "e.g.", "Ph.D.".
373+
*/
374+
private static boolean looksLikeDottedAbbrev(final char[] buf, final int len)
375+
{
376+
if (len < 4 || buf[len - 1] != '.') return false; // at least "A.B."
377+
int segLen = 0;
378+
boolean hasInternalDot = false;
379+
for (int i = 0; i < len - 1; i++) { // exclude trailing '.'
380+
final char c = buf[i];
381+
if (c == '.') {
382+
if (segLen == 0 || segLen > 3) return false;
383+
hasInternalDot = true;
384+
segLen = 0;
385+
continue;
386+
}
387+
if (!Char.isLetter(c)) return false;
388+
segLen++;
389+
if (segLen > 3) return false;
390+
}
391+
return hasInternalDot && segLen > 0 && segLen <= 3;
392+
}
350393

351394
private boolean emitPendingPunct() throws IOException
352395
{

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/fr/FrLexicons.java

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import java.util.List;
77

88
import org.apache.lucene.analysis.CharArrayMap;
9+
import org.apache.lucene.analysis.CharArraySet;
910

1011
import com.github.oeuvres.alix.lucene.analysis.Lexicons;
1112
import com.github.oeuvres.alix.util.Cache;
@@ -15,6 +16,30 @@ public class FrLexicons
1516
private FrLexicons()
1617
{
1718
}
19+
20+
public static CharArraySet getDotEndingWords(String... localFiles)
21+
{
22+
CharArraySet m = (CharArraySet) Cache.get(CharArraySet.class, FrLexicons.class,
23+
p -> {
24+
try {
25+
return dotEndingWords(p);
26+
} catch (IOException e) {
27+
throw new UncheckedIOException(e);
28+
}
29+
}, localFiles);
30+
return m;
31+
}
32+
33+
private static CharArraySet dotEndingWords(List<String> localFiles) throws IOException
34+
{
35+
// set ignore case
36+
CharArraySet map = new CharArraySet(100, true);
37+
Lexicons.fillSet(map, Lexicons.class, "/com/github/oeuvres/alix/fr/brevidot.csv", 0, ".");
38+
for (String file : localFiles) {
39+
Lexicons.fillSet(map, Path.of(file), 0, ".");
40+
}
41+
return map;
42+
}
1843

1944
static CharArrayMap<char[]> getTermMapping(String... localFiles)
2045
{
@@ -32,9 +57,9 @@ static CharArrayMap<char[]> getTermMapping(String... localFiles)
3257
private static CharArrayMap<char[]> termMapping(List<String> localFiles) throws IOException
3358
{
3459
CharArrayMap<char[]> map = new CharArrayMap<char[]>(2000, false);
35-
Lexicons.fillPairs(map, Lexicons.class, "/com/github/oeuvres/alix/fr/norm.csv", false);
60+
Lexicons.fillMap(map, Lexicons.class, "/com/github/oeuvres/alix/fr/norm.csv", false);
3661
for (String file : localFiles) {
37-
Lexicons.fillPairs(map, Path.of(file), true);
62+
Lexicons.fillMap(map, Path.of(file), true);
3863
}
3964
return map;
4065
}

0 commit comments

Comments
 (0)