Skip to content

Commit a20f3b1

Browse files
committed
Interpret xml tags as structural punctuation
1 parent 3907e29 commit a20f3b1

8 files changed

Lines changed: 432 additions & 506 deletions

File tree

Lines changed: 384 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,384 @@
1+
/*
2+
* Alix, A Lucene Indexer for XML documents.
3+
*
4+
* Copyright 2026 Frédéric Glorieux <frederic.glorieux@fictif.org> & Unige
5+
* Copyright 2016 Frédéric Glorieux <frederic.glorieux@fictif.org>
6+
* Copyright 2009 Pierre Dittgen <pierre@dittgen.org>
7+
* Frédéric Glorieux <frederic.glorieux@fictif.org>
8+
*
9+
* Alix is a java library to index and search XML text documents
10+
* with Lucene https://lucene.apache.org/core/
11+
* including linguistic expertness for French,
12+
* available under Apache license.
13+
*
14+
* Alix has been started in 2009 under the javacrim project
15+
* https://sf.net/projects/javacrim/
16+
* for a java course at Inalco http://www.er-tim.fr/
17+
* Alix continues the concepts of SDX under another licence
18+
* «Système de Documentation XML»
19+
* 2000-2010 Ministère de la culture et de la communication (France), AJLSM.
20+
* http://savannah.nongnu.org/projects/sdx/
21+
*
22+
* Licensed under the Apache License, Version 2.0 (the "License");
23+
* you may not use this file except in compliance with the License.
24+
* You may obtain a copy of the License at
25+
*
26+
* http://www.apache.org/licenses/LICENSE-2.0
27+
*
28+
* Unless required by applicable law or agreed to in writing, software
29+
* distributed under the License is distributed on an "AS IS" BASIS,
30+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31+
* See the License for the specific language governing permissions and
32+
* limitations under the License.
33+
*/
34+
package com.github.oeuvres.alix.lucene.analysis;
35+
36+
import static com.github.oeuvres.alix.common.Upos.*;
37+
38+
import java.io.IOException;
39+
import java.util.Objects;
40+
41+
import org.apache.lucene.analysis.CharArraySet;
42+
import org.apache.lucene.analysis.TokenFilter;
43+
import org.apache.lucene.analysis.TokenStream;
44+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
45+
46+
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.PosAttribute;
47+
48+
/**
49+
* Converts selected closing XML/HTML tags into synthetic structural boundary tokens
50+
* and drops all other markup tokens.
51+
*
52+
* <h2>Input contract</h2>
53+
* <ul>
54+
* <li>The upstream tokenizer emits tags as tokens whose {@link CharTermAttribute} contains the literal tag
55+
* (including {@code <} and {@code >}).</li>
56+
* <li>Tag tokens are identified by {@link PosAttribute#getPos()} == {@code XML.code}.</li>
57+
* <li>Non-tag tokens (visible text, punctuation, etc.) carry their usual offsets/positions.</li>
58+
* </ul>
59+
*
60+
* <h2>Behavior</h2>
61+
* <ul>
62+
* <li><b>All markup tokens are dropped</b>, except those mapped to boundaries.</li>
63+
* <li>On configured <b>closing tags</b> (e.g. {@code </p>}), emit a synthetic boundary token:
64+
* <ul>
65+
* <li>paragraph boundary: term {@value #PARA_MARK}, {@code PosAttribute = PUNCTpara.code}</li>
66+
* <li>section boundary: term {@value #SECTION_MARK}, {@code PosAttribute = PUNCTsection.code}</li>
67+
* </ul>
68+
* </li>
69+
* <li><b>Only closing tags</b> are considered for boundaries (no mapping on open/self-closing tags).</li>
70+
* <li><b>Local-name matching</b>: prefixes are ignored (e.g. {@code </tei:p>} matches {@code p}).</li>
71+
* <li><b>Coalescing</b>: consecutive boundary requests before any visible token are merged into one;
72+
* section wins over paragraph.</li>
73+
* </ul>
74+
*
75+
* <h2>Configuration</h2>
76+
* <p>
77+
* The constructor accepts two {@code |}-separated lists of element names:
78+
* </p>
79+
* <ul>
80+
* <li>{@code paraElements}: names whose closing tag triggers a paragraph boundary</li>
81+
* <li>{@code sectionElements}: names whose closing tag triggers a section boundary</li>
82+
* </ul>
83+
*
84+
* <pre>{@code
85+
* // Map </p>, </li>, </td>, </h1>.. to ¶, and </article>, </section> to §
86+
* TokenStream ts = new MarkupFilter(tokenizer, "p|li|td|h1|h2|h3", "article|section");
87+
* }</pre>
88+
*
89+
* <h2>Offsets and positions</h2>
90+
* <p>
91+
* Boundary tokens reuse the attribute state of the triggering close-tag token, and overwrite only:
92+
* {@link CharTermAttribute} and {@link PosAttribute}. This preserves offsets/position-increment coherence
93+
* according to what the tokenizer provided for the markup token.
94+
* </p>
95+
*/
96+
public final class MarkupBoundaryFilter extends TokenFilter
97+
{
98+
/** Synthetic term emitted for paragraph-like boundaries. */
99+
public static final String PARA_MARK = "¶";
100+
101+
/** Synthetic term emitted for section-like boundaries. */
102+
public static final String SECTION_MARK = "§";
103+
104+
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
105+
private final PosAttribute posAtt = addAttribute(PosAttribute.class);
106+
107+
private final CharArraySet paraOnClose;
108+
private final CharArraySet sectionOnClose;
109+
110+
/**
111+
* Pending structural boundary to emit before the next visible token (or at EOF).
112+
* Stores a POS code ({@code PUNCTpara.code} or {@code PUNCTsection.code}), or 0 for none.
113+
*/
114+
private int pendingBoundaryPos = 0;
115+
116+
/**
117+
* State captured from the triggering close-tag token so the synthetic boundary keeps coherent
118+
* offsets/positions from the source markup token.
119+
*/
120+
private State pendingBoundaryState = null;
121+
122+
/**
123+
* Buffered visible token that was read while a pending boundary still had to be emitted first.
124+
*/
125+
private State deferredVisibleToken = null;
126+
127+
// Defaults as readable strings (local-names, case-sensitive, alphabetic order)
128+
public static final String DEFAULT_PARA_ELEMENTS =
129+
"ab|address|blockquote|cell|dd|div|dt|h1|h2|h3|h4|h5|h6|head|item|l|label|li|p|pre|row|td|th|tr";
130+
131+
public static final String DEFAULT_SECTION_ELEMENTS =
132+
"article|back|body|chapter|div0|div1|div2|div3|div4|div5|div6|div7|front|group|main|section|text";
133+
134+
/** Default policy constructor: uses {@link #DEFAULT_PARA_ELEMENTS} and {@link #DEFAULT_SECTION_ELEMENTS}. */
135+
public MarkupBoundaryFilter(final TokenStream input) {
136+
this(input, DEFAULT_PARA_ELEMENTS, DEFAULT_SECTION_ELEMENTS);
137+
}
138+
139+
/**
140+
* @param input token stream (typically tokenizer output)
141+
* @param paraElements {@code |}-separated local-names mapped from close-tags to paragraph boundary (e.g. {@code "p|li|td|h1"})
142+
* @param sectionElements {@code |}-separated local-names mapped from close-tags to section boundary (e.g. {@code "article|section"})
143+
*/
144+
public MarkupBoundaryFilter(final TokenStream input, final String paraElements, final String sectionElements)
145+
{
146+
super(Objects.requireNonNull(input, "input"));
147+
this.paraOnClose = compileTagSet(paraElements);
148+
this.sectionOnClose = compileTagSet(sectionElements);
149+
}
150+
151+
@Override
152+
public boolean incrementToken() throws IOException
153+
{
154+
// 0) Drain deferred visible token first (if we emitted a boundary before it).
155+
if (deferredVisibleToken != null) {
156+
restoreState(deferredVisibleToken);
157+
deferredVisibleToken = null;
158+
return true;
159+
}
160+
161+
// 1) If a structural boundary is pending, emit it now.
162+
if (pendingBoundaryPos != 0) {
163+
emitPendingBoundary();
164+
return true;
165+
}
166+
167+
while (input.incrementToken()) {
168+
169+
final int pos = posAtt.getPos();
170+
final boolean isXml = (pos == XML.code);
171+
172+
// Visible token: emit unless we must emit a pending boundary first.
173+
if (!isXml) {
174+
if (pendingBoundaryPos != 0) {
175+
deferredVisibleToken = captureState();
176+
emitPendingBoundary();
177+
return true;
178+
}
179+
return true;
180+
}
181+
182+
// Tag token: classify and (maybe) map to boundary; otherwise drop.
183+
final char[] buf = termAtt.buffer();
184+
final int len = termAtt.length();
185+
186+
final TagKind kind = classifyTag(buf, len);
187+
if (kind != TagKind.CLOSE) {
188+
// Drop OPEN, DECL/COMMENT/PI, INVALID
189+
continue;
190+
}
191+
192+
final long span = readLocalTagNameSpan(buf, len, /*from*/2); // after "</"
193+
final int start = (int)(span >>> 32);
194+
final int end = (int)span;
195+
if (end <= start) continue;
196+
197+
final int nameLen = end - start;
198+
199+
// Section boundary wins over paragraph if both configured.
200+
if (sectionOnClose.contains(buf, start, nameLen)) {
201+
requestBoundary(PUNCTsection.code);
202+
continue;
203+
}
204+
if (paraOnClose.contains(buf, start, nameLen)) {
205+
requestBoundary(PUNCTpara.code);
206+
continue;
207+
}
208+
209+
// Default: drop tag token
210+
}
211+
212+
// EOF: still emit a pending boundary if one remains.
213+
if (pendingBoundaryPos != 0) {
214+
emitPendingBoundary();
215+
return true;
216+
}
217+
218+
return false;
219+
}
220+
221+
@Override
222+
public void reset() throws IOException
223+
{
224+
super.reset();
225+
pendingBoundaryPos = 0;
226+
pendingBoundaryState = null;
227+
deferredVisibleToken = null;
228+
}
229+
230+
@Override
231+
public void end() throws IOException
232+
{
233+
super.end();
234+
pendingBoundaryPos = 0;
235+
pendingBoundaryState = null;
236+
deferredVisibleToken = null;
237+
}
238+
239+
// -----------------------------------------------------------------------
240+
// Public helper (requested): compile tag-name lists
241+
// -----------------------------------------------------------------------
242+
243+
/**
244+
* Compile a {@code |}-separated list of tag local-names into a case-insensitive {@link CharArraySet}.
245+
* Empty/null input yields an empty set.
246+
*
247+
* <p>Accepted separators: {@code |} plus optional surrounding whitespace.</p>
248+
*/
249+
public static CharArraySet compileTagSet(final String names)
250+
{
251+
final CharArraySet set = new CharArraySet(16, true);
252+
if (names == null) return set;
253+
254+
int i = 0;
255+
final int n = names.length();
256+
while (i < n) {
257+
// skip spaces and separators
258+
while (i < n) {
259+
final char c = names.charAt(i);
260+
if (c == '|' || isWs(c)) { i++; continue; }
261+
break;
262+
}
263+
if (i >= n) break;
264+
265+
final int start = i;
266+
while (i < n) {
267+
final char c = names.charAt(i);
268+
if (c == '|') break;
269+
i++;
270+
}
271+
int end = i;
272+
// trim right
273+
while (end > start && isWs(names.charAt(end - 1))) end--;
274+
275+
if (end > start) {
276+
// store local-name only (strip any prefix the user might include)
277+
final int p = names.lastIndexOf(':', end - 1);
278+
final int ls = (p >= start) ? (p + 1) : start;
279+
if (end > ls) set.add(names.substring(ls, end));
280+
}
281+
}
282+
return set;
283+
}
284+
285+
private static boolean isWs(char c) {
286+
return c == ' ' || c == '\t' || c == '\n' || c == '\r';
287+
}
288+
289+
// -----------------------------------------------------------------------
290+
// Boundary handling (unchanged semantics)
291+
// -----------------------------------------------------------------------
292+
293+
/**
294+
* Registers a structural boundary to emit later.
295+
* Coalesces consecutive boundaries; section wins over paragraph.
296+
*/
297+
private void requestBoundary(final int posCode)
298+
{
299+
if (posCode != PUNCTpara.code && posCode != PUNCTsection.code) return;
300+
301+
if (pendingBoundaryPos == 0) {
302+
pendingBoundaryPos = posCode;
303+
pendingBoundaryState = captureState();
304+
return;
305+
}
306+
307+
// Coalesce: keep strongest boundary (section > paragraph).
308+
if (pendingBoundaryPos == PUNCTpara.code && posCode == PUNCTsection.code) {
309+
pendingBoundaryPos = posCode;
310+
pendingBoundaryState = captureState();
311+
}
312+
}
313+
314+
/**
315+
* Emits the currently pending structural boundary by restoring the state of the triggering
316+
* tag token and overwriting its term/POS with a synthetic boundary marker.
317+
*/
318+
private void emitPendingBoundary()
319+
{
320+
restoreState(pendingBoundaryState);
321+
pendingBoundaryState = null;
322+
323+
if (pendingBoundaryPos == PUNCTsection.code) {
324+
posAtt.setPos(PUNCTsection.code);
325+
termAtt.setEmpty().append(SECTION_MARK);
326+
}
327+
else {
328+
posAtt.setPos(PUNCTpara.code);
329+
termAtt.setEmpty().append(PARA_MARK);
330+
}
331+
332+
pendingBoundaryPos = 0;
333+
}
334+
335+
// -----------------------------------------------------------------------
336+
// Tag parsing helpers (allocation-free)
337+
// -----------------------------------------------------------------------
338+
339+
private enum TagKind { OPEN, CLOSE, DECL_OR_COMMENT, INVALID }
340+
341+
private static TagKind classifyTag(final char[] buf, final int len)
342+
{
343+
if (len < 3) return TagKind.INVALID;
344+
if (buf[0] != '<') return TagKind.INVALID;
345+
346+
final char c1 = buf[1];
347+
if (c1 == '/') return TagKind.CLOSE;
348+
if (c1 == '!' || c1 == '?') return TagKind.DECL_OR_COMMENT;
349+
return TagKind.OPEN;
350+
}
351+
352+
/**
353+
* Reads local-name span from a tag token.
354+
* @param from index right after '&lt;' (1) or '&lt;/' (2)
355+
* @return packed long: (start&lt;&lt;32) | end, end exclusive; or (0,0) on failure.
356+
*/
357+
private static long readLocalTagNameSpan(final char[] tag, final int n, final int from)
358+
{
359+
int i = from;
360+
while (i < n && isHtmlSpace(tag[i])) i++;
361+
if (i >= n) return 0L;
362+
363+
final int nameStart = i;
364+
while (i < n) {
365+
final char ch = tag[i];
366+
if (ch == '>' || ch == '/' || isHtmlSpace(ch)) break;
367+
i++;
368+
}
369+
final int nameEnd = i;
370+
if (nameEnd <= nameStart) return 0L;
371+
372+
// local-name after last ':'
373+
int localStart = nameStart;
374+
for (int k = nameStart; k < nameEnd; k++) {
375+
if (tag[k] == ':') localStart = k + 1;
376+
}
377+
return (((long)localStart) << 32) | (nameEnd & 0xFFFFFFFFL);
378+
}
379+
380+
private static boolean isHtmlSpace(final char c)
381+
{
382+
return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f';
383+
}
384+
}

0 commit comments

Comments
 (0)