1+ /*
2+ * Alix, A Lucene Indexer for XML documents.
3+ *
4+ * Copyright 2026 Frédéric Glorieux <frederic.glorieux@fictif.org> & Unige
5+ * Copyright 2016 Frédéric Glorieux <frederic.glorieux@fictif.org>
6+ * Copyright 2009 Pierre Dittgen <pierre@dittgen.org>
7+ * Frédéric Glorieux <frederic.glorieux@fictif.org>
8+ *
9+ * Alix is a java library to index and search XML text documents
10+ * with Lucene https://lucene.apache.org/core/
11+ * including linguistic expertness for French,
12+ * available under Apache license.
13+ *
14+ * Alix has been started in 2009 under the javacrim project
15+ * https://sf.net/projects/javacrim/
16+ * for a java course at Inalco http://www.er-tim.fr/
17+ * Alix continues the concepts of SDX under another licence
18+ * «Système de Documentation XML»
19+ * 2000-2010 Ministère de la culture et de la communication (France), AJLSM.
20+ * http://savannah.nongnu.org/projects/sdx/
21+ *
22+ * Licensed under the Apache License, Version 2.0 (the "License");
23+ * you may not use this file except in compliance with the License.
24+ * You may obtain a copy of the License at
25+ *
26+ * http://www.apache.org/licenses/LICENSE-2.0
27+ *
28+ * Unless required by applicable law or agreed to in writing, software
29+ * distributed under the License is distributed on an "AS IS" BASIS,
30+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31+ * See the License for the specific language governing permissions and
32+ * limitations under the License.
33+ */
34+ package com .github .oeuvres .alix .lucene .analysis ;
35+
36+ import static com .github .oeuvres .alix .common .Upos .*;
37+
38+ import java .io .IOException ;
39+ import java .util .Objects ;
40+
41+ import org .apache .lucene .analysis .CharArraySet ;
42+ import org .apache .lucene .analysis .TokenFilter ;
43+ import org .apache .lucene .analysis .TokenStream ;
44+ import org .apache .lucene .analysis .tokenattributes .CharTermAttribute ;
45+
46+ import com .github .oeuvres .alix .lucene .analysis .tokenattributes .PosAttribute ;
47+
48+ /**
49+ * Converts selected closing XML/HTML tags into synthetic structural boundary tokens
50+ * and drops all other markup tokens.
51+ *
52+ * <h2>Input contract</h2>
53+ * <ul>
54+ * <li>The upstream tokenizer emits tags as tokens whose {@link CharTermAttribute} contains the literal tag
55+ * (including {@code <} and {@code >}).</li>
56+ * <li>Tag tokens are identified by {@link PosAttribute#getPos()} == {@code XML.code}.</li>
57+ * <li>Non-tag tokens (visible text, punctuation, etc.) carry their usual offsets/positions.</li>
58+ * </ul>
59+ *
60+ * <h2>Behavior</h2>
61+ * <ul>
62+ * <li><b>All markup tokens are dropped</b>, except those mapped to boundaries.</li>
63+ * <li>On configured <b>closing tags</b> (e.g. {@code </p>}), emit a synthetic boundary token:
64+ * <ul>
65+ * <li>paragraph boundary: term {@value #PARA_MARK}, {@code PosAttribute = PUNCTpara.code}</li>
66+ * <li>section boundary: term {@value #SECTION_MARK}, {@code PosAttribute = PUNCTsection.code}</li>
67+ * </ul>
68+ * </li>
69+ * <li><b>Only closing tags</b> are considered for boundaries (no mapping on open/self-closing tags).</li>
70+ * <li><b>Local-name matching</b>: prefixes are ignored (e.g. {@code </tei:p>} matches {@code p}).</li>
71+ * <li><b>Coalescing</b>: consecutive boundary requests before any visible token are merged into one;
72+ * section wins over paragraph.</li>
73+ * </ul>
74+ *
75+ * <h2>Configuration</h2>
76+ * <p>
77+ * The constructor accepts two {@code |}-separated lists of element names:
78+ * </p>
79+ * <ul>
80+ * <li>{@code paraElements}: names whose closing tag triggers a paragraph boundary</li>
81+ * <li>{@code sectionElements}: names whose closing tag triggers a section boundary</li>
82+ * </ul>
83+ *
84+ * <pre>{@code
85+ * // Map </p>, </li>, </td>, </h1>.. to ¶, and </article>, </section> to §
 * TokenStream ts = new MarkupBoundaryFilter(tokenizer, "p|li|td|h1|h2|h3", "article|section");
87+ * }</pre>
88+ *
89+ * <h2>Offsets and positions</h2>
90+ * <p>
91+ * Boundary tokens reuse the attribute state of the triggering close-tag token, and overwrite only:
92+ * {@link CharTermAttribute} and {@link PosAttribute}. This preserves offsets/position-increment coherence
93+ * according to what the tokenizer provided for the markup token.
94+ * </p>
95+ */
96+ public final class MarkupBoundaryFilter extends TokenFilter
97+ {
98+ /** Synthetic term emitted for paragraph-like boundaries. */
99+ public static final String PARA_MARK = "¶" ;
100+
101+ /** Synthetic term emitted for section-like boundaries. */
102+ public static final String SECTION_MARK = "§" ;
103+
104+ private final CharTermAttribute termAtt = addAttribute (CharTermAttribute .class );
105+ private final PosAttribute posAtt = addAttribute (PosAttribute .class );
106+
107+ private final CharArraySet paraOnClose ;
108+ private final CharArraySet sectionOnClose ;
109+
110+ /**
111+ * Pending structural boundary to emit before the next visible token (or at EOF).
112+ * Stores a POS code ({@code PUNCTpara.code} or {@code PUNCTsection.code}), or 0 for none.
113+ */
114+ private int pendingBoundaryPos = 0 ;
115+
116+ /**
117+ * State captured from the triggering close-tag token so the synthetic boundary keeps coherent
118+ * offsets/positions from the source markup token.
119+ */
120+ private State pendingBoundaryState = null ;
121+
122+ /**
123+ * Buffered visible token that was read while a pending boundary still had to be emitted first.
124+ */
125+ private State deferredVisibleToken = null ;
126+
127+ // Defaults as readable strings (local-names, case-sensitive, alphabetic order)
128+ public static final String DEFAULT_PARA_ELEMENTS =
129+ "ab|address|blockquote|cell|dd|div|dt|h1|h2|h3|h4|h5|h6|head|item|l|label|li|p|pre|row|td|th|tr" ;
130+
131+ public static final String DEFAULT_SECTION_ELEMENTS =
132+ "article|back|body|chapter|div0|div1|div2|div3|div4|div5|div6|div7|front|group|main|section|text" ;
133+
134+ /** Default policy constructor: uses {@link #DEFAULT_PARA_ELEMENTS} and {@link #DEFAULT_SECTION_ELEMENTS}. */
135+ public MarkupBoundaryFilter (final TokenStream input ) {
136+ this (input , DEFAULT_PARA_ELEMENTS , DEFAULT_SECTION_ELEMENTS );
137+ }
138+
139+ /**
140+ * @param input token stream (typically tokenizer output)
141+ * @param paraElements {@code |}-separated local-names mapped from close-tags to paragraph boundary (e.g. {@code "p|li|td|h1"})
142+ * @param sectionElements {@code |}-separated local-names mapped from close-tags to section boundary (e.g. {@code "article|section"})
143+ */
144+ public MarkupBoundaryFilter (final TokenStream input , final String paraElements , final String sectionElements )
145+ {
146+ super (Objects .requireNonNull (input , "input" ));
147+ this .paraOnClose = compileTagSet (paraElements );
148+ this .sectionOnClose = compileTagSet (sectionElements );
149+ }
150+
151+ @ Override
152+ public boolean incrementToken () throws IOException
153+ {
154+ // 0) Drain deferred visible token first (if we emitted a boundary before it).
155+ if (deferredVisibleToken != null ) {
156+ restoreState (deferredVisibleToken );
157+ deferredVisibleToken = null ;
158+ return true ;
159+ }
160+
161+ // 1) If a structural boundary is pending, emit it now.
162+ if (pendingBoundaryPos != 0 ) {
163+ emitPendingBoundary ();
164+ return true ;
165+ }
166+
167+ while (input .incrementToken ()) {
168+
169+ final int pos = posAtt .getPos ();
170+ final boolean isXml = (pos == XML .code );
171+
172+ // Visible token: emit unless we must emit a pending boundary first.
173+ if (!isXml ) {
174+ if (pendingBoundaryPos != 0 ) {
175+ deferredVisibleToken = captureState ();
176+ emitPendingBoundary ();
177+ return true ;
178+ }
179+ return true ;
180+ }
181+
182+ // Tag token: classify and (maybe) map to boundary; otherwise drop.
183+ final char [] buf = termAtt .buffer ();
184+ final int len = termAtt .length ();
185+
186+ final TagKind kind = classifyTag (buf , len );
187+ if (kind != TagKind .CLOSE ) {
188+ // Drop OPEN, DECL/COMMENT/PI, INVALID
189+ continue ;
190+ }
191+
192+ final long span = readLocalTagNameSpan (buf , len , /*from*/ 2 ); // after "</"
193+ final int start = (int )(span >>> 32 );
194+ final int end = (int )span ;
195+ if (end <= start ) continue ;
196+
197+ final int nameLen = end - start ;
198+
199+ // Section boundary wins over paragraph if both configured.
200+ if (sectionOnClose .contains (buf , start , nameLen )) {
201+ requestBoundary (PUNCTsection .code );
202+ continue ;
203+ }
204+ if (paraOnClose .contains (buf , start , nameLen )) {
205+ requestBoundary (PUNCTpara .code );
206+ continue ;
207+ }
208+
209+ // Default: drop tag token
210+ }
211+
212+ // EOF: still emit a pending boundary if one remains.
213+ if (pendingBoundaryPos != 0 ) {
214+ emitPendingBoundary ();
215+ return true ;
216+ }
217+
218+ return false ;
219+ }
220+
221+ @ Override
222+ public void reset () throws IOException
223+ {
224+ super .reset ();
225+ pendingBoundaryPos = 0 ;
226+ pendingBoundaryState = null ;
227+ deferredVisibleToken = null ;
228+ }
229+
230+ @ Override
231+ public void end () throws IOException
232+ {
233+ super .end ();
234+ pendingBoundaryPos = 0 ;
235+ pendingBoundaryState = null ;
236+ deferredVisibleToken = null ;
237+ }
238+
239+ // -----------------------------------------------------------------------
240+ // Public helper (requested): compile tag-name lists
241+ // -----------------------------------------------------------------------
242+
243+ /**
244+ * Compile a {@code |}-separated list of tag local-names into a case-insensitive {@link CharArraySet}.
245+ * Empty/null input yields an empty set.
246+ *
247+ * <p>Accepted separators: {@code |} plus optional surrounding whitespace.</p>
248+ */
249+ public static CharArraySet compileTagSet (final String names )
250+ {
251+ final CharArraySet set = new CharArraySet (16 , true );
252+ if (names == null ) return set ;
253+
254+ int i = 0 ;
255+ final int n = names .length ();
256+ while (i < n ) {
257+ // skip spaces and separators
258+ while (i < n ) {
259+ final char c = names .charAt (i );
260+ if (c == '|' || isWs (c )) { i ++; continue ; }
261+ break ;
262+ }
263+ if (i >= n ) break ;
264+
265+ final int start = i ;
266+ while (i < n ) {
267+ final char c = names .charAt (i );
268+ if (c == '|' ) break ;
269+ i ++;
270+ }
271+ int end = i ;
272+ // trim right
273+ while (end > start && isWs (names .charAt (end - 1 ))) end --;
274+
275+ if (end > start ) {
276+ // store local-name only (strip any prefix the user might include)
277+ final int p = names .lastIndexOf (':' , end - 1 );
278+ final int ls = (p >= start ) ? (p + 1 ) : start ;
279+ if (end > ls ) set .add (names .substring (ls , end ));
280+ }
281+ }
282+ return set ;
283+ }
284+
285+ private static boolean isWs (char c ) {
286+ return c == ' ' || c == '\t' || c == '\n' || c == '\r' ;
287+ }
288+
289+ // -----------------------------------------------------------------------
290+ // Boundary handling (unchanged semantics)
291+ // -----------------------------------------------------------------------
292+
293+ /**
294+ * Registers a structural boundary to emit later.
295+ * Coalesces consecutive boundaries; section wins over paragraph.
296+ */
297+ private void requestBoundary (final int posCode )
298+ {
299+ if (posCode != PUNCTpara .code && posCode != PUNCTsection .code ) return ;
300+
301+ if (pendingBoundaryPos == 0 ) {
302+ pendingBoundaryPos = posCode ;
303+ pendingBoundaryState = captureState ();
304+ return ;
305+ }
306+
307+ // Coalesce: keep strongest boundary (section > paragraph).
308+ if (pendingBoundaryPos == PUNCTpara .code && posCode == PUNCTsection .code ) {
309+ pendingBoundaryPos = posCode ;
310+ pendingBoundaryState = captureState ();
311+ }
312+ }
313+
314+ /**
315+ * Emits the currently pending structural boundary by restoring the state of the triggering
316+ * tag token and overwriting its term/POS with a synthetic boundary marker.
317+ */
318+ private void emitPendingBoundary ()
319+ {
320+ restoreState (pendingBoundaryState );
321+ pendingBoundaryState = null ;
322+
323+ if (pendingBoundaryPos == PUNCTsection .code ) {
324+ posAtt .setPos (PUNCTsection .code );
325+ termAtt .setEmpty ().append (SECTION_MARK );
326+ }
327+ else {
328+ posAtt .setPos (PUNCTpara .code );
329+ termAtt .setEmpty ().append (PARA_MARK );
330+ }
331+
332+ pendingBoundaryPos = 0 ;
333+ }
334+
335+ // -----------------------------------------------------------------------
336+ // Tag parsing helpers (allocation-free)
337+ // -----------------------------------------------------------------------
338+
339+ private enum TagKind { OPEN , CLOSE , DECL_OR_COMMENT , INVALID }
340+
341+ private static TagKind classifyTag (final char [] buf , final int len )
342+ {
343+ if (len < 3 ) return TagKind .INVALID ;
344+ if (buf [0 ] != '<' ) return TagKind .INVALID ;
345+
346+ final char c1 = buf [1 ];
347+ if (c1 == '/' ) return TagKind .CLOSE ;
348+ if (c1 == '!' || c1 == '?' ) return TagKind .DECL_OR_COMMENT ;
349+ return TagKind .OPEN ;
350+ }
351+
352+ /**
353+ * Reads local-name span from a tag token.
354+ * @param from index right after '<' (1) or '</' (2)
355+ * @return packed long: (start<<32) | end, end exclusive; or (0,0) on failure.
356+ */
357+ private static long readLocalTagNameSpan (final char [] tag , final int n , final int from )
358+ {
359+ int i = from ;
360+ while (i < n && isHtmlSpace (tag [i ])) i ++;
361+ if (i >= n ) return 0L ;
362+
363+ final int nameStart = i ;
364+ while (i < n ) {
365+ final char ch = tag [i ];
366+ if (ch == '>' || ch == '/' || isHtmlSpace (ch )) break ;
367+ i ++;
368+ }
369+ final int nameEnd = i ;
370+ if (nameEnd <= nameStart ) return 0L ;
371+
372+ // local-name after last ':'
373+ int localStart = nameStart ;
374+ for (int k = nameStart ; k < nameEnd ; k ++) {
375+ if (tag [k ] == ':' ) localStart = k + 1 ;
376+ }
377+ return (((long )localStart ) << 32 ) | (nameEnd & 0xFFFFFFFFL );
378+ }
379+
380+ private static boolean isHtmlSpace (final char c )
381+ {
382+ return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' ;
383+ }
384+ }
0 commit comments