Skip to content

Commit a872457

Browse files
committed
First draft of alix xml ingester
1 parent 6342515 commit a872457

8 files changed

Lines changed: 174 additions & 539796 deletions

File tree

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/PosTaggingFilter.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,10 @@ private void tagBufferedQueue()
250250
final int pos = p.getPos();
251251

252252
// Skip structural XML tags entirely (1 -> 0)
253-
if (pos == XML.code) continue;
253+
if (pos == XML.code) {
254+
probAtt.setProb(1);
255+
continue;
256+
}
254257

255258
// Sentence boundaries are submitted as punctuation token (1 -> 1)
256259
if (isSentenceBoundary(pos)) {

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/tokenattributes/ProbAttributeImpl.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import org.apache.lucene.util.AttributeReflector;
55

66
public final class ProbAttributeImpl extends AttributeImpl implements ProbAttribute {
7-
private static double UNKNOWN = -1;
7+
private static double UNKNOWN = 0;
88
private double prob = UNKNOWN;
99

1010

test/src/main/java/com/github/oeuvres/alix/lucene/analysis/AnalysisDemoHelper.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,10 @@ public static void dump(final TokenStream ts, final String input) throws IOExcep
8787
final int start = offAtt.startOffset();
8888
final int end = offAtt.endOffset();
8989
final int pos = (posAtt != null)?posAtt.getPos():0;
90-
final double prob = (probAtt != null)?probAtt.getProb():-1;
90+
final String prob = (probAtt != null)?String.format(java.util.Locale.ROOT, "%.5f",probAtt.getProb()):"";
9191

9292
System.out.printf(
93-
"%5d\t[%d,%d)\t|%s|\t%s\t%s\t%s\t%.5f%n",
93+
"%5d\t[%d,%d)\t|%s|\t%s\t%s\t%s\t%s%n",
9494
i++,
9595
start, end,
9696
safeSlice(input, start, end),

test/src/main/java/com/github/oeuvres/alix/lucene/analysis/MLTokenizerBenchmark.java

Lines changed: 0 additions & 201 deletions
This file was deleted.
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
package com.github.oeuvres.alix.lucene.analysis;
2+
3+
import java.io.BufferedReader;
4+
import java.io.IOException;
5+
import java.nio.charset.StandardCharsets;
6+
import java.nio.file.Files;
7+
import java.nio.file.Path;
8+
import java.nio.file.Paths;
9+
10+
import org.apache.lucene.analysis.Analyzer;
11+
import org.apache.lucene.analysis.CharArrayMap;
12+
import org.apache.lucene.analysis.TokenStream;
13+
import org.apache.lucene.analysis.Tokenizer;
14+
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
15+
16+
public class MLTokenizerDemo
17+
{
18+
/** Minimal Analyzer for StandardTokenizer ->TermReplaceFilter. */
19+
private static Analyzer buildAnalyzer() {
20+
return new Analyzer() {
21+
@Override
22+
protected TokenStreamComponents createComponents(String fieldName) {
23+
Tokenizer tokenizer = new MLTokenizer();
24+
TokenStream stream = tokenizer;
25+
return new TokenStreamComponents(tokenizer, stream);
26+
}
27+
};
28+
}
29+
30+
static public void main(String[] args) throws IOException
31+
{
32+
Path path = Paths.get("src/test/test-data/ingest.html");
33+
String input = Files.readString(path, StandardCharsets.UTF_8);
34+
try (
35+
Analyzer analyzer = buildAnalyzer();
36+
BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)
37+
){
38+
TokenStream ts = analyzer.tokenStream("contents", reader);
39+
AnalysisDemoHelper.dump(ts, input);
40+
}
41+
}
42+
}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
package com.github.oeuvres.alix.lucene.index;
2+
3+
import java.io.FileInputStream;
4+
import java.io.InputStream;
5+
import java.nio.file.Files;
6+
import java.nio.file.Path;
7+
import java.nio.file.Paths;
8+
9+
public final class AlixSaxParserSinkDebug implements AlixSaxParser.AlixSink {
10+
11+
@Override
12+
public void startUnit(AlixSaxParser.Unit unit) {
13+
System.out.printf("START %-7s id=%s%n", unit.kind(), unit.xmlId());
14+
}
15+
16+
@Override
17+
public void field(AlixSaxParser.Unit unit, AlixSaxParser.FieldSpec f) {
18+
System.out.printf(" FIELD %-12s type=%-8s analyzer=%-12s",
19+
f.name(), f.type().name().toLowerCase(), f.analyzerHint());
20+
21+
if (f.value() != null) {
22+
System.out.printf(" value=%s", shortStr(f.value()));
23+
}
24+
if (f.contentXml() != null) {
25+
System.out.printf(" contentXml=%s", shortStr(f.contentXml()));
26+
}
27+
if (f.source() != null) {
28+
System.out.printf(" source=%s selectors=%d", f.source(), f.selectors().size());
29+
}
30+
System.out.println();
31+
32+
for (var s : f.selectors()) {
33+
System.out.printf(" %s element=%s attribute=%s value=%s%n",
34+
s.mode().name().toLowerCase(), s.element(), s.attribute(), s.value());
35+
}
36+
}
37+
38+
@Override
39+
public void endUnit(AlixSaxParser.Unit unit) {
40+
System.out.printf("END %-7s id=%s%n", unit.kind(), unit.xmlId());
41+
}
42+
43+
private static String shortStr(String s) {
44+
if (s == null) return "null";
45+
s = s.replace('\n', ' ').replace('\r', ' ');
46+
return (s.length() > 90) ? s.substring(0, 87) + "..." : s;
47+
}
48+
49+
public static void main(String[] args) throws Exception {
50+
// final String res = "/ingest/test-alix.xml";
51+
Path path = Paths.get("src/main/resources/ingest/test-alix.xml");
52+
System.out.println(path.toAbsolutePath());
53+
// try (InputStream in = Thread.currentThread().getContextClassLoader() .getResourceAsStream(res)) {
54+
try (InputStream in = Files.newInputStream(path)) {
55+
if (in == null) throw new IllegalStateException(path + ": resource not found");
56+
AlixSaxParser.parse(in, new AlixSaxParserSinkDebug());
57+
}
58+
}
59+
}

0 commit comments

Comments
 (0)