oeuvres
diff --git a/‎alix-cli/lib/alix-analysis-1.0.0.jar‎
12.2 KB b/‎alix-cli/lib/alix-analysis-1.0.0.jar‎
12.2 KB
diff --git a/‎alix-cli/lib/alix-common-1.0.0.jar‎
3.39 KB b/‎alix-cli/lib/alix-common-1.0.0.jar‎
3.39 KB
diff --git a/‎alix-cli/lib/alix-fr-1.0.0.jar‎
467 Bytes b/‎alix-cli/lib/alix-fr-1.0.0.jar‎
467 Bytes
diff --git a/‎alix-cli/lib/alix-util-1.0.0.jar‎
28.5 KB b/‎alix-cli/lib/alix-util-1.0.0.jar‎
28.5 KB
diff --git a/‎alix-cli/lib/lucene-analysis-common-10.2.0.jar‎
1.67 MB b/‎alix-cli/lib/lucene-analysis-common-10.2.0.jar‎
1.67 MB
diff --git a/‎alix-cli/lib/lucene-core-10.2.0.jar‎
4.23 MB b/‎alix-cli/lib/lucene-core-10.2.0.jar‎
4.23 MB
diff --git a/‎analysis/src/java/com/github/oeuvres/alix/lucene/analysis/FrDics.java‎
Lines changed: 18 additions & 13 deletions b/‎analysis/src/java/com/github/oeuvres/alix/lucene/analysis/FrDics.java‎
Lines changed: 18 additions & 13 deletions
diff --git a/‎analysis/src/java/com/github/oeuvres/alix/lucene/analysis/TokenizerML.java‎
Lines changed: 58 additions & 16 deletions b/‎analysis/src/java/com/github/oeuvres/alix/lucene/analysis/TokenizerML.java‎
Lines changed: 58 additions & 16 deletions
diff --git a/‎analysis/src/java/com/github/oeuvres/alix/lucene/index/Analyze4vec.java‎
Lines changed: 4 additions & 0 deletions b/‎analysis/src/java/com/github/oeuvres/alix/lucene/index/Analyze4vec.java‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎analysis/src/java/com/github/oeuvres/alix/lucene/index/ListErrors.java‎
Lines changed: 23 additions & 8 deletions b/‎analysis/src/java/com/github/oeuvres/alix/lucene/index/ListErrors.java‎
Lines changed: 23 additions & 8 deletions
@@ -284,7 +284,7 @@ synchronized static public void load(final String name, final Reader reader, boo
         loaded.add(name);
         CSVReader csv = null;
         try {
-            csv = new CSVReader(reader, 4);
+            csv = new CSVReader(reader, 4, ',');
             csv.readRow(); // skip first line
             Row row;
             while ((row = csv.readRow()) != null) {
@@ -301,20 +301,10 @@ synchronized static public void load(final String name, final Reader reader, boo
                     decompose(graph, TREELOC);
                 }
                 // known abbreviation with at least one final dot, add the compounds
-                // do not handle multi word abbreviation like "av. J.-C."
+                // multi-word abbreviations like "av. J.-C." are not handled here
                 Chain norm = row.get(NORM);
                 if (!hasSpace && graph.last() == '.') {
-                    // if multiple dots like U.S.A., add U., U.S., and U.S.A.
-                    for (int length = 2; length <= graph.length() ; length++) {
-                        if (graph.charAt(length - 1) != '.') continue;
-                        CharsAttImpl key = new CharsAttImpl(graph, 0, length);
-                        BREVIDOT.add(key);
-                    }
-                    if (!norm.isEmpty()) {
-                        NORMALIZE.put(new CharsAttImpl(graph), new CharsAttImpl(norm));
-                    }
-                    // do not add brevidots to dico ? 
-                    continue; 
+                    BREVIDOT.add(new CharsAttImpl(graph));
                 }
                 // check if it is normalization
                 if (!norm.isEmpty()) {
@@ -388,6 +378,21 @@ public static boolean norm(CharsAtt att)
         att.setEmpty().append(val);
         return true;
     }
+    
+    /**
+     * Get the normalized orthographic form of a real graphical form found in text, and write it to {@code dst}.
+     * 
+     * @param test {@link CharsAtt} implementation, normalized.
+     * @return true if a normalization has been done, false otherwise.
+     */
+    public static boolean norm(final CharsAtt test, final CharTermAttribute dst)
+    {
+        CharsAtt val = NORMALIZE.get(test);
+        if (val == null)
+            return false;
+        dst.setEmpty().append(val);
+        return true;
+    }
 
     /**
      * Get a dictionary entry from the word dictionary
 
@@ -1,6 +1,8 @@
 package com.github.oeuvres.alix.lucene.analysis;
 
 import java.io.IOException;
+import java.util.Map;
+import java.util.Set;
 
 import org.apache.lucene.analysis.CharacterUtils;
 import org.apache.lucene.analysis.Tokenizer;
@@ -88,14 +90,20 @@ public class TokenizerML  extends Tokenizer
     private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
     /** Used as char[] wrapper for testing */
     private final CharsAttImpl test = new CharsAttImpl();
-    /** Buffer of chars */
-    private final CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(8192);
+    /** Buffer of chars, given a big size to avoid problems (presumably when going back in the buffer; TODO confirm) */
+    private final CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(2 * 1024 * 1024);
     /** Position in buffer */
     private int bufferIndex = 0;
     /** size of buffer*/
     private int  bufferLen = 0;
     /** current char offset */
     private int offset = 0;
+    /** XML entities */
+    static final Map<String, String> XML_ENT = Map.ofEntries(
+        Map.entry("gt", ">"),
+        Map.entry("lt", "<"),
+        Map.entry("amp", "&")
+    );
 
     /**
      * Build a Tokenizer for Markup tagged text.
@@ -111,7 +119,7 @@ public final boolean incrementToken() throws IOException
         // flags
         boolean intag = false;
         boolean number = false;
-        
+        int amp = -1; // check XML entities
         // Mandatory start of a term
         int startOffset = -1;
         char lastChar;
@@ -136,12 +144,12 @@ public final boolean incrementToken() throws IOException
             }
             // if no luck, an attempt to go back in the buffer can give a negative index
             if (bufferIndex < 0) {
-                // System.out.println(buffer.getBuffer());
                 bufferIndex = 0;
             }
             lastChar = c;
             c = buffer.getBuffer()[bufferIndex];
-            // default, go next
+            // record an event for entities
+            if (c == '&') amp = termAtt.length();
 
             // start of a tag, do not advance cursors
             if (c == '<') {
@@ -216,9 +224,24 @@ else if (Char.isToken(c)) {
                     break; // a too big token stop
                 }
             }
-            // possible entity
-            else if (c == ';' && termAtt.charAt(0) == '&') {
-                // TODO !
+            // xml entity, handle case like -&gt;
+            else if (c == ';' && amp >= 0) {
+                termAtt.append(c);
+                final int lim = termAtt.length() - 2 - amp;
+                for (var entry : XML_ENT.entrySet()) {
+                    String key = entry.getKey();
+                    if (key.length() != lim) continue;
+                    int pos = 0;
+                    for (; pos < lim; pos++) {
+                        if (termAtt.charAt(pos + amp + 1) != key.charAt(pos)) break;
+                    }
+                    // entity seems found here
+                    if (pos == lim) {
+                        termAtt.setLength(amp).append(entry.getValue());
+                        break;
+                    }
+                }
+                amp = -1;
             }
             // Clause punctuation, send a punctuation event to separate tokens
             else if (',' == c || ';' == c || ':' == c || '(' == c || ')' == c || '—' == c || '–' == c 
@@ -236,17 +259,12 @@ else if (Char.isToken(c)) {
                 offset++;
                 break;
             }
-            // abbreviation ?
+            // complex case, the dot and abbreviations, append and let next filter define what to do
             else if (c == '.' && Char.isLetter(lastChar) ) {
                 termAtt.append(c);
-                // not an abbreviaiton, send without dot
-                if (!FrDics.isBrevidot( test.wrap(termAtt.buffer(), termAtt.length()) )) {
-                    termAtt.setLength(termAtt.length() - 1);
-                    break;
-                }
             }
             // Possible sentence delimiters
-            else if (c == '.' || c == '…' || c == '?' || c == '!' ) {
+            else if ( c == '.' || c == '…' || c == '?' || c == '!' ) {
                 // if pending word, send, and come back later
                 if (!termAtt.isEmpty() && lastChar != '.' && lastChar != '?' && lastChar != '!') {
                     break;
@@ -258,13 +276,37 @@ else if (c == '.' || c == '…' || c == '?' || c == '!' ) {
                 }
                 termAtt.append(c);
             }
-            // not token char, token to send
+            // not token char, token to send ?
             else if (!termAtt.isEmpty()) {
                 break;
             }
             bufferIndex++;
             offset++;
         }
+        // final dot special case
+        int len = termAtt.length();
+        if (Char.isLetter(termAtt.charAt(0)) && termAtt.charAt(len - 1) == '.') {
+            test.wrap(termAtt.buffer(), termAtt.length());
+            if (FrDics.isBrevidot(test) ) {
+                // maybe normalize now
+                FrDics.norm(test, termAtt);
+            }
+            // one letter, abbreviation
+            else if (termAtt.length() == 2) {
+                
+            }
+            // go back in buffer to restart from the first dot, remember the "..." case
+            else {
+                while (termAtt.charAt(len - 1) == '.') {
+                    len--;
+                    bufferIndex--;
+                    offset--;
+                    endOffset = offset;
+                }
+                termAtt.setLength(len);
+            }
+        }
+        
         // here, a term should be ready, send it
         posIncAtt.setPositionIncrement(1);
         posLenAtt.setPositionLength(1);
 
@@ -100,6 +100,10 @@ else if (flags == PUNsent.code) {
             else if (PUN.isPun(flags)) {
                 continue;
             }
+            // unknown
+            else if (flags == TOKEN.code()) {
+                continue;
+            }
             else {
                 char[] chars = termAtt.buffer();
                 final int len = termAtt.length();
 
@@ -64,14 +64,17 @@ public Integer call() throws Exception
             }
         }
         else {
-            String text = "Mais cela ne signifie naturellement pas qu’il sache d’emblée composer les dépassements entre eux (Δ<hi>xz</hi> = Δ<hi>xy</hi> + Δ<hi>yz</hi>) et, comme on le verra sous 2), il est au contraire probable qu’aux débuts un plus grand dépassement, et même un dépassement égal mais entre éléments plus grands, leur paraissent d’une autre nature qu’un dépassement entre petits éléments.";
+            String text = "Mais cela ne signifie naturellement pas qu’il sache d’emblée composer les dépassements &gt; entre eux (Δ<hi>xz</hi> = Δ<hi>xy</hi> + Δ<hi>yz</hi>) &amp;, comme on le verra sous 2), il est au contraire probable qu’aux débuts un plus grand dépassement, et même un dépassement égal mais entre éléments plus grands, leur paraissent d’une autre nature qu’un dépassement entre petits éléments.";
             analyze(analyzer.tokenStream("", new StringReader(text)));
         }
 
 
         analyzer.close();
         Top<Chain> top = new Top<Chain>(Chain.class, 2000);
         for (Entry<Chain, IntMutable> entry: errors.entrySet()) {
+            if (entry.getKey().equals("Ad*")) {
+                System.out.println(entry);
+            }
             top.insert(entry.getValue().value(), entry.getKey());
         }
 
@@ -113,8 +116,8 @@ private void analyze(final TokenStream tokenStream) throws IOException
                 continue;
             }
             charsAtt.wrap(termAtt.buffer(), termAtt.length());
+            FrDics.norm(charsAtt);
             if (Char.isUpperCase(charsAtt.charAt(0))) {
-                FrDics.norm(charsAtt);
                 if (FrDics.name(charsAtt) != null) {
                     up();
                     continue;
@@ -126,11 +129,28 @@ private void analyze(final TokenStream tokenStream) throws IOException
                     continue;
                 }
                 charsAtt.toLower();
+                if (FrDics.word(charsAtt) != null) {
+                    up();
+                    continue;
+                }
+                // candidate name, let it pass
+                // charsAtt.capitalize();
+                up();
+                continue;
+            }
+            else if (FrDics.word(charsAtt) != null) {
+                up();
+                continue;
             }
-            if (FrDics.word(charsAtt) != null) {
+            // variables
+            if (
+                charsAtt.length() == 1
+             || (charsAtt.length() == 2 && (charsAtt.charAt(1) == '\'' || charsAtt.charAt(1) == '.' || Char.isDigit(charsAtt.charAt(1))))
+            ) {
                 up();
                 continue;
             }
+            
             if (!form.isEmpty()) form.append(' ');
             form.append(termAtt);
         }
@@ -173,13 +193,8 @@ public TokenStreamComponents createComponents(String field)
      */
     public static void main(String[] args) throws Exception
     {
-        /*
         int exitCode = new CommandLine(new ListErrors()).execute(args);
         System.exit(exitCode);
-        */
-        CharTermAttribute term = new CharTermAttributeImpl();
-        term.append("test");
-        System.out.println(term.equals("test"));
     }
 
 }