oeuvres
diff --git a/‎alix-cli/lib/alix-analysis-1.0.0.jar‎
661 Bytes b/‎alix-cli/lib/alix-analysis-1.0.0.jar‎
661 Bytes
diff --git a/‎alix-cli/lib/alix-common-1.0.0.jar‎
-144 Bytes b/‎alix-cli/lib/alix-common-1.0.0.jar‎
-144 Bytes
diff --git a/‎alix-cli/lib/alix-search-1.0.0.jar‎
1.13 KB b/‎alix-cli/lib/alix-search-1.0.0.jar‎
1.13 KB
diff --git a/‎analysis/pom.xml‎
Lines changed: 5 additions & 0 deletions b/‎analysis/pom.xml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎analysis/src/java/com/github/oeuvres/alix/lucene/analysis/FilterCloud.java‎
Lines changed: 36 additions & 20 deletions b/‎analysis/src/java/com/github/oeuvres/alix/lucene/analysis/FilterCloud.java‎
Lines changed: 36 additions & 20 deletions
diff --git a/‎analysis/src/java/com/github/oeuvres/alix/lucene/index/Cli.java‎
Lines changed: 10 additions & 0 deletions b/‎analysis/src/java/com/github/oeuvres/alix/lucene/index/Cli.java‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎analysis/src/java/com/github/oeuvres/alix/lucene/index/Keywords.java‎
Lines changed: 44 additions & 13 deletions b/‎analysis/src/java/com/github/oeuvres/alix/lucene/index/Keywords.java‎
Lines changed: 44 additions & 13 deletions
diff --git a/‎common/src/java/com/github/oeuvres/alix/lucene/index/BytesDic.java‎
Lines changed: 6 additions & 18 deletions b/‎common/src/java/com/github/oeuvres/alix/lucene/index/BytesDic.java‎
Lines changed: 6 additions & 18 deletions
diff --git a/‎common/src/resources/com/github/oeuvres/alix/fr/stop.csv‎
Lines changed: 11 additions & 11 deletions b/‎common/src/resources/com/github/oeuvres/alix/fr/stop.csv‎
Lines changed: 11 additions & 11 deletions
diff --git a/‎search/src/java/com/github/oeuvres/alix/lucene/search/AlixReader.java‎
Lines changed: 16 additions & 0 deletions b/‎search/src/java/com/github/oeuvres/alix/lucene/search/AlixReader.java‎
Lines changed: 16 additions & 0 deletions
@@ -37,6 +37,11 @@
             <artifactId>lucene-analysis-common</artifactId>
             <version>10.2.1</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-queries</artifactId>
+            <version>10.2.1</version>
+        </dependency>
         <dependency>
             <groupId>net.sf.saxon</groupId>
             <artifactId>Saxon-HE</artifactId>
 
@@ -46,6 +46,7 @@
 import com.github.oeuvres.alix.fr.TagFr;
 import com.github.oeuvres.alix.lucene.analysis.tokenattributes.LemAtt;
 import com.github.oeuvres.alix.lucene.analysis.tokenattributes.OrthAtt;
+import com.github.oeuvres.alix.util.Char;
 
 /**
  * A final token filter before indexation, to plug after a lemmatizer filter,
@@ -67,7 +68,7 @@ public class FilterCloud extends TokenFilter
     /** A lemma when possible */
     private final LemAtt lemAtt = addAttribute(LemAtt.class);
     /** keep right position order */
-    private int skippedPositions;
+    private int holes;
 
 
     /**
@@ -83,22 +84,45 @@ public final boolean incrementToken() throws IOException
     {
         // skipping positions will create holes, the count of tokens will be different
         // from the count of positions
-        skippedPositions = 0;
+        holes = 0;
         while (input.incrementToken()) {
-            // no position for XML between words
-            if (flagsAtt.getFlags() == XML.code) {
-                continue;
-            }
+            if (skip()) continue;
             if (accept()) {
-                if (skippedPositions != 0) {
-                    posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
-                }
+                if (holes != 0) posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + holes);
                 return true;
             }
-            skippedPositions += posIncrAtt.getPositionIncrement();
+            holes += posIncrAtt.getPositionIncrement();
         }
         return false;
     }
+    
+    /**
+     * Tell if the current token is noise to drop without counting its
+     * position: the caller skips it before adding its position increment
+     * to the holes. A token with a non-empty lemma is never skipped.
+     *
+     * @return true if the current token should be dropped silently, false to
+     *         let it go through {@link #accept()}.
+     */
+    protected boolean skip()
+    {
+        // known word from dictionary, keep it
+        if (!lemAtt.isEmpty()) return false;
+        // empty
+        if (termAtt.isEmpty()) return true;
+        // no position for XML between words M<sup>elle</sup>
+        if (flagsAtt.getFlags() == XML.code) return true;
+        final int length = termAtt.length();
+        // unknown short word, this also covers 2 chars variables like A. or A'
+        if (length < 3) return true;
+        // from here length >= 3, so (length - 2) is always a valid index
+        // < >
+        if (Char.isMath(termAtt.charAt(0))) return true;
+        final char charLast = termAtt.charAt(length - 1);
+        // variable like AB'
+        if (charLast == '\'') return true;
+        // variable like AB4, a final digit not preceded by another digit
+        if (Char.isDigit(charLast) && !Char.isDigit(termAtt.charAt(length - 2))) return true;
+        // default is no skip
+        return false;
+    }
 
     /**
      * Most of the tokens are not rejected but rewrited, except punctuation.
@@ -121,8 +145,8 @@ else if (flags == PUNpara.code || flags == PUNsection.code) {
                 // let it
             }
             else {
-                // termAtt.setEmpty().append("");
             }
+            termAtt.setEmpty().append("");
             return true;
         }
         // unify numbers
@@ -131,15 +155,7 @@ else if (flags == PUNpara.code || flags == PUNsection.code) {
             return true;
         }
 
-        // keep flexion of substantives ? Nothing to append to term
-        /*
-        if (flags == SUB.code) {
-            if (orthAtt.length() != 0) {
-                termAtt.setEmpty().append(orthAtt);
-            }
-            return true;
-        }
-        */
+        // do not keep flexion on substantives, no semantic gain
         if (!lemAtt.isEmpty()) termAtt.setEmpty().append(lemAtt);
         else if (!orthAtt.isEmpty()) termAtt.setEmpty().append(orthAtt);
         // no more suffix
 
@@ -110,6 +110,16 @@ public void parse(File propsFile) throws IOException, NoSuchFieldException
             FrDics.load(dicAbs.getCanonicalPath(), dicAbs);
             System.err.println("Local dictionary loaded: " + dicAbs);
         }
+        for (final String dic: globs("stopfile")) {
+            File dicAbs = new File(dic);
+            if (!dicAbs.exists()) {
+                System.err.println("Local dictionary file not found: " + dic
+                        + " (resolved as: " + dicAbs.getAbsolutePath() + ")");
+                continue;
+            }
+            FrDics.load(dicAbs.getCanonicalPath(), dicAbs);
+            System.err.println("Local dictionary loaded: " + dicAbs);
+        }
 
         key = "xsl";
         List<String> values = globs(key);
 
@@ -17,6 +17,7 @@
 import org.apache.lucene.search.SortField;
 
 import static com.github.oeuvres.alix.common.Names.*;
+
 import com.github.oeuvres.alix.lucene.search.AlixReader;
 import com.github.oeuvres.alix.lucene.search.Distrib;
 import com.github.oeuvres.alix.lucene.search.Doc;
@@ -57,39 +58,69 @@ public Integer call() throws Exception
             new Sort(new SortField(ALIX_ID, SortField.Type.STRING))
         );
         ScoreDoc[] hits = results.scoreDocs;
-        
+
+
 
         for (ScoreDoc src : hits) {
             final int docId = src.doc;
             final Document document = storedFields.document(docId, fields);
             final String bibl = document.get(BIBL).replaceAll("<a [^>]+>", "").replaceAll("</a>", "");
-            System.out.println("<p>" + bibl + "</p>");
 
-            FormEnum forms = Doc.formEnum(alixReader, docId, TEXT_CLOUD, Distrib.OCCS, true);
-            printKeywords(forms, Distrib.FREQ); // OCCS = FREQ
-            printKeywords(forms, Distrib.TFIDF);
-            printKeywords(forms, Distrib.CHI2);
-            printKeywords(forms, Distrib.BM25);
-            printKeywords(forms, Distrib.G);
+            // freq without stop words
+            FormEnum formsNostop = null;
+            FormEnum forms = null;
+            try {
+                formsNostop = Doc.formEnum(alixReader, docId, TEXT_CLOUD, Distrib.OCCS, true);
+                forms = Doc.formEnum(alixReader, docId, TEXT_CLOUD, Distrib.OCCS, false);
+            }
+            catch (Exception e) {
+                System.err.println("[" + document.get(ALIX_ID) + "] " + bibl);
+                continue;
+            }
+            System.out.println("<h4>[" + document.get(ALIX_ID) + "] " + bibl + "</h4>");
+            printKeywords(formsNostop, Distrib.FREQ, "Fréquence");
+            printKeywords(formsNostop, Distrib.G, "G test");
+            // printKeywords(formsNostop, Distrib.TFIDF, null);
+            // printKeywords(forms, Distrib.CHI2);
+            // freq with stop words
+            printKeywords(formsNostop, Distrib.FREQ_IDF, "Fréquence * IDF");
         }
 
         return 0;
     }
 
-    private void printKeywords(FormEnum forms, Distrib distrib)
+    /**
+     * Print a labelled, comma separated list of terms as an HTML paragraph
+     * on standard output.
+     *
+     * @param label text printed in bold before the list, written as is (not escaped).
+     * @param terms terms to print, each one passed through {@code ML.escape()}.
+     */
+    private void print(final String label, final String[] terms)
     {
-        forms.score(distrib);
-        forms.sort(FormEnum.Order.SCORE, 50, false);
+        System.out.print("<p>");
+        System.out.print("<b>" + label + "</b>: ");
         boolean first = true;
+        for (String form: terms) {
+            if (first) first = false;
+            else System.out.print(", ");
+            System.out.print(ML.escape(form));
+        }
+        // close the paragraph opened above (was a second opening <p>)
+        System.out.println("</p>");
+    }
+    
+    /**
+     * Score the forms with a distribution, sort them by score, and print
+     * them as a labelled, comma separated HTML paragraph on standard output.
+     *
+     * @param forms forms to print, scored and sorted by this call.
+     * @param distrib scorer passed to {@code forms.score()}.
+     * @param label text printed in bold before the list, written as is (not
+     *        escaped); if null, {@code distrib.name()} is used.
+     */
+    private void printKeywords(final FormEnum forms, final Distrib distrib, String label)
+    {
+        if (label == null) label = distrib.name();
+        forms.score(distrib);
+        // presumably keeps the 10 best scores, to confirm against FormEnum.sort()
+        forms.sort(FormEnum.Order.SCORE, 10, false);
         System.out.print("<p>");
-        System.out.print("<b>" + distrib.name() + "</b>: ");
+        System.out.print("<b>" + label + "</b>: ");
+        boolean first = true;
         while (forms.hasNext()) {
             forms.next();
             String form = forms.form();
             if (first) first = false;
             else System.out.print(", ");
             System.out.print(ML.escape(form));
-            System.out.print(" <small>(" + forms.freq() + ")</small>");
+            /*
+            System.out.print(" <small>(" + forms.freq());
+            // System.out.print(" — " + String.format("%.5f", forms.score()));
+            System.out.print(")</small>");
+            */
         }
-        System.out.println("<p>");
+        // close the paragraph opened above (was a second opening <p>)
+        System.out.println("</p>");
     }
 
@@ -50,30 +50,20 @@
  */
 public class BytesDic
 {
-    /** Dictionary words as bytes. */
-    private BytesRefHash dic = new BytesRefHash();
 
-    public BytesDic()
+    private BytesDic()
     {
-        dic.add(new BytesRef(""));
-        dic.add(new BytesRef("#")); // default for comment
-        dic.add(new BytesRef(",")); // default separator
     }
 
-    public boolean contains(BytesRef bytes)
-    {
-        return (dic.find(bytes) != -1);
-    }
     /**
      * Load a word list from an {@link InputStream}
      * @param file
      * @throws IOException
      */
-    public BytesDic load(final File file) throws IOException
+    static public void load(final BytesRefHash dic, final File file) throws IOException
     {
         Reader reader = new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8);
-        load(reader);
-        return this;
+        load(dic, reader);
     }
 
     /**
@@ -82,11 +72,10 @@ public BytesDic load(final File file) throws IOException
      * @param stream resource to load.
      * @throws IOException
      */
-    public BytesDic load(final InputStream stream) throws IOException
+    static public void load(final BytesRefHash dic, final InputStream stream) throws IOException
     {
         Reader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);
-        load(reader);
-        return this;
+        load(dic, reader);
     }
 
     /**
@@ -95,7 +84,7 @@ public BytesDic load(final InputStream stream) throws IOException
      * @param reader reader to load.
      * @throws IOException
      */
-    public BytesDic load(Reader reader) throws IOException
+    static public void load(final BytesRefHash dic, final Reader reader) throws IOException
     {
         try (BufferedReader br = getBufferedReader(reader)) {
             String word = null;
@@ -109,7 +98,6 @@ public BytesDic load(Reader reader) throws IOException
                 dic.add(bytes);
             }
         }
-        return this;
     }
 
     /**
 
@@ -225,8 +225,8 @@ c1
 c2
 c3
 c_est
-# c'est
-# c’est
+c'est
+c’est
 ca
 ça
 car
@@ -280,7 +280,7 @@ coll
 com
 combien
 comme
-# comment
+comment
 contre
 D
 D.
@@ -737,7 +737,7 @@ par
 par-dessus
 parce
 parce que
-# parfois
+parfois
 parmi
 part
 pas
@@ -758,8 +758,8 @@ plutôt
 plutost
 point
 pour
-# pourquoi
-# pourquoy
+pourquoi
+pourquoy
 pourtant
 pouvait
 pouvoir
@@ -928,10 +928,10 @@ tels
 tes
 the
 The
-# tien
-# toi
-# toi-même
-# ton
+tien
+toi
+toi-même
+ton
 toujours
 tous
 tous deux
@@ -945,7 +945,7 @@ trente
 très
 trois
 trop
-# tu
+tu
 u
 u.
 u'
 
@@ -86,6 +86,11 @@
 import org.apache.lucene.store.MMapDirectory;
 import org.apache.lucene.store.NIOFSDirectory;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash;
+
+import com.github.oeuvres.alix.fr.TagFr;
+import com.github.oeuvres.alix.lucene.index.BytesDic;
 
 /**
  * <p>
@@ -154,6 +159,10 @@ public class AlixReader
     private FieldInfos fieldInfos;
     /** The IndexSearcher if requested */
     private IndexSearcher searcher;
+    /** Dictionary of stopwords for the index */
+    private BytesRefHash stopwords = new BytesRefHash();
+
+    
     /** Analyzer for indexation and search */
     final private Analyzer analyzer;
     /** Ways to open a lucene index */
@@ -201,6 +210,10 @@ private AlixReader(final String name, final Path path, final Analyzer analyzer,
         }
         this.analyzer = analyzer;
         this.props = new Properties();
+        stopwords.add(new BytesRef(""));
+        stopwords.add(new BytesRef("#")); // default for comment
+        stopwords.add(new BytesRef(",")); // default separator
+        BytesDic.load(stopwords, TagFr.class.getResourceAsStream("stop.csv"));
     }
 
     /**
@@ -371,6 +384,8 @@ public FieldRail fieldRail(final String fieldName) throws IOException
         cache(key, fieldRail);
         return fieldRail;
     }
+    
+    
 
     /**
      * Get a frequence object.
@@ -386,6 +401,7 @@ public FieldText fieldText(final String fieldName) throws IOException
         if (fieldText != null)
             return fieldText;
         fieldText = new FieldText(reader(), fieldName);
+        fieldText.loadStopwords(stopwords);
         cache(key, fieldText);
         return fieldText;
     }
Original file line number	Diff line number	Diff line change
`@@ -50,30 +50,20 @@`
`50`	`50`	`*/`
`51`	`51`	`public class BytesDic`
`52`	`52`	`{`
`53`		`- /** Dictionary words as bytes. */`
`54`		`- private BytesRefHash dic = new BytesRefHash();`
`55`	`53`
`56`		`- public BytesDic()`
	`54`	`+ private BytesDic()`
`57`	`55`	`{`
`58`		`- dic.add(new BytesRef(""));`
`59`		`- dic.add(new BytesRef("#")); // default for comment`
`60`		`- dic.add(new BytesRef(",")); // default separator`
`61`	`56`	`}`
`62`	`57`
`63`		`- public boolean contains(BytesRef bytes)`
`64`		`- {`
`65`		`- return (dic.find(bytes) != -1);`
`66`		`- }`
`67`	`58`	`/**`
`68`	`59`	`* Load a word list from an {@link InputStream}`
`69`	`60`	`* @param file`
`70`	`61`	`* @throws IOException`
`71`	`62`	`*/`
`72`		`- public BytesDic load(final File file) throws IOException`
	`63`	`+ static public void load(final BytesRefHash dic, final File file) throws IOException`
`73`	`64`	`{`
`74`	`65`	`Reader reader = new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8);`
`75`		`- load(reader);`
`76`		`- return this;`
	`66`	`+ load(dic, reader);`
`77`	`67`	`}`
`78`	`68`
`79`	`69`	`/**`
`@@ -82,11 +72,10 @@ public BytesDic load(final File file) throws IOException`
`82`	`72`	`* @param stream resource to load.`
`83`	`73`	`* @throws IOException`
`84`	`74`	`*/`
`85`		`- public BytesDic load(final InputStream stream) throws IOException`
	`75`	`+ static public void load(final BytesRefHash dic, final InputStream stream) throws IOException`
`86`	`76`	`{`
`87`	`77`	`Reader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);`
`88`		`- load(reader);`
`89`		`- return this;`
	`78`	`+ load(dic, reader);`
`90`	`79`	`}`
`91`	`80`
`92`	`81`	`/**`
`@@ -95,7 +84,7 @@ public BytesDic load(final InputStream stream) throws IOException`
`95`	`84`	`* @param reader reader to load.`
`96`	`85`	`* @throws IOException`
`97`	`86`	`*/`
`98`		`- public BytesDic load(Reader reader) throws IOException`
	`87`	`+ static public void load(final BytesRefHash dic, final Reader reader) throws IOException`
`99`	`88`	`{`
`100`	`89`	`try (BufferedReader br = getBufferedReader(reader)) {`
`101`	`90`	`String word = null;`
`@@ -109,7 +98,6 @@ public BytesDic load(Reader reader) throws IOException`
`109`	`98`	`dic.add(bytes);`
`110`	`99`	`}`
`111`	`100`	`}`
`112`		`- return this;`
`113`	`101`	`}`
`114`	`102`
`115`	`103`	`/**`