Skip to content

Commit b165c4b

Browse files
committed
Keywords OK
1 parent 4ced3ac commit b165c4b

15 files changed

Lines changed: 204 additions & 92 deletions

File tree

661 Bytes
Binary file not shown.

alix-cli/lib/alix-common-1.0.0.jar

-144 Bytes
Binary file not shown.

alix-cli/lib/alix-search-1.0.0.jar

1.13 KB
Binary file not shown.

analysis/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@
3737
<artifactId>lucene-analysis-common</artifactId>
3838
<version>10.2.1</version>
3939
</dependency>
40+
<dependency>
41+
<groupId>org.apache.lucene</groupId>
42+
<artifactId>lucene-queries</artifactId>
43+
<version>10.2.1</version>
44+
</dependency>
4045
<dependency>
4146
<groupId>net.sf.saxon</groupId>
4247
<artifactId>Saxon-HE</artifactId>

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/FilterCloud.java

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
import com.github.oeuvres.alix.fr.TagFr;
4747
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.LemAtt;
4848
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.OrthAtt;
49+
import com.github.oeuvres.alix.util.Char;
4950

5051
/**
5152
* A final token filter before indexing, to plug after a lemmatizer filter,
@@ -67,7 +68,7 @@ public class FilterCloud extends TokenFilter
6768
/** A lemma when possible */
6869
private final LemAtt lemAtt = addAttribute(LemAtt.class);
6970
/** keep right position order */
70-
private int skippedPositions;
71+
private int holes;
7172

7273

7374
/**
@@ -83,22 +84,45 @@ public final boolean incrementToken() throws IOException
8384
{
8485
// skipping positions will create holes, the count of tokens will be different
8586
// from the count of positions
86-
skippedPositions = 0;
87+
holes = 0;
8788
while (input.incrementToken()) {
88-
// no position for XML between words
89-
if (flagsAtt.getFlags() == XML.code) {
90-
continue;
91-
}
89+
if (skip()) continue;
9290
if (accept()) {
93-
if (skippedPositions != 0) {
94-
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
95-
}
91+
if (holes != 0) posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + holes);
9692
return true;
9793
}
98-
skippedPositions += posIncrAtt.getPositionIncrement();
94+
holes += posIncrAtt.getPositionIncrement();
9995
}
10096
return false;
10197
}
98+
99+
/**
100+
* Token to skip, without the position, different noises.
101+
* @return
102+
*/
103+
protected boolean skip()
104+
{
105+
final int flags = flagsAtt.getFlags();
106+
// known word from dictionary, keep it
107+
if (!lemAtt.isEmpty()) return false;
108+
// empty
109+
if (termAtt.isEmpty()) return true;
110+
// no position for XML between words M<sup>elle</sup>
111+
if (flags == XML.code) return true;
112+
// unknown short word
113+
if (termAtt.length() < 3) return true;
114+
// < >
115+
if (Char.isMath(termAtt.charAt(0))) return true;
116+
char charLast = termAtt.charAt(termAtt.length() - 1);
117+
// variable like A'
118+
if (charLast == '\'') return true;
119+
// variable like A.
120+
if (charLast == '.' && termAtt.length() == 2) return true;
121+
// variable like A4
122+
if (Char.isDigit(charLast) && !Char.isDigit(termAtt.charAt(termAtt.length() - 2))) return true;
123+
// default is no skip
124+
return false;
125+
}
102126

103127
/**
104128
* Most of the tokens are not rejected but rewritten, except punctuation.
@@ -121,8 +145,8 @@ else if (flags == PUNpara.code || flags == PUNsection.code) {
121145
// let it
122146
}
123147
else {
124-
// termAtt.setEmpty().append("");
125148
}
149+
termAtt.setEmpty().append("");
126150
return true;
127151
}
128152
// unify numbers
@@ -131,15 +155,7 @@ else if (flags == PUNpara.code || flags == PUNsection.code) {
131155
return true;
132156
}
133157

134-
// keep flexion of substantives ? Nothing to append to term
135-
/*
136-
if (flags == SUB.code) {
137-
if (orthAtt.length() != 0) {
138-
termAtt.setEmpty().append(orthAtt);
139-
}
140-
return true;
141-
}
142-
*/
158+
// do not keep flexion on substantives, no semantic gain
143159
if (!lemAtt.isEmpty()) termAtt.setEmpty().append(lemAtt);
144160
else if (!orthAtt.isEmpty()) termAtt.setEmpty().append(orthAtt);
145161
// no more suffix

analysis/src/java/com/github/oeuvres/alix/lucene/index/Cli.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,16 @@ public void parse(File propsFile) throws IOException, NoSuchFieldException
110110
FrDics.load(dicAbs.getCanonicalPath(), dicAbs);
111111
System.err.println("Local dictionary loaded: " + dicAbs);
112112
}
113+
for (final String dic: globs("stopfile")) {
114+
File dicAbs = new File(dic);
115+
if (!dicAbs.exists()) {
116+
System.err.println("Local dictionary file not found: " + dic
117+
+ " (resolved as: " + dicAbs.getAbsolutePath() + ")");
118+
continue;
119+
}
120+
FrDics.load(dicAbs.getCanonicalPath(), dicAbs);
121+
System.err.println("Local dictionary loaded: " + dicAbs);
122+
}
113123

114124
key = "xsl";
115125
List<String> values = globs(key);

analysis/src/java/com/github/oeuvres/alix/lucene/index/Keywords.java

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import org.apache.lucene.search.SortField;
1818

1919
import static com.github.oeuvres.alix.common.Names.*;
20+
2021
import com.github.oeuvres.alix.lucene.search.AlixReader;
2122
import com.github.oeuvres.alix.lucene.search.Distrib;
2223
import com.github.oeuvres.alix.lucene.search.Doc;
@@ -57,39 +58,69 @@ public Integer call() throws Exception
5758
new Sort(new SortField(ALIX_ID, SortField.Type.STRING))
5859
);
5960
ScoreDoc[] hits = results.scoreDocs;
60-
61+
62+
6163

6264
for (ScoreDoc src : hits) {
6365
final int docId = src.doc;
6466
final Document document = storedFields.document(docId, fields);
6567
final String bibl = document.get(BIBL).replaceAll("<a [^>]+>", "").replaceAll("</a>", "");
66-
System.out.println("<p>" + bibl + "</p>");
6768

68-
FormEnum forms = Doc.formEnum(alixReader, docId, TEXT_CLOUD, Distrib.OCCS, true);
69-
printKeywords(forms, Distrib.FREQ); // OCCS = FREQ
70-
printKeywords(forms, Distrib.TFIDF);
71-
printKeywords(forms, Distrib.CHI2);
72-
printKeywords(forms, Distrib.BM25);
73-
printKeywords(forms, Distrib.G);
69+
// freq without stop words
70+
FormEnum formsNostop = null;
71+
FormEnum forms = null;
72+
try {
73+
formsNostop = Doc.formEnum(alixReader, docId, TEXT_CLOUD, Distrib.OCCS, true);
74+
forms = Doc.formEnum(alixReader, docId, TEXT_CLOUD, Distrib.OCCS, false);
75+
}
76+
catch (Exception e) {
77+
System.err.println("[" + document.get(ALIX_ID) + "] " + bibl);
78+
continue;
79+
}
80+
System.out.println("<h4>[" + document.get(ALIX_ID) + "] " + bibl + "</h4>");
81+
printKeywords(formsNostop, Distrib.FREQ, "Fréquence");
82+
printKeywords(formsNostop, Distrib.G, "G test");
83+
// printKeywords(formsNostop, Distrib.TFIDF, null);
84+
// printKeywords(forms, Distrib.CHI2);
85+
// freq with stop words
86+
printKeywords(formsNostop, Distrib.FREQ_IDF, "Fréquence * IDF");
7487
}
7588

7689
return 0;
7790
}
7891

79-
private void printKeywords(FormEnum forms, Distrib distrib)
92+
private void print(final String label, final String[] terms)
8093
{
81-
forms.score(distrib);
82-
forms.sort(FormEnum.Order.SCORE, 50, false);
94+
System.out.print("<p>");
95+
System.out.print("<b>" + label + "</b>: ");
8396
boolean first = true;
97+
for (String form: terms) {
98+
if (first) first = false;
99+
else System.out.print(", ");
100+
System.out.print(ML.escape(form));
101+
}
102+
System.out.println("<p>");
103+
}
104+
105+
private void printKeywords(final FormEnum forms, final Distrib distrib, String label)
106+
{
107+
if (label == null) label = distrib.name();
108+
forms.score(distrib);
109+
forms.sort(FormEnum.Order.SCORE, 10, false);
84110
System.out.print("<p>");
85-
System.out.print("<b>" + distrib.name() + "</b>: ");
111+
System.out.print("<b>" + label + "</b>: ");
112+
boolean first = true;
86113
while (forms.hasNext()) {
87114
forms.next();
88115
String form = forms.form();
89116
if (first) first = false;
90117
else System.out.print(", ");
91118
System.out.print(ML.escape(form));
92-
System.out.print(" <small>(" + forms.freq() + ")</small>");
119+
/*
120+
System.out.print(" <small>(" + forms.freq());
121+
// System.out.print(" — " + String.format("%.5f", forms.score()));
122+
System.out.print(")</small>");
123+
*/
93124
}
94125
System.out.println("<p>");
95126
}

common/src/java/com/github/oeuvres/alix/lucene/index/BytesDic.java

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -50,30 +50,20 @@
5050
*/
5151
public class BytesDic
5252
{
53-
/** Dictionary words as bytes. */
54-
private BytesRefHash dic = new BytesRefHash();
5553

56-
public BytesDic()
54+
private BytesDic()
5755
{
58-
dic.add(new BytesRef(""));
59-
dic.add(new BytesRef("#")); // default for comment
60-
dic.add(new BytesRef(",")); // default separator
6156
}
6257

63-
public boolean contains(BytesRef bytes)
64-
{
65-
return (dic.find(bytes) != -1);
66-
}
6758
/**
6859
* Load a word list from a {@link File}.
6960
* @param file
7061
* @throws IOException
7162
*/
72-
public BytesDic load(final File file) throws IOException
63+
static public void load(final BytesRefHash dic, final File file) throws IOException
7364
{
7465
Reader reader = new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8);
75-
load(reader);
76-
return this;
66+
load(dic, reader);
7767
}
7868

7969
/**
@@ -82,11 +72,10 @@ public BytesDic load(final File file) throws IOException
8272
* @param stream resource to load.
8373
* @throws IOException
8474
*/
85-
public BytesDic load(final InputStream stream) throws IOException
75+
static public void load(final BytesRefHash dic, final InputStream stream) throws IOException
8676
{
8777
Reader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);
88-
load(reader);
89-
return this;
78+
load(dic, reader);
9079
}
9180

9281
/**
@@ -95,7 +84,7 @@ public BytesDic load(final InputStream stream) throws IOException
9584
* @param reader reader to load.
9685
* @throws IOException
9786
*/
98-
public BytesDic load(Reader reader) throws IOException
87+
static public void load(final BytesRefHash dic, final Reader reader) throws IOException
9988
{
10089
try (BufferedReader br = getBufferedReader(reader)) {
10190
String word = null;
@@ -109,7 +98,6 @@ public BytesDic load(Reader reader) throws IOException
10998
dic.add(bytes);
11099
}
111100
}
112-
return this;
113101
}
114102

115103
/**

common/src/resources/com/github/oeuvres/alix/fr/stop.csv

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -225,8 +225,8 @@ c1
225225
c2
226226
c3
227227
c_est
228-
# c'est
229-
# c’est
228+
c'est
229+
c’est
230230
ca
231231
ça
232232
car
@@ -280,7 +280,7 @@ coll
280280
com
281281
combien
282282
comme
283-
# comment
283+
comment
284284
contre
285285
D
286286
D.
@@ -737,7 +737,7 @@ par
737737
par-dessus
738738
parce
739739
parce que
740-
# parfois
740+
parfois
741741
parmi
742742
part
743743
pas
@@ -758,8 +758,8 @@ plutôt
758758
plutost
759759
point
760760
pour
761-
# pourquoi
762-
# pourquoy
761+
pourquoi
762+
pourquoy
763763
pourtant
764764
pouvait
765765
pouvoir
@@ -928,10 +928,10 @@ tels
928928
tes
929929
the
930930
The
931-
# tien
932-
# toi
933-
# toi-même
934-
# ton
931+
tien
932+
toi
933+
toi-même
934+
ton
935935
toujours
936936
tous
937937
tous deux
@@ -945,7 +945,7 @@ trente
945945
très
946946
trois
947947
trop
948-
# tu
948+
tu
949949
u
950950
u.
951951
u'

search/src/java/com/github/oeuvres/alix/lucene/search/AlixReader.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,11 @@
8686
import org.apache.lucene.store.MMapDirectory;
8787
import org.apache.lucene.store.NIOFSDirectory;
8888
import org.apache.lucene.util.Bits;
89+
import org.apache.lucene.util.BytesRef;
90+
import org.apache.lucene.util.BytesRefHash;
91+
92+
import com.github.oeuvres.alix.fr.TagFr;
93+
import com.github.oeuvres.alix.lucene.index.BytesDic;
8994

9095
/**
9196
* <p>
@@ -154,6 +159,10 @@ public class AlixReader
154159
private FieldInfos fieldInfos;
155160
/** The IndexSearcher if requested */
156161
private IndexSearcher searcher;
162+
/** Dictionary of stopwords for the index */
163+
private BytesRefHash stopwords = new BytesRefHash();
164+
165+
157166
/** Analyzer for indexation and search */
158167
final private Analyzer analyzer;
159168
/** Ways to open a lucene index */
@@ -201,6 +210,10 @@ private AlixReader(final String name, final Path path, final Analyzer analyzer,
201210
}
202211
this.analyzer = analyzer;
203212
this.props = new Properties();
213+
stopwords.add(new BytesRef(""));
214+
stopwords.add(new BytesRef("#")); // default for comment
215+
stopwords.add(new BytesRef(",")); // default separator
216+
BytesDic.load(stopwords, TagFr.class.getResourceAsStream("stop.csv"));
204217
}
205218

206219
/**
@@ -371,6 +384,8 @@ public FieldRail fieldRail(final String fieldName) throws IOException
371384
cache(key, fieldRail);
372385
return fieldRail;
373386
}
387+
388+
374389

375390
/**
376391
* Get a frequency object.
@@ -386,6 +401,7 @@ public FieldText fieldText(final String fieldName) throws IOException
386401
if (fieldText != null)
387402
return fieldText;
388403
fieldText = new FieldText(reader(), fieldName);
404+
fieldText.loadStopwords(stopwords);
389405
cache(key, fieldText);
390406
return fieldText;
391407
}

0 commit comments

Comments
 (0)