Skip to content

Commit 6c46d56

Browse files
committed
Errors ?
1 parent ad6ddad commit 6c46d56

15 files changed

Lines changed: 909 additions & 110 deletions

File tree

alix-cli/errors.bat

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
@echo off
2+
setlocal
3+
SET DIR=%~dp0
4+
java -Dorg.slf4j.simpleLogger.defaultLogLevel=debug -cp "%DIR%lib/*" com.github.oeuvres.alix.lucene.index.ListErrors %*
5+

analysis/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
<dependency>
3131
<groupId>org.apache.lucene</groupId>
3232
<artifactId>lucene-analysis-common</artifactId>
33-
<version>10.1.0</version>
33+
<version>10.2.0</version>
3434
</dependency>
3535
<dependency>
3636
<groupId>net.sf.saxon</groupId>

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/TokenizerML.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,10 @@ else if (Char.isToken(c)) {
216216
break; // a too big token stop
217217
}
218218
}
219+
// possible entity
220+
else if (c == ';' && termAtt.charAt(0) == '&') {
221+
// TODO !
222+
}
219223
// Clause punctuation, send a punctuation event to separate tokens
220224
else if (',' == c || ';' == c || ':' == c || '(' == c || ')' == c || '—' == c || '–' == c
221225
|| '"' == c || '«' == c || '»' == c ) {

analysis/src/java/com/github/oeuvres/alix/lucene/index/Analyze4vec.java

Lines changed: 146 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,37 @@
11
package com.github.oeuvres.alix.lucene.index;
22

33
import static com.github.oeuvres.alix.common.Flags.*;
4+
import static com.github.oeuvres.alix.fr.TagFr.*;
45

56
import java.io.BufferedReader;
67
import java.io.BufferedWriter;
78
import java.io.File;
89
import java.io.FileInputStream;
9-
import java.io.FileNotFoundException;
1010
import java.io.FileOutputStream;
1111
import java.io.IOException;
1212
import java.io.InputStreamReader;
1313
import java.io.OutputStreamWriter;
1414
import java.io.Writer;
1515
import java.nio.file.Path;
16-
import java.util.ArrayList;
17-
import java.util.InvalidPropertiesFormatException;
18-
import java.util.Properties;
1916
import java.util.concurrent.Callable;
2017

2118
import org.apache.lucene.analysis.Analyzer;
19+
import org.apache.lucene.analysis.TokenFilter;
2220
import org.apache.lucene.analysis.TokenStream;
2321
import org.apache.lucene.analysis.Tokenizer;
2422
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
2523
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
24+
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
2625

26+
import com.github.oeuvres.alix.fr.TagFr;
2727
import com.github.oeuvres.alix.lucene.analysis.FilterAposHyphenFr;
28-
import com.github.oeuvres.alix.lucene.analysis.FilterCloud;
2928
import com.github.oeuvres.alix.lucene.analysis.FilterFrPos;
3029
import com.github.oeuvres.alix.lucene.analysis.FilterHTML;
3130
import com.github.oeuvres.alix.lucene.analysis.FilterLemmatize;
3231
import com.github.oeuvres.alix.lucene.analysis.FilterLocution;
33-
import com.github.oeuvres.alix.lucene.analysis.FrDics;
3432
import com.github.oeuvres.alix.lucene.analysis.TokenizerML;
35-
import com.github.oeuvres.alix.util.Dir;
33+
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.LemAtt;
34+
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.OrthAtt;
3635

3736
import picocli.CommandLine;
3837
import picocli.CommandLine.Command;
@@ -42,17 +41,13 @@
4241
* Analyse an XML/TEI corpus to output a custom text designed for a word2vec training.
4342
*/
4443
@Command(name = "Analyze", description = "Analyse an XML/TEI corpus to output a custom text designed for a word2vec training.")
45-
public class Analyze4vec implements Callable<Integer>
44+
public class Analyze4vec extends Cli implements Callable<Integer>
4645
{
4746
final static String APP = "alix.corpus4vec";
48-
@Parameters(index = "0", arity = "1", paramLabel = "corpus.xml", description = "1 Java/XML/properties describing a document collection (src file…)")
49-
/** configuration files */
50-
File conf;
47+
5148
@Parameters(index = "1", arity = "1", paramLabel = "corpus.txt", description = "1 destination text file for analyzed corpus.")
5249
/** Destination text file. */
5350
File dstFile;
54-
/** File globs to parse, populated by parsing corpus properties */
55-
ArrayList<Path> paths = new ArrayList<>();
5651
@Override
5752
public Integer call() throws Exception
5853
{
@@ -85,6 +80,7 @@ private static void unroll(final TokenStream tokenStream, final Writer writer) t
8580
final CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
8681
final FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
8782
tokenStream.reset();
83+
int startLast = 0;
8884
while(tokenStream.incrementToken()) {
8985
final int flags = flagsAtt.getFlags();
9086
if (flags == PUNsection.code) {
@@ -116,67 +112,6 @@ else if (PUN.isPun(flags)) {
116112
}
117113
tokenStream.close();
118114
}
119-
120-
121-
/**
122-
* Parse properties to output the corpus
123-
*
124-
* @param propsFile A properties file in XML format
125-
* {@link Properties#loadFromXML(java.io.InputStream)}.
126-
* @throws IOException I/O file system error, or required files not
127-
* found.
128-
* @throws NoSuchFieldException Properties errors.
129-
*/
130-
public void parse(File propsFile) throws IOException, NoSuchFieldException
131-
{
132-
if (!propsFile.exists()) throw new FileNotFoundException(
133-
"\n [" + APP + "] " + propsFile.getAbsolutePath() + "\nProperties file not found");
134-
Properties props = new Properties();
135-
try {
136-
props.loadFromXML(new FileInputStream(propsFile));
137-
}
138-
catch (InvalidPropertiesFormatException e) {
139-
throw new InvalidPropertiesFormatException(
140-
"\n [" + APP + "] " + propsFile + "\nXML error in properties file\n"
141-
+ "cf. https://docs.oracle.com/javase/8/docs/api/java/util/Properties.html");
142-
}
143-
catch (IOException e) {
144-
throw new IOException(
145-
"\n [" + APP + "] " + propsFile.getAbsolutePath() + "\nProperties file not readable");
146-
}
147-
148-
final File base = propsFile.getCanonicalFile().getParentFile();
149-
150-
final String src = props.getProperty("src");
151-
if (src == null) throw new NoSuchFieldException(
152-
"\n [" + APP + "] " + propsFile + "\nan src entry is needed, to have path to index"
153-
+ "\n<entry key=\"src\">../corpus1/*.xml;../corpus2/*.xml</entry>");
154-
String[] blurf = src.split(" *[;] *|[\t ]*[\n\r]+[\t ]*");
155-
// resolve globs relative to the folder of the properties field
156-
for (String glob : blurf) {
157-
glob = Dir.globNorm(glob, base);
158-
Dir.include(paths, glob);
159-
}
160-
161-
final String exclude = props.getProperty("exclude");
162-
if (exclude != null) {
163-
String[] globs = exclude.split(" *[;] *|[\t ]*[\n\r]+[\t ]*");
164-
for (String glob : globs) {
165-
glob = Dir.globNorm(glob, base);
166-
Dir.exclude(paths, glob);
167-
}
168-
}
169-
final String dicfile = props.getProperty("dicfile");
170-
if (dicfile != null) {
171-
File dicAbs = new File(dicfile);
172-
if (!dicAbs.isAbsolute()) dicAbs = new File(base, dicfile);
173-
if (!dicAbs.exists()) {
174-
throw new FileNotFoundException("Local dictionary file not found <entry key=\"dicfile\">" + dicfile
175-
+ "</entry>, resolved as " + dicAbs.getAbsolutePath());
176-
}
177-
FrDics.load(dicAbs.getCanonicalPath(), dicAbs);
178-
}
179-
}
180115

181116
public class Analyzer4vec extends Analyzer
182117
{
@@ -205,9 +140,145 @@ public TokenStreamComponents createComponents(String field)
205140
// group compounds after lemmatization for verbal compounds
206141
ts = new FilterLocution(ts);
207142
// last filter: prepare term to index
208-
ts = new FilterCloud(ts);
143+
ts = new Filter4vec(ts);
209144
return new TokenStreamComponents(tokenizer, ts);
210145
}
146+
}
147+
148+
/**
149+
* A final token filter before indexation, to plug after a lemmatizer filter,
150+
* providing most significant tokens for word cloud. Index lemma instead of
151+
* forms when available. Strip punctuation and numbers. Positions of stripped
152+
* tokens are deleted. This allows simple computation of a token context (ex:
153+
* span queries, co-occurrences).
154+
*/
155+
public class Filter4vec extends TokenFilter
156+
{
157+
/** The term provided by the Tokenizer */
158+
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
159+
/** The position increment (inform it if positions are stripped) */
160+
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
161+
/** A linguistic category as a short number, see {@link TagFr} */
162+
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
163+
/** A normalized orthographic form */
164+
private final OrthAtt orthAtt = addAttribute(OrthAtt.class);
165+
/** A lemma when possible */
166+
private final LemAtt lemAtt = addAttribute(LemAtt.class);
167+
/** keep right position order */
168+
private int skippedPositions;
169+
/** Convert flags as tag to append to term */
170+
static String[] suffix = new String[256];
171+
static {
172+
suffix[VERB.code] = "_VERB"; // 305875
173+
suffix[SUB.code] = ""; // 110522
174+
suffix[ADJ.code] = "_ADJ"; // 67833
175+
suffix[VERBger.code] = "_VERB"; // 8207
176+
suffix[ADV.code] = "_ADV"; // 2336
177+
suffix[VERBppas.code] = "_VERB"; // 1107
178+
suffix[VERBexpr.code] = "_VERB"; // 270
179+
suffix[NUM.code] = ""; // 254
180+
suffix[EXCL.code] = ""; // 166
181+
suffix[VERBmod.code] = "_VERB"; // 91
182+
suffix[VERBaux.code] = "_AUX"; // 89
183+
suffix[PREP.code] = "_MG"; // 71
184+
suffix[PROpers.code] = "_MG"; // 51
185+
suffix[ADVscen.code] = "_MG"; // 33
186+
suffix[DETindef.code] = "_MG"; // 31
187+
suffix[PROindef.code] = "_MG"; // 28
188+
suffix[PROdem.code] = "_MG"; // 27
189+
suffix[ADVasp.code] = "_MG"; // 24
190+
suffix[ADVdeg.code] = "_MG"; // 23
191+
suffix[PROrel.code] = "_MG"; // 18
192+
suffix[PROquest.code] = "_MG"; // 16
193+
suffix[CONJsub.code] = "_MG"; // 16
194+
suffix[DETposs.code] = "_MG"; // 15
195+
suffix[ADVconj.code] = "_MG"; // 15
196+
suffix[DETart.code] = "_MG"; // 11
197+
suffix[DETdem.code] = "_MG"; // 10
198+
suffix[CONJcoord.code] = "_MG"; // 10
199+
suffix[ADVneg.code] = "_MG"; // 9
200+
suffix[ADVquest.code] = "_MG"; // 4
201+
suffix[DETprep.code] = "_MG"; // 4
202+
suffix[DETnum.code] = "_MG"; // from locutions
203+
}
204+
205+
/**
206+
* Default constructor.
207+
* @param input previous filter.
208+
*/
209+
public Filter4vec(TokenStream input) {
210+
super(input);
211+
}
212+
213+
@Override
214+
public final boolean incrementToken() throws IOException
215+
{
216+
// skipping positions will create holes, the count of tokens will be different
217+
// from the count of positions
218+
skippedPositions = 0;
219+
while (input.incrementToken()) {
220+
// no position for XML between words
221+
if (flagsAtt.getFlags() == XML.code) {
222+
continue;
223+
}
224+
if (accept()) {
225+
if (skippedPositions != 0) {
226+
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
227+
}
228+
return true;
229+
}
230+
skippedPositions += posIncrAtt.getPositionIncrement();
231+
}
232+
return false;
233+
}
234+
235+
/**
236+
* Most of the tokens are not rejected but rewritten, except punctuation.
237+
*
238+
* @return true if accepted
239+
*/
240+
protected boolean accept()
241+
{
242+
final int flags = flagsAtt.getFlags();
243+
if (flags == TEST.code) {
244+
System.out.println(termAtt + " — " + orthAtt);
245+
}
246+
// record an empty token at punctuation position for the rails
247+
if (PUN.isPun(flags)) {
248+
if (flags == PUNclause.code) {
249+
}
250+
else if (flags == PUNsent.code) {
251+
}
252+
else if (flags == PUNpara.code || flags == PUNsection.code) {
253+
// let it
254+
}
255+
else {
256+
// termAtt.setEmpty().append("");
257+
}
258+
return true;
259+
}
260+
// unify numbers
261+
if (flags == DIGIT.code) {
262+
termAtt.setEmpty().append("#");
263+
return true;
264+
}
265+
if (!lemAtt.isEmpty()) termAtt.setEmpty().append(lemAtt);
266+
else if (!orthAtt.isEmpty()) termAtt.setEmpty().append(orthAtt);
267+
// String suff = suffix[flags];
268+
return true;
269+
}
270+
271+
@Override
272+
public void reset() throws IOException
273+
{
274+
super.reset();
275+
}
276+
277+
@Override
278+
public void end() throws IOException
279+
{
280+
super.end();
281+
}
211282

212283
}
213284

0 commit comments

Comments
 (0)