|
1 | 1 | package com.github.oeuvres.alix.lucene.index; |
2 | 2 |
|
3 | 3 | import static com.github.oeuvres.alix.common.Flags.*; |
| 4 | +import static com.github.oeuvres.alix.fr.TagFr.*; |
4 | 5 |
|
5 | 6 | import java.io.BufferedReader; |
6 | 7 | import java.io.BufferedWriter; |
7 | 8 | import java.io.File; |
8 | 9 | import java.io.FileInputStream; |
9 | | -import java.io.FileNotFoundException; |
10 | 10 | import java.io.FileOutputStream; |
11 | 11 | import java.io.IOException; |
12 | 12 | import java.io.InputStreamReader; |
13 | 13 | import java.io.OutputStreamWriter; |
14 | 14 | import java.io.Writer; |
15 | 15 | import java.nio.file.Path; |
16 | | -import java.util.ArrayList; |
17 | | -import java.util.InvalidPropertiesFormatException; |
18 | | -import java.util.Properties; |
19 | 16 | import java.util.concurrent.Callable; |
20 | 17 |
|
21 | 18 | import org.apache.lucene.analysis.Analyzer; |
| 19 | +import org.apache.lucene.analysis.TokenFilter; |
22 | 20 | import org.apache.lucene.analysis.TokenStream; |
23 | 21 | import org.apache.lucene.analysis.Tokenizer; |
24 | 22 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
25 | 23 | import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; |
| 24 | +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
26 | 25 |
|
| 26 | +import com.github.oeuvres.alix.fr.TagFr; |
27 | 27 | import com.github.oeuvres.alix.lucene.analysis.FilterAposHyphenFr; |
28 | | -import com.github.oeuvres.alix.lucene.analysis.FilterCloud; |
29 | 28 | import com.github.oeuvres.alix.lucene.analysis.FilterFrPos; |
30 | 29 | import com.github.oeuvres.alix.lucene.analysis.FilterHTML; |
31 | 30 | import com.github.oeuvres.alix.lucene.analysis.FilterLemmatize; |
32 | 31 | import com.github.oeuvres.alix.lucene.analysis.FilterLocution; |
33 | | -import com.github.oeuvres.alix.lucene.analysis.FrDics; |
34 | 32 | import com.github.oeuvres.alix.lucene.analysis.TokenizerML; |
35 | | -import com.github.oeuvres.alix.util.Dir; |
| 33 | +import com.github.oeuvres.alix.lucene.analysis.tokenattributes.LemAtt; |
| 34 | +import com.github.oeuvres.alix.lucene.analysis.tokenattributes.OrthAtt; |
36 | 35 |
|
37 | 36 | import picocli.CommandLine; |
38 | 37 | import picocli.CommandLine.Command; |
|
42 | 41 | * Analyse an XML/TEI corpus to output a custom text designed for a word2vec training. |
43 | 42 | */ |
44 | 43 | @Command(name = "Analyze", description = "Analyse an XML/TEI corpus to output a custom text designed for a word2vec training.") |
45 | | -public class Analyze4vec implements Callable<Integer> |
| 44 | +public class Analyze4vec extends Cli implements Callable<Integer> |
46 | 45 | { |
47 | 46 | final static String APP = "alix.corpus4vec"; |
48 | | - @Parameters(index = "0", arity = "1", paramLabel = "corpus.xml", description = "1 Java/XML/properties describing a document collection (src file…)") |
49 | | - /** configuration files */ |
50 | | - File conf; |
| 47 | + |
51 | 48 | @Parameters(index = "1", arity = "1", paramLabel = "corpus.txt", description = "1 destination text file for analyzed corpus.") |
52 | 49 | /** Destination text file. */ |
53 | 50 | File dstFile; |
54 | | - /** File globs to parse, populated by parsing corpus properties */ |
55 | | - ArrayList<Path> paths = new ArrayList<>(); |
56 | 51 | @Override |
57 | 52 | public Integer call() throws Exception |
58 | 53 | { |
@@ -85,6 +80,7 @@ private static void unroll(final TokenStream tokenStream, final Writer writer) t |
85 | 80 | final CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class); |
86 | 81 | final FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class); |
87 | 82 | tokenStream.reset(); |
| 83 | + int startLast = 0; |
88 | 84 | while(tokenStream.incrementToken()) { |
89 | 85 | final int flags = flagsAtt.getFlags(); |
90 | 86 | if (flags == PUNsection.code) { |
@@ -116,67 +112,6 @@ else if (PUN.isPun(flags)) { |
116 | 112 | } |
117 | 113 | tokenStream.close(); |
118 | 114 | } |
119 | | - |
120 | | - |
121 | | - /** |
122 | | - * Parse properties to output the corpus |
123 | | - * |
124 | | - * @param propsFile A properties file in XML format |
125 | | - * {@link Properties#loadFromXML(java.io.InputStream)}. |
126 | | - * @throws IOException I/O file system error, or required files not |
127 | | - * found. |
128 | | - * @throws NoSuchFieldException Properties errors. |
129 | | - */ |
130 | | - public void parse(File propsFile) throws IOException, NoSuchFieldException |
131 | | - { |
132 | | - if (!propsFile.exists()) throw new FileNotFoundException( |
133 | | - "\n [" + APP + "] " + propsFile.getAbsolutePath() + "\nProperties file not found"); |
134 | | - Properties props = new Properties(); |
135 | | - try { |
136 | | - props.loadFromXML(new FileInputStream(propsFile)); |
137 | | - } |
138 | | - catch (InvalidPropertiesFormatException e) { |
139 | | - throw new InvalidPropertiesFormatException( |
140 | | - "\n [" + APP + "] " + propsFile + "\nXML error in properties file\n" |
141 | | - + "cf. https://docs.oracle.com/javase/8/docs/api/java/util/Properties.html"); |
142 | | - } |
143 | | - catch (IOException e) { |
144 | | - throw new IOException( |
145 | | - "\n [" + APP + "] " + propsFile.getAbsolutePath() + "\nProperties file not readable"); |
146 | | - } |
147 | | - |
148 | | - final File base = propsFile.getCanonicalFile().getParentFile(); |
149 | | - |
150 | | - final String src = props.getProperty("src"); |
151 | | - if (src == null) throw new NoSuchFieldException( |
152 | | - "\n [" + APP + "] " + propsFile + "\nan src entry is needed, to have path to index" |
153 | | - + "\n<entry key=\"src\">../corpus1/*.xml;../corpus2/*.xml</entry>"); |
154 | | - String[] blurf = src.split(" *[;] *|[\t ]*[\n\r]+[\t ]*"); |
155 | | - // resolve globs relative to the folder of the properties field |
156 | | - for (String glob : blurf) { |
157 | | - glob = Dir.globNorm(glob, base); |
158 | | - Dir.include(paths, glob); |
159 | | - } |
160 | | - |
161 | | - final String exclude = props.getProperty("exclude"); |
162 | | - if (exclude != null) { |
163 | | - String[] globs = exclude.split(" *[;] *|[\t ]*[\n\r]+[\t ]*"); |
164 | | - for (String glob : globs) { |
165 | | - glob = Dir.globNorm(glob, base); |
166 | | - Dir.exclude(paths, glob); |
167 | | - } |
168 | | - } |
169 | | - final String dicfile = props.getProperty("dicfile"); |
170 | | - if (dicfile != null) { |
171 | | - File dicAbs = new File(dicfile); |
172 | | - if (!dicAbs.isAbsolute()) dicAbs = new File(base, dicfile); |
173 | | - if (!dicAbs.exists()) { |
174 | | - throw new FileNotFoundException("Local dictionary file not found <entry key=\"dicfile\">" + dicfile |
175 | | - + "</entry>, resolved as " + dicAbs.getAbsolutePath()); |
176 | | - } |
177 | | - FrDics.load(dicAbs.getCanonicalPath(), dicAbs); |
178 | | - } |
179 | | - } |
180 | 115 |
|
181 | 116 | public class Analyzer4vec extends Analyzer |
182 | 117 | { |
@@ -205,9 +140,145 @@ public TokenStreamComponents createComponents(String field) |
205 | 140 | // group compounds after lemmatization for verbal compounds |
206 | 141 | ts = new FilterLocution(ts); |
207 | 142 | // last filter: prepare term to index |
208 | | - ts = new FilterCloud(ts); |
| 143 | + ts = new Filter4vec(ts); |
209 | 144 | return new TokenStreamComponents(tokenizer, ts); |
210 | 145 | } |
| 146 | + } |
| 147 | + |
| 148 | + /** |
| 149 | + * A final token filter before indexation, to plug after a lemmatizer filter, |
| 150 | + * providing most significant tokens for word cloud. Index lemma instead of |
| 151 | + * forms when available. Strip punctuation and numbers. Positions of stripped |
| 152 | + * tokens are deleted. This allows simple computation of a token context (ex: |
| 153 | + * span queries, co-occurrences). |
| 154 | + */ |
| 155 | + public class Filter4vec extends TokenFilter |
| 156 | + { |
| 157 | + /** The term provided by the Tokenizer */ |
| 158 | + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| 159 | + /** The position increment (inform it if positions are stripped) */ |
| 160 | + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); |
| 161 | + /** A linguistic category as a short number, see {@link TagFr} */ |
| 162 | + private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class); |
| 163 | + /** A normalized orthographic form */ |
| 164 | + private final OrthAtt orthAtt = addAttribute(OrthAtt.class); |
| 165 | + /** A lemma when possible */ |
| 166 | + private final LemAtt lemAtt = addAttribute(LemAtt.class); |
| 167 | + /** keep right position order */ |
| 168 | + private int skippedPositions; |
| 169 | + /** Convert flags as tag to append to term */ |
| 170 | + static String[] suffix = new String[256]; |
| 171 | + static { |
| 172 | + suffix[VERB.code] = "_VERB"; // 305875 |
| 173 | + suffix[SUB.code] = ""; // 110522 |
| 174 | + suffix[ADJ.code] = "_ADJ"; // 67833 |
| 175 | + suffix[VERBger.code] = "_VERB"; // 8207 |
| 176 | + suffix[ADV.code] = "_ADV"; // 2336 |
| 177 | + suffix[VERBppas.code] = "_VERB"; // 1107 |
| 178 | + suffix[VERBexpr.code] = "_VERB"; // 270 |
| 179 | + suffix[NUM.code] = ""; // 254 |
| 180 | + suffix[EXCL.code] = ""; // 166 |
| 181 | + suffix[VERBmod.code] = "_VERB"; // 91 |
| 182 | + suffix[VERBaux.code] = "_AUX"; // 89 |
| 183 | + suffix[PREP.code] = "_MG"; // 71 |
| 184 | + suffix[PROpers.code] = "_MG"; // 51 |
| 185 | + suffix[ADVscen.code] = "_MG"; // 33 |
| 186 | + suffix[DETindef.code] = "_MG"; // 31 |
| 187 | + suffix[PROindef.code] = "_MG"; // 28 |
| 188 | + suffix[PROdem.code] = "_MG"; // 27 |
| 189 | + suffix[ADVasp.code] = "_MG"; // 24 |
| 190 | + suffix[ADVdeg.code] = "_MG"; // 23 |
| 191 | + suffix[PROrel.code] = "_MG"; // 18 |
| 192 | + suffix[PROquest.code] = "_MG"; // 16 |
| 193 | + suffix[CONJsub.code] = "_MG"; // 16 |
| 194 | + suffix[DETposs.code] = "_MG"; // 15 |
| 195 | + suffix[ADVconj.code] = "_MG"; // 15 |
| 196 | + suffix[DETart.code] = "_MG"; // 11 |
| 197 | + suffix[DETdem.code] = "_MG"; // 10 |
| 198 | + suffix[CONJcoord.code] = "_MG"; // 10 |
| 199 | + suffix[ADVneg.code] = "_MG"; // 9 |
| 200 | + suffix[ADVquest.code] = "_MG"; // 4 |
| 201 | + suffix[DETprep.code] = "_MG"; // 4 |
| 202 | + suffix[DETnum.code] = "_MG"; // from locutions |
| 203 | + } |
| 204 | + |
| 205 | + /** |
| 206 | + * Default constructor. |
| 207 | + * @param input previous filter. |
| 208 | + */ |
| 209 | + public Filter4vec(TokenStream input) { |
| 210 | + super(input); |
| 211 | + } |
| 212 | + |
| 213 | + @Override |
| 214 | + public final boolean incrementToken() throws IOException |
| 215 | + { |
| 216 | + // skipping positions will create holes, the count of tokens will be different |
| 217 | + // from the count of positions |
| 218 | + skippedPositions = 0; |
| 219 | + while (input.incrementToken()) { |
| 220 | + // no position for XML between words |
| 221 | + if (flagsAtt.getFlags() == XML.code) { |
| 222 | + continue; |
| 223 | + } |
| 224 | + if (accept()) { |
| 225 | + if (skippedPositions != 0) { |
| 226 | + posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); |
| 227 | + } |
| 228 | + return true; |
| 229 | + } |
| 230 | + skippedPositions += posIncrAtt.getPositionIncrement(); |
| 231 | + } |
| 232 | + return false; |
| 233 | + } |
| 234 | + |
| 235 | + /** |
| 236 | + * Most of the tokens are not rejected but rewritten, except punctuation. |
| 237 | + * |
| 238 | + * @return true if accepted |
| 239 | + */ |
| 240 | + protected boolean accept() |
| 241 | + { |
| 242 | + final int flags = flagsAtt.getFlags(); |
| 243 | + if (flags == TEST.code) { |
| 244 | + System.out.println(termAtt + " — " + orthAtt); |
| 245 | + } |
| 246 | + // record an empty token at punctuation position for the rails |
| 247 | + if (PUN.isPun(flags)) { |
| 248 | + if (flags == PUNclause.code) { |
| 249 | + } |
| 250 | + else if (flags == PUNsent.code) { |
| 251 | + } |
| 252 | + else if (flags == PUNpara.code || flags == PUNsection.code) { |
| 253 | + // let it |
| 254 | + } |
| 255 | + else { |
| 256 | + // termAtt.setEmpty().append(""); |
| 257 | + } |
| 258 | + return true; |
| 259 | + } |
| 260 | + // unify numbers |
| 261 | + if (flags == DIGIT.code) { |
| 262 | + termAtt.setEmpty().append("#"); |
| 263 | + return true; |
| 264 | + } |
| 265 | + if (!lemAtt.isEmpty()) termAtt.setEmpty().append(lemAtt); |
| 266 | + else if (!orthAtt.isEmpty()) termAtt.setEmpty().append(orthAtt); |
| 267 | + // String suff = suffix[flags]; |
| 268 | + return true; |
| 269 | + } |
| 270 | + |
| 271 | + @Override |
| 272 | + public void reset() throws IOException |
| 273 | + { |
| 274 | + super.reset(); |
| 275 | + } |
| 276 | + |
| 277 | + @Override |
| 278 | + public void end() throws IOException |
| 279 | + { |
| 280 | + super.end(); |
| 281 | + } |
211 | 282 |
|
212 | 283 | } |
213 | 284 |
|
|
0 commit comments