Skip to content

Commit e4ff968

Browse files
committed
Refactor ingest Package
1 parent 7140d2d commit e4ff968

7 files changed

Lines changed: 378 additions & 245 deletions

File tree

analysis/src/java/com/github/oeuvres/alix/ingest/AlixTxtIndexer.java

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -232,10 +232,10 @@ public void load(File file) throws IOException
232232

233233
book.add(new StoredField("bibl", bibl)); // (TokenStream fields cannot be stored)
234234
TokenStream ts = new AnalyzerMeta().tokenStream("meta", bibl); // renew token stream
235-
book.add(new Field("bibl", ts, AlixWriter.ftypeMeta)); // indexation of the chosen tokens
235+
book.add(new Field("bibl", ts, LuceneWriter.ftypeText)); // indexation of the chosen tokens
236236
chapter.add(new StoredField("bibl", bibl)); // (TokenStream fields cannot be stored)
237237
ts = new AnalyzerMeta().tokenStream("meta", bibl); // renew token stream
238-
chapter.add(new Field("bibl", ts, AlixWriter.ftypeMeta)); // indexation of the chosen tokens
238+
chapter.add(new Field("bibl", ts, LuceneWriter.ftypeText)); // indexation of the chosen tokens
239239

240240
chapter.add(new IntPoint(name, val)); // to search
241241
chapter.add(new StoredField(name, val)); // to show
@@ -255,7 +255,7 @@ public void load(File file) throws IOException
255255

256256
chapter.add(new StoredField(name, text)); // text has to be stored for snippets and conc
257257
TokenStream source = analyzer.tokenStream("stats", text);
258-
chapter.add(new Field(name, source, AlixWriter.ftypeText)); // indexation of the chosen tokens
258+
chapter.add(new Field(name, source, LuceneWriter.ftypeText)); // indexation of the chosen tokens
259259

260260
// System.out.println(doc);
261261
writer.addDocument(chapter);

analysis/src/java/com/github/oeuvres/alix/ingest/IngestConfig.java

Lines changed: 11 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,6 @@
4242
* If multiple {@code tei} globs match the same file, the first occurrence is kept and subsequent ones
4343
* are reported via {@link Report#warn(String)} (no duplicate list is stored).
4444
* </p>
45-
* :contentReference[oaicite:2]{index=2}
4645
*/
4746
public final class IngestConfig
4847
{
@@ -120,22 +119,22 @@ public static IngestConfig load(Path configXml, Report rep) throws IOException
120119
properties.loadFromXML(in);
121120
}
122121

123-
String name = trimToNull(properties.getProperty("name"));
122+
String name = trimOrNull(properties.getProperty("name"));
124123
if (name == null)
125-
name = fileStem(cfg);
124+
name = Dir.stem(cfg);
126125

127-
String label = trimToNull(properties.getProperty("label"));
126+
String label = trimOrNull(properties.getProperty("label"));
128127

129-
String indexrootStr = trimToNull(properties.getProperty("indexroot"));
128+
String indexrootStr = trimOrNull(properties.getProperty("indexroot"));
130129
if (indexrootStr == null)
131130
throw new IllegalArgumentException("Missing required key: indexroot in " + cfg);
132-
Path indexroot = resolvePath(baseDir, indexrootStr);
131+
Path indexroot = Dir.resolve(baseDir, indexrootStr);
133132

134133
// Optional resources
135134
Path prexslt = null;
136-
String prexsltStr = trimToNull(properties.getProperty("prexslt"));
135+
String prexsltStr = trimOrNull(properties.getProperty("prexslt"));
137136
if (prexsltStr != null)
138-
prexslt = resolvePath(baseDir, prexsltStr);
137+
prexslt = Dir.resolve(baseDir, prexsltStr);
139138

140139
List<Path> dicfile = resolveFiles(baseDir, lines(properties, "dicfile"));
141140
List<Path> stopfile = resolveFiles(baseDir, lines(properties, "stopfile"));
@@ -203,7 +202,8 @@ private static List<String> normalizeGlobs(Path cfg, List<String> globs) throws
203202
return out;
204203
}
205204

206-
private static String trimToNull(String s)
205+
/** Trim to null: returns null for null, empty, or whitespace-only strings. */
206+
private static String trimOrNull(String s)
207207
{
208208
if (s == null)
209209
return null;
@@ -232,31 +232,16 @@ private static List<String> lines(Properties p, String key)
232232
return out;
233233
}
234234

235-
private static Path resolvePath(Path baseDir, String relOrAbs)
236-
{
237-
Path p = Path.of(relOrAbs.trim());
238-
if (!p.isAbsolute())
239-
p = baseDir.resolve(p);
240-
return p.toAbsolutePath().normalize();
241-
}
242-
243235
private static List<Path> resolveFiles(Path baseDir, List<String> relOrAbsList)
244236
{
245237
if (relOrAbsList.isEmpty())
246238
return Collections.emptyList();
247239
List<Path> out = new ArrayList<>(relOrAbsList.size());
248240
for (String s : relOrAbsList)
249-
out.add(resolvePath(baseDir, s));
241+
out.add(Dir.resolve(baseDir, s));
250242
return out;
251243
}
252244

253-
private static String fileStem(Path p)
254-
{
255-
String n = p.getFileName().toString();
256-
int dot = n.lastIndexOf('.');
257-
return (dot > 0) ? n.substring(0, dot) : n;
258-
}
259-
260245
@Override
261246
public String toString()
262247
{
@@ -276,11 +261,6 @@ public String toString()
276261
appendList(sb, "dicfile", dicfile, 10);
277262
appendList(sb, "stopfile", stopfile, 10);
278263

279-
// If you keep exclude globs only locally during load(), remove this line.
280-
// If you store them, include them:
281-
// appendStringList(sb, "exclude", excludeGlobs, 10);
282-
283-
284264
sb.append('}');
285265
return sb.toString();
286266
}
@@ -298,4 +278,4 @@ private static void appendList(StringBuilder sb, String key, List<Path> list, in
298278
sb.append(" ... +").append(list.size() - n).append('\n');
299279
}
300280
}
301-
}
281+
}

analysis/src/java/com/github/oeuvres/alix/ingest/AlixWriter.java renamed to analysis/src/java/com/github/oeuvres/alix/ingest/LuceneWriter.java

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -13,19 +13,19 @@
1313
import org.apache.lucene.store.Directory;
1414
import org.apache.lucene.store.FSDirectory;
1515

16-
public class AlixWriter
16+
public class LuceneWriter
1717
{
1818
/** Lucene field type for alix text field */
1919
public static final FieldType ftypeText = new FieldType();
2020
static {
2121
ftypeText.setTokenized(true);
2222
// freqs required, position needed for co-occurrences
23-
// ftypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
24-
ftypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
23+
ftypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
24+
// ftypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
2525
ftypeText.setOmitNorms(false); // keep norms for Similarity, http://makble.com/what-is-lucene-norms
26-
// ftypeText.setStoreTermVectors(true);
27-
// ftypeText.setStoreTermVectorPositions(true);
28-
// ftypeText.setStoreTermVectorOffsets(true);
26+
ftypeText.setStoreTermVectors(true);
27+
ftypeText.setStoreTermVectorPositions(true);
28+
ftypeText.setStoreTermVectorOffsets(true);
2929
ftypeText.setStored(false); // TokenStream fields cannot be stored
3030
ftypeText.freeze();
3131
}

analysis/src/java/com/github/oeuvres/alix/ingest/AlixTeiIngestor.java renamed to analysis/src/java/com/github/oeuvres/alix/ingest/TeiIngestor.java

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@
4444
* - build into indexroot/name_tmp
4545
* - on success: move indexroot/name → indexroot/name_old (if exists), then name_tmp → name
4646
*/
47-
public final class AlixTeiIngestor
47+
public final class TeiIngestor
4848
{
4949

5050
private static final String ALIX_XSL_CLASSPATH = "/com/github/oeuvres/alix/xml/alix.xsl";
@@ -55,12 +55,12 @@ public final class AlixTeiIngestor
5555
private final Templates alixTpl;
5656
private final SAXParserFactory spf;
5757

58-
public AlixTeiIngestor(Report rep) throws TransformerException
58+
public TeiIngestor(Report rep) throws TransformerException
5959
{
6060
this.rep = (rep != null) ? rep : Report.ReportNull.INSTANCE;
6161

6262
this.stf = (SAXTransformerFactory) new TransformerFactoryImpl();
63-
this.resolver = new XsltJarResolver(AlixTeiIngestor.class);
63+
this.resolver = new XsltJarResolver(TeiIngestor.class);
6464
this.stf.setURIResolver(resolver);
6565

6666
// Compile required alix.xsl from classpath with correct systemId

test/src/main/java/com/github/oeuvres/alix/ingest/AlixTeiIngestorDemo.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ private AlixTeiIngestorDemo()
2424
public static void main(String[] args) throws IOException, TransformerException, SAXException, ParserConfigurationException
2525
{
2626
Report rep = new ReportConsole();
27-
AlixTeiIngestor ingestor = new AlixTeiIngestor(rep);
27+
TeiIngestor ingestor = new TeiIngestor(rep);
2828
Path cfgPath = Path.of("D:\\code\\piaget-labo\\install\\alix-piaget.xml");
2929
// Path cfgPath = Path.of("D:\\code\\piaget-labo\\install\\alix-test.xml");
3030
IngestConfig cfg = IngestConfig.load(cfgPath, rep);

test/src/main/java/com/github/oeuvres/alix/lucene/terms/CoocDemo.java

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
package com.github.oeuvres.alix.lucene.terms;
22

33
import java.io.BufferedReader;
4-
import java.io.IOException;
54
import java.io.InputStreamReader;
65
import java.nio.IntBuffer;
76
import java.nio.charset.StandardCharsets;

0 commit comments

Comments
 (0)