Skip to content

Commit 3f29812

Browse files
committed
Add krill.index.textSize.max configuration option
... to increase the maximum text size. Resolves #205. DeLiKo@DNB-XXL requires krill.index.textSize.max = 120000000. Change-Id: I1cd64ffc38179ae1fd965e5ef5f7ec7503fbcd21
1 parent 44057c4 commit 3f29812

File tree

5 files changed

+66
-2
lines changed

5 files changed

+66
-2
lines changed

src/main/java/de/ids_mannheim/korap/KrillIndex.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
import java.util.regex.Pattern;
1818
import java.util.zip.GZIPInputStream;
1919

20+
import com.fasterxml.jackson.core.JsonFactory;
21+
import com.fasterxml.jackson.core.StreamReadConstraints;
2022
import org.apache.lucene.analysis.Analyzer;
2123
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
2224
import org.apache.lucene.document.Document;
@@ -76,6 +78,8 @@
7678
import de.ids_mannheim.korap.util.KrillProperties;
7779
import de.ids_mannheim.korap.util.QueryException;
7880

81+
import static com.fasterxml.jackson.core.StreamReadConstraints.DEFAULT_MAX_STRING_LEN;
82+
7983
/**
8084
* <p>KrillIndex implements a simple API for searching in and writing
8185
* to a
@@ -249,6 +253,23 @@ public String getVersion () {
249253
return this.version;
250254
};
251255

256+
public void setMaxStringLength(int maxStringLength) {
257+
if (maxStringLength < DEFAULT_MAX_STRING_LEN) {
258+
throw new IllegalArgumentException("Maximum string length must not be smaller than the default value: "
259+
+ DEFAULT_MAX_STRING_LEN);
260+
}
261+
262+
StreamReadConstraints constraints = StreamReadConstraints.builder()
263+
.maxStringLength(maxStringLength)
264+
.build();
265+
266+
JsonFactory factory = JsonFactory.builder()
267+
.streamReadConstraints(constraints)
268+
.build();
269+
270+
this.mapper = new ObjectMapper(factory);
271+
log.info("Maximum string length set to {}.", maxStringLength);
272+
}
252273

253274
/**
254275
* Get the name of the index.

src/main/java/de/ids_mannheim/korap/index/Indexer.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
import de.ids_mannheim.korap.KrillIndex;
2626
import de.ids_mannheim.korap.util.KrillProperties;
2727

28+
import static com.fasterxml.jackson.core.StreamReadConstraints.DEFAULT_MAX_STRING_LEN;
29+
2830
/**
2931
* Standalone indexer tool for Krill.
3032
* Although the preferred index method
@@ -195,7 +197,7 @@ public static void main (String[] argv) {
195197
options.addOption(Option.builder("a").longOpt("addInsteadofUpsert")
196198
.desc("Always add files to the index, never update")
197199
.build());
198-
200+
199201
CommandLineParser parser = new DefaultParser();
200202

201203
String propFile = null;
@@ -216,7 +218,6 @@ public static void main (String[] argv) {
216218
if (cmd.hasOption("a")) {
217219
addInsteadOfUpsert = true;
218220
};
219-
220221
}
221222
catch (MissingOptionException e) {
222223
HelpFormatter formatter = new HelpFormatter();
@@ -237,6 +238,12 @@ public static void main (String[] argv) {
237238
try {
238239
// Get indexer object
239240
Indexer indexer = new Indexer(prop);
241+
242+
// Apply max text size from configuration
243+
if (KrillProperties.maxTextSize > DEFAULT_MAX_STRING_LEN) {
244+
log.info("Setting max text length to " + KrillProperties.maxTextSize);
245+
indexer.index.setMaxStringLength(KrillProperties.maxTextSize);
246+
}
240247

241248
// Iterate over list of directories
242249
for (String arg : inputDirectories) {

src/main/java/de/ids_mannheim/korap/util/KrillProperties.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import org.slf4j.Logger;
77
import org.slf4j.LoggerFactory;
88

9+
import static com.fasterxml.jackson.core.StreamReadConstraints.DEFAULT_MAX_STRING_LEN;
10+
911
/**
1012
*
1113
* Todo: Properties may be loaded twice - although Java may cache automatically
@@ -23,6 +25,7 @@ public class KrillProperties {
2325
public static int maxTokenContextSize = 60;
2426
public static int maxCharContextSize = 500;
2527
public static int defaultSearchContextLength = 6;
28+
public static int maxTextSize = DEFAULT_MAX_STRING_LEN; // Default max text size
2629

2730
public static boolean matchExpansionIncludeContextSize = false;
2831

@@ -89,6 +92,7 @@ public static void updateConfigurations (Properties prop) {
8992
// EM: not implemented yet
9093
// String maxCharContextSize = prop.getProperty("krill.context.max.char");
9194
String defaultSearchContextLength = prop.getProperty("krill.search.context.default");
95+
String maxTextSizeValue = prop.getProperty("krill.index.textSize.max");
9296

9397
try {
9498
if (maxTokenMatchSize != null) {
@@ -107,6 +111,18 @@ public static void updateConfigurations (Properties prop) {
107111
KrillProperties.defaultSearchContextLength = Integer
108112
.parseInt(defaultSearchContextLength);
109113
}
114+
if (maxTextSizeValue != null) {
115+
int userMaxTextLength = Integer
116+
.parseInt(maxTextSizeValue);
117+
if (userMaxTextLength < DEFAULT_MAX_STRING_LEN) {
118+
log.warn("Specified krill.index.textSize.max is too small. Using default value: "
119+
+ DEFAULT_MAX_STRING_LEN);
120+
KrillProperties.maxTextSize = DEFAULT_MAX_STRING_LEN;
121+
} else {
122+
KrillProperties.maxTextSize = userMaxTextLength;
123+
}
124+
125+
}
110126
}
111127
catch (NumberFormatException e) {
112128
log.error("A Krill property expects numerical values: "

src/main/resources/krill.properties.info

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ krill.index.commit.count = 134217000
1414
krill.index.commit.log = log/krill.commit.log
1515
krill.index.commit.auto = 500
1616
krill.index.relations.max = 100
17+
krill.index.textSize.max = 20000000

src/test/java/de/ids_mannheim/korap/TestIndexer.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import java.io.ByteArrayOutputStream;
66
import java.io.File;
7+
import java.io.FileWriter;
78
import java.io.IOException;
89
import java.io.PrintStream;
910

@@ -105,6 +106,24 @@ public void testUnicodeProblem () throws IOException {
105106
assertEquals(outputStream.toString(), "Added 1 file.\n");
106107
}
107108

109+
@Test
110+
public void testMaxTextSize () throws IOException {
111+
// Create a temporary properties file with the max text size setting
112+
File tempPropertiesFile = File.createTempFile("krill", ".properties");
113+
FileWriter writer = new FileWriter(tempPropertiesFile);
114+
writer.write("krill.version = ${project.version}\n");
115+
writer.write("krill.name = ${project.name}\n");
116+
writer.write("krill.indexDir = test-output\n");
117+
writer.write("krill.index.textSize.max = 25000000\n");
118+
writer.close();
119+
120+
Indexer.main(new String[] { "-c", tempPropertiesFile.getAbsolutePath(),
121+
"-i", "src/test/resources/bzk", "-o", "test-output-1"});
122+
assertEquals("Added or updated 1 file.\n", outputStream.toString());
123+
124+
tempPropertiesFile.delete();
125+
}
126+
108127
@Before
109128
public void setOutputStream () {
110129
System.setOut(new PrintStream(outputStream));

0 commit comments

Comments
 (0)