Skip to content

Commit eca3c90

Browse files
committed
Check for regular text used as input to set_language
Fail in a better way if a text field is used as input to set_language. Truncate text to 45 characters, in case a large string is used as input. Add more testing.
1 parent 0d118dc commit eca3c90

3 files changed

Lines changed: 29 additions & 10 deletions

File tree

indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/SetLanguageTestCase.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010
import static com.yahoo.vespa.indexinglanguage.expressions.ExpressionAssert.assertVerify;
1111
import static com.yahoo.vespa.indexinglanguage.expressions.ExpressionAssert.assertVerifyThrows;
1212
import static org.junit.Assert.assertEquals;
13-
import static org.junit.Assert.assertFalse;
1413
import static org.junit.Assert.assertNotEquals;
14+
import static org.junit.Assert.assertThrows;
1515

1616
/**
1717
* @author Simon Thoresen Hult
@@ -57,4 +57,11 @@ public void testSettingMissingValue() {
5757
assertEquals(Language.UNKNOWN, ctx.getLanguage());
5858
}
5959

60+
@Test
61+
public void testSettingIllegalValue() {
62+
ExecutionContext ctx = new ExecutionContext(new SimpleTestAdapter());
63+
ctx.setCurrentValue(new StringFieldValue("this is part of a long string which is mistakenly used as language"));
64+
assertThrows(IllegalArgumentException.class, () -> new SetLanguageExpression().execute(ctx));
65+
}
66+
6067
}

linguistics/src/main/java/com/yahoo/language/LocaleFactory.java

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
22
package com.yahoo.language;
33

4+
import com.yahoo.text.Text;
5+
46
import java.util.IllformedLocaleException;
57
import java.util.Locale;
68
import java.util.Objects;
@@ -22,6 +24,15 @@ private LocaleFactory() {}
2224
*/
2325
public static Locale fromLanguageTag(String tag) {
2426
Objects.requireNonNull(tag, "tag cannot be null");
27+
// Tags can have variants and extension making them arbitrarily long,
28+
// but tags exceeding 35-40 characters are extremely rare in production systems.
29+
// Truncate in case people mistakenly use large text fields as tags
30+
var truncatedTag = Text.truncate(tag, 45);
31+
// truncate adds whitespace, so check original tag for whitespace
32+
if (tag.contains(" ")) {
33+
throw new IllegalArgumentException("Illegal language tag '" + truncatedTag + "', language tags corresponding to RFC5646 cannot contain whitespace");
34+
}
35+
tag = truncatedTag;
2536

2637
tag = tag.trim();
2738
if (tag.isEmpty()) return UNKNOWN;
@@ -49,7 +60,7 @@ public static Locale fromLanguageTag(String tag) {
4960
try {
5061
return new Locale.Builder().setLanguage(language).setScript(script).setRegion(region).build();
5162
} catch (IllformedLocaleException e) {
52-
throw new IllegalArgumentException("Unknown language tag '" + tag + "', it must be a language tag corresponding to RFC5646");
63+
throw new IllegalArgumentException("Illegal language tag '" + tag + "', it must be a language tag corresponding to RFC5646");
5364
}
5465
}
5566

linguistics/src/test/java/com/yahoo/language/LocaleFactoryTestCase.java

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,20 +32,21 @@ public void requireThatLocaleCanBeCreatedFromLanguageTag() {
3232
assertLocale("es", "es", "", "");
3333
assertLocale("es-419", "es", "", "419");
3434

35-
try {
36-
LocaleFactory.fromLanguageTag(null);
37-
fail();
38-
} catch (NullPointerException e) {
39-
40-
}
35+
assertThrows(NullPointerException.class, () ->LocaleFactory.fromLanguageTag(null));
4136

4237
assertLocale("", "", "", "");
4338
assertLocale("z-foo", "", "", "");
4439
assertLocale("ojeroierhoiherohjdadsfodsfoifiopeoipefwoipfwe", "", "", "");
4540

4641
var exception = assertThrows(IllegalArgumentException.class,
47-
() -> LocaleFactory.fromLanguageTag("foo-13"));
48-
assertEquals("Unknown language tag 'foo-13', it must be a language tag corresponding to RFC5646",
42+
() -> LocaleFactory.fromLanguageTag("foo-13-ojeroierhoiherohjdadsfodsfoifiopeoipefwoipfwe"));
43+
assertEquals("Illegal language tag 'foo-13-ojeroierhoiherohjdadsfodsfoifiopeo ...', it must be a language tag corresponding to RFC5646",
44+
exception.getMessage());
45+
46+
var languageTag = "this is part of a long string which should be cut off at 20 characters";
47+
exception = assertThrows(IllegalArgumentException.class,
48+
() -> LocaleFactory.fromLanguageTag(languageTag));
49+
assertEquals("Illegal language tag 'this is part of a long string which shoul ...', language tags corresponding to RFC5646 cannot contain whitespace",
4950
exception.getMessage());
5051
}
5152

0 commit comments

Comments
 (0)