Skip to content

Commit 818347e

Browse files
committed
MIR-1559-Use language-specific Solr fields for title, abstract and subject in search forms
1 parent dd90c55 commit 818347e

File tree

20 files changed

+593
-20
lines changed

20 files changed

+593
-20
lines changed

mir-it/src/test/java/org/mycore/mir/it/model/MIRSearchField.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
package org.mycore.mir.it.model;
22

33
public enum MIRSearchField {
4-
Titel("mods.title"),
4+
Titel("mods.title,mods.title.lang.de,mods.title.lang.en,mods.title.lang.es,mods.title.lang.fr,mods.title.lang.it,mods.title.lang.ru,mods.title.lang.ru.latin,mods.title.lang.tr"),
55
Autor("mods.author"),
66
Name("mods.name"),
7-
Metadaten("allMeta");
7+
Metadaten("allMeta,mods.abstract.lang.de,mods.abstract.lang.en,mods.abstract.lang.es,mods.abstract.lang.fr,mods.abstract.lang.it,mods.abstract.lang.ru,mods.abstract.lang.ru.latin,mods.abstract.lang.tr,mods.subject.lang.de,mods.subject.lang.en,mods.subject.lang.es,mods.subject.lang.fr,mods.subject.lang.it,mods.subject.lang.ru,mods.subject.lang.ru.latin,mods.subject.lang.tr");
88

99
private String value;
1010

mir-it/src/test/java/org/mycore/mir/it/model/MIRSearchTestDataLoader.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,15 @@ public class MIRSearchTestDataLoader {
2424
private static final String TEST_FOLDER_NAME = "testFiles/";
2525

2626
// TODO: read from property
27-
private static final List<String> FILE_NAMES = Stream.of("mir_mods_00010000.xml").collect(Collectors.toList());
27+
private static final List<String> FILE_NAMES = Stream.of(
28+
"mir_mods_00010000.xml",
29+
"mir_mods_00010001.xml",
30+
"mir_mods_00010002.xml",
31+
"mir_mods_00010003.xml",
32+
"mir_mods_00010004.xml",
33+
"mir_mods_00010005.xml",
34+
"mir_mods_00010006.xml",
35+
"mir_mods_00010007.xml").collect(Collectors.toList());
2836

2937
public void lazyLoadData(MCRWebdriverWrapper webDriverWrapper) throws IOException, InterruptedException {
3038
if (!loaded) {
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
package org.mycore.mir.it.tests;
2+
3+
import java.io.IOException;
4+
import java.util.Arrays;
5+
import java.util.Collection;
6+
import java.util.List;
7+
import java.util.Optional;
8+
import java.util.stream.Collectors;
9+
10+
import org.junit.Assert;
11+
import org.junit.Before;
12+
import org.junit.Test;
13+
import org.junit.runner.RunWith;
14+
import org.junit.runners.Parameterized;
15+
import org.mycore.mir.it.controller.MIRSearchController;
16+
import org.mycore.mir.it.model.MIRComplexSearchQuery;
17+
import org.mycore.mir.it.model.MIRSearchField;
18+
import org.mycore.mir.it.model.MIRSearchFieldCondition;
19+
import org.mycore.mir.it.model.MIRSearchTestDataLoader;
20+
import org.openqa.selenium.By;
21+
22+
/**
23+
* Integration tests for language-specific Solr fields.
24+
* <p>
25+
* Each test document contains a title and abstract in a specific language using a <b>plural or inflected</b>
26+
* word form. The test searches with the <b>singular or base</b> form. A match proves that the language-specific
27+
* stemmer is working correctly, since {@code text_general} (without stemming) would not match.
28+
* </p>
29+
* <p>
30+
* Stemming proof per language:
31+
* </p>
32+
* <ul>
33+
* <li><b>de:</b> "Bibliotheken" (plural) vs. search "Bibliothek" (singular) — GermanLightStemFilterFactory</li>
34+
* <li><b>en:</b> "Libraries" (plural) vs. search "Library" (singular) — PorterStemFilterFactory</li>
35+
* <li><b>es:</b> "bibliotecas" (plural) vs. search "biblioteca" (singular) — SpanishLightStemFilterFactory</li>
36+
* <li><b>fr:</b> "bibliothèques" (plural) vs. search "bibliothèque" (singular) — FrenchLightStemFilterFactory</li>
37+
* <li><b>it:</b> "biblioteche" (plural) vs. search "biblioteca" (singular) — ItalianLightStemFilterFactory</li>
38+
* <li><b>ru:</b> "библиотек" (genitive plural) vs. search "библиотека" (nominative) — RussianLightStemFilterFactory</li>
39+
* <li><b>tr:</b> "araştırmalar" (plural) vs. search "araştırma" (singular) — SnowballPorterFilterFactory (Turkish)</li>
40+
* </ul>
41+
*
42+
* <p>These tests require both the MCR and MIR tickets to be implemented:</p>
43+
* <ul>
44+
* <li>MCR: Language-specific Solr field types and MODS indexing XSLT</li>
45+
* <li>MIR: Search forms updated to query language-specific fields</li>
46+
* </ul>
47+
*/
48+
@RunWith(Parameterized.class)
49+
public class MIRLanguageSearchITCase extends MIRITBase {
50+
51+
private final String language;
52+
53+
private final String expectedDocId;
54+
55+
private final String titleSearchTerm;
56+
57+
private final String abstractSearchTerm;
58+
59+
public MIRLanguageSearchITCase(String language, String expectedDocId,
60+
String titleSearchTerm, String abstractSearchTerm) {
61+
this.language = language;
62+
this.expectedDocId = expectedDocId;
63+
this.titleSearchTerm = titleSearchTerm;
64+
this.abstractSearchTerm = abstractSearchTerm;
65+
}
66+
67+
/**
68+
* Test parameters: language, document ID, stemmed title search term, stemmed abstract search term.
69+
* The search term is always a different word form than what is stored in the document.
70+
*/
71+
@Parameterized.Parameters(name = "{0}")
72+
public static Collection<Object[]> data() {
73+
return Arrays.asList(new Object[][] {
74+
// { language, docId, title search (singular/base), abstract search (singular/base) }
75+
{ "de", "mir_mods_00010001", "Bibliothek", "Publikation" },
76+
{ "en", "mir_mods_00010002", "Library", "publication" },
77+
{ "es", "mir_mods_00010003", "biblioteca", "publicación" },
78+
{ "fr", "mir_mods_00010004", "bibliothèque", "publication" },
79+
{ "it", "mir_mods_00010005", "biblioteca", "pubblicazione" },
80+
{ "ru", "mir_mods_00010006", "\u0431\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0430",
81+
"\u043f\u0443\u0431\u043b\u0438\u043a\u0430\u0446\u0438\u044f" },
82+
{ "tr", "mir_mods_00010007", "araştırma", "araç" },
83+
});
84+
}
85+
86+
@Before
87+
public final void init() throws IOException, InterruptedException {
88+
MIRSearchTestDataLoader searchTestDataLoader = new MIRSearchTestDataLoader();
89+
searchTestDataLoader.lazyLoadData(getDriver());
90+
// navigate to start page so the "Suche" nav link is available
91+
getDriver().get(getAPPUrlString());
92+
}
93+
94+
/**
95+
* Tests that language-specific title stemming works via the simple search form.
96+
* Searches with the base/singular form of a word that appears in plural/inflected form in the title.
97+
*/
98+
@Test
99+
public void testTitleStemming() {
100+
MIRSearchController searchController = new MIRSearchController(getDriver(), getAPPUrlString());
101+
searchController.setTitle(titleSearchTerm);
102+
103+
List<String> foundIds = collectResultIds();
104+
105+
Assert.assertTrue(
106+
"Language [" + language + "]: title search for '" + titleSearchTerm
107+
+ "' should find " + expectedDocId + " (found: " + String.join(",", foundIds) + ")",
108+
foundIds.contains(expectedDocId));
109+
}
110+
111+
/**
112+
* Tests that language-specific abstract stemming works via the complex search form.
113+
* Uses the "Metadaten" (allMeta) field since there is no dedicated abstract field in the current form.
114+
* Searches with the base/singular form of a word that appears in plural/inflected form in the abstract.
115+
*/
116+
@Test
117+
public void testAbstractStemming() {
118+
MIRSearchController searchController = new MIRSearchController(getDriver(), getAPPUrlString());
119+
120+
List<MIRComplexSearchQuery> queries = List.of(
121+
new MIRComplexSearchQuery(MIRSearchFieldCondition.enthält, abstractSearchTerm,
122+
MIRSearchField.Metadaten));
123+
124+
searchController.complexSearchBy(queries, null, null, null,
125+
null, null, null, null, null);
126+
127+
List<String> foundIds = collectResultIds();
128+
129+
Assert.assertTrue(
130+
"Language [" + language + "]: abstract search for '" + abstractSearchTerm
131+
+ "' should find " + expectedDocId + " (found: " + String.join(",", foundIds) + ")",
132+
foundIds.contains(expectedDocId));
133+
}
134+
135+
private List<String> collectResultIds() {
136+
return getDriver().waitAndFindElements(By.xpath(".//input[@name='id']"))
137+
.stream()
138+
.map(v -> Optional.ofNullable(v.getDomProperty("value"))
139+
.orElseGet(() -> v.getDomAttribute("value")))
140+
.collect(Collectors.toList());
141+
}
142+
}
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
package org.mycore.mir.it.tests;
2+
3+
import java.io.IOException;
4+
import java.util.List;
5+
import java.util.Optional;
6+
import java.util.stream.Collectors;
7+
8+
import org.junit.Assert;
9+
import org.junit.Before;
10+
import org.junit.Test;
11+
import org.mycore.mir.it.controller.MIRSearchController;
12+
import org.mycore.mir.it.model.MIRComplexSearchQuery;
13+
import org.mycore.mir.it.model.MIRSearchField;
14+
import org.mycore.mir.it.model.MIRSearchFieldCondition;
15+
import org.mycore.mir.it.model.MIRSearchTestDataLoader;
16+
import org.openqa.selenium.By;
17+
18+
/**
19+
* Integration test for Russian transliteration search via {@code text_ru_latin} field type.
20+
* <p>
21+
* The test document {@code mir_mods_00010006} has a Russian title:
22+
* "Исследования развития современных библиотек"
23+
* </p>
24+
* <p>
25+
* The {@code text_ru_latin} analyzer applies {@code ICUTransformFilterFactory} (Any-Latin) before stemming,
26+
* so searching with Latin characters should find Cyrillic content.
27+
* For example, searching for "bibliotek" should match "библиотек" in the title.
28+
* </p>
29+
*
30+
* <p>Requires the MCR ticket (text_ru_latin field type + mods.title.lang.ru.latin field)
31+
* and the MIR ticket (search form queries ru.latin fields) to be implemented.</p>
32+
*/
33+
public class MIRRussianTransliterationITCase extends MIRITBase {
34+
35+
private static final String RUSSIAN_DOC_ID = "mir_mods_00010006";
36+
37+
@Before
38+
public final void init() throws IOException, InterruptedException {
39+
MIRSearchTestDataLoader searchTestDataLoader = new MIRSearchTestDataLoader();
40+
searchTestDataLoader.lazyLoadData(getDriver());
41+
}
42+
43+
/**
44+
* Tests that a Russian title can be found using Latin transliteration.
45+
* The document title contains "библиотек" (bibliotek in Latin), searching with "bibliotek"
46+
* should match via the {@code mods.title.lang.ru.latin} field.
47+
*/
48+
@Test
49+
public void testTitleSearchWithLatinCharacters() {
50+
MIRSearchController searchController = new MIRSearchController(getDriver(), getAPPUrlString());
51+
searchController.setTitle("bibliotek");
52+
53+
List<String> foundIds = collectResultIds();
54+
55+
Assert.assertTrue(
56+
"Searching for 'bibliotek' (Latin) should find Russian document " + RUSSIAN_DOC_ID
57+
+ " via transliteration (found: " + String.join(",", foundIds) + ")",
58+
foundIds.contains(RUSSIAN_DOC_ID));
59+
}
60+
61+
/**
62+
* Tests that a Russian abstract can be found using Latin transliteration.
63+
* The document abstract contains "Управление". The ICU Any-Latin transliteration
64+
* produces "upravlenie". Note: RussianLightStemFilterFactory only operates on Cyrillic,
65+
* so after transliteration to Latin, no stemming occurs — the search term must be the
66+
* exact ICU transliteration of a word in the document.
67+
*/
68+
@Test
69+
public void testAbstractSearchWithLatinCharacters() {
70+
MIRSearchController searchController = new MIRSearchController(getDriver(), getAPPUrlString());
71+
72+
List<MIRComplexSearchQuery> queries = List.of(
73+
new MIRComplexSearchQuery(MIRSearchFieldCondition.enthält, "upravlenie",
74+
MIRSearchField.Metadaten));
75+
76+
searchController.complexSearchBy(queries, null, null, null,
77+
null, null, null, null, null);
78+
79+
List<String> foundIds = collectResultIds();
80+
81+
Assert.assertTrue(
82+
"Searching for 'upravlenie' (Latin/ICU) should find Russian document " + RUSSIAN_DOC_ID
83+
+ " via transliteration (found: " + String.join(",", foundIds) + ")",
84+
foundIds.contains(RUSSIAN_DOC_ID));
85+
}
86+
87+
private List<String> collectResultIds() {
88+
return getDriver().waitAndFindElements(By.xpath(".//input[@name='id']"))
89+
.stream()
90+
.map(v -> Optional.ofNullable(v.getDomProperty("value"))
91+
.orElseGet(() -> v.getDomAttribute("value")))
92+
.collect(Collectors.toList());
93+
}
94+
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<mycoreobject xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" ID="mir_mods_00010001" label="mir_mods_00010001" version="2017.07-SNAPSHOT" xsi:noNamespaceSchemaLocation="datamodel-mods.xsd">
3+
<structure/>
4+
<metadata xml:lang="de">
5+
<def.modsContainer class="MCRMetaXML" heritable="false" notinherit="true">
6+
<modsContainer inherited="0">
7+
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
8+
<mods:typeOfResource>text</mods:typeOfResource>
9+
<mods:genre type="intern" authorityURI="https://www.openagrar.de/classifications/genres" valueURI="https://www.openagrar.de/classifications/genres#article"/>
10+
<mods:titleInfo xml:lang="de" xlink:type="simple">
11+
<mods:title>Untersuchungen zur Entwicklung moderner Bibliotheken</mods:title>
12+
</mods:titleInfo>
13+
<mods:abstract xml:lang="de">Die Verwaltung wissenschaftlicher Publikationen erfordert neue digitale Werkzeuge</mods:abstract>
14+
<mods:subject xlink:type="simple">
15+
<mods:topic xml:lang="de">Bibliothekswissenschaften</mods:topic>
16+
</mods:subject>
17+
<mods:name type="personal" xlink:type="simple">
18+
<mods:displayForm>Schmidt, Hans</mods:displayForm>
19+
<mods:role>
20+
<mods:roleTerm authority="marcrelator" type="code">aut</mods:roleTerm>
21+
</mods:role>
22+
<mods:namePart type="family">Schmidt</mods:namePart>
23+
<mods:namePart type="given">Hans</mods:namePart>
24+
</mods:name>
25+
<mods:language>
26+
<mods:languageTerm authority="rfc4646" type="code">de</mods:languageTerm>
27+
</mods:language>
28+
<mods:classification authority="sdnb" displayLabel="sdnb">020</mods:classification>
29+
<mods:accessCondition type="use and reproduction" xlink:href="http://www.mycore.org/classifications/mir_licenses#cc_by_4.0" xlink:type="simple"/>
30+
</mods:mods>
31+
</modsContainer>
32+
</def.modsContainer>
33+
</metadata>
34+
<service>
35+
<servdates class="MCRMetaISO8601Date">
36+
<servdate type="modifydate" inherited="0">2024-01-01T00:00:00.000Z</servdate>
37+
<servdate type="createdate" inherited="0">2024-01-01T00:00:00.000Z</servdate>
38+
</servdates>
39+
<servflags class="MCRMetaLangText">
40+
<servflag type="createdby" inherited="0" form="plain">administrator</servflag>
41+
<servflag type="modifiedby" inherited="0" form="plain">administrator</servflag>
42+
</servflags>
43+
<servstates class="MCRMetaClassification">
44+
<servstate inherited="0" classid="state" categid="published"/>
45+
</servstates>
46+
</service>
47+
</mycoreobject>
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<mycoreobject xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" ID="mir_mods_00010002" label="mir_mods_00010002" version="2017.07-SNAPSHOT" xsi:noNamespaceSchemaLocation="datamodel-mods.xsd">
3+
<structure/>
4+
<metadata xml:lang="de">
5+
<def.modsContainer class="MCRMetaXML" heritable="false" notinherit="true">
6+
<modsContainer inherited="0">
7+
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
8+
<mods:typeOfResource>text</mods:typeOfResource>
9+
<mods:genre type="intern" authorityURI="https://www.openagrar.de/classifications/genres" valueURI="https://www.openagrar.de/classifications/genres#article"/>
10+
<mods:titleInfo xml:lang="en" xlink:type="simple">
11+
<mods:title>Investigations into the Development of Modern Libraries</mods:title>
12+
</mods:titleInfo>
13+
<mods:abstract xml:lang="en">Managing scientific publications requires new digital instruments</mods:abstract>
14+
<mods:subject xlink:type="simple">
15+
<mods:topic xml:lang="en">Library Sciences</mods:topic>
16+
</mods:subject>
17+
<mods:name type="personal" xlink:type="simple">
18+
<mods:displayForm>Smith, John</mods:displayForm>
19+
<mods:role>
20+
<mods:roleTerm authority="marcrelator" type="code">aut</mods:roleTerm>
21+
</mods:role>
22+
<mods:namePart type="family">Smith</mods:namePart>
23+
<mods:namePart type="given">John</mods:namePart>
24+
</mods:name>
25+
<mods:language>
26+
<mods:languageTerm authority="rfc4646" type="code">en</mods:languageTerm>
27+
</mods:language>
28+
<mods:classification authority="sdnb" displayLabel="sdnb">020</mods:classification>
29+
<mods:accessCondition type="use and reproduction" xlink:href="http://www.mycore.org/classifications/mir_licenses#cc_by_4.0" xlink:type="simple"/>
30+
</mods:mods>
31+
</modsContainer>
32+
</def.modsContainer>
33+
</metadata>
34+
<service>
35+
<servdates class="MCRMetaISO8601Date">
36+
<servdate type="modifydate" inherited="0">2024-01-01T00:00:00.000Z</servdate>
37+
<servdate type="createdate" inherited="0">2024-01-01T00:00:00.000Z</servdate>
38+
</servdates>
39+
<servflags class="MCRMetaLangText">
40+
<servflag type="createdby" inherited="0" form="plain">administrator</servflag>
41+
<servflag type="modifiedby" inherited="0" form="plain">administrator</servflag>
42+
</servflags>
43+
<servstates class="MCRMetaClassification">
44+
<servstate inherited="0" classid="state" categid="published"/>
45+
</servstates>
46+
</service>
47+
</mycoreobject>

0 commit comments

Comments
 (0)