Skip to content

Commit 87bf81d

Browse files
committed
refactor: require language when extracting epub paragraphs
#1994
1 parent 67c5362 commit 87bf81d

File tree

3 files changed

+41
-27
lines changed

3 files changed

+41
-27
lines changed

src/main/java/ai/elimu/util/epub/EPubParagraphExtractionHelper.java

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
import org.xml.sax.SAXException;
1616

1717
import ai.elimu.entity.enums.StoryBookProvider;
18+
import ai.elimu.model.v2.enums.Language;
19+
import ai.elimu.util.ConfigHelper;
20+
import ai.elimu.util.linguistics.ThaiHelper;
1821

1922
@Slf4j
2023
public class EPubParagraphExtractionHelper {
@@ -25,7 +28,7 @@ public class EPubParagraphExtractionHelper {
2528
* @param xhtmlFile The XHTML file containing the paragraphs, e.g. {@code chapter-2.xhtml}.
2629
* @return A list of paragraphs.
2730
*/
28-
public static List<String> extractParagraphsFromChapterFile(File xhtmlFile) {
31+
public static List<String> extractParagraphsFromChapterFile(File xhtmlFile, Language language) {
2932
log.info("extractParagraphsFromChapter");
3033

3134
List<String> paragraphs = new ArrayList<>();
@@ -62,7 +65,7 @@ public static List<String> extractParagraphsFromChapterFile(File xhtmlFile) {
6265
</body>
6366
*/
6467
if ("p".equals(bodyChildNode.getNodeName()) && (bodyChildNode.getAttributes().getNamedItem("dir") == null)) {
65-
processParagraphNode(StoryBookProvider.GLOBAL_DIGITAL_LIBRARY, bodyChildNode, paragraphs);
68+
processParagraphNode(StoryBookProvider.GLOBAL_DIGITAL_LIBRARY, bodyChildNode, paragraphs, language);
6669
}
6770

6871
// StoryBookProvider: LETS_READ_ASIA
@@ -77,7 +80,7 @@ public static List<String> extractParagraphsFromChapterFile(File xhtmlFile) {
7780
</body>
7881
*/
7982
if ("p".equals(bodyChildNode.getNodeName()) && (bodyChildNode.getAttributes().getNamedItem("dir") != null)) {
80-
processParagraphNode(StoryBookProvider.LETS_READ_ASIA, bodyChildNode, paragraphs);
83+
processParagraphNode(StoryBookProvider.LETS_READ_ASIA, bodyChildNode, paragraphs, language);
8184
}
8285

8386
// StoryBookProvider: LETS_READ_ASIA
@@ -102,7 +105,7 @@ public static List<String> extractParagraphsFromChapterFile(File xhtmlFile) {
102105

103106
// Look for "<p>"
104107
if ("p".equals(langDivChildNode.getNodeName())) {
105-
processParagraphNode(StoryBookProvider.LETS_READ_ASIA, langDivChildNode, paragraphs);
108+
processParagraphNode(StoryBookProvider.LETS_READ_ASIA, langDivChildNode, paragraphs, language);
106109
}
107110
}
108111
}
@@ -184,7 +187,7 @@ public static List<String> extractParagraphsFromChapterFile(File xhtmlFile) {
184187

185188
// Expected format: <p>ভীমের শুধু ঘুম আর ঘুম। সকালে উঠতেই পারে না।</p>
186189
if ("p".equals(contentDivChildNode.getNodeName())) {
187-
processParagraphNode(StoryBookProvider.STORYWEAVER, contentDivChildNode, paragraphs);
190+
processParagraphNode(StoryBookProvider.STORYWEAVER, contentDivChildNode, paragraphs, language);
188191
}
189192
}
190193
}
@@ -204,7 +207,7 @@ public static List<String> extractParagraphsFromChapterFile(File xhtmlFile) {
204207
if ("#text".equals(bodyChildNode.getNodeName())) {
205208
String paragraph = bodyChildNode.getTextContent();
206209
log.info("paragraph: \"" + paragraph + "\"");
207-
paragraph = getCleanedUpParagraph(paragraph);
210+
paragraph = getCleanedUpParagraph(paragraph, language);
208211
if (StringUtils.isNotBlank(paragraph)) {
209212
paragraphs.add(paragraph);
210213
}
@@ -219,7 +222,7 @@ public static List<String> extractParagraphsFromChapterFile(File xhtmlFile) {
219222
return paragraphs;
220223
}
221224

222-
private static void processParagraphNode(StoryBookProvider storyBookProvider, Node paragraphNode, List<String> paragraphs) {
225+
private static void processParagraphNode(StoryBookProvider storyBookProvider, Node paragraphNode, List<String> paragraphs, Language language) {
223226
log.info("processParagraphNode");
224227

225228
log.info("storyBookProvider: " + storyBookProvider);
@@ -273,15 +276,15 @@ private static void processParagraphNode(StoryBookProvider storyBookProvider, No
273276
String[] paragraphArray = paragraphNode.getTextContent().split("</p><p>");
274277
for (String paragraph : paragraphArray) {
275278
log.info("paragraph: \"" + paragraph + "\"");
276-
paragraph = getCleanedUpParagraph(paragraph);
279+
paragraph = getCleanedUpParagraph(paragraph, language);
277280
if (StringUtils.isNotBlank(paragraph)) {
278281
paragraphs.add(paragraph);
279282
}
280283
}
281284
} else if (storyBookProvider == StoryBookProvider.STORYWEAVER) {
282285
String paragraph = paragraphNode.getTextContent();
283286
log.info("paragraph: \"" + paragraph + "\"");
284-
paragraph = getCleanedUpParagraph(paragraph);
287+
paragraph = getCleanedUpParagraph(paragraph, language);
285288

286289
// Skip paragraphs containing CSS code
287290
// See example at src/test/resources/ai/elimu/util/epub/hin-sw-10145-ek-sau-saintisvan-paer.epub_4.xhtml
@@ -298,8 +301,13 @@ private static void processParagraphNode(StoryBookProvider storyBookProvider, No
298301
/**
299302
* E.g. "लेना ।" --> "लेना।"
300303
*/
301-
private static String getCleanedUpParagraph(String paragraph) {
304+
private static String getCleanedUpParagraph(String paragraph, Language language) {
302305
log.info("getCleanedUpParagraph, paragraph: \"" + paragraph + "\"");
306+
307+
if (language == Language.THA) {
308+
// Add whitespaces between Thai words
309+
paragraph = ThaiHelper.splitIntoWords(paragraph);
310+
}
303311

304312
// Replace line-breaks with a whitespace
305313
paragraph = paragraph.replace("\n", " ");

src/main/java/ai/elimu/web/content/storybook/StoryBookCreateFromEPubController.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,12 @@
1313
import ai.elimu.entity.contributor.Contributor;
1414
import ai.elimu.entity.contributor.ImageContributionEvent;
1515
import ai.elimu.entity.contributor.StoryBookContributionEvent;
16+
import ai.elimu.model.v2.enums.Language;
1617
import ai.elimu.model.v2.enums.ReadingLevel;
1718
import ai.elimu.model.v2.enums.content.ImageFormat;
1819
import ai.elimu.service.storybook.StoryBookEPubService;
1920
import ai.elimu.util.ChecksumHelper;
21+
import ai.elimu.util.ConfigHelper;
2022
import ai.elimu.util.DiscordHelper;
2123
import ai.elimu.util.GitHubLfsHelper;
2224
import ai.elimu.util.ImageColorHelper;
@@ -310,7 +312,8 @@ public String handleSubmit(
310312
storyBookChapter.setImage(chapterImage);
311313
}
312314

313-
List<String> paragraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(chapterFile);
315+
Language language = Language.valueOf(ConfigHelper.getProperty("content.language"));
316+
List<String> paragraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(chapterFile, language);
314317
log.info("paragraphs.size(): " + paragraphs.size());
315318
for (int i = 0; i < paragraphs.size(); i++) {
316319
String paragraph = paragraphs.get(i);
@@ -421,7 +424,8 @@ public String handleSubmit(
421424
storyBookDao.update(storyBook);
422425

423426
if (!EnvironmentContextLoaderListener.PROPERTIES.isEmpty()) {
424-
String contentUrl = "http://" + EnvironmentContextLoaderListener.PROPERTIES.getProperty("content.language").toLowerCase() + ".elimu.ai/content/storybook/edit/" + storyBook.getId();
427+
Language language = Language.valueOf(ConfigHelper.getProperty("content.language"));
428+
String contentUrl = "http://" + language.toString().toLowerCase() + ".elimu.ai/content/storybook/edit/" + storyBook.getId();
425429
String embedThumbnailUrl = null;
426430
if (storyBook.getCoverImage() != null) {
427431
embedThumbnailUrl = storyBook.getCoverImage().getUrl();

src/test/java/ai/elimu/util/epub/EPubParagraphExtractionHelperTest.java

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import org.springframework.core.io.ClassRelativeResourceLoader;
66
import org.springframework.core.io.Resource;
77
import org.springframework.core.io.ResourceLoader;
8+
9+
import ai.elimu.model.v2.enums.Language;
810
import lombok.extern.slf4j.Slf4j;
911
import java.io.File;
1012
import java.io.IOException;
@@ -21,7 +23,7 @@ public void testExtractParagraphsFromChapterFile_ENG_GDL_1349() throws IOExcepti
2123
File xhtmlFile = resource.getFile();
2224
log.debug("xhtmlFile: " + xhtmlFile);
2325

24-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
26+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.ENG);
2527

2628
assertEquals(2, storyBookParagraphs.size());
2729
assertEquals("Fifth grade student, Little Miss Grace,", storyBookParagraphs.get(0));
@@ -35,7 +37,7 @@ public void testExtractParagraphsFromChapterFile_ENG_GDL_1855() throws IOExcepti
3537
File xhtmlFile = resource.getFile();
3638
log.debug("xhtmlFile: " + xhtmlFile);
3739

38-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
40+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.ENG);
3941

4042
assertEquals(2, storyBookParagraphs.size());
4143
assertEquals("Some wild cats have stripes.", storyBookParagraphs.get(0));
@@ -49,7 +51,7 @@ public void testExtractParagraphsFromChapterFile_ENG_GDL_1855_ch4() throws IOExc
4951
File xhtmlFile = resource.getFile();
5052
log.debug("xhtmlFile: " + xhtmlFile);
5153

52-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
54+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.ENG);
5355

5456
assertEquals(2, storyBookParagraphs.size());
5557
assertEquals("Some wild cats have spots.", storyBookParagraphs.get(0));
@@ -63,7 +65,7 @@ public void testExtractParagraphsFromChapterFile_ENG_LRA_377b7e63_ch1() throws I
6365
File xhtmlFile = resource.getFile();
6466
log.debug("xhtmlFile: " + xhtmlFile);
6567

66-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
68+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.ENG);
6769

6870
assertEquals(1, storyBookParagraphs.size());
6971
assertEquals("The moon rises.", storyBookParagraphs.get(0));
@@ -76,7 +78,7 @@ public void testExtractParagraphsFromChapterFile_ENG_LRA_377b7e63_ch5() throws I
7678
File xhtmlFile = resource.getFile();
7779
log.debug("xhtmlFile: " + xhtmlFile);
7880

79-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
81+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.ENG);
8082

8183
assertEquals(1, storyBookParagraphs.size());
8284
assertEquals("What if I try talking to them? The moon slowly moves toward the group of stars.", storyBookParagraphs.get(0));
@@ -89,7 +91,7 @@ public void testExtractParagraphsFromChapterFile_ENG_LRA_377b7e63_ch6() throws I
8991
File xhtmlFile = resource.getFile();
9092
log.debug("xhtmlFile: " + xhtmlFile);
9193

92-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
94+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.ENG);
9395

9496
assertEquals(1, storyBookParagraphs.size());
9597
assertEquals("Will they play with me? I am so different from them. I should try.", storyBookParagraphs.get(0));
@@ -102,7 +104,7 @@ public void testExtractParagraphsFromChapterFile_TGL_LRA_faa0d66e() throws IOExc
102104
File xhtmlFile = resource.getFile();
103105
log.debug("xhtmlFile: " + xhtmlFile);
104106

105-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
107+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.TGL);
106108

107109
assertEquals(3, storyBookParagraphs.size());
108110
assertEquals("WAAAAHHHH!", storyBookParagraphs.get(0));
@@ -117,7 +119,7 @@ public void testExtractParagraphsFromChapterFile_TGL_LRA_7f877260_ch4() throws I
117119
File xhtmlFile = resource.getFile();
118120
log.debug("xhtmlFile: " + xhtmlFile);
119121

120-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
122+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.TGL);
121123

122124
assertEquals(2, storyBookParagraphs.size());
123125
assertEquals("Ano kaya kung kaya kang palundagin ng jelly beans nang napakataas?", storyBookParagraphs.get(0));
@@ -131,7 +133,7 @@ public void testExtractParagraphsFromChapterFile_TGL_LRA_7f877260_ch13() throws
131133
File xhtmlFile = resource.getFile();
132134
log.debug("xhtmlFile: " + xhtmlFile);
133135

134-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
136+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.TGL);
135137

136138
assertEquals(1, storyBookParagraphs.size());
137139
assertEquals("\"Nagmumuni-muni lang,\" sabi niya.", storyBookParagraphs.get(0));
@@ -145,7 +147,7 @@ public void testExtractParagraphsFromChapterFile_HIN_GDL_1287_ch3() throws IOExc
145147
File xhtmlFile = resource.getFile();
146148
log.debug("xhtmlFile: " + xhtmlFile);
147149

148-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
150+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.HIN);
149151

150152
assertEquals(1, storyBookParagraphs.size());
151153
assertEquals("उस मोटे राजा के पास एक पतला कुत्ता था । एक दिन मोटा राजा और उसका पतला कुत्ता सैर करने गए।", storyBookParagraphs.get(0));
@@ -158,7 +160,7 @@ public void testExtractParagraphsFromChapterFile_HIN_GDL_1296() throws IOExcepti
158160
File xhtmlFile = resource.getFile();
159161
log.debug("xhtmlFile: " + xhtmlFile);
160162

161-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
163+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.HIN);
162164

163165
assertEquals(1, storyBookParagraphs.size());
164166
assertEquals("एक बादल घुमने चला.", storyBookParagraphs.get(0));
@@ -171,7 +173,7 @@ public void testExtractParagraphsFromChapterFile_HIN_GDL_1296_ch3() throws IOExc
171173
File xhtmlFile = resource.getFile();
172174
log.debug("xhtmlFile: " + xhtmlFile);
173175

174-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
176+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.HIN);
175177

176178
assertEquals(2, storyBookParagraphs.size());
177179
assertEquals("साथ में चिड़िया भी उड़ती", storyBookParagraphs.get(0));
@@ -185,7 +187,7 @@ public void testExtractParagraphsFromChapterFile_HIN_SW_99651_ch3() throws IOExc
185187
File xhtmlFile = resource.getFile();
186188
log.debug("xhtmlFile: " + xhtmlFile);
187189

188-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
190+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.HIN);
189191

190192
assertEquals(2, storyBookParagraphs.size());
191193
assertEquals("हमारा सबसे अच्छा मित्र है पक्षी!", storyBookParagraphs.get(0));
@@ -199,7 +201,7 @@ public void testExtractParagraphsFromChapterFile_HIN_SW_10145_ch4() throws IOExc
199201
File xhtmlFile = resource.getFile();
200202
log.debug("xhtmlFile: " + xhtmlFile);
201203

202-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
204+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.HIN);
203205

204206
assertEquals(3, storyBookParagraphs.size());
205207
assertEquals("एक गोजर एक बड़े से भूरे पत्ते के भीतर", storyBookParagraphs.get(0));
@@ -214,7 +216,7 @@ public void testExtractParagraphsFromChapterFile_HIN_SW_141016_ch8() throws IOEx
214216
File xhtmlFile = resource.getFile();
215217
log.debug("xhtmlFile: " + xhtmlFile);
216218

217-
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile);
219+
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.HIN);
218220

219221
assertEquals(1, storyBookParagraphs.size());
220222
assertEquals("\"देखो माँ, मैं बंदर की तरह झूल रही हूँ।\"", storyBookParagraphs.get(0));

0 commit comments

Comments
 (0)