Skip to content

Commit d61950f

Browse files
authored
refactor: add one paragraph per line break during epub import (#2303)
2 parents 289886f + cc50564 commit d61950f

6 files changed

Lines changed: 28 additions & 34 deletions

File tree

pom-dependency-tree.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ai.elimu:webapp:war:2.6.100-SNAPSHOT
1+
ai.elimu:webapp:war:2.6.101-SNAPSHOT
22
+- ai.elimu:model:jar:model-2.0.114:compile
33
| \- com.google.code.gson:gson:jar:2.13.1:compile
44
| \- com.google.errorprone:error_prone_annotations:jar:2.38.0:compile

src/main/java/ai/elimu/util/epub/EPubParagraphExtractionHelper.java

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -239,29 +239,17 @@ private static void processParagraphNode(StoryBookProvider storyBookProvider, No
239239

240240
log.debug("storyBookProvider: " + storyBookProvider);
241241
if ((storyBookProvider == StoryBookProvider.GLOBAL_DIGITAL_LIBRARY) || (storyBookProvider == StoryBookProvider.LETS_READ_ASIA)) {
242-
// If single line-break ("<br/>"), replace it with whitespace.
243-
// If double line-breaks ("<br/><br/>"), treat the subsequent text as a new paragraph.
242+
// If a line break ("<br/>") is encountered, treat the subsequent text as a new paragraph.
244243
if (paragraphNode.hasChildNodes()) {
245244
NodeList paragraphChildNodeList = paragraphNode.getChildNodes();
246-
int consecutiveLineBreaksCount = 0;
247245
for (int k = 0; k < paragraphChildNodeList.getLength(); k++) {
248246
Node paragraphChildNode = paragraphChildNodeList.item(k);
249247
log.debug("paragraphChildNode: " + paragraphChildNode);
250248
log.debug("paragraphChildNode.getNodeName(): " + paragraphChildNode.getNodeName());
251249
log.debug("paragraphChildNode.getTextContent(): \"" + paragraphChildNode.getTextContent() + "\"");
252250
if ("br".equals(paragraphChildNode.getNodeName())) {
253-
consecutiveLineBreaksCount++;
254-
} else {
255-
consecutiveLineBreaksCount = 0;
256-
}
257-
log.debug("consecutiveLineBreaksCount: " + consecutiveLineBreaksCount);
258-
if (consecutiveLineBreaksCount == 1) {
259-
// Replace "<br/>" with " "
260-
paragraphChildNode.setTextContent(" ");
261-
} else if (consecutiveLineBreaksCount == 2) {
262-
// Replace "<br/><br/>" with "</p><p>"
251+
// Replace "<br/>" with "</p><p>"
263252
paragraphChildNode.setTextContent("</p><p>");
264-
consecutiveLineBreaksCount = 0;
265253
}
266254

267255
if (storyBookProvider == StoryBookProvider.LETS_READ_ASIA) {
@@ -275,8 +263,8 @@ private static void processParagraphNode(StoryBookProvider storyBookProvider, No
275263
for (int l = 0; l < emChildNodeList.getLength(); l++) {
276264
Node emChildNode = emChildNodeList.item(l);
277265
if ("br".equals(emChildNode.getNodeName())) {
278-
// Replace "<br/>" with " "
279-
emChildNode.setTextContent(" ");
266+
// Replace "<br/>" with "</p><p>"
267+
emChildNode.setTextContent("</p><p>");
280268
}
281269
}
282270
}

src/main/java/ai/elimu/web/content/word/WordListController.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,15 +51,15 @@ public String handleRequest(Model model) {
5151
model.addAttribute("maxUsageCount", maxUsageCount);
5252

5353
// Extract word frequency distribution from storybook paragraphs
54-
List<String> wordsInParagraphs = new ArrayList<>();
54+
List<String> paragraphs = new ArrayList<>();
5555
for (StoryBookParagraph storyBookParagraph : storyBookParagraphDao.readAll()) {
56-
for (String word : storyBookParagraph.getOriginalText().split(" ")) {
57-
wordsInParagraphs.add(word);
56+
if (StringUtils.isNotBlank(storyBookParagraph.getOriginalText())) {
57+
paragraphs.add(storyBookParagraph.getOriginalText());
5858
}
5959
}
6060
if (StringUtils.isNotBlank(ConfigHelper.getProperty("content.language"))) {
6161
Language language = Language.valueOf(ConfigHelper.getProperty("content.language"));
62-
Map<String, Integer> wordFrequencyMap = WordFrequencyHelper.getWordFrequency(wordsInParagraphs, language);
62+
Map<String, Integer> wordFrequencyMap = WordFrequencyHelper.getWordFrequency(paragraphs, language);
6363
model.addAttribute("wordFrequencyMap", wordFrequencyMap);
6464
}
6565

src/main/webapp/WEB-INF/jsp/content/letter/list.jsp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
var data = {
1313
labels: [
1414
<c:forEach var="key" items="${letterFrequencyMap.keySet()}">
15-
'${key}',
15+
'<c:out value="${key}" escapeXml="true" />',
1616
</c:forEach>
1717
],
1818
datasets: [

src/main/webapp/WEB-INF/jsp/content/word/list.jsp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@
1313
labels: [
1414
<c:forEach var="key" items="${wordFrequencyMap.keySet()}" varStatus="status">
1515
<c:if test="${status.index < 100}">
16-
"${key}",
16+
"<c:out value="${key}" escapeXml="true" />",
1717
</c:if>
1818
</c:forEach>
1919
],
2020
datasets: [
2121
{
2222
data: [
23-
<c:forEach var="key" items="${wordFrequencyMap}">
23+
<c:forEach var="key" items="${wordFrequencyMap}" varStatus="status">
2424
<c:if test="${status.index < 100}">
2525
${key.value},
2626
</c:if>

src/test/java/ai/elimu/util/epub/EPubParagraphExtractionHelperTest.java

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,9 @@ public void testExtractParagraphsFromChapterFile_ENG_LRA_377b7e63_ch5() throws I
8080

8181
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.ENG);
8282

83-
assertEquals(1, storyBookParagraphs.size());
84-
assertEquals("What if I try talking to them? The moon slowly moves toward the group of stars.", storyBookParagraphs.get(0));
83+
assertEquals(2, storyBookParagraphs.size());
84+
assertEquals("What if I try talking to them?", storyBookParagraphs.get(0));
85+
assertEquals("The moon slowly moves toward the group of stars.", storyBookParagraphs.get(1));
8586
}
8687

8788
@Test
@@ -93,8 +94,9 @@ public void testExtractParagraphsFromChapterFile_ENG_LRA_377b7e63_ch6() throws I
9394

9495
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.ENG);
9596

96-
assertEquals(1, storyBookParagraphs.size());
97-
assertEquals("Will they play with me? I am so different from them. I should try.", storyBookParagraphs.get(0));
97+
assertEquals(2, storyBookParagraphs.size());
98+
assertEquals("Will they play with me? I am so different from them.", storyBookParagraphs.get(0));
99+
assertEquals("I should try.", storyBookParagraphs.get(1));
98100
}
99101

100102
@Test
@@ -244,8 +246,10 @@ public void testExtractParagraphsFromChapterFile_THA_LRA_376896c7_ch9() throws I
244246

245247
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.THA);
246248

247-
assertEquals(1, storyBookParagraphs.size());
248-
assertEquals("คุณ แม่! คุณ แม่! มอง ไป ทาง ไหน มี แต่ คน หนู ชัก สับสน ไม่ รู้ จะ เดิน ตรง ไหน", storyBookParagraphs.get(0));
249+
assertEquals(3, storyBookParagraphs.size());
250+
assertEquals("คุณ แม่! คุณ แม่!", storyBookParagraphs.get(0));
251+
assertEquals("มอง ไป ทาง ไหน มี แต่ คน", storyBookParagraphs.get(1));
252+
assertEquals("หนู ชัก สับสน ไม่ รู้ จะ เดิน ตรง ไหน", storyBookParagraphs.get(2));
249253
}
250254

251255
@Test
@@ -270,8 +274,9 @@ public void testExtractParagraphsFromChapterFile_THA_LRA_c2d75faf_ch8() throws I
270274

271275
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.THA);
272276

273-
assertEquals(1, storyBookParagraphs.size());
274-
assertEquals("“ส่วน นี่ คือ ท่า ตั้ง วง บน” “ทำ ตาม ฉัน นะ”", storyBookParagraphs.get(0));
277+
assertEquals(2, storyBookParagraphs.size());
278+
assertEquals("“ส่วน นี่ คือ ท่า ตั้ง วง บน”", storyBookParagraphs.get(0));
279+
assertEquals("“ทำ ตาม ฉัน นะ”", storyBookParagraphs.get(1));
275280
}
276281

277282
@Test
@@ -283,7 +288,8 @@ public void testExtractParagraphsFromChapterFile_VIE_LRA_b46cf9ee_ch2() throws I
283288

284289
List<String> storyBookParagraphs = EPubParagraphExtractionHelper.extractParagraphsFromChapterFile(xhtmlFile, Language.VIE);
285290

286-
assertEquals(1, storyBookParagraphs.size());
287-
assertEquals("Chó thách Mèo vật nhau Xem thử tay nào thắng", storyBookParagraphs.get(0));
291+
assertEquals(2, storyBookParagraphs.size());
292+
assertEquals("Chó thách Mèo vật nhau", storyBookParagraphs.get(0));
293+
assertEquals("Xem thử tay nào thắng", storyBookParagraphs.get(1));
288294
}
289295
}

0 commit comments

Comments
 (0)