Skip to content

Commit 44014d0

Browse files
committed
chore: update version to 5.3.0 and enhance text normalization by adding duplicate removal functionality
1 parent 7285bab commit 44014d0

File tree

2 files changed

+56
-3
lines changed

2 files changed

+56
-3
lines changed

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "ppu-pdf",
3-
"version": "5.2.1",
3+
"version": "5.3.0",
44
"description": "Easily extract text from digital PDF files with coordinate and font size included, and optionally group text by lines or render scanned pdf to canvas/png.",
55
"keywords": [
66
"pdf-reader",

src/pdf-reader-common.ts

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -334,13 +334,66 @@ export class PdfReaderCommon {
334334

335335
/**
 * Normalizes a raw text fragment.
 *
 * Steps, in order:
 *  1. Collapse runs of spaces to a single space and trim the ends.
 *  2. If the whole string is single uppercase letters separated by single
 *     whitespace characters (e.g. "T I T L E"), remove the whitespace so it
 *     reads as one word.
 *  3. Hand the result to `removeDuplicates`.
 *
 * @param str Text to normalize.
 * @returns The normalized, trimmed text.
 */
protected normalizedText(str: string): string {
  const spacedLetterPattern = /^([A-Z]\s)+[A-Z]$/;

  // Trim before testing: the pattern is anchored at both ends, so leading or
  // trailing whitespace would otherwise stop a spaced-letter string from
  // being recognised even though the result is trimmed anyway.
  let normalized = str.replace(/ +/g, " ").trim();

  if (spacedLetterPattern.test(normalized)) {
    normalized = normalized.replace(/\s/g, "");
  }

  return this.removeDuplicates(normalized).trim();
}
346+
347+
/**
 * Collapses text in which every word is the same unit written back-to-back
 * (e.g. "HelloHello WorldWorld" -> "Hello World").
 *
 * The check is all-or-nothing: if any single word is shorter than three
 * characters or is not a repeated unit, the whole text is returned with only
 * its whitespace normalized.
 *
 * NOTE(review): presumably this targets PDFs that draw the same glyph run more
 * than once; that cannot be confirmed from this method alone. Words that are
 * naturally periodic (e.g. "murmur") are collapsed too.
 *
 * @param text Text to inspect.
 * @returns The de-duplicated text, or the whitespace-normalized input when the
 *          text is not made up entirely of repeated units.
 */
protected removeDuplicates(text: string): string {
  text = text.replace(/\s+/g, " ").trim();
  if (!text) return text;

  // Length of the word prefix used to locate where the unit starts repeating.
  const prefixLength = 3;
  const dedupedWords: string[] = [];

  // After collapsing and trimming, splitting on " " cannot yield empty words.
  for (const word of text.split(" ")) {
    if (word.length < prefixLength) return text;

    const prefix = word.substring(0, prefixLength);
    let unit: string | undefined;

    // Each later occurrence of the prefix is a candidate unit boundary. Try
    // them shortest-first: the prefix may also recur inside the unit itself
    // (e.g. "barbarianbarbarian"), in which case the first hit is not the
    // real boundary. A unit longer than half the word cannot repeat twice.
    let boundary = word.indexOf(prefix, prefixLength);
    while (boundary !== -1 && boundary * 2 <= word.length) {
      const candidate = word.substring(0, boundary);
      if (this.isWordRepeatedPattern(word, candidate)) {
        unit = candidate;
        break;
      }
      boundary = word.indexOf(prefix, boundary + 1);
    }

    if (unit === undefined) return text;
    dedupedWords.push(unit);
  }

  return dedupedWords.join(" ");
}
375+
376+
/**
 * Tells whether `word` is `pattern` written at least twice in a row,
 * optionally followed by a truncated copy of `pattern`
 * (e.g. "abcabc" and "abcabcab" both match "abc"; "abcabcx" does not).
 *
 * @param word    The word to test.
 * @param pattern The candidate repeating unit.
 * @returns `true` when `word` holds at least two full copies of `pattern` and
 *          nothing but repetitions of it; `false` otherwise, including when
 *          `pattern` is empty.
 */
protected isWordRepeatedPattern(word: string, pattern: string): boolean {
  const period = pattern.length;

  // An empty unit cannot be "repeated"; bail out instead of scanning forever.
  if (period === 0) return false;
  // Fewer than two full copies is not a repetition.
  if (word.length < period * 2) return false;

  // Every character must match the pattern at the same offset modulo its
  // length; this covers the full copies and any trailing partial copy.
  for (let i = 0; i < word.length; i++) {
    if (word.charAt(i) !== pattern.charAt(i % period)) return false;
  }

  return true;
}
346399
}

0 commit comments

Comments
 (0)