Skip to content

Commit 4917c91

Browse files
committed
release: v5.7.0 - Support TOON
1 parent 6429e2e commit 4917c91

File tree

10 files changed

+134
-20
lines changed

10 files changed

+134
-20
lines changed

README.md

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ There are two class of `PdfReader` (uses mupdfjs) and `PdfReaderLegacy` uses (pd
77
## Features
88

99
- **Text Extraction:** Retrieve all text content from a PDF.
10+
- **LLM-Friendly Text Extraction:** Retrieve all text content with bounding boxes preserved, encoded in Token-Oriented Object Notation (TOON).
1011
- **Coordinate Data:** Get precise bounding box and dimension information for each text element.
1112
- **Line Grouping:** Merge individual text tokens into coherent lines.
1213
- **Scanned PDF Detection:** Determine if a PDF/individual page appears to be scanned or digitally generated.
@@ -16,23 +17,24 @@ There are two class of `PdfReader` (uses mupdfjs) and `PdfReaderLegacy` uses (pd
1617

1718
## Differences
1819

19-
| Indicator | PdfReader | PdfReaderLegacy |
20-
| -------------------------- | --------- | --------------- |
21-
| Library | mupdfjs | pdfjs-dist |
22-
| Pages index start | 0 | 1 |
23-
| open() |||
24-
| getTexts() |||
25-
| getTextsScanned() |||
26-
| isScanned() |||
27-
| isPageScanned() |||
28-
| getLinesFromTexts() |||
29-
| getCompactLinesFromTexts() |||
30-
| destroy() |||
31-
| destroyPage() |||
32-
| renderAll() |||
33-
| saveCanvasToPng() |||
34-
| dumpCanvasMap() |||
35-
| Resize viewport/Custom DPI |||
20+
| Indicator | PdfReader | PdfReaderLegacy |
21+
| ----------------------------------- | --------- | --------------- |
22+
| Library | mupdfjs | pdfjs-dist |
23+
| Pages index start | 0 | 1 |
24+
| open() |||
25+
| getTexts() |||
26+
| getTextsScanned() |||
27+
| isScanned() |||
28+
| isPageScanned() |||
29+
| getLinesFromTexts() |||
30+
| getCompactLinesFromTexts() |||
31+
| destroy() |||
32+
| destroyPage() |||
33+
| renderAll() |||
34+
| saveCanvasToPng() |||
35+
| dumpCanvasMap() |||
36+
| Resize viewport/Custom DPI |||
37+
| getLinesFromTextsInToon() |||
3638

3739
## Benchmark
3840

@@ -204,6 +206,7 @@ Configuration options for `PdfReader`, allowing customization of PDF text extrac
204206
| `mergeCloseTextNeighbor` | `boolean` | `true` | Merges text elements that are close to each other into a single entity. |
205207
| `simpleSortAlgorithm` | `boolean` | `false` | Uses a simplified sorting algorithm for text positioning. |
206208
| `scale` | `number` | `1` | The pdf document scale |
209+
| `enableToon` | `boolean` | `false` | Enables extraction of PDF words in TOON format. |
207210

208211
### Usage Example:
209212

bun.lock

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
"name": "ppu-pdf",
66
"dependencies": {
77
"@napi-rs/canvas": "^0.1.69",
8+
"@toon-format/toon": "^1.0.0",
89
"mupdf": "^1.26.2",
910
"pdfjs-dist": "4.9.155",
1011
},
@@ -140,6 +141,8 @@
140141

141142
"@techstark/opencv-js": ["@techstark/[email protected]", "", {}, "sha512-S4XELidRiQeA0q1s9VQLo540wCxUo24r1O4C+LqZ6llX+sPCXvZCPv3Ice8dEIr0uavyZ8YZeKXSBdDgMXSXjw=="],
142143

144+
"@toon-format/toon": ["@toon-format/[email protected]", "", {}, "sha512-2gIk8LaqrzpurNDaDWZ72kucAGcBbxJxUnp+4ZP+Pny/QVNdMVf97yzSDTI3ed2q8ypjj8T271P6iE3bRmQBNw=="],
145+
143146
"@types/bun": ["@types/[email protected]", "", { "dependencies": { "bun-types": "1.3.1" } }, "sha512-4jNMk2/K9YJtfqwoAa28c8wK+T7nvJFOjxI4h/7sORWcypRNxBpr+TPNaCfVWq70tLCJsqoFwcf0oI0JU/fvMQ=="],
144147

145148
"@types/estree": ["@types/[email protected]", "", {}, "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w=="],

examples/pdf-digital.example.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import { PdfReader } from "../src";
44

5-
const pdfReader = new PdfReader({ verbose: false });
5+
const pdfReader = new PdfReader({ verbose: false, enableToon: true });
66
const file = Bun.file("./assets/opposite-expectation.pdf");
77

88
const buffer = await file.arrayBuffer();
@@ -16,5 +16,8 @@ pdfReader.destroy(pdf);
1616
const lines = pdfReader.getLinesFromTexts(texts);
1717
console.log("lines: ", lines.get(0));
1818

19+
const linesInToon = pdfReader.getLinesFromTextsInToon(texts);
20+
console.log("lines in toon: ", linesInToon);
21+
1922
const isScanned = pdfReader.isScanned(texts);
2023
console.log("is pdf scanned: ", isScanned);

jsr.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@snowfluke/ppu-pdf",
3-
"version": "5.6.0",
3+
"version": "5.7.0",
44
"license": "MIT",
55
"exports": "./src/index.ts",
66
"publish": {

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "ppu-pdf",
3-
"version": "5.6.0",
3+
"version": "5.7.0",
44
"description": "Easily extract text from digital PDF files with coordinate and font size included, and optionally group text by lines or render scanned pdf to canvas/png.",
55
"keywords": [
66
"pdf-reader",
@@ -53,6 +53,7 @@
5353
},
5454
"dependencies": {
5555
"@napi-rs/canvas": "^0.1.69",
56+
"@toon-format/toon": "^1.0.0",
5657
"mupdf": "^1.26.2",
5758
"pdfjs-dist": "4.9.155"
5859
}

src/pdf-reader-common.ts

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,19 @@ import { createCanvas, type Canvas } from "@napi-rs/canvas";
22
import { createWriteStream, existsSync, mkdirSync } from "fs";
33
import { join } from "path";
44

5+
import { encode } from "@toon-format/toon";
56
import { CONSTANT } from "./pdf.constant";
67
import {
78
type CompactPageLines,
89
type CompactPdfLine,
910
type CompactPdfWord,
1011
type PageLines,
1112
type PageTexts,
13+
type PageToonLines,
1214
type PdfCompactLineAlgorithm,
1315
type PdfLine,
1416
type PdfScannedThreshold,
17+
type PdfToonLine,
1518
type PdfWord,
1619
} from "./pdf.interface";
1720

@@ -389,4 +392,40 @@ export class PdfReaderCommon {
389392

390393
return isWordsBelowThreshold || isTextLengthBelowThreshold;
391394
}
395+
396+
/**
 * Serialises a page's words into a TOON string.
 *
 * Each word is reduced to its text and a `[x0, y0, x1, y1]` bounding box
 * before being handed to the TOON encoder.
 *
 * @param pdfWords - Words extracted from a single page.
 * @param enableToon - When `false`, encoding is skipped entirely.
 * @returns The TOON-encoded words, or an empty string when TOON is disabled.
 */
protected getToonWords(pdfWords: PdfWord[], enableToon: boolean): string {
  if (enableToon) {
    const compactWords = pdfWords.map(({ text, bbox }) => ({
      text,
      bbox: [bbox.x0, bbox.y0, bbox.x1, bbox.y1],
    }));

    return encode(compactWords);
  }

  return "";
}
406+
407+
protected getLinesFromTextsInToonCommon(
408+
pageTexts: PageTexts,
409+
startIndex = 0
410+
): PageToonLines {
411+
let pageLines: PageToonLines = "";
412+
const numOfPages = pageTexts.size;
413+
414+
for (let i = startIndex; i < numOfPages + startIndex; i++) {
415+
const pdfText = pageTexts.get(i);
416+
let lines: PdfToonLine = {};
417+
if (pdfText) {
418+
lines = this.getLines(pdfText.words).reduce((acc, word, index) => {
419+
acc[`${index}`] = word.words.map((el) => ({
420+
text: el.text,
421+
bbox: [el.bbox.x0, el.bbox.y0, el.bbox.x0, el.bbox.y1],
422+
}));
423+
return acc;
424+
}, {} as PdfToonLine);
425+
}
426+
pageLines += `# Page ${i} lines:\n ${encode(lines)}\n`;
427+
}
428+
429+
return pageLines;
430+
}
392431
}

src/pdf-reader-legacy.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import {
2323
type CompactPageLines,
2424
type PageLines,
2525
type PageTexts,
26+
type PageToonLines,
2627
type PdfCompactLineAlgorithm,
2728
type PdfReaderOptions,
2829
type PdfScannedThreshold,
@@ -203,6 +204,8 @@ export class PdfReaderLegacy extends PdfReaderCommon {
203204
linesMap.set(pageNum, {
204205
words: textsFiltered,
205206
fullText,
207+
confidence: 1,
208+
toon: this.getToonWords(textsFiltered, this.options.enableToon),
206209
});
207210
}
208211

@@ -235,6 +238,8 @@ export class PdfReaderLegacy extends PdfReaderCommon {
235238
linesMap.set(pageNum, {
236239
words: textsFiltered,
237240
fullText,
241+
confidence: ocrResult.confidence,
242+
toon: this.getToonWords(textsFiltered, this.options.enableToon),
238243
});
239244
} catch (error) {
240245
if (this.options.verbose) {
@@ -243,6 +248,8 @@ export class PdfReaderLegacy extends PdfReaderCommon {
243248
linesMap.set(pageNum, {
244249
words: [],
245250
fullText: "",
251+
confidence: 0,
252+
toon: "",
246253
});
247254
}
248255
}
@@ -467,6 +474,15 @@ export class PdfReaderLegacy extends PdfReaderCommon {
467474
return this.getLinesFromTextsCommon(pageTexts, this.startIndex);
468475
}
469476

477+
/**
 * Builds an LLM-friendly TOON string from the extracted page texts.
 *
 * Delegates to the shared implementation, using this reader's `startIndex`
 * as the key of the first page.
 *
 * @param pageTexts - The extracted text data from a PDF.
 * @returns The page lines as a TOON-formatted string.
 */
getLinesFromTextsInToon(pageTexts: PageTexts): PageToonLines {
  const firstPageIndex = this.startIndex;
  const toonLines = this.getLinesFromTextsInToonCommon(
    pageTexts,
    firstPageIndex
  );

  return toonLines;
}
485+
470486
/**
471487
* Converts extracted text into compact structured lines using a specified algorithm.
472488
* @param pageTexts - The extracted text data from a PDF.

src/pdf-reader.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import {
1313
type CompactPageLines,
1414
type PageLines,
1515
type PageTexts,
16+
type PageToonLines,
1617
type PdfCompactLineAlgorithm,
1718
type PdfReaderOptions,
1819
type PdfScannedThreshold,
@@ -207,6 +208,8 @@ export class PdfReader extends PdfReaderCommon {
207208
linesMap.set(pageNum, {
208209
words: textsFiltered,
209210
fullText,
211+
confidence: 1,
212+
toon: this.getToonWords(textsFiltered, this.options.enableToon),
210213
});
211214
}
212215

@@ -239,6 +242,8 @@ export class PdfReader extends PdfReaderCommon {
239242
linesMap.set(pageNum, {
240243
words: textsFiltered,
241244
fullText,
245+
confidence: ocrResult.confidence,
246+
toon: this.getToonWords(textsFiltered, this.options.enableToon),
242247
});
243248
} catch (error) {
244249
if (this.options.verbose) {
@@ -247,6 +252,8 @@ export class PdfReader extends PdfReaderCommon {
247252
linesMap.set(pageNum, {
248253
words: [],
249254
fullText: "",
255+
confidence: 0,
256+
toon: "",
250257
});
251258
}
252259
}
@@ -446,6 +453,15 @@ export class PdfReader extends PdfReaderCommon {
446453
return this.getLinesFromTextsCommon(pageTexts, this.startIndex);
447454
}
448455

456+
/**
 * Builds an LLM-friendly TOON string from the extracted page texts.
 *
 * Delegates to the shared implementation, using this reader's `startIndex`
 * as the key of the first page.
 *
 * @param pageTexts - The extracted text data from a PDF.
 * @returns The page lines as a TOON-formatted string.
 */
getLinesFromTextsInToon(pageTexts: PageTexts): PageToonLines {
  const firstPageIndex = this.startIndex;
  const toonLines = this.getLinesFromTextsInToonCommon(
    pageTexts,
    firstPageIndex
  );

  return toonLines;
}
464+
449465
/**
450466
* Converts extracted text into compact structured lines using a specified algorithm.
451467
* @param pageTexts - The extracted text data from a PDF.

src/pdf.constant.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,7 @@ export const PDF_READER_DEFAULT_OPTIONS: PdfReaderOptions = {
5050

5151
/** List of fonts to be used in the document. */
5252
fonts: [],
53+
54+
/** Whether to enable TOON (Token-Oriented Object Notation) extraction for LLM-friendly text. */
55+
enableToon: false,
5356
};

src/pdf.interface.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,17 @@ export interface PdfWord {
8080
metadata: PdfMetadata;
8181
}
8282

83+
/**
84+
* Represents a single word extracted from a PDF in TOON Format.
85+
*/
86+
export interface PdfToonWord {
  /** Text content of the word. */
  text: string;

  /** Bounding box of the word, in the order [x0, y0, x1, y1]. */
  bbox: [x0: number, y0: number, x1: number, y1: number];
}
93+
8394
/**
8495
* Represents all words extracted from a page.
8596
*/
@@ -89,6 +100,12 @@ export interface PdfTexts {
89100

90101
/** The full text of extracted words */
91102
fullText: string;
103+
104+
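/** Confidence of the page's text extraction: 1 for text read without OCR, the OCR result's confidence for scanned pages, and 0 when extraction fails. */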
105+
confidence: number;
106+
107+
/** The extracted words encoded in TOON notation for LLM-friendly input; an empty string when `enableToon` is off. */
108+
toon: string;
92109
}
93110

94111
/**
@@ -111,6 +128,11 @@ export interface PdfLine {
111128
words: PdfWord[];
112129
}
113130

131+
/**
132+
* Maps a line index (as a string key) to the words on that line, ready to be encoded in TOON format.
133+
*/
134+
export type PdfToonLine = Record<string, PdfToonWord[]>;
135+
114136
/**
115137
* Represents a mapping of page numbers to their corresponding extracted texts.
116138
*/
@@ -121,6 +143,11 @@ export type PageTexts = Map<number, PdfTexts>;
121143
*/
122144
export type PageLines = Map<number, PdfLine[]>;
123145

146+
/**
147+
* Represents the lines of all pages as a single TOON-formatted string.
148+
*/
149+
export type PageToonLines = string;
150+
124151
/**
125152
* Represents a compact version of a PDF word with only text and bounding box.
126153
*/
@@ -200,6 +227,9 @@ export interface PdfReaderOptions {
200227

201228
/** List of fonts used in the document. */
202229
fonts: PdfFont[];
230+
231+
/** Whether to enable TOON (Token-Oriented Object Notation) extraction for LLM-friendly text. */
232+
enableToon: boolean;
203233
}
204234

205235
/**

0 commit comments

Comments
 (0)