Skip to content

Commit fab05db

Browse files
Improve typing, add page number to nodes and a bit of clean up
1 parent c608b96 commit fab05db

File tree

5 files changed: +138 -51 lines changed (138 additions, 51 deletions)

node-zerox/src/index.ts

+8 -3
Original file line number | Diff line number | Diff line change
@@ -97,7 +97,7 @@ export const zerox = async ({
9797

9898
if (maintainFormat) {
9999
// Use synchronous processing
100-
for (const image of images) {
100+
for (const [idx, image] of images.entries()) {
101101
const imagePath = path.join(tempDirectory, image);
102102
try {
103103
const { content, inputTokens, outputTokens } = await getCompletion({
@@ -106,6 +106,7 @@ export const zerox = async ({
106106
llmParams,
107107
maintainFormat,
108108
model,
109+
pageNumber: idx + 1,
109110
priorPage,
110111
});
111112
const formattedMarkdown = formatMarkdown(content);
@@ -124,7 +125,10 @@ export const zerox = async ({
124125
}
125126
} else {
126127
// Process in parallel with a limit on concurrent pages
127-
const processPage = async (image: string): Promise<string | null> => {
128+
const processPage = async (
129+
image: string,
130+
pageNumber: number
131+
): Promise<string | null> => {
128132
const imagePath = path.join(tempDirectory, image);
129133
try {
130134
const { content, inputTokens, outputTokens } = await getCompletion({
@@ -133,6 +137,7 @@ export const zerox = async ({
133137
llmParams,
134138
maintainFormat,
135139
model,
140+
pageNumber,
136141
priorPage,
137142
});
138143
const formattedMarkdown = formatMarkdown(content);
@@ -156,7 +161,7 @@ export const zerox = async ({
156161

157162
const promises = images.map((image, index) =>
158163
limit(() =>
159-
processPage(image).then((result) => {
164+
processPage(image, index + 1).then((result) => {
160165
results[index] = result;
161166
})
162167
)

node-zerox/src/openAI.ts

+16 -3
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,9 @@
11
import { CompletionArgs, CompletionResponse } from "./types";
2-
import { convertKeysToSnakeCase, encodeImageToBase64, markdownToJson } from "./utils";
2+
import {
3+
convertKeysToSnakeCase,
4+
encodeImageToBase64,
5+
markdownToJson,
6+
} from "./utils";
37
import axios from "axios";
48

59
export const getCompletion = async ({
@@ -8,6 +12,7 @@ export const getCompletion = async ({
812
llmParams,
913
maintainFormat,
1014
model,
15+
pageNumber,
1116
priorPage,
1217
}: CompletionArgs): Promise<CompletionResponse> => {
1318
const systemPrompt = `
@@ -58,13 +63,21 @@ export const getCompletion = async ({
5863

5964
const data = response.data;
6065

61-
const jsonOutput = await markdownToJson(data.choices[0].message.content);
62-
console.log("====>>>>", JSON.stringify(jsonOutput));
66+
const jsonOutput = await markdownToJson(
67+
data.choices[0].message.content,
68+
pageNumber
69+
);
70+
71+
// TODO: remove this
72+
// Only for development
73+
console.log('======')
74+
console.log(JSON.stringify(jsonOutput));
6375

6476
return {
6577
content: data.choices[0].message.content,
6678
inputTokens: data.usage.prompt_tokens,
6779
outputTokens: data.usage.completion_tokens,
80+
structuredContent: jsonOutput,
6881
};
6982
} catch (err) {
7083
console.error("Error in OpenAI completion", err);

node-zerox/src/types.ts

+51 -0
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ export interface CompletionResponse {
3636
content: string;
3737
inputTokens: number;
3838
outputTokens: number;
39+
structuredContent: ProcessedNode[];
3940
}
4041

4142
export interface CompletionArgs {
@@ -44,6 +45,7 @@ export interface CompletionArgs {
4445
llmParams?: LLMParams;
4546
maintainFormat: boolean;
4647
model: ModelOptions | string;
48+
pageNumber: number;
4749
priorPage: string;
4850
}
4951

@@ -54,3 +56,52 @@ export interface LLMParams {
5456
temperature?: number;
5557
topP?: number;
5658
}
59+
60+
export enum MdNodeType {
61+
break = "break",
62+
heading = "heading",
63+
list = "list",
64+
paragraph = "paragraph",
65+
strong = "strong",
66+
table = "table",
67+
text = "text",
68+
thematicBreak = "thematicBreak",
69+
}
70+
71+
export enum ConvertedNodeType {
72+
heading = "heading",
73+
list = "list",
74+
text = "text",
75+
}
76+
export interface BaseNode {
77+
id: string;
78+
page?: number;
79+
parentId?: string;
80+
}
81+
82+
export interface TextNode extends BaseNode {
83+
type: ConvertedNodeType.text;
84+
value: string;
85+
}
86+
87+
export interface HeadingNode extends BaseNode {
88+
type: ConvertedNodeType.heading;
89+
value: string;
90+
}
91+
92+
export interface ListNode extends BaseNode {
93+
type: ConvertedNodeType.list;
94+
value: ListItem[];
95+
}
96+
97+
export interface ListItem {
98+
id: string;
99+
value: string;
100+
}
101+
102+
export type ProcessedNode = TextNode | HeadingNode | ListNode;
103+
104+
export interface ParentId {
105+
depth: number;
106+
id: string;
107+
}

node-zerox/src/utils.ts

+61 -44
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,12 @@
11
import { convert } from "libreoffice-convert";
22
import { fromPath } from "pdf2pic";
3-
import { LLMParams } from "./types";
3+
import {
4+
ConvertedNodeType,
5+
LLMParams,
6+
MdNodeType,
7+
ParentId,
8+
ProcessedNode,
9+
} from "./types";
410
import { pipeline } from "stream/promises";
511
import { promisify } from "util";
612
import * as Tesseract from "tesseract.js";
@@ -313,18 +319,13 @@ export const convertKeysToSnakeCase = (
313319
);
314320
};
315321

316-
interface ProcessedNode {
317-
id: string;
318-
parentId: string | undefined;
319-
type: string;
320-
value: any;
321-
}
322-
interface parentId {
323-
id: string;
324-
depth: number;
325-
}
326-
327-
export const markdownToJson = async (markdownString: string) => {
322+
/**
323+
*
324+
* @param markdownString String - Markdown text
325+
* @param page Number - Page number
326+
* @returns ProcessedNode[] - Array of processed nodes
327+
*/
328+
export const markdownToJson = async (markdownString: string, page: number) => {
328329
/**
329330
* Bypassing typescript transpiler using eval to use dynamic imports
330331
*
@@ -341,83 +342,99 @@ export const markdownToJson = async (markdownString: string) => {
341342

342343
console.log(JSON.stringify(parsedMd));
343344

344-
const parentIdManager: parentId[] = [];
345+
const parentIdManager: ParentId[] = [];
345346

346-
const jsonObj: ProcessedNode[] = [];
347-
parsedMd.children.forEach((node: any) => {
348-
const isHeading = node.type === "heading";
347+
const processedNodes: ProcessedNode[] = [];
348+
parsedMd.children.forEach((sourceNode: any) => {
349+
const isHeading = sourceNode.type === MdNodeType.heading;
349350

350-
if (isHeading && node.depth <= (parentIdManager.at(-1)?.depth || 0)) {
351+
if (isHeading && sourceNode.depth <= (parentIdManager.at(-1)?.depth || 0)) {
351352
for (let i = parentIdManager.length; i > 0; i--) {
352353
parentIdManager.pop();
353-
if (node.depth > (parentIdManager.at(-1)?.depth || 0)) {
354+
if (sourceNode.depth > (parentIdManager.at(-1)?.depth || 0)) {
354355
break;
355356
}
356357
}
357358
}
358-
const processedNode = processNode(node, parentIdManager.at(-1)?.id);
359+
const processedNode = processNode(
360+
sourceNode,
361+
page,
362+
parentIdManager.at(-1)?.id
363+
);
359364

360365
if (isHeading) {
361-
parentIdManager.push({ id: processedNode[0].id, depth: node.depth });
366+
parentIdManager.push({
367+
id: processedNode[0].id,
368+
depth: sourceNode.depth,
369+
});
362370
}
363371

364-
jsonObj.push(...processedNode);
372+
processedNodes.push(...processedNode);
365373
});
366374

367-
return jsonObj;
368-
};
369-
370-
const type: Record<string, string> = {
371-
heading: "heading",
372-
text: "text",
373-
list: "list",
375+
return processedNodes;
374376
};
375377

376-
const processNode = (node: any, parentId?: string): ProcessedNode[] => {
378+
const processNode = (
379+
node: any,
380+
page: number,
381+
parentId?: string
382+
): ProcessedNode[] => {
377383
let value: any;
378384
let siblingNodes: ProcessedNode[] = [];
379385

380-
if (node.type === "heading") {
381-
value = node.children
382-
.map((childNode: any) => processText(childNode))
383-
.join(" ");
384-
} else if (node.type === "paragraph") {
386+
if (
387+
node.type === MdNodeType.heading ||
388+
node.type === MdNodeType.paragraph ||
389+
node.type === MdNodeType.strong
390+
) {
385391
value = node.children
386392
.map((childNode: any) => processText(childNode))
387393
.join(" ");
388-
} else if (node.type === "list") {
394+
} else if (node.type === MdNodeType.list) {
389395
const processedNodes = node.children.map((childNode: any) =>
390-
processListItem(childNode)
396+
processListItem(childNode, page)
391397
);
392398
value = [];
393399
processedNodes.forEach((pn: any) => {
394400
value.push(...pn.node);
401+
402+
// Store nested list nodes
395403
siblingNodes.push(...pn.siblings);
396404
});
397405
}
398406

399407
return [
400408
{
401409
id: nanoid(),
410+
page,
402411
parentId,
403-
type: type[node.type as string] || type.text,
412+
type:
413+
ConvertedNodeType[node.type as ConvertedNodeType] ||
414+
ConvertedNodeType.text,
404415
value,
405416
},
406417
...(siblingNodes || []),
407418
];
408419
};
409420

421+
const ignoreNodeTypes = new Set([MdNodeType.break, MdNodeType.thematicBreak]);
422+
410423
const processText = (node: any) => {
411-
return node.value;
424+
if (ignoreNodeTypes.has(node.type)) return "";
425+
426+
return node.type === MdNodeType.text
427+
? node.value
428+
: node.children.map((child: any) => processText(child)).join(" ");
412429
};
413430

414-
const processListItem = (node: any) => {
431+
const processListItem = (node: any, page: number) => {
415432
let newNode: ProcessedNode[] = [];
416433
let siblings: ProcessedNode[] = [];
417434

418435
node.children.forEach((childNode: any) => {
419-
if (childNode.type !== "list") {
420-
const processedNode = processNode(childNode);
436+
if (childNode.type !== MdNodeType.list) {
437+
const processedNode = processNode(childNode, page);
421438
if (newNode.length > 0) {
422439
newNode[0].value += processedNode.map(({ value }) => value).join(", ");
423440
} else {
@@ -429,13 +446,13 @@ const processListItem = (node: any) => {
429446
newNode = [
430447
{
431448
id: nanoid(),
432-
type: "text",
449+
type: ConvertedNodeType.text,
433450
value: "",
434451
parentId: undefined,
435452
},
436453
];
437454
}
438-
const processedNode = processNode(childNode, newNode[0].id);
455+
const processedNode = processNode(childNode, page, newNode[0].id);
439456
siblings.push(...processedNode);
440457
}
441458
});

node-zerox/tsconfig.json

+2 -1
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,8 @@
66
"outDir": "./dist",
77
"strict": true,
88
"esModuleInterop": true,
9-
"skipLibCheck": true
9+
"skipLibCheck": true,
10+
"downlevelIteration": true,
1011
},
1112
"include": ["src/**/*"],
1213
"exclude": ["node_modules", "**/*.test.ts"]

0 commit comments

Comments
 (0)