1
1
import { convert } from "libreoffice-convert" ;
2
2
import { fromPath } from "pdf2pic" ;
3
- import { LLMParams } from "./types" ;
3
+ import {
4
+ ConvertedNodeType ,
5
+ LLMParams ,
6
+ MdNodeType ,
7
+ ParentId ,
8
+ ProcessedNode ,
9
+ } from "./types" ;
4
10
import { pipeline } from "stream/promises" ;
5
11
import { promisify } from "util" ;
6
12
import * as Tesseract from "tesseract.js" ;
@@ -313,18 +319,13 @@ export const convertKeysToSnakeCase = (
313
319
) ;
314
320
} ;
315
321
316
- interface ProcessedNode {
317
- id : string ;
318
- parentId : string | undefined ;
319
- type : string ;
320
- value : any ;
321
- }
322
- interface parentId {
323
- id : string ;
324
- depth : number ;
325
- }
326
-
327
- export const markdownToJson = async ( markdownString : string ) => {
322
+ /**
323
+ *
324
+ * @param markdownString String - Markdown text
325
+ * @param page Number - Page number
326
+ * @returns ProcessedNode[] - Array of processed nodes
327
+ */
328
+ export const markdownToJson = async ( markdownString : string , page : number ) => {
328
329
/**
329
330
* Bypassing typescript transpiler using eval to use dynamic imports
330
331
*
@@ -341,83 +342,99 @@ export const markdownToJson = async (markdownString: string) => {
341
342
342
343
console . log ( JSON . stringify ( parsedMd ) ) ;
343
344
344
- const parentIdManager : parentId [ ] = [ ] ;
345
+ const parentIdManager : ParentId [ ] = [ ] ;
345
346
346
- const jsonObj : ProcessedNode [ ] = [ ] ;
347
- parsedMd . children . forEach ( ( node : any ) => {
348
- const isHeading = node . type === " heading" ;
347
+ const processedNodes : ProcessedNode [ ] = [ ] ;
348
+ parsedMd . children . forEach ( ( sourceNode : any ) => {
349
+ const isHeading = sourceNode . type === MdNodeType . heading ;
349
350
350
- if ( isHeading && node . depth <= ( parentIdManager . at ( - 1 ) ?. depth || 0 ) ) {
351
+ if ( isHeading && sourceNode . depth <= ( parentIdManager . at ( - 1 ) ?. depth || 0 ) ) {
351
352
for ( let i = parentIdManager . length ; i > 0 ; i -- ) {
352
353
parentIdManager . pop ( ) ;
353
- if ( node . depth > ( parentIdManager . at ( - 1 ) ?. depth || 0 ) ) {
354
+ if ( sourceNode . depth > ( parentIdManager . at ( - 1 ) ?. depth || 0 ) ) {
354
355
break ;
355
356
}
356
357
}
357
358
}
358
- const processedNode = processNode ( node , parentIdManager . at ( - 1 ) ?. id ) ;
359
+ const processedNode = processNode (
360
+ sourceNode ,
361
+ page ,
362
+ parentIdManager . at ( - 1 ) ?. id
363
+ ) ;
359
364
360
365
if ( isHeading ) {
361
- parentIdManager . push ( { id : processedNode [ 0 ] . id , depth : node . depth } ) ;
366
+ parentIdManager . push ( {
367
+ id : processedNode [ 0 ] . id ,
368
+ depth : sourceNode . depth ,
369
+ } ) ;
362
370
}
363
371
364
- jsonObj . push ( ...processedNode ) ;
372
+ processedNodes . push ( ...processedNode ) ;
365
373
} ) ;
366
374
367
- return jsonObj ;
368
- } ;
369
-
370
- const type : Record < string , string > = {
371
- heading : "heading" ,
372
- text : "text" ,
373
- list : "list" ,
375
+ return processedNodes ;
374
376
} ;
375
377
376
- const processNode = ( node : any , parentId ?: string ) : ProcessedNode [ ] => {
378
+ const processNode = (
379
+ node : any ,
380
+ page : number ,
381
+ parentId ?: string
382
+ ) : ProcessedNode [ ] => {
377
383
let value : any ;
378
384
let siblingNodes : ProcessedNode [ ] = [ ] ;
379
385
380
- if ( node . type === "heading" ) {
381
- value = node . children
382
- . map ( ( childNode : any ) => processText ( childNode ) )
383
- . join ( " " ) ;
384
- } else if ( node . type === "paragraph" ) {
386
+ if (
387
+ node . type === MdNodeType . heading ||
388
+ node . type === MdNodeType . paragraph ||
389
+ node . type === MdNodeType . strong
390
+ ) {
385
391
value = node . children
386
392
. map ( ( childNode : any ) => processText ( childNode ) )
387
393
. join ( " " ) ;
388
- } else if ( node . type === " list" ) {
394
+ } else if ( node . type === MdNodeType . list ) {
389
395
const processedNodes = node . children . map ( ( childNode : any ) =>
390
- processListItem ( childNode )
396
+ processListItem ( childNode , page )
391
397
) ;
392
398
value = [ ] ;
393
399
processedNodes . forEach ( ( pn : any ) => {
394
400
value . push ( ...pn . node ) ;
401
+
402
+ // Store nested list nodes
395
403
siblingNodes . push ( ...pn . siblings ) ;
396
404
} ) ;
397
405
}
398
406
399
407
return [
400
408
{
401
409
id : nanoid ( ) ,
410
+ page,
402
411
parentId,
403
- type : type [ node . type as string ] || type . text ,
412
+ type :
413
+ ConvertedNodeType [ node . type as ConvertedNodeType ] ||
414
+ ConvertedNodeType . text ,
404
415
value,
405
416
} ,
406
417
...( siblingNodes || [ ] ) ,
407
418
] ;
408
419
} ;
409
420
421
+ const ignoreNodeTypes = new Set ( [ MdNodeType . break , MdNodeType . thematicBreak ] ) ;
422
+
410
423
const processText = ( node : any ) => {
411
- return node . value ;
424
+ if ( ignoreNodeTypes . has ( node . type ) ) return "" ;
425
+
426
+ return node . type === MdNodeType . text
427
+ ? node . value
428
+ : node . children . map ( ( child : any ) => processText ( child ) ) . join ( " " ) ;
412
429
} ;
413
430
414
- const processListItem = ( node : any ) => {
431
+ const processListItem = ( node : any , page : number ) => {
415
432
let newNode : ProcessedNode [ ] = [ ] ;
416
433
let siblings : ProcessedNode [ ] = [ ] ;
417
434
418
435
node . children . forEach ( ( childNode : any ) => {
419
- if ( childNode . type !== " list" ) {
420
- const processedNode = processNode ( childNode ) ;
436
+ if ( childNode . type !== MdNodeType . list ) {
437
+ const processedNode = processNode ( childNode , page ) ;
421
438
if ( newNode . length > 0 ) {
422
439
newNode [ 0 ] . value += processedNode . map ( ( { value } ) => value ) . join ( ", " ) ;
423
440
} else {
@@ -429,13 +446,13 @@ const processListItem = (node: any) => {
429
446
newNode = [
430
447
{
431
448
id : nanoid ( ) ,
432
- type : " text" ,
449
+ type : ConvertedNodeType . text ,
433
450
value : "" ,
434
451
parentId : undefined ,
435
452
} ,
436
453
] ;
437
454
}
438
- const processedNode = processNode ( childNode , newNode [ 0 ] . id ) ;
455
+ const processedNode = processNode ( childNode , page , newNode [ 0 ] . id ) ;
439
456
siblings . push ( ...processedNode ) ;
440
457
}
441
458
} ) ;
0 commit comments