@@ -276,9 +276,11 @@ class HTMLReader(
276276
277277 private def extractElements (root : Node ): Array [HTMLElement ] = {
278278 var sentenceIndex = 0
279+ var paragraphIndex = 0
279280 val elements = ArrayBuffer [HTMLElement ]()
280281 val trackingNodes = mutable.Map [Node , NodeMetadata ]()
281282 var pageNumber = 1
283+ val paragraphSpacingY = 25
282284
283285 // Track parent-child hierarchy
284286 var currentParentId : Option [String ] = None
@@ -428,6 +430,10 @@ class HTMLReader(
428430 case " a" =>
429431 pageMetadata(" sentence" ) = sentenceIndex.toString
430432 sentenceIndex += 1
433+ pageMetadata(" paragraph_index" ) = paragraphIndex.toString
434+ pageMetadata(" paragraph_y" ) = (paragraphIndex * paragraphSpacingY).toString
435+ pageMetadata(" page_y" ) = (paragraphIndex * paragraphSpacingY).toString
436+ paragraphIndex += 1
431437 val href = element.attr(" href" ).trim
432438 val linkText = element.text().trim
433439 if (href.nonEmpty && linkText.nonEmpty && ! visitedNode) {
@@ -443,6 +449,10 @@ class HTMLReader(
443449 case " table" =>
444450 pageMetadata(" sentence" ) = sentenceIndex.toString
445451 sentenceIndex += 1
452+ pageMetadata(" paragraph_index" ) = paragraphIndex.toString
453+ pageMetadata(" paragraph_y" ) = (paragraphIndex * paragraphSpacingY).toString
454+ pageMetadata(" page_y" ) = (paragraphIndex * paragraphSpacingY).toString
455+ paragraphIndex += 1
446456 val tableContent = outputFormat match {
447457 case " plain-text" => extractNestedTableContent(element).trim
448458 case " html-table" =>
@@ -474,6 +484,10 @@ class HTMLReader(
474484 case " li" =>
475485 pageMetadata(" sentence" ) = sentenceIndex.toString
476486 sentenceIndex += 1
487+ pageMetadata(" paragraph_index" ) = paragraphIndex.toString
488+ pageMetadata(" paragraph_y" ) = (paragraphIndex * paragraphSpacingY).toString
489+ pageMetadata(" page_y" ) = (paragraphIndex * paragraphSpacingY).toString
490+ paragraphIndex += 1
477491 val itemText = element.text().trim
478492 if (itemText.nonEmpty && ! visitedNode) {
479493 trackingNodes(element).visited = true
@@ -493,6 +507,10 @@ class HTMLReader(
493507 if (codeText.nonEmpty && ! visitedNode) {
494508 pageMetadata(" sentence" ) = sentenceIndex.toString
495509 sentenceIndex += 1
510+ pageMetadata(" paragraph_index" ) = paragraphIndex.toString
511+ pageMetadata(" paragraph_y" ) = (paragraphIndex * paragraphSpacingY).toString
512+ pageMetadata(" page_y" ) = (paragraphIndex * paragraphSpacingY).toString
513+ paragraphIndex += 1
496514 trackingNodes(element).visited = true
497515 pageMetadata(" element_id" ) = newUUID()
498516 currentParentId.foreach(pid => pageMetadata(" parent_id" ) = pid)
@@ -519,6 +537,9 @@ class HTMLReader(
519537 sentenceIndex += 1
520538 trackingNodes(element).visited = true
521539 pageMetadata(" element_id" ) = newUUID()
540+ pageMetadata(" paragraph_index" ) = paragraphIndex.toString
541+ pageMetadata(" paragraph_y" ) = (paragraphIndex * paragraphSpacingY).toString
542+ pageMetadata(" page_y" ) = (paragraphIndex * paragraphSpacingY).toString
522543 currentParentId.foreach(pid => pageMetadata(" parent_id" ) = pid)
523544 elements += HTMLElement (
524545 ElementType .NARRATIVE_TEXT ,
@@ -534,11 +555,15 @@ class HTMLReader(
534555 trackingNodes(element).visited = true
535556 val titleId = newUUID()
536557 pageMetadata(" element_id" ) = titleId
558+ pageMetadata(" paragraph_index" ) = paragraphIndex.toString
559+ pageMetadata(" paragraph_y" ) = (paragraphIndex * paragraphSpacingY).toString
560+ pageMetadata(" page_y" ) = (paragraphIndex * paragraphSpacingY).toString
537561 elements += HTMLElement (
538562 ElementType .TITLE ,
539563 content = titleText,
540564 metadata = pageMetadata)
541565 currentParentId = Some (titleId)
566+ paragraphIndex += 1
542567 }
543568
544569 case ElementType .UNCATEGORIZED_TEXT =>
@@ -548,11 +573,15 @@ class HTMLReader(
548573 sentenceIndex += 1
549574 trackingNodes(element).visited = true
550575 pageMetadata(" element_id" ) = newUUID()
576+ pageMetadata(" paragraph_index" ) = paragraphIndex.toString
577+ pageMetadata(" paragraph_y" ) = (paragraphIndex * paragraphSpacingY).toString
578+ pageMetadata(" page_y" ) = (paragraphIndex * paragraphSpacingY).toString
551579 currentParentId.foreach(pid => pageMetadata(" parent_id" ) = pid)
552580 elements += HTMLElement (
553581 ElementType .UNCATEGORIZED_TEXT ,
554582 content = text,
555583 metadata = pageMetadata)
584+ paragraphIndex += 1
556585 }
557586 }
558587 }
@@ -565,6 +594,10 @@ class HTMLReader(
565594 sentenceIndex += 1
566595 val titleId = newUUID()
567596 pageMetadata(" element_id" ) = titleId
597+ pageMetadata(" paragraph_index" ) = paragraphIndex.toString
598+ pageMetadata(" paragraph_y" ) = (paragraphIndex * paragraphSpacingY).toString
599+ pageMetadata(" page_y" ) = (paragraphIndex * paragraphSpacingY).toString
600+ paragraphIndex += 1
568601 elements += HTMLElement (
569602 ElementType .TITLE ,
570603 content = titleText,
@@ -585,6 +618,10 @@ class HTMLReader(
585618 if (divText.nonEmpty) {
586619 pageMetadata(" sentence" ) = sentenceIndex.toString
587620 sentenceIndex += 1
621+ pageMetadata(" paragraph_index" ) = paragraphIndex.toString
622+ pageMetadata(" paragraph_y" ) = (paragraphIndex * paragraphSpacingY).toString
623+ pageMetadata(" page_y" ) = (paragraphIndex * paragraphSpacingY).toString
624+ paragraphIndex += 1
588625 trackingNodes(element).visited = true
589626 pageMetadata(" element_id" ) = newUUID()
590627 currentParentId.foreach(pid => pageMetadata(" parent_id" ) = pid)
0 commit comments