Skip to content

Table Title and Table content separate chunks: Merge contents of parent_id and element.id #3012

Open
@weissenbacherpwc

Description

@weissenbacherpwc

Hi,

I am using partition and chunk_by_title to chunk my PDFs. It generally works, but when I inspected the chunks I saw that whenever a document contains a table, the table's title always ends up in one chunk and the table's actual content in a separate chunk, which I think is not optimal.

E.g. see this example with a pptx-file:

# Print the content and metadata of every document whose filetype is PPTX.
PPTX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.presentation"

test = pptx_reader("my_file.pptx")
for doc in test:
    # Skip anything that is not tagged with the PowerPoint MIME type.
    if doc.metadata.get("filetype") != PPTX_MIME_TYPE:
        continue
    print(doc.page_content)
    print(doc.metadata)
    print("+++++++++++++++++++++++++")

Prints:
+++++++++++++++++++++++++
RAG Evaluation: RAGAS
{'file_directory': '...', 'filename': '301123_genai_präsentation.pptx', 'filetype': '...', 'last_modified': '2023-11-30T10:26:30', 'page_number': 15, 'source': '301123_genai_präsentation.pptx', 'source_documents': '301123_genai_präsentation.pptx', 'page': 15}
+++++++++++++++++++++++++
Retrieval Generation
Model Context Recall Context Precision Faithfulness
Llama 2-Chat 0.86 0.58 0.91
LeoLM-Chat 0.86 0.58 0.81
LeoLM-Mistral-Chat 0.86 0.58 0.87
EM German Leo Mistral 0.86 0.58 0.82
Llama-German-Assistant 0.86 0.58 0.91
{'file_directory': '...', 'filename': '301123_genai_präsentation.pptx', 'last_modified': '2023-11-30T10:26:30', 'page_number': 15, 'parent_id': 'a9e22a24894f5c1dbe9b0b66251bbbc2', 'filetype': '...', 'source': '301123_genai_präsentation.pptx', 'source_documents': '301123_genai_präsentation.pptx', 'page': 15}

Question
I see a parent_id key in the metadata of the second output. How can I merge the content of the first output (the table heading) with the second output (the table content), so that everything ends up in one chunk like this:
RAG Evaluation: RAGAS
Retrieval Generation
Model Context Recall Context Precision Faithfulness
Llama 2-Chat 0.86 0.58 0.91
LeoLM-Chat 0.86 0.58 0.81
LeoLM-Mistral-Chat 0.86 0.58 0.87
EM German Leo Mistral 0.86 0.58 0.82
Llama-German-Assistant 0.86 0.58 0.91

Here is the full code:

import os
import yaml
import box
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.md import partition_md
from unstructured.partition.pptx import partition_pptx
from unstructured.partition.docx import partition_docx
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.xlsx import partition_xlsx
from unstructured.partition.html import partition_html
from langchain_core.documents import Document
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
import re

def filter_elements(elements):
    """Drop table-of-contents content, headers, and footers from elements.

    An element is removed when any of the following holds:

    * its ``metadata.parent_id`` equals the ``id`` of a ``"Title"`` element
      whose text is one of the known table-of-contents style headings,
    * its text contains a run of at least 50 consecutive dots (the dot
      leaders typically found in table-of-contents lines),
    * its category is ``"Header"`` or ``"Footer"``.

    The matching title elements themselves are kept; only elements that
    point at them through ``parent_id`` are dropped.

    Args:
        elements: Iterable of elements exposing ``text``, ``category``,
            ``id`` and ``metadata.parent_id``.

    Returns:
        A new list containing the surviving elements in their original order.
    """
    # Headings that introduce a table of contents / list of figures or tables.
    toc_titles = {
        "Inhaltsverzeichnis",
        "Inhalt",
        "Structure",
        "Agenda",
        "Abbildungsverzeichnis",
        "Gliederung",
        "Tabellenverzeichnis",
    }
    dot_leader = "." * 50
    dropped_categories = ("Header", "Footer")

    # Two passes are needed (collect ids, then filter), so materialize the
    # input once; this also makes one-shot iterators work.
    elements = list(elements)

    # Ids of *all* title elements whose text is a known TOC heading. A set
    # keeps the per-element membership test O(1).
    toc_title_ids = {
        el.id
        for el in elements
        if el.text in toc_titles and el.category == "Title"
    }

    return [
        el
        for el in elements
        if el.metadata.parent_id not in toc_title_ids
        and dot_leader not in el.text
        and el.category not in dropped_categories
    ]

def chunk_elements_by_title(elements):
    """Chunk ``elements`` with ``chunk_by_title`` using limits read from ``cfg``.

    ``cfg`` is defined outside this function; the three attributes read from
    it are ``UNSTRUCTURED_COMBINE_TEXT_UNDER_N_CHARS``,
    ``UNSTRUCTURED_MAX_CHARACTERS`` and ``UNSTRUCTURED_NEW_AFTER_N_CHARS``.

    Returns:
        Whatever ``chunk_by_title`` returns for the given elements.
    """
    chunking_options = {
        "combine_text_under_n_chars": cfg.UNSTRUCTURED_COMBINE_TEXT_UNDER_N_CHARS,
        "max_characters": cfg.UNSTRUCTURED_MAX_CHARACTERS,
        "new_after_n_chars": cfg.UNSTRUCTURED_NEW_AFTER_N_CHARS,
    }
    return chunk_by_title(elements, **chunking_options)

def html_reader(filename):
    """Partition an HTML file, filter its elements, and chunk them by title."""
    partitioned = partition_html(filename=filename, mode="elements")
    kept = filter_elements(partitioned)
    return chunk_elements_by_title(kept)

def powerpoint_reader(filename):
    """Partition a PPTX file and chunk it by title with default settings.

    Unlike the HTML, Markdown, and Word readers, no element filtering is
    applied and ``chunk_by_title`` is called without the ``cfg`` limits.
    """
    slide_elements = partition_pptx(filename=filename)
    return chunk_by_title(slide_elements)
    
def markdown_reader(filename):
    """Partition a Markdown file, filter its elements, and chunk them by title."""
    partitioned = partition_md(filename=filename)
    kept = filter_elements(partitioned)
    return chunk_elements_by_title(kept)
    
def excel_reader(filename):
    """Partition an XLSX file and chunk it by title with default settings.

    No element filtering is applied and ``chunk_by_title`` is called without
    the ``cfg`` limits.
    """
    sheet_elements = partition_xlsx(filename=filename)
    return chunk_by_title(sheet_elements)
    
def word_reader(filename):
    """Partition a DOCX file, filter its elements, and chunk them by title."""
    partitioned = partition_docx(filename=filename)
    kept = filter_elements(partitioned)
    return chunk_elements_by_title(kept)

def pdf_reader(filename, llm):
    """Partition, filter, and title-chunk a PDF when chunking is enabled.

    Args:
        filename: Passed to ``partition_pdf`` as the file to read.
        llm: Not used by this function; kept so existing callers keep working.

    Returns:
        The chunked elements, or an empty list when
        ``cfg.UNSTRUCTURED_CHUNKING_ACTIVATED`` is falsy.
    """
    # Without this guard the disabled path reached ``return pdf_elements``
    # with the name never assigned and raised UnboundLocalError.
    # NOTE(review): an empty list is assumed to be the right "disabled"
    # result — confirm against the callers.
    if not cfg.UNSTRUCTURED_CHUNKING_ACTIVATED:
        return []

    pdf_elements = partition_pdf(
        filename=filename,
        strategy="hi_res",
        infer_table_structure=True,
        languages=["eng", "deu"],
    )
    pdf_elements = filter_elements(pdf_elements)
    pdf_elements = chunk_elements_by_title(pdf_elements)
    print(f'PDF Chunking of file {filename} Done')
    return pdf_elements

Metadata

Metadata

Assignees

No one assigned

    Labels

    chunking (Related to element chunking.), enhancement (New feature or request)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions