Description
Hi,
I am using partition and chunk_by_title to chunk my PDFs. It generally works, but when I inspected the chunks I saw that if there is a table in one of my documents, the title of the table is always one chunk and the actual content of the table is a separate chunk, which I think is not optimal.
E.g. see this example with a pptx-file:
# Example: read a presentation and print the content and metadata of the
# items whose metadata "filetype" is the PPTX MIME type.
# NOTE(review): indentation was lost in this paste; presumably the `if` and
# the `print` calls are nested under the `for` loop — confirm the exact nesting.
# NOTE(review): `pptx_reader` is not defined in the code visible here — confirm
# where it comes from and what item type it returns.
test = pptx_reader("my_file.pptx")
for i in test:
if i.metadata.get("filetype") == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
print(i.page_content)
print(i.metadata)
print("+++++++++++++++++++++++++")
Prints:
+++++++++++++++++++++++++
RAG Evaluation: RAGAS
{'file_directory': '...', 'filename': '301123_genai_präsentation.pptx', 'filetype': '...', 'last_modified': '2023-11-30T10:26:30', 'page_number': 15, 'source': '301123_genai_präsentation.pptx', 'source_documents': '301123_genai_präsentation.pptx', 'page': 15}
+++++++++++++++++++++++++
Retrieval Generation
Model Context Recall Context Precision Faithfulness
Llama 2-Chat 0.86 0.58 0.91
LeoLM-Chat 0.86 0.58 0.81
LeoLM-Mistral-Chat 0.86 0.58 0.87
EM German Leo Mistral 0.86 0.58 0.82
Llama-German-Assistant 0.86 0.58 0.91
{'file_directory': '...', 'filename': '301123_genai_präsentation.pptx', 'last_modified': '2023-11-30T10:26:30', 'page_number': 15, 'parent_id': 'a9e22a24894f5c1dbe9b0b66251bbbc2', 'filetype': '...', 'source': '301123_genai_präsentation.pptx', 'source_documents': '301123_genai_präsentation.pptx', 'page': 15}
Question
I see a parent_id key in the second output. How can I merge the content of the first output (the table heading) with the second output, so that I have everything in one chunk:
RAG Evaluation: RAGAS
Retrieval Generation
Model Context Recall Context Precision Faithfulness
Llama 2-Chat 0.86 0.58 0.91
LeoLM-Chat 0.86 0.58 0.81
LeoLM-Mistral-Chat 0.86 0.58 0.87
EM German Leo Mistral 0.86 0.58 0.82
Llama-German-Assistant 0.86 0.58 0.91
Here is the full code:
import os
import yaml
import box
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.md import partition_md
from unstructured.partition.pptx import partition_pptx
from unstructured.partition.docx import partition_docx
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.xlsx import partition_xlsx
from unstructured.partition.html import partition_html
from langchain_core.documents import Document
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
import re
def filter_elements(elements):
    """Drop table-of-contents content, headers and footers from elements.

    An element is removed when any of the following holds:

    * its ``metadata.parent_id`` equals the ``id`` of a ``Title`` element
      whose text is exactly one of the known table-of-contents headings
      (every matching ``Title`` is considered, and the ``Title`` element
      itself is kept),
    * its text contains a run of at least 50 consecutive dots,
    * its category is ``"Header"`` or ``"Footer"``.

    Args:
        elements: Iterable of element objects exposing ``text``,
            ``category``, ``id`` and ``metadata.parent_id``.

    Returns:
        A new list with the remaining elements in their original order.
    """
    # Headings that introduce a table of contents / list of figures / tables.
    toc_headings = {
        "Inhaltsverzeichnis",
        "Inhalt",
        "Structure",
        "Agenda",
        "Abbildungsverzeichnis",
        "Gliederung",
        "Tabellenverzeichnis",
    }
    # Long dot leaders ("Chapter ......... 3") indicate table-of-contents rows.
    dot_leader = re.compile(r"\.{50}")
    dropped_categories = ("Header", "Footer")

    # Materialize once: the input is traversed twice below, which would
    # silently yield nothing on the second pass for a one-shot iterable.
    elements = list(elements)

    # IDs of all title elements that open a table-of-contents section.
    toc_title_ids = {
        el.id
        for el in elements
        if el.text in toc_headings and el.category == "Title"
    }

    # NOTE: a minimum-length filter (keeping only elements longer than 60
    # characters) existed here as commented-out code and remains disabled.
    return [
        el
        for el in elements
        if el.metadata.parent_id not in toc_title_ids
        and not dot_leader.search(el.text)
        and el.category not in dropped_categories
    ]
def chunk_elements_by_title(elements):
    """Chunk *elements* with ``chunk_by_title`` using the limits held in ``cfg``.

    The three size settings (``UNSTRUCTURED_COMBINE_TEXT_UNDER_N_CHARS``,
    ``UNSTRUCTURED_MAX_CHARACTERS`` and ``UNSTRUCTURED_NEW_AFTER_N_CHARS``)
    are read from the ``cfg`` object, which is defined outside this function,
    and are forwarded unchanged as keyword arguments.

    Args:
        elements: The elements to chunk.

    Returns:
        Whatever ``chunk_by_title`` returns for the given elements and limits.
    """
    chunking_options = {
        "combine_text_under_n_chars": cfg.UNSTRUCTURED_COMBINE_TEXT_UNDER_N_CHARS,
        "max_characters": cfg.UNSTRUCTURED_MAX_CHARACTERS,
        "new_after_n_chars": cfg.UNSTRUCTURED_NEW_AFTER_N_CHARS,
    }
    return chunk_by_title(elements, **chunking_options)
def html_reader(filename):
    """Partition an HTML file, filter the elements and chunk them by title.

    Args:
        filename: Path of the HTML file handed to ``partition_html``.

    Returns:
        The result of ``chunk_elements_by_title`` applied to the filtered
        elements.
    """
    parsed = partition_html(filename=filename, mode="elements")
    kept = filter_elements(parsed)
    return chunk_elements_by_title(kept)
def powerpoint_reader(filename):
    """Partition a PowerPoint file and chunk its elements by title.

    Unlike the HTML, Markdown and Word readers, this reader calls
    ``chunk_by_title`` directly with its default settings and does not run
    ``filter_elements`` first.

    Args:
        filename: Path of the presentation handed to ``partition_pptx``.

    Returns:
        The result of ``chunk_by_title`` for the partitioned elements.
    """
    slides_content = partition_pptx(filename=filename)
    return chunk_by_title(slides_content)
def markdown_reader(filename):
    """Partition a Markdown file, filter the elements and chunk them by title.

    Args:
        filename: Path of the Markdown file handed to ``partition_md``.

    Returns:
        The result of ``chunk_elements_by_title`` applied to the filtered
        elements.
    """
    parsed = partition_md(filename=filename)
    kept = filter_elements(parsed)
    return chunk_elements_by_title(kept)
def excel_reader(filename):
    """Partition an Excel workbook and chunk its elements by title.

    This reader calls ``chunk_by_title`` directly with its default settings
    and does not run ``filter_elements`` first.

    Args:
        filename: Path of the workbook handed to ``partition_xlsx``.

    Returns:
        The result of ``chunk_by_title`` for the partitioned elements.
    """
    sheet_content = partition_xlsx(filename=filename)
    return chunk_by_title(sheet_content)
def word_reader(filename):
    """Partition a Word document, filter the elements and chunk them by title.

    Args:
        filename: Path of the document handed to ``partition_docx``.

    Returns:
        The result of ``chunk_elements_by_title`` applied to the filtered
        elements.
    """
    parsed = partition_docx(filename=filename)
    kept = filter_elements(parsed)
    return chunk_elements_by_title(kept)
def pdf_reader(filename, llm):
if cfg.UNSTRUCTURED_CHUNKING_ACTIVATED == True:
pdf_elements = partition_pdf(filename=filename,
strategy="hi_res",
infer_table_structure=True,
languages=["eng", "deu"],
)
pdf_elements = filter_elements(pdf_elements)
pdf_elements = chunk_elements_by_title(pdf_elements)
print(f'PDF Chunking of file {filename} Done')
return pdf_elements