Skip to content

Drive smart chip indexing #4459

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions backend/onyx/chat/process_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -1089,14 +1089,10 @@ def stream_chat_message_objects(
selected_search_docs=selected_db_search_docs,
# Deduping happens at the last step to avoid harming quality by dropping content early on
# Skip deduping completely for ordering-only mode to save time
dedupe_docs=(
False
if search_for_ordering_only
else (
retrieval_options.dedupe_docs
if retrieval_options
else False
)
dedupe_docs=bool(
not search_for_ordering_only
and retrieval_options
and retrieval_options.dedupe_docs
),
user_files=user_file_files if search_for_ordering_only else [],
loaded_user_files=user_files
Expand Down
83 changes: 82 additions & 1 deletion backend/onyx/connectors/google_drive/doc_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from onyx.connectors.google_drive.models import GDriveMimeType
from onyx.connectors.google_drive.models import GoogleDriveFileType
from onyx.connectors.google_drive.section_extraction import get_document_sections
from onyx.connectors.google_drive.section_extraction import HEADING_DELIMITER
from onyx.connectors.google_utils.resources import GoogleDocsService
from onyx.connectors.google_utils.resources import GoogleDriveService
from onyx.connectors.models import ConnectorFailure
Expand All @@ -35,6 +36,10 @@

logger = setup_logger()

# U+E907 lies in the Unicode Private Use Area, so it has no standard meaning; the docs
# advanced API uses it as a placeholder for smart chips (elements like dates and doc links).
SMART_CHIP_CHAR = "\ue907"

# Mapping of Google Drive mime types to export formats
GOOGLE_MIME_TYPES_TO_EXPORT = {
GDriveMimeType.DOC.value: "text/plain",
Expand Down Expand Up @@ -220,6 +225,75 @@ def _download_and_extract_sections_basic(
return []


def _find_nth(haystack: str, needle: str, n: int, start: int = 0) -> int:
    """Return the index of the n-th occurrence of `needle` in `haystack`.

    The search begins at index `start`, and each subsequent search resumes
    right after the end of the previous match. Any `n` below 1 behaves like
    `n == 1` (the first occurrence is returned). Returns -1 when fewer than
    `n` occurrences are found.
    """
    position = haystack.find(needle, start)
    remaining = n - 1
    while remaining > 0 and position >= 0:
        # resume just past the match we are currently sitting on
        position = haystack.find(needle, position + len(needle))
        remaining -= 1
    return position


def align_basic_advanced(
    basic_sections: list[TextSection | ImageSection], adv_sections: list[TextSection]
) -> list[TextSection | ImageSection]:
    """Align the basic sections with the advanced sections.
    In particular, the basic sections contain all content of the file,
    including smart chips like dates and doc links. The advanced sections
    are separated by section headers and contain header-based links that
    improve user experience when they click on the source in the UI.

    The text of all basic TextSections is concatenated and re-split at the
    positions where the advanced sections' headings are found, so the result
    carries the basic text together with the advanced links. Image sections
    in `basic_sections` are not carried over once alignment takes place.

    There are edge cases in text matching (i.e. the heading is a smart chip or
    there is a smart chip in the doc with text containing the actual heading text)
    that make the matching imperfect; this is hence done on a best-effort basis.

    Returns `basic_sections` unchanged when there are fewer than two advanced
    sections; otherwise a new list of TextSections.
    """
    if len(adv_sections) <= 1:
        return basic_sections  # no benefit from aligning

    basic_full_text = "".join(
        section.text for section in basic_sections if isinstance(section, TextSection)
    )
    new_sections: list[TextSection | ImageSection] = []
    heading_start = 0
    # index of the advanced section whose text begins at `heading_start`
    last_aligned_ind = 0
    for adv_ind in range(1, len(adv_sections)):
        heading = adv_sections[adv_ind].text.split(HEADING_DELIMITER)[0]
        # retrieve the longest part of the heading that is not a smart chip
        heading_key = max(heading.split(SMART_CHIP_CHAR), key=len).strip()
        if heading_key == "":
            logger.warning(
                f"Cannot match heading: {heading}, its link will come from the following section"
            )
            continue
        heading_offset = heading.find(heading_key)

        # Count occurrences of the heading key in every advanced section that
        # lies between the current cursor and this heading (more than one
        # section when earlier headings could not be matched).
        occurrences_before = sum(
            section.text.count(heading_key)
            for section in adv_sections[last_aligned_ind:adv_ind]
        )

        prev_start = heading_start
        # The heading itself is the occurrence right AFTER the ones counted
        # above, hence the + 1.
        key_position = _find_nth(
            basic_full_text, heading_key, occurrences_before + 1, start=prev_start
        )
        candidate_start = key_position - heading_offset
        # Reject both "not found" and a start that would move the cursor
        # backwards (which would duplicate text across sections).
        if key_position < 0 or candidate_start < prev_start:
            logger.warning(
                f"Heading key {heading_key} from heading {heading} not found in basic text"
            )
            continue

        heading_start = candidate_start
        last_aligned_ind = adv_ind
        new_sections.append(
            TextSection(
                link=adv_sections[adv_ind - 1].link,
                text=basic_full_text[prev_start:heading_start],
            )
        )

    # handle last section
    new_sections.append(
        TextSection(link=adv_sections[-1].link, text=basic_full_text[heading_start:])
    )
    return new_sections


def convert_drive_item_to_document(
file: GoogleDriveFileType,
drive_service: Callable[[], GoogleDriveService],
Expand All @@ -244,10 +318,17 @@ def convert_drive_item_to_document(
try:
# get_document_sections is the advanced approach for Google Docs
doc_sections = get_document_sections(
docs_service=docs_service(), doc_id=file.get("id", "")
docs_service=docs_service(),
doc_id=file.get("id", ""),
)
if doc_sections:
sections = cast(list[TextSection | ImageSection], doc_sections)
if any(SMART_CHIP_CHAR in section.text for section in doc_sections):
basic_sections = _download_and_extract_sections_basic(
file, drive_service(), allow_images
)
sections = align_basic_advanced(basic_sections, doc_sections)

except Exception as e:
logger.warning(
f"Error in advanced parsing: {e}. Falling back to basic extraction."
Expand Down
189 changes: 136 additions & 53 deletions backend/onyx/connectors/google_drive/section_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,20 @@
from onyx.connectors.google_utils.resources import GoogleDocsService
from onyx.connectors.models import TextSection

HEADING_DELIMITER = "\n"


class CurrentHeading(BaseModel):
id: str
id: str | None
text: str


def _build_gdoc_section_link(doc_id: str, heading_id: str) -> str:
def _build_gdoc_section_link(doc_id: str, tab_id: str, heading_id: str | None) -> str:
"""Builds a Google Doc link that jumps to a specific heading"""
# NOTE: doesn't support docs with multiple tabs atm, if we need that ask
# @Chris
return (
f"https://docs.google.com/document/d/{doc_id}/edit?tab=t.0#heading={heading_id}"
)
heading_str = f"#heading={heading_id}" if heading_id else ""
return f"https://docs.google.com/document/d/{doc_id}/edit?tab={tab_id}{heading_str}"


def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
Expand All @@ -31,75 +32,157 @@ def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
for element in paragraph.get("elements", []):
if "textRun" in element:
text_elements.append(element["textRun"].get("content", ""))

# Handle links
if "textStyle" in element and "link" in element["textStyle"]:
text_elements.append(f"({element['textStyle']['link'].get('url', '')})")

if "person" in element:
name = element["person"].get("personProperties", {}).get("name", "")
email = element["person"].get("personProperties", {}).get("email", "")
person_str = "<Person|"
if name:
person_str += f"name: {name}, "
if email:
person_str += f"email: {email}"
person_str += ">"
text_elements.append(person_str)

if "richLink" in element:
props = element["richLink"].get("richLinkProperties", {})
title = props.get("title", "")
uri = props.get("uri", "")
link_str = f"[{title}]({uri})"
text_elements.append(link_str)

return "".join(text_elements)


def _extract_text_from_table(table: dict[str, Any]) -> str:
    """
    Flattens a table element into plain text.

    Each row becomes one line. Cells within a row are joined with ", ", and
    within a cell the text of its paragraph children is concatenated.
    Non-paragraph children of a cell are ignored.
    """
    rendered_rows: list[str] = []
    for table_row in table.get("tableRows", []):
        rendered_cells = [
            "".join(
                _extract_text_from_paragraph(child["paragraph"])
                for child in table_cell.get("content", {})
                if "paragraph" in child
            )
            for table_cell in table_row.get("tableCells", [])
        ]
        rendered_rows.append(", ".join(rendered_cells))
    return "\n".join(rendered_rows)


def get_document_sections(
docs_service: GoogleDocsService,
doc_id: str,
) -> list[TextSection]:
"""Extracts sections from a Google Doc, including their headings and content"""
# Fetch the document structure
doc = docs_service.documents().get(documentId=doc_id).execute()
http_request = docs_service.documents().get(documentId=doc_id)

# Google has poor support for tabs in the docs api, see
# https://cloud.google.com/python/docs/reference/cloudtasks/
# latest/google.cloud.tasks_v2.types.HttpRequest
# https://developers.google.com/workspace/docs/api/how-tos/tabs
# https://developers.google.com/workspace/docs/api/reference/rest/v1/documents/get
# this is a hack to use the param mentioned in the rest api docs
# TODO: check if it can be specified i.e. in documents()
http_request.uri += "&includeTabsContent=true"
doc = http_request.execute()

# Get the content
content = doc.get("body", {}).get("content", [])
tabs = doc.get("tabs", {})
sections: list[TextSection] = []
for tab in tabs:
sections.extend(get_tab_sections(tab, doc_id))
return sections


def _is_heading(paragraph: dict[str, Any]) -> bool:
    """Tells whether a paragraph (a block of text in a drive document) is a heading.

    A paragraph counts as a heading when its named style starts with
    "HEADING_" or "TITLE". Paragraphs without a named style are not headings.
    """
    if "paragraphStyle" not in paragraph:
        return False

    paragraph_style = paragraph["paragraphStyle"]
    if "namedStyleType" not in paragraph_style:
        return False

    return paragraph_style["namedStyleType"].startswith(("HEADING_", "TITLE"))


def _add_finished_section(
    sections: list[TextSection],
    doc_id: str,
    tab_id: str,
    current_heading: CurrentHeading,
    current_section: list[str],
) -> None:
    """Appends the section built so far to `sections`, mutating the list in place.

    Nothing is appended (and nothing is returned either way) when there is
    neither accumulated body text nor heading text. Otherwise the new section's
    text is the heading, followed by HEADING_DELIMITER, followed by the
    newline-joined body, with surrounding whitespace stripped; its link points
    at the heading within the given tab.
    """
    if not current_section and not current_heading.text:
        return

    # this is unlikely to ever matter, but helps if the doc contains weird headings
    heading_line = current_heading.text.replace(HEADING_DELIMITER, "")
    body = "\n".join(current_section)
    finished = TextSection(
        text=f"{heading_line}{HEADING_DELIMITER}{body}".strip(),
        link=_build_gdoc_section_link(doc_id, tab_id, current_heading.id),
    )
    sections.append(finished)


def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]:
tab_id = tab["tabProperties"]["tabId"]
content = tab.get("documentTab", {}).get("body", {}).get("content", [])

sections: list[TextSection] = []
current_section: list[str] = []
current_heading: CurrentHeading | None = None
current_heading = CurrentHeading(id=None, text="")

for element in content:
if "paragraph" not in element:
continue

paragraph = element["paragraph"]

# Check if this is a heading
if (
"paragraphStyle" in paragraph
and "namedStyleType" in paragraph["paragraphStyle"]
):
style = paragraph["paragraphStyle"]["namedStyleType"]
is_heading = style.startswith("HEADING_")
is_title = style.startswith("TITLE")

if is_heading or is_title:
# If we were building a previous section, add it to sections list
if current_heading is not None and current_section:
heading_text = current_heading.text
section_text = f"{heading_text}\n" + "\n".join(current_section)
sections.append(
TextSection(
text=section_text.strip(),
link=_build_gdoc_section_link(doc_id, current_heading.id),
)
)
current_section = []

# Start new heading
heading_id = _extract_id_from_heading(paragraph)
heading_text = _extract_text_from_paragraph(paragraph)
current_heading = CurrentHeading(
id=heading_id,
text=heading_text,
)
if "paragraph" in element:
paragraph = element["paragraph"]

# If this is not a heading, add content to current section
if not _is_heading(paragraph):
text = _extract_text_from_paragraph(paragraph)
if text.strip():
current_section.append(text)
continue

# Add content to current section
if current_heading is not None:
text = _extract_text_from_paragraph(paragraph)
_add_finished_section(
sections, doc_id, tab_id, current_heading, current_section
)

current_section = []

# Start new heading
heading_id = _extract_id_from_heading(paragraph)
heading_text = _extract_text_from_paragraph(paragraph)
current_heading = CurrentHeading(
id=heading_id,
text=heading_text,
)
elif "table" in element:
text = _extract_text_from_table(element["table"])
if text.strip():
current_section.append(text)

# Don't forget to add the last section
if current_heading is not None and current_section:
section_text = f"{current_heading.text}\n" + "\n".join(current_section)
sections.append(
TextSection(
text=section_text.strip(),
link=_build_gdoc_section_link(doc_id, current_heading.id),
)
)
_add_finished_section(sections, doc_id, tab_id, current_heading, current_section)

return sections
2 changes: 0 additions & 2 deletions backend/onyx/connectors/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ class TextSection(Section):
"""Section containing text content"""

text: str
link: str | None = None

def __sizeof__(self) -> int:
return sys.getsizeof(self.text) + sys.getsizeof(self.link)
Expand All @@ -49,7 +48,6 @@ class ImageSection(Section):
"""Section containing an image reference"""

image_file_name: str
link: str | None = None

def __sizeof__(self) -> int:
return sys.getsizeof(self.image_file_name) + sys.getsizeof(self.link)
Expand Down
Loading