Drive smart chip indexing (#4459)

evan-onyx · web-flow · commit 9c7309924178 · 2025-04-07T21:52:45.000Z
* WIP

* WIP almost done, but realized we can just do basic retrieval

* rebased and added scripts

* improved approach to extracting smart chips

* remove files from previous branch

* fix connector tests

* fix test
diff --git a/backend/onyx/chat/process_message.py b/backend/onyx/chat/process_message.py
@@ -1089,14 +1089,10 @@ def stream_chat_message_objects(
                         selected_search_docs=selected_db_search_docs,
                         # Deduping happens at the last step to avoid harming quality by dropping content early on
                         # Skip deduping completely for ordering-only mode to save time
-                        dedupe_docs=(
-                            False
-                            if search_for_ordering_only
-                            else (
-                                retrieval_options.dedupe_docs
-                                if retrieval_options
-                                else False
-                            )
+                        dedupe_docs=bool(
+                            not search_for_ordering_only
+                            and retrieval_options
+                            and retrieval_options.dedupe_docs
                         ),
                         user_files=user_file_files if search_for_ordering_only else [],
                         loaded_user_files=user_files
diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py
@@ -12,6 +12,7 @@
 from onyx.connectors.google_drive.models import GDriveMimeType
 from onyx.connectors.google_drive.models import GoogleDriveFileType
 from onyx.connectors.google_drive.section_extraction import get_document_sections
+from onyx.connectors.google_drive.section_extraction import HEADING_DELIMITER
 from onyx.connectors.google_utils.resources import GoogleDocsService
 from onyx.connectors.google_utils.resources import GoogleDriveService
 from onyx.connectors.models import ConnectorFailure
@@ -35,6 +36,10 @@
 
 logger = setup_logger()
 
+# U+E907 is a Unicode private-use code point with no standard meaning; the docs
+# advanced API uses it to represent smart chips (elements like dates and doc links).
+SMART_CHIP_CHAR = "\ue907"
+
 # Mapping of Google Drive mime types to export formats
 GOOGLE_MIME_TYPES_TO_EXPORT = {
     GDriveMimeType.DOC.value: "text/plain",
@@ -220,6 +225,75 @@ def _download_and_extract_sections_basic(
             return []
 
 
+def _find_nth(haystack: str, needle: str, n: int, start: int = 0) -> int:
+    start = haystack.find(needle, start)
+    while start >= 0 and n > 1:
+        start = haystack.find(needle, start + len(needle))
+        n -= 1
+    return start
+
+
+def align_basic_advanced(
+    basic_sections: list[TextSection | ImageSection], adv_sections: list[TextSection]
+) -> list[TextSection | ImageSection]:
+    """Align the basic sections with the advanced sections.
+    In particular, the basic sections contain all content of the file,
+    including smart chips like dates and doc links. The advanced sections
+    are separated by section headers and contain header-based links that
+    improve user experience when they click on the source in the UI.
+
+    There are edge cases in text matching (i.e. the heading is a smart chip or
+    there is a smart chip in the doc with text containing the actual heading text)
+    that make the matching imperfect; this is hence done on a best-effort basis.
+    """
+    if len(adv_sections) <= 1:
+        return basic_sections  # no benefit from aligning
+
+    basic_full_text = "".join(
+        [section.text for section in basic_sections if isinstance(section, TextSection)]
+    )
+    new_sections: list[TextSection | ImageSection] = []
+    heading_start = 0
+    for adv_ind in range(1, len(adv_sections)):
+        heading = adv_sections[adv_ind].text.split(HEADING_DELIMITER)[0]
+        # retrieve the longest part of the heading that is not a smart chip
+        heading_key = max(heading.split(SMART_CHIP_CHAR), key=len).strip()
+        if heading_key == "":
+            logger.warning(
+                f"Cannot match heading: {heading}, its link will come from the following section"
+            )
+            continue
+        heading_offset = heading.find(heading_key)
+
+        # count occurrences of heading str in previous section
+        heading_count = adv_sections[adv_ind - 1].text.count(heading_key)
+
+        prev_start = heading_start
+        heading_start = (
+            _find_nth(basic_full_text, heading_key, heading_count, start=prev_start)
+            - heading_offset
+        )
+        if heading_start < 0:
+            logger.warning(
+                f"Heading key {heading_key} from heading {heading} not found in basic text"
+            )
+            heading_start = prev_start
+            continue
+
+        new_sections.append(
+            TextSection(
+                link=adv_sections[adv_ind - 1].link,
+                text=basic_full_text[prev_start:heading_start],
+            )
+        )
+
+    # handle last section
+    new_sections.append(
+        TextSection(link=adv_sections[-1].link, text=basic_full_text[heading_start:])
+    )
+    return new_sections
+
+
 def convert_drive_item_to_document(
     file: GoogleDriveFileType,
     drive_service: Callable[[], GoogleDriveService],
@@ -244,10 +318,17 @@ def convert_drive_item_to_document(
             try:
                 # get_document_sections is the advanced approach for Google Docs
                 doc_sections = get_document_sections(
-                    docs_service=docs_service(), doc_id=file.get("id", "")
+                    docs_service=docs_service(),
+                    doc_id=file.get("id", ""),
                 )
                 if doc_sections:
                     sections = cast(list[TextSection | ImageSection], doc_sections)
+                    if any(SMART_CHIP_CHAR in section.text for section in doc_sections):
+                        basic_sections = _download_and_extract_sections_basic(
+                            file, drive_service(), allow_images
+                        )
+                        sections = align_basic_advanced(basic_sections, doc_sections)
+
             except Exception as e:
                 logger.warning(
                     f"Error in advanced parsing: {e}. Falling back to basic extraction."
diff --git a/backend/onyx/connectors/google_drive/section_extraction.py b/backend/onyx/connectors/google_drive/section_extraction.py
@@ -5,19 +5,20 @@
 from onyx.connectors.google_utils.resources import GoogleDocsService
 from onyx.connectors.models import TextSection
 
+HEADING_DELIMITER = "\n"
+
 
 class CurrentHeading(BaseModel):
-    id: str
+    id: str | None
     text: str
 
 
-def _build_gdoc_section_link(doc_id: str, heading_id: str) -> str:
+def _build_gdoc_section_link(doc_id: str, tab_id: str, heading_id: str | None) -> str:
     """Builds a Google Doc link that jumps to a specific heading"""
     # NOTE: doesn't support docs with multiple tabs atm, if we need that ask
     # @Chris
-    return (
-        f"https://docs.google.com/document/d/{doc_id}/edit?tab=t.0#heading={heading_id}"
-    )
+    heading_str = f"#heading={heading_id}" if heading_id else ""
+    return f"https://docs.google.com/document/d/{doc_id}/edit?tab={tab_id}{heading_str}"
 
 
 def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
@@ -31,75 +32,157 @@ def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
     for element in paragraph.get("elements", []):
         if "textRun" in element:
             text_elements.append(element["textRun"].get("content", ""))
+
+        # Handle links
+        if "textStyle" in element and "link" in element["textStyle"]:
+            text_elements.append(f"({element['textStyle']['link'].get('url', '')})")
+
+        if "person" in element:
+            name = element["person"].get("personProperties", {}).get("name", "")
+            email = element["person"].get("personProperties", {}).get("email", "")
+            person_str = "<Person|"
+            if name:
+                person_str += f"name: {name}, "
+            if email:
+                person_str += f"email: {email}"
+            person_str += ">"
+            text_elements.append(person_str)
+
+        if "richLink" in element:
+            props = element["richLink"].get("richLinkProperties", {})
+            title = props.get("title", "")
+            uri = props.get("uri", "")
+            link_str = f"[{title}]({uri})"
+            text_elements.append(link_str)
+
     return "".join(text_elements)
 
 
+def _extract_text_from_table(table: dict[str, Any]) -> str:
+    """
+    Extracts table text: one line per row, cells joined by ", " (paragraph children only).
+    """
+    row_strs = []
+
+    for row in table.get("tableRows", []):
+        cells = row.get("tableCells", [])
+        cell_strs = []
+        for cell in cells:
+            child_elements = cell.get("content", {})
+            cell_str = []
+            for child_elem in child_elements:
+                if "paragraph" not in child_elem:
+                    continue
+                cell_str.append(_extract_text_from_paragraph(child_elem["paragraph"]))
+            cell_strs.append("".join(cell_str))
+        row_strs.append(", ".join(cell_strs))
+    return "\n".join(row_strs)
+
+
 def get_document_sections(
     docs_service: GoogleDocsService,
     doc_id: str,
 ) -> list[TextSection]:
     """Extracts sections from a Google Doc, including their headings and content"""
     # Fetch the document structure
-    doc = docs_service.documents().get(documentId=doc_id).execute()
+    http_request = docs_service.documents().get(documentId=doc_id)
+
+    # Google has poor support for tabs in the docs api, see
+    # https://cloud.google.com/python/docs/reference/cloudtasks/
+    # latest/google.cloud.tasks_v2.types.HttpRequest
+    # https://developers.google.com/workspace/docs/api/how-tos/tabs
+    # https://developers.google.com/workspace/docs/api/reference/rest/v1/documents/get
+    # HACK: append the includeTabsContent param from the REST API docs to the URI by hand
+    # (the "&" assumes a query string is already present). TODO: try passing it via documents()
+    http_request.uri += "&includeTabsContent=true"
+    doc = http_request.execute()
 
     # Get the content
-    content = doc.get("body", {}).get("content", [])
+    tabs = doc.get("tabs", {})
+    sections: list[TextSection] = []
+    for tab in tabs:
+        sections.extend(get_tab_sections(tab, doc_id))
+    return sections
+
+
+def _is_heading(paragraph: dict[str, Any]) -> bool:
+    """Checks if a paragraph (a block of text in a drive document) is a heading or title"""
+    if not (
+        "paragraphStyle" in paragraph
+        and "namedStyleType" in paragraph["paragraphStyle"]
+    ):
+        return False
+
+    style = paragraph["paragraphStyle"]["namedStyleType"]
+    is_heading = style.startswith("HEADING_")
+    is_title = style.startswith("TITLE")
+    return is_heading or is_title
+
+
+def _add_finished_section(
+    sections: list[TextSection],
+    doc_id: str,
+    tab_id: str,
+    current_heading: CurrentHeading,
+    current_section: list[str],
+) -> None:
+    """Adds a finished section to the list of sections if the section has content.
+    Appends to `sections` in place and returns nothing; the list is left unchanged
+    when there is neither section content nor heading text.
+    """
+    if not (current_section or current_heading.text):
+        return
+    # If we were building a previous section, add it to sections list
+
+    # this is unlikely to ever matter, but helps if the doc contains weird headings
+    header_text = current_heading.text.replace(HEADING_DELIMITER, "")
+    section_text = f"{header_text}{HEADING_DELIMITER}" + "\n".join(current_section)
+    sections.append(
+        TextSection(
+            text=section_text.strip(),
+            link=_build_gdoc_section_link(doc_id, tab_id, current_heading.id),
+        )
+    )
+
+
+def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]:
+    tab_id = tab["tabProperties"]["tabId"]
+    content = tab.get("documentTab", {}).get("body", {}).get("content", [])
 
     sections: list[TextSection] = []
     current_section: list[str] = []
-    current_heading: CurrentHeading | None = None
+    current_heading = CurrentHeading(id=None, text="")
 
     for element in content:
-        if "paragraph" not in element:
-            continue
-
-        paragraph = element["paragraph"]
-
-        # Check if this is a heading
-        if (
-            "paragraphStyle" in paragraph
-            and "namedStyleType" in paragraph["paragraphStyle"]
-        ):
-            style = paragraph["paragraphStyle"]["namedStyleType"]
-            is_heading = style.startswith("HEADING_")
-            is_title = style.startswith("TITLE")
-
-            if is_heading or is_title:
-                # If we were building a previous section, add it to sections list
-                if current_heading is not None and current_section:
-                    heading_text = current_heading.text
-                    section_text = f"{heading_text}\n" + "\n".join(current_section)
-                    sections.append(
-                        TextSection(
-                            text=section_text.strip(),
-                            link=_build_gdoc_section_link(doc_id, current_heading.id),
-                        )
-                    )
-                    current_section = []
-
-                # Start new heading
-                heading_id = _extract_id_from_heading(paragraph)
-                heading_text = _extract_text_from_paragraph(paragraph)
-                current_heading = CurrentHeading(
-                    id=heading_id,
-                    text=heading_text,
-                )
+        if "paragraph" in element:
+            paragraph = element["paragraph"]
+
+            # If this is not a heading, add content to current section
+            if not _is_heading(paragraph):
+                text = _extract_text_from_paragraph(paragraph)
+                if text.strip():
+                    current_section.append(text)
                 continue
 
-        # Add content to current section
-        if current_heading is not None:
-            text = _extract_text_from_paragraph(paragraph)
+            _add_finished_section(
+                sections, doc_id, tab_id, current_heading, current_section
+            )
+
+            current_section = []
+
+            # Start new heading
+            heading_id = _extract_id_from_heading(paragraph)
+            heading_text = _extract_text_from_paragraph(paragraph)
+            current_heading = CurrentHeading(
+                id=heading_id,
+                text=heading_text,
+            )
+        elif "table" in element:
+            text = _extract_text_from_table(element["table"])
             if text.strip():
                 current_section.append(text)
 
     # Don't forget to add the last section
-    if current_heading is not None and current_section:
-        section_text = f"{current_heading.text}\n" + "\n".join(current_section)
-        sections.append(
-            TextSection(
-                text=section_text.strip(),
-                link=_build_gdoc_section_link(doc_id, current_heading.id),
-            )
-        )
+    _add_finished_section(sections, doc_id, tab_id, current_heading, current_section)
 
     return sections
diff --git a/backend/onyx/connectors/models.py b/backend/onyx/connectors/models.py
@@ -39,7 +39,6 @@ class TextSection(Section):
     """Section containing text content"""
 
     text: str
-    link: str | None = None
 
     def __sizeof__(self) -> int:
         return sys.getsizeof(self.text) + sys.getsizeof(self.link)
@@ -49,7 +48,6 @@ class ImageSection(Section):
     """Section containing an image reference"""
 
     image_file_name: str
-    link: str | None = None
 
     def __sizeof__(self) -> int:
         return sys.getsizeof(self.image_file_name) + sys.getsizeof(self.link)
diff --git a/backend/tests/daily/connectors/google_drive/consts_and_utils.py b/backend/tests/daily/connectors/google_drive/consts_and_utils.py
diff --git a/backend/tests/daily/connectors/google_drive/test_sections.py b/backend/tests/daily/connectors/google_drive/test_sections.py