Skip to content

Commit 9c73099

Browse files
authored
Drive smart chip indexing (#4459)
* WIP * WIP almost done, but realized we can just do basic retrieval * rebased and added scripts * improved approach to extracting smart chips * remove files from previous branch * fix connector tests * fix test
1 parent 88d4a65 commit 9c73099

File tree

6 files changed

+232
-74
lines changed

6 files changed

+232
-74
lines changed

backend/onyx/chat/process_message.py

+4-8
Original file line numberDiff line numberDiff line change
@@ -1089,14 +1089,10 @@ def stream_chat_message_objects(
10891089
selected_search_docs=selected_db_search_docs,
10901090
# Deduping happens at the last step to avoid harming quality by dropping content early on
10911091
# Skip deduping completely for ordering-only mode to save time
1092-
dedupe_docs=(
1093-
False
1094-
if search_for_ordering_only
1095-
else (
1096-
retrieval_options.dedupe_docs
1097-
if retrieval_options
1098-
else False
1099-
)
1092+
dedupe_docs=bool(
1093+
not search_for_ordering_only
1094+
and retrieval_options
1095+
and retrieval_options.dedupe_docs
11001096
),
11011097
user_files=user_file_files if search_for_ordering_only else [],
11021098
loaded_user_files=user_files

backend/onyx/connectors/google_drive/doc_conversion.py

+82-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from onyx.connectors.google_drive.models import GDriveMimeType
1313
from onyx.connectors.google_drive.models import GoogleDriveFileType
1414
from onyx.connectors.google_drive.section_extraction import get_document_sections
15+
from onyx.connectors.google_drive.section_extraction import HEADING_DELIMITER
1516
from onyx.connectors.google_utils.resources import GoogleDocsService
1617
from onyx.connectors.google_utils.resources import GoogleDriveService
1718
from onyx.connectors.models import ConnectorFailure
@@ -35,6 +36,10 @@
3536

3637
logger = setup_logger()
3738

39+
# This is not a standard valid unicode char, it is used by the docs advanced API to
40+
# represent smart chips (elements like dates and doc links).
41+
SMART_CHIP_CHAR = "\ue907"
42+
3843
# Mapping of Google Drive mime types to export formats
3944
GOOGLE_MIME_TYPES_TO_EXPORT = {
4045
GDriveMimeType.DOC.value: "text/plain",
@@ -220,6 +225,75 @@ def _download_and_extract_sections_basic(
220225
return []
221226

222227

228+
def _find_nth(haystack: str, needle: str, n: int, start: int = 0) -> int:
229+
start = haystack.find(needle, start)
230+
while start >= 0 and n > 1:
231+
start = haystack.find(needle, start + len(needle))
232+
n -= 1
233+
return start
234+
235+
236+
def align_basic_advanced(
    basic_sections: list[TextSection | ImageSection], adv_sections: list[TextSection]
) -> list[TextSection | ImageSection]:
    """Align the basic sections with the advanced sections.

    The basic sections contain all content of the file, including smart chips
    like dates and doc links. The advanced sections are separated by section
    headers and carry header-based links that improve user experience when
    they click on the source in the UI. The result uses the text of the basic
    extraction, cut at the positions of the advanced headings, with each piece
    carrying the link of the advanced section it corresponds to.

    There are edge cases in text matching (i.e. the heading is a smart chip or
    there is a smart chip in the doc with text containing the actual heading
    text) that make the matching imperfect; this is hence done on a
    best-effort basis.

    Args:
        basic_sections: Sections from the basic extraction. Only the
            TextSection entries contribute text; their texts are concatenated.
        adv_sections: Sections from the advanced extraction, each starting
            with its heading followed by HEADING_DELIMITER.

    Returns:
        ``basic_sections`` unchanged when there are fewer than two advanced
        sections; otherwise a new list of TextSections.
    """
    if len(adv_sections) <= 1:
        return basic_sections  # no benefit from aligning

    basic_full_text = "".join(
        section.text for section in basic_sections if isinstance(section, TextSection)
    )
    new_sections: list[TextSection | ImageSection] = []
    # Start (in basic_full_text) of the most recently matched section.
    heading_start = 0
    for adv_ind in range(1, len(adv_sections)):
        heading = adv_sections[adv_ind].text.split(HEADING_DELIMITER)[0]
        # retrieve the longest part of the heading that is not a smart chip
        heading_key = max(heading.split(SMART_CHIP_CHAR), key=len).strip()
        if heading_key == "":
            logger.warning(
                f"Cannot match heading: {heading}, its link will come from the following section"
            )
            continue
        heading_offset = heading.find(heading_key)

        # Occurrences of the key inside the previous section come before the
        # heading itself, so the heading is the occurrence right after them.
        occurrences_before = adv_sections[adv_ind - 1].text.count(heading_key)

        candidate_start = (
            _find_nth(
                basic_full_text,
                heading_key,
                occurrences_before + 1,
                start=heading_start,
            )
            - heading_offset
        )
        if candidate_start < 0:
            logger.warning(
                f"Heading key {heading_key} from heading {heading} not found in basic text"
            )
            # Keep the previous start so the text is attributed to a later match.
            continue

        new_sections.append(
            TextSection(
                link=adv_sections[adv_ind - 1].link,
                text=basic_full_text[heading_start:candidate_start],
            )
        )
        heading_start = candidate_start

    # handle last section
    new_sections.append(
        TextSection(link=adv_sections[-1].link, text=basic_full_text[heading_start:])
    )
    return new_sections
295+
296+
223297
def convert_drive_item_to_document(
224298
file: GoogleDriveFileType,
225299
drive_service: Callable[[], GoogleDriveService],
@@ -244,10 +318,17 @@ def convert_drive_item_to_document(
244318
try:
245319
# get_document_sections is the advanced approach for Google Docs
246320
doc_sections = get_document_sections(
247-
docs_service=docs_service(), doc_id=file.get("id", "")
321+
docs_service=docs_service(),
322+
doc_id=file.get("id", ""),
248323
)
249324
if doc_sections:
250325
sections = cast(list[TextSection | ImageSection], doc_sections)
326+
if any(SMART_CHIP_CHAR in section.text for section in doc_sections):
327+
basic_sections = _download_and_extract_sections_basic(
328+
file, drive_service(), allow_images
329+
)
330+
sections = align_basic_advanced(basic_sections, doc_sections)
331+
251332
except Exception as e:
252333
logger.warning(
253334
f"Error in advanced parsing: {e}. Falling back to basic extraction."

backend/onyx/connectors/google_drive/section_extraction.py

+136-53
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,20 @@
55
from onyx.connectors.google_utils.resources import GoogleDocsService
66
from onyx.connectors.models import TextSection
77

8+
HEADING_DELIMITER = "\n"
9+
810

911
class CurrentHeading(BaseModel):
10-
id: str
12+
id: str | None
1113
text: str
1214

1315

14-
def _build_gdoc_section_link(doc_id: str, heading_id: str) -> str:
16+
def _build_gdoc_section_link(doc_id: str, tab_id: str, heading_id: str | None) -> str:
1517
"""Builds a Google Doc link that jumps to a specific heading"""
1618
# NOTE: doesn't support docs with multiple tabs atm, if we need that ask
1719
# @Chris
18-
return (
19-
f"https://docs.google.com/document/d/{doc_id}/edit?tab=t.0#heading={heading_id}"
20-
)
20+
heading_str = f"#heading={heading_id}" if heading_id else ""
21+
return f"https://docs.google.com/document/d/{doc_id}/edit?tab={tab_id}{heading_str}"
2122

2223

2324
def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
@@ -31,75 +32,157 @@ def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
3132
for element in paragraph.get("elements", []):
3233
if "textRun" in element:
3334
text_elements.append(element["textRun"].get("content", ""))
35+
36+
# Handle links
37+
if "textStyle" in element and "link" in element["textStyle"]:
38+
text_elements.append(f"({element['textStyle']['link'].get('url', '')})")
39+
40+
if "person" in element:
41+
name = element["person"].get("personProperties", {}).get("name", "")
42+
email = element["person"].get("personProperties", {}).get("email", "")
43+
person_str = "<Person|"
44+
if name:
45+
person_str += f"name: {name}, "
46+
if email:
47+
person_str += f"email: {email}"
48+
person_str += ">"
49+
text_elements.append(person_str)
50+
51+
if "richLink" in element:
52+
props = element["richLink"].get("richLinkProperties", {})
53+
title = props.get("title", "")
54+
uri = props.get("uri", "")
55+
link_str = f"[{title}]({uri})"
56+
text_elements.append(link_str)
57+
3458
return "".join(text_elements)
3559

3660

61+
def _extract_text_from_table(table: dict[str, Any]) -> str:
62+
"""
63+
Extracts the text content from a table element.
64+
"""
65+
row_strs = []
66+
67+
for row in table.get("tableRows", []):
68+
cells = row.get("tableCells", [])
69+
cell_strs = []
70+
for cell in cells:
71+
child_elements = cell.get("content", {})
72+
cell_str = []
73+
for child_elem in child_elements:
74+
if "paragraph" not in child_elem:
75+
continue
76+
cell_str.append(_extract_text_from_paragraph(child_elem["paragraph"]))
77+
cell_strs.append("".join(cell_str))
78+
row_strs.append(", ".join(cell_strs))
79+
return "\n".join(row_strs)
80+
81+
3782
def get_document_sections(
3883
docs_service: GoogleDocsService,
3984
doc_id: str,
4085
) -> list[TextSection]:
4186
"""Extracts sections from a Google Doc, including their headings and content"""
4287
# Fetch the document structure
43-
doc = docs_service.documents().get(documentId=doc_id).execute()
88+
http_request = docs_service.documents().get(documentId=doc_id)
89+
90+
# Google has poor support for tabs in the docs api, see
91+
# https://cloud.google.com/python/docs/reference/cloudtasks/
92+
# latest/google.cloud.tasks_v2.types.HttpRequest
93+
# https://developers.google.com/workspace/docs/api/how-tos/tabs
94+
# https://developers.google.com/workspace/docs/api/reference/rest/v1/documents/get
95+
# this is a hack to use the param mentioned in the rest api docs
96+
# TODO: check if it can be specified i.e. in documents()
97+
http_request.uri += "&includeTabsContent=true"
98+
doc = http_request.execute()
4499

45100
# Get the content
46-
content = doc.get("body", {}).get("content", [])
101+
tabs = doc.get("tabs", {})
102+
sections: list[TextSection] = []
103+
for tab in tabs:
104+
sections.extend(get_tab_sections(tab, doc_id))
105+
return sections
106+
107+
108+
def _is_heading(paragraph: dict[str, Any]) -> bool:
109+
"""Checks if a paragraph (a block of text in a drive document) is a heading"""
110+
if not (
111+
"paragraphStyle" in paragraph
112+
and "namedStyleType" in paragraph["paragraphStyle"]
113+
):
114+
return False
115+
116+
style = paragraph["paragraphStyle"]["namedStyleType"]
117+
is_heading = style.startswith("HEADING_")
118+
is_title = style.startswith("TITLE")
119+
return is_heading or is_title
120+
121+
122+
def _add_finished_section(
    sections: list[TextSection],
    doc_id: str,
    tab_id: str,
    current_heading: CurrentHeading,
    current_section: list[str],
) -> None:
    """Appends the section built so far to ``sections`` if it has any content.

    The list is modified in place and nothing is returned. A section is only
    added when there is body text or a non-empty heading text. Its text is the
    heading followed by the newline-joined body, and its link points at the
    heading within the given tab.
    """
    if not current_section and not current_heading.text:
        return

    # this is unlikely to ever matter, but helps if the doc contains weird headings
    heading_line = current_heading.text.replace(HEADING_DELIMITER, "")
    body = "\n".join(current_section)
    full_text = f"{heading_line}{HEADING_DELIMITER}" + body
    sections.append(
        TextSection(
            text=full_text.strip(),
            link=_build_gdoc_section_link(doc_id, tab_id, current_heading.id),
        )
    )
147+
148+
def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]:
149+
tab_id = tab["tabProperties"]["tabId"]
150+
content = tab.get("documentTab", {}).get("body", {}).get("content", [])
47151

48152
sections: list[TextSection] = []
49153
current_section: list[str] = []
50-
current_heading: CurrentHeading | None = None
154+
current_heading = CurrentHeading(id=None, text="")
51155

52156
for element in content:
53-
if "paragraph" not in element:
54-
continue
55-
56-
paragraph = element["paragraph"]
57-
58-
# Check if this is a heading
59-
if (
60-
"paragraphStyle" in paragraph
61-
and "namedStyleType" in paragraph["paragraphStyle"]
62-
):
63-
style = paragraph["paragraphStyle"]["namedStyleType"]
64-
is_heading = style.startswith("HEADING_")
65-
is_title = style.startswith("TITLE")
66-
67-
if is_heading or is_title:
68-
# If we were building a previous section, add it to sections list
69-
if current_heading is not None and current_section:
70-
heading_text = current_heading.text
71-
section_text = f"{heading_text}\n" + "\n".join(current_section)
72-
sections.append(
73-
TextSection(
74-
text=section_text.strip(),
75-
link=_build_gdoc_section_link(doc_id, current_heading.id),
76-
)
77-
)
78-
current_section = []
79-
80-
# Start new heading
81-
heading_id = _extract_id_from_heading(paragraph)
82-
heading_text = _extract_text_from_paragraph(paragraph)
83-
current_heading = CurrentHeading(
84-
id=heading_id,
85-
text=heading_text,
86-
)
157+
if "paragraph" in element:
158+
paragraph = element["paragraph"]
159+
160+
# If this is not a heading, add content to current section
161+
if not _is_heading(paragraph):
162+
text = _extract_text_from_paragraph(paragraph)
163+
if text.strip():
164+
current_section.append(text)
87165
continue
88166

89-
# Add content to current section
90-
if current_heading is not None:
91-
text = _extract_text_from_paragraph(paragraph)
167+
_add_finished_section(
168+
sections, doc_id, tab_id, current_heading, current_section
169+
)
170+
171+
current_section = []
172+
173+
# Start new heading
174+
heading_id = _extract_id_from_heading(paragraph)
175+
heading_text = _extract_text_from_paragraph(paragraph)
176+
current_heading = CurrentHeading(
177+
id=heading_id,
178+
text=heading_text,
179+
)
180+
elif "table" in element:
181+
text = _extract_text_from_table(element["table"])
92182
if text.strip():
93183
current_section.append(text)
94184

95185
# Don't forget to add the last section
96-
if current_heading is not None and current_section:
97-
section_text = f"{current_heading.text}\n" + "\n".join(current_section)
98-
sections.append(
99-
TextSection(
100-
text=section_text.strip(),
101-
link=_build_gdoc_section_link(doc_id, current_heading.id),
102-
)
103-
)
186+
_add_finished_section(sections, doc_id, tab_id, current_heading, current_section)
104187

105188
return sections

backend/onyx/connectors/models.py

-2
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ class TextSection(Section):
3939
"""Section containing text content"""
4040

4141
text: str
42-
link: str | None = None
4342

4443
def __sizeof__(self) -> int:
4544
return sys.getsizeof(self.text) + sys.getsizeof(self.link)
@@ -49,7 +48,6 @@ class ImageSection(Section):
4948
"""Section containing an image reference"""
5049

5150
image_file_name: str
52-
link: str | None = None
5351

5452
def __sizeof__(self) -> int:
5553
return sys.getsizeof(self.image_file_name) + sys.getsizeof(self.link)

0 commit comments

Comments
 (0)