5
5
from onyx .connectors .google_utils .resources import GoogleDocsService
6
6
from onyx .connectors .models import TextSection
7
7
8
+ HEADING_DELIMITER = "\n "
9
+
8
10
9
11
class CurrentHeading(BaseModel):
    """Heading under which section content is currently being accumulated."""

    # Google Docs heading id used to build a "#heading=" link anchor.
    # None when the content is not attached to any heading (e.g. text that
    # appears before the first heading).
    id: str | None
    # Text of the heading paragraph; empty when there is no heading.
    text: str
12
14
13
15
14
- def _build_gdoc_section_link (doc_id : str , heading_id : str ) -> str :
16
+ def _build_gdoc_section_link (doc_id : str , tab_id : str , heading_id : str | None ) -> str :
15
17
"""Builds a Google Doc link that jumps to a specific heading"""
16
18
# NOTE: doesn't support docs with multiple tabs atm, if we need that ask
17
19
# @Chris
18
- return (
19
- f"https://docs.google.com/document/d/{ doc_id } /edit?tab=t.0#heading={ heading_id } "
20
- )
20
+ heading_str = f"#heading={ heading_id } " if heading_id else ""
21
+ return f"https://docs.google.com/document/d/{ doc_id } /edit?tab={ tab_id } { heading_str } "
21
22
22
23
23
24
def _extract_id_from_heading (paragraph : dict [str , Any ]) -> str :
@@ -31,75 +32,157 @@ def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
31
32
for element in paragraph .get ("elements" , []):
32
33
if "textRun" in element :
33
34
text_elements .append (element ["textRun" ].get ("content" , "" ))
35
+
36
+ # Handle links
37
+ if "textStyle" in element and "link" in element ["textStyle" ]:
38
+ text_elements .append (f"({ element ['textStyle' ]['link' ].get ('url' , '' )} )" )
39
+
40
+ if "person" in element :
41
+ name = element ["person" ].get ("personProperties" , {}).get ("name" , "" )
42
+ email = element ["person" ].get ("personProperties" , {}).get ("email" , "" )
43
+ person_str = "<Person|"
44
+ if name :
45
+ person_str += f"name: { name } , "
46
+ if email :
47
+ person_str += f"email: { email } "
48
+ person_str += ">"
49
+ text_elements .append (person_str )
50
+
51
+ if "richLink" in element :
52
+ props = element ["richLink" ].get ("richLinkProperties" , {})
53
+ title = props .get ("title" , "" )
54
+ uri = props .get ("uri" , "" )
55
+ link_str = f"[{ title } ]({ uri } )"
56
+ text_elements .append (link_str )
57
+
34
58
return "" .join (text_elements )
35
59
36
60
61
+ def _extract_text_from_table (table : dict [str , Any ]) -> str :
62
+ """
63
+ Extracts the text content from a table element.
64
+ """
65
+ row_strs = []
66
+
67
+ for row in table .get ("tableRows" , []):
68
+ cells = row .get ("tableCells" , [])
69
+ cell_strs = []
70
+ for cell in cells :
71
+ child_elements = cell .get ("content" , {})
72
+ cell_str = []
73
+ for child_elem in child_elements :
74
+ if "paragraph" not in child_elem :
75
+ continue
76
+ cell_str .append (_extract_text_from_paragraph (child_elem ["paragraph" ]))
77
+ cell_strs .append ("" .join (cell_str ))
78
+ row_strs .append (", " .join (cell_strs ))
79
+ return "\n " .join (row_strs )
80
+
81
+
37
82
def _flatten_tabs(tabs: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Returns every tab in `tabs`, each immediately followed (depth-first) by its nested child tabs"""
    flattened: list[dict[str, Any]] = []
    for tab in tabs:
        flattened.append(tab)
        flattened.extend(_flatten_tabs(tab.get("childTabs", [])))
    return flattened


def get_document_sections(
    docs_service: GoogleDocsService,
    doc_id: str,
) -> list[TextSection]:
    """Extracts sections from a Google Doc, including their headings and content.

    The document is fetched with the content of all of its tabs, and the
    sections of every tab (including nested child tabs) are returned in a
    single list: each tab's sections first, then those of its children.

    Args:
        docs_service: Google Docs API client used to fetch the document.
        doc_id: Id of the document to fetch.

    Returns:
        The sections of all tabs of the document, in tab order.
    """
    # Fetch the document structure
    http_request = docs_service.documents().get(documentId=doc_id)

    # Tab content is only returned when `includeTabsContent` is set, see
    # https://developers.google.com/workspace/docs/api/how-tos/tabs
    # https://developers.google.com/workspace/docs/api/reference/rest/v1/documents/get
    # HACK: the param from the REST docs is appended to the request URI by hand.
    # TODO: check if it can be specified directly, i.e. in documents().get()
    separator = "&" if "?" in http_request.uri else "?"
    http_request.uri += f"{separator}includeTabsContent=true"
    doc = http_request.execute()

    # Tabs form a tree (`childTabs`), so walk all of them rather than only
    # the top-level ones; otherwise content in nested tabs would be dropped.
    sections: list[TextSection] = []
    for tab in _flatten_tabs(doc.get("tabs", [])):
        sections.extend(get_tab_sections(tab, doc_id))
    return sections
106
+
107
+
108
+ def _is_heading (paragraph : dict [str , Any ]) -> bool :
109
+ """Checks if a paragraph (a block of text in a drive document) is a heading"""
110
+ if not (
111
+ "paragraphStyle" in paragraph
112
+ and "namedStyleType" in paragraph ["paragraphStyle" ]
113
+ ):
114
+ return False
115
+
116
+ style = paragraph ["paragraphStyle" ]["namedStyleType" ]
117
+ is_heading = style .startswith ("HEADING_" )
118
+ is_title = style .startswith ("TITLE" )
119
+ return is_heading or is_title
120
+
121
+
122
def _add_finished_section(
    sections: list[TextSection],
    doc_id: str,
    tab_id: str,
    current_heading: CurrentHeading,
    current_section: list[str],
) -> None:
    """Appends the section built so far to `sections`, if it has anything in it.

    `sections` is modified in place and nothing is returned. A section is
    only appended when there is accumulated content or the current heading
    has text; otherwise the list is left untouched.
    """
    has_content = bool(current_section) or bool(current_heading.text)
    if not has_content:
        return

    # Removing the delimiter from the heading is unlikely to ever matter,
    # but helps if the doc contains weird headings
    clean_heading = current_heading.text.replace(HEADING_DELIMITER, "")
    body_text = "\n".join(current_section)
    section_text = f"{clean_heading}{HEADING_DELIMITER}" + body_text

    finished_section = TextSection(
        text=section_text.strip(),
        link=_build_gdoc_section_link(doc_id, tab_id, current_heading.id),
    )
    sections.append(finished_section)
146
+
147
+
148
def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]:
    """Extracts the sections of a single tab of a Google Doc.

    The tab's body is walked in order. Every heading/title paragraph closes
    the section being built and starts a new one; other non-blank paragraphs
    and tables are appended to the section in progress. Content that appears
    before the first heading is collected under an empty heading with no id.

    Args:
        tab: Tab object; must contain `tabProperties.tabId`. The body is read
            from `documentTab.body.content` and treated as empty if missing.
        doc_id: Id of the document the tab belongs to, used to build links.

    Returns:
        The sections of the tab, in document order.
    """
    tab_id = tab["tabProperties"]["tabId"]
    content = tab.get("documentTab", {}).get("body", {}).get("content", [])

    sections: list[TextSection] = []
    current_section: list[str] = []
    # Placeholder heading for any content that precedes the first real heading
    current_heading = CurrentHeading(id=None, text="")

    for element in content:
        if "paragraph" in element:
            paragraph = element["paragraph"]

            # If this is not a heading, add content to current section
            if not _is_heading(paragraph):
                text = _extract_text_from_paragraph(paragraph)
                if text.strip():
                    current_section.append(text)
                continue

            # A new heading closes whatever section was being built
            _add_finished_section(
                sections, doc_id, tab_id, current_heading, current_section
            )

            current_section = []

            # Start new heading
            heading_id = _extract_id_from_heading(paragraph)
            heading_text = _extract_text_from_paragraph(paragraph)
            current_heading = CurrentHeading(
                id=heading_id,
                text=heading_text,
            )
        elif "table" in element:
            text = _extract_text_from_table(element["table"])
            if text.strip():
                current_section.append(text)

    # Don't forget to add the last section
    _add_finished_section(sections, doc_id, tab_id, current_heading, current_section)

    return sections
0 commit comments