@@ -86,13 +86,15 @@ def _get_internal_urls(self) -> Iterator[DocumentInfo]:
86
86
if "contents" not in module or len (module ["contents" ]) == 0 :
87
87
continue
88
88
89
+ url = self .normalize_url (module ["contents" ][0 ]["fileurl" ])
90
+
89
91
yield DocumentInfo (
90
- url = self . normalize_url ( module [ "contents" ][ 0 ][ "fileurl" ]) ,
91
- type = self ._get_document_type (module [ "contents" ][ 0 ][ "fileurl" ] ),
92
+ url = url ,
93
+ type = self ._get_document_type (url ),
92
94
title = module ["name" ],
93
95
created = datetime .fromtimestamp (module ["contents" ][0 ]["timecreated" ], tz = timezone .utc ),
94
96
modified = datetime .fromtimestamp (module ["contents" ][0 ]["timemodified" ], tz = timezone .utc ),
95
- file_extension = self . normalize_url ( module [ "contents" ][ 0 ][ "fileurl" ]). split ( "." )[ - 1 ],
97
+ extension = url . rsplit ( "." , 1 )[ 1 ],
96
98
)
97
99
98
100
def _get_external_urls (self ) -> Iterator [DocumentInfo ]:
@@ -124,12 +126,15 @@ def _get_external_urls(self) -> Iterator[DocumentInfo]:
124
126
if content ["course" ] != self .config .course :
125
127
continue
126
128
129
+ url = self .normalize_url (content ["externalurl" ])
130
+
127
131
yield DocumentInfo (
128
- url = self . normalize_url ( content [ "externalurl" ]) ,
129
- type = self ._get_document_type (content [ "externalurl" ] ),
132
+ url = url ,
133
+ type = self ._get_document_type (url ),
130
134
title = content ["name" ],
131
135
created = datetime .fromtimestamp (content ["timemodified" ], tz = timezone .utc ),
132
136
modified = datetime .fromtimestamp (content ["timemodified" ], tz = timezone .utc ),
137
+ extension = url .rsplit ("." , 1 )[1 ],
133
138
)
134
139
135
140
@staticmethod
@@ -206,20 +211,11 @@ def document_needs_parsing(self, document: DocumentInfo) -> bool:
206
211
def document_has_content (self , document : DocumentInfo ) -> bool :
207
212
"""Return whether the document has content."""
208
213
209
- if document .file_extension == "docx" :
214
+ if document .extension == "docx" :
210
215
return True
211
216
212
217
return False
213
218
214
- def get_content (self , document : DocumentInfo , content : bytes ) -> Optional [str ]:
215
- """Get file content of docx circulars."""
216
-
217
- def ignore_images (_image : Image ) -> Dict :
218
- return {}
219
-
220
- result = convert_to_html (io .BytesIO (content ), convert_image = ignore_images )
221
- return typing .cast (str , result .value ) # The generated HTML
222
-
223
219
@with_span (op = "parse" , pass_span = True )
224
220
def parse_document (self , document : DocumentInfo , content : bytes , effective : date , span : Span ) -> None : # type: ignore[override]
225
221
"""Parse the document and store extracted data."""
@@ -246,6 +242,21 @@ def parse_document(self, document: DocumentInfo, content: bytes, effective: date
246
242
# This cannot happen because only menus are provided by the API
247
243
raise KeyError ("Unknown parsable document type from the e-classroom" )
248
244
245
+ @with_span (op = "parse" , pass_span = True )
246
+ def get_content (self , document : DocumentInfo , content : bytes , span : Span ) -> Optional [str ]: # type: ignore[override]
247
+ """Convert content of DOCX circulars to HTML."""
248
+
249
+ def ignore_images (_image : Image ) -> Dict :
250
+ return {}
251
+
252
+ # Set basic Sentry span info
253
+ span .set_tag ("document.format" , "docx" )
254
+ span .set_tag ("document.type" , document .type .value )
255
+
256
+ # Convert DOCX to HTML
257
+ result = convert_to_html (io .BytesIO (content ), convert_image = ignore_images )
258
+ return typing .cast (str , result .value )
259
+
249
260
def _normalize_subject_name (self , name : str ) -> Optional [str ]:
250
261
"""Normalize the subject name."""
251
262
0 commit comments