22import logging
33import re
44from multiprocessing import Manager , Process , set_start_method
5- from typing import List , Optional , Tuple , cast
5+ from typing import Any , Dict , List , Optional , Tuple , cast
66
77import pymupdf
88import torch
@@ -148,7 +148,7 @@ def process(self, file_path: str) -> MultimodalSample:
148148
149149 paragraph_starts , text = self ._parse_pagination (cast (str , text ))
150150
151- metadata = {"file_path" : file_path }
151+ metadata : Dict [ str , Any ] = {"file_path" : file_path }
152152 if paragraph_starts :
153153 metadata ["paragraph_starts" ] = paragraph_starts
154154
@@ -218,7 +218,7 @@ def _extract_images(pdf_doc, xref) -> Optional[Image.Image]:
218218 if image_bytes is None :
219219 logging .error (f"No image data found for xref { xref } " )
220220
221- return Image .open (io .BytesIO (image_bytes )).convert ("RGB" )
221+ return Image .open (io .BytesIO (cast ( bytes , image_bytes ) )).convert ("RGB" )
222222
223223 except KeyError as e :
224224 logging .error (f"KeyError while extracting image: { e } " )
@@ -236,7 +236,7 @@ def _extract_images(pdf_doc, xref) -> Optional[Image.Image]:
236236 )
237237 return None
238238
239- for page_num , page in enumerate (pdf_doc ):
239+ for page_num , page in enumerate (pdf_doc ): # pyright: ignore[reportArgumentType]
240240 text = clean_text (page .get_text ()) # type: ignore[attr-defined]
241241
242242 if text .strip ():
0 commit comments