2020
2121
2222class PDFProcessor (Processor ):
23- artifact_dict = None #create_model_dict()
23+ artifact_dict = None
2424
2525 def __init__ (self , config = None ):
2626 super ().__init__ (config = config or ProcessorConfig ())
@@ -34,7 +34,7 @@ def accepts(cls, file: FileDescriptor) -> bool:
3434 def load_models (disable_image_extraction : bool = False ):
3535 if PDFProcessor .artifact_dict is None :
3636 PDFProcessor .artifact_dict = create_model_dict ()
37-
37+
3838 marker_config = {
3939 "disable_image_extraction" : disable_image_extraction ,
4040 "languages" : None ,
@@ -46,9 +46,9 @@ def load_models(disable_image_extraction: bool = False):
4646 artifact_dict = PDFProcessor .artifact_dict ,
4747 config = config_parser .generate_config_dict (),
4848 )
49-
49+
5050 converter .initialize_processors (converter .default_processors )
51-
51+
5252 return converter
5353
5454 # overwriting the process_batch
@@ -178,8 +178,8 @@ def _extract_images(pdf_doc, xref) -> Optional[Image.Image]:
178178 if self .config .custom_config .get ("extract_images" , True ):
179179 for img_info in page .get_images (full = False ):
180180 image = _extract_images (pdf_doc , img_info [0 ])
181- if image and clean_image (
182- image
181+ if (
182+ image and clean_image ( image )
183183 ): # clean image filters images below size 512x512 and variance below 100, these are defaults and can be changed
184184 embedded_images .append (image )
185185 all_text .append (self .config .attachment_tag )
@@ -209,7 +209,7 @@ def _process_parallel(
209209 ):
210210 try :
211211 torch .cuda .set_device (gpu_id )
212-
212+
213213 if PDFProcessor .artifact_dict is None :
214214 PDFProcessor .artifact_dict = create_model_dict ()
215215
0 commit comments