99
1010from assemblyline_v4_service .common .base import ServiceBase
1111from assemblyline_v4_service .common .result import Result , ResultImageSection
12+ from assemblyline_v4_service .common .request import ServiceRequest as Request
1213
1314from document_preview .helper .emlrender import processEml as eml2image
1415from PIL import Image
@@ -21,7 +22,6 @@ def __init__(self, config=None):
2122 super (DocumentPreview , self ).__init__ (config )
2223
2324 def start (self ):
24- subprocess .Popen (["unoconv" , "--listener" ])
2525 self .log .debug ("Document preview service started" )
2626
2727 def stop (self ):
@@ -59,29 +59,32 @@ def pdf_to_images(self, file):
5959 page .save (self .working_directory + "/output_" + str (i ) + ".jpeg" )
6060 i += 1
6161
62- def render_documents (self , file_type , file , file_contents , max_pages = 1 ):
62+ def render_documents (self , request : Request , max_pages = 1 ):
6363 # Word/Excel/Powerpoint
64- if any (file_type == f'document/office/{ ms_product } ' for ms_product in ['word' , 'excel' , 'powerpoint' ]):
65- orientation = "landscape" if file_type .endswith ('excel' ) else "portrait"
66- converted = self .office_conversion (file , orientation , max_pages )
64+ if any (request .file_type == f'document/office/{ ms_product } ' for ms_product in ['word' , 'excel' , 'powerpoint' ]):
65+ orientation = "landscape" if any (request .file_type .endswith (type )
66+ for type in ['excel' , 'powerpoint' ]) else "portrait"
67+ converted = self .office_conversion (request .file_path , orientation , max_pages )
6768 if converted [0 ]:
6869 self .pdf_to_images (self .working_directory + "/" + converted [1 ])
6970 # PDF
70- elif file_type == 'document/pdf' :
71- self .pdf_to_images (file )
71+ elif request . file_type == 'document/pdf' :
72+ self .pdf_to_images (request . file_path )
7273 # EML/MSG
73- elif file_type .endswith ('email' ):
74+ elif request .file_type .endswith ('email' ):
75+ file_contents = request .file_contents
7476 # Convert MSG to EML where applicable
75- if file_type == 'document/office/email' :
77+ if request . file_type == 'document/office/email' :
7678 with tempfile .NamedTemporaryFile () as tmp :
77- subprocess .run (['msgconvert' , '-outfile' , tmp .name , file ])
79+ subprocess .run (['msgconvert' , '-outfile' , tmp .name , request . file_path ])
7880 tmp .seek (0 )
7981 file_contents = tmp .read ()
8082
8183 # Render EML as PNG
8284 # If we have internet access, we'll attempt to load external images
8385 output_image = eml2image (file_contents , self .working_directory , self .log ,
84- load_images = self .service_attributes .docker_config .allow_internet_access )
86+ load_ext_images = self .service_attributes .docker_config .allow_internet_access ,
87+ load_images = request .get_param ('load_email_images' ))
8588 img = Image .open (output_image )
8689 img_dim = img .size
8790 if img_dim [1 ] > WEBP_MAX_SIZE :
@@ -93,14 +96,14 @@ def render_documents(self, file_type, file, file_contents, max_pages=1):
9396 height = img_dim [1 ] - pos_y
9497 box = (0 , pos_y , img_dim [0 ], pos_y + height )
9598 slice = img .crop (box )
96- slice .save (f" { output_image } _ { index } " , "PNG" )
99+ slice .save (os . path . join ( self . working_directory , f"output_ { index } .png" ) , "PNG" )
97100 index += 1
98101 pos_y = index * WEBP_MAX_SIZE
99102
100103 os .remove (output_image )
101104
102- elif file_type .endswith ('emf' ):
103- self .libreoffice_conversion (file , convert_to = "png" )
105+ elif request . file_type .endswith ('emf' ):
106+ self .libreoffice_conversion (request . file_path , convert_to = "png" )
104107
105108 def execute (self , request ):
106109 start = time ()
@@ -109,7 +112,7 @@ def execute(self, request):
109112 # Attempt to render documents given and dump them to the working directory
110113 max_pages = request .get_param ('max_pages_rendered' )
111114 try :
112- self .render_documents (request . file_type , request . file_path , request . file_contents , max_pages )
115+ self .render_documents (request , max_pages )
113116 except Exception as e :
114117 # Unable to complete analysis after unexpected error, give up
115118 self .log .error (e )
@@ -119,9 +122,10 @@ def execute(self, request):
119122 if any ("output" in s for s in os .listdir (self .working_directory )):
120123 previews = [s for s in os .listdir (self .working_directory ) if "output" in s ]
121124 image_section = ResultImageSection (request , "Successfully extracted the preview." )
125+ heur_id = 1 if request .deep_scan or request .get_param ('run_ocr' ) else None
122126 [image_section .add_image (f"{ self .working_directory } /{ preview } " ,
123127 name = f"page_{ str (i ).zfill (3 )} .jpeg" , description = f"Here's the preview for page { i } " ,
124- ocr_heuristic_id = 1 )
128+ ocr_heuristic_id = heur_id )
125129 for i , preview in enumerate (natsorted (previews ))]
126130
127131 result .add_section (image_section )
0 commit comments