1+ import json
12import os
23import subprocess
34
45from natsort import natsorted
56from pdf2image import convert_from_path
67
78from assemblyline_v4_service .common .base import ServiceBase
8- from assemblyline_v4_service .common .result import Heuristic , Result , ResultImageSection
9+ from assemblyline_v4_service .common .result import BODY_FORMAT , Result , ResultImageSection , ResultJSONSection , Heuristic
10+ from assemblyline_v4_service .common .extractor .ocr import ocr_detections
911
10- from helper .emlrender import processEml as eml2image
11- from helper .outlookmsgfile import load as msg2eml
12+ from document_preview . helper .emlrender import processEml as eml2image
13+ from document_preview . helper .outlookmsgfile import load as msg2eml
1214
1315
1416class DocumentPreview (ServiceBase ):
@@ -40,12 +42,7 @@ def pdf_to_images(self, file):
4042 page .save (self .working_directory + "/output_" + str (i ) + ".jpeg" )
4143 i += 1
4244
43- def execute (self , request ):
44- result = Result ()
45-
46- file = request .file_path
47- file_type = request .file_type
48-
45+ def render_documents (self , file_type , file , file_contents ):
4946 # Word/Excel/Powerpoint
5047 if any (file_type == f'document/office/{ ms_product } ' for ms_product in ['word' , 'excel' , 'powerpoint' ]):
5148 converted = self .libreoffice_conversion (file )
@@ -57,35 +54,47 @@ def execute(self, request):
5754 # EML/MSG
5855 elif file_type .endswith ('email' ):
5956 # Convert MSG to EML where applicable
60- file_contents = msg2eml (file ).as_bytes () if file_type == 'document/office/email' else request . file_contents
57+ file_contents = msg2eml (file ).as_bytes () if file_type == 'document/office/email' else file_contents
6158
6259 # Render EML as PNG
6360 eml2image (file_contents , self .working_directory , self .log )
6461
65- # Attempt to preview unknown document format
66- else :
67- try :
68- converted = self .libreoffice_conversion (file )
69- if converted [0 ]:
70- self .pdf_to_images (self .working_directory + "/" + converted [1 ])
71- except :
72- # Conversion not successfull
73- pass
def execute(self, request):
    """Render the submitted file, attach page previews and run OCR on them.

    The file is rendered into ``self.working_directory`` by
    ``render_documents``. Every directory entry whose name contains
    ``"output"`` is treated as a rendered page. At most
    ``max_pages_rendered`` pages (a submission parameter) are added to an
    image section, and the same pages are passed to ``ocr_detections``.
    A JSON section with heuristic 1 is added for each page that yields at
    least one non-empty detection group.

    Args:
        request: The service request; this method reads ``file_type``,
            ``file_path``, ``file_contents`` and ``get_param`` from it and
            stores the final ``Result`` on ``request.result``.
    """
    result = Result()

    # Attempt to render documents given and dump them to the working directory
    self.render_documents(request.file_type, request.file_path, request.file_contents)
    max_pages = request.get_param('max_pages_rendered')

    # List the working directory once; rendered pages are the entries whose
    # name contains "output". Natural sort keeps page 10 after page 9.
    previews = natsorted([s for s in os.listdir(self.working_directory) if "output" in s])
    images = []

    # Create an image gallery section to show the renderings
    if previews:
        total_pages = len(previews)
        # Clamp at zero so a negative parameter never shows a negative count.
        # NOTE(review): assumes max_pages is an int — confirm against the service manifest.
        displayed = max(0, min(max_pages, total_pages))
        image_section = ResultImageSection(request,
                                           "Successfully extracted the preview. "
                                           f"Displaying {displayed} of {total_pages}.")
        for i, preview in enumerate(previews[:displayed]):
            image_path = f"{self.working_directory}/{preview}"
            images.append(image_path)
            title = f"preview_{i}.jpeg"
            desc = f"Here's the preview for page {i}"
            image_section.add_image(image_path, title, desc)

        result.add_section(image_section)

    # Proceed with analysis of output images. `images` is already capped to
    # the displayed pages, so no further page-limit check is needed here.
    for image_path in images:
        detections = ocr_detections(image_path)
        # Only report pages where at least one detection group is non-empty.
        if any(detections.values()):
            result.add_section(
                ResultJSONSection(f'OCR Analysis on {os.path.basename(image_path)}',
                                  body=json.dumps(detections),
                                  heuristic=Heuristic(1, signatures={k: len(v) for k, v in detections.items()}))
            )
    request.result = result
0 commit comments