Skip to content

Commit fd5afa5

Browse files
committed
speed up emails by being selective about including images
1 parent 101dd4a commit fd5afa5

4 files changed

Lines changed: 58 additions & 50 deletions

File tree

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,5 +27,6 @@ COPY . .
2727
ARG version=4.0.0.dev1
2828
USER root
2929
RUN sed -i -e "s/\$SERVICE_TAG/$version/g" service_manifest.yml
30+
RUN unoconv --listener &
3031

3132
USER assemblyline

document_preview/document_preview.py

Lines changed: 20 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99

1010
from assemblyline_v4_service.common.base import ServiceBase
1111
from assemblyline_v4_service.common.result import Result, ResultImageSection
12+
from assemblyline_v4_service.common.request import ServiceRequest as Request
1213

1314
from document_preview.helper.emlrender import processEml as eml2image
1415
from PIL import Image
@@ -21,7 +22,6 @@ def __init__(self, config=None):
2122
super(DocumentPreview, self).__init__(config)
2223

2324
def start(self):
24-
subprocess.Popen(["unoconv", "--listener"])
2525
self.log.debug("Document preview service started")
2626

2727
def stop(self):
@@ -59,29 +59,32 @@ def pdf_to_images(self, file):
5959
page.save(self.working_directory + "/output_" + str(i) + ".jpeg")
6060
i += 1
6161

62-
def render_documents(self, file_type, file, file_contents, max_pages=1):
62+
def render_documents(self, request: Request, max_pages=1):
6363
# Word/Excel/Powerpoint
64-
if any(file_type == f'document/office/{ms_product}' for ms_product in ['word', 'excel', 'powerpoint']):
65-
orientation = "landscape" if file_type.endswith('excel') else "portrait"
66-
converted = self.office_conversion(file, orientation, max_pages)
64+
if any(request.file_type == f'document/office/{ms_product}' for ms_product in ['word', 'excel', 'powerpoint']):
65+
orientation = "landscape" if any(request.file_type.endswith(type)
66+
for type in ['excel', 'powerpoint']) else "portrait"
67+
converted = self.office_conversion(request.file_path, orientation, max_pages)
6768
if converted[0]:
6869
self.pdf_to_images(self.working_directory + "/" + converted[1])
6970
# PDF
70-
elif file_type == 'document/pdf':
71-
self.pdf_to_images(file)
71+
elif request.file_type == 'document/pdf':
72+
self.pdf_to_images(request.file_path)
7273
# EML/MSG
73-
elif file_type.endswith('email'):
74+
elif request.file_type.endswith('email'):
75+
file_contents = request.file_contents
7476
# Convert MSG to EML where applicable
75-
if file_type == 'document/office/email':
77+
if request.file_type == 'document/office/email':
7678
with tempfile.NamedTemporaryFile() as tmp:
77-
subprocess.run(['msgconvert', '-outfile', tmp.name, file])
79+
subprocess.run(['msgconvert', '-outfile', tmp.name, request.file_path])
7880
tmp.seek(0)
7981
file_contents = tmp.read()
8082

8183
# Render EML as PNG
8284
# If we have internet access, we'll attempt to load external images
8385
output_image = eml2image(file_contents, self.working_directory, self.log,
84-
load_images=self.service_attributes.docker_config.allow_internet_access)
86+
load_ext_images=self.service_attributes.docker_config.allow_internet_access,
87+
load_images=request.get_param('load_email_images'))
8588
img = Image.open(output_image)
8689
img_dim = img.size
8790
if img_dim[1] > WEBP_MAX_SIZE:
@@ -93,14 +96,14 @@ def render_documents(self, file_type, file, file_contents, max_pages=1):
9396
height = img_dim[1] - pos_y
9497
box = (0, pos_y, img_dim[0], pos_y + height)
9598
slice = img.crop(box)
96-
slice.save(f"{output_image}_{index}", "PNG")
99+
slice.save(os.path.join(self.working_directory, f"output_{index}.png"), "PNG")
97100
index += 1
98101
pos_y = index * WEBP_MAX_SIZE
99102

100103
os.remove(output_image)
101104

102-
elif file_type.endswith('emf'):
103-
self.libreoffice_conversion(file, convert_to="png")
105+
elif request.file_type.endswith('emf'):
106+
self.libreoffice_conversion(request.file_path, convert_to="png")
104107

105108
def execute(self, request):
106109
start = time()
@@ -109,7 +112,7 @@ def execute(self, request):
109112
# Attempt to render documents given and dump them to the working directory
110113
max_pages = request.get_param('max_pages_rendered')
111114
try:
112-
self.render_documents(request.file_type, request.file_path, request.file_contents, max_pages)
115+
self.render_documents(request, max_pages)
113116
except Exception as e:
114117
# Unable to complete analysis after unexpected error, give up
115118
self.log.error(e)
@@ -119,9 +122,10 @@ def execute(self, request):
119122
if any("output" in s for s in os.listdir(self.working_directory)):
120123
previews = [s for s in os.listdir(self.working_directory) if "output" in s]
121124
image_section = ResultImageSection(request, "Successfully extracted the preview.")
125+
heur_id = 1 if request.deep_scan or request.get_param('run_ocr') else None
122126
[image_section.add_image(f"{self.working_directory}/{preview}",
123127
name=f"page_{str(i).zfill(3)}.jpeg", description=f"Here's the preview for page {i}",
124-
ocr_heuristic_id=1)
128+
ocr_heuristic_id=heur_id)
125129
for i, preview in enumerate(natsorted(previews))]
126130

127131
result.add_section(image_section)

document_preview/helper/emlrender.py

Lines changed: 27 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -15,9 +15,9 @@
1515
import email
1616
import email.header
1717
import quopri
18-
import hashlib
1918
import base64
2019
import regex
20+
from tempfile import NamedTemporaryFile
2121

2222
try:
2323
import imgkit
@@ -58,7 +58,7 @@ def appendImages(images):
5858
return new_im
5959

6060

61-
def processEml(data, dumpDir, logger, load_images=False):
61+
def processEml(data, output_dir, logger, load_ext_images=False, load_images=False):
6262
'''
6363
Process the email (bytes), extract MIME parts and useful headers.
6464
Generate a PNG picture of the mail
@@ -104,7 +104,7 @@ def processEml(data, dumpDir, logger, load_images=False):
104104
idField = idField.replace('<', '&lt;').replace('>', '&gt;')
105105

106106
imgkitOptions = {'load-error-handling': 'skip'}
107-
if not load_images:
107+
if not load_ext_images:
108108
imgkitOptions.update({'no-images': None})
109109
# imgkitOptions.update({ 'quiet': None })
110110
imagesList = []
@@ -121,15 +121,13 @@ def processEml(data, dumpDir, logger, load_images=False):
121121
</table>
122122
<hr></p>
123123
''' % (dateField, fromField, toField, subjectField, idField)
124-
m = hashlib.md5()
125-
m.update(headers.encode('utf-8'))
126-
imagePath = f'output_{m.hexdigest()}.png'
127124
try:
128-
imgkit.from_string(headers, dumpDir + '/' + imagePath, options=imgkitOptions)
129-
logger.info('Created headers %s' % imagePath)
130-
imagesList.append(dumpDir + '/' + imagePath)
131-
except:
132-
logger.warning('Creation of headers failed')
125+
header_path = NamedTemporaryFile(suffix=".png").name
126+
imgkit.from_string(headers, header_path, options=imgkitOptions)
127+
logger.info('Created headers %s' % header_path)
128+
imagesList.append(header_path)
129+
except Exception as e:
130+
logger.warning(f'Creation of headers failed: {e}')
133131

134132
#
135133
# Main loop - process the MIME parts
@@ -157,38 +155,33 @@ def processEml(data, dumpDir, logger, load_images=False):
157155
# for char in dirtyChars:
158156
# payload = payload.replace(char, '')
159157

160-
# Generate MD5 hash of the payload
161-
m = hashlib.md5()
162-
m.update(payload.encode('utf-8'))
163-
imagePath = f'output_{m.hexdigest()}.png'
164158
try:
165-
imgkit.from_string(payload, dumpDir + '/' + imagePath, options=imgkitOptions)
166-
logger.info('Decoded %s' % imagePath)
167-
imagesList.append(dumpDir + '/' + imagePath)
159+
payload_path = NamedTemporaryFile(suffix=".png").name
160+
imgkit.from_string(payload, payload_path, options=imgkitOptions)
161+
logger.info('Decoded %s' % payload_path)
162+
imagesList.append(payload_path)
168163
except Exception as e:
169164
logger.warning(f'Decoding this MIME part returned error: {e}')
170-
elif mimeType in imageTypes:
165+
166+
elif mimeType in imageTypes and load_images:
171167
payload = part.get_payload(decode=False)
168+
payload_path = NamedTemporaryFile(suffix=".png").name
172169
imgdata = base64.b64decode(payload)
173-
# Generate MD5 hash of the payload
174-
m = hashlib.md5()
175-
m.update(payload.encode('utf-8'))
176-
imagePath = m.hexdigest() + '.' + mimeType.split('/')[1]
177170
try:
178-
with open(dumpDir + '/' + imagePath, 'wb') as f:
171+
with open(payload_path, 'wb') as f:
179172
f.write(imgdata)
180-
logger.info('Decoded %s' % imagePath)
181-
imagesList.append(dumpDir + '/' + imagePath)
173+
logger.info('Decoded %s' % payload_path)
174+
imagesList.append(payload_path)
182175
except Exception as e:
183176
logger.warning(f'Decoding this MIME part returned error: {e}')
184-
else:
185-
fileName = part.get_filename()
186-
if not fileName:
187-
fileName = "Unknown"
188-
attachments.append("%s (%s)" % (fileName, mimeType))
189-
logger.info('Skipped attachment %s (%s)' % (fileName, mimeType))
190-
191-
resultImage = dumpDir + '/' + 'output.png'
177+
# else:
178+
# fileName = part.get_filename()
179+
# if not fileName:
180+
# fileName = "Unknown"
181+
# attachments.append("%s (%s)" % (fileName, mimeType))
182+
# logger.info('Skipped attachment %s (%s)' % (fileName, mimeType))
183+
184+
resultImage = os.path.join(output_dir, 'output.png')
192185
if len(imagesList) > 0:
193186
images = list(map(Image.open, imagesList))
194187
combo = appendImages(images)

service_manifest.yml

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,16 @@ submission_params:
2323
value: 1
2424
default: 1
2525

26+
- name: run_ocr
27+
type: bool
28+
value: false
29+
default: false
30+
31+
- name: load_email_images
32+
type: bool
33+
value: false
34+
default: false
35+
2636
heuristics:
2737
- heur_id: 1
2838
name: OCR Detection found

0 commit comments

Comments
 (0)