Skip to content

Commit bbb6be9

Browse files
authored
Merge pull request #4 from CybercentreCanada/AL-1667
Al 1667
2 parents 6674fa2 + 9db8fe1 commit bbb6be9

5 files changed

Lines changed: 55 additions & 35 deletions

File tree

Dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
ARG branch=latest
22
FROM cccs/assemblyline-v4-service-base:$branch
33

4-
ENV SERVICE_PATH document_preview.DocumentPreview
4+
ENV SERVICE_PATH document_preview.document_preview.DocumentPreview
55

66
USER root
77

88
RUN mkdir -p /usr/share/man/man1mkdir -p /usr/share/man/man1
9-
RUN apt-get update && apt-get install -y wget
9+
RUN apt-get update && apt-get install -y wget tesseract-ocr
1010
RUN wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6-1/wkhtmltox_0.12.6-1.buster_amd64.deb
1111
RUN apt-get install -y poppler-utils libreoffice ./wkhtmltox_0.12.6-1.buster_amd64.deb --no-install-recommends
12-
RUN pip3 install pdf2image Pillow natsort imgkit compoundfiles compressed_rtf
12+
RUN pip install pdf2image Pillow natsort imgkit compoundfiles compressed_rtf pytesseract
1313

1414
USER assemblyline
1515

Lines changed: 35 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
1+
import json
12
import os
23
import subprocess
34

45
from natsort import natsorted
56
from pdf2image import convert_from_path
67

78
from assemblyline_v4_service.common.base import ServiceBase
8-
from assemblyline_v4_service.common.result import Heuristic, Result, ResultImageSection
9+
from assemblyline_v4_service.common.result import BODY_FORMAT, Result, ResultImageSection, ResultJSONSection, Heuristic
10+
from assemblyline_v4_service.common.extractor.ocr import ocr_detections
911

10-
from helper.emlrender import processEml as eml2image
11-
from helper.outlookmsgfile import load as msg2eml
12+
from document_preview.helper.emlrender import processEml as eml2image
13+
from document_preview.helper.outlookmsgfile import load as msg2eml
1214

1315

1416
class DocumentPreview(ServiceBase):
@@ -40,12 +42,7 @@ def pdf_to_images(self, file):
4042
page.save(self.working_directory + "/output_" + str(i) + ".jpeg")
4143
i += 1
4244

43-
def execute(self, request):
44-
result = Result()
45-
46-
file = request.file_path
47-
file_type = request.file_type
48-
45+
def render_documents(self, file_type, file, file_contents):
4946
# Word/Excel/Powerpoint
5047
if any(file_type == f'document/office/{ms_product}' for ms_product in ['word', 'excel', 'powerpoint']):
5148
converted = self.libreoffice_conversion(file)
@@ -57,35 +54,47 @@ def execute(self, request):
5754
# EML/MSG
5855
elif file_type.endswith('email'):
5956
# Convert MSG to EML where applicable
60-
file_contents = msg2eml(file).as_bytes() if file_type == 'document/office/email' else request.file_contents
57+
file_contents = msg2eml(file).as_bytes() if file_type == 'document/office/email' else file_contents
6158

6259
# Render EML as PNG
6360
eml2image(file_contents, self.working_directory, self.log)
6461

65-
# Attempt to preview unknown document format
66-
else:
67-
try:
68-
converted = self.libreoffice_conversion(file)
69-
if converted[0]:
70-
self.pdf_to_images(self.working_directory + "/" + converted[1])
71-
except:
72-
# Conversion not successfull
73-
pass
62+
def execute(self, request):
63+
result = Result()
7464

75-
if any("output" in s for s in os.listdir(self.working_directory)):
76-
image_section = ResultImageSection(request, "Successfully extracted the preview.")
65+
# Attempt to render documents given and dump them to the working directory
66+
self.render_documents(request.file_type, request.file_path, request.file_contents)
67+
max_pages = request.get_param('max_pages_rendered')
68+
images = list()
7769

78-
i = 0
70+
# Create an image gallery section to show the renderings
71+
if any("output" in s for s in os.listdir(self.working_directory)):
7972
previews = [s for s in os.listdir(self.working_directory) if "output" in s]
80-
for preview in natsorted(previews):
73+
total_pages = len(previews)
74+
image_section = ResultImageSection(request,
75+
"Successfully extracted the preview. "
76+
f"Displaying {min(max_pages, total_pages)} of {total_pages}.")
77+
for i, preview in enumerate(natsorted(previews)):
78+
if i >= max_pages:
79+
break
8180
image_path = f"{self.working_directory}/{preview}"
81+
images.append(image_path)
8282
title = f"preview_{i}.jpeg"
8383
desc = f"Here's the preview for page {i}"
84-
if request.get_param('analyze_output'):
85-
request.add_extracted(image_path, title, desc)
8684
image_section.add_image(image_path, title, desc)
87-
i += 1
8885

8986
result.add_section(image_section)
9087

88+
# Proceed with analysis of output images
89+
for i, image_path in enumerate(images):
90+
if i >= max_pages:
91+
break
92+
93+
detections = ocr_detections(image_path)
94+
if any(v for v in detections.values()):
95+
result.add_section(
96+
ResultJSONSection(f'OCR Analysis on {os.path.basename(image_path)}',
97+
body=json.dumps(detections),
98+
heuristic=Heuristic(1, signatures={k: len(v) for k, v in detections.items()}))
99+
)
91100
request.result = result

service_manifest.yml

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
name: DocumentPreview
22
version: $SERVICE_TAG
3-
description: Automatically extract the first page of a document as an image
3+
description: Use OCR to detect signs of malicious behaviour in Office and PDF files
44

55
accepts: (document/pdf|document/office/.*)
66
rejects: empty|metadata/.*
77

88
stage: CORE
9-
category: Extraction
9+
category: Static Analysis
1010

1111
file_required: true
1212
timeout: 60
@@ -17,10 +17,21 @@ is_external: false
1717
licence_count: 0
1818

1919
submission_params:
20-
- default: false
21-
value: false
22-
type: bool
23-
name: analyze_output
20+
# Default of 1 page is chosen because phishing campaigns tend to use single-page documents
21+
- name: max_pages_rendered
22+
type: int
23+
value: 1
24+
default: 1
25+
26+
heuristics:
27+
- heur_id: 1
28+
name: OCR Detection found
29+
description: Suspicious verbiage found in OCR inspection.
30+
score: 0
31+
signature_score_map:
32+
macros: 100
33+
ransomware: 100
34+
filetype: "*"
2435

2536
docker_config:
2637
image: cccs/assemblyline-service-document-preview:$SERVICE_TAG

0 commit comments

Comments
 (0)