CybercentreCanada
diff --git a/Dockerfile b/Dockerfile
Lines changed: 3 additions & 3 deletions
diff --git a/document_preview.py b/document_preview/document_preview.py
document_preview.py renamed to document_preview/document_preview.py
Lines changed: 35 additions & 26 deletions
diff --git a/helper/emlrender.py b/document_preview/helper/emlrender.py
helper/emlrender.py renamed to document_preview/helper/emlrender.py
diff --git a/helper/outlookmsgfile.py b/document_preview/helper/outlookmsgfile.py
helper/outlookmsgfile.py renamed to document_preview/helper/outlookmsgfile.py
diff --git a/service_manifest.yml b/service_manifest.yml
Lines changed: 17 additions & 6 deletions
@@ -1,15 +1,15 @@
 ARG branch=latest
 FROM cccs/assemblyline-v4-service-base:$branch
 
-ENV SERVICE_PATH document_preview.DocumentPreview
+ENV SERVICE_PATH document_preview.document_preview.DocumentPreview
 
 USER root
 
 RUN mkdir -p /usr/share/man/man1
-RUN apt-get update && apt-get install -y wget
+RUN apt-get update && apt-get install -y wget tesseract-ocr
 RUN wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6-1/wkhtmltox_0.12.6-1.buster_amd64.deb
 RUN apt-get install -y poppler-utils libreoffice  ./wkhtmltox_0.12.6-1.buster_amd64.deb --no-install-recommends
-RUN pip3 install pdf2image Pillow natsort imgkit compoundfiles compressed_rtf
+RUN pip install pdf2image Pillow natsort imgkit compoundfiles compressed_rtf pytesseract
 
 USER assemblyline
 
 
@@ -1,14 +1,16 @@
+import json
 import os
 import subprocess
 
 from natsort import natsorted
 from pdf2image import convert_from_path
 
 from assemblyline_v4_service.common.base import ServiceBase
-from assemblyline_v4_service.common.result import Heuristic, Result, ResultImageSection
+from assemblyline_v4_service.common.result import BODY_FORMAT, Result, ResultImageSection, ResultJSONSection, Heuristic
+from assemblyline_v4_service.common.extractor.ocr import ocr_detections
 
-from helper.emlrender import processEml as eml2image
-from helper.outlookmsgfile import load as msg2eml
+from document_preview.helper.emlrender import processEml as eml2image
+from document_preview.helper.outlookmsgfile import load as msg2eml
 
 
 class DocumentPreview(ServiceBase):
@@ -40,12 +42,7 @@ def pdf_to_images(self, file):
             page.save(self.working_directory + "/output_" + str(i) + ".jpeg")
             i += 1
 
-    def execute(self, request):
-        result = Result()
-
-        file = request.file_path
-        file_type = request.file_type
-
+    def render_documents(self, file_type, file, file_contents):
         # Word/Excel/Powerpoint
         if any(file_type == f'document/office/{ms_product}' for ms_product in ['word', 'excel', 'powerpoint']):
             converted = self.libreoffice_conversion(file)
@@ -57,35 +54,47 @@ def execute(self, request):
         # EML/MSG
         elif file_type.endswith('email'):
             # Convert MSG to EML where applicable
-            file_contents = msg2eml(file).as_bytes() if file_type == 'document/office/email' else request.file_contents
+            file_contents = msg2eml(file).as_bytes() if file_type == 'document/office/email' else file_contents
 
             # Render EML as PNG
             eml2image(file_contents, self.working_directory, self.log)
 
-        # Attempt to preview unknown document format
-        else:
-            try:
-                converted = self.libreoffice_conversion(file)
-                if converted[0]:
-                    self.pdf_to_images(self.working_directory + "/" + converted[1])
-            except:
-                # Conversion not successfull
-                pass
+    def execute(self, request):
+        result = Result()
 
-        if any("output" in s for s in os.listdir(self.working_directory)):
-            image_section = ResultImageSection(request, "Successfully extracted the preview.")
+        # Attempt to render documents given and dump them to the working directory
+        self.render_documents(request.file_type, request.file_path, request.file_contents)
+        max_pages = request.get_param('max_pages_rendered')
+        images = list()
 
-            i = 0
+        # Create an image gallery section to show the renderings
+        if any("output" in s for s in os.listdir(self.working_directory)):
             previews = [s for s in os.listdir(self.working_directory) if "output" in s]
-            for preview in natsorted(previews):
+            total_pages = len(previews)
+            image_section = ResultImageSection(request,
+                                               "Successfully extracted the preview. "
+                                               f"Displaying {min(max_pages, total_pages)} of {total_pages}.")
+            for i, preview in enumerate(natsorted(previews)):
+                if i >= max_pages:
+                    break
                 image_path = f"{self.working_directory}/{preview}"
+                images.append(image_path)
                 title = f"preview_{i}.jpeg"
                 desc = f"Here's the preview for page {i}"
-                if request.get_param('analyze_output'):
-                    request.add_extracted(image_path, title, desc)
                 image_section.add_image(image_path, title, desc)
-                i += 1
 
             result.add_section(image_section)
 
+        # Proceed with analysis of output images
+        for i, image_path in enumerate(images):
+            if i >= max_pages:
+                break
+
+            detections = ocr_detections(image_path)
+            if any(v for v in detections.values()):
+                result.add_section(
+                    ResultJSONSection(f'OCR Analysis on {os.path.basename(image_path)}',
+                                      body=json.dumps(detections),
+                                      heuristic=Heuristic(1, signatures={k: len(v) for k, v in detections.items()}))
+                )
         request.result = result
@@ -1,12 +1,12 @@
 name: DocumentPreview
 version: $SERVICE_TAG
-description: Automatically extract the first page of a document as an image
+description: Use OCR to detect signs of malicious behaviour in Office and PDF files
 
 accepts: (document/pdf|document/office/.*)
 rejects: empty|metadata/.*
 
 stage: CORE
-category: Extraction
+category: Static Analysis
 
 file_required: true
 timeout: 60
@@ -17,10 +17,21 @@ is_external: false
 licence_count: 0
 
 submission_params:
-  - default: false
-    value: false
-    type: bool
-    name: analyze_output
+  # Defaults to 1 because phishing campaigns tend to use single-page documents
+  - name: max_pages_rendered
+    type: int
+    value: 1
+    default: 1
+
+heuristics:
+  - heur_id: 1
+    name: OCR Detection found
+    description: Suspicious verbiage found in OCR inspection.
+    score: 0
+    signature_score_map:
+      macros: 100
+      ransomware: 100
+    filetype: "*"
 
 docker_config:
   image: cccs/assemblyline-service-document-preview:$SERVICE_TAG