Skip to content

Commit 37ef3c4

Browse files
committed
Merge branch 'master' of https://github.com/target/strelka
2 parents 58eba27 + e2cd87a commit 37ef3c4

File tree

1 file changed

+44
-39
lines changed

1 file changed

+44
-39
lines changed

src/python/strelka/scanners/scan_pdf.py

Lines changed: 44 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,3 @@
1-
"""
2-
This module contains a scanner for extracting metadata and files from PDF files.
3-
4-
Resources:
5-
- https://pymupdf.readthedocs.io/en/latest/index.html
6-
- https://www.osti.gov/servlets/purl/1030303
7-
8-
Requirements:
9-
- PyMuPDF
10-
"""
11-
12-
131
import io
142
import re
153
from collections import Counter
@@ -19,46 +7,61 @@
197

208
from strelka import strelka
219

22-
# Hide PyMuPDF warnings
10+
# Suppress PyMuPDF warnings
2311
fitz.TOOLS.mupdf_display_errors(False)
2412

25-
# Regex to extract phone numbers from PDF file
26-
phone_numbers = re.compile(
27-
r"[+]?(?:\d{1,2})?\s?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{2,4}?-?\d{2,4}?",
28-
flags=0,
13+
# Regular expression for extracting phone numbers from PDFs
14+
PHONE_NUMBERS_REGEX = re.compile(
15+
r"[+]?(?:\d{1,2})?\s?\(?\d{3}\)?[\s.-][\n]?\d{3}[\s.-][\n]?\d{2,4}?-?\d{2,4}?",
16+
flags=re.IGNORECASE,
2917
)
3018

3119

3220
class ScanPdf(strelka.Scanner):
3321
"""
34-
A scanner that collects metadata and extracts files from PDF files.
22+
Extracts metadata, embedded files, images, and text from PDF files.
23+
24+
This scanner utilizes PyMuPDF to parse PDF files, extracting various types of data,
25+
including metadata, embedded files, images, and textual content. Phone numbers and
26+
URLs within the document are also extracted and reported.
3527
"""
3628

3729
@staticmethod
3830
def _convert_timestamp(timestamp):
3931
"""
40-
Converts a date string to a DateTime object, sets the timezone to UTC, and returns it as an ISO string.
32+
Converts a PDF timestamp string to an ISO 8601 formatted string.
33+
34+
PDF timestamps are typically in the 'D:%Y%m%d%H%M%S%z' format. This function
35+
converts them to UTC and formats them as ISO 8601 ('%Y-%m-%dT%H:%M:%SZ').
4136
4237
Args:
43-
timestamp (str): A date string in the format 'D:%Y%m%d%H%M%S%z'.
38+
timestamp: A string representing the timestamp in PDF format.
4439
4540
Returns:
46-
str: An ISO-formatted date string in the format '%Y-%m-%dT%H:%M:%SZ'.
41+
An ISO 8601 formatted timestamp string, or None if conversion fails.
4742
"""
48-
4943
try:
50-
# Date string is converted to DateTime, timezone is set to UTC, and returned as ISO string
5144
return (
5245
datetime.strptime(timestamp.replace("'", ""), "D:%Y%m%d%H%M%S%z")
5346
.astimezone(timezone.utc)
5447
.strftime("%Y-%m-%dT%H:%M:%SZ")
5548
)
56-
except strelka.ScannerTimeout:
57-
raise
5849
except Exception:
59-
return
50+
return None
6051

6152
def scan(self, data, file, options, expire_at):
53+
"""
54+
Performs the scanning process on the provided data.
55+
56+
The function opens the PDF using PyMuPDF and extracts metadata, embedded files,
57+
images, and text. Phone numbers and URLs are also extracted using regular expressions.
58+
59+
Args:
60+
data: Data of the file to be scanned.
61+
file: The File object associated with the data.
62+
options: Dictionary of scanner-specific options.
63+
expire_at: Expiration time of the scan.
64+
"""
6265
# Set maximum XREF objects to be collected (default: 250)
6366
max_objects = options.get("max_objects", 250)
6467

@@ -67,7 +70,7 @@ def scan(self, data, file, options, expire_at):
6770
self.event["lines"] = 0
6871
self.event["links"] = []
6972
self.event["words"] = 0
70-
self.event.setdefault("xref_object", set())
73+
self.event.setdefault("xref_object", list())
7174
keys = list()
7275

7376
try:
@@ -108,19 +111,19 @@ def scan(self, data, file, options, expire_at):
108111
[
109112
re.sub("[^0-9]", "", x)
110113
for x in re.findall(
111-
phone_numbers,
114+
PHONE_NUMBERS_REGEX,
112115
reader.get_page_text(i).replace("\t", " "),
113116
)
114117
]
115118
)
116119
self.event["phones"] = list(set(phones))
117120

118121
# Iterate through xref objects: collect, count, and extract objects
119-
self.event["xref_object"] = set()
122+
self.event["xref_object"] = list()
120123
for xref in range(1, reader.xref_length()):
121124
xref_object = reader.xref_object(xref, compressed=True)
122125
if xref_object not in self.event["xref_object"]:
123-
self.event["xref_object"].add(xref_object)
126+
self.event["xref_object"].append(xref_object)
124127
for obj in options.get("objects", []):
125128
pattern = f"/{obj}"
126129
if pattern in xref_object:
@@ -130,7 +133,9 @@ def scan(self, data, file, options, expire_at):
130133
self.event["objects"] = dict(Counter(keys))
131134

132135
# Keep at most max_objects unique xref objects, stored as a list
133-
self.event["xref_object"] = list(self.event["xref_object"])[:max_objects]
136+
self.event["xref_object"] = list(
137+
set(self.event["xref_object"][:max_objects])
138+
)
134139

135140
# Submit embedded files to strelka
136141
try:
@@ -142,8 +147,8 @@ def scan(self, data, file, options, expire_at):
142147

143148
except strelka.ScannerTimeout:
144149
raise
145-
except Exception:
146-
self.flags.append("embedded_parsing_failure")
150+
except Exception as e:
151+
self.flags.append(f"pdf_embedded_processing_error: {str(e)[:50]}")
147152

148153
# Submit extracted images to strelka
149154
try:
@@ -157,8 +162,8 @@ def scan(self, data, file, options, expire_at):
157162

158163
except strelka.ScannerTimeout:
159164
raise
160-
except Exception:
161-
self.flags.append("image_parsing_failure")
165+
except Exception as e:
166+
self.flags.append(f"pdf_image_processing_error: {str(e)[:50]}")
162167

163168
# Parse data from each page
164169
try:
@@ -179,9 +184,9 @@ def scan(self, data, file, options, expire_at):
179184

180185
except strelka.ScannerTimeout:
181186
raise
182-
except Exception:
183-
self.flags.append("page_parsing_failure")
187+
except Exception as e:
188+
self.flags.append(f"pdf_page_processing_error: {str(e)[:50]}")
184189
except strelka.ScannerTimeout:
185190
raise
186-
except Exception:
187-
self.flags.append("pdf_load_error")
191+
except Exception as e:
192+
self.flags.append(f"pdf_load_error: {str(e)[:50]}")

0 commit comments

Comments
 (0)