1- """
2- This module contains a scanner for extracting metadata and files from PDF files.
3-
4- Resources:
5- - https://pymupdf.readthedocs.io/en/latest/index.html
6- - https://www.osti.gov/servlets/purl/1030303
7-
8- Requirements:
9- - PyMuPDF
10- """
11-
12-
131import io
142import re
153from collections import Counter
197
208from strelka import strelka
219
22- # Hide PyMuPDF warnings
10+ # Suppress PyMuPDF warnings
2311fitz .TOOLS .mupdf_display_errors (False )
2412
25- # Regex to extract phone numbers from PDF file
26- phone_numbers = re .compile (
27- r"[+]?(?:\d{1,2})?\s?\(?\d{3}\)?[\s.-]\ d{3}[\s.-]\d{2,4}?-?\d{2,4}?" ,
28- flags = 0 ,
13+ # Regular expression for extracting phone numbers from PDFs
14+ PHONE_NUMBERS_REGEX = re .compile (
15+ r"[+]?(?:\d{1,2})?\s?\(?\d{3}\)?[\s.-][\n]?\ d{3}[\s.-][\n]? \d{2,4}?-?\d{2,4}?" ,
16+ flags = re . IGNORECASE ,
2917)
3018
3119
3220class ScanPdf (strelka .Scanner ):
3321 """
34- A scanner that collects metadata and extracts files from PDF files.
22+ Extracts metadata, embedded files, images, and text from PDF files.
23+
24+ This scanner utilizes PyMuPDF to parse PDF files, extracting various types of data,
25+ including metadata, embedded files, images, and textual content. Phone numbers and
26+ URLs within the document are also extracted and reported.
3527 """
3628
3729 @staticmethod
3830 def _convert_timestamp (timestamp ):
3931 """
40- Converts a date string to a DateTime object, sets the timezone to UTC, and returns it as an ISO string.
32+ Converts a PDF timestamp string to an ISO 8601 formatted string.
33+
34+ PDF timestamps are typically in the 'D:%Y%m%d%H%M%S%z' format. This function
35+ converts them to a more standard ISO 8601 format.
4136
4237 Args:
43- timestamp (str) : A date string in the format 'D:%Y%m%d%H%M%S%z' .
38+ timestamp: A string representing the timestamp in PDF format .
4439
4540 Returns:
46- str: An ISO- formatted date string in the format '%Y-%m-%dT%H:%M:%SZ' .
41+ An ISO 8601 timestamp string normalized to UTC ('%Y-%m-%dT%H:%M:%SZ'), or None if conversion fails.
4742 """
48-
4943 try :
50- # Date string is converted to DateTime, timezone is set to UTC, and returned as ISO string
5144 return (
5245 datetime .strptime (timestamp .replace ("'" , "" ), "D:%Y%m%d%H%M%S%z" )
5346 .astimezone (timezone .utc )
5447 .strftime ("%Y-%m-%dT%H:%M:%SZ" )
5548 )
56- except strelka .ScannerTimeout :
57- raise
5849 except Exception :
59- return
50+ return None
6051
6152 def scan (self , data , file , options , expire_at ):
53+ """
54+ Performs the scanning process on the provided data.
55+
56+ The function opens the PDF using PyMuPDF and extracts metadata, embedded files,
57+ images, and text. Phone numbers and URLs are also extracted using regular expressions.
58+
59+ Args:
60+ data: Data of the file to be scanned.
61+ file: The File object associated with the data.
62+ options: Dictionary of scanner-specific options.
63+ expire_at: Expiration time of the scan.
64+ """
6265 # Set maximum XREF objects to be collected (default: 250)
6366 max_objects = options .get ("max_objects" , 250 )
6467
@@ -67,7 +70,7 @@ def scan(self, data, file, options, expire_at):
6770 self .event ["lines" ] = 0
6871 self .event ["links" ] = []
6972 self .event ["words" ] = 0
70- self .event .setdefault ("xref_object" , set ())
73+ self .event .setdefault ("xref_object" , list ())
7174 keys = list ()
7275
7376 try :
@@ -108,19 +111,19 @@ def scan(self, data, file, options, expire_at):
108111 [
109112 re .sub ("[^0-9]" , "" , x )
110113 for x in re .findall (
111- phone_numbers ,
114+ PHONE_NUMBERS_REGEX ,
112115 reader .get_page_text (i ).replace ("\t " , " " ),
113116 )
114117 ]
115118 )
116119 self .event ["phones" ] = list (set (phones ))
117120
118121 # Iterate through xref objects: collect, count, and extract objects
119- self .event ["xref_object" ] = set ()
122+ self .event ["xref_object" ] = list ()
120123 for xref in range (1 , reader .xref_length ()):
121124 xref_object = reader .xref_object (xref , compressed = True )
122125 if xref_object not in self .event ["xref_object" ]:
123- self .event ["xref_object" ].add (xref_object )
126+ self .event ["xref_object" ].append (xref_object )
124127 for obj in options .get ("objects" , []):
125128 pattern = f"/{ obj } "
126129 if pattern in xref_object :
@@ -130,7 +133,9 @@ def scan(self, data, file, options, expire_at):
130133 self .event ["objects" ] = dict (Counter (keys ))
131134
132135 # Finalize xref_object as a de-duplicated list capped at max_objects
133- self .event ["xref_object" ] = list (self .event ["xref_object" ])[:max_objects ]
136+ self .event ["xref_object" ] = list (
137+ set (self .event ["xref_object" ][:max_objects ])
138+ )
134139
135140 # Submit embedded files to strelka
136141 try :
@@ -142,8 +147,8 @@ def scan(self, data, file, options, expire_at):
142147
143148 except strelka .ScannerTimeout :
144149 raise
145- except Exception :
146- self .flags .append ("embedded_parsing_failure " )
150+ except Exception as e :
151+ self .flags .append (f"pdf_embedded_processing_error: { str ( e )[: 50 ] } " )
147152
148153 # Submit extracted images to strelka
149154 try :
@@ -157,8 +162,8 @@ def scan(self, data, file, options, expire_at):
157162
158163 except strelka .ScannerTimeout :
159164 raise
160- except Exception :
161- self .flags .append ("image_parsing_failure " )
165+ except Exception as e :
166+ self .flags .append (f"pdf_image_processing_error: { str ( e )[: 50 ] } " )
162167
163168 # Parse data from each page
164169 try :
@@ -179,9 +184,9 @@ def scan(self, data, file, options, expire_at):
179184
180185 except strelka .ScannerTimeout :
181186 raise
182- except Exception :
183- self .flags .append ("page_parsing_failure " )
187+ except Exception as e :
188+ self .flags .append (f"pdf_page_processing_error: { str ( e )[: 50 ] } " )
184189 except strelka .ScannerTimeout :
185190 raise
186- except Exception :
187- self .flags .append ("pdf_load_error" )
191+ except Exception as e :
192+ self .flags .append (f "pdf_load_error: { str ( e )[: 50 ] } " )
0 commit comments