11#!/usr/bin/python
22
3+ from typing import Iterator , List , Optional , Union
4+
35__description__ = 'pdf-parser, use it to parse a PDF document'
46__author__ = 'Didier Stevens'
57__version__ = '0.7.1'
7476
7577from .logger import getStatusLogger
7678
77- log = getStatusLogger ("PDF" )
78-
7979import re
8080import optparse
8181import zlib
100100except :
101101 pass
102102
103+ log = getStatusLogger ("PDF" )
104+
103105CHAR_WHITESPACE = 1
104106CHAR_DELIMITER = 2
105107CHAR_REGULAR = 3
118120
119121dumplinelength = 16
120122
121- #Convert 2 Bytes If Python 3
123+
124+ # Convert 2 Bytes If Python 3
def C2BIP3(string):
    """Convert 2 Bytes If Python 3.

    On Python 3 every character of *string* is replaced by its code point
    and the resulting integers are packed into a ``bytes`` object. On
    Python 2 the argument is handed back untouched.
    """
    if sys.version_info[0] <= 2:
        return string
    return bytes(map(ord, string))
127130
128- #Convert 2 String If Python 3
131+
132+ # Convert 2 String If Python 3
def C2SIP3(bytes):
    """Convert 2 String If Python 3.

    On Python 3 each integer obtained by iterating *bytes* is turned into
    the character with that code point and the characters are joined into
    one ``str``. On Python 2 the argument is handed back untouched.
    """
    if sys.version_info[0] <= 2:
        return bytes
    return ''.join(map(chr, bytes))
134138
139+
135140# CIC: Call If Callable
def CIC(expression):
    """Call If Callable.

    Returns the result of calling *expression* with no arguments when it is
    callable; otherwise returns *expression* itself.
    """
    return expression() if callable(expression) else expression
141146
147+
142148# IFF: IF Function
def IFF(expression, valueTrue, valueFalse):
    """IF Function: pick one of two values by the truthiness of *expression*.

    The picked value is passed through CIC, so a callable is invoked (with no
    arguments) and its result returned. The value that is not picked is never
    called.
    """
    selected = valueTrue if expression else valueFalse
    return CIC(selected)
148154
155+
def Timestamp(epoch=None):
    """Format a point in time as ``YYYYMMDD-HHMMSS`` in local time.

    Args:
        epoch: Seconds since the epoch, or None to use the current time.

    Returns:
        A 15-character string built from the year, month, day, hour, minute
        and second fields of the local time.
    """
    # time.localtime() treats None as "now", so no explicit None branch (and
    # no equality comparison against None) is needed.
    localTime = time.localtime(epoch)
    return '%04d%02d%02d-%02d%02d%02d' % localTime[0:6]
155162
163+
def CopyWithoutWhiteSpace(content):
    """Return a new list holding every token of *content* that is not whitespace.

    A token counts as whitespace when its first item equals CHAR_WHITESPACE.
    The input sequence itself is left unmodified.
    """
    return [token for token in content if token[0] != CHAR_WHITESPACE]
162170
171+
def Obj2Str(content):
    """Concatenate the text of all non-whitespace tokens in *content*.

    Each token's second item is rendered with repr() and the first and last
    characters of that rendering (the surrounding quotes, for a string) are
    stripped before joining.
    """
    pieces = []
    for token in CopyWithoutWhiteSpace(content):
        rendered = repr(token[1])
        pieces.append(rendered[1:-1])
    return ''.join(pieces)
165174
166175
167176class ByteOffset :
168- def __init__ (self , offset , lineno ):
177+ def __init__ (self , offset : int , lineno : int ):
169178 if isinstance (offset , ByteOffset ):
170- self .offset = offset .offset
179+ self .offset : int = offset .offset
171180 else :
172181 self .offset = offset
173- self .lineno = lineno
182+ self .lineno : int = lineno
174183
175184 def __sub__ (self , other ):
176185 return ByteOffset (self .offset - other , self .lineno )
@@ -218,11 +227,11 @@ def __init__(self, file):
218227 except :
219228 log .error (f'Error opening file { file } : { sys .exec_info ()[1 ]} ' )
220229 sys .exit ()
221- self .ungetted = []
222- self .position = - 1
223- self .lineno = 1
230+ self .ungetted : List [ PDFByte ] = []
231+ self .position : int = - 1
232+ self .lineno : int = 1
224233
225- def byte (self ):
234+ def byte (self ) -> Optional [ "PDFByte" ] :
226235 if len (self .ungetted ) != 0 :
227236 self .position += 1
228237 return self .ungetted .pop ()
@@ -236,28 +245,30 @@ def byte(self):
236245 self .lineno += 1
237246 return ret
238247
239- def unget (self , byte ):
248+ def unget (self , byte : "PDFByte" ):
240249 assert isinstance (byte , PDFByte )
241250 self .position -= 1
242- if byte == b'\n ' :
251+ if byte . byte == b'\n ' :
243252 self .lineno -= 1
244253 self .ungetted .append (byte )
245254
255+
def CharacterClass(byte):
    """Classify a byte value as whitespace, delimiter or regular character.

    Returns CHAR_WHITESPACE, CHAR_DELIMITER or CHAR_REGULAR. The argument is
    compared with ``==`` against each code in turn, so any object that
    compares equal to an integer is accepted.
    """
    # NUL, TAB, LF, FF, CR, SPACE
    whitespace_codes = (0, 9, 10, 12, 13, 32)
    # ( ) < > [ ] { } / %
    delimiter_codes = (0x28, 0x29, 0x3C, 0x3E, 0x5B, 0x5D, 0x7B, 0x7D, 0x2F, 0x25)
    if any(byte == code for code in whitespace_codes):
        return CHAR_WHITESPACE
    if any(byte == code for code in delimiter_codes):
        return CHAR_DELIMITER
    return CHAR_REGULAR
252262
263+
def IsNumeric(str):
    """Match a leading run of ASCII digits in *str*.

    Returns the regex match object when *str* starts with at least one digit
    and None otherwise. Only the start is checked; trailing non-digit
    characters do not prevent a match.
    """
    leading_digits = re.compile('^[0-9]+')
    return leading_digits.match(str)
255266
256267
257268class PDFByte :
258- def __init__ (self , byte , offset ):
259- self .byte = byte
260- self .offset = offset
269+ def __init__ (self , byte : int , offset : ByteOffset ):
270+ self .byte : int = byte
271+ self .offset : ByteOffset = offset
261272
262273 def chr (self ):
263274 return chr (self .byte )
@@ -282,18 +293,18 @@ def __repr__(self):
282293
283294
284295class PDFToken :
285- def __init__ (self , token_type , token , offset ):
286- self .token_type = token_type
287- self .token = token
288- self .offset = offset
296+ def __init__ (self , token_type : int , token : str , offset : ByteOffset ):
297+ self .token_type : int = token_type
298+ self .token : str = token
299+ self .offset : ByteOffset = offset
289300
290- def __iter__ (self ):
301+ def __iter__ (self ) -> Iterator [ Union [ int , str ]] :
291302 return iter ((self .token_type , self .token ))
292303
293304 def __len__ (self ):
294305 return 2
295306
296- def __getitem__ (self , key ) :
307+ def __getitem__ (self , key : int ) -> Union [ int , str ] :
297308 if key == 0 :
298309 return self .token_type
299310 elif key == 1 :
@@ -321,25 +332,27 @@ def __repr__(self):
321332
322333class cPDFTokenizer :
323334 def __init__ (self , file ):
324- self .oPDF = cPDFDocument (file )
325- self .ungetted = []
335+ self .oPDF : Optional [cPDFDocument ] = cPDFDocument (file )
336+ self .ungetted : List [PDFToken ] = []
337+ self .byte : Optional [PDFByte ] = None
338+ self .token : Optional [str ] = None
326339
327- def Token (self ):
340+ def Token (self ) -> Optional [ PDFToken ] :
328341 if len (self .ungetted ) != 0 :
329342 return self .ungetted .pop ()
330- if self .oPDF == None :
343+ if self .oPDF is None :
331344 return None
332345 self .byte = self .oPDF .byte ()
333- if self .byte == None :
346+ if self .byte is None :
334347 self .oPDF = None
335348 return None
336349 elif CharacterClass (self .byte ) == CHAR_WHITESPACE :
337350 first_offset = self .byte .offset
338351 file_str = StringIO ()
339- while self .byte != None and CharacterClass (self .byte ) == CHAR_WHITESPACE :
352+ while self .byte is not None and CharacterClass (self .byte ) == CHAR_WHITESPACE :
340353 file_str .write (self .byte .chr ())
341354 self .byte = self .oPDF .byte ()
342- if self .byte != None :
355+ if self .byte is not None :
343356 self .oPDF .unget (self .byte )
344357 else :
345358 self .oPDF = None
@@ -348,10 +361,10 @@ def Token(self):
348361 elif CharacterClass (self .byte ) == CHAR_REGULAR :
349362 file_str = StringIO ()
350363 token_offset = self .byte .offset
351- while self .byte != None and CharacterClass (self .byte ) == CHAR_REGULAR :
364+ while self .byte is not None and CharacterClass (self .byte ) == CHAR_REGULAR :
352365 file_str .write (self .byte .chr ())
353366 self .byte = self .oPDF .byte ()
354- if self .byte != None :
367+ if self .byte is not None :
355368 self .oPDF .unget (self .byte )
356369 else :
357370 self .oPDF = None
@@ -375,13 +388,13 @@ def Token(self):
375388 elif self .byte == 0x25 :
376389 file_str = StringIO ()
377390 token_offset = self .byte .offset
378- while self .byte != None :
391+ while self .byte is not None :
379392 file_str .write (self .byte .chr ())
380393 if self .byte == 10 or self .byte == 13 :
381394 self .byte = self .oPDF .byte ()
382395 break
383396 self .byte = self .oPDF .byte ()
384- if self .byte != None :
397+ if self .byte is not None :
385398 if self .byte == 10 :
386399 file_str .write (self .byte .chr ())
387400 else :
@@ -392,29 +405,30 @@ def Token(self):
392405 return PDFToken (CHAR_DELIMITER , self .token , token_offset )
393406 return PDFToken (CHAR_DELIMITER , self .byte .chr (), self .byte .offset )
394407
395- def TokenIgnoreWhiteSpace (self ):
408+ def TokenIgnoreWhiteSpace (self ) -> Optional [ PDFToken ] :
396409 token = self .Token ()
397- while token != None and token [0 ] == CHAR_WHITESPACE :
410+ while token is not None and token [0 ] == CHAR_WHITESPACE :
398411 token = self .Token ()
399412 return token
400413
401- def Tokens (self ):
414+ def Tokens (self ) -> List [ PDFToken ] :
402415 tokens = []
403416 token = self .Token ()
404- while token != None :
417+ while token is not None :
405418 tokens .append (token )
406419 token = self .Token ()
407420 return tokens
408421
409- def unget (self , byte ):
410- self .ungetted .append (byte )
422+ def unget (self , token : PDFToken ):
423+ self .ungetted .append (token )
424+
411425
412426class cPDFParser :
413- def __init__ (self , file , verbose = False , extract = None , objstm = None ):
414- self .context = CONTEXT_NONE
415- self .content = []
416- self .oPDFTokenizer = cPDFTokenizer (file )
417- self .verbose = verbose
427+ def __init__ (self , file , verbose : bool = False , extract = None , objstm = None ):
428+ self .context : int = CONTEXT_NONE
429+ self .content : List [ PDFToken ] = []
430+ self .oPDFTokenizer : cPDFTokenizer = cPDFTokenizer (file )
431+ self .verbose : bool = verbose
418432 self .extract = extract
419433 self .objstm = objstm
420434
@@ -435,8 +449,9 @@ def GetObject(self):
435449 self .token2 = self .oPDFTokenizer .Token ()
436450 if self .token2 [0 ] == CHAR_REGULAR :
437451 if self .context != CONTEXT_NONE :
438- #self.content.append((CHAR_DELIMITER, self.token[1] + self.token2[1]))
439- self .content .append (PDFToken (CHAR_DELIMITER , self .token [1 ] + self .token2 [1 ], self .token .offset ))
452+ # self.content.append((CHAR_DELIMITER, self.token[1] + self.token2[1]))
453+ self .content .append (PDFToken (CHAR_DELIMITER , self .token [1 ] + self .token2 [1 ],
454+ self .token .offset ))
440455 elif self .verbose :
441456 log .warn ('todo 1: %s' % (self .token [1 ] + self .token2 [1 ]))
442457 else :
@@ -536,6 +551,7 @@ def GetObject(self):
536551 else :
537552 break
538553
554+
539555class cPDFElementComment :
540556 def __init__ (self , comment , offset = None ):
541557 self .type = PDF_ELEMENT_COMMENT
@@ -552,6 +568,7 @@ def __init__(self, content, offset):
552568 self .content = content
553569 self .offset = offset
554570
571+
555572class cPDFElementTrailer :
556573 def __init__ (self , content ):
557574 self .type = PDF_ELEMENT_TRAILER
@@ -566,12 +583,14 @@ def Contains(self, keyword):
566583 data += Canonicalize (self .content [i ][1 ])
567584 return data .upper ().find (keyword .upper ()) != - 1
568585
586+
def IIf(expr, truepart, falsepart):
    """Return *truepart* when *expr* is truthy, otherwise *falsepart*.

    Both candidate values are returned as given; neither is called.
    """
    return truepart if expr else falsepart
574592
593+
575594class cPDFElementIndirectObject :
576595 def __init__ (self , id , version , content , objtokens , objstm = None ):
577596 self .type = PDF_ELEMENT_INDIRECT_OBJECT
@@ -580,7 +599,7 @@ def __init__(self, id, version, content, objtokens, objstm=None):
580599 self .content = content
581600 self .objstm = objstm
582601 self .objtokens = objtokens
583- #fix stream for Ghostscript bug reported by Kurt
602+ # fix stream for Ghostscript bug reported by Kurt
584603 if self .ContainsStream ():
585604 position = len (self .content ) - 1
586605 if position < 0 :
@@ -909,7 +928,7 @@ def ParseDictionary(self, tokens):
909928 elif value != [] and value [0 ][1 ] == '(' and tokens [0 ][1 ] != ')' :
910929 if tokens [0 ][1 ][0 ] == '%' :
911930 tokens = [tokens [0 ]] + cPDFTokenizer (StringIO (tokens [0 ][1 ][1 :])).Tokens () + tokens [1 :]
912- value .append ('%' )
931+ value .append (PDFToken ( CHAR_REGULAR , '%' , tokens [ 0 ]. offset ) )
913932 else :
914933 value .append (tokens [0 ])
915934
0 commit comments