Skip to content

Commit dfb393b

Browse files
committed
Typehints and bugfixes
1 parent ab8cb2c commit dfb393b

File tree

1 file changed

+67
-48
lines changed

1 file changed

+67
-48
lines changed

polyfile/pdfparser.py

Lines changed: 67 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#!/usr/bin/python
22

3+
from typing import Iterator, List, Optional, Union
4+
35
__description__ = 'pdf-parser, use it to parse a PDF document'
46
__author__ = 'Didier Stevens'
57
__version__ = '0.7.1'
@@ -74,8 +76,6 @@
7476

7577
from .logger import getStatusLogger
7678

77-
log = getStatusLogger("PDF")
78-
7979
import re
8080
import optparse
8181
import zlib
@@ -100,6 +100,8 @@
100100
except:
101101
pass
102102

103+
log = getStatusLogger("PDF")
104+
103105
CHAR_WHITESPACE = 1
104106
CHAR_DELIMITER = 2
105107
CHAR_REGULAR = 3
@@ -118,59 +120,66 @@
118120

119121
dumplinelength = 16
120122

121-
#Convert 2 Bytes If Python 3
123+
124+
# Convert 2 Bytes If Python 3
122125
def C2BIP3(string):
    """Convert 2 Bytes If Python 3.

    Under Python 3 every character of *string* is mapped to the byte with
    the same ordinal value and the result is returned as ``bytes``.  Under
    Python 2 the argument is handed back untouched.
    """
    if sys.version_info[0] <= 2:
        return string
    ordinals = [ord(character) for character in string]
    return bytes(ordinals)
127130

128-
#Convert 2 String If Python 3
131+
132+
# Convert 2 String If Python 3
129133
def C2SIP3(bytes):
    """Convert 2 String If Python 3.

    Under Python 3 each item of *bytes* is turned into the character with
    the same ordinal value and the characters are joined into one ``str``.
    Under Python 2 the argument is handed back untouched.
    """
    if sys.version_info[0] <= 2:
        return bytes
    return ''.join(map(chr, bytes))
134138

139+
135140
# CIC: Call If Callable
136141
def CIC(expression):
    """Call If Callable.

    Return the result of calling *expression* (with no arguments) when it is
    callable; otherwise return *expression* itself unchanged.
    """
    return expression() if callable(expression) else expression
141146

147+
142148
# IFF: IF Function
143149
def IFF(expression, valueTrue, valueFalse):
    """IF Function: pick one of two values based on *expression*.

    Only the selected value is passed through CIC, so when the values are
    callables only the chosen one is actually invoked.
    """
    chosen = valueTrue if expression else valueFalse
    return CIC(chosen)
148154

155+
149156
def Timestamp(epoch=None):
    """Format a point in time as ``YYYYMMDD-HHMMSS`` in local time.

    :param epoch: seconds since the epoch; ``None`` (the default) means
        the current time.
    :return: the formatted timestamp string.
    """
    # time.localtime() documents None as "use the current time", so no
    # separate `== None` branch is needed to handle the default.
    localTime = time.localtime(epoch)
    # The first six struct_time fields are year, month, day, hour, minute, second.
    return '%04d%02d%02d-%02d%02d%02d' % localTime[0:6]
155162

163+
156164
def CopyWithoutWhiteSpace(content):
    """Return a new list holding the tokens of *content* that are not whitespace.

    A token counts as whitespace when its first item equals CHAR_WHITESPACE.
    The input sequence is not modified and token order is preserved.
    """
    return [entry for entry in content if entry[0] != CHAR_WHITESPACE]
162170

171+
163172
def Obj2Str(content):
    """Concatenate the non-whitespace tokens of *content* into one string.

    For each remaining token the ``repr`` of its second item is taken and
    the first and last characters of that ``repr`` are dropped before joining.
    """
    significant = CopyWithoutWhiteSpace(content)
    fragments = (repr(entry[1])[1:-1] for entry in significant)
    return ''.join(fragments)
165174

166175

167176
class ByteOffset:
168-
def __init__(self, offset, lineno):
177+
def __init__(self, offset: Union[int, "ByteOffset"], lineno: int):
    """Record a byte position together with a line number.

    :param offset: a plain integer offset, or another ByteOffset, in which
        case only its integer ``offset`` is copied (its own ``lineno`` is
        not used).
    :param lineno: line number to associate with this position.
    """
    if isinstance(offset, ByteOffset):
        # Unwrap so that self.offset always holds the plain value, never a
        # nested ByteOffset.
        self.offset: int = offset.offset
    else:
        self.offset = offset
    self.lineno: int = lineno
174183

175184
def __sub__(self, other):
176185
return ByteOffset(self.offset - other, self.lineno)
@@ -218,11 +227,11 @@ def __init__(self, file):
218227
except:
219228
log.error(f'Error opening file {file}: {sys.exec_info()[1]}')
220229
sys.exit()
221-
self.ungetted = []
222-
self.position = -1
223-
self.lineno = 1
230+
self.ungetted: List[PDFByte] = []
231+
self.position: int = -1
232+
self.lineno: int = 1
224233

225-
def byte(self):
234+
def byte(self) -> Optional["PDFByte"]:
226235
if len(self.ungetted) != 0:
227236
self.position += 1
228237
return self.ungetted.pop()
@@ -236,28 +245,30 @@ def byte(self):
236245
self.lineno += 1
237246
return ret
238247

239-
def unget(self, byte):
248+
def unget(self, byte: "PDFByte"):
    """Push *byte* back so that the next call to ``byte()`` returns it again.

    :param byte: the PDFByte most recently obtained from ``byte()``.

    The position counter is stepped back by one.  ``lineno`` is left alone
    on purpose: ``byte()`` hands back ungetted bytes without advancing
    ``lineno`` again, so decrementing it here would leave the counter one
    line short after the byte is re-read.
    """
    assert isinstance(byte, PDFByte)
    self.position -= 1
    # NOTE(review): the previous check `byte.byte == b'\n'` compared an int
    # with a bytes object, which is never equal in Python 3, so the lineno
    # decrement it guarded could not run.  The dead branch is dropped to
    # keep the effective behaviour explicit; if line tracking across
    # unget/byte round-trips is wanted, both methods must change together.
    self.ungetted.append(byte)
245254

255+
246256
def CharacterClass(byte):
    """Classify a byte value as whitespace, delimiter or regular.

    Whitespace codes: NUL, TAB, LF, FF, CR and space.
    Delimiter codes: ( ) < > [ ] { } / %
    Anything else is reported as CHAR_REGULAR.
    """
    whitespace_codes = (0, 9, 10, 12, 13, 32)
    delimiter_codes = (0x28, 0x29, 0x3C, 0x3E, 0x5B, 0x5D, 0x7B, 0x7D, 0x2F, 0x25)
    # `byte` stays on the left-hand side of ==, so any custom equality
    # defined by the argument's type is used.
    if any(byte == code for code in whitespace_codes):
        return CHAR_WHITESPACE
    if any(byte == code for code in delimiter_codes):
        return CHAR_DELIMITER
    return CHAR_REGULAR
252262

263+
253264
def IsNumeric(str):
    """Match a leading run of decimal digits in *str*.

    Returns the match object for the leading digits, or ``None`` when *str*
    does not start with a digit.  Only the start of the string is examined;
    trailing non-digit characters do not prevent a match.
    """
    leading_digits = re.compile('^[0-9]+')
    return leading_digits.match(str)
255266

256267

257268
class PDFByte:
258-
def __init__(self, byte, offset):
259-
self.byte = byte
260-
self.offset = offset
269+
def __init__(self, byte: int, offset: ByteOffset):
270+
self.byte: int = byte
271+
self.offset: ByteOffset = offset
261272

262273
def chr(self):
263274
return chr(self.byte)
@@ -282,18 +293,18 @@ def __repr__(self):
282293

283294

284295
class PDFToken:
285-
def __init__(self, token_type, token, offset):
286-
self.token_type = token_type
287-
self.token = token
288-
self.offset = offset
296+
def __init__(self, token_type: int, token: str, offset: ByteOffset):
297+
self.token_type: int = token_type
298+
self.token: str = token
299+
self.offset: ByteOffset = offset
289300

290-
def __iter__(self):
301+
def __iter__(self) -> Iterator[Union[int, str]]:
291302
return iter((self.token_type, self.token))
292303

293304
def __len__(self):
294305
return 2
295306

296-
def __getitem__(self, key):
307+
def __getitem__(self, key: int) -> Union[int, str]:
297308
if key == 0:
298309
return self.token_type
299310
elif key == 1:
@@ -321,25 +332,27 @@ def __repr__(self):
321332

322333
class cPDFTokenizer:
323334
def __init__(self, file):
324-
self.oPDF = cPDFDocument(file)
325-
self.ungetted = []
335+
self.oPDF: Optional[cPDFDocument] = cPDFDocument(file)
336+
self.ungetted: List[PDFToken] = []
337+
self.byte: Optional[PDFByte] = None
338+
self.token: Optional[str] = None
326339

327-
def Token(self):
340+
def Token(self) -> Optional[PDFToken]:
328341
if len(self.ungetted) != 0:
329342
return self.ungetted.pop()
330-
if self.oPDF == None:
343+
if self.oPDF is None:
331344
return None
332345
self.byte = self.oPDF.byte()
333-
if self.byte == None:
346+
if self.byte is None:
334347
self.oPDF = None
335348
return None
336349
elif CharacterClass(self.byte) == CHAR_WHITESPACE:
337350
first_offset = self.byte.offset
338351
file_str = StringIO()
339-
while self.byte != None and CharacterClass(self.byte) == CHAR_WHITESPACE:
352+
while self.byte is not None and CharacterClass(self.byte) == CHAR_WHITESPACE:
340353
file_str.write(self.byte.chr())
341354
self.byte = self.oPDF.byte()
342-
if self.byte != None:
355+
if self.byte is not None:
343356
self.oPDF.unget(self.byte)
344357
else:
345358
self.oPDF = None
@@ -348,10 +361,10 @@ def Token(self):
348361
elif CharacterClass(self.byte) == CHAR_REGULAR:
349362
file_str = StringIO()
350363
token_offset = self.byte.offset
351-
while self.byte != None and CharacterClass(self.byte) == CHAR_REGULAR:
364+
while self.byte is not None and CharacterClass(self.byte) == CHAR_REGULAR:
352365
file_str.write(self.byte.chr())
353366
self.byte = self.oPDF.byte()
354-
if self.byte != None:
367+
if self.byte is not None:
355368
self.oPDF.unget(self.byte)
356369
else:
357370
self.oPDF = None
@@ -375,13 +388,13 @@ def Token(self):
375388
elif self.byte == 0x25:
376389
file_str = StringIO()
377390
token_offset = self.byte.offset
378-
while self.byte != None:
391+
while self.byte is not None:
379392
file_str.write(self.byte.chr())
380393
if self.byte == 10 or self.byte == 13:
381394
self.byte = self.oPDF.byte()
382395
break
383396
self.byte = self.oPDF.byte()
384-
if self.byte != None:
397+
if self.byte is not None:
385398
if self.byte == 10:
386399
file_str.write(self.byte.chr())
387400
else:
@@ -392,29 +405,30 @@ def Token(self):
392405
return PDFToken(CHAR_DELIMITER, self.token, token_offset)
393406
return PDFToken(CHAR_DELIMITER, self.byte.chr(), self.byte.offset)
394407

395-
def TokenIgnoreWhiteSpace(self):
408+
def TokenIgnoreWhiteSpace(self) -> Optional[PDFToken]:
    """Return the next token that is not whitespace.

    Tokens are pulled from ``Token()`` until one is found whose first item
    is not CHAR_WHITESPACE.  ``None`` is returned when ``Token()`` runs out
    of tokens first.
    """
    while True:
        candidate = self.Token()
        if candidate is None:
            return None
        if candidate[0] == CHAR_WHITESPACE:
            continue
        return candidate
400413

401-
def Tokens(self):
414+
def Tokens(self) -> List[PDFToken]:
    """Collect every remaining token into a list.

    ``Token()`` is called repeatedly until it returns ``None``; the tokens
    are returned in the order they were produced.
    """
    collected = []
    while True:
        current = self.Token()
        if current is None:
            return collected
        collected.append(current)
408421

409-
def unget(self, byte):
410-
self.ungetted.append(byte)
422+
def unget(self, token: PDFToken):
423+
self.ungetted.append(token)
424+
411425

412426
class cPDFParser:
413-
def __init__(self, file, verbose=False, extract=None, objstm=None):
414-
self.context = CONTEXT_NONE
415-
self.content = []
416-
self.oPDFTokenizer = cPDFTokenizer(file)
417-
self.verbose = verbose
427+
def __init__(self, file, verbose: bool = False, extract=None, objstm=None):
428+
self.context: int = CONTEXT_NONE
429+
self.content: List[PDFToken] = []
430+
self.oPDFTokenizer: cPDFTokenizer = cPDFTokenizer(file)
431+
self.verbose: bool = verbose
418432
self.extract = extract
419433
self.objstm = objstm
420434

@@ -435,8 +449,9 @@ def GetObject(self):
435449
self.token2 = self.oPDFTokenizer.Token()
436450
if self.token2[0] == CHAR_REGULAR:
437451
if self.context != CONTEXT_NONE:
438-
#self.content.append((CHAR_DELIMITER, self.token[1] + self.token2[1]))
439-
self.content.append(PDFToken(CHAR_DELIMITER, self.token[1] + self.token2[1], self.token.offset))
452+
# self.content.append((CHAR_DELIMITER, self.token[1] + self.token2[1]))
453+
self.content.append(PDFToken(CHAR_DELIMITER, self.token[1] + self.token2[1],
454+
self.token.offset))
440455
elif self.verbose:
441456
log.warn('todo 1: %s' % (self.token[1] + self.token2[1]))
442457
else:
@@ -536,6 +551,7 @@ def GetObject(self):
536551
else:
537552
break
538553

554+
539555
class cPDFElementComment:
540556
def __init__(self, comment, offset=None):
541557
self.type = PDF_ELEMENT_COMMENT
@@ -552,6 +568,7 @@ def __init__(self, content, offset):
552568
self.content = content
553569
self.offset = offset
554570

571+
555572
class cPDFElementTrailer:
556573
def __init__(self, content):
557574
self.type = PDF_ELEMENT_TRAILER
@@ -566,12 +583,14 @@ def Contains(self, keyword):
566583
data += Canonicalize(self.content[i][1])
567584
return data.upper().find(keyword.upper()) != -1
568585

586+
569587
def IIf(expr, truepart, falsepart):
    """Inline if: return *truepart* when *expr* is truthy, else *falsepart*.

    Both alternatives are plain values and are returned as-is, without
    being called.
    """
    return truepart if expr else falsepart
574592

593+
575594
class cPDFElementIndirectObject:
576595
def __init__(self, id, version, content, objtokens, objstm=None):
577596
self.type = PDF_ELEMENT_INDIRECT_OBJECT
@@ -580,7 +599,7 @@ def __init__(self, id, version, content, objtokens, objstm=None):
580599
self.content = content
581600
self.objstm = objstm
582601
self.objtokens = objtokens
583-
#fix stream for Ghostscript bug reported by Kurt
602+
# fix stream for Ghostscript bug reported by Kurt
584603
if self.ContainsStream():
585604
position = len(self.content) - 1
586605
if position < 0:
@@ -909,7 +928,7 @@ def ParseDictionary(self, tokens):
909928
elif value != [] and value[0][1] == '(' and tokens[0][1] != ')':
910929
if tokens[0][1][0] == '%':
911930
tokens = [tokens[0]] + cPDFTokenizer(StringIO(tokens[0][1][1:])).Tokens() + tokens[1:]
912-
value.append('%')
931+
value.append(PDFToken(CHAR_REGULAR, '%', tokens[0].offset))
913932
else:
914933
value.append(tokens[0])
915934

0 commit comments

Comments
 (0)