Skip to content

Commit 417c662

Browse files
authored
Fix path traversal and deserialization vulnerabilities in cmap code (#179)
* fix(security): prevent path traversal in cmap loading
* test(security): test fix to path traversal
* fix(tests): wtf, hatch
* fix(security): remove useless and dangerous CMAP_PATH envvar
1 parent ca09229 commit 417c662

File tree

4 files changed

+24
-22
lines changed

4 files changed

+24
-22
lines changed

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,4 @@ jobs:
3131
- name: Install Hatch
3232
uses: pypa/hatch@install
3333
- name: Run tests
34-
run: hatch test --cover -py 3.8 --full-trace
34+
run: hatch test -py 3.8 --full-trace

playa/cmapdb.py

Lines changed: 16 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,11 @@
1111

1212
import gzip
1313
import logging
14-
import os
15-
import os.path
1614
import pickle as pickle
1715
import struct
1816
import sys
1917
from bisect import bisect_left
18+
from pathlib import Path
2019
from typing import (
2120
Any,
2221
Dict,
@@ -48,6 +47,7 @@ def cache(func): # type: ignore
4847

4948

5049
log = logging.getLogger(__name__)
50+
CMAP_DIR = (Path(__file__).parent / "cmap").resolve()
5151

5252

5353
class CMapError(Exception):
@@ -175,21 +175,21 @@ def dump(self, out: TextIO = sys.stdout) -> None:
175175

176176

177177
class PyCMap(CMap):
178-
def __init__(self, name: str, module: Any) -> None:
178+
def __init__(self, name: str, data: Dict) -> None:
179179
super().__init__(CMapName=name)
180-
self.code2cid = module.CODE2CID
181-
if module.IS_VERTICAL:
180+
self.code2cid = data["CODE2CID"]
181+
if data["IS_VERTICAL"]:
182182
self.attrs["WMode"] = 1
183183

184184

185185
class PyUnicodeMap(UnicodeMap):
186-
def __init__(self, name: str, module: Any, vertical: bool) -> None:
186+
def __init__(self, name: str, data: Dict, vertical: bool) -> None:
187187
super().__init__(CMapName=name)
188188
if vertical:
189-
self.cid2unichr = module.CID2UNICHR_V
189+
self.cid2unichr = data["CID2UNICHR_V"]
190190
self.attrs["WMode"] = 1
191191
else:
192-
self.cid2unichr = module.CID2UNICHR_H
192+
self.cid2unichr = data["CID2UNICHR_H"]
193193

194194

195195
class CMapDB:
@@ -200,19 +200,14 @@ class CMapDB:
200200
def _load_data(cls, name: str) -> Any:
201201
name = name.replace("\0", "")
202202
filename = "%s.pickle.gz" % name
203-
cmap_paths = (
204-
os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
205-
os.path.join(os.path.dirname(__file__), "cmap"),
206-
)
207-
for directory in cmap_paths:
208-
path = os.path.join(directory, filename)
209-
if os.path.exists(path):
210-
gzfile = gzip.open(path)
211-
try:
212-
return type(str(name), (), pickle.loads(gzfile.read()))
213-
finally:
214-
gzfile.close()
215-
raise KeyError(f"CMap {name!r} not found in CMapDB")
203+
pklpath = (CMAP_DIR / filename).resolve()
204+
if not pklpath.is_relative_to(CMAP_DIR):
205+
raise KeyError(f"Ignoring malicious or malformed CMap {name}")
206+
try:
207+
with gzip.open(pklpath) as gzfile:
208+
return pickle.load(gzfile)
209+
except FileNotFoundError as e:
210+
raise KeyError(f"CMap {name} not found in CMapDB") from e
216211

217212
@classmethod
218213
def get_cmap(cls, name: str) -> CMapBase:

samples/evil_cmap.pdf

3.65 KB
Binary file not shown.

tests/test_cmapdb.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,3 +120,10 @@ def test_various_tounicode(name, text):
120120
with playa.open(TESTDIR / "simple3.pdf") as pdf:
121121
text = "".join(x.chars for x in pdf.pages[0].texts)
122122
assert text == "HelloHelloあいうえおあいうえおWorldWorldあいうえおあいうえお"
123+
124+
125+
def test_cmap_sanitization(caplog):
126+
"""Verify that an evil PDF cannot read outside the cmap directory."""
127+
with playa.open(TESTDIR / "evil_cmap.pdf") as pdf:
128+
pdf.pages[0].extract_text()
129+
assert "malicious" in caplog.text

0 commit comments

Comments
 (0)