Skip to content

Commit 3731559

Browse files
authored
Avoid creation of sized POINTER types at runtime (#347)
Fixes #346
1 parent 09e06cc commit 3731559

File tree

6 files changed

+59
-33
lines changed

6 files changed

+59
-33
lines changed

README.md

+25-12
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ Here are some examples of using the support model API.
246246
* Import the library
247247
```python
248248
import pypdfium2 as pdfium
249+
import pypdfium2.raw as pdfium_c
249250
```
250251

251252
* Open a PDF using the helper class `PdfDocument` (supports file path strings, bytes, and byte buffers)
@@ -266,6 +267,10 @@ Here are some examples of using the support model API.
266267
pil_image = bitmap.to_pil()
267268
pil_image.show()
268269
```
270+
271+
Note: with the PIL adapter, it may be advantageous to use `force_bitmap_format=pdfium_c.FPDFBitmap_BGRA, rev_byteorder=True` or alternatively `prefer_bgrx=True, use_bgra_on_transparency=True, rev_byteorder=True`, to obtain a pixel format supported natively by PIL, and to avoid rendering with transparency to a non-alpha bitmap, which can slow down pdfium.
272+
273+
With `.to_numpy()`, all formats are zero-copy, but passing either `use_bgra_on_transparency=True` (if a dynamic pixel format is acceptable) or `force_bitmap_format=pdfium_c.FPDFBitmap_BGRA` is still recommended to avoid the transparency problem.
269274

270275
* Try some page methods
271276
```python
@@ -371,6 +376,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
371376
[^pdfium_docs]: Unfortunately, no recent HTML-rendered docs are available for PDFium at the moment.
372377

373378
<!-- TODO write something about weakref.finalize(); add example on creating a C page array -->
379+
<!-- TODO doctests? -->
374380

375381
* In general, PDFium functions can be called just like normal Python functions.
376382
However, parameters may only be passed positionally, i.e. it is not possible to use keyword arguments.
@@ -478,25 +484,29 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
478484

479485
* Leaving strings, let's suppose you have a C memory buffer allocated by PDFium and wish to read its data.
480486
PDFium will provide you with a pointer to the first item of the byte array.
481-
To access the data, you'll want to re-interpret the pointer with `ctypes.cast()` to encompass the whole array:
487+
To access the data, you'll want to re-interpret the pointer as an array view with `.from_address()`:
482488
```python
483489
# (Assuming `bitmap` is an FPDF_BITMAP and `size` is the expected number of bytes in the buffer)
484-
buffer_ptr = pdfium_c.FPDFBitmap_GetBuffer(bitmap)
485-
buffer_ptr = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_ubyte * size))
490+
# FPDFBitmap_GetBuffer() has c_void_p as restype, which ctypes will auto-resolve to int or None
491+
buffer_ptrval = pdfium_c.FPDFBitmap_GetBuffer(bitmap)
492+
assert buffer_ptrval # make sure it's non-null
493+
# Get an actual pointer object so we can access .contents
494+
buffer_ptr = ctypes.cast(buffer_ptrval, ctypes.POINTER(ctypes.c_ubyte))
486495
# Buffer as ctypes array (referencing the original buffer, will be unavailable as soon as the bitmap is destroyed)
487-
c_array = buffer_ptr.contents
496+
c_buffer = (ctypes.c_ubyte * size).from_address( ctypes.addressof(buffer_ptr.contents) )
488497
# Buffer as Python bytes (independent copy)
489-
data = bytes(c_array)
498+
py_buffer = bytes(c_buffer)
490499
```
500+
Note that you can achieve the same result with `ctypes.cast(ptr, POINTER(type * size)).contents`, but this is somewhat problematic, as ctypes seems to cache pointer types eternally. As `size` may vary, this can lead to memory-leak-like scenarios in long-running applications, so it is better to avoid doing that.
491501

492502
* Writing data from Python into a C buffer works in a similar fashion:
493503
```python
494504
# (Assuming `buffer_ptr` is a pointer to the first item of a C buffer to write into,
495505
# `size` the number of bytes it can store, and `py_buffer` a Python byte buffer)
496-
buffer_ptr = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_char * size))
506+
buffer = (ctypes.c_char * size).from_address( ctypes.addressof(buffer_ptr.contents) )
497507
# Read from the Python buffer, starting at its current position, directly into the C buffer
498508
# (until the target is full or the end of the source is reached)
499-
n_bytes = py_buffer.readinto(buffer_ptr.contents) # returns the number of bytes read
509+
n_bytes = py_buffer.readinto(buffer) # returns the number of bytes read
500510
```
501511

502512
* If you wish to check whether two objects returned by PDFium are the same, the `is` operator won't help because `ctypes` does not have original object return (OOR), i.e. new, equivalent Python objects are created each time, although they might represent one and the same C object.[^ctypes_no_oor]
@@ -642,13 +652,16 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
642652
# Render the page
643653
pdfium_c.FPDF_RenderPageBitmap(*render_args)
644654

645-
# Get a pointer to the first item of the buffer
646-
buffer_ptr = pdfium_c.FPDFBitmap_GetBuffer(bitmap)
647-
# Re-interpret the pointer to encompass the whole buffer
648-
buffer_ptr = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_ubyte * (width * height * 4)))
655+
# Get the value of a pointer to the first item of the buffer
656+
buffer_ptrval = pdfium_c.FPDFBitmap_GetBuffer(bitmap)
657+
assert buffer_ptrval, "buffer pointer value must be non-null"
658+
# Cast the pointer value to an actual pointer object so we can access .contents
659+
buffer_ptr = ctypes.cast(buffer_ptrval, ctypes.POINTER(ctypes.c_ubyte))
660+
# Re-interpret as array
661+
buffer = (ctypes.c_ubyte * (width * height * 4)).from_address(ctypes.addressof(buffer_ptr.contents))
649662

650663
# Create a PIL image from the buffer contents
651-
img = PIL.Image.frombuffer("RGBA", (width, height), buffer_ptr.contents, "raw", "BGRA", 0, 1)
664+
img = PIL.Image.frombuffer("RGBA", (width, height), buffer, "raw", "BGRA", 0, 1)
652665
# Save it as file
653666
img.save("out.png")
654667

docs/devel/changelog_staging.md

+5
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@
3333
- In `PdfBitmap.new_*()` methods, avoid use of `.from_raw()`, and instead call the constructor directly, as most parameters are already known on the caller side when creating a bitmap.
3434
- In the rendering CLI, added `--invert-lightness --exclude-images` post-processing options to render with selective lightness inversion. This may be useful to achieve a "dark theme" for light PDFs while preserving different colors, but comes at the cost of performance. (PDFium also provides a color scheme option, but this only allows you to set colors for certain object types, which are then forced on all instances of the type in question. This may flatten different colors into one, leading to a loss of visual information.)
3535
- Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
36+
- Avoid creation of sized pointer types at runtime, to avoid blowing up Python's unbounded pointer type cache, which could effectively lead to a memory leak in a long-running application (i.e. do `(type * size).from_address(addressof(first_ptr.contents))` instead of `cast(first_ptr, POINTER(type * size)).contents`). In our opinion, the root issue is ctypes using an unlimited cache in the first place. Upstream have already signalled willingness to address this in a future version of Python. Thanks to Richard Hundt for the bug report, {issue}`346`. See below for a list of APIs that were affected:
37+
* Anything using `_buffer_reader`/`_buffer_writer` under the hood (`PdfDocument` created from byte stream input, `PdfImage.load_jpeg()`, `PdfDocument.save()`).
38+
* `PdfBitmap.from_raw()` or, more precisely, `PdfBitmap._get_buffer()`, and their internal callers (`PdfBitmap` makers `new_foreign` and `new_foreign_simple`, `PdfImage.get_bitmap()`).
39+
* Also, some Readme snippets were affected, including the raw API rendering example. The Readme has been updated to mention the problem and use `.from_address(...)` instead.
40+
* *With older versions, periodically calling `ctypes._reset_cache()` can work around this issue.*
3641
- Improved startup performance by deferring imports of optional dependencies to the point where they are actually needed, to avoid overhead if you do not use them.
3742
- Simplified version classes (no API change expected).
3843

src/pypdfium2/_helpers/bitmap.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,14 @@ def parent(self): # AutoCloseable hook
8080
# pypdfium2 extract-images "$DOCPATH" -o out/ --use-bitmap
8181

8282

83-
@classmethod
84-
def _get_buffer(cls, raw, stride, height):
83+
@staticmethod
84+
def _get_buffer(raw, stride, height):
85+
# This assumes the pypdfium2-team branch of ctypesgen. With mainline ctypesgen, this might fail.
8586
buffer_ptr = pdfium_c.FPDFBitmap_GetBuffer(raw)
8687
if not buffer_ptr:
8788
raise PdfiumError("Failed to get bitmap buffer (null pointer returned)")
88-
buffer = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_ubyte * (stride * height))).contents
89-
return buffer
89+
buffer_ptr = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_ubyte))
90+
return pdfium_i.get_buffer(buffer_ptr, stride*height)
9091

9192

9293
@classmethod
@@ -220,7 +221,7 @@ def fill_rect(self, color, left, top, width, height):
220221

221222
def to_numpy(self):
222223
"""
223-
Convert the bitmap to a :mod:`numpy` array.
224+
Get a :mod:`numpy` array view of the bitmap.
224225
225226
The array contains as many rows as the bitmap is high.
226227
Each row contains as many pixels as the bitmap is wide.
@@ -249,7 +250,7 @@ def to_numpy(self):
249250

250251
def to_pil(self):
251252
"""
252-
Convert the bitmap to a :mod:`PIL` image, using :func:`PIL.Image.frombuffer`.
253+
Get a :mod:`PIL` image of the bitmap, using :func:`PIL.Image.frombuffer`.
253254
254255
For ``RGBA``, ``RGBX`` and ``L`` bitmaps, PIL is supposed to share memory with
255256
the original buffer, so changes to the buffer should be reflected in the image, and vice versa.

src/pypdfium2/_helpers/document.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ def save(self, dest, version=None, flags=pdfium_c.FPDF_NO_INCREMENTAL):
224224

225225
if isinstance(dest, (str, Path)):
226226
buffer, need_close = open(dest, "wb"), True
227-
elif pdfium_i.is_buffer(dest, "w"):
227+
elif pdfium_i.is_stream(dest, "w"):
228228
buffer, need_close = dest, False
229229
else:
230230
raise ValueError(f"Cannot save to '{dest}'")
@@ -536,7 +536,7 @@ def _open_pdf(input_data, password, autoclose):
536536
elif isinstance(input_data, (bytes, ctypes.Array)):
537537
pdf = pdfium_c.FPDF_LoadMemDocument64(input_data, len(input_data), password)
538538
to_hold = (input_data, )
539-
elif pdfium_i.is_buffer(input_data, "r"):
539+
elif pdfium_i.is_stream(input_data, "r"):
540540
bufaccess, to_hold = pdfium_i.get_bufreader(input_data)
541541
if autoclose:
542542
to_close = (input_data, )

src/pypdfium2/_helpers/pageobjects.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ def load_jpeg(self, source, pages=None, inline=False, autoclose=True):
231231
if isinstance(source, (str, Path)):
232232
buffer = open(source, "rb")
233233
autoclose = True
234-
elif pdfium_i.is_buffer(source, "r"):
234+
elif pdfium_i.is_stream(source, "r"):
235235
buffer = source
236236
else:
237237
raise ValueError(f"Cannot load JPEG from {source} - not a file path or byte stream.")
@@ -404,7 +404,7 @@ def extract(self, dest, *args, **kwargs):
404404
if isinstance(dest, (str, Path)):
405405
with open(f"{dest}.{format}", "wb") as buf:
406406
extraction_gen.send(buf)
407-
elif pdfium_i.is_buffer(dest, "w"):
407+
elif pdfium_i.is_stream(dest, "w"):
408408
extraction_gen.send(dest)
409409
else:
410410
raise ValueError(f"Cannot extract to '{dest}'")

src/pypdfium2/internal/utils.py

+18-11
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# SPDX-FileCopyrightText: 2025 geisserml <[email protected]>
22
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
33

4+
import os
45
import ctypes
56
import pypdfium2.raw as pdfium_c
67

@@ -29,7 +30,7 @@ def set_callback(struct, fname, callback):
2930
setattr(struct, fname, type( getattr(struct, fname) )(callback))
3031

3132

32-
def is_buffer(buf, spec="r"):
33+
def is_stream(buf, spec="r"):
3334
methods = []
3435
assert set(spec).issubset( set("rw") )
3536
if "r" in spec:
@@ -39,32 +40,38 @@ def is_buffer(buf, spec="r"):
3940
return all(callable(getattr(buf, a, None)) for a in methods)
4041

4142

43+
def get_buffer(ptr, size):
44+
obj = ptr.contents
45+
return (type(obj) * size).from_address( ctypes.addressof(obj) )
46+
47+
4248
class _buffer_reader:
4349

44-
def __init__(self, buffer):
45-
self.buffer = buffer
50+
def __init__(self, py_buffer):
51+
self.py_buffer = py_buffer
4652

4753
def __call__(self, _, position, p_buf_first, size):
48-
p_buf = ctypes.cast(p_buf_first, ctypes.POINTER(ctypes.c_char * size))
49-
self.buffer.seek(position)
50-
self.buffer.readinto(p_buf.contents)
54+
c_buffer = get_buffer(p_buf_first, size)
55+
self.py_buffer.seek(position)
56+
self.py_buffer.readinto(c_buffer)
5157
return 1
5258

5359

5460
class _buffer_writer:
5561

56-
def __init__(self, buffer):
57-
self.buffer = buffer
62+
def __init__(self, py_buffer):
63+
self.py_buffer = py_buffer
5864

5965
def __call__(self, _, p_data_first, size):
60-
p_data = ctypes.cast(p_data_first, ctypes.POINTER(ctypes.c_ubyte * size))
61-
self.buffer.write(p_data.contents)
66+
p_data_first = ctypes.cast(p_data_first, ctypes.POINTER(ctypes.c_ubyte))
67+
c_buffer = get_buffer(p_data_first, size)
68+
self.py_buffer.write(c_buffer)
6269
return 1
6370

6471

6572
def get_bufreader(buffer):
6673

67-
file_len = buffer.seek(0, 2)
74+
file_len = buffer.seek(0, os.SEEK_END)
6875
buffer.seek(0)
6976

7077
reader = pdfium_c.FPDF_FILEACCESS()

0 commit comments

Comments
 (0)