Avoid creation of sized POINTER types at runtime (#347)

mara004 · web-flow · commit 3731559b99a0 · 2025-03-14T15:31:00.000+01:00
Fixes #346
diff --git a/README.md b/README.md
@@ -246,6 +246,7 @@ Here are some examples of using the support model API.
 * Import the library
   ```python
   import pypdfium2 as pdfium
+  import pypdfium2.raw as pdfium_c
   ```
 
 * Open a PDF using the helper class `PdfDocument` (supports file path strings, bytes, and byte buffers)
@@ -266,6 +267,10 @@ Here are some examples of using the support model API.
   pil_image = bitmap.to_pil()
   pil_image.show()
   ```
+  
+  Note: with the PIL adapter, it may be advantageous to pass `force_bitmap_format=pdfium_c.FPDFBitmap_BGRA, rev_byteorder=True`, or alternatively `prefer_bgrx=True, use_bgra_on_transparency=True, rev_byteorder=True`. This yields a pixel format natively supported by PIL, and avoids rendering with transparency to a non-alpha bitmap, which can slow down pdfium.
+  
+  With `.to_numpy()`, all formats are zero-copy, but passing either `use_bgra_on_transparency=True` (if a dynamic pixel format is acceptable) or `force_bitmap_format=pdfium_c.FPDFBitmap_BGRA` is still recommended to avoid the transparency problem.
 
 * Try some page methods
   ```python
@@ -371,6 +376,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
 [^pdfium_docs]: Unfortunately, no recent HTML-rendered docs are available for PDFium at the moment.
 
 <!-- TODO write something about weakref.finalize(); add example on creating a C page array -->
+<!-- TODO doctests? -->
 
 * In general, PDFium functions can be called just like normal Python functions.
   However, parameters may only be passed positionally, i.e. it is not possible to use keyword arguments.
@@ -478,25 +484,29 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
 
 * Leaving strings, let's suppose you have a C memory buffer allocated by PDFium and wish to read its data.
   PDFium will provide you with a pointer to the first item of the byte array.
-  To access the data, you'll want to re-interpret the pointer with `ctypes.cast()` to encompass the whole array:
+  To access the data, you'll want to re-interpret the pointer as an array view with `.from_address()`:
   ```python
   # (Assuming `bitmap` is an FPDF_BITMAP and `size` is the expected number of bytes in the buffer)
-  buffer_ptr = pdfium_c.FPDFBitmap_GetBuffer(bitmap)
-  buffer_ptr = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_ubyte * size))
+  # FPDFBitmap_GetBuffer() has c_void_p as restype, which ctypes will auto-resolve to int or None
+  buffer_ptrval = pdfium_c.FPDFBitmap_GetBuffer(bitmap)
+  assert buffer_ptrval  # make sure it's non-null
+  # Get an actual pointer object so we can access .contents
+  buffer_ptr = ctypes.cast(buffer_ptrval, ctypes.POINTER(ctypes.c_ubyte))
   # Buffer as ctypes array (referencing the original buffer, will be unavailable as soon as the bitmap is destroyed)
-  c_array = buffer_ptr.contents
+  c_buffer = (ctypes.c_ubyte * size).from_address( ctypes.addressof(buffer_ptr.contents) )
   # Buffer as Python bytes (independent copy)
-  data = bytes(c_array)
+  py_buffer = bytes(c_buffer)
   ```
+  Note that you can achieve the same result with `ctypes.cast(ptr, POINTER(type * size)).contents`, but this is problematic because ctypes appears to cache pointer types indefinitely. As `size` may vary, this can lead to memory-leak-like behavior in long-running applications, so it is better to avoid that approach.
 
 * Writing data from Python into a C buffer works in a similar fashion:
   ```python
   # (Assuming `buffer_ptr` is a pointer to the first item of a C buffer to write into,
   #  `size` the number of bytes it can store, and `py_buffer` a Python byte buffer)
-  buffer_ptr = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_char * size))
+  buffer = (ctypes.c_char * size).from_address( ctypes.addressof(buffer_ptr.contents) )
   # Read from the Python buffer, starting at its current position, directly into the C buffer
   # (until the target is full or the end of the source is reached)
-  n_bytes = py_buffer.readinto(buffer_ptr.contents)  # returns the number of bytes read
+  n_bytes = py_buffer.readinto(buffer)  # returns the number of bytes read
   ```
 
 * If you wish to check whether two objects returned by PDFium are the same, the `is` operator won't help because `ctypes` does not have original object return (OOR), i.e. new, equivalent Python objects are created each time, although they might represent one and the same C object.[^ctypes_no_oor]
@@ -642,13 +652,16 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
   # Render the page
   pdfium_c.FPDF_RenderPageBitmap(*render_args)
   
-  # Get a pointer to the first item of the buffer
-  buffer_ptr = pdfium_c.FPDFBitmap_GetBuffer(bitmap)
-  # Re-interpret the pointer to encompass the whole buffer
-  buffer_ptr = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_ubyte * (width * height * 4)))
+  # Get the value of a pointer to the first item of the buffer
+  buffer_ptrval = pdfium_c.FPDFBitmap_GetBuffer(bitmap)
+  assert buffer_ptrval, "buffer pointer value must be non-null"
+  # Cast the pointer value to an actual pointer object so we can access .contents
+  buffer_ptr = ctypes.cast(buffer_ptrval, ctypes.POINTER(ctypes.c_ubyte))
+  # Re-interpret as array
+  buffer = (ctypes.c_ubyte * (width * height * 4)).from_address(ctypes.addressof(buffer_ptr.contents))
   
   # Create a PIL image from the buffer contents
-  img = PIL.Image.frombuffer("RGBA", (width, height), buffer_ptr.contents, "raw", "BGRA", 0, 1)
+  img = PIL.Image.frombuffer("RGBA", (width, height), buffer, "raw", "BGRA", 0, 1)
   # Save it as file
   img.save("out.png")
   
diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
@@ -33,6 +33,11 @@
 - In `PdfBitmap.new_*()` methods, avoid use of `.from_raw()`, and instead call the constructor directly, as most parameters are already known on the caller side when creating a bitmap.
 - In the rendering CLI, added `--invert-lightness --exclude-images` post-processing options to render with selective lightness inversion. This may be useful to achieve a "dark theme" for light PDFs while preserving different colors, but goes at the cost of performance. (PDFium also provides a color scheme option, but this only allows you to set colors for certain object types, which are then forced on all instances of the type in question. This may flatten different colors into one, leading to a loss of visual information.)
 - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
+- Avoid creation of sized pointer types at runtime, to avoid blowing up Python's unbounded pointer type cache, which could effectively lead to a memory leak in a long-running application (i.e. do `(type * size).from_address(addressof(first_ptr.contents))` instead of `cast(first_ptr, POINTER(type * size)).contents`). In our opinion, the root issue is ctypes using an unlimited cache in the first place. Upstream have already signalled willingness to address this in a future version of Python. Thanks to Richard Hundt for the bug report, {issue}`346`. See below for a list of APIs that were affected:
+  * Anything using `_buffer_reader`/`_buffer_writer` under the hood (`PdfDocument` created from byte stream input, `PdfImage.load_jpeg()`, `PdfDocument.save()`).
+  * `PdfBitmap.from_raw()` / `PdfBitmap._get_buffer()` and their internal callers (`PdfBitmap` makers `new_foreign` and `new_foreign_simple`, `PdfImage.get_bitmap()`).
+  * Also, some Readme snippets were affected, including the raw API rendering example. The Readme has been updated to mention the problem and use `.from_address(...)` instead.
+  * *With older versions, periodically calling `ctypes._reset_cache()` can work around this issue.*
 - Improved startup performance by deferring imports of optional dependencies to the point where they are actually needed, to avoid overhead if you do not use them.
 - Simplified version classes (no API change expected).
 
diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
@@ -80,13 +80,14 @@ def parent(self):  # AutoCloseable hook
     # pypdfium2 extract-images "$DOCPATH" -o out/ --use-bitmap
     
     
-    @classmethod
-    def _get_buffer(cls, raw, stride, height):
+    @staticmethod
+    def _get_buffer(raw, stride, height):
+        # This assumes the pypdfium2-team branch of ctypesgen. With mainline ctypesgen, this might fail.
         buffer_ptr = pdfium_c.FPDFBitmap_GetBuffer(raw)
         if not buffer_ptr:
             raise PdfiumError("Failed to get bitmap buffer (null pointer returned)")
-        buffer = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_ubyte * (stride * height))).contents
-        return buffer
+        buffer_ptr = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_ubyte))
+        return pdfium_i.get_buffer(buffer_ptr, stride*height)
     
     
     @classmethod
@@ -220,7 +221,7 @@ def fill_rect(self, color, left, top, width, height):
     
     def to_numpy(self):
         """
-        Convert the bitmap to a :mod:`numpy` array.
+        Get a :mod:`numpy` array view of the bitmap.
         
         The array contains as many rows as the bitmap is high.
         Each row contains as many pixels as the bitmap is wide.
@@ -249,7 +250,7 @@ def to_numpy(self):
     
     def to_pil(self):
         """
-        Convert the bitmap to a :mod:`PIL` image, using :func:`PIL.Image.frombuffer`.
+        Get a :mod:`PIL` image of the bitmap, using :func:`PIL.Image.frombuffer`.
         
         For ``RGBA``, ``RGBX`` and ``L`` bitmaps, PIL is supposed to share memory with
         the original buffer, so changes to the buffer should be reflected in the image, and vice versa.
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
@@ -224,7 +224,7 @@ def save(self, dest, version=None, flags=pdfium_c.FPDF_NO_INCREMENTAL):
         
         if isinstance(dest, (str, Path)):
             buffer, need_close = open(dest, "wb"), True
-        elif pdfium_i.is_buffer(dest, "w"):
+        elif pdfium_i.is_stream(dest, "w"):
             buffer, need_close = dest, False
         else:
             raise ValueError(f"Cannot save to '{dest}'")
@@ -536,7 +536,7 @@ def _open_pdf(input_data, password, autoclose):
     elif isinstance(input_data, (bytes, ctypes.Array)):
         pdf = pdfium_c.FPDF_LoadMemDocument64(input_data, len(input_data), password)
         to_hold = (input_data, )
-    elif pdfium_i.is_buffer(input_data, "r"):
+    elif pdfium_i.is_stream(input_data, "r"):
         bufaccess, to_hold = pdfium_i.get_bufreader(input_data)
         if autoclose:
             to_close = (input_data, )
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
@@ -231,7 +231,7 @@ def load_jpeg(self, source, pages=None, inline=False, autoclose=True):
         if isinstance(source, (str, Path)):
             buffer = open(source, "rb")
             autoclose = True
-        elif pdfium_i.is_buffer(source, "r"):
+        elif pdfium_i.is_stream(source, "r"):
             buffer = source
         else:
             raise ValueError(f"Cannot load JPEG from {source} - not a file path or byte stream.")
@@ -404,7 +404,7 @@ def extract(self, dest, *args, **kwargs):
         if isinstance(dest, (str, Path)):
             with open(f"{dest}.{format}", "wb") as buf:
                 extraction_gen.send(buf)
-        elif pdfium_i.is_buffer(dest, "w"):
+        elif pdfium_i.is_stream(dest, "w"):
             extraction_gen.send(dest)
         else:
             raise ValueError(f"Cannot extract to '{dest}'")
diff --git a/src/pypdfium2/internal/utils.py b/src/pypdfium2/internal/utils.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2025 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
+import os
 import ctypes
 import pypdfium2.raw as pdfium_c
 
@@ -29,7 +30,7 @@ def set_callback(struct, fname, callback):
     setattr(struct, fname, type( getattr(struct, fname) )(callback))
 
 
-def is_buffer(buf, spec="r"):
+def is_stream(buf, spec="r"):
     methods = []
     assert set(spec).issubset( set("rw") )
     if "r" in spec:
@@ -39,32 +40,38 @@ def is_buffer(buf, spec="r"):
     return all(callable(getattr(buf, a, None)) for a in methods)
 
 
+def get_buffer(ptr, size):
+    obj = ptr.contents
+    return (type(obj) * size).from_address( ctypes.addressof(obj) )
+
+
 class _buffer_reader:
     
-    def __init__(self, buffer):
-        self.buffer = buffer
+    def __init__(self, py_buffer):
+        self.py_buffer = py_buffer
     
     def __call__(self, _, position, p_buf_first, size):
-        p_buf = ctypes.cast(p_buf_first, ctypes.POINTER(ctypes.c_char * size))
-        self.buffer.seek(position)
-        self.buffer.readinto(p_buf.contents)
+        c_buffer = get_buffer(p_buf_first, size)
+        self.py_buffer.seek(position)
+        self.py_buffer.readinto(c_buffer)
         return 1
 
 
 class _buffer_writer:
     
-    def __init__(self, buffer):
-        self.buffer = buffer
+    def __init__(self, py_buffer):
+        self.py_buffer = py_buffer
     
     def __call__(self, _, p_data_first, size):
-        p_data = ctypes.cast(p_data_first, ctypes.POINTER(ctypes.c_ubyte * size))
-        self.buffer.write(p_data.contents)
+        p_data_first = ctypes.cast(p_data_first, ctypes.POINTER(ctypes.c_ubyte))
+        c_buffer = get_buffer(p_data_first, size)
+        self.py_buffer.write(c_buffer)
         return 1
 
 
 def get_bufreader(buffer):
     
-    file_len = buffer.seek(0, 2)
+    file_len = buffer.seek(0, os.SEEK_END)
     buffer.seek(0)
     
     reader = pdfium_c.FPDF_FILEACCESS()