BoboTiG · jholveck · Jun 1, 2026
diff --git a/docs/source/release-history/v11.0.0.md b/docs/source/release-history/v11.0.0.md
@@ -33,6 +33,19 @@ Improved error handling when interacting with Win32 API, which will improve diag
 
 Device contexts are now acquired and released within each `grab()` call, allowing monitor enumeration to work even when `GetWindowDC(0)` fails (#509).
 
+### Zero-Copy Screenshot Buffers (GNU/Linux, Python 3.12+)
+
+MSS now supports zero-copy screenshot buffers on GNU/Linux when running under Python 3.12 or later.  Screenshot data can
+be exposed directly from operating system buffers without first being copied into a Python-owned buffer.
+
+This removes an additional memory copy from the screenshot path and is enabled automatically with no application changes
+required.
+
+In a benchmark capturing 3840×2160 screenshots as quickly as possible while forcing all pixel data to be read,
+processing time decreased from 22.64 ms to 18.59 ms per frame (approximately 18% less time per frame).
+
+Support for additional operating systems is planned.
+
 ### General Improvements
 
 The MSS context object will now always surface inner exceptions, even if `__exit__` may also generate an exception during tear-down.

diff --git a/docs/source/usage.rst b/docs/source/usage.rst
@@ -52,6 +52,21 @@ This is a much better usage, memory efficient::
 
 Also, it is a good thing to save the MSS instance inside an attribute of your class and calling it when needed.
 
+Direct Screenshot Buffers
+=========================
+
+On supported platforms, MSS can expose screenshot data directly from operating system buffers instead of copying it into
+a separate Python-owned buffer. This reduces memory copying and can improve performance when processing screenshots with
+libraries that support the Python buffer protocol, such as NumPy and OpenCV.
+
+This optimization is enabled automatically and does not require any changes to application code.
+
+Requirements:
+
+- Python 3.12 or later
+- GNU/Linux
+
+Support for additional operating systems is planned.
 
 Multithreading
 ==============

diff --git a/pyproject.toml b/pyproject.toml
@@ -85,8 +85,8 @@ docs = [
   "sphinx-new-tab-link==0.8.1 ; python_version >= '3.12'",
 ]
 tests = [
-  "numpy==2.4.3 ; sys_platform == 'linux' and python_version == '3.13'",
-  "pillow==12.2.0 ; sys_platform == 'linux' and python_version == '3.13'",
+  "numpy==2.4.3 ; python_version >= '3.12'",
+  "pillow==12.2.0 ; python_version >= '3.12'",
   "pytest==9.0.3",
   "pytest-cov==7.1.0",
   "pytest-rerunfailures==16.3",

diff --git a/src/mss/base.py b/src/mss/base.py
@@ -18,7 +18,7 @@
     from collections.abc import Callable, Iterator
     from types import TracebackType
 
-    from typing_extensions import Self
+    from typing_extensions import Buffer, Self
 
     from mss.models import Monitor, Monitors, Size
 
@@ -89,7 +89,7 @@ def cursor(self) -> ScreenShot | None:
         """Retrieve all cursor data. Pixels have to be RGB."""
 
     @abstractmethod
-    def grab(self, monitor: Monitor, /) -> bytearray | tuple[bytearray, Size]:
+    def grab(self, monitor: Monitor, /) -> Buffer | tuple[Buffer, Size]:
         """Retrieve all pixels from a monitor. Pixels have to be RGB.
 
         If the monitor size is not in pixel units, include a Size in

diff --git a/src/mss/buffer.py b/src/mss/buffer.py
@@ -0,0 +1,250 @@
+"""Buffers with Finalizers
+
+This is an implementation of buffer objects with Python finalizers,
+specific to the needs of MSS.
+
+# Caller Contract
+
+The entry point is `finalizing_buffer`.  This is intended to be called
+by `MSSImplementation` subclasses.  They provide a buffer (such as a
+ctypes array or mmap object) and a finalizer, and are given a
+`memoryview` object.  Once the memoryview is garbage collected, and
+the consumers downstream of that memoryview have released their views
+of the buffer, the finalizer will be invoked (with no arguments).
+
+At that time, the `MSSImplementation` may release the buffer, return
+it to a pool for reuse, etc.
+
+This finalizer may be called at any time, from any thread.  It may be
+called after the MSSImplementation's `close()` method has been called.
+Implementations must take care not to invalidate their buffers during
+`close()`, but rather only after finalization.
+
+The finalizer may also be called before `finalizing_buffer()` returns.
+This happens when `finalizing_buffer()` has to make a copy rather than
+using the originally-provided buffer (which is the case on Python
+versions prior to 3.12).
+
+(Some more caveats appear at the end of this docstring.)
+
+# Background
+
+The Python buffer protocol lets different objects share underlying
+memory.  For instance, a NumPy ndarray, a Python bytearray object, and
+a PyTorch Tensor object can all share the same underlying memory.
+This allows interoperability between these systems without requiring
+copies.
+
+Copying ("blitting") all the pixels in a screenshot takes time;
+copying a 4K (3840x2160) BGRA image can take several milliseconds.  If
+an application is attempting to operate at 60 FPS, each copy consumes
+a meaningful fraction of the frame budget.
+
+For a high-performance screenshot library such as MSS, it is therefore
+important to minimize copies.  Ideally, screenshot data would remain
+in the buffer originally allocated by the operating system, such as
+the memory returned by CreateDIBSection on Windows or a shared memory
+segment on X11.  This approach is commonly called "zero-copy".
+
+Getting the buffer to the user is only half the problem.  MSS also
+needs to know when the user is finished with the buffer's contents so
+that the underlying resources can be reused or released.
+
+Most code that uses the buffer protocol is written in C.  Since Python
+3.0, the C-level buffer protocol has provided a mechanism for
+exporters to learn when their buffers are no longer in use.  However,
+the corresponding Python-level API (which lets an exporter written in
+Python serve those C consumers) was not added until Python 3.12.
+
+Buffer lifetime is not the same as Python object lifetime.  A user may
+pass the returned memoryview to NumPy, PIL, PyTorch, or other
+libraries.  Those libraries may keep the exported buffer alive after
+the original Python memoryview object is no longer reachable.
+
+Therefore, the lifetime of the returned memoryview object is not a
+reliable signal that the buffer is no longer in use.  Other objects
+may still hold references to the buffer after that memoryview has been
+destroyed.  To know when the buffer can safely be reused or released,
+MSS relies on the buffer protocol's release mechanism.
+
+The buffer protocol permits a wide variety of consumer behaviors and
+derived-buffer relationships.  Rather than attempting to model all of
+those interactions directly, this implementation delegates that
+complexity to Python's existing buffer-management machinery.
+
+## Performance note
+
+As a rough reference, copying a 3840x2160 BGRA screenshot on
+contemporary hardware (Amazon EC2 m8i.large, Intel Xeon 6, DDR5-7200)
+takes approximately 2.5 ms. At 60 FPS, that is about 15% of the
+available frame time for a single copy.  These numbers are intended
+only to provide intuition about the cost of copies; actual performance
+varies substantially by hardware and memory subsystem.
+
+# Design
+
+The central design decision in this file is that MSS interacts with
+exactly one downstream buffer consumer: a memoryview.
+
+A memoryview is Python's standard object for representing a buffer.
+It already implements the reference tracking, buffer export, slicing,
+and format-conversion behavior required by the buffer protocol.
+
+Notably, memoryview objects do not pass buffer requests upstream to
+arbitrary exporters.  Once a memoryview has been created, it manages
+downstream consumers itself.
+
+This means MSS only needs to reason about a single interaction: the
+interaction between `_FinalizingBufferIntermediate` and the memoryview
+created from it.
+
+One idea that has been proposed is to attach a weakref finalizer
+directly to a memoryview object and use that as the signal that the
+buffer is no longer in use.  Testing has shown that this is not
+sufficient.  A memoryview Python object may be finalized while
+downstream consumers still hold active references to the underlying
+buffer.
+
+To obtain a correct signal, MSS uses the Python-side buffer protocol
+introduced in Python 3.12 via the `__buffer__` and
+`__release_buffer__` methods.
+
+An instance of `_FinalizingBufferIntermediate` is created and exactly
+one memoryview is constructed from it.  That memoryview is returned to
+the caller.
+
+The memoryview tracks all downstream users of the buffer.  When all of
+those users have released their references, the memoryview
+automatically invokes `_FinalizingBufferIntermediate.__release_buffer__`.
+
+That method invokes the caller-provided finalizer, which can release
+or recycle the underlying storage.
+
+If this implementation appears more indirect than necessary, that
+indirection is intentional.  It narrows the portion of the buffer
+protocol that MSS must reason about and test.
+
+# Caveats and Invariants
+
+* The finalizer may run after `MSSImplementation.close()` has been
+  called.  `close()` must not free, reuse, or otherwise invalidate
+  buffers that may still be visible to users.
+
+* The finalizer may run at any time and on any thread.  Finalizer code
+  must therefore be thread-safe and must not assume that it executes
+  on the thread that created the buffer.
+
+* On Python versions prior to 3.12, `finalizing_buffer()` creates a
+  copy of the data and invokes the finalizer immediately.  In this
+  case, the finalizer may run before `finalizing_buffer()` returns.
+
+* `_FinalizingBufferIntermediate` intentionally supports exactly one
+  buffer request.  This restriction simplifies reasoning about
+  correctness and should not be removed without carefully considering
+  the resulting buffer-lifetime semantics.
+
+* `_FinalizingBufferIntermediate` remains reachable through
+  `memoryview.obj`.  Consumers must treat this as an implementation
+  detail and must not invoke `__buffer__()` or `__release_buffer__()`
+  directly.
+
+* Finalizer execution during interpreter shutdown is not guaranteed.
+  Implementations should not rely on finalizers running during process
+  termination.
+"""
+
+from __future__ import annotations
+
+import sys
+from threading import Lock
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from typing_extensions import Buffer
+
+# This module, and finalizing_buffer, can be used on every supported Python version.  This flag exists only so that
+# things like test code or optimizations can be made conditional; most code should follow the same path regardless.
+FAST_PATH_AVAILABLE = sys.version_info >= (3, 12)
+
+
+class _FinalizingBufferIntermediate:
+    """Single-use buffer exporter that runs a finalizer once released.
+
+    Unlike a general buffer-protocol exporter, this class serves only
+    a single buffer request.  That restriction keeps both the
+    implementation and the reasoning about buffer lifetime simple.
+
+    The creator must provide a finalizer to ensure that resources are
+    properly released when the underlying buffer is no longer needed.
+    It is invoked, with no arguments, from `__release_buffer__`, after
+    all the downstream users, such as NumPy or PIL, have released
+    their references to the buffer.
+
+    This is only useful on Python 3.12 and later; earlier versions do
+    not support the `__buffer__` and `__release_buffer__` methods.
+
+    This class should only be used by the `finalizing_buffer`
+    function.  It is not appropriate for other uses!
+    """
+
+    def __init__(self, data: Buffer, finalizer: Callable) -> None:
+        self._mv: memoryview | None = memoryview(data)
+        self._finalizer = finalizer
+        # The remaining attributes are sanity guards that shouldn't be necessary: given the __buffer__ contract and the
+        # implementation of finalizing_buffer, __buffer__ and __release_buffer__ should each be called exactly once,
+        # and never simultaneously.  The guards are asserts, so they are not checked at all under `python -O`.
+        self._buffer_invoked = False
+        self._release_invoked = False
+        self._lock = Lock()
+
+    def __buffer__(self, _flags: int) -> memoryview:
+        with self._lock:
+            assert not self._buffer_invoked, "Buffer can only be requested once"  # noqa: S101
+            self._buffer_invoked = True
+        assert self._mv is not None, "Buffer has already been released"  # noqa: S101
+        return self._mv
+
+    def __release_buffer__(self, _buffer: memoryview) -> None:
+        with self._lock:
+            assert not self._release_invoked, "Buffer can only be released once"  # noqa: S101
+            self._release_invoked = True
+        assert self._mv is not None, "Buffer has already been released"  # noqa: S101
+        # Release our own memoryview of the data before invoking the finalizer; otherwise the underlying buffer object
+        # would still see an exported buffer while the finalizer runs.  (mmap, for instance, won't close a region
+        # with exported buffers.)
+        self._mv.release()
+        self._mv = None  # Extra-defensive: nothing should use the view after release
+        self._finalizer()
+
+
+def finalizing_buffer(data: Buffer, finalizer: Callable) -> memoryview:
+    """Wrap `data` in a memoryview that reports when it is unused.
+
+    The finalizer is invoked, with no arguments, once the buffer is no
+    longer in use, with a caveat: only uses downstream of the returned
+    memoryview are tracked.  If the input buffer is also used in other
+    places, those are not accounted for.
+
+    On Python 3.12 and later, the returned memoryview exposes `data`
+    itself, without copying.  On earlier versions, it exposes a copy
+    of the data, and the finalizer is invoked immediately after the
+    copy is made, before this function returns.
+
+    This preserves read/write semantics of the original data: if the
+    original buffer is read-only, the returned memoryview will be
+    read-only.
+    """
+    if FAST_PATH_AVAILABLE:
+        # Fast path: no copy; the intermediate object invokes the finalizer from its __release_buffer__ method.
+        return memoryview(_FinalizingBufferIntermediate(data, finalizer))
+    # Slow path: __release_buffer__ is unavailable before Python 3.12, so copy the data instead.
+    with memoryview(data) as mv:
+        # Viewing the original data through a memoryview tells us whether it is read-only.  We can't return this
+        # memoryview, since we're about to invoke the finalizer to release the buffer it got its data from.
+        copied_data = bytes(mv) if mv.readonly else bytearray(mv)
+    finalizer()
+    # Returning copied_data directly would also be a perfectly fine buffer, but always returning a memoryview keeps
+    # the return type consistent across both paths.
+    return memoryview(copied_data)
diff --git a/src/mss/linux/base.py b/src/mss/linux/base.py
@@ -453,6 +453,10 @@ def _grab_xgetimage(self, monitor: Monitor, /) -> bytearray:
         # Now, save the image.  This is a reference into the img_reply structure.
         img_data_arr = xcb.get_image_data(img_reply)
         # Copy this into a new bytearray, so that it will persist after we clear the image structure.
+        #
+        # We might be able to hold onto img_reply in a finalizing_buffer finalizer, so that we can use the image data
+        # without copying.  That would be more efficient, but it would be a bit more complex, and presently the
+        # XGetImage implementation is already a slow and less-common path.
         img_data = bytearray(img_data_arr)
 
         if img_reply.depth != self.drawable_depth or img_reply.visual != self.drawable_visual_id: