From 2746f2136b82e877e105f64dfc299d520641c625 Mon Sep 17 00:00:00 2001
From: WyattBlue
Date: Sat, 13 Jun 2026 20:14:36 -0400
Subject: [PATCH 1/2] Support DLPack export for any CPU VideoPlane

VideoPlane.__dlpack__() only handled nv12/p010le/p016le and raised
NotImplementedError otherwise, so common CPU formats (packed RGB, planar
YUV, gray, ...) needed a NumPy intermediate to export.

Add a generic CPU path that describes the plane from its pixel-format
descriptor: single-component planes export as 2D (H, W), planes that
interleave components as 3D (H, W, C). This reproduces the existing
nv12/p010le/p016le layouts exactly. Bitstream, palette, Bayer, big-endian
16-bit, and sub-byte packings are rejected; CUDA export is unchanged.

Fixes #2217
---
 av/video/plane.py    | 70 +++++++++++++++++++++++++++++++++++++++++++-
 tests/test_dlpack.py | 43 +++++++++++++++++++++++++--
 2 files changed, 109 insertions(+), 4 deletions(-)

diff --git a/av/video/plane.py b/av/video/plane.py
index 15fe70a94..c6d3c095c 100644
--- a/av/video/plane.py
+++ b/av/video/plane.py
@@ -186,8 +186,76 @@ def __dlpack__(self, *, stream: int | None = None):
                 st2 = 1
             else:
                 raise ValueError("invalid plane index for P010/P016")
-        else:
+        elif device_type != kCPU:
             raise NotImplementedError("unsupported sw_format for DLPack export")
+        else:
+            # Generic CPU export. Describe the plane straight from its
+            # pixel-format descriptor: planes holding a single component (the
+            # Y/U/V planes of planar YUV, gray, ...) become 2D (H, W), while
+            # planes that interleave several components (packed RGB, the chroma
+            # plane of NV12, ...) become 3D (H, W, C).
+            desc: cython.pointer[lib.AVPixFmtDescriptor] = lib.av_pix_fmt_desc_get(
+                sw_fmt
+            )
+            if desc == cython.NULL:
+                raise NotImplementedError("unknown pixel format for DLPack export")
+            if desc.flags & (
+                lib.AV_PIX_FMT_FLAG_BITSTREAM
+                | lib.AV_PIX_FMT_FLAG_PAL
+                | lib.AV_PIX_FMT_FLAG_BAYER
+            ):
+                raise NotImplementedError(
+                    "bitstream, palette, and Bayer formats are not supported for "
+                    "DLPack export"
+                )
+
+            step_bytes: cython.int = 0
+            ncomp: cython.int = 0
+            i: cython.int
+            for i in range(desc.nb_components):
+                if desc.comp[i].plane != self.index:
+                    continue
+                if ncomp == 0:
+                    step_bytes = desc.comp[i].step
+                elif cython.cast(cython.int, desc.comp[i].step) != step_bytes:
+                    raise NotImplementedError(
+                        "mixed component step is not supported for DLPack export"
+                    )
+                ncomp += 1
+
+            if ncomp == 0:
+                raise ValueError(f"plane {self.index} has no components")
+            if step_bytes % ncomp:
+                raise NotImplementedError(
+                    "unsupported component packing for DLPack export"
+                )
+            itemsize = step_bytes // ncomp
+            if itemsize != 1 and itemsize != 2:
+                raise NotImplementedError(
+                    "only 8- and 16-bit components are supported for DLPack export"
+                )
+            if itemsize == 2 and desc.flags & lib.AV_PIX_FMT_FLAG_BE:
+                raise NotImplementedError(
+                    "big-endian formats are not supported for DLPack export"
+                )
+            bits = itemsize * 8
+            if line_size % itemsize:
+                raise ValueError("linesize is not aligned to dtype")
+
+            if ncomp == 1:
+                ndim = 2
+                s0 = self.height
+                s1 = self.width
+                st0 = line_size // itemsize
+                st1 = 1
+            else:
+                ndim = 3
+                s0 = self.height
+                s1 = self.width
+                s2 = ncomp
+                st0 = line_size // itemsize
+                st1 = ncomp
+                st2 = 1
 
         frame_ref: cython.pointer[lib.AVFrame] = lib.av_frame_alloc()
         if frame_ref == cython.NULL:
diff --git a/tests/test_dlpack.py b/tests/test_dlpack.py
index e017affd2..003e696f5 100644
--- a/tests/test_dlpack.py
+++ b/tests/test_dlpack.py
@@ -235,13 +235,50 @@ def test_video_plane_dlpack_export_keeps_frame_alive_after_gc() -> None:
     assertNdarraysEqual(y_dl, expected)
 
 
-def test_video_plane_dlpack_unsupported_format_raises() -> None:
-    rgb = numpy.zeros((16, 16, 3), dtype=numpy.uint8)
+def test_video_plane_dlpack_export_packed_rgb_cpu() -> None:
+    # Packed formats interleave several components in one plane and export as
+    # a 3D (H, W, C) tensor (issue #2217).
+    rgb = (numpy.arange(16 * 24 * 3) % 251).astype(numpy.uint8).reshape(16, 24, 3)
     frame = VideoFrame.from_ndarray(rgb, format="rgb24")
+    plane = frame.planes[0]
+    assert plane.__dlpack_device__() == (1, 0)
+
+    arr = numpy.from_dlpack(plane)
+    assert arr.shape == (16, 24, 3)
+    assert arr.strides == (plane.line_size, 3, 1)
+    assert arr.dtype == numpy.uint8
+    assertNdarraysEqual(arr, rgb)
+
+
+def test_video_plane_dlpack_export_planar_yuv_cpu() -> None:
+    # Planar formats expose each single-component plane as a 2D (H, W) tensor
+    # (issue #2217).
+    frame = VideoFrame(16, 16, "yuv420p")
+    for index, (h, w) in enumerate([(16, 16), (8, 8), (8, 8)]):
+        plane = frame.planes[index]
+        assert plane.__dlpack_device__() == (1, 0)
+        arr = numpy.from_dlpack(plane)
+        assert arr.shape == (h, w)
+        assert arr.strides == (plane.line_size, 1)
+        assert arr.dtype == numpy.uint8
+
+
+def test_video_plane_dlpack_export_planar_yuv16_cpu() -> None:
+    # 16-bit planar formats export as uint16.
+    frame = VideoFrame(16, 16, "yuv420p10le")
+    arr = numpy.from_dlpack(frame.planes[0])
+    assert arr.shape == (16, 16)
+    assert arr.dtype == numpy.uint16
+
+
+def test_video_plane_dlpack_unsupported_format_raises() -> None:
+    # Palette formats still cannot be exported.
+    frame = VideoFrame(16, 16, "pal8")
     assert frame.planes[0].__dlpack_device__() == (1, 0)
 
     with pytest.raises(
-        NotImplementedError, match="unsupported sw_format for DLPack export"
+        NotImplementedError,
+        match="bitstream, palette, and Bayer formats are not supported",
     ):
         frame.planes[0].__dlpack__()
 

From 684bdbb5e42953cfdead14ab9e902d5ed03966b3 Mon Sep 17 00:00:00 2001
From: WyattBlue
Date: Sat, 13 Jun 2026 20:22:11 -0400
Subject: [PATCH 2/2] Support DLPack import for planar formats like yuv420p

VideoFrame.from_dlpack() still only accepted the 2-plane nv12 family, so a
decoded yuv420p frame could not be rebuilt from the plane tensors the
previous commit can now export.

Add a zero-copy CPU path for planar formats whose planes each hold a single
component: yuv420p/yuv422p/yuv444p, gray, gbrp, and their 16-bit
little-endian variants. Plane count, per-plane shape, and chroma
subsampling come from the pixel-format descriptor. Tensors whose row
stride is smaller than the plane width (zero-stride broadcasts, negative
strides) are rejected instead of producing a bogus linesize.

Continues #2217
---
 av/video/frame.py    | 175 +++++++++++++++++++++++++++++++++++++++++--
 av/video/plane.py    |   4 +-
 tests/test_dlpack.py | 101 +++++++++++++++++++++++++--
 3 files changed, 267 insertions(+), 13 deletions(-)

diff --git a/av/video/frame.py b/av/video/frame.py
index e530f9026..78b97a550 100644
--- a/av/video/frame.py
+++ b/av/video/frame.py
@@ -1459,18 +1459,20 @@ def from_dlpack(
         if not isinstance(planes, (tuple, list)):
             planes = (planes,)
 
-        if len(planes) != 2:
-            raise ValueError(
-                "from_dlpack currently supports 2-plane formats only (nv12/p010le/p016le)"
-            )
-
         sw_fmt: lib.AVPixelFormat = get_pix_fmt(format)
         nv12 = get_pix_fmt(b"nv12")
         p010le = get_pix_fmt(b"p010le")
         p016le = get_pix_fmt(b"p016le")
 
         if sw_fmt not in (nv12, p010le, p016le):
-            raise NotImplementedError("from_dlpack supports nv12, p010le, p016le only")
+            return VideoFrame._from_dlpack_planar(
+                planes, sw_fmt, format, width, height, stream, device_id
+            )
+
+        if len(planes) != 2:
+            raise ValueError(
+                "from_dlpack currently supports 2-plane formats only (nv12/p010le/p016le)"
+            )
 
         expected_bits = 8 if sw_fmt == nv12 else 16
         itemsize = 1 if expected_bits == 8 else 2
@@ -1656,3 +1658,164 @@ def from_dlpack(
             if m1 != cython.NULL:
                 m1.deleter(m1)
             raise
+
+    @staticmethod
+    def _from_dlpack_planar(planes, sw_fmt, format, width, height, stream, device_id):
+        # CPU-only import for planar formats whose planes each hold a single
+        # component: yuv420p/yuv422p/yuv444p, gray, gbrp, and their 16-bit
+        # little-endian variants. nv12/p010le/p016le keep their dedicated
+        # 2-plane path in from_dlpack().
+        #
+        # Each imported tensor is wrapped in an AVBufferRef whose free callback
+        # is _dlpack_avbuffer_free; once av_buffer_create() succeeds for a plane
+        # `m` is reset to NULL so the error path no longer calls its deleter.
+        desc: cython.pointer[cython.const[lib.AVPixFmtDescriptor]] = (
+            lib.av_pix_fmt_desc_get(sw_fmt)
+        )
+        if desc == cython.NULL:
+            raise NotImplementedError(f"unknown pixel format {format!r}")
+        if desc.flags & (
+            lib.AV_PIX_FMT_FLAG_BITSTREAM
+            | lib.AV_PIX_FMT_FLAG_PAL
+            | lib.AV_PIX_FMT_FLAG_BAYER
+        ):
+            raise NotImplementedError(
+                f"from_dlpack does not support bitstream, palette, or Bayer "
+                f"formats ({format!r})"
+            )
+
+        i: cython.int
+        nb_planes: cython.int = 0
+        for i in range(desc.nb_components):
+            if cython.cast(cython.int, desc.comp[i].plane) + 1 > nb_planes:
+                nb_planes = desc.comp[i].plane + 1
+
+        p: cython.int
+        count: cython.int
+        comp_of_plane = [0] * nb_planes
+        for p in range(nb_planes):
+            count = 0
+            for i in range(desc.nb_components):
+                if cython.cast(cython.int, desc.comp[i].plane) == p:
+                    comp_of_plane[p] = i
+                    count += 1
+            if count != 1:
+                raise NotImplementedError(
+                    "from_dlpack supports nv12/p010le/p016le and planar "
+                    f"single-component formats; {format!r} is not supported"
+                )
+
+        if len(planes) != nb_planes:
+            raise ValueError(
+                f"{format!r} requires {nb_planes} plane(s), got {len(planes)}"
+            )
+
+        itemsize: cython.int = desc.comp[0].step
+        if itemsize != 1 and itemsize != 2:
+            raise NotImplementedError(
+                "only 8- and 16-bit components are supported for DLPack import"
+            )
+        if itemsize == 2 and desc.flags & lib.AV_PIX_FMT_FLAG_BE:
+            raise NotImplementedError(
+                "big-endian formats are not supported for DLPack import"
+            )
+        expected_bits: cython.int = itemsize * 8
+
+        if device_id not in (None, 0):
+            raise ValueError("device_id must be 0 for CPU tensors")
+
+        log2_w: cython.int = desc.log2_chroma_w
+        log2_h: cython.int = desc.log2_chroma_h
+
+        frame: VideoFrame = None
+        m: cython.pointer[DLManagedTensor] = cython.NULL
+        try:
+            frame = alloc_video_frame()
+            frame.ptr.format = sw_fmt
+
+            for p in range(nb_planes):
+                m = _consume_dlpack(planes[p], stream)
+
+                if m.dl_tensor.device_type != kCPU:
+                    raise NotImplementedError(
+                        "only CPU DLPack tensors are supported for this format"
+                    )
+                if m.dl_tensor.device_id != 0:
+                    raise ValueError("CPU DLPack tensors must have device_id == 0")
+
+                if (
+                    m.dl_tensor.dtype.code != 1
+                    or m.dl_tensor.dtype.bits != expected_bits
+                    or m.dl_tensor.dtype.lanes != 1
+                ):
+                    raise TypeError(f"unexpected dtype for plane {p}")
+
+                if m.dl_tensor.ndim != 2:
+                    raise ValueError(f"plane {p} must be 2D (H, W)")
+
+                ph = cython.cast(int64_t, m.dl_tensor.shape[0])
+                pw = cython.cast(int64_t, m.dl_tensor.shape[1])
+
+                if p == 0:
+                    if width == 0 and height == 0:
+                        width = cython.cast(int, pw)
+                        height = cython.cast(int, ph)
+                    elif width == 0 or height == 0:
+                        raise ValueError("either specify both width/height or neither")
+                    elif pw != width or ph != height:
+                        raise ValueError("plane 0 shape does not match width/height")
+                    if (log2_w and width % 2) or (log2_h and height % 2):
+                        raise ValueError(f"width/height must be even for {format!r}")
+                    frame.ptr.width = width
+                    frame.ptr.height = height
+
+                comp_idx = comp_of_plane[p]
+                is_chroma = (comp_idx == 1 or comp_idx == 2) and (log2_w or log2_h)
+                exp_w = (-((-width) >> log2_w)) if is_chroma else width
+                exp_h = (-((-height) >> log2_h)) if is_chroma else height
+
+                if pw != exp_w or ph != exp_h:
+                    raise ValueError(f"plane {p} must have shape ({exp_h}, {exp_w})")
+
+                if m.dl_tensor.strides != cython.NULL:
+                    if m.dl_tensor.strides[1] != 1:
+                        raise ValueError(
+                            f"plane {p} must be contiguous in the last dimension"
+                        )
+                    pitch_elems = cython.cast(int64_t, m.dl_tensor.strides[0])
+                else:
+                    pitch_elems = cython.cast(int64_t, exp_w)
+
+                # A row pitch shorter than the row itself (zero-stride broadcast
+                # views, negative strides) would hand FFmpeg a bogus linesize and
+                # buffer size, so reject it before wrapping the memory.
+                if pitch_elems < exp_w:
+                    raise ValueError(
+                        f"plane {p} row stride must be at least its width"
+                    )
+
+                linesize = cython.cast(int, pitch_elems * itemsize)
+                size = cython.cast(int, linesize * exp_h)
+
+                ptr = cython.cast(
+                    cython.pointer[uint8_t], m.dl_tensor.data
+                ) + cython.cast(cython.size_t, m.dl_tensor.byte_offset)
+
+                frame.ptr.buf[p] = lib.av_buffer_create(
+                    ptr, size, _dlpack_avbuffer_free, cython.cast(cython.p_void, m), 0
+                )
+                if frame.ptr.buf[p] == cython.NULL:
+                    raise MemoryError(f"av_buffer_create failed for plane {p}")
+                frame.ptr.data[p] = ptr
+                frame.ptr.linesize[p] = linesize
+                m = cython.NULL
+
+            frame._init_user_attributes()
+            return frame
+
+        except Exception:
+            if frame is not None:
+                lib.av_frame_unref(frame.ptr)
+            if m != cython.NULL:
+                m.deleter(m)
+            raise
diff --git a/av/video/plane.py b/av/video/plane.py
index c6d3c095c..51a3a13f1 100644
--- a/av/video/plane.py
+++ b/av/video/plane.py
@@ -194,8 +194,8 @@ def __dlpack__(self, *, stream: int | None = None):
             # Y/U/V planes of planar YUV, gray, ...) become 2D (H, W), while
             # planes that interleave several components (packed RGB, the chroma
             # plane of NV12, ...) become 3D (H, W, C).
-            desc: cython.pointer[lib.AVPixFmtDescriptor] = lib.av_pix_fmt_desc_get(
-                sw_fmt
+            desc: cython.pointer[cython.const[lib.AVPixFmtDescriptor]] = (
+                lib.av_pix_fmt_desc_get(sw_fmt)
             )
             if desc == cython.NULL:
                 raise NotImplementedError("unknown pixel format for DLPack export")
diff --git a/tests/test_dlpack.py b/tests/test_dlpack.py
index 003e696f5..13e05fbbc 100644
--- a/tests/test_dlpack.py
+++ b/tests/test_dlpack.py
@@ -289,13 +289,104 @@ def test_video_frame_from_dlpack_requires_two_planes() -> None:
         VideoFrame.from_dlpack(y, format="nv12")
 
 
-def test_video_frame_from_dlpack_rejects_unsupported_format() -> None:
+def test_video_frame_from_dlpack_rejects_packed_format() -> None:
+    # Packed formats interleave several components in one plane and cannot be
+    # imported (only planar single-component formats and the nv12 family are).
+    rgb = numpy.zeros((16, 16, 3), dtype=numpy.uint8)
+
+    with pytest.raises(NotImplementedError, match="is not supported"):
+        VideoFrame.from_dlpack((rgb,), format="rgb24")
+
+
+@pytest.mark.parametrize(
+    "fmt,dtype,planes_hw",
+    [
+        ("yuv420p", numpy.uint8, [(48, 64), (24, 32), (24, 32)]),
+        ("yuv422p", numpy.uint8, [(48, 64), (48, 32), (48, 32)]),
+        ("yuv444p", numpy.uint8, [(48, 64), (48, 64), (48, 64)]),
+        ("gray", numpy.uint8, [(48, 64)]),
+        ("yuv420p10le", numpy.uint16, [(48, 64), (24, 32), (24, 32)]),
+    ],
+)
+def test_video_frame_from_dlpack_planar_cpu(fmt, dtype, planes_hw) -> None:
+    # Issue #2217: planar formats whose planes each hold one component round
+    # trip through DLPack without a NumPy intermediate.
     width, height = 64, 48
-    y = numpy.zeros((height, width), dtype=numpy.uint8)
-    uv = numpy.zeros((height // 2, width // 2, 2), dtype=numpy.uint8)
+    make = _make_u16 if dtype == numpy.uint16 else _make_u8
+    src = [make((h, w)) for (h, w) in planes_hw]
+
+    frame = VideoFrame.from_dlpack(tuple(src), format=fmt)
+
+    assert frame.format.name == fmt
+    assert frame.width == width and frame.height == height
+    assert len(frame.planes) == len(src)
+
+    for i, plane in enumerate(frame.planes):
+        arr = numpy.from_dlpack(plane)
+        assert arr.dtype == dtype
+        assert arr.shape == planes_hw[i]
+        assertNdarraysEqual(arr, src[i])
+
+
+def test_video_frame_from_dlpack_yuv420p_zero_copy_and_lifetime() -> None:
+    width, height = 64, 48
+    y = _make_u8((height, width))
+    u = _make_u8((height // 2, width // 2))
+    v = _make_u8((height // 2, width // 2))
+
+    frame = VideoFrame.from_dlpack((y, u, v), format="yuv420p")
+
+    # Mutating the source is visible through the frame (zero copy).
+    y[0, 0] = 200
+    assert memoryview(frame.planes[0])[0] == 200
+
+    expected = [y.copy(), u.copy(), v.copy()]
+    del y, u, v
+    gc.collect()
+
+    for i, plane in enumerate(frame.planes):
+        assertNdarraysEqual(numpy.from_dlpack(plane), expected[i])
+
+
+def test_video_frame_from_dlpack_yuv420p_with_pitch() -> None:
+    width, height = 64, 48
+    pad = 16
+
+    y = _make_u8((height, width + pad))[:, :width]
+    u = _make_u8((height // 2, (width + pad) // 2))[:, : width // 2]
+    v = _make_u8((height // 2, (width + pad) // 2))[:, : width // 2]
+
+    frame = VideoFrame.from_dlpack((y, u, v), format="yuv420p")
+
+    assert frame.planes[0].line_size == width + pad
+    assert frame.planes[1].line_size == (width + pad) // 2
+    assertNdarraysEqual(numpy.from_dlpack(frame.planes[0]), y)
+    assertNdarraysEqual(numpy.from_dlpack(frame.planes[1]), u)
+    assertNdarraysEqual(numpy.from_dlpack(frame.planes[2]), v)
+
+
+def test_video_frame_from_dlpack_planar_wrong_plane_count() -> None:
+    y = numpy.zeros((48, 64), dtype=numpy.uint8)
+    u = numpy.zeros((24, 32), dtype=numpy.uint8)
+
+    with pytest.raises(ValueError, match=r"requires 3 plane\(s\), got 2"):
+        VideoFrame.from_dlpack((y, u), format="yuv420p")
+
+
+def test_video_frame_from_dlpack_planar_rejects_odd_dimensions() -> None:
+    y = numpy.zeros((48, 63), dtype=numpy.uint8)
+    u = numpy.zeros((24, 32), dtype=numpy.uint8)
+    v = numpy.zeros((24, 32), dtype=numpy.uint8)
+
+    with pytest.raises(ValueError, match="must be even"):
+        VideoFrame.from_dlpack((y, u, v), format="yuv420p")
+
+
+def test_video_frame_from_dlpack_planar_rejects_palette() -> None:
+    idx = numpy.zeros((16, 16), dtype=numpy.uint8)
 
-    with pytest.raises(NotImplementedError, match="supports nv12, p010le, p016le only"):
-        VideoFrame.from_dlpack((y, uv), format="yuv420p")
+    with pytest.raises(NotImplementedError, match="palette"):
+        VideoFrame.from_dlpack((idx,), format="pal8")
 
 
 def test_video_frame_from_dlpack_rejects_device_id_for_cpu() -> None: