diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 8364a5bad..0297cd155 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -37,7 +37,8 @@ Breaking:
 
 Features:
 
-- Nothing (yet)
+- Add an ``out`` parameter to :meth:`.VideoFrame.to_ndarray` to write into a preallocated array and avoid a per-frame allocation.
+- Add :meth:`.VideoPlane.to_ndarray` to read one single-component plane (e.g. just the luma plane of a planar YUV frame) as a 2D array, with an optional ``out`` buffer.
 
 Fixes:
 
diff --git a/av/video/frame.py b/av/video/frame.py
index 1a7b5364d..4707586b7 100644
--- a/av/video/frame.py
+++ b/av/video/frame.py
@@ -386,6 +386,26 @@ def byteswap_array(array, big_endian: cython.bint):
     return array
 
 
+@cython.cfunc
+def _check_out(out, shape, dtype):
+    """Validate a caller-supplied ``out`` array before it is written to.
+
+    :param out: The destination array passed as ``out=``.
+    :param shape: The exact shape ``out`` must have.
+    :param dtype: The dtype ``out`` must have (anything ``numpy.dtype`` accepts).
+    :raises ValueError: If ``out`` is not C-contiguous, or its shape or dtype
+        differs from what is required.
+    """
+    import numpy as np
+
+    if not out.flags["C_CONTIGUOUS"]:
+        raise ValueError("out must be a C-contiguous array")
+    if tuple(out.shape) != tuple(shape):
+        raise ValueError(f"out must have shape {tuple(shape)}, got {tuple(out.shape)}")
+    if out.dtype != np.dtype(dtype):
+        raise ValueError(f"out must have dtype {np.dtype(dtype)}, got {out.dtype}")
+
+
 @cython.cfunc
 def copy_bytes_to_plane(
     img_bytes,
@@ -736,7 +756,7 @@ def to_image(self, **kwargs):
             "RGB", (plane.width, plane.height), bytes(o_buf), "raw", "RGB", 0, 1
         )
 
-    def to_ndarray(self, channel_last=False, **kwargs):
+    def to_ndarray(self, channel_last=False, out=None, **kwargs):
         """Get a numpy array of this frame.
 
         Any ``**kwargs`` are passed to :meth:`.VideoReformatter.reformat`.
@@ -746,6 +766,12 @@ def to_ndarray(self, channel_last=False, **kwargs):
         :param bool channel_last: If True, the shape of array will be
             (height, width, channels) rather than (channels, height, width) for
             the "yuv444p" and "yuvj444p" formats.
+        :param out: An optional, preallocated, C-contiguous numpy array to copy
+            the result into. It must have exactly the shape and dtype that this
+            method would otherwise allocate; ``out`` is then returned in place of
+            a freshly allocated array. This lets callers reuse a buffer and avoid
+            a per-frame allocation. Not supported for the ``pal8`` format, which
+            returns a tuple.
 
         .. note:: Numpy must be installed.
 
@@ -784,34 +810,86 @@ def to_ndarray(self, channel_last=False, **kwargs):
             itemsize: cython.uint
             itemsize, dtype = _np_pix_fmt_dtypes[format_name]
             num_planes: cython.size_t = len(planes)
-            if num_planes == 1:  # shortcut, avoid memory copy
-                array = useful_array(planes[0], itemsize, dtype)
-            else:  # general case
-                array = np.empty(
-                    (frame.ptr.height, frame.ptr.width, num_planes), dtype=dtype
+            big_endian: cython.bint = format_name.endswith("be")
+            transpose: cython.bint = not channel_last and format_name in {
+                "yuv444p",
+                "yuvj444p",
+            }
+
+            if num_planes == 1:  # shortcut, avoid a memory copy when out is None
+                array = byteswap_array(
+                    useful_array(planes[0], itemsize, dtype), big_endian
+                )
+                if out is None:
+                    return array
+                _check_out(out, array.shape, dtype)
+                out[...] = array
+                return out
+
+            shape = (frame.ptr.height, frame.ptr.width, num_planes)
+            if transpose:
+                expected = (num_planes, frame.ptr.height, frame.ptr.width)
+            else:
+                expected = shape
+            if out is not None:
+                _check_out(out, expected, dtype)
+
+            # Fill the channels straight into the destination. For the
+            # (channel, height, width) layout, write through a (height, width,
+            # channel) view of `out`, so no scratch array or second copy is
+            # needed.
+            if out is None:
+                array = np.empty(shape, dtype=dtype)
+            elif transpose:
+                array = np.moveaxis(out, 0, 2)
+            else:
+                array = out
+            if format_name.startswith("gbr"):
+                plane_indices = (2, 0, 1, *range(3, num_planes))
+            else:
+                plane_indices = range(num_planes)
+            for i, p_idx in enumerate(plane_indices):
+                array[:, :, i] = byteswap_array(
+                    useful_array(planes[p_idx], itemsize, dtype), big_endian
                 )
-                if format_name.startswith("gbr"):
-                    plane_indices = (2, 0, 1, *range(3, num_planes))
-                else:
-                    plane_indices = range(num_planes)
-                for i, p_idx in enumerate(plane_indices):
-                    array[:, :, i] = useful_array(planes[p_idx], itemsize, dtype)
-            array = byteswap_array(array, format_name.endswith("be"))
-            if not channel_last and format_name in {"yuv444p", "yuvj444p"}:
-                array = np.moveaxis(array, 2, 0)
-            return array
+
+            if out is not None:
+                return out
+            if transpose:
+                return np.moveaxis(array, 2, 0)
+            return array
 
         # special cases
-        if format_name in {"yuv420p", "yuvj420p", "yuv422p"}:
+
+        # Planar formats we expose as a single (height * k, width) array by
+        # flattening and concatenating the planes. With `out`, copy each plane
+        # into its slice of `out` directly and skip the hstack allocation.
+        if format_name in {"yuv420p", "yuvj420p", "yuv422p", "nv12"}:
             assert frame.ptr.width % 2 == 0, "width has to be even for this format"
             assert frame.ptr.height % 2 == 0, "height has to be even for this format"
-            return np.hstack(
-                [
+            if format_name == "nv12":
+                flats = [
+                    useful_array(planes[0]).reshape(-1),
+                    useful_array(planes[1], 2).reshape(-1),
+                ]
+            else:
+                flats = [
                     useful_array(planes[0]).reshape(-1),
                     useful_array(planes[1]).reshape(-1),
                     useful_array(planes[2]).reshape(-1),
                 ]
-            ).reshape(-1, frame.ptr.width)
+            if out is None:
+                return np.hstack(flats).reshape(-1, frame.ptr.width)
+            total: cython.size_t = sum(flat.shape[0] for flat in flats)
+            # _check_out enforces C-contiguity, so reshape(-1) stays a view and
+            # the per-plane writes below land in `out`.
+            _check_out(out, (total // frame.ptr.width, frame.ptr.width), "uint8")
+            flat_out = out.reshape(-1)
+            offset: cython.size_t = 0
+            for flat in flats:
+                flat_out[offset : offset + flat.shape[0]] = flat
+                offset += flat.shape[0]
+            return out
         if format_name == "yuv422p10le":
             assert frame.ptr.width % 2 == 0, "width has to be even for this format"
             assert frame.ptr.height % 2 == 0, "height has to be even for this format"
@@ -823,10 +901,15 @@ def to_ndarray(self, channel_last=False, **kwargs):
             # Double the width of U and V by repeating each value
             u_full = np.repeat(u, 2, axis=1)
             v_full = np.repeat(v, 2, axis=1)
-            if channel_last:
-                return np.stack([y, u_full, v_full], axis=2)
-            return np.stack([y, u_full, v_full], axis=0)
+            array = np.stack([y, u_full, v_full], axis=2 if channel_last else 0)
+            if out is None:
+                return array
+            _check_out(out, array.shape, "uint16")
+            out[...] = array
+            return out
         if format_name == "pal8":
+            if out is not None:
+                raise ValueError("out is not supported for the pal8 format")
             image = useful_array(planes[0])
             palette = (
                 np.frombuffer(planes[1], "i4")
@@ -835,13 +918,6 @@ def to_ndarray(self, channel_last=False, **kwargs):
                 .view(np.uint8)
             )
             return image, palette
-        if format_name == "nv12":
-            return np.hstack(
-                [
-                    useful_array(planes[0]).reshape(-1),
-                    useful_array(planes[1], 2).reshape(-1),
-                ]
-            ).reshape(-1, frame.ptr.width)
 
         raise ValueError(
             f"Conversion to numpy array with format `{format_name}` is not yet supported"
diff --git a/av/video/frame.pyi b/av/video/frame.pyi
index 12a85182b..94097e6d8 100644
--- a/av/video/frame.pyi
+++ b/av/video/frame.pyi
@@ -76,7 +76,10 @@ class VideoFrame(Frame):
     def save(self, filepath: str | Path) -> None: ...
     def to_image(self, **kwargs): ...
     def to_ndarray(
-        self, channel_last: bool = False, **kwargs: Any
+        self,
+        channel_last: bool = False,
+        out: _SupportedNDarray | None = None,
+        **kwargs: Any,
     ) -> _SupportedNDarray: ...
     @staticmethod
     def from_image(img): ...
diff --git a/av/video/plane.py b/av/video/plane.py
index 15fe70a94..1107f6fe3 100644
--- a/av/video/plane.py
+++ b/av/video/plane.py
@@ -70,6 +70,98 @@ def _buffer_writable(self) -> cython.bint:
             return False
         return True
 
+    def to_ndarray(self, out=None):
+        """Get a numpy array of this single plane, with line padding removed.
+
+        Unlike :meth:`.VideoFrame.to_ndarray`, which assembles every plane into
+        one array, this returns just this plane as a ``(height, width)`` array.
+        That makes it cheap to read a single component -- for example the luma
+        (Y) plane of a planar YUV frame as a grayscale image -- without ever
+        touching the chroma planes.
+
+        Only single-component planes with 8 to 16 bits per sample are supported
+        (the Y/U/V planes of planar formats, or ``gray``). Packed or semi-planar
+        planes (e.g. ``rgb24`` or the interleaved chroma plane of ``nv12``) hold
+        more than one component per sample; use :meth:`.VideoFrame.to_ndarray`
+        for those. Bit-packed planes (fewer than 8 bits per sample, e.g.
+        ``monow``) are rejected because their samples do not fill whole bytes.
+
+        :param out: An optional, preallocated ``(height, width)`` numpy array of
+            the matching dtype to copy into; it is returned in place of a freshly
+            allocated array. When ``out`` is ``None`` and the plane is stored in
+            the host's byte order, a zero-copy view onto the plane's buffer is
+            returned: it shares memory with the frame, so writing to it mutates
+            the frame, and it is only valid until the frame's buffer is reused
+            (e.g. by decoding the next frame). Multi-byte planes stored in the
+            opposite byte order are returned as a byte-swapped copy instead.
+        :raises ValueError: If the plane is not a supported single-component
+            plane, or if ``out`` has the wrong shape or dtype.
+
+        .. note:: Numpy must be installed.
+        """
+        import sys
+
+        import numpy as np
+
+        fmt = self.frame.format
+        components = [c for c in fmt.components if c.plane == self.index]
+        if len(components) != 1:
+            raise ValueError(
+                "VideoPlane.to_ndarray() only supports single-component planes; "
+                f"plane {self.index} of format {fmt.name!r} has "
+                f"{len(components)} component(s). Use VideoFrame.to_ndarray() "
+                "for packed or interleaved planes."
+            )
+
+        # Each sample is addressed as `itemsize` whole bytes below, so sub-byte
+        # (bit-packed) depths cannot be represented and are rejected up front.
+        depth: cython.int = components[0].bits
+        if depth < 8 or depth > 16:
+            raise ValueError(
+                f"Unsupported component depth {depth} for VideoPlane.to_ndarray()"
+            )
+        dtype = np.dtype("uint8" if depth == 8 else "uint16")
+
+        itemsize: cython.int = dtype.itemsize
+        line_size: cython.int = self.line_size
+        total_line_size: cython.int = abs(line_size)
+        shape = (self.height, self.width)
+        if line_size < 0:
+            # Negative line size: start at the last row of the exported buffer
+            # and step backwards through it.
+            offset = (self.height - 1) * total_line_size
+            array = np.ndarray(
+                shape,
+                dtype=dtype,
+                buffer=self,
+                offset=offset,
+                strides=(-total_line_size, itemsize),
+            )
+        else:
+            array = np.ndarray(
+                shape, dtype=dtype, buffer=self, strides=(total_line_size, itemsize)
+            )
+
+        # Multi-byte samples stored in the non-native byte order need swapping.
+        swap = itemsize > 1 and fmt.name.endswith("be") != (sys.byteorder == "big")
+
+        if out is None:
+            # byteswap() returns a copy, so only native-order planes stay views.
+            return array.byteswap() if swap else array
+        if out.shape != array.shape:
+            raise ValueError(
+                f"out has shape {out.shape}, but this plane has shape {array.shape}"
+            )
+        if out.dtype != array.dtype:
+            raise ValueError(
+                f"out has dtype {out.dtype}, but this plane has dtype {array.dtype}"
+            )
+        out[...] = array
+        # Swap in place rather than via a temporary byte-swapped copy of the plane.
+        if swap:
+            out.byteswap(inplace=True)
+        return out
+
     def __getbuffer__(self, view: cython.pointer[Py_buffer], flags: cython.int):
         if self.frame.ptr.hw_frames_ctx:
             raise TypeError(
diff --git a/av/video/plane.pyi b/av/video/plane.pyi
index fcbf8e6ed..5802937c8 100644
--- a/av/video/plane.pyi
+++ b/av/video/plane.pyi
@@ -1,9 +1,17 @@
 from types import CapsuleType
+from typing import Any, Union
+
+import numpy as np
 
 from av.plane import Plane
 
 from .frame import VideoFrame
 
+_SupportedNDarray = Union[
+    np.ndarray[Any, np.dtype[np.uint8]],
+    np.ndarray[Any, np.dtype[np.uint16]],
+]
+
 class VideoPlane(Plane):
     line_size: int
     width: int
@@ -11,5 +19,8 @@ class VideoPlane(Plane):
     buffer_size: int
 
     def __init__(self, frame: VideoFrame, index: int) -> None: ...
+    def to_ndarray(
+        self, out: _SupportedNDarray | None = None
+    ) -> _SupportedNDarray: ...
     def __dlpack_device__(self) -> tuple[int, int]: ...
     def __dlpack__(self, *, stream: int | None = None) -> CapsuleType: ...
diff --git a/tests/test_videoframe.py b/tests/test_videoframe.py
index 26386adb0..e8c6cc284 100644
--- a/tests/test_videoframe.py
+++ b/tests/test_videoframe.py
@@ -1339,3 +1339,160 @@ def test_reformat_pixel_format_align() -> None:
     result = frame_rgb.to_ndarray()
     assert result.shape == expected_rgb.shape
     assert numpy.abs(result.astype(int) - expected_rgb.astype(int)).max() <= 1
+
+
+@pytest.mark.parametrize("format", ["rgb24", "bgr24", "gray8", "yuv444p"])
+def test_to_ndarray_out(format: str) -> None:
+    # to_ndarray(out=...) writes into a preallocated buffer and returns it,
+    # producing the same result as the allocating path.
+    frame = VideoFrame(64, 48, format)
+    expected = frame.to_ndarray()
+    out = numpy.empty(expected.shape, dtype=expected.dtype)
+    result = frame.to_ndarray(out=out)
+    assert result is out
+    assertNdarraysEqual(result, expected)
+
+
+@pytest.mark.parametrize("format", ["yuv420p", "yuvj420p", "yuv422p", "nv12"])
+def test_to_ndarray_out_planar(format: str) -> None:
+    # The flattened/concatenated planar formats fill `out` slice by slice
+    # without the intermediate hstack allocation.
+    frame = VideoFrame(64, 48, format)
+    expected = frame.to_ndarray()
+    out = numpy.empty(expected.shape, dtype=expected.dtype)
+    result = frame.to_ndarray(out=out)
+    assert result is out
+    assertNdarraysEqual(result, expected)
+
+
+def test_to_ndarray_out_validation() -> None:
+    frame = VideoFrame(64, 48, "rgb24")
+    # Wrong shape.
+    with pytest.raises(ValueError):
+        frame.to_ndarray(out=numpy.empty((10, 10, 3), dtype=numpy.uint8))
+    # Wrong dtype.
+    with pytest.raises(ValueError):
+        frame.to_ndarray(out=numpy.empty((48, 64, 3), dtype=numpy.float32))
+    # Non-contiguous.
+    scratch = numpy.empty((48, 64, 6), dtype=numpy.uint8)
+    with pytest.raises(ValueError):
+        frame.to_ndarray(out=scratch[:, :, ::2])
+    # pal8 returns a tuple and does not support out.
+    with pytest.raises(ValueError):
+        VideoFrame(64, 48, "pal8").to_ndarray(
+            out=numpy.empty((48, 64), dtype=numpy.uint8)
+        )
+
+
+def test_plane_to_ndarray_luma() -> None:
+    # Reading just the Y plane of a planar YUV frame yields the luma image
+    # without touching the chroma planes -- it matches the top rows that
+    # VideoFrame.to_ndarray() returns for yuv420p.
+    height, width = 48, 64
+    full = numpy.random.randint(
+        0, 256, size=(height * 3 // 2, width), dtype=numpy.uint8
+    )
+    frame = VideoFrame.from_ndarray(full, format="yuv420p")
+
+    y = frame.planes[0].to_ndarray()
+    assert y.shape == (height, width)
+    assertNdarraysEqual(y, full[:height])
+    assertNdarraysEqual(y, frame.to_ndarray()[:height])
+
+    out = numpy.empty((height, width), dtype=numpy.uint8)
+    result = frame.planes[0].to_ndarray(out=out)
+    assert result is out
+    assertNdarraysEqual(out, full[:height])
+
+
+def test_plane_to_ndarray_gray_matches_frame() -> None:
+    array = numpy.random.randint(0, 256, size=(48, 64), dtype=numpy.uint8)
+    frame = VideoFrame.from_ndarray(array, format="gray8")
+    assertNdarraysEqual(frame.planes[0].to_ndarray(), frame.to_ndarray())
+
+
+def test_plane_to_ndarray_rejects_packed() -> None:
+    # rgb24 packs three components into one plane; there is no single-component
+    # interpretation, so it must be rejected with a helpful error.
+    frame = VideoFrame(64, 48, "rgb24")
+    with pytest.raises(ValueError):
+        frame.planes[0].to_ndarray()
+
+
+def test_plane_to_ndarray_rejects_bit_packed() -> None:
+    # monow packs eight 1-bit samples into each byte, so its plane has no
+    # byte-addressable (height, width) layout and must be rejected.
+    frame = VideoFrame(64, 48, "monow")
+    with pytest.raises(ValueError):
+        frame.planes[0].to_ndarray()
+
+
+def test_plane_to_ndarray_out_validation() -> None:
+    frame = VideoFrame(64, 48, "yuv420p")
+    with pytest.raises(ValueError):
+        frame.planes[0].to_ndarray(out=numpy.empty((10, 10), dtype=numpy.uint8))
+    with pytest.raises(ValueError):
+        frame.planes[0].to_ndarray(out=numpy.empty((48, 64), dtype=numpy.uint16))
+
+
+def test_plane_to_ndarray_negative_linesize() -> None:
+    # A bottom-up (vflipped) gray frame has a negative line_size; the plane view
+    # must still come back top-down.
+    height, width = 6, 4
+    array = numpy.arange(height * width, dtype=numpy.uint8).reshape(height, width)
+    frame = _vflip(VideoFrame.from_ndarray(array, format="gray"))
+    assert frame.planes[0].line_size < 0
+    result = frame.planes[0].to_ndarray()
+    assertNdarraysEqual(result, array[::-1])
+    assert result.copy().sum() == int(array.sum())
+
+
+@pytest.mark.parametrize("format", ["gbrp", "yuv422p10le"])
+def test_to_ndarray_out_extra_formats(format: str) -> None:
+    # Cover the gbr plane-reorder and the yuv422p10le stack paths with out=.
+    frame = VideoFrame(64, 48, format)
+    expected = frame.to_ndarray()
+    out = numpy.empty(expected.shape, dtype=expected.dtype)
+    result = frame.to_ndarray(out=out)
+    assert result is out
+    assertNdarraysEqual(result, expected)
+
+
+def test_to_ndarray_out_channel_last() -> None:
+    frame = VideoFrame(64, 48, "yuv444p")
+    expected = frame.to_ndarray(channel_last=True)
+    out = numpy.empty(expected.shape, dtype=expected.dtype)
+    assert frame.to_ndarray(channel_last=True, out=out) is out
+    assertNdarraysEqual(out, expected)
+
+
+@pytest.mark.parametrize("format", ["rgb24", "yuv420p"])
+def test_to_ndarray_out_noncontiguous_rejected(format: str) -> None:
+    # to_ndarray(out=) requires a C-contiguous buffer for every format.
+    frame = VideoFrame(64, 48, format)
+    expected = frame.to_ndarray()
+    out = numpy.empty(expected.shape + (2,), dtype=expected.dtype)[..., 0]
+    assert not out.flags["C_CONTIGUOUS"]
+    with pytest.raises(ValueError):
+        frame.to_ndarray(out=out)
+
+
+def test_plane_to_ndarray_big_endian() -> None:
+    array = numpy.random.randint(0, 1024, size=(48, 64), dtype=numpy.uint16)
+    frame = VideoFrame.from_ndarray(array, format="gray16be")
+    result = frame.planes[0].to_ndarray()
+    assert result.dtype == numpy.uint16
+    assertNdarraysEqual(result, array)
+    out = numpy.empty((48, 64), dtype=numpy.uint16)
+    assert frame.planes[0].to_ndarray(out=out) is out
+    assertNdarraysEqual(out, array)
+
+
+def test_plane_to_ndarray_negative_linesize_out() -> None:
+    height, width = 6, 4
+    array = numpy.arange(height * width, dtype=numpy.uint8).reshape(height, width)
+    frame = _vflip(VideoFrame.from_ndarray(array, format="gray"))
+    assert frame.planes[0].line_size < 0
+    out = numpy.empty((height, width), dtype=numpy.uint8)
+    assert frame.planes[0].to_ndarray(out=out) is out
+    assertNdarraysEqual(out, array[::-1])