shiguredo · voluntas · Dec 13, 2025
diff --git a/CHANGES.md b/CHANGES.md
@@ -11,6 +11,10 @@
 
 ## develop
 
+- [ADD] `SoraVideoFrame` に `planes()` メソッドを追加する
+  - I420 形式の Y, U, V プレーンを直接取得できるようになる
+  - webcodecs-py の `VideoFrame.planes()` と同様のインターフェース
+  - @voluntas
 - [UPDATE] Sora C++ SDK のバージョンを `2025.7.0-canary.0` に上げる
   - WEBRTC_BUILD_VERSION を `m144.7559.0.0` に上げる
   - CMAKE_VERSION を `4.1.3` に上げる

diff --git a/src/sora_sdk_ext.cpp b/src/sora_sdk_ext.cpp
@@ -484,7 +484,8 @@ NB_MODULE(sora_sdk_ext, m) {
       .def("analyze", &SoraVAD::Analyze, "frame"_a);
 
   nb::class_<SoraVideoFrame>(m, "SoraVideoFrame")
-      .def("data", &SoraVideoFrame::Data, nb::rv_policy::reference);
+      .def("data", &SoraVideoFrame::Data, nb::rv_policy::reference)
+      .def("planes", &SoraVideoFrame::Planes);
 
   nb::class_<SoraVideoSinkImpl>(m, "SoraVideoSinkImpl",
                                 nb::type_slots(video_sink_slots))

diff --git a/src/sora_video_sink.cpp b/src/sora_video_sink.cpp
@@ -10,26 +10,62 @@
 #include "sora_call.h"
 
 SoraVideoFrame::SoraVideoFrame(
-    webrtc::scoped_refptr<webrtc::I420BufferInterface> i420_data)
-    : width_(i420_data->width()), height_(i420_data->height()) {
-  /**
-   * データを取り出す際に Python 側で自由に FourCC を指定できる形にするのも手ですが、
-   * その場合は関数を呼び出すたびに変換が走るので GIL を長く保持してしまいます。
-   * また、複数回呼び出された際に毎回変換を行いパフォーマンスが悪化してしまうので、
-   * ここで numpy の形式である 24BG に変換することとしました。
-   */
-  argb_data_ = std::unique_ptr<uint8_t>(new uint8_t[width_ * height_ * 3]);
-  libyuv::ConvertFromI420(
-      i420_data->DataY(), i420_data->StrideY(), i420_data->DataU(),
-      i420_data->StrideU(), i420_data->DataV(), i420_data->StrideV(),
-      argb_data_.get(), width_ * 3, width_, height_, libyuv::FOURCC_24BG);
+    webrtc::scoped_refptr<webrtc::I420BufferInterface> i420_buffer)
+    : width_(i420_buffer->width()),
+      height_(i420_buffer->height()),
+      i420_buffer_(i420_buffer),
+      bgr_converted_(false) {
+  // i420_buffer is dereferenced in the initializer list, so it must not be
+  // null. Only a reference to the I420 buffer is kept here; the BGR
+  // conversion is deferred until Data() is first called.
 }
 
 nb::ndarray<nb::numpy, uint8_t, nb::shape<-1, -1, 3>> SoraVideoFrame::Data() {
+  // Returns an H x W x 3 array over the cached BGR buffer, converting the
+  // I420 buffer on the first call only.
+  if (!bgr_converted_) {
+    /**
+     * The output format is fixed to 24BG instead of letting the Python side
+     * pick a FourCC on each call: a per-call conversion would hold the GIL
+     * for longer and would repeat the same work every time the method is
+     * called. The conversion therefore runs once and its result is cached.
+     */
+    // Compute the byte count in size_t so the product cannot overflow int.
+    bgr_data_ = std::make_unique<uint8_t[]>(static_cast<size_t>(width_) *
+                                            static_cast<size_t>(height_) * 3);
+    // NOTE(review): the return value of ConvertFromI420 is not checked; on
+    // failure the value-initialized (all zero) buffer would be returned.
+    libyuv::ConvertFromI420(
+        i420_buffer_->DataY(), i420_buffer_->StrideY(), i420_buffer_->DataU(),
+        i420_buffer_->StrideU(), i420_buffer_->DataV(), i420_buffer_->StrideV(),
+        bgr_data_.get(), width_ * 3, width_, height_, libyuv::FOURCC_24BG);
+    bgr_converted_ = true;
+  }
   size_t shape[3] = {static_cast<size_t>(height_), static_cast<size_t>(width_),
                      3};
   return nb::ndarray<nb::numpy, uint8_t, nb::shape<-1, -1, 3>>(
-      argb_data_.get(), 3, shape, nb::handle());
+      // NOTE(review): no owner is attached (nb::handle()), so the array does
+      // not keep this frame alive by itself - confirm the binding guarantees
+      // that the frame outlives the returned array.
+      bgr_data_.get(), 3, shape, nb::handle());
+}
+
+nb::tuple SoraVideoFrame::Planes() {
+  // Returns (Y, U, V) as 2D uint8 arrays that view the I420 buffer directly.
+  // Rows are addressed through the buffer's strides, so padded rows are
+  // handled without repacking. The arrays are writable views (const_cast
+  // below): writing through them modifies the frame's I420 buffer.
+  using BufferRef = webrtc::scoped_refptr<webrtc::I420BufferInterface>;
+
+  // The arrays must keep the pixel memory alive even after this
+  // SoraVideoFrame is released, and nanobind only exports an ndarray without
+  // copying when it has an owner. Hand every plane a capsule that holds its
+  // own reference to the buffer and drops it when the last array goes away.
+  nb::capsule owner(new BufferRef(i420_buffer_), [](void* p) noexcept {
+    delete static_cast<BufferRef*>(p);
+  });
+
+  // I420 chroma planes are (width + 1) / 2 x (height + 1) / 2; plain
+  // width / 2 would drop the last chroma column/row of odd-sized frames.
+  const int uv_width = i420_buffer_->ChromaWidth();
+  const int uv_height = i420_buffer_->ChromaHeight();
+
+  // Y plane (with stride)
+  size_t y_shape[2] = {static_cast<size_t>(height_),
+                       static_cast<size_t>(width_)};
+  int64_t y_strides[2] = {i420_buffer_->StrideY(), 1};
+  auto y_plane = nb::ndarray<nb::numpy, uint8_t>(
+      const_cast<uint8_t*>(i420_buffer_->DataY()), 2, y_shape, owner,
+      y_strides);
+
+  // U plane (with stride)
+  size_t uv_shape[2] = {static_cast<size_t>(uv_height),
+                        static_cast<size_t>(uv_width)};
+  int64_t u_strides[2] = {i420_buffer_->StrideU(), 1};
+  auto u_plane = nb::ndarray<nb::numpy, uint8_t>(
+      const_cast<uint8_t*>(i420_buffer_->DataU()), 2, uv_shape, owner,
+      u_strides);
+
+  // V plane (with stride)
+  int64_t v_strides[2] = {i420_buffer_->StrideV(), 1};
+  auto v_plane = nb::ndarray<nb::numpy, uint8_t>(
+      const_cast<uint8_t*>(i420_buffer_->DataV()), 2, uv_shape, owner,
+      v_strides);
+
+  return nb::make_tuple(y_plane, u_plane, v_plane);
 }
 
 SoraVideoSinkImpl::SoraVideoSinkImpl(nb::ref<SoraTrackInterface> track)

diff --git a/src/sora_video_sink.h b/src/sora_video_sink.h
@@ -3,7 +3,7 @@
 
 #include <memory>
 
-// nonobind
+// nanobind
 #include <nanobind/nanobind.h>
 #include <nanobind/ndarray.h>
 #include <nanobind/stl/shared_ptr.h>
@@ -22,26 +22,46 @@ namespace nb = nanobind;
 
 /**
  * Sora からのフレームを格納する SoraVideoFrame です。
- * 
+ *
  * on_frame_ コールバックで直接フレームデータの ndarray を返してしまうとメモリーリークしてしまうため、
  * フレームデータを Python で適切にハンドリングできるようにするために用意しました。
  */
 class SoraVideoFrame {
  public:
-  SoraVideoFrame(webrtc::scoped_refptr<webrtc::I420BufferInterface> i420_data);
+  SoraVideoFrame(webrtc::scoped_refptr<webrtc::I420BufferInterface> i420_buffer);
 
   /**
    * SoraVideoFrame 内のフレームデータへの numpy.ndarray での参照を渡します。
-   * 
+   *
+   * The conversion to BGR runs only on the first call; later calls return the
+   * cached data.
+   *
    * @return NumPy の配列 numpy.ndarray で H x W x BGR になっているフレームデータ
    */
   nb::ndarray<nb::numpy, uint8_t, nb::shape<-1, -1, 3>> Data();
 
+  /**
+   * Returns views onto the Y, U and V planes of the I420 frame.
+   *
+   * Designed to reference the I420BufferInterface memory directly rather than
+   * copying or converting the pixel data.
+   *
+   * @return A tuple (Y, U, V) of three 2D ndarrays
+   *         - Y: shape (height, width)
+   *         - U: half-resolution chroma, (height/2, width/2) for even sizes
+   *         - V: half-resolution chroma, (height/2, width/2) for even sizes
+   */
+  nb::tuple Planes();
+
  private:
   // width や height は ndarray に情報として含まれるため、これらを別で返す関数は不要
   const int width_;
   const int height_;
-  std::unique_ptr<uint8_t> argb_data_;
+
+  // Reference to the I420 buffer passed to the constructor (pixels not copied)
+  webrtc::scoped_refptr<webrtc::I420BufferInterface> i420_buffer_;
+
+  // BGR data, created lazily by Data(); bgr_converted_ records whether it exists
+  mutable std::unique_ptr<uint8_t[]> bgr_data_;
+  mutable bool bgr_converted_ = false;
 };
 
 /**

diff --git a/tests/test_video_frame_planes.py b/tests/test_video_frame_planes.py
@@ -0,0 +1,107 @@
+import time
+
+import numpy
+import pytest
+from client import SoraClient, SoraRole
+
+
+def test_video_frame_planes(settings):
+    """
+    Verify that SoraVideoFrame.planes() returns I420 planes of the expected
+    shape and dtype.
+
+    A send-only client publishes fake video, a receive-only client takes one
+    frame from its output queue, and the Y/U/V arrays are checked against the
+    size reported by frame.data().
+    """
+    sendonly = SoraClient(
+        settings,
+        SoraRole.SENDONLY,
+        audio=False,
+        video=True,
+        video_codec_type="VP8",
+    )
+    sendonly.connect(fake_video=True)
+    # Disconnect in finally blocks so that a failed assertion or a queue
+    # timeout does not leave connections open for the tests that follow.
+    try:
+        # Presumably gives the sender time to start publishing - TODO confirm.
+        time.sleep(3)
+
+        recvonly = SoraClient(
+            settings,
+            SoraRole.RECVONLY,
+        )
+        recvonly.connect()
+        try:
+            # Wait until a frame has been received.
+            frame = recvonly._q_out.get(timeout=10)
+
+            # Call planes().
+            y, u, v = frame.planes()
+
+            # Call data() to obtain the frame width and height.
+            bgr = frame.data()
+            height, width = bgr.shape[0], bgr.shape[1]
+
+            # Check the shape of the Y plane.
+            assert y.shape == (height, width), f"Y plane shape mismatch: {y.shape} != ({height}, {width})"
+
+            # Check the shape of the U plane (I420: height/2, width/2).
+            assert u.shape == (
+                height // 2,
+                width // 2,
+            ), f"U plane shape mismatch: {u.shape} != ({height // 2}, {width // 2})"
+
+            # Check the shape of the V plane (I420: height/2, width/2).
+            assert v.shape == (
+                height // 2,
+                width // 2,
+            ), f"V plane shape mismatch: {v.shape} != ({height // 2}, {width // 2})"
+
+            # Check that every plane has dtype uint8.
+            assert y.dtype == numpy.uint8, f"Y plane dtype mismatch: {y.dtype}"
+            assert u.dtype == numpy.uint8, f"U plane dtype mismatch: {u.dtype}"
+            assert v.dtype == numpy.uint8, f"V plane dtype mismatch: {v.dtype}"
+        finally:
+            recvonly.disconnect()
+    finally:
+        sendonly.disconnect()
+
+
+def test_video_frame_data_and_planes_both_work(settings):
+    """
+    Verify that SoraVideoFrame.data() and SoraVideoFrame.planes() both work on
+    the same frame, and that planes() returns the same result before and
+    after data() has been called.
+    """
+    sendonly = SoraClient(
+        settings,
+        SoraRole.SENDONLY,
+        audio=False,
+        video=True,
+        video_codec_type="VP8",
+    )
+    sendonly.connect(fake_video=True)
+    # Disconnect in finally blocks so that a failed assertion or a queue
+    # timeout does not leave connections open for the tests that follow.
+    try:
+        # Presumably gives the sender time to start publishing - TODO confirm.
+        time.sleep(3)
+
+        recvonly = SoraClient(
+            settings,
+            SoraRole.RECVONLY,
+        )
+        recvonly.connect()
+        try:
+            # Wait until a frame has been received.
+            frame = recvonly._q_out.get(timeout=10)
+
+            # Call planes() first.
+            y1, u1, v1 = frame.planes()
+
+            # Then call data().
+            bgr = frame.data()
+            height, width = bgr.shape[0], bgr.shape[1]
+
+            # Call planes() once more.
+            y2, u2, v2 = frame.planes()
+
+            # Check that the BGR data has the expected shape and dtype.
+            assert bgr.shape == (height, width, 3)
+            assert bgr.dtype == numpy.uint8
+
+            # Check that planes() is consistent across calls: same shapes ...
+            assert y1.shape == y2.shape
+            assert u1.shape == u2.shape
+            assert v1.shape == v2.shape
+
+            # ... and same pixel contents (data() must not disturb the planes).
+            assert numpy.array_equal(y1, y2)
+            assert numpy.array_equal(u1, u2)
+            assert numpy.array_equal(v1, v2)
+        finally:
+            recvonly.disconnect()
+    finally:
+        sendonly.disconnect()