meta-pytorch · Dan-Flores · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -373,22 +373,72 @@ std::string CudaDeviceInterface::getDetails() {
 // Below are methods exclusive to video encoding:
 // --------------------------------------------------------------------------
 namespace {
-// RGB to NV12 color conversion matrix for BT.601 limited range.
-// NPP ColorTwist function used below expects the limited range
-// color conversion matrix, and this matches FFmpeg's default behavior.
-const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
-    // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
-    {0.257f, 0.504f, 0.098f, 16.0f},
-    // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
-    {-0.148f, -0.291f, 0.439f, 128.0f},
-    // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
-    {0.439f, -0.368f, -0.071f, 128.0f}};
+// RGB to YUV conversion matrices to use in NPP color conversion functions
 // Color space and color range 
 // --------------------------- 
 // Color space and color range 
 // --------------------------- 
+struct ColorConversionMatrices {
+  static constexpr Npp32f BT601_LIMITED[3][4] = {
+      {0.257f, 0.504f, 0.098f, 16.0f},
+      {-0.148f, -0.291f, 0.439f, 128.0f},
+      {0.439f, -0.368f, -0.071f, 128.0f}};
+
+  static constexpr Npp32f BT601_FULL[3][4] = {
+      {0.299f, 0.587f, 0.114f, 0.0f},
+      {-0.168736f, -0.331264f, 0.5f, 128.0f},
+      {0.5f, -0.418688f, -0.081312f, 128.0f}};
+
+  static constexpr Npp32f BT709_LIMITED[3][4] = {
+      {0.183f, 0.614f, 0.062f, 16.0f},
+      {-0.101f, -0.338f, 0.439f, 128.0f},
+      {0.439f, -0.399f, -0.040f, 128.0f}};
+
+  static constexpr Npp32f BT709_FULL[3][4] = {
+      {0.2126f, 0.7152f, 0.0722f, 0.0f},
+      {-0.114572f, -0.385428f, 0.5f, 128.0f},
+      {0.5f, -0.454153f, -0.045847f, 128.0f}};
+
+  static constexpr Npp32f BT2020_LIMITED[3][4] = {
+      {0.2256f, 0.5823f, 0.0509f, 16.0f},
+      {-0.122f, -0.315f, 0.439f, 128.0f},
+      {0.439f, -0.403f, -0.036f, 128.0f}};
+
+  static constexpr Npp32f BT2020_FULL[3][4] = {
+      {0.2627f, 0.6780f, 0.0593f, 0.0f},
+      {-0.139630f, -0.360370f, 0.5f, 128.0f},
+      {0.5f, -0.459786f, -0.040214f, 128.0f}};
+};
+
+// Returns conversion matrix based on codec context color space and range
+const Npp32f (*getConversionMatrix(AVCodecContext* codecContext))[4] {
+  if (codecContext->color_range == AVCOL_RANGE_MPEG || // limited range
+      codecContext->color_range == AVCOL_RANGE_UNSPECIFIED) {
+    if (codecContext->colorspace == AVCOL_SPC_BT470BG) {
+      return ColorConversionMatrices::BT601_LIMITED;
+    } else if (codecContext->colorspace == AVCOL_SPC_BT709) {
+      return ColorConversionMatrices::BT709_LIMITED;
+    } else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) {
+      return ColorConversionMatrices::BT2020_LIMITED;
+    } else { // default to BT.601
+      return ColorConversionMatrices::BT601_LIMITED;
+    }
+  } else if (codecContext->color_range == AVCOL_RANGE_JPEG) { // full range
+    if (codecContext->colorspace == AVCOL_SPC_BT470BG) {
+      return ColorConversionMatrices::BT601_FULL;
+    } else if (codecContext->colorspace == AVCOL_SPC_BT709) {
+      return ColorConversionMatrices::BT709_FULL;
+    } else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) {
+      return ColorConversionMatrices::BT2020_FULL;
+    } else { // default to BT.601
+      return ColorConversionMatrices::BT601_FULL;
+    }
+  }
+  return ColorConversionMatrices::BT601_LIMITED;
+}
 } // namespace
 
 UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
     const torch::Tensor& tensor,
     int frameIndex,
-    AVCodecContext* codecContext) {
+    AVCodecContext* codecContext,
+    AVPixelFormat targetPixelFormat) {
   TORCH_CHECK(
       tensor.dim() == 3 && tensor.size(0) == 3,
       "Expected 3D RGB tensor (CHW format), got shape: ",
@@ -427,25 +477,48 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
   torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();
 
   NppiSize oSizeROI = {width, height};
-  NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
-      static_cast<const Npp8u*>(hwcFrame.data_ptr()),
-      validateInt64ToInt(
-          hwcFrame.stride(0) * hwcFrame.element_size(), "nSrcStep"),
-      avFrame->data,
-      avFrame->linesize,
-      oSizeROI,
-      defaultLimitedRangeRgbToNv12,
-      *nppCtx_);
+  NppStatus status;
+  switch (targetPixelFormat) {
+    case AV_PIX_FMT_NV12:
+      status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
+          static_cast<const Npp8u*>(hwcFrame.data_ptr()),
+          hwcFrame.stride(0) * hwcFrame.element_size(),
+          avFrame->data,
+          avFrame->linesize,
+          oSizeROI,
+          getConversionMatrix(codecContext),
+          *nppCtx_);
+      break;
+    case AV_PIX_FMT_YUV420P:
+      status = nppiRGBToYUV420_8u_ColorTwist32f_C3P3R_Ctx(
+          static_cast<const Npp8u*>(hwcFrame.data_ptr()),
+          hwcFrame.stride(0) * hwcFrame.element_size(),
+          avFrame->data,
+          avFrame->linesize,
+          oSizeROI,
+          getConversionMatrix(codecContext),
+          *nppCtx_);
+      break;
+    default:
+      TORCH_CHECK(
+          false,
+          "CUDA encoding only supports NV12 and YUV420P formats, got ",
+          av_get_pix_fmt_name(targetPixelFormat));
+  }
 
   TORCH_CHECK(
       status == NPP_SUCCESS,
-      "Failed to convert RGB to NV12: NPP error code ",
+      "Failed to convert RGB to ",
+      av_get_pix_fmt_name(targetPixelFormat),
+      ": NPP error code ",
       status);
 
-  // TODO-VideoEncoder: Enable configuration of color properties, similar to
-  // FFmpeg. Below are the default color properties used by FFmpeg.
-  avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
-  avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
+  avFrame->colorspace = codecContext->colorspace != AVCOL_SPC_UNSPECIFIED
+      ? codecContext->colorspace
+      : AVCOL_SPC_BT470BG; // BT.601
+  avFrame->color_range = codecContext->color_range != AVCOL_RANGE_UNSPECIFIED
+      ? codecContext->color_range
+      : AVCOL_RANGE_MPEG; // limited range
 
   return avFrame;
 }
@@ -454,7 +527,8 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
 // to enable encoding with CUDA device. The hw_frames_ctx field is needed by
 // FFmpeg to allocate frames on GPU's memory.
 void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
-    AVCodecContext* codecContext) {
+    AVCodecContext* codecContext,
+    AVPixelFormat targetPixelFormat) {
   TORCH_CHECK(codecContext != nullptr, "codecContext is null");
   TORCH_CHECK(
       hardwareDeviceCtx_, "Hardware device context has not been initialized");
@@ -464,9 +538,7 @@ void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
       hwFramesCtxRef != nullptr,
       "Failed to allocate hardware frames context for codec");
 
-  // TODO-VideoEncoder: Enable user set pixel formats to be set
-  // (outPixelFormat_) and handled with the appropriate NPP function
-  codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
+  codecContext->sw_pix_fmt = targetPixelFormat;
   // Always set pixel format to support CUDA encoding.
   codecContext->pix_fmt = AV_PIX_FMT_CUDA;
 

diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -44,10 +44,12 @@ class CudaDeviceInterface : public DeviceInterface {
   UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
       const torch::Tensor& tensor,
       int frameIndex,
-      AVCodecContext* codecContext) override;
+      AVCodecContext* codecContext,
+      AVPixelFormat targetPixelFormat) override;
 
   void setupHardwareFrameContextForEncoding(
-      AVCodecContext* codecContext) override;
+      AVCodecContext* codecContext,
+      AVPixelFormat targetPixelFormat) override;
 
  private:
   // Our CUDA decoding code assumes NV12 format. In order to handle other

diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
@@ -145,14 +145,16 @@ class DeviceInterface {
   virtual UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
       [[maybe_unused]] const torch::Tensor& tensor,
       [[maybe_unused]] int frameIndex,
-      [[maybe_unused]] AVCodecContext* codecContext) {
+      [[maybe_unused]] AVCodecContext* codecContext,
+      [[maybe_unused]] AVPixelFormat targetPixelFormat) {
     TORCH_CHECK(false);
   }
 
   // Function used for video encoding, only implemented in CudaDeviceInterface.
   // It is here to isolate CUDA dependencies from CPU builds
   virtual void setupHardwareFrameContextForEncoding(
-      [[maybe_unused]] AVCodecContext* codecContext) {
+      [[maybe_unused]] AVCodecContext* codecContext,
+      [[maybe_unused]] AVPixelFormat targetPixelFormat) {
     TORCH_CHECK(false);
   }
 

diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -775,23 +775,30 @@ void VideoEncoder::initializeEncoder(
   outHeight_ = inHeight_;
 
   if (videoStreamOptions.pixelFormat.has_value()) {
-    if (frames_.device().is_cuda()) {
+    outPixelFormat_ =
+        validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value());
+    if (frames_.device().is_cuda() &&
+        !(outPixelFormat_ == AV_PIX_FMT_NV12 ||
+          outPixelFormat_ == AV_PIX_FMT_YUV420P)) {
       TORCH_CHECK(
           false,
-          "GPU Video encoding currently only supports the NV12 pixel format. "
-          "Do not set pixel_format to use NV12.");
+          "GPU encoding only supports nv12 and yuv420p pixel formats, got ",
+          av_get_pix_fmt_name(outPixelFormat_));
     }
-    outPixelFormat_ =
-        validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value());
   } else {
-    const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
-    // Use first listed pixel format as default (often yuv420p).
-    // This is similar to FFmpeg's logic:
-    // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
-    // If pixel formats are undefined for some reason, try yuv420p
-    outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
-        ? formats[0]
-        : AV_PIX_FMT_YUV420P;
+    if (frames_.device().is_cuda()) {
+      // Default to YUV420P for CUDA encoding if unset.
+      outPixelFormat_ = AV_PIX_FMT_YUV420P;
+    } else {
+      const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
+      // Use first listed pixel format as default (often yuv420p).
+      // This is similar to FFmpeg's logic:
+      // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
+      // If pixel formats are undefined for some reason, try yuv420p
+      outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
+          ? formats[0]
+          : AV_PIX_FMT_YUV420P;
+    }
   }
 
   // Configure codec parameters
@@ -837,7 +844,7 @@ void VideoEncoder::initializeEncoder(
   if (frames_.device().is_cuda() && deviceInterface_) {
     deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get());
     deviceInterface_->setupHardwareFrameContextForEncoding(
-        avCodecContext_.get());
+        avCodecContext_.get(), outPixelFormat_);
   }
 
   int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
@@ -883,7 +890,7 @@ void VideoEncoder::encode() {
     UniqueAVFrame avFrame;
     if (frames_.device().is_cuda() && deviceInterface_) {
       auto cudaFrame = deviceInterface_->convertCUDATensorToAVFrameForEncoding(
-          currFrame, i, avCodecContext_.get());
+          currFrame, i, avCodecContext_.get(), outPixelFormat_);
       TORCH_CHECK(
           cudaFrame != nullptr,
           "convertCUDATensorToAVFrameForEncoding failed for frame ",

diff --git a/test/test_encoders.py b/test/test_encoders.py
@@ -780,9 +780,9 @@ def test_pixel_format_errors(self, method, device, tmp_path):
         if device == "cuda":
             with pytest.raises(
                 RuntimeError,
-                match="GPU Video encoding currently only supports the NV12 pixel format. Do not set pixel_format to use NV12",
+                match="GPU encoding only supports nv12 and yuv420p pixel formats, got yuv444p",
             ):
-                getattr(encoder, method)(**valid_params, pixel_format="yuv420p")
+                getattr(encoder, method)(**valid_params, pixel_format="yuv444p")
             return
 
         with pytest.raises(
@@ -1353,8 +1353,14 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range
         ],
     )
     @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like"))
-    # TODO-VideoEncoder: Enable additional pixel formats ("yuv420p", "yuv444p")
-    def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method):
+    @pytest.mark.parametrize("pixel_format", ("nv12", "yuv420p", None))
+    # BT.601, BT.709, BT.2020
+    @pytest.mark.parametrize("color_space", ("bt470bg", "bt709", "bt2020nc"))
+    # Full/PC range, Limited/TV range
+    @pytest.mark.parametrize("color_range", ("pc", "tv"))
+    def test_nvenc_against_ffmpeg_cli(
+        self, tmp_path, format_codec, method, pixel_format, color_space, color_range
+    ):
         # Encode with FFmpeg CLI using nvenc codecs
         format, codec = format_codec
         device = "cuda"
@@ -1385,7 +1391,14 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method):
             codec,  # Use specified NVENC hardware encoder
         ]
 
-        ffmpeg_cmd.extend(["-pix_fmt", "nv12"])  # Output format is always NV12
+        if color_space:
+            ffmpeg_cmd.extend(["-colorspace", color_space])
+        if color_range:
+            ffmpeg_cmd.extend(["-color_range", color_range])
+        if pixel_format:
+            ffmpeg_cmd.extend(["-pix_fmt", pixel_format])
+        else:  # VideoEncoder will default to yuv420p for nvenc codecs
+            ffmpeg_cmd.extend(["-pix_fmt", "yuv420p"])
         if codec == "av1_nvenc":
             ffmpeg_cmd.extend(["-rc", "constqp"])  # Set rate control mode for AV1
         ffmpeg_cmd.extend(["-qp", str(qp)])  # Use lossless qp for other codecs
@@ -1396,18 +1409,24 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method):
         encoder_extra_options = {"qp": qp}
         if codec == "av1_nvenc":
             encoder_extra_options["rc"] = 0  # constqp mode
+        if color_space:
+            encoder_extra_options["colorspace"] = color_space
+        if color_range:
+            encoder_extra_options["color_range"] = color_range
         if method == "to_file":
             encoder_output_path = str(tmp_path / f"nvenc_output.{format}")
             encoder.to_file(
                 dest=encoder_output_path,
                 codec=codec,
+                pixel_format=pixel_format,
                 extra_options=encoder_extra_options,
             )
             encoder_output = encoder_output_path
         elif method == "to_tensor":
             encoder_output = encoder.to_tensor(
                 format=format,
                 codec=codec,
+                pixel_format=pixel_format,
                 extra_options=encoder_extra_options,
             )
         elif method == "to_file_like":
@@ -1416,6 +1435,7 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method):
                 file_like=file_like,
                 format=format,
                 codec=codec,
+                pixel_format=pixel_format,
                 extra_options=encoder_extra_options,
             )
             encoder_output = file_like.getvalue()
@@ -1426,13 +1446,39 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, format_codec, method):
         encoder_frames = self.decode(encoder_output).data
 
         assert ffmpeg_frames.shape[0] == encoder_frames.shape[0]
+        # The combination of full range + bt709 results in worse accuracy
+        percentage = 91 if color_range == "full" and color_space == "bt709" else 96
         for ff_frame, enc_frame in zip(ffmpeg_frames, encoder_frames):
             assert psnr(ff_frame, enc_frame) > 25
-            assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=96, atol=2)
+            assert_tensor_close_on_at_least(
+                ff_frame, enc_frame, percentage=percentage, atol=2
+            )
 
         if method == "to_file":
-            ffmpeg_metadata = self._get_video_metadata(ffmpeg_encoded_path, ["pix_fmt"])
-            encoder_metadata = self._get_video_metadata(encoder_output, ["pix_fmt"])
-            # pix_fmt nv12 is stored as yuv420p in metadata
-            assert encoder_metadata["pix_fmt"] == "yuv420p"
-            assert ffmpeg_metadata["pix_fmt"] == "yuv420p"
+            metadata_fields = ["pix_fmt", "color_range", "color_space"]
+            ffmpeg_metadata = self._get_video_metadata(
+                ffmpeg_encoded_path, metadata_fields
+            )
+            encoder_metadata = self._get_video_metadata(encoder_output, metadata_fields)
+            assert (
+                encoder_metadata["color_range"]
+                == ffmpeg_metadata["color_range"]
+                == color_range
+            )
+            assert (
+                encoder_metadata["color_space"]
+                == ffmpeg_metadata["color_space"]
+                == color_space
+            )
+            if color_range == "pc" and codec != "av1_nvenc":
+                # This format represents full range (pc) with yuv420p pixel format
+                # It's deprecated, but is set automatically in h264 and hevc NVENC codecs
+                expected_pix_fmt = "yuvj420p"
+            else:
+                # av1_nvenc does not utilize the yuvj420p pixel format
+                expected_pix_fmt = "yuv420p"
+            assert (
+                encoder_metadata["pix_fmt"]
+                == ffmpeg_metadata["pix_fmt"]
+                == expected_pix_fmt
+            )