diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 34c6ab6d5..f368619dc 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -380,22 +380,76 @@ std::string CudaDeviceInterface::getDetails() {
 // Below are methods exclusive to video encoding:
 // --------------------------------------------------------------------------
 namespace {
-// RGB to NV12 color conversion matrix for BT.601 limited range.
-// NPP ColorTwist function used below expects the limited range
-// color conversion matrix, and this matches FFmpeg's default behavior.
-const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
-    // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
-    {0.257f, 0.504f, 0.098f, 16.0f},
-    // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
-    {-0.148f, -0.291f, 0.439f, 128.0f},
-    // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
-    {0.439f, -0.368f, -0.071f, 128.0f}};
+// For background on these matrices, see the note:
+// [YUV -> RGB Color Conversion, color space and color range]
+// https://github.com/meta-pytorch/torchcodec/blob/main/src/torchcodec/_core/CUDACommon.cpp#L63-L65
+// TODO Video-Encoder: Extend note to explain limited vs full range
+// RGB to YUV conversion matrices to use in NPP color conversion functions
+struct ColorConversionMatrices {
+  static constexpr Npp32f BT601_LIMITED[3][4] = {
+      {0.257f, 0.504f, 0.098f, 16.0f},
+      {-0.148f, -0.291f, 0.439f, 128.0f},
+      {0.439f, -0.368f, -0.071f, 128.0f}};
+
+  static constexpr Npp32f BT601_FULL[3][4] = {
+      {0.299f, 0.587f, 0.114f, 0.0f},
+      {-0.168736f, -0.331264f, 0.5f, 128.0f},
+      {0.5f, -0.418688f, -0.081312f, 128.0f}};
+
+  static constexpr Npp32f BT709_LIMITED[3][4] = {
+      {0.183f, 0.614f, 0.062f, 16.0f},
+      {-0.101f, -0.338f, 0.439f, 128.0f},
+      {0.439f, -0.399f, -0.040f, 128.0f}};
+
+  static constexpr Npp32f BT709_FULL[3][4] = {
+      {0.2126f, 0.7152f, 0.0722f, 0.0f},
+      {-0.114572f, -0.385428f, 0.5f, 128.0f},
+      {0.5f, -0.454153f, -0.045847f, 128.0f}};
+
+  static constexpr Npp32f BT2020_LIMITED[3][4] = {
+      {0.2256f, 0.5823f, 0.0509f, 16.0f},
+      {-0.122f, -0.315f, 0.439f, 128.0f},
+      {0.439f, -0.403f, -0.036f, 128.0f}};
+
+  static constexpr Npp32f BT2020_FULL[3][4] = {
+      {0.2627f, 0.6780f, 0.0593f, 0.0f},
+      {-0.139630f, -0.360370f, 0.5f, 128.0f},
+      {0.5f, -0.459786f, -0.040214f, 128.0f}};
+};
+
+// Returns conversion matrix based on codec context color space and range
+const Npp32f (*getConversionMatrix(AVCodecContext* codecContext))[4] {
+  if (codecContext->color_range == AVCOL_RANGE_MPEG || // limited range
+      codecContext->color_range == AVCOL_RANGE_UNSPECIFIED) {
+    if (codecContext->colorspace == AVCOL_SPC_BT470BG) {
+      return ColorConversionMatrices::BT601_LIMITED;
+    } else if (codecContext->colorspace == AVCOL_SPC_BT709) {
+      return ColorConversionMatrices::BT709_LIMITED;
+    } else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) {
+      return ColorConversionMatrices::BT2020_LIMITED;
+    } else { // default to BT.601
+      return ColorConversionMatrices::BT601_LIMITED;
+    }
+  } else if (codecContext->color_range == AVCOL_RANGE_JPEG) { // full range
+    if (codecContext->colorspace == AVCOL_SPC_BT470BG) {
+      return ColorConversionMatrices::BT601_FULL;
+    } else if (codecContext->colorspace == AVCOL_SPC_BT709) {
+      return ColorConversionMatrices::BT709_FULL;
+    } else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) {
+      return ColorConversionMatrices::BT2020_FULL;
+    } else { // default to BT.601
+      return ColorConversionMatrices::BT601_FULL;
+    }
+  }
+  return ColorConversionMatrices::BT601_LIMITED;
+}
 } // namespace
 
 UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
     const torch::Tensor& tensor,
     int frameIndex,
-    AVCodecContext* codecContext) {
+    AVCodecContext* codecContext,
+    AVPixelFormat targetPixelFormat) {
   TORCH_CHECK(
       tensor.dim() == 3 && tensor.size(0) == 3,
       "Expected 3D RGB tensor (CHW format), got shape: ",
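Note (illustrative, not part of the diff): every row of ColorConversionMatrices follows from the standard luma weights Kr/Kb of BT.601, BT.709, and BT.2020, with limited range scaling the Y row by 219/255 and adding an offset of 16, and scaling the chroma rows by 224/255. The short Python sketch below re-derives the rows as a sanity check on the hard-coded constants; the helper name is invented for this note.

    def rgb_to_yuv_rows(kr, kb, limited):
        # Full-range rows: Y = Kr*R + Kg*G + Kb*B; Cb/Cr are scaled to span +/-0.5
        kg = 1.0 - kr - kb
        y_row = [kr, kg, kb]
        cb_row = [-0.5 * kr / (1 - kb), -0.5 * kg / (1 - kb), 0.5]
        cr_row = [0.5, -0.5 * kg / (1 - kr), -0.5 * kb / (1 - kr)]
        if limited:
            # Limited (TV) range: Y uses 219 of 255 codes, chroma uses 224 of 255
            y_row = [c * 219 / 255 for c in y_row]
            cb_row = [c * 224 / 255 for c in cb_row]
            cr_row = [c * 224 / 255 for c in cr_row]
        y_offset = 16.0 if limited else 0.0
        return [y_row + [y_offset], cb_row + [128.0], cr_row + [128.0]]

    # BT.601 limited -> ~[0.257, 0.504, 0.098, 16]; BT.709 limited -> ~[0.183, 0.614, 0.062, 16]; etc.
    luma_weights = {"bt601": (0.299, 0.114), "bt709": (0.2126, 0.0722), "bt2020": (0.2627, 0.0593)}
    for name, (kr, kb) in luma_weights.items():
        print(name, "limited:", rgb_to_yuv_rows(kr, kb, limited=True))
        print(name, "full:   ", rgb_to_yuv_rows(kr, kb, limited=False))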
@@ -434,25 +488,39 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
   torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();
 
   NppiSize oSizeROI = {width, height};
-  NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
-      static_cast<const Npp8u*>(hwcFrame.data_ptr()),
-      validateInt64ToInt(
-          hwcFrame.stride(0) * hwcFrame.element_size(), "nSrcStep"),
-      avFrame->data,
-      avFrame->linesize,
-      oSizeROI,
-      defaultLimitedRangeRgbToNv12,
-      *nppCtx_);
+  NppStatus status;
+  switch (targetPixelFormat) {
+    case AV_PIX_FMT_NV12:
+      status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
+          static_cast<const Npp8u*>(hwcFrame.data_ptr()),
+          hwcFrame.stride(0) * hwcFrame.element_size(),
+          avFrame->data,
+          avFrame->linesize,
+          oSizeROI,
+          getConversionMatrix(codecContext),
+          *nppCtx_);
+      break;
+    default:
+      TORCH_CHECK(
+          false,
+          "GPU encoding expected the nv12 pixel format, but got ",
+          av_get_pix_fmt_name(targetPixelFormat),
+          ". This should not happen; please report it to the TorchCodec repo.");
+  }
 
   TORCH_CHECK(
       status == NPP_SUCCESS,
-      "Failed to convert RGB to NV12: NPP error code ",
+      "Failed to convert RGB to ",
+      av_get_pix_fmt_name(targetPixelFormat),
+      ": NPP error code ",
       status);
 
-  // TODO-VideoEncoder: Enable configuration of color properties, similar to
-  // FFmpeg. Below are the default color properties used by FFmpeg.
-  avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
-  avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
+  avFrame->colorspace = codecContext->colorspace != AVCOL_SPC_UNSPECIFIED
+      ? codecContext->colorspace
+      : AVCOL_SPC_BT470BG; // BT.601
+  avFrame->color_range = codecContext->color_range != AVCOL_RANGE_UNSPECIFIED
+      ? codecContext->color_range
+      : AVCOL_RANGE_MPEG; // limited range
 
   return avFrame;
 }
@@ -461,7 +529,8 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
 // to enable encoding with CUDA device. The hw_frames_ctx field is needed by
 // FFmpeg to allocate frames on GPU's memory.
 void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
-    AVCodecContext* codecContext) {
+    AVCodecContext* codecContext,
+    AVPixelFormat targetPixelFormat) {
   TORCH_CHECK(codecContext != nullptr, "codecContext is null");
   TORCH_CHECK(
       hardwareDeviceCtx_, "Hardware device context has not been initialized");
@@ -471,9 +540,7 @@ void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
       hwFramesCtxRef != nullptr,
       "Failed to allocate hardware frames context for codec");
 
-  // TODO-VideoEncoder: Enable user set pixel formats to be set
-  // (outPixelFormat_) and handled with the appropriate NPP function
-  codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
+  codecContext->sw_pix_fmt = targetPixelFormat;
 
   // Always set pixel format to support CUDA encoding.
   codecContext->pix_fmt = AV_PIX_FMT_CUDA;
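Note (illustrative, not part of the diff): the NPP call above writes two planes because NV12 stores a full-resolution Y plane in avFrame->data[0] and a half-resolution interleaved UV plane in avFrame->data[1]. A minimal sketch of that layout, with toy dimensions:

    import numpy as np

    height, width = 4, 6  # toy values; NV12 needs even width and height
    y_plane = np.zeros((height, width), dtype=np.uint8)        # one Y sample per pixel
    uv_plane = np.zeros((height // 2, width), dtype=np.uint8)  # U,V interleaved, subsampled 2x2

    # A packed NV12 buffer is the Y plane followed by the UV plane: 1.5 bytes per pixel.
    nv12 = np.concatenate([y_plane.reshape(-1), uv_plane.reshape(-1)])
    assert nv12.size == width * height * 3 // 2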
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
index 267127c68..78e11c4eb 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.h
+++ b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -46,10 +46,12 @@ class CudaDeviceInterface : public DeviceInterface {
   UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
       const torch::Tensor& tensor,
       int frameIndex,
-      AVCodecContext* codecContext) override;
+      AVCodecContext* codecContext,
+      AVPixelFormat targetPixelFormat) override;
 
   void setupHardwareFrameContextForEncoding(
-      AVCodecContext* codecContext) override;
+      AVCodecContext* codecContext,
+      AVPixelFormat targetPixelFormat) override;
 
  private:
   // Our CUDA decoding code assumes NV12 format. In order to handle other
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
index 3e0fa0ec3..9b943b75e 100644
--- a/src/torchcodec/_core/DeviceInterface.h
+++ b/src/torchcodec/_core/DeviceInterface.h
@@ -146,14 +146,16 @@ class DeviceInterface {
   virtual UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
       [[maybe_unused]] const torch::Tensor& tensor,
       [[maybe_unused]] int frameIndex,
-      [[maybe_unused]] AVCodecContext* codecContext) {
+      [[maybe_unused]] AVCodecContext* codecContext,
+      [[maybe_unused]] AVPixelFormat targetPixelFormat) {
     TORCH_CHECK(false);
   }
 
   // Function used for video encoding, only implemented in CudaDeviceInterface.
   // It is here to isolate CUDA dependencies from CPU builds
   virtual void setupHardwareFrameContextForEncoding(
-      [[maybe_unused]] AVCodecContext* codecContext) {
+      [[maybe_unused]] AVCodecContext* codecContext,
+      [[maybe_unused]] AVPixelFormat targetPixelFormat) {
     TORCH_CHECK(false);
   }
 
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
index c19781669..4ad0b62d6 100644
--- a/src/torchcodec/_core/Encoder.cpp
+++ b/src/torchcodec/_core/Encoder.cpp
@@ -790,23 +790,30 @@ void VideoEncoder::initializeEncoder(
   outHeight_ = inHeight_;
 
   if (videoStreamOptions.pixelFormat.has_value()) {
+    // TODO-VideoEncoder: Enable pixel formats to be set by user
+    // and handled with the appropriate NPP function on GPU.
     if (frames_.device().is_cuda()) {
       TORCH_CHECK(
           false,
-          "GPU Video encoding currently only supports the NV12 pixel format. "
-          "Do not set pixel_format to use NV12.");
+          "Video encoding on GPU currently only supports the nv12 pixel format. "
+          "Leave pixel_format unset to use nv12.");
     }
     outPixelFormat_ =
         validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value());
   } else {
-    const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
-    // Use first listed pixel format as default (often yuv420p).
-    // This is similar to FFmpeg's logic:
-    // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
-    // If pixel formats are undefined for some reason, try yuv420p
-    outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
-        ? formats[0]
-        : AV_PIX_FMT_YUV420P;
+    if (frames_.device().is_cuda()) {
+      // Default to nv12 pixel format when encoding on GPU.
+      outPixelFormat_ = AV_PIX_FMT_NV12;
+    } else {
+      const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
+      // Use first listed pixel format as default (often yuv420p).
+      // This is similar to FFmpeg's logic:
+      // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
+      // If pixel formats are undefined for some reason, try yuv420p
+      outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
+          ? formats[0]
+          : AV_PIX_FMT_YUV420P;
+    }
   }
 
   // Configure codec parameters
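Note (illustrative, not part of the diff): with these defaults, GPU encoding is driven from Python roughly as below. The VideoEncoder(frames=..., frame_rate=...) constructor and the "qp"/"colorspace"/"color_range" option keys appear in the test changes further down; the import path and the extra_options keyword are assumptions made for this sketch and may not match the actual public API.

    import torch
    from torchcodec.encoders import VideoEncoder  # import path assumed

    # Frames on a CUDA device select the NVENC path; pixel_format is left unset,
    # so the encoder falls back to nv12 as added in Encoder.cpp above.
    frames = torch.randint(0, 256, (10, 3, 270, 480), dtype=torch.uint8, device="cuda")
    encoder = VideoEncoder(frames=frames, frame_rate=30)
    encoder.to_file(
        "out.mp4",
        extra_options={"qp": 1, "colorspace": "bt709", "color_range": "tv"},  # kwarg name assumed
    )
    # Passing pixel_format explicitly for CUDA frames raises the RuntimeError shown above.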
@@ -852,7 +859,7 @@ void VideoEncoder::initializeEncoder(
   if (frames_.device().is_cuda() && deviceInterface_) {
     deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get());
     deviceInterface_->setupHardwareFrameContextForEncoding(
-        avCodecContext_.get());
+        avCodecContext_.get(), outPixelFormat_);
   }
 
   int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
@@ -898,7 +905,7 @@ void VideoEncoder::encode() {
     UniqueAVFrame avFrame;
     if (frames_.device().is_cuda() && deviceInterface_) {
       auto cudaFrame = deviceInterface_->convertCUDATensorToAVFrameForEncoding(
-          currFrame, i, avCodecContext_.get());
+          currFrame, i, avCodecContext_.get(), outPixelFormat_);
       TORCH_CHECK(
           cudaFrame != nullptr,
           "convertCUDATensorToAVFrameForEncoding failed for frame ",
diff --git a/test/test_encoders.py b/test/test_encoders.py
index 77ef045c0..84a4f4788 100644
--- a/test/test_encoders.py
+++ b/test/test_encoders.py
@@ -780,9 +780,9 @@ def test_pixel_format_errors(self, method, device, tmp_path):
         if device == "cuda":
             with pytest.raises(
                 RuntimeError,
-                match="GPU Video encoding currently only supports the NV12 pixel format. Do not set pixel_format to use NV12",
+                match="Video encoding on GPU currently only supports the nv12 pixel format. Leave pixel_format unset to use nv12.",
             ):
-                getattr(encoder, method)(**valid_params, pixel_format="yuv420p")
+                getattr(encoder, method)(**valid_params, pixel_format="yuv444p")
             return
 
         with pytest.raises(
@@ -1354,7 +1354,22 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range
             ),
         ],
     )
-    def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
+    # BT.601, BT.709, BT.2020
+    @pytest.mark.parametrize("color_space", ("bt470bg", "bt709", "bt2020nc"))
+    # Full/PC range, Limited/TV range
+    @pytest.mark.parametrize("color_range", ("pc", "tv"))
+    def test_nvenc_against_ffmpeg_cli(
+        self, tmp_path, method, format, codec, color_space, color_range
+    ):
+        ffmpeg_version = get_ffmpeg_major_version()
+        if ffmpeg_version in (4, 6) and not (
+            color_space == "bt470bg" and color_range == "tv"
+        ):
+            pytest.skip(
+                "Non-default color space and range have lower accuracy on FFmpeg 4 and 6"
+            )
+        if ffmpeg_version == 4 and codec == "av1_nvenc":
+            pytest.skip("av1_nvenc is not supported on FFmpeg 4")
         # Encode with FFmpeg CLI using nvenc codecs
         device = "cuda"
         qp = 1  # Use near lossless encoding to reduce noise and support av1_nvenc
@@ -1382,16 +1397,23 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
             temp_raw_path,
         ]
         # CLI requires explicit codec for nvenc
+        # VideoEncoder will default to h264_nvenc since the frames are on GPU.
         ffmpeg_cmd.extend(["-c:v", codec if codec is not None else "h264_nvenc"])
-        # VideoEncoder will select an NVENC encoder by default since the frames are on GPU.
- ffmpeg_cmd.extend(["-pix_fmt", "nv12"]) # Output format is always NV12 - ffmpeg_cmd.extend(["-qp", str(qp)]) + ffmpeg_cmd.extend(["-qp", str(qp)]) # Use lossless qp for other codecs + if color_space: + ffmpeg_cmd.extend(["-colorspace", color_space]) + if color_range: + ffmpeg_cmd.extend(["-color_range", color_range]) ffmpeg_cmd.extend([ffmpeg_encoded_path]) subprocess.run(ffmpeg_cmd, check=True, capture_output=True) encoder = VideoEncoder(frames=source_frames, frame_rate=frame_rate) encoder_extra_options = {"qp": qp} + if color_space: + encoder_extra_options["colorspace"] = color_space + if color_range: + encoder_extra_options["color_range"] = color_range if method == "to_file": encoder_output_path = str(tmp_path / f"nvenc_output.{format}") encoder.to_file( @@ -1422,13 +1444,39 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec): encoder_frames = self.decode(encoder_output).data assert ffmpeg_frames.shape[0] == encoder_frames.shape[0] + # The combination of full range + bt709 results in worse accuracy + percentage = 91 if color_range == "full" and color_space == "bt709" else 96 for ff_frame, enc_frame in zip(ffmpeg_frames, encoder_frames): assert psnr(ff_frame, enc_frame) > 25 - assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=96, atol=2) + assert_tensor_close_on_at_least( + ff_frame, enc_frame, percentage=percentage, atol=2 + ) if method == "to_file": - ffmpeg_metadata = self._get_video_metadata(ffmpeg_encoded_path, ["pix_fmt"]) - encoder_metadata = self._get_video_metadata(encoder_output, ["pix_fmt"]) - # pix_fmt nv12 is stored as yuv420p in metadata - assert encoder_metadata["pix_fmt"] == "yuv420p" - assert ffmpeg_metadata["pix_fmt"] == "yuv420p" + metadata_fields = ["pix_fmt", "color_range", "color_space"] + ffmpeg_metadata = self._get_video_metadata( + ffmpeg_encoded_path, metadata_fields + ) + encoder_metadata = self._get_video_metadata(encoder_output, metadata_fields) + # pix_fmt nv12 is stored as yuv420p in metadata, unless full range (pc)is used + # In that case, h264 and hevc NVENC codecs will use yuvj420p automatically. + if color_range == "pc" and codec != "av1_nvenc": + expected_pix_fmt = "yuvj420p" + else: + # av1_nvenc does not utilize the yuvj420p pixel format + expected_pix_fmt = "yuv420p" + assert ( + encoder_metadata["pix_fmt"] + == ffmpeg_metadata["pix_fmt"] + == expected_pix_fmt + ) + assert ( + encoder_metadata["color_range"] + == ffmpeg_metadata["color_range"] + == color_range + ) + assert ( + encoder_metadata["color_space"] + == ffmpeg_metadata["color_space"] + == color_space + )