Skip to content

Enhanced Video Codec Support and Error Handling #5825

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Feb 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion dali/operators/decoder/video/video_decoder_cpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,11 @@ to store frame metadata and longer initialization time to scan the entire video
stores metadata, such as whether it is a key frame and the presentation timestamp (PTS).

Building an index is particularly useful when decoding a small number of frames spaced far
apart or starting playback from a frame deep into the video)code",
apart or starting playback from a frame deep into the video.

Note: Building an index requires that the video codec is supported by the libavcodec version
provided with DALI. For video formats where CPU codec support is not available, this option
will be ignored and no index will be built.)code",
true);

class VideoDecoderCpu : public VideoDecoderBase<CPUBackend, FramesDecoderCpu> {
Expand Down
120 changes: 92 additions & 28 deletions dali/operators/reader/loader/video/frames_decoder_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,39 @@ void FramesDecoderBase::InitAvState(bool init_codecs) {
}
}

/**
 * @brief Builds a multi-line, human-readable description of all streams in the container.
 *
 * The first line reports the number of streams. For each stream, a header line with the stream
 * index and its codec type is followed by the codec id (with the name returned by
 * `avcodec_get_name`), codec type, format, width, height, sample rate and bit rate, all taken
 * from the stream's codec parameters.
 *
 * `av_state_->ctx_` is dereferenced without a null check.
 *
 * @return The assembled description.
 */
std::string FramesDecoderBase::GetAllStreamInfo() const {
  const auto &format_ctx = av_state_->ctx_;
  std::stringstream report;
  report << "Number of streams: " << format_ctx->nb_streams << '\n';
  for (size_t idx = 0; idx < format_ctx->nb_streams; ++idx) {
    // All reported fields come from the codec parameters of the current stream.
    const auto *par = format_ctx->streams[idx]->codecpar;
    report << "Stream " << idx << ": " << par->codec_type << '\n'
           << " Codec ID: " << par->codec_id << " (" << avcodec_get_name(par->codec_id) << ")"
           << '\n'
           << " Codec Type: " << par->codec_type << '\n'
           << " Format: " << par->format << '\n'
           << " Width: " << par->width << '\n'
           << " Height: " << par->height << '\n'
           << " Sample Rate: " << par->sample_rate << '\n'
           << " Bit Rate: " << par->bit_rate << '\n';
  }
  return report.str();
}

bool FramesDecoderBase::FindVideoStream(bool init_codecs) {
if (init_codecs) {
size_t i = 0;

for (i = 0; i < av_state_->ctx_->nb_streams; ++i) {
av_state_->codec_params_ = av_state_->ctx_->streams[i]->codecpar;
av_state_->codec_ = avcodec_find_decoder(av_state_->codec_params_->codec_id);
if (av_state_->codec_ == nullptr) {
LOG_LINE << "No decoder found for stream " << i
<< " (codec_id=" << av_state_->codec_params_->codec_id
<< ", codec_type=" << av_state_->codec_params_->codec_type
<< ", format=" << av_state_->codec_params_->format
<< ", width=" << av_state_->codec_params_->width
<< ", height=" << av_state_->codec_params_->height
<< ", sample_rate=" << av_state_->codec_params_->sample_rate
<< ", bit_rate=" << av_state_->codec_params_->bit_rate
<< ")" << std::endl;
continue;
}
if (av_state_->codec_->type == AVMEDIA_TYPE_VIDEO) {
Expand All @@ -127,10 +152,11 @@ bool FramesDecoderBase::FindVideoStream(bool init_codecs) {
av_state_->stream_id_ = i;
break;
}
LOG_LINE << "Stream " << i << " is not a video stream" << std::endl;
}

if (i >= av_state_->ctx_->nb_streams) {
DALI_WARN(make_string("Could not find a valid video stream in a file ", Filename()));
LOG_LINE << "Could not find a valid video stream in a file " << Filename() << std::endl;
return false;
}
} else {
Expand All @@ -139,12 +165,13 @@ bool FramesDecoderBase::FindVideoStream(bool init_codecs) {

LOG_LINE << "Best stream " << av_state_->stream_id_ << std::endl;
if (av_state_->stream_id_ < 0) {
DALI_WARN(make_string("Could not find a valid video stream in a file ", Filename()));
LOG_LINE << "No valid video stream found" << std::endl;
return false;
}

av_state_->codec_params_ = av_state_->ctx_->streams[av_state_->stream_id_]->codecpar;
}

if (Height() == 0 || Width() == 0) {
if (avformat_find_stream_info(av_state_->ctx_, nullptr) < 0) {
DALI_WARN(make_string("Could not find stream information in ", Filename()));
Expand Down Expand Up @@ -172,12 +199,31 @@ FramesDecoderBase::FramesDecoderBase(const std::string &filename)
return;
}

if (!FindVideoStream()) {
bool video_stream_found = FindVideoStream(true);
bool init_codecs = video_stream_found;
bool build_index = true;
if (!video_stream_found) {
video_stream_found = FindVideoStream(false);
if (video_stream_found) {
LOG_LINE << "No available CPU codec found for video stream " << av_state_->stream_id_
<< " (codec_id=" << av_state_->codec_params_->codec_id
<< ", codec_name=" << avcodec_get_name(av_state_->codec_params_->codec_id)
<< ") in " << Filename() << ". Index building will be skipped." << std::endl;
}
}

if (!video_stream_found) {
DALI_WARN(make_string("Could not find a valid video stream in a file ", Filename(),
". Streams available: ", GetAllStreamInfo()));
return;
}

InitAvState();
BuildIndex();
InitAvState(init_codecs);

if (build_index) {
LOG_LINE << "Building index" << std::endl;
BuildIndex();
}
is_valid_ = true;
}

Expand All @@ -186,13 +232,10 @@ FramesDecoderBase::FramesDecoderBase(const char *memory_file, int memory_file_si
std::string_view source_info)
: av_state_(std::make_unique<AvState>()) {
av_log_set_level(AV_LOG_ERROR);

filename_ = source_info;
memory_video_file_.emplace(memory_file, memory_file_size);

DALI_ENFORCE(init_codecs || !build_index,
"FramesDecoderBase doesn't support index without CPU codecs");
av_log_set_level(AV_LOG_ERROR);

if (num_frames != -1) {
num_frames_ = num_frames;
}
Expand Down Expand Up @@ -224,21 +267,40 @@ FramesDecoderBase::FramesDecoderBase(const char *memory_file, int memory_file_si
return;
}

if (!FindVideoStream(init_codecs || build_index)) {
return;
bool video_stream_found = false;
if (init_codecs) {
// Try with CPU codecs if needed/requested
video_stream_found = init_codecs = FindVideoStream(true);
if (!video_stream_found) {
// Try one more time without codecs if we haven't yet
video_stream_found = FindVideoStream(false);
if (video_stream_found) {
LOG_LINE << "No available CPU codec found for video stream " << av_state_->stream_id_
<< " (codec_id=" << av_state_->codec_params_->codec_id
<< ", codec_name=" << avcodec_get_name(av_state_->codec_params_->codec_id)
<< ") in " << Filename() << ". Index building will be skipped." << std::endl;
}
}
} else {
video_stream_found = FindVideoStream(false);
}

InitAvState(init_codecs || build_index);

// Number of frames is unknown and we do not plan to build the index
if (NumFrames() == 0 && !build_index) {
ParseNumFrames();
if (!video_stream_found) {
DALI_WARN(make_string("No suitable stream found in ", Filename(),
". Streams available: ", GetAllStreamInfo()));
return;
}

InitAvState(init_codecs);

if (build_index) {
LOG_LINE << "Building index" << std::endl;
BuildIndex();
} else if (NumFrames() == 0) {
ParseNumFrames();
LOG_LINE << "Parsed number of frames: " << NumFrames() << std::endl;
}

is_valid_ = true;
}

Expand Down Expand Up @@ -543,8 +605,9 @@ void FramesDecoderBase::LazyInitSwContext() {
}
}

bool FramesDecoderBase::ReadRegularFrame(uint8_t *data, bool copy_to_output) {
bool FramesDecoderBase::ReadRegularFrame(uint8_t *data) {
int ret = -1;
bool copy_to_output = data != nullptr;
while (true) {
ret = av_read_frame(av_state_->ctx_, av_state_->packet_);
auto packet = AVPacketScope(av_state_->packet_, av_packet_unref);
Expand All @@ -568,7 +631,7 @@ bool FramesDecoderBase::ReadRegularFrame(uint8_t *data, bool copy_to_output) {
}

LOG_LINE << "Read frame (ReadRegularFrame), index " << next_frame_idx_ << ", timestamp "
<< std::setw(5) << av_state_->frame_->pts << ", current copy " << copy_to_output
<< std::setw(5) << av_state_->frame_->pts << ", copy_to_output=" << copy_to_output
<< std::endl;
if (!copy_to_output) {
++next_frame_idx_;
Expand Down Expand Up @@ -646,7 +709,7 @@ void FramesDecoderBase::SeekFrame(int frame_id) {

// Seeking clears av buffers, so reset flush state info
if (flush_state_) {
while (ReadFlushFrame(nullptr, false)) {}
while (ReadFlushFrame(nullptr)) {}
flush_state_ = false;
}

Expand All @@ -673,12 +736,13 @@ void FramesDecoderBase::SeekFrame(int frame_id) {
// Skip all remaining frames until the requested frame
LOG_LINE << "Skipping frames from " << next_frame_idx_ << " to " << frame_id << std::endl;
for (int i = next_frame_idx_; i < frame_id; i++) {
ReadNextFrame(nullptr, false);
ReadNextFrame(nullptr);
}
assert(next_frame_idx_ == frame_id);
}

bool FramesDecoderBase::ReadFlushFrame(uint8_t *data, bool copy_to_output) {
bool FramesDecoderBase::ReadFlushFrame(uint8_t *data) {
bool copy_to_output = data != nullptr;
if (avcodec_receive_frame(av_state_->codec_ctx_, av_state_->frame_) < 0) {
flush_state_ = false;
return false;
Expand All @@ -689,7 +753,7 @@ bool FramesDecoderBase::ReadFlushFrame(uint8_t *data, bool copy_to_output) {
}

LOG_LINE << "Read frame (ReadFlushFrame), index " << next_frame_idx_ << " timestamp "
<< std::setw(5) << av_state_->frame_->pts << ", current copy " << copy_to_output
<< std::setw(5) << av_state_->frame_->pts << ", copy_to_output=" << copy_to_output
<< std::endl;
++next_frame_idx_;

Expand All @@ -703,16 +767,16 @@ bool FramesDecoderBase::ReadFlushFrame(uint8_t *data, bool copy_to_output) {
return true;
}

bool FramesDecoderBase::ReadNextFrame(uint8_t *data, bool copy_to_output) {
bool FramesDecoderBase::ReadNextFrame(uint8_t *data) {
LOG_LINE << "ReadNextFrame: frame_idx=" << next_frame_idx_
<< " copy=" << copy_to_output
<< " flush=" << flush_state_ << std::endl;
<< " copy_to_output=" << (data != nullptr)
<< " flush=" << flush_state_ << std::endl;
if (!flush_state_) {
if (ReadRegularFrame(data, copy_to_output)) {
if (ReadRegularFrame(data)) {
return true;
}
}
return ReadFlushFrame(data, copy_to_output);
return ReadFlushFrame(data);
}

} // namespace dali
19 changes: 9 additions & 10 deletions dali/operators/reader/loader/video/frames_decoder_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,12 @@ class DLL_PUBLIC FramesDecoderBase {
}

/**
* @brief Reads next frame of the video and copies it to the provided buffer, if copy_to_output is True.
* @brief Reads next frame of the video.
*
* @param data Output buffer to copy data to.
* @param copy_to_output Whether copy the data to the output.
* @param data Output buffer to copy data to. If nullptr, the frame will be effectively skipped.
* @return Boolean indicating whether the frame was read or not. False means no more frames in the decoder.
*/
virtual bool ReadNextFrame(uint8_t *data, bool copy_to_output = true);
virtual bool ReadNextFrame(uint8_t *data);

/**
* @brief Seeks to the frame given by id. Next call to ReadNextFrame will return this frame
Expand Down Expand Up @@ -236,24 +235,22 @@ class DLL_PUBLIC FramesDecoderBase {
* After this method returns false, there might be more frames to read. Call `ReadFlushFrame` until
* it returns false, to get all of the frames from the video file.
*
* @param data Output buffer to copy data to. If `copy_to_output` is false, this value is ignored.
* @param copy_to_output Whether copy the frame to provided output.
* @param data Output buffer to copy data to. If nullptr, the frame will be effectively skipped.
*
* @returns True, if the read was successful, or false, when all regular frames were consumed.
*
*/
bool ReadRegularFrame(uint8_t *data, bool copy_to_output = true);
bool ReadRegularFrame(uint8_t *data);

/**
* @brief Reads frames from the last packet. This packet can hold
* multiple frames. This method will read all of them one by one.
*
* @param data Output buffer to copy data to. If `copy_to_output` is false, this value is ignored.
* @param copy_to_output Whether copy the frame to provided output.
* @param data Output buffer to copy data to. If nullptr, the frame will be effectively skipped.
*
* @returns True, if the read was successful, or false, when there are no more frames in the last packet.
*/
bool ReadFlushFrame(uint8_t *data, bool copy_to_output = true);
bool ReadFlushFrame(uint8_t *data);

void CopyToOutput(uint8_t *data);

Expand All @@ -275,6 +272,8 @@ class DLL_PUBLIC FramesDecoderBase {

void CountFrames(AvState *av_state);

std::string GetAllStreamInfo() const;

int channels_ = 3;
bool flush_state_ = false;
bool is_vfr_ = false;
Expand Down
15 changes: 9 additions & 6 deletions dali/operators/reader/loader/video/frames_decoder_cpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,13 @@ bool FramesDecoderCpu::CanDecode(AVCodecID codec_id) const {
AVCodecID::AV_CODEC_ID_VP8,
AVCodecID::AV_CODEC_ID_VP9,
AVCodecID::AV_CODEC_ID_MJPEG,
// TODO(janton): AVCodecID::AV_CODEC_ID_AV1,
// TODO(janton): AVCodecID::AV_CODEC_ID_MPEG4,
// Those are not supported by our compiled version of libavcodec,
// AVCodecID::AV_CODEC_ID_AV1,
// AVCodecID::AV_CODEC_ID_MPEG4,
};
if (std::find(codecs.begin(), codecs.end(), codec_id) == codecs.end()) {
DALI_WARN(make_string("Codec ", avcodec_get_name(av_state_->codec_params_->codec_id),
" is not supported by this DALI operator."));
DALI_WARN(make_string("Codec ", codec_id, " (", avcodec_get_name(codec_id),
") is not supported by the CPU variant of this operator."));
return false;
}

Expand All @@ -54,8 +55,10 @@ bool FramesDecoderCpu::CanDecode(AVCodecID codec_id) const {
return true;
}
}
DALI_WARN(make_string("Codec ", avcodec_get_name(codec_id),
" is not supported by the FFMPEG version provided by DALI."));
DALI_WARN(
make_string("Codec ", codec_id, " (", avcodec_get_name(codec_id),
") is not supported by the libavcodec version provided by DALI, and therefore "
"cannot be decoded on the CPU."));
return false;
}

Expand Down
Loading
Loading